#!/bin/bash

set -euf -o pipefail

scripts=$(dirname $0)
# shellcheck source=jenkins-helpers.sh
. $scripts/jenkins-helpers.sh
# shellcheck source=round-robin.sh
. $scripts/round-robin.sh

convert_args_to_variables "$@"

obligatory_variables rr[ci_project] rr[ci_config] ssh_host ssh_port

# Execution mode: baseline, bisect, jenkins-full
# shellcheck disable=SC2154
rr[mode]="${rr[mode]-baseline}"

# Set custom revision for one of the projects, and use baseline revisions
# for all other projects.
# shellcheck disable=SC2154
rr[baseline_branch]="${rr[baseline_branch]-linaro-local/ci/${rr[ci_project]}/${rr[ci_config]}}"
# shellcheck disable=SC2154
rr[update_baseline]="${rr[update_baseline]-update}"
# shellcheck disable=SC2154
rr[top_artifacts]="${rr[top_artifacts]-$(pwd)/artifacts}"

# Set metric to perf by default.
# shellcheck disable=SC2154
rr[metric]="${rr[metric]-perf}"

# {toolchain_name}-{toolchain_ver}-{target}-{bmk}-{cflags}
IFS=- read -a ci_config <

                > ${rr[top_artifacts]}/results
            echo "-$EXTERNAL_FAIL" >> ${rr[top_artifacts]}/results
            build_ret=1
            break
            ;;
        esac

        # Sleep a little to avoid flooding ci.linaro.org on transient ssh
        # failures.
        sleep 5
    done

    echo "$results_id" | sed -e "s/@build_num@/$build_num/g" \
        > "$results_id_file"

    return $build_ret
    )
}

# Compare results obtained from perf data between $1 and $2
# and generate results-compare.csv
compare_results_perf ()
{
    (
    set -euf -o pipefail

    case "${cflags[0]}" in
        "-Os"*|"-Oz"*)
            # We use 1% tolerance for binary size
            # and 10% tolerance for symbol size.
            exe_threshold=101
            symbol_threshold=110
            ;;
        *)
            # We use 3% tolerance for binary speed
            # and 15% tolerance for symbol speed.
            exe_threshold=103
            symbol_threshold=115

            # Reduce thresholds when bisecting to avoid considering borderline
            # regressions as spurious. This should break cycles of build and
            # bisect jobs triggering each other on borderline regressions.
            if [ x"${rr[mode]}" = x"bisect" ]; then
                exe_threshold=102
                symbol_threshold=110
            fi
            ;;
    esac

    local -a arr
    local bmk symbol time size result prev_bmk

    echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv
    printf "extra_build_params=" > $run_step_artifacts/extra-bisect-params

    # Read result lines from <(tail -n +2 ...) below.
    # "-n +2" is to skip the header line.
    prev_bmk=""
    while IFS=, read -a arr; do
        bmk=${arr[0]}
        symbol=${arr[1]}
        time=${arr[2]}
        size=${arr[3]}

        case "${cflags[0]}" in
            "-Os"*|"-Oz"*) metric="$size" ;;
            *) metric="$time" ;;
        esac

        # Skip case where we have no info ("n/a")
        if [ "$metric" != "n/a" ]; then
            # Remove padding from the tail of $symbol (padding is added by
            # csvs2table.py for better formatting).
            local short_symbol="${symbol%%[ ]*}"

            case "$short_symbol" in
                "["*) threshold=$symbol_threshold ;;
                *"_base.default") threshold=$exe_threshold ;;
                *) threshold=$symbol_threshold ;;
            esac

            if ! [ "$metric" -le "$threshold" ]; then
                result=100
                echo "# $bmk,$symbol regressed by $metric" >> $run_step_artifacts/results.regressions
                if [ x"$bmk" != x"$prev_bmk" ]; then
                    printf "++benchmarks %s " $bmk >> $run_step_artifacts/extra-bisect-params
                    prev_bmk="$bmk"
                fi
            else
                result=1
            fi
            echo "$bmk,$symbol,$result" >> $run_step_artifacts/results-compare.csv
        fi
    done < <(tail -n +2 $run_step_artifacts/results.csv)

    printf "\n" >> $run_step_artifacts/extra-bisect-params
    )
}
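
# Compare the number of vectorized loops between the baseline and new runs
# (columns 4 and 5 of $run_step_artifacts/results.csv) and record in
# results-compare.csv the symbols where that number decreased.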
[ "$metric" -le "$threshold" ]; then result=100 echo "# $bmk,$symbol regressed by $metric" >> $run_step_artifacts/results.regressions if [ x"$bmk" != x"$prev_bmk" ]; then printf "++benchmarks %s " $bmk >> $run_step_artifacts/extra-bisect-params prev_bmk="$bmk" fi else result=1 fi echo "$bmk,$symbol,$result" >> $run_step_artifacts/results-compare.csv fi done < <(tail -n +2 $run_step_artifacts/results.csv) printf "\n" >> $run_step_artifacts/extra-bisect-params ) } compare_results_vect () { ( set -euf -o pipefail echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv while IFS=, read -a arr; do bmk=${arr[0]} # hack to trim padding symbol=$(echo ${arr[1]} | xargs) base_num_vect_loops=${arr[3]} target_num_vect_loops=${arr[4]} if (( base_num_vect_loops > target_num_vect_loops )); then echo "$bmk, $symbol, $base_num_vect_loops, $target_num_vect_loops" \ >> $run_step_artifacts/results-compare.csv fi done < <(tail -n +2 $run_step_artifacts/results.csv) ) } compare_results () { ( set -euf -o pipefail local metric=$1 local ref_results_id="$2" local new_results_id="$3" local cmp_options="$4" local results_ref results_new results_ref=$(cat $ref_results_id) results_new=$(cat $new_results_id) case "${rr[target]}" in "arm_eabi") cmp_options="$cmp_options --has_perf_logs no" ;; esac $scripts/tcwg-benchmark-results.sh \ --results_ref $results_ref ++results $results_new \ --top_artifacts "$run_step_artifacts" --verbose $verbose \ --metric "$metric" $cmp_options \ > $run_step_artifacts/results.log 2>&1 case $metric in "perf") compare_results_perf ;; "vect") compare_results_vect ;; *) echo "Invalid metric: $metric"; exit 1 ;; esac ) } # Exit with code 0 if no new regressions between results_id-1 and -2 compared to # regression between results_id-1 and -2 in base-artifacts/. no_regression_vs_p () { ( set -euf -o pipefail local ref_artifacts=$1 local new_artifacts=$2 # Check for build and correctness regressions. no_build_regression_p "$@" # Generate ref-results-compare.csv. The value of "1" means that the result # in the 2nd run is no worse than the result in the 1st run (as expected). # The value of "100" means that the result in the 2nd run is worse than # the result in the 1st run (unexpected). # Note that we can grab previously-generated ref-results-compares.csv from # base-artifacts/, but it could have been generated with an older version # of scripts, so it's safer and more resilient to re-generate it from original # perf data. if [ ! -f "$ref_artifacts/results_id-1" ] || [ ! -f "$ref_artifacts/results_id-2" ]; then return 0 fi # missing reference results, which we have listed in # tcwg-benchmark-results.broken-list. Once all entries referencing missing # results are discarded, we'll remove this workaround. # Otherwise compare_results will fail while fetching baseline results, # and we'll consider this failure as a regression. if cat "$scripts/tcwg-benchmark-results.broken-list" \ | grep -q "^$(cat $ref_artifacts/results_id-1)\$\|^$(cat $ref_artifacts/results_id-2)\$"; then return 0 fi # compare_results "${rr[metric]}" "$ref_artifacts/results_id-1" "$ref_artifacts/results_id-2" \ "--num_dsos 1 --num_symbols 0" while IFS= read -r -d '' i do mv $i "$(dirname $i)"/ref-"$(basename $i)" done < <(find $run_step_artifacts/ -type f -name "results*" -print0) # Similarly, generate new-results-compare.csv. if [ ! -f "$new_artifacts/results_id-1" ] || [ ! 
-f "$new_artifacts/results_id-2" ]; then return 1 fi compare_results "${rr[metric]}" "$new_artifacts/results_id-1" "$new_artifacts/results_id-2" \ "--num_dsos 1 --num_symbols 0" while IFS= read -r -d '' i do mv $i "$(dirname $i)"/new-"$(basename $i)" done < <(find $run_step_artifacts/ -type f -name "results*" -print0) # Now compare the two reports. # If "ref" has value of "100" (bad state), and "new" has value of "100" # (also bad state), then we get no change, no regression, and final value # of 100% * 100/100 == 100. # # If "ref" has value of "1" (good state), and "new" has value of "1" # (also good state), then we get no change, no regression, and final value # of 100% * 1/1 == 100. # # If "ref" has value of "100" (bad state), and "new" has value of "1" # (good state), then we get a progression, and final value # of 100% * 1/100 == 1. # # If "ref" has value of "1" (good state), and "new" has value of "100" # (bad state), then we get a regression, and final value # of 100% * 100/1 == 10000. We detect this below by comparing vs "5000". $scripts/../bmk-scripts/csvs2table.py -p 0 --relative $run_step_artifacts/ref-results-compare.csv $run_step_artifacts/new-results-compare.csv > $run_step_artifacts/results-compare.csv local -a arr local bmk symbol result status prev_bmk local -a bisect_bmks # Read result lines from <(tail -n +2 ...) below. # "-n +2" is to skip the header line. Set $status to "1" if there is # a regression. status=0 prev_bmk="" # Delete results.regressions generated by compare_results() calls above. rm -f $run_step_artifacts/results.regressions while IFS=, read -a arr; do bmk=${arr[0]} symbol=${arr[1]} result=${arr[2]} if ! [ "$result" -le "5000" ]; then echo "# $bmk,$symbol regressed" >> $run_step_artifacts/results.regressions status=1 if [ x"$bmk" != x"$prev_bmk" ]; then bisect_bmks+=("++benchmarks" "$bmk") prev_bmk="$bmk" fi fi done < <(tail -n +2 $run_step_artifacts/results-compare.csv) echo "extra_build_params=${bisect_bmks[*]}" > $run_step_artifacts/extra-bisect-params return $status ) } # Exit with code 0 if no regression compared to base-artifacts/. # Inspect build results ./results and performance results in ./results_id. no_regression_to_base_p () { ( set -euf -o pipefail no_build_regression_p "$@" local ref_artifacts=$1 local new_artifacts=$2 if ! [ -f "$ref_artifacts/results_id" ]; then return 0 fi # missing reference results, which we have listed in # tcwg-benchmark-results.broken-list. Once all entries referencing missing # results are discarded, we'll remove this workaround. # Otherwise compare_results will fail while fetching baseline results, # and we'll consider this failure as a regression. if cat "$scripts/tcwg-benchmark-results.broken-list" \ | grep -q "^$(cat $ref_artifacts/results_id)\$"; then return 0 fi # if ! [ -f "$new_artifacts/results_id" ]; then return 1 fi # Make sure there is no stray results.regression file, which we use # as failure marker. assert ! [ -f $run_step_artifacts/results.regressions ] local compare_opts="" case "${cflags[0]}" in *"_LTO"*) compare_opts="--num_symbols 0 --entry_threshold 10" ;; esac compare_results "${rr[metric]}" "$ref_artifacts/results_id" "$new_artifacts/results_id" "$compare_opts" if [ -f $run_step_artifacts/results.regressions ]; then return 1 fi return 0 ) } # Implement rr[breakup_updated_components] hook. tcwg_bmk_breakup_updated_components () { ( set -euf -o pipefail # Compiler changes tend to cause the most regressions. 
run_step stop_on_fail -10 reset_artifacts
run_step stop_on_fail x prepare_abe
run_step skip_on_fail -9 build_abe binutils
run_step skip_on_fail -8 build_abe stage1 -- "${gcc_override_configure[@]}"
run_step skip_on_fail x clean_sysroot
case "${rr[components]}" in
    *glibc*)
        run_step skip_on_fail -7 build_abe linux
        run_step skip_on_fail -6 build_abe glibc
        ;;
    *newlib*)
        run_step skip_on_fail -6 build_abe newlib
        ;;
esac

patch_branch=""
if [ x"${rr[metric]}" = x"vect" ]; then
    patch_branch="--patch linaro-local/vect-metric-branch"
fi
run_step skip_on_fail -5 build_abe stage2 -- $patch_branch "${gcc_override_configure[@]}"

case "${rr[toolchain]}" in
    llvm) run_step skip_on_fail -3 build_llvm true ;;
esac

case "${#cflags[@]}" in
    2)
        # Don't bisect benchmark build/run failures in *-vs-* configurations.
        # Bisections happen only for regressions with build scores >=0,
        # which will happen if benchmark "${cflags[1]}" succeeds.
        run_step skip_on_fail -1 benchmark "${cflags[0]}" -- ${rr[top_artifacts]}/results_id-1
        run_step skip_on_fail 0 benchmark "${cflags[1]}" -- ${rr[top_artifacts]}/results_id-2
        # Set final "build" score to "1" for compatibility with older results
        run_step skip_on_fail 1 true
        # shellcheck disable=SC2154
        rr[no_regression_p]=no_regression_vs_p
        run_step reset_on_fail x check_regression
        ;;
    1)
        # Bisect benchmark build/run failures in non-vs configurations.
        # Set score to "0" with "true".
        run_step skip_on_fail 0 true
        run_step skip_on_fail 1 benchmark "${cflags[0]}" -- ${rr[top_artifacts]}/results_id
        rr[no_regression_p]=no_regression_to_base_p
        run_step reset_on_fail x check_regression
        ;;
esac

run_step stop_on_fail x update_baseline
run_step stop_on_fail x push_baseline

trap "" EXIT