#!/bin/bash

set -euf -o pipefail

scripts=$(dirname $0)
# shellcheck source=jenkins-helpers.sh
. $scripts/jenkins-helpers.sh
# shellcheck source=round-robin.sh
. $scripts/round-robin.sh

convert_args_to_variables "$@"

obligatory_variables rr[ci_project] rr[ci_config] ssh_host ssh_port
declare -A rr

# Execution mode: baseline, bisect, jenkins-full
rr[mode]="${rr[mode]-baseline}"

# Set custom revision for one of the projects, and use baseline revisions
# for all other projects.
rr[baseline_branch]="${rr[baseline_branch]-linaro-local/ci/${rr[ci_project]}/${rr[ci_config]}}"
rr[update_baseline]="${rr[update_baseline]-update}"
rr[top_artifacts]="${rr[top_artifacts]-$(pwd)/artifacts}"

# Set metric to perf by default.
rr[metric]="${rr[metric]-perf}"

# {toolchain_name}-{toolchain_ver}-{target}-{bmk}-{cflags}
IFS=- read -a ci_config <<EOF
${rr[ci_config]}
EOF

# ...

    # An extra trailing line gets appended to the console output.
    # Strip this last line.
    head -n -1 $run_step_artifacts/benchmark-build.log \
        > $run_step_artifacts/benchmark.log

    local build_status
    local build_ret
    while true; do
        # Ssh connection to ci.linaro.org occasionally drops.  We need
        # to check whether benchmarking has finished, and, if not, continue
        # to watch its output.  We detect that the job has finished when the
        # last line of console output starts with "Finished: ".
        build_status=$(tail -n 1 $run_step_artifacts/benchmark.log)
        case "$build_status" in
            "Finished: SUCCESS")
                build_ret=0
                break
                ;;
            "Finished: "*)
                echo "# Benchmarking infra is offline:" >> ${rr[top_artifacts]}/results
                echo "-$EXTERNAL_FAIL" >> ${rr[top_artifacts]}/results
                build_ret=1
                break
                ;;
        esac

        # After the ci.linaro.org update on 2021-10-11 the behavior of the
        # "console" command has changed: before the update it exited
        # immediately for finished builds, and after the update it hangs
        # indefinitely for finished builds.  We work around this by using
        # "timeout 1m".
        sleep 300
        (timeout 1m \
             ssh -p2222 -l $USER@linaro.org ci.linaro.org \
             console tcwg-benchmark $build_num || true) \
            | tee $run_step_artifacts/benchmark.log
    done

    echo "$results_id" | sed -e "s/@build_num@/$build_num/g" \
        > "$results_id_file"

    return $build_ret
    )
}

# Compare results obtained from perf data between $1 and $2
# and generate results-compare.csv.
compare_results_perf ()
{
    (
    set -euf -o pipefail

    local exe_threshold symbol_threshold
    case "${cflags[0]}" in
        "-Os"*|"-Oz"*)
            # We use 1% tolerance for binary size
            # and 10% tolerance for symbol size.
            exe_threshold=1
            symbol_threshold=10
            ;;
        *)
            # We use 3% tolerance for binary speed
            # and 15% tolerance for symbol speed.
            exe_threshold=3
            symbol_threshold=15
            # Reduce thresholds when bisecting to avoid considering borderline
            # regressions as spurious.  This should break cycles of build and
            # bisect jobs triggering each other on borderline regressions.
            if [ x"${rr[mode]}" = x"bisect" ]; then
                exe_threshold=2
                symbol_threshold=10
            fi
            ;;
    esac

    local -a arr
    local metric bmk symbol rtime rsize time1 time2 size1 size2
    local regression short_symbol short_regression
    local result prev_bmk

    echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv
    printf "extra_build_params=" > $run_step_artifacts/extra-bisect-params

    assert_with_msg "Found stale regression files" \
        [ x"$(find $run_step_artifacts/ -name "*.regression" | wc -l)" = x"0" ]

    local metric_id regressed_by
    case "${cflags[0]}" in
        "-Os"*|"-Oz"*)
            metric_id="size"
            regressed_by="grew in size by"
            ;;
        *)
            metric_id="time"
            regressed_by="slowed down by"
            ;;
    esac

    # Read result lines from <(tail -n +2 ...) below.
    # "-n +2" is to skip the header line.
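    # A worked example with made-up values (the real header row is skipped by
    # "tail -n +2" below, so the column names here are only descriptive):
    #   bmk,symbol,rel_time,rel_size,time1,time2,size1,size2
    #   coremark,coremark_base.default   ,105,n/a,1000,1050,n/a,n/a
    # For the "time" metric this gives metric=105-100=5; the padded symbol
    # ends in "_base.default", so it is treated as the benchmark executable
    # and 5 is checked against $exe_threshold (3 by default), producing the
    # regression text "slowed down by 5% from 1000 to 1050 perf samples".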
prev_bmk="" while IFS=, read -a arr; do bmk=${arr[0]} symbol=${arr[1]} rtime=${arr[2]} rsize=${arr[3]} time1=${arr[4]} time2=${arr[5]} size1=${arr[6]} size2=${arr[7]} case $metric_id in size) metric=$rsize ;; time) metric=$rtime ;; *) assert false ;; esac # Skip case where we have no info ("n/a") if [ "$metric" != "n/a" ]; then metric=$(($metric - 100)) # Remove padding from the tail of $symbol (padding is added by # csvs2table.py for better formatting). short_symbol="$(echo "$symbol" | sed -e "s/ *\$//")" local bmk_exe case "$short_symbol" in "["*) bmk_exe=false ;; *"_base.default") bmk_exe=true ;; *) bmk_exe=false ;; esac local threshold if $bmk_exe; then threshold=$exe_threshold else threshold=$symbol_threshold fi if ! [ "$metric" -le "$threshold" ]; then result=100 case $metric_id in size) short_regression="$regressed_by ${metric}%" regression="$short_regression from $size1 to $size2 bytes" ;; time) short_regression="$regressed_by ${metric}%" regression="$short_regression from $time1 to $time2 perf samples" ;; *) assert false ;; esac if $bmk_exe; then short_regression="$bmk $short_regression" regression="$bmk $regression" # Detect magic sample counts that indicate failure to build # and failure to run case "$time2" in 888888888) short_regression="$bmk failed to run correctly" regression="$short_regression" ;; 999999999) short_regression="$bmk failed to build" regression="$short_regression" ;; esac echo "$metric,$bmk,$symbol,$short_regression,$regression" >> $run_step_artifacts/exe.regressions else short_regression="$bmk:$short_symbol $short_regression" regression="$bmk:$short_symbol $regression" echo "$metric,$bmk,$symbol,$short_regression,$regression" >> $run_step_artifacts/$bmk.regression fi if [ x"$bmk" != x"$prev_bmk" ]; then printf "++benchmarks %s " $bmk >> $run_step_artifacts/extra-bisect-params prev_bmk="$bmk" fi else result=1 fi echo "$bmk,$symbol,$result" >> $run_step_artifacts/results-compare.csv fi done < <(tail -n +2 $run_step_artifacts/results.csv) printf "\n" >> $run_step_artifacts/extra-bisect-params # Comparison is done. Below we generate regression report. 
    # Comparison is done.  Below we generate regression report.
    cat > $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/jira-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/mail-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/mail-body.txt <<EOF
...
EOF
    cat >> $run_step_artifacts/mail-body.txt <<EOF
...
EOF
    cat > $run_step_artifacts/mail-subject.txt <<EOF
...
EOF
    # ... > $run_step_artifacts/results.regressions
    echo "# $short_regression" >> $run_step_artifacts/results.regressions
    )
}

compare_results_vect ()
{
    (
    set -euf -o pipefail

    echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv
    while IFS=, read -a arr; do
        bmk=${arr[0]}
        # Hack to trim padding.
        symbol=$(echo ${arr[1]} | xargs)
        base_num_vect_loops=${arr[3]}
        target_num_vect_loops=${arr[4]}
        if (( base_num_vect_loops > target_num_vect_loops )); then
            echo "$bmk, $symbol, $base_num_vect_loops, $target_num_vect_loops" \
                >> $run_step_artifacts/results-compare.csv
        fi
    done < <(tail -n +2 $run_step_artifacts/results.csv)
    )
}

compare_results ()
{
    (
    set -euf -o pipefail
    local metric=$1
    local ref_results_id="$2"
    local new_results_id="$3"
    local cmp_options="$4"

    local results_ref results_new
    results_ref=$(cat $ref_results_id)
    results_new=$(cat $new_results_id)

    case "${rr[target]}" in
        "arm_eabi") cmp_options="$cmp_options --has_perf_logs no" ;;
    esac

    $scripts/tcwg-benchmark-results.sh \
        --results_ref $results_ref ++results $results_new \
        --top_artifacts "$run_step_artifacts" --verbose $verbose \
        --metric "$metric" $cmp_options \
        > $run_step_artifacts/results.log 2>&1

    case $metric in
        "perf") compare_results_perf ;;
        "vect") compare_results_vect ;;
        *) echo "Invalid metric: $metric"; exit 1 ;;
    esac
    )
}

# Exit with code 0 if no new regressions between results_id-1 and -2 compared to
# regression between results_id-1 and -2 in base-artifacts/.
no_regression_vs_p ()
{
    (
    set -euf -o pipefail
    local ref_artifacts=$1
    local new_artifacts=$2

    # Check for build and correctness regressions.
    no_build_regression_p "$@"

    # Generate ref-results-compare.csv.  The value of "1" means that the result
    # in the 2nd run is no worse than the result in the 1st run (as expected).
    # The value of "100" means that the result in the 2nd run is worse than
    # the result in the 1st run (unexpected).
    # Note that we could grab the previously-generated ref-results-compare.csv
    # from base-artifacts/, but it could have been generated with an older
    # version of the scripts, so it's safer and more resilient to re-generate
    # it from the original perf data.
    if [ ! -f "$ref_artifacts/results_id-1" ] \
           || [ ! -f "$ref_artifacts/results_id-2" ]; then
        return 0
    fi

    # Skip the comparison if it involves missing reference results, which we
    # have listed in tcwg-benchmark-results.broken-list.  Once all entries
    # referencing missing results are discarded, we'll remove this workaround.
    # Otherwise compare_results will fail while fetching baseline results,
    # and we'll consider this failure as a regression.
    if cat "$scripts/tcwg-benchmark-results.broken-list" \
           | grep -q "^$(cat $ref_artifacts/results_id-1)\$\|^$(cat $ref_artifacts/results_id-2)\$"; then
        return 0
    fi

    compare_results "${rr[metric]}" \
        "$ref_artifacts/results_id-1" "$ref_artifacts/results_id-2" \
        "--num_dsos 1 --num_symbols 0"
    while IFS= read -r -d '' i; do
        mv $i "$(dirname $i)"/ref-"$(basename $i)"
    done < <(find $run_step_artifacts/ -type f -name "results*" -print0)
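    # For example, results-compare.csv and results.log produced by the call
    # above become ref-results-compare.csv and ref-results.log, so the second
    # comparison below cannot overwrite them.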
-f "$new_artifacts/results_id-2" ]; then return 1 fi compare_results "${rr[metric]}" "$new_artifacts/results_id-1" "$new_artifacts/results_id-2" \ "--num_dsos 1 --num_symbols 0" while IFS= read -r -d '' i do mv $i "$(dirname $i)"/new-"$(basename $i)" done < <(find $run_step_artifacts/ -type f -name "results*" -print0) # Now compare the two reports. # If "ref" has value of "100" (bad state), and "new" has value of "100" # (also bad state), then we get no change, no regression, and final value # of 100% * 100/100 == 100. # # If "ref" has value of "1" (good state), and "new" has value of "1" # (also good state), then we get no change, no regression, and final value # of 100% * 1/1 == 100. # # If "ref" has value of "100" (bad state), and "new" has value of "1" # (good state), then we get a progression, and final value # of 100% * 1/100 == 1. # # If "ref" has value of "1" (good state), and "new" has value of "100" # (bad state), then we get a regression, and final value # of 100% * 100/1 == 10000. We detect this below by comparing vs "5000". $scripts/../bmk-scripts/csvs2table.py -p 0 --relative $run_step_artifacts/ref-results-compare.csv $run_step_artifacts/new-results-compare.csv > $run_step_artifacts/results-compare.csv local -a arr local bmk symbol result status prev_bmk local -a bisect_bmks # Read result lines from <(tail -n +2 ...) below. # "-n +2" is to skip the header line. Set $status to "1" if there is # a regression. status=0 prev_bmk="" # Delete results.regressions generated by compare_results() calls above. rm -f $run_step_artifacts/results.regressions while IFS=, read -a arr; do bmk=${arr[0]} symbol=${arr[1]} result=${arr[2]} if ! [ "$result" -le "5000" ]; then echo "# $bmk,$symbol regressed" >> $run_step_artifacts/results.regressions status=1 if [ x"$bmk" != x"$prev_bmk" ]; then bisect_bmks+=("++benchmarks" "$bmk") prev_bmk="$bmk" fi fi done < <(tail -n +2 $run_step_artifacts/results-compare.csv) echo "extra_build_params=${bisect_bmks[*]}" > $run_step_artifacts/extra-bisect-params return $status ) } # Exit with code 0 if no regression compared to base-artifacts/. # Inspect build results ./results and performance results in ./results_id. no_regression_to_base_p () { ( set -euf -o pipefail no_build_regression_p "$@" local ref_artifacts=$1 local new_artifacts=$2 if ! [ -f "$ref_artifacts/results_id" ]; then return 0 fi # missing reference results, which we have listed in # tcwg-benchmark-results.broken-list. Once all entries referencing missing # results are discarded, we'll remove this workaround. # Otherwise compare_results will fail while fetching baseline results, # and we'll consider this failure as a regression. if cat "$scripts/tcwg-benchmark-results.broken-list" \ | grep -q "^$(cat $ref_artifacts/results_id)\$"; then return 0 fi # if ! [ -f "$new_artifacts/results_id" ]; then return 1 fi # Make sure there is no stray results.regression file, which we use # as failure marker. # We can, potentially, call ${rr[no_regression_p]} several times in # a row during update_baseline() step, but we should stop at the first # regression. Therefore, we should never see results.regressions exist. assert ! 
    local compare_opts=""
    case "${cflags[0]}" in
        *"_LTO"*) compare_opts="--num_symbols 0 --entry_threshold 10" ;;
    esac
    compare_results "${rr[metric]}" "$ref_artifacts/results_id" \
        "$new_artifacts/results_id" "$compare_opts"

    if [ -f $run_step_artifacts/results.regressions ]; then
        return 1
    fi

    return 0
    )
}

# Implement rr[breakup_updated_components] hook.
tcwg_bmk_breakup_updated_components ()
{
    (
    set -euf -o pipefail

    # Compiler changes tend to cause the most regressions.
    # Breakup updated components into compiler and the rest of components
    # to reduce the number of builds.
    local cc
    case "${rr[toolchain]}" in
        llvm) cc="llvm" ;;
        gnu|gnu_eabi) cc="gcc" ;;
        *) assert false ;;
    esac

    if print_updated_components "\n" | grep -q "^$cc\$"; then
        echo "$cc"
        print_updated_components "\n" | grep -v "^$cc\$" | tr '\n' ' ' \
            | sed -e "s/ \$//g"
        echo
    else
        print_updated_components "\n"
    fi
    )
}
rr[breakup_updated_components]=tcwg_bmk_breakup_updated_components

run_step stop_on_fail -10 reset_artifacts
run_step stop_on_fail x prepare_abe
run_step skip_on_fail -9 build_abe binutils
run_step skip_on_fail -8 build_abe stage1 -- "${gcc_override_configure[@]}"
run_step skip_on_fail x clean_sysroot
case "${rr[components]}" in
    *glibc*)
        run_step skip_on_fail -7 build_abe linux
        run_step skip_on_fail -6 build_abe glibc
        ;;
    *newlib*)
        run_step skip_on_fail -6 build_abe newlib
        ;;
esac

patch_branch=""
if [ x"${rr[metric]}" = x"vect" ]; then
    patch_branch="--patch linaro-local/vect-metric/master"
fi
run_step skip_on_fail -5 build_abe stage2 -- $patch_branch "${gcc_override_configure[@]}"

case "${rr[toolchain]}" in
    llvm) run_step skip_on_fail -3 build_llvm true ;;
esac

case "${#cflags[@]}" in
    2)
        # Don't bisect benchmark build/run failures in *-vs-* configurations.
        # Bisections happen only for regressions with build scores >=0,
        # which will happen if benchmark "${cflags[1]}" succeeds.
        run_step skip_on_fail -1 benchmark -- "${cflags[0]}" ${rr[top_artifacts]}/results_id-1
        run_step skip_on_fail 0 benchmark -- "${cflags[1]}" ${rr[top_artifacts]}/results_id-2
        # Set final "build" score to "1" for compatibility with older results.
        run_step skip_on_fail 1 true
        rr[no_regression_p]=no_regression_vs_p
        run_step reset_on_fail x check_regression
        ;;
    1)
        run_step skip_on_fail 1 benchmark -- "${cflags[0]}" ${rr[top_artifacts]}/results_id
        rr[no_regression_p]=no_regression_to_base_p
        run_step reset_on_fail x check_regression
        ;;
esac
run_step stop_on_fail x update_baseline
run_step stop_on_fail x push_baseline

trap "" EXIT
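# Recap of the two flows above, in terms of the artifacts they produce
# (descriptive only; nothing here is executed): with two entries in cflags
# (the *-vs-* configurations) benchmark runs twice, producing results_id-1 and
# results_id-2, and check_regression uses no_regression_vs_p to compare that
# pair against the same pair in base-artifacts/.  With a single entry in
# cflags benchmark runs once, producing results_id, and check_regression uses
# no_regression_to_base_p to compare it against the baseline results_id.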