#!/bin/bash

set -euf -o pipefail

scripts=$(dirname $0)
# shellcheck source=jenkins-helpers.sh
. $scripts/jenkins-helpers.sh
# shellcheck source=round-robin.sh
. $scripts/round-robin.sh

convert_args_to_variables "$@"

obligatory_variables rr[ci_project] rr[ci_config]
declare -A rr

# Execution mode: build or bisect
rr[mode]="${rr[mode]-build}"

# Set custom revision for one of the projects, and use baseline revisions
# for all other projects.
rr[baseline_branch]="${rr[baseline_branch]-linaro-local/ci/${rr[ci_project]}/${rr[ci_config]}}"
rr[update_baseline]="${rr[update_baseline]-ignore}"
rr[top_artifacts]="${rr[top_artifacts]-$(pwd)/artifacts}"

# Store the date of the run.
rr[run_date]="$(date --utc --iso-8601=seconds)"

# {toolchain_name}-{toolchain_ver}-{target}-{bmk}-{cflags}
IFS=- read -a ci_config <<EOF
${rr[ci_config]}
EOF

# [Elided in source: code deriving rr[toolchain], rr[target], $cflags,
# $metric_id, etc. from ci_config, plus the start of benchmark (), which
# dispatches the tcwg-benchmark jenkins job and sets $build_num and
# $results_id. The here-document above and the function header below are
# reconstructed from the surviving code and the call site near the end of
# this script.]
benchmark ()
{
  (
  set -euf -o pipefail

  # The command capturing benchmark-build.log (elided above) appends one
  # extra line "..." to the console output.
  # Strip this last line.
  head -n -1 $run_step_artifacts/benchmark-build.log \
    > $run_step_artifacts/benchmark.log

  local build_status
  local build_ret
  while true; do
    # Ssh connection to ci.linaro.org occasionally drops. We need
    # to check whether benchmarking has finished, and, if not, continue
    # to watch its output. We detect that the job has finished if the last
    # line of console output is "Finished: ".
    build_status=$(tail -n 1 $run_step_artifacts/benchmark.log)
    case "$build_status" in
      "Finished: SUCCESS")
        build_ret=0
        break
        ;;
      "Finished: "*)
        echo "# Benchmarking infra is offline:" >> ${rr[top_artifacts]}/results
        echo "-$EXTERNAL_FAIL" >> ${rr[top_artifacts]}/results
        build_ret=1
        break
        ;;
    esac

    # After the ci.linaro.org update on 2021-10-11 the behavior of the
    # "console" command option has changed: before the update it exited
    # immediately for finished builds, and after the update "console"
    # hangs indefinitely for finished builds. We work around this by
    # using "timeout 1m".
    sleep 300
    (timeout 1m \
       ssh -p2222 -l $USER@linaro.org ci.linaro.org \
       console tcwg-benchmark $build_num || true) \
      | tee $run_step_artifacts/benchmark.log
  done

  echo "$results_id" | sed -e "s/@build_num@/$build_num/g" \
    > "$results_id_file"

  return $build_ret
  )
}

# Compare results, obtained from metric data, between $2 and $3 for
# metric $1, and generate results-compare.csv.
compare_results ()
{
  (
  set -euf -o pipefail
  local metric_id="$1"
  local ref_results_id="$2"
  local new_results_id="$3"
  local cmp_options="$4"

  local results_ref results_new
  results_ref=$(cat $ref_results_id)
  results_new=$(cat $new_results_id)

  case "${rr[target]}" in
    "arm_eabi") cmp_options="$cmp_options --has_perf_logs no" ;;
  esac

  $scripts/tcwg-benchmark-results.sh \
    --results_ref $results_ref ++results $results_new \
    --top_artifacts "$run_step_artifacts" --verbose $verbose $cmp_options \
    > $run_step_artifacts/results.log 2>&1 &
  local res
  res=0 && wait $! || res=$?
  if [ $res != 0 ]; then
    return $EXTERNAL_FAIL
  fi

  case "$metric_id:$cflags" in
    cflags:*"VECT"*) metric_id="vect" ;;
    cflags:"-Os"*|cflags:"-Oz"*) metric_id="size" ;;
    cflags:*) metric_id="time" ;;
  esac

  local regressed_by improved_by changed_by exe_threshold symbol_threshold
  case $metric_id in
    size)
      # We use 1% tolerance for binary size
      # and 10% tolerance for symbol size.
      exe_threshold=1
      symbol_threshold=10
      regressed_by="grew in size by"
      improved_by="reduced in size by"
      ;;
    time)
      # We use 3% tolerance for binary speed
      # and 15% tolerance for symbol speed.
      exe_threshold=3
      symbol_threshold=15
      # Reduce thresholds when bisecting to avoid considering borderline
      # regressions as spurious. This should break cycles of build and
      # bisect jobs triggering each other on borderline regressions.
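      # For example (hypothetical numbers): an executable going from 1000
      # to 1040 perf samples has relative value 104, i.e. metric = +4,
      # which exceeds the build-mode threshold of 3; a +3 change does not.
      # With the reduced bisect threshold of 2, a change measuring +4 in
      # the build job but only +3 when re-measured during bisection is
      # still flagged instead of being dismissed as spurious.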
if [ x"${rr[mode]}" = x"bisect" ]; then exe_threshold=2 symbol_threshold=10 fi regressed_by="slowed down by" improved_by="speeds up by" ;; vect) exe_threshold=0 symbol_threshold=0 regressed_by="reduced by" improved_by="increased up by" ;; *) assert false ;; esac local -a arr local metric bmk symbol rtime rsize rvect time1 time2 size1 size2 vect1 vect2 local long_diag short_symbol short_diag local result prev_bmk echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv printf "extra_build_params=" > $run_step_artifacts/extra-bisect-params assert_with_msg "Found stale regression files" \ [ x"$(find $run_step_artifacts/ -name "*.regression" | wc -l)" = x"0" ] # Read result lines from <(tail -n +2 ...) below. # "-n +2" is to skip the header line. prev_bmk="" while IFS=, read -a arr; do bmk=${arr[0]} symbol=${arr[1]} rtime=${arr[2]} rsize=${arr[3]} rvect=${arr[4]} # $arr[5] is used to store rel_symbol_md5sum, ignore it. time1=${arr[6]} time2=${arr[7]} size1=${arr[8]} size2=${arr[9]} vect1=${arr[10]} vect2=${arr[11]} md5sum1=${arr[12]} md5sum2=${arr[13]} case $metric_id in size) metric=$rsize ;; time) metric=$rtime ;; vect) metric=$rvect ;; *) assert false ;; esac # Skip processing time metric if md5sums match. # If either md5sum is -1, then it indicates that we don't have checksum # computed correctly for this symbol, and thus continue with comparison. if [ x"$md5sum1" == x"$md5sum2" ] \ && [ x"$md5sum1" != x"-1" ] \ && [ x"$metric_id" == x"time" ] \ && [ "$metric" != "n/a" ]; then if (( $metric != 0 )); then echo "warning: Samples differ for $symbol having same md5sums: $time1, $time2" fi # TODO: Fix md5sum processing. #continue fi # Skip case where we have no info ("n/a") if [ "$metric" != "n/a" ]; then metric=$(($metric - 100)) # For vect metric, relative value < 100 will be a regression. # So effectively for any metric, if $metric is positive, # then it's a regression. if [ x"$metric_id" = x"vect" ]; then metric=$((-$metric)) fi # Remove padding from the tail of $symbol (padding is added by # csvs2table.py for better formatting). short_symbol="$(echo "$symbol" | sed -e "s/ *\$//")" local bmk_exe case "$short_symbol" in "["*) bmk_exe=false ;; *"_base.default") bmk_exe=true ;; *) bmk_exe=false ;; esac local threshold neg_threshold if $bmk_exe; then threshold=$exe_threshold neg_threshold=$((-exe_threshold)) else threshold=$symbol_threshold neg_threshold=$((-symbol_threshold)) fi if [ "$metric" -gt "$threshold" ] || [ "$metric" -lt "$neg_threshold" ]; then if [ "$metric" -gt "$threshold" ]; then change_kind="regression" changed_by=$regressed_by else change_kind="improvement" changed_by=$improved_by fi # There's a regression result=100 case $metric_id in size) short_diag="$changed_by ${metric}%" long_diag="$short_diag from $size1 to $size2 bytes" ;; time) short_diag="$changed_by ${metric}%" long_diag="$short_diag from $time1 to $time2 perf samples" ;; vect) short_diag="$changed_by ${metric}%" long_diag="$short_diag from $vect1 to $vect2" ;; *) assert false ;; esac if $bmk_exe; then short_diag="$bmk $short_diag" long_diag="$bmk $long_diag" # Detect magic sample counts that indicate failure to build # and failure to run case "$time1:$time2" in 888888888:888888888|999999999:999999999) # Should never happen as we have neither # an improvement nor a regression. 
              assert false
              ;;
            *:999999999)
              change_kind="regression"
              short_diag="$bmk failed to build"
              long_diag="$short_diag"
              ;;
            999999999:888888888)
              change_kind="improvement"
              short_diag="$bmk built OK, but failed to run"
              long_diag="$short_diag"
              ;;
            *:888888888)
              change_kind="regression"
              short_diag="$bmk failed to run"
              long_diag="$short_diag"
              ;;
            888888888:*)
              change_kind="improvement"
              short_diag="$bmk run OK"
              long_diag="$short_diag"
              ;;
            999999999:*)
              change_kind="improvement"
              short_diag="$bmk built and run OK"
              long_diag="$short_diag"
              ;;
          esac
          echo "$metric,$bmk,$symbol,$short_diag,$long_diag" \
            >> $run_step_artifacts/exe.$change_kind
        else
          short_diag="$bmk:$short_symbol $short_diag"
          long_diag="$bmk:$short_symbol $long_diag"
          echo "$metric,$bmk,$symbol,$short_diag,$long_diag" \
            >> $run_step_artifacts/$bmk.symbols-$change_kind
        fi

        if [ x"$bmk" != x"$prev_bmk" ]; then
          printf "++benchmarks %s " $bmk \
            >> $run_step_artifacts/extra-bisect-params
          prev_bmk="$bmk"
        fi
      else
        result=1
      fi
      echo "$bmk,$symbol,$result" >> $run_step_artifacts/results-compare.csv
    fi
    # Read from results-internal.csv instead of results.csv, because
    # the latter may contain commas in demangled symbol names, which will
    # interfere with parsing.
  done < <(tail -n +2 $run_step_artifacts/results-internal.csv)
  printf "\n" >> $run_step_artifacts/extra-bisect-params

  # Comparison is done. Below we generate the regression report.
  # [Elided in source: a series of here-documents -- one creating
  # $run_step_artifacts/mail-body.txt, eight appending to it, and one
  # creating $run_step_artifacts/mail-subject.txt -- whose bodies are
  # missing, along with the "if" condition and the first write creating
  # $run_step_artifacts/results.regressions.]
    echo "# $short_diag" >> $run_step_artifacts/results.regressions
  fi
  )
}

# Exit with code 0 if there is no regression compared to base-artifacts/.
# Inspect build results in ./results and performance results in ./results_id.
no_regression_p ()
{
  (
  set -euf -o pipefail

  no_build_regression_p "$@"

  local ref_artifacts=$1
  local new_artifacts=$2

  if ! [ -f "$ref_artifacts/results_id" ]; then
    return 0
  fi

  # Return early (with no regression) when the reference results are among
  # the missing reference results, which we have listed in
  # tcwg-benchmark-results.broken-list. Once all entries referencing missing
  # results are discarded, we'll remove this workaround.
  # Otherwise compare_results will fail while fetching baseline results,
  # and we'll consider this failure as a regression.
  if cat "$scripts/tcwg-benchmark-results.broken-list" \
       | grep -q "^$(cat $ref_artifacts/results_id)\$"; then
    return 0
  fi

  if ! [ -f "$new_artifacts/results_id" ]; then
    return 1
  fi

  # Make sure there is no stray results.regressions file, which we use
  # as a failure marker.
  # We can, potentially, call ${rr[no_regression_p]} several times in
  # a row during the update_baseline() step, but we should stop at the first
  # regression. Therefore, we should never see results.regressions exist.
  assert ! [ -f $run_step_artifacts/results.regressions ]

  local compare_opts=""
  case "$cflags" in
    *"_LTO"*) compare_opts="--num_symbols 0 --entry_threshold 10" ;;
  esac

  compare_results "$metric_id" "$ref_artifacts/results_id" \
    "$new_artifacts/results_id" "$compare_opts"

  if [ -f $run_step_artifacts/results.regressions ]; then
    return 1
  fi

  return 0
  )
}

# Implement the rr[breakup_changed_components] hook.
tcwg_bmk_breakup_changed_components ()
{
  (
  set -euf -o pipefail

  # Compiler changes tend to cause the most regressions.
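  # For example (hypothetical component list): if binutils, gcc and glibc
  # all changed in a gnu configuration, this hook prints two lines --
  # "gcc" and "binutils glibc" -- so that the compiler is tried on its
  # own before the remaining components are tried together.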
  # Break up the updated components into the compiler and the rest of the
  # components to reduce the number of builds.
  local cc
  case "${rr[toolchain]}" in
    llvm) cc="llvm" ;;
    gnu|gnu_eabi) cc="gcc" ;;
    *) assert false ;;
  esac
  if print_changed_components "\n" | grep -q "^$cc\$"; then
    echo "$cc"
    print_changed_components "\n" | grep -v "^$cc\$" | tr '\n' ' ' \
      | sed -e "s/ \$//g"
    echo
  else
    print_changed_components "\n"
  fi
  )
}
rr[breakup_changed_components]=tcwg_bmk_breakup_changed_components

run_step stop_on_fail -10 reset_artifacts
run_step stop_on_fail x prepare_abe
run_step skip_on_fail -9 build_abe binutils
run_step skip_on_fail -8 build_abe stage1 -- "${gcc_override_configure[@]}"
run_step skip_on_fail x clean_sysroot
case "${rr[components]}" in
  *glibc*)
    run_step skip_on_fail -7 build_abe linux
    run_step skip_on_fail -6 build_abe glibc
    ;;
  *newlib*)
    run_step skip_on_fail -6 build_abe newlib
    ;;
esac
run_step skip_on_fail -5 build_abe stage2 -- "${gcc_override_configure[@]}"
case "${rr[toolchain]}" in
  llvm)
    run_step skip_on_fail -3 build_llvm true
    ;;
esac
run_step skip_on_fail 1 benchmark -- "$cflags" ${rr[top_artifacts]}/results_id
run_step reset_on_fail x check_regression
run_step stop_on_fail x update_baseline

trap "" EXIT
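# Example invocation (hypothetical values; assumes the "--var value"
# argument convention handled by convert_args_to_variables), where
# ci_config follows {toolchain_name}-{toolchain_ver}-{target}-{bmk}-{cflags}:
#   ./tcwg_bmk-build.sh \
#     --rr[ci_project] tcwg_bmk-gnu \
#     --rr[ci_config] gnu-master-aarch64-spec2k6-O2 \
#     --rr[mode] build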