Diffstat (limited to 'tcwg_bmk-build.sh')
-rwxr-xr-x  tcwg_bmk-build.sh  993
1 file changed, 302 insertions(+), 691 deletions(-)
diff --git a/tcwg_bmk-build.sh b/tcwg_bmk-build.sh
index a5b60197..ce99257b 100755
--- a/tcwg_bmk-build.sh
+++ b/tcwg_bmk-build.sh
@@ -10,80 +10,66 @@ scripts=$(dirname $0)
 
 convert_args_to_variables "$@"
 
-obligatory_variables rr[ci_project] rr[ci_config] ssh_host ssh_port
+obligatory_variables rr[ci_project] rr[ci_config]
 
 declare -A rr
 
-# Execution mode: baseline, bisect, jenkins-full
-rr[mode]="${rr[mode]-baseline}"
+# All bmk config for hw and benchmarks is implemented in this file
+# shellcheck source=tcwg_bmk-config.sh
+. $scripts/tcwg_bmk-config.sh
+
+# Execution mode: build or bisect
+rr[mode]="${rr[mode]-build}"
 
 # Set custom revision for one of the projects, and use baseline revisions
 # for all other projects.
 rr[baseline_branch]="${rr[baseline_branch]-linaro-local/ci/${rr[ci_project]}/${rr[ci_config]}}"
-rr[update_baseline]="${rr[update_baseline]-update}"
+rr[update_baseline]="${rr[update_baseline]-ignore}"
 rr[top_artifacts]="${rr[top_artifacts]-$(pwd)/artifacts}"
 
-# Set metric to perf by default.
-rr[metric]="${rr[metric]-perf}"
-
-# {toolchain_name}-{toolchain_ver}-{target}-{bmk}-{cflags}
-IFS=- read -a ci_config <<EOF
-${rr[ci_config]}
+# The ${ci_project}--${ci_config} format is:
+# 'tcwg_bmk-#{PROFILE_NAME}-#{BMK}--#{TOOLCHAIN}-#{TARGET}-{toolchain_ver}-{cflags}'
+IFS=- read -a ci_pjt_cfg <<EOF
+${rr[ci_project]}--${rr[ci_config]}
 EOF
-rr[toolchain]=${rr[toolchain]-${ci_config[0]}}
-rr[target]=${rr[target]-${ci_config[2]}}
-benchmarks=("${benchmarks[@]-${ci_config[3]}}")
-if [ x"${benchmarks[*]}" = x"default" ]; then
-    benchmarks=("${ci_config[3]}")
-fi
-if ! test_array cflags; then
-    ci_config=("${ci_config[@]:4}")
-    # In ${ci_config[@]} we now have "-"-separated entries (due to IFS=- above).
-    # We restore "-" in compiler flags when doing flags="$flags-$flag" below.
-    # We use "_" to separate compiler options, and it is translated to " -"
-    # in benchmark().
-    cflags=()
-    while [ ${#ci_config[@]} -ge 1 ]; do
-        flags=""
-        while [ ${#ci_config[@]} -ge 1 ]; do
-            flag="${ci_config[0]}"
-            ci_config=("${ci_config[@]:1}")
-            if [ x"$flag" = x"vs" ]; then
-                break
-            fi
-            flags="$flags-$flag"
-        done
-        cflags+=("$flags")
-    done
-fi
+
+rr[toolchain]=${rr[toolchain]-${ci_pjt_cfg[4]}}
+rr[target]=${rr[target]-${ci_pjt_cfg[5]}}
+
+cflags="${cflags--${ci_pjt_cfg[7]}}"
 
 gcc_mode=""
-for i in $(seq 0 $(("${#cflags[@]}" - 1))); do
-    cflags_mode=""
-    if [[ x"${cflags[$i]}" == x*"VECT"* ]]; then
-        rr[metric]="vect"
-    fi
+case "${rr[target]}:$cflags" in
+    "arm:"*"mthumb"*) gcc_mode=thumb ;;
+    "arm:"*"marm"*) gcc_mode=arm ;;
+    "arm:-Os"*|"arm:-Oz"*)
+        gcc_mode=thumb
+        cflags="${cflags}_mthumb"
+        ;;
+    "arm:"*)
+        gcc_mode=arm
+        cflags="${cflags}_marm"
+        ;;
+    "arm_eabi:"*)
+        cflags="${cflags}_mthumb"
+        ;;
+esac
 
-    case "${rr[target]}:${cflags[$i]}" in
-        "arm:"*"mthumb"*) cflags_mode=thumb ;;
-        "arm:"*"marm"*) cflags_mode=arm ;;
-        "arm:-Os"*|"arm:-Oz"*)
-            cflags_mode=thumb
-            cflags[$i]="${cflags[$i]}_mthumb"
-            ;;
-        "arm:"*)
-            cflags_mode=arm
-            cflags[$i]="${cflags[$i]}_marm"
-            ;;
-        "arm_eabi:"*)
-            cflags[$i]="${cflags[$i]}_mthumb"
-            ;;
-    esac
-    if [ x"$gcc_mode" = x"" ]; then
-        gcc_mode="$cflags_mode"
-    elif [ x"$gcc_mode" != x"$cflags_mode" ]; then
-        assert_with_msg "Unsupported arm/thumb configuration ${cflags[$(($i - 1))]} and ${cflags[$i]}" false
-    fi
-done
+cflags="$(echo $cflags | sed -e "s/_/ -/g" -e "s/LTO/flto/g")"
+
+case "${rr[ci_project]}" in
+    *-*_size-*) rr[metric_id]="size" ;;
+    *-*_speed-*) rr[metric_id]="sample" ;;
+    *-*_vect-*) rr[metric_id]="num_vect_loops" ;;
+    *-*_sve-*) rr[metric_id]="num_sve_loops" ;;
+    *) assert_with_msg "Cannot determine metric from ${rr[ci_project]}" false ;;
+esac
+
+called_from_notify=${called_from_notify-false}
+
+hw=$(tcwg_bmk_hw)
+hw=${hw%_32} ; hw=${hw%_64}
+
+# -----------------------------------------------------------------------
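The new parsing splits "${ci_project}--${ci_config}" on "-", so the double dash produces an empty field and toolchain, target and cflags land at fixed indices. A minimal sketch of the behaviour (the configuration string below is a made-up example):

    str="tcwg_bmk-code_speed-cpu2017rate--llvm-aarch64-master-O2_LTO"
    IFS=- read -r -a f <<< "$str"
    # f: tcwg_bmk code_speed cpu2017rate "" llvm aarch64 master O2_LTO
    echo "toolchain=${f[4]} target=${f[5]} cflags=-${f[7]}"
    # "_" separates compiler options and "LTO" is shorthand for flto:
    echo "-${f[7]}" | sed -e "s/_/ -/g" -e "s/LTO/flto/g"    # -> "-O2 -flto"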
case "${rr[target]}" in "arm_eabi") gcc_override_configure+=("--set" "gcc_override_configure=--disable-multilib" - "--set" "gcc_override_configure=--with-cpu=cortex-m4" "--set" "gcc_override_configure=--with-mode=thumb" "--set" "gcc_override_configure=--with-float=hard" ) ;; esac + +rr[cpu]=$(tcwg_bmk_cpu) +if [ "${rr[cpu]}" != "" ]; then + gcc_override_configure+=("--set" "gcc_override_configure=--with-cpu=${rr[cpu]}") + cflags="$cflags -mcpu=${rr[cpu]}" +fi + case "${rr[toolchain]}" in llvm) - rr[components]="binutils gcc linux glibc llvm" ;; + rr[components]="llvm" ;; gnu) rr[components]="binutils gcc linux glibc" ;; gnu_eabi) @@ -137,24 +129,19 @@ trap print_traceback EXIT default_start_at="" default_finish_at="" case "${rr[mode]}" in - "baseline") - default_finish_at="update_baseline" - ;; "bisect") single_updated_component="$(print_single_updated_component)" case $single_updated_component in binutils) default_start_at="build_abe-binutils" ;; gcc) default_start_at="build_abe-stage1" ;; linux|glibc) default_start_at="clean_sysroot" ;; - llvm) default_start_at="build_llvm-true" ;; + llvm) default_start_at="build_bmk_llvm" ;; newlib) default_start_at="build_abe-newlib" ;; *) assert_with_msg \ - "Invalid single updated component \"$single_updated_component\"" false - ;; + "Invalid single updated component \"$single_updated_component\"" false + ;; esac - default_finish_at="check_regression" ;; - "jenkins-full") ;; esac if [ x"$start_at" = x"default" ]; then start_at="$default_start_at" @@ -163,15 +150,52 @@ if [ x"$finish_at" = x"default" ]; then finish_at="$default_finish_at" fi +case "${rr[ci_project]}/${rr[ci_config]}" in + tcwg_bmk-code_speed-cpu2017rate/gnu-aarch64-master-O2|\ + tcwg_bmk-code_speed-cpu2017rate/gnu-aarch64-master-O3|\ + tcwg_bmk-code_speed-cpu2017rate/llvm-aarch64-master-O2|\ + tcwg_bmk-code_speed-cpu2017rate/llvm-aarch64-master-O3) + rr[major]=3 + rr[minor]=0 + ;; + *) + rr[major]=2 + rr[minor]=3 + ;; +esac + run_step_init "$start_at" "$finish_at" "${rr[top_artifacts]}" "$verbose" +build_bmk_llvm () +{ + ( + set -euf -o pipefail + + local projects="clang;lld;openmp" + case "${rr[target]}" in + aarch64) + # Flang is not supported for AArch32 + projects="$projects;flang" + ;; + esac + + build_llvm "$projects" "" "${rr[metric_id]}" + + # Copy shared libraries to runtime sysroot dir + mkdir -p llvm-install/libc + rsync -a --del --include "*/" --include "*.so*" --exclude "*" \ + --delete-excluded llvm-install/lib/ llvm-install/libc/lib/ + ) +} + benchmark () { + obligatory_variables ssh_host ssh_port + ( set -euf -o pipefail - local bmk_cflags="$2" - local results_id_file="$3" + local bmk_flags="$2" sanity_check_pwd @@ -179,740 +203,327 @@ benchmark () rm -rf "$(pwd)"/bin mkdir "$(pwd)"/bin - local bmk_flags bmk_ldflags reboot run_profile - bmk_flags="$(echo $bmk_cflags | sed -e "s/_/ -/g" -e "s/LTO/flto/g" \ - -e "s/VECT/fdump-tree-vect-details/g")" - case "$bmk_cflags" in - "-Os"*|"-Oz"*) + local reboot run_profile + + local hw_tag + hw_tag=$(tcwg_bmk_hw) + + case "${rr[ci_project]}" in + *_size*|*_vect*|*_sve*) reboot=false run_profile="parallel" + testmode="verify" ;; *) reboot=true run_profile="serial" + testmode="benchmark" ;; esac - local bench_list bin cc gnu_host gnu_target sysroot toolchain - gnu_host=$(print_gnu_target native) - gnu_target=$(print_gnu_target ${rr[target]}) - sysroot="$(pwd)/abe/builds/destdir/$gnu_host/$gnu_target/libc" + local bench_list bin cc sysroot toolchain case "${rr[toolchain]}" in llvm) - local llvm_target - llvm_target=$(echo 
"$gnu_target" | sed -e "s/^arm-/armv7a-/") - bmk_flags="$bmk_flags --target=$llvm_target --sysroot=$sysroot" - bmk_ldflags="$bmk_flags" - # Use LLD for LLVM configurations. - # Also, BFD linker crashes for AArch32 LTO builds, - # see https://projects.linaro.org/browse/LLVM-562 . - case "$bmk_ldflags" in - *"-fuse-ld="*) ;; - *) bmk_ldflags="$bmk_ldflags -fuse-ld=lld" ;; - esac + sysroot="$(pwd)/llvm-install/libc" bin="$(pwd)/llvm-install/bin" cc="$bin/" toolchain="llvm" ;; gnu|gnu_eabi) - bmk_ldflags="$bmk_flags" + local gnu_host gnu_target + gnu_host=$(print_gnu_target native) + gnu_target=$(print_gnu_target ${rr[target]}) + sysroot="$(pwd)/abe/builds/destdir/$gnu_host/$gnu_target/libc" bin="$(pwd)/abe/builds/destdir/$gnu_host/bin" cc="$bin/$gnu_target-" toolchain="gnu" + # Append -fdump-statistics-asmname to obtain compile time metrics. + bmk_flags="$bmk_flags -fdump-statistics-asmname -fdump-tree-vect-details" ;; esac - case "${rr[toolchain]}:${benchmarks[*]}" in - llvm:spec2k6) bench_list="c_and_cxx" ;; - gnu:spec2k6) bench_list="all" ;; - llvm:spec2017) bench_list="spec2017_speed_nofortran" ;; - gnu:spec2017) bench_list="spec2017_speed" ;; - *) bench_list="${benchmarks[*]}" ;; - esac + + bench_list="$(tcwg_bmk_benchs)" + # shellcheck disable=SC2154 sysroot="ssh://$ssh_host:$ssh_port:$sysroot" - local hw_tag - case "${rr[ci_project]}:${rr[target]}" in - *_apm_32*:*) hw_tag=apm_32 ;; - *_apm_64*:*) hw_tag=apm_64 ;; - *_apm*:arm*) hw_tag=apm_32 ;; - *_apm*:aarch64) hw_tag=apm_64 ;; - *_sq_32*:*) hw_tag=sq_32 ;; - *_sq_64*:*) hw_tag=sq_64 ;; - *_sq*:arm*) hw_tag=sq_32 ;; - *_sq*:aarch64) hw_tag=sq_64 ;; - *_tk1_32*:*) hw_tag=tk1_32 ;; - *_tk1*:arm*) hw_tag=tk1_32 ;; - *_tx1_32*:*) hw_tag=tx1_32 ;; - *_tx1_64*:*) hw_tag=tx1_64 ;; - *_tx1*:arm*) hw_tag=tx1_32 ;; - *_tx1*:aarch64) hw_tag=tx1_64 ;; - *_stm32*:arm*) hw_tag=stm32 ;; - *) echo "ERROR: Unknown hw_tag for ${rr[ci_project]}:${rr[target]}"; exit 1 ;; - esac - local hw image_arch toolchain_proto + toolchain_proto=ssh case "$hw_tag" in - apm_32) hw=apm; image_arch=armhf ;; - apm_64) hw=apm; image_arch=arm64 ;; - sq_32) hw=sq; image_arch=armhf ;; - sq_64) hw=sq; image_arch=arm64 ;; - tk1_32) hw=tk1; image_arch=armhf ;; - tx1_32) hw=tx1; image_arch=armhf ;; - tx1_64) hw=tx1; image_arch=arm64 ;; stm32) hw=stm32; image_arch=amd64 # When running benchmarks on stm32, we prefer to rsync the # toolchain to the board's host machine -- dev-02.tcwglab. toolchain_proto=rsync ;; + *_32) hw=${hw_tag/_32}; image_arch=armhf ;; + *_64) hw=${hw_tag/_64}; image_arch=arm64 ;; *) echo "ERROR: Unknown hw_tag $hw_tag"; exit 1 ;; esac - local results_id="$hw_tag/${rr[ci_project]}/${rr[mode]}-${rr[ci_config]}/@build_num@" + # Create directory for tcwg-benchmark to upload results to. + # Note that files inside $results_dir will be owned by tcwg-benchmark. + local results_dir + results_dir="$(mktemp -d)" + chmod 0777 "$results_dir" + # Trigger benchmarking job and capture its console output. + # Ignore exit code of the trigger command to detect various failure + # conditions from examining the console log. 
+    # Create directory for tcwg-benchmark to upload results to.
+    # Note that files inside $results_dir will be owned by tcwg-benchmark.
+    local results_dir
+    results_dir="$(mktemp -d)"
+    chmod 0777 "$results_dir"
+
+    # Trigger benchmarking job and capture its console output.
+    # Ignore exit code of the trigger command to detect various failure
+    # conditions from examining the console log.
     # shellcheck disable=SC2154
     remote_exec "ci.linaro.org:2222::-l $USER@linaro.org" \
-        build tcwg-benchmark -w \
+        build tcwg-benchmark -f -v \
        -p bmk_hw=$hw \
        -p bench_list="$bench_list" \
        -p cflags="$bmk_flags" \
-        -p ldflags="$bmk_ldflags" \
-        -p testmode=benchmark \
-        -p displaytag="${rr[ci_project]}/${rr[mode]}-${rr[ci_config]}" \
+        -p ldflags="$bmk_flags" \
+        -p testmode="$testmode" \
+        -p displaytag="${rr[ci_project]}/${rr[ci_config]}-${rr[mode]}" \
        -p ignore_errors=true \
        -p toolchain_url=$toolchain_proto://$ssh_host:$ssh_port:$cc \
        -p toolchain_type=$toolchain \
        -p sysroot="$sysroot" \
-        -p results_id="$results_id" \
+        -p results_dest="$ssh_host:$ssh_port:$results_dir" \
        -p reboot="$reboot" \
        -p run_profile="$run_profile" \
        -p image_arch="$image_arch" \
        ${scripts_branch+-p scripts_branch="$scripts_branch"} \
        ${bmk_branch+-p bmk_branch="$bmk_branch"} \
-        | tee $run_step_artifacts/benchmark-start.log
+        | tee $run_step_artifacts/benchmark-build.log || true
 
     local build_num
-    build_num=$(cat $run_step_artifacts/benchmark-start.log \
-                    | sed -e "s/.*#\([0-9]\+\).*/\1/")
+    build_num=$(head -n1 $run_step_artifacts/benchmark-build.log \
+                    | sed -e "s/Started.*#\([0-9]\+\).*/\1/")
     assert_with_msg "Benchmark build number should not be 0!" \
-        [ "$build_num" -gt "0" ]
+                    [ "$build_num" -gt "0" ]
+
+    cat > $run_step_artifacts/benchmark_job.txt << EOF
+Url: https://ci.linaro.org/job/tcwg-benchmark/$build_num
+Name: $(curl -s "https://ci.linaro.org/job/tcwg-benchmark/$build_num/api/json?tree=displayName" \
+        | jq -r ".displayName")
+EOF
 
     local build_status
     local build_ret
     while true; do
-        (remote_exec "ci.linaro.org:2222::-l $USER@linaro.org" \
-             console tcwg-benchmark -n 1 -f $build_num || true) \
-            | tee -a $run_step_artifacts/benchmark.log
-
        # Ssh connection to ci.linaro.org occasionally drops. We need
        # to check whether benchmarking has finished, and, if not, continue
-        # to watch its output.
-        build_status=$(tail -n 1 $run_step_artifacts/benchmark.log)
+        # waiting.
+        build_status=$(curl -s \
+            "https://ci.linaro.org/job/tcwg-benchmark/$build_num/api/json?tree=result" \
+            | jq -r ".result")
        case "$build_status" in
-            "Finished: SUCCESS")
+            "null")
+                # Continue waiting
+                true
+                ;;
+            "SUCCESS")
                build_ret=0
                break
                ;;
-            "Finished: "*)
-                echo "# Benchmarking infra is offline:" >> ${rr[top_artifacts]}/results
+            *)
+                echo "# Benchmarking infra is offline:" \
+                    >> ${rr[top_artifacts]}/results
                echo "-$EXTERNAL_FAIL" >> ${rr[top_artifacts]}/results
                build_ret=1
                break
                ;;
        esac
-        # Sleep a little to avoid flooding ci.linaro.org on transient ssh
-        # failures.
-        sleep 5
+        # Wait by following console output
+        (ssh -p2222 -l $USER@linaro.org ci.linaro.org \
+             console tcwg-benchmark $build_num -f || true) \
+            | tee $run_step_artifacts/benchmark-wait.log
    done
 
-    echo "$results_id" | sed -e "s/@build_num@/$build_num/g" \
-         > "$results_id_file"
+    rm -rf "${rr[top_artifacts]}/annex"
+    mkdir "${rr[top_artifacts]}/annex"
+    ln -s "$results_dir" "${rr[top_artifacts]}/annex/bmk-data"
+
     return $build_ret
     )
 }
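The wait loop above no longer scrapes "Finished: ..." lines from the console log; it asks the Jenkins JSON API, where "result" is null while a build is still running. A standalone sketch of the same polling idea (the function name and sleep interval are assumptions, not part of the script):

    poll_jenkins_result ()
    {
        local url="https://ci.linaro.org/job/tcwg-benchmark/$1"
        local status
        while true; do
            # jq -r prints JSON null as the literal string "null".
            status=$(curl -s "$url/api/json?tree=result" | jq -r ".result")
            [ "$status" != "null" ] && break
            sleep 60
        done
        echo "$status"    # SUCCESS, FAILURE, ABORTED, ...
    }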
 
-# Compare results obtained from perf data between $1 and $2
-# and generate results-compare.csv
-compare_results_perf ()
+# Exit with code 0 if no regression compared to base-artifacts/.
+no_regression_p ()
 {
     (
     set -euf -o pipefail
 
-    local exe_threshold symbol_threshold
-    case "${cflags[0]}" in
-        "-Os"*|"-Oz"*)
-            # We use 1% tolerance for binary size
-            # and 10% tolerance for symbol size.
-            exe_threshold=1
-            symbol_threshold=10
-            ;;
-        *)
-            # We use 3% tolerance for binary speed
-            # and 15% tolerance for symbol speed.
-            exe_threshold=3
-            symbol_threshold=15
-            # Reduce thresholds when bisecting to avoid considering borderline
-            # regressions as spurious. This should break cycles of build and
-            # bisect jobs triggering each other on borderline regressions.
-            if [ x"${rr[mode]}" = x"bisect" ]; then
-                exe_threshold=2
-                symbol_threshold=10
-            fi
-            ;;
-    esac
-
-    local -a arr
-    local metric bmk symbol rtime rsize time1 time2 size1 size2
-    local regression short_symbol short_regression
-    local result prev_bmk
-    echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv
-    printf "extra_build_params=" > $run_step_artifacts/extra-bisect-params
-
-    assert_with_msg "Found stale regression files" \
-        [ x"$(find $run_step_artifacts/ -name "*.regression" | wc -l)" = x"0" ]
-
-    local metric_id regressed_by
-    case "${cflags[0]}" in
-        "-Os"*|"-Oz"*)
-            metric_id="size"
-            regressed_by="grew in size by"
-            ;;
-        *)
-            metric_id="time"
-            regressed_by="slowed down by"
-            ;;
-    esac
-
-    # Read result lines from <(tail -n +2 ...) below.
-    # "-n +2" is to skip the header line.
-    prev_bmk=""
-    while IFS=, read -a arr; do
-        bmk=${arr[0]}
-        symbol=${arr[1]}
-        rtime=${arr[2]}
-        rsize=${arr[3]}
-        time1=${arr[4]}
-        time2=${arr[5]}
-        size1=${arr[6]}
-        size2=${arr[7]}
-
-        case $metric_id in
-            size) metric=$rsize ;;
-            time) metric=$rtime ;;
-            *) assert false ;;
-        esac
-
-        # Skip case where we have no info ("n/a")
-        if [ "$metric" != "n/a" ]; then
-            metric=$(($metric - 100))
-            # Remove padding from the tail of $symbol (padding is added by
-            # csvs2table.py for better formatting).
-            short_symbol="$(echo "$symbol" | sed -e "s/ *\$//")"
-
-            local bmk_exe
-            case "$short_symbol" in
-                "["*) bmk_exe=false ;;
-                *"_base.default") bmk_exe=true ;;
-                *) bmk_exe=false ;;
-            esac
-
-            local threshold
-            if $bmk_exe; then
-                threshold=$exe_threshold
-            else
-                threshold=$symbol_threshold
-            fi
-
-            if ! [ "$metric" -le "$threshold" ]; then
-                result=100
-
-                case $metric_id in
-                    size)
-                        short_regression="$regressed_by ${metric}%"
-                        regression="$short_regression from $size1 to $size2 bytes"
-                        ;;
-                    time)
-                        short_regression="$regressed_by ${metric}%"
-                        regression="$short_regression from $time1 to $time2 perf samples" ;;
-                    *) assert false ;;
-                esac
-                if $bmk_exe; then
-                    short_regression="$bmk $short_regression"
-                    regression="$bmk $regression"
-                    # Detect magic sample counts that indicate failure to build
-                    # and failure to run
-                    case "$time2" in
-                        888888888)
-                            short_regression="$bmk failed to run correctly"
-                            regression="$short_regression"
-                            ;;
-                        999999999)
-                            short_regression="$bmk failed to build"
-                            regression="$short_regression"
-                            ;;
-                    esac
-                    echo "$metric,$bmk,$symbol,$short_regression,$regression" >> $run_step_artifacts/exe.regressions
-                else
-                    short_regression="$bmk:$short_symbol $short_regression"
-                    regression="$bmk:$short_symbol $regression"
-                    echo "$metric,$bmk,$symbol,$short_regression,$regression" >> $run_step_artifacts/$bmk.regression
-                fi
-                if [ x"$bmk" != x"$prev_bmk" ]; then
-                    printf "++benchmarks %s " $bmk >> $run_step_artifacts/extra-bisect-params
-                    prev_bmk="$bmk"
-                fi
-            else
-                result=1
-            fi
-            echo "$bmk,$symbol,$result" >> $run_step_artifacts/results-compare.csv
-        fi
-    done < <(tail -n +2 $run_step_artifacts/results.csv)
-    printf "\n" >> $run_step_artifacts/extra-bisect-params
-
-    # Comparison is done. Below we generate regression report.
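The removed loop worked on relative columns where 100 means "equal to baseline", so subtracting 100 yields the percentage delta that is checked against the tolerance. In outline (values are illustrative):

    rel=103                  # new/old * 100, as read from results.csv
    delta=$((rel - 100))     # 3% worse than baseline
    threshold=3              # exe_threshold for speed runs
    if [ "$delta" -le "$threshold" ]; then
        echo "within tolerance"
    else
        echo "regression"
    fi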
-    cat > $run_step_artifacts/jira-body.txt <<EOF
-After \$COMMIT_COMPONENT \$COMMIT_LOG
-EOF
-    if [ -f $run_step_artifacts/exe.regressions ]; then
-        sort -gr -o $run_step_artifacts/exe.regressions \
-             $run_step_artifacts/exe.regressions
-
-        cat >> $run_step_artifacts/jira-body.txt <<EOF
-
-the following benchmarks $regressed_by more than ${exe_threshold}%:
-EOF
-        local exe
-        while IFS=, read metric exe symbol short_regression regression; do
-            cat >> $run_step_artifacts/jira-body.txt <<EOF
-- $regression
-EOF
-            if [ -f $run_step_artifacts/$exe.regression ]; then
-                while IFS=, read metric bmk symbol short_regression regression; do
-                    cat >> $run_step_artifacts/jira-body.txt <<EOF
-  - $regression
-EOF
-                done < $run_step_artifacts/$exe.regression
-                # Delete $bmk.regressions so that it doesn't show up
-                # in symbol-regression loop below.
-                rm $run_step_artifacts/$exe.regression
-            fi
-        done < $run_step_artifacts/exe.regressions
-    fi
-
-    find $run_step_artifacts/ -name "*.regression" -print0 | xargs -0 cat \
-        | sort -gr -o $run_step_artifacts/symbol.regressions
-    if [ x"$(cat $run_step_artifacts/symbol.regressions)" = x"" ]; then
-        # Delete empty file
-        rm $run_step_artifacts/symbol.regressions
-    fi
+    # check score-based regression
+    no_build_regression_p "$@"
 
-    if [ -f $run_step_artifacts/symbol.regressions ]; then
-        cat >> $run_step_artifacts/jira-body.txt <<EOF
+    # At this stage, there's no score-based regression.
+    # We are now checking metric-based regression.
 
-the following hot functions $regressed_by more than ${symbol_threshold}% (but their benchmarks $regressed_by less than ${exe_threshold}%):
-EOF
-        while IFS=, read metric bmk symbol short_regression regression; do
-            cat >> $run_step_artifacts/jira-body.txt <<EOF
-- $regression
-EOF
-        done < $run_step_artifacts/symbol.regressions
-    fi
+    assert_with_msg "Benchmarking succeeded, but bmk-data is missing" \
+                    [ -e $run_step_top_artifacts/annex/bmk-data ]
 
-    cp $run_step_artifacts/jira-body.txt $run_step_artifacts/mail-body.txt
+    # Make sure there is no stray results.regression file, which we use
+    # as failure marker.
+    assert ! [ -f $run_step_artifacts/results.regressions ]
 
-    local bmk_suite="" publish_save_temps=false
-    case "${benchmarks[*]}" in
-        coremark) bmk_suite="EEMBC CoreMark" ;;
-        spec2k6|4*)
-            bmk_suite="SPEC CPU2006"
-            publish_save_temps=true
-            ;;
-        spec2017|5*|6*)
-            bmk_suite="SPEC CPU2017"
-            publish_save_temps=true
-            ;;
+    local compare_opts=""
+    case "${rr[target]}:$cflags" in
+        "arm_eabi":*) compare_opts="--has_perf_logs no" ;;
+        *) compare_opts="" ;;
     esac
 
-    cat >> $run_step_artifacts/mail-body.txt <<EOF
-
-Below reproducer instructions can be used to re-build both "first_bad" and "last_good" cross-toolchains used in this bisection. Naturally, the scripts will fail when triggering benchmarking jobs if you don't have access to Linaro TCWG CI.
-EOF
-
-    # Copy save-temps tarballs to artifacts, so that they are accessible.
-    # We can publish pre-processed source only for benchmarks derived from
-    # open-source projects.
-    # Note that we include save-temps artifacts for successful builds so that
-    # "last_good" build has the artifacts.
-    if $publish_save_temps; then
-        mkdir -p $run_step_artifacts/top-artifacts
-        local s_t
-        while read s_t; do
-            rsync -a "$s_t/" $run_step_artifacts/top-artifacts/save-temps/
-        done < <(find results-1 -type d -name "save.*.temps")
+    if [ -f /usr/lib/linux-tools/install-armhf-perf-workaround.sh ]; then
+        # FIXME:
+        # In some cases perf report crashes when run from armhf container on
+        # ARMv8 machine.
+        # Install a workaround while we are investigating the cause.
+        sudo /usr/lib/linux-tools/install-armhf-perf-workaround.sh
     fi
 
-    if [ -d $run_step_artifacts/top-artifacts/save-temps/ ]; then
-        cat >> $run_step_artifacts/mail-body.txt <<EOF
-
-For your convenience, we have uploaded tarballs with pre-processed source and assembly files at:
-- First_bad save-temps: \$FIRST_BAD_ARTIFACTS/save-temps/
-- Last_good save-temps: \$LAST_GOOD_ARTIFACTS/save-temps/
-- Baseline save-temps: \$BASELINE_ARTIFACTS/save-temps/
-EOF
-    fi
+    local new_results="${rr[top_artifacts]}/annex/bmk-data"
+    local ref_results="base-artifacts/annex/bmk-data"
 
-    local compiler="" libc="" linker="" version="" target="" bmk_flags="" hw=""
-    case "${rr[toolchain]}" in
-        gnu)
-            compiler="GCC"
-            libc="Glibc"
-            linker="GNU Linker"
-            ;;
-        gnu_eabi)
-            compiler="GCC"
-            libc="Newlib"
-            linker="GNU LD"
-            ;;
-        llvm)
-            compiler="Clang"
-            libc="Glibc"
-            linker="LLVM Linker"
-            ;;
-    esac
-    case "${rr[ci_config]}" in
-        *-master-*) version="tip of trunk" ;;
-        *-release-*) version="latest release branch" ;;
-    esac
-    target=$(print_gnu_target ${rr[target]})
-    bmk_flags=$(echo "${cflags[0]}" | sed -e "s/_/ -/g" -e "s/LTO/flto/g" \
-                    -e "s/VECT/fdump-tree-vect-details/g")
-    case "${rr[ci_project]}" in
-        *_apm*) hw="APM Mustang 8x X-Gene1" ;;
-        *_tk1*) hw="NVidia TK1 4x Cortex-A15" ;;
-        *_tx1*) hw="NVidia TX1 4x Cortex-A57" ;;
-        *_stm32*) hw="STMicroelectronics STM32L476RGTx 1x Cortex-M4" ;;
-    esac
+    assert_with_msg "Benchmarking succeeded, but no annex/bmk-data results" \
+                    [ -d "$new_results" ]
 
-    cat >> $run_step_artifacts/mail-body.txt <<EOF
-
-Configuration:
-- Benchmark: $bmk_suite
-- Toolchain: $compiler + $libc + $linker
-- Version: all components were built from their $version
-- Target: $target
-- Compiler flags: $bmk_flags
-- Hardware: $hw
-
-This benchmarking CI is work-in-progress, and we welcome feedback and suggestions at linaro-toolchain@lists.linaro.org . In our improvement plans is to add support for SPEC CPU2017 benchmarks and provide "perf report/annotate" data behind these reports.
-EOF
-
-    # Generate mail subject
-    if [ -f $run_step_artifacts/exe.regressions ]; then
-        IFS=, read metric bmk symbol short_regression regression \
-            < <(head -n1 $run_step_artifacts/exe.regressions)
-    elif [ -f $run_step_artifacts/symbol.regressions ]; then
-        IFS=, read metric bmk symbol short_regression regression \
-            < <(head -n1 $run_step_artifacts/symbol.regressions)
-    else
-        # Exit with no regressions
-        return 0
+    if ! [ -d "$ref_results" ]; then
+        # base-artifacts has no reference results.
+        # This can happen on init build (update_baseline=init).
+        # In such cases we compare results to themselves just as an exercise.
+ ref_results="$new_results" + assert_with_msg "No reference results" \ + [ "${rr[update_baseline]}" = "init" ] fi - cat > $run_step_artifacts/mail-subject.txt <<EOF -[TCWG CI] $short_regression after \$COMMIT_COMPONENT: \$COMMIT_SUBJECT -EOF - cat $run_step_artifacts/jira-body.txt \ - | sed -e "s/^/# /" > $run_step_artifacts/results.regressions - echo "# $short_regression" >> $run_step_artifacts/results.regressions - ) -} - -compare_results_vect () -{ - ( - set -euf -o pipefail - echo "bmk,symbol,result" > $run_step_artifacts/results-compare.csv - - while IFS=, read -a arr; do - bmk=${arr[0]} - # hack to trim padding - symbol=$(echo ${arr[1]} | xargs) - base_num_vect_loops=${arr[3]} - target_num_vect_loops=${arr[4]} - if (( base_num_vect_loops > target_num_vect_loops )); then - echo "$bmk, $symbol, $base_num_vect_loops, $target_num_vect_loops" \ - >> $run_step_artifacts/results-compare.csv - fi - done < <(tail -n +2 $run_step_artifacts/results.csv) - ) -} - -compare_results () -{ - ( - set -euf -o pipefail - - local metric=$1 - local ref_results_id="$2" - local new_results_id="$3" - local cmp_options="$4" - - local results_ref results_new - results_ref=$(cat $ref_results_id) - results_new=$(cat $new_results_id) - - case "${rr[target]}" in - "arm_eabi") - cmp_options="$cmp_options --has_perf_logs no" - ;; - esac - + # Compare vs previous run + mkdir -p ${rr[top_artifacts]}/results-vs-prev + ln -s ../results-vs-prev $run_step_artifacts/results-vs-prev $scripts/tcwg-benchmark-results.sh \ - --results_ref $results_ref ++results $results_new \ - --top_artifacts "$run_step_artifacts" --verbose $verbose \ - --metric "$metric" $cmp_options \ - > $run_step_artifacts/results.log 2>&1 - - case $metric in - "perf") - compare_results_perf - ;; - "vect") - compare_results_vect - ;; - *) - echo "Invalid metric: $metric"; - exit 1 - ;; - esac - ) -} - -# Exit with code 0 if no new regressions between results_id-1 and -2 compared to -# regression between results_id-1 and -2 in base-artifacts/. -no_regression_vs_p () -{ - ( - set -euf -o pipefail - - local ref_artifacts=$1 - local new_artifacts=$2 - - # Check for build and correctness regressions. - no_build_regression_p "$@" - - # Generate ref-results-compare.csv. The value of "1" means that the result - # in the 2nd run is no worse than the result in the 1st run (as expected). - # The value of "100" means that the result in the 2nd run is worse than - # the result in the 1st run (unexpected). - # Note that we can grab previously-generated ref-results-compares.csv from - # base-artifacts/, but it could have been generated with an older version - # of scripts, so it's safer and more resilient to re-generate it from original - # perf data. - if [ ! -f "$ref_artifacts/results_id-1" ] || [ ! -f "$ref_artifacts/results_id-2" ]; then - return 0 + --results_ref "$ref_results" ++results "$new_results" \ + --top_artifacts "${rr[top_artifacts]}/results-vs-prev" \ + --verbose $verbose --hw_tag "$(tcwg_bmk_hw)" \ + $compare_opts \ + > ${rr[top_artifacts]}/results-vs-prev/tcwg-benchmark-results.log 2>&1 & + + local res + res=0 && wait $! || res=$? + if [ $res != 0 ]; then + return $EXTERNAL_FAIL fi - # <Workaround> missing reference results, which we have listed in - # tcwg-benchmark-results.broken-list. Once all entries referencing missing - # results are discarded, we'll remove this workaround. - # Otherwise compare_results will fail while fetching baseline results, - # and we'll consider this failure as a regression. 
- if cat "$scripts/tcwg-benchmark-results.broken-list" \ - | grep -q "^$(cat $ref_artifacts/results_id-1)\$\|^$(cat $ref_artifacts/results_id-2)\$"; then - return 0 - fi - # </Workaround> - compare_results "${rr[metric]}" "$ref_artifacts/results_id-1" "$ref_artifacts/results_id-2" \ - "--num_dsos 1 --num_symbols 0" - while IFS= read -r -d '' i - do - mv $i "$(dirname $i)"/ref-"$(basename $i)" - done < <(find $run_step_artifacts/ -type f -name "results*" -print0) + # Below call to output-bmk-results.py creates *.regression files. + assert_with_msg "Found stale regression files" \ + [ x"$(find $run_step_artifacts/ -name "*.regression" | wc -l)" = x"0" ] - # Similarly, generate new-results-compare.csv. - if [ ! -f "$new_artifacts/results_id-1" ] || [ ! -f "$new_artifacts/results_id-2" ]; then - return 1 - fi - compare_results "${rr[metric]}" "$new_artifacts/results_id-1" "$new_artifacts/results_id-2" \ - "--num_dsos 1 --num_symbols 0" - while IFS= read -r -d '' i - do - mv $i "$(dirname $i)"/new-"$(basename $i)" - done < <(find $run_step_artifacts/ -type f -name "results*" -print0) - - # Now compare the two reports. - # If "ref" has value of "100" (bad state), and "new" has value of "100" - # (also bad state), then we get no change, no regression, and final value - # of 100% * 100/100 == 100. - # - # If "ref" has value of "1" (good state), and "new" has value of "1" - # (also good state), then we get no change, no regression, and final value - # of 100% * 1/1 == 100. - # - # If "ref" has value of "100" (bad state), and "new" has value of "1" - # (good state), then we get a progression, and final value - # of 100% * 1/100 == 1. - # - # If "ref" has value of "1" (good state), and "new" has value of "100" - # (bad state), then we get a regression, and final value - # of 100% * 100/1 == 10000. We detect this below by comparing vs "5000". - $scripts/../bmk-scripts/csvs2table.py -p 0 --relative $run_step_artifacts/ref-results-compare.csv $run_step_artifacts/new-results-compare.csv > $run_step_artifacts/results-compare.csv - - local -a arr - local bmk symbol result status prev_bmk - local -a bisect_bmks - - # Read result lines from <(tail -n +2 ...) below. - # "-n +2" is to skip the header line. Set $status to "1" if there is - # a regression. - status=0 - prev_bmk="" - # Delete results.regressions generated by compare_results() calls above. - rm -f $run_step_artifacts/results.regressions - while IFS=, read -a arr; do - bmk=${arr[0]} - symbol=${arr[1]} - result=${arr[2]} - if ! [ "$result" -le "5000" ]; then - echo "# $bmk,$symbol regressed" >> $run_step_artifacts/results.regressions - status=1 - if [ x"$bmk" != x"$prev_bmk" ]; then - bisect_bmks+=("++benchmarks" "$bmk") - prev_bmk="$bmk" - fi + # Extract 5 most recent compare-results-vs-prev-internal.csv files from + # base-artifacts and compute std deviation out of them + local -a csvs_paths + csvs_paths=("results-vs-prev/compare-results-internal.csv" + "$(basename $run_step_artifacts)/compare-results-vs-prev-internal.csv") + + local -a history_csvs + local csv history_root="" + while read csv; do + if [ "$history_root" = "" ]; then + history_root="$csv" + continue + fi + + history_csvs+=("$csv") + done < <(get_git_history -0 base-artifacts "${csvs_paths[@]}") + + local csv tmpf + local -a compare_results_list=() + tmpf=$(mktemp) + + # FIXME: + # To deal with some differences along base-artifacts recent history + # - remove 'Failed for column' message from csv file + # - skip emtpy csv files. 
+
+    local csv tmpf
+    local -a compare_results_list=()
+    tmpf=$(mktemp)
+
+    # FIXME:
+    # To deal with some differences along base-artifacts recent history
+    # - remove 'Failed for column' message from csv file
+    # - skip empty csv files.
+    for csv in "${history_csvs[@]}"; do
+        grep -v 'Failed for column' "$csv" > "$tmpf" || true
+        cp "$tmpf" "$csv"
+        if [ -s "$csv" ]; then
+            compare_results_list+=("$csv")
        fi
-    done < <(tail -n +2 $run_step_artifacts/results-compare.csv)
-    echo "extra_build_params=${bisect_bmks[*]}" > $run_step_artifacts/extra-bisect-params
-    return $status
-    )
-}
+    done
 
-# Exit with code 0 if no regression compared to base-artifacts/.
-# Inspect build results ./results and performance results in ./results_id.
-no_regression_to_base_p ()
-{
-    (
-    set -euf -o pipefail
+    if [ ${#compare_results_list[@]} != 0 ]; then
+        $scripts/../bmk-scripts/compute-variability.py \
+            --inputs "${compare_results_list[@]}" ${rr[top_artifacts]}/results-vs-prev/compare-results-internal.csv \
+            --weights linear --method avg \
+            --output ${rr[top_artifacts]}/results-vs-prev/bmk-specific-variability-avg.csv || true
 
-    no_build_regression_p "$@"
+        $scripts/../bmk-scripts/compute-variability.py \
+            --inputs "${compare_results_list[@]}" ${rr[top_artifacts]}/results-vs-prev/compare-results-internal.csv \
+            --weights 2-peaks-linear --method max \
+            --output ${rr[top_artifacts]}/results-vs-prev/bmk-specific-variability-max.csv || true
+    fi
 
-    local ref_artifacts=$1
-    local new_artifacts=$2
+    rm -rf "$history_root" "$tmpf"
 
-    if ! [ -f "$ref_artifacts/results_id" ]; then
-        return 0
-    fi
-    # <Workaround> missing reference results, which we have listed in
-    # tcwg-benchmark-results.broken-list. Once all entries referencing missing
-    # results are discarded, we'll remove this workaround.
-    # Otherwise compare_results will fail while fetching baseline results,
-    # and we'll consider this failure as a regression.
-    if cat "$scripts/tcwg-benchmark-results.broken-list" \
-           | grep -q "^$(cat $ref_artifacts/results_id)\$"; then
-        return 0
-    fi
-    # </Workaround>
-    if ! [ -f "$new_artifacts/results_id" ]; then
-        return 1
-    fi
+    $scripts/../bmk-scripts/output-bmk-results.py \
+        --compare_results ${rr[top_artifacts]}/results-vs-prev/compare-results-internal.csv \
+        --variability_file ${rr[top_artifacts]}/results-vs-prev/bmk-specific-variability-avg.csv \
+        --variability_file_data "avg" \
+        --run_step_dir "$run_step_artifacts"/ \
+        --metric "${rr[metric_id]}" --mode "${rr[mode]}" \
+        --details quiet > $run_step_artifacts/output-bmk-results.log
 
-    # Make sure there is no stray results.regression file, which we use
-    # as failure marker.
-    # We can, potentially, call ${rr[no_regression_p]} several times in
-    # a row during update_baseline() step, but we should stop at the first
-    # regression. Therefore, we should never see results.regressions exist.
     assert ! [ -f $run_step_artifacts/results.regressions ]
 
-    local compare_opts=""
-    case "${cflags[0]}" in
-        *"_LTO"*) compare_opts="--num_symbols 0 --entry_threshold 10" ;;
-    esac
-    compare_results "${rr[metric]}" "$ref_artifacts/results_id" "$new_artifacts/results_id" "$compare_opts"
+    # copy inputs useful to build the mail / jira / .. to mail dir
+    for resfile in $run_step_artifacts/{exe,symbol}.{regression,improvement}; do
+        if [ -f $resfile ]; then
+            cp $resfile ${rr[top_artifacts]}/notify/
+        fi
+    done
 
+    # return status relies on the presence of the results.regressions file
     if [ -f $run_step_artifacts/results.regressions ]; then
+        assert_with_msg "Found a regression while comparing the build against itself" \
+                        [ "$ref_results" != "$new_results" ]
        return 1
    fi
 
    return 0
    )
 }
 
-# Implement rr[breakup_updated_components] hook.
-tcwg_bmk_breakup_updated_components ()
-{
-    (
-    set -euf -o pipefail
-
-    # Compiler changes tend to cause the most regressions.
-    # Breakup updated components into compiler and the rest of components
-    # to reduce the number of builds.
-    local cc
-    case "${rr[toolchain]}" in
-        llvm) cc="llvm" ;;
-        gnu|gnu_eabi) cc="gcc" ;;
-        *) assert false ;;
-    esac
-
-    if print_updated_components "\n" | grep -q "^$cc\$"; then
-        echo "$cc"
-        print_updated_components "\n" | grep -v "^$cc\$" | tr '\n' ' ' | sed -e "s/ \$//g"
-        echo
-    else
-        print_updated_components "\n"
-    fi
-    )
-}
-rr[breakup_updated_components]=tcwg_bmk_breakup_updated_components
-
-run_step stop_on_fail -10 reset_artifacts
-run_step stop_on_fail x prepare_abe
-run_step skip_on_fail -9 build_abe binutils
-run_step skip_on_fail -8 build_abe stage1 -- "${gcc_override_configure[@]}"
-run_step skip_on_fail x clean_sysroot
-case "${rr[components]}" in
-    *glibc*)
-        run_step skip_on_fail -7 build_abe linux
-        run_step skip_on_fail -6 build_abe glibc
+# Compiler changes tend to cause the most regressions.
+# Breakup updated components into compiler and the rest of components
+# to reduce the number of builds.
+case "${rr[toolchain]}" in
+    llvm)
+        rr[breakup_changed_components]="breakup_changed_components llvm"
        ;;
-    *newlib*)
-        run_step skip_on_fail -6 build_abe newlib
+    gnu|gnu_eabi)
+        rr[breakup_changed_components]="breakup_changed_components gcc"
        ;;
+    *) assert false ;;
 esac
 
-patch_branch=""
-if [ x"${rr[metric]}" = x"vect" ]; then
-    patch_branch="--patch linaro-local/vect-metric-branch"
-fi
-
-run_step skip_on_fail -5 build_abe stage2 -- $patch_branch "${gcc_override_configure[@]}"
-
+run_step stop_on_fail -10 reset_artifacts
 case "${rr[toolchain]}" in
-    llvm) run_step skip_on_fail -3 build_llvm true ;;
-esac
-case "${#cflags[@]}" in
-    2)
-        # Don't bisect benchmark build/run failures in *-vs-* configurations.
-        # Bisections happen only for regressions with build scores >=0,
-        # which will happen if benchmark "${cflags[1]}" succeeds.
-        run_step skip_on_fail -1 benchmark -- "${cflags[0]}" ${rr[top_artifacts]}/results_id-1
-        run_step skip_on_fail 0 benchmark -- "${cflags[1]}" ${rr[top_artifacts]}/results_id-2
-        # Set final "build" score to "1" for compatibility with older results
-        run_step skip_on_fail 1 true
-        rr[no_regression_p]=no_regression_vs_p
-        run_step reset_on_fail x check_regression
+    gnu*)
+        run_step stop_on_fail x prepare_abe
+        run_step skip_on_fail -9 build_abe binutils
+        run_step skip_on_fail -8 build_abe stage1 -- \
+                 "${gcc_override_configure[@]}"
+        run_step skip_on_fail x clean_sysroot
+        case "${rr[components]}" in
+            *glibc*)
+                run_step skip_on_fail -7 build_abe linux
+                run_step skip_on_fail -6 build_abe glibc
+                ;;
+            *newlib*)
+                run_step skip_on_fail -6 build_abe newlib
+                ;;
+        esac
+        run_step skip_on_fail -5 build_abe stage2 -- \
+                 "${gcc_override_configure[@]}"
        ;;
-    1)
-        # Bisect benchmark build/run failures in non-vs configurations.
-        # Set score to "0" with "true".
-        run_step skip_on_fail 0 true
-        run_step skip_on_fail 1 benchmark -- "${cflags[0]}" ${rr[top_artifacts]}/results_id
-        rr[no_regression_p]=no_regression_to_base_p
-        run_step reset_on_fail x check_regression
+    llvm)
+
+        run_step skip_on_fail -3 build_bmk_llvm
        ;;
 esac
-run_step stop_on_fail x update_baseline
-run_step stop_on_fail x push_baseline
+run_step skip_on_fail 1 benchmark -- "$cflags"
+run_step reset_on_fail x check_regression
 
 trap "" EXIT
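Net effect on the step sequence: gnu/gnu_eabi configurations run reset_artifacts -> prepare_abe -> build_abe binutils -> stage1 -> clean_sysroot -> linux + glibc (or newlib) -> stage2, while llvm configurations only need reset_artifacts -> build_bmk_llvm; both then share a single benchmark step with one "$cflags" value and a final check_regression, replacing the old one- and two-cflags ("-vs-") variants.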