4 files changed, 49 insertions, 30 deletions
diff --git a/jenkins-helpers.sh b/jenkins-helpers.sh
index 473d16e2..9c0e796a 100644
--- a/jenkins-helpers.sh
+++ b/jenkins-helpers.sh
@@ -1241,9 +1241,13 @@ run_step ()
 	local skip=false
 	case "$run_step_status:$run_mode" in
 	    0:*) ;;
+	    $EXTERNAL_FAIL:stop_on_fail)
+		echo "STOPPING before ${step[*]} due to previous external failure"
+		return $EXTERNAL_FAIL
+		;;
 	    *:stop_on_fail)
-		echo "STOPPING before ${step[*]} due to previous failure"
-		return $EXPECTED_EXIT
+		echo "STOPPING before ${step[*]} due to previous internal failure"
+		return $INTERNAL_FAIL
 		;;
 	    *:skip_on_fail)
 		echo "SKIPPING ${step[*]} due to previous failure"
@@ -1277,9 +1281,13 @@ run_step ()
 
 	    case "$run_step_status:$run_mode" in
 		0:*) ;;
+		$EXTERNAL_FAIL:stop_on_fail|$EXTERNAL_FAIL:reset_on_fail)
+		    echo "STOPPING at ${step[*]} due to external failure"
+		    return $EXTERNAL_FAIL
+		    ;;
 		*:stop_on_fail|*:reset_on_fail)
-		    echo "STOPPING at ${step[*]} due to failure"
-		    return $EXPECTED_EXIT
+		    echo "STOPPING at ${step[*]} due to internal failure"
+		    return $INTERNAL_FAIL
 		    ;;
 		*:skip_on_fail)
 		    echo "CARRYING ON after failure in ${step[*]}"
@@ -1312,30 +1320,41 @@ EOF
 # trap 'print_traceback' EXIT
 # Then remove the trap at the end of your script:
 # trap "" EXIT
-# Use EXPECTED_EXIT in your code to mark a non zero
-# return as expected.
-# return $EXPECTED_EDIT
-# Anything other return code will get you a
+# Use $INTERNAL_FAIL in your code to mark a non-zero return from
+# an expected internal failure (e.g., failure to build or a test regression).
+# Use $EXTERNAL_FAIL in your code to mark a non-zero return from
+# a foreseeable external failure (e.g., git server going down or benchmarking
+# infrastructure failure).
+# The main difference between $INTERNAL_FAIL and $EXTERNAL_FAIL is that
+# $INTERNAL_FAIL can be bisected and reduced at the commit or source code
+# level.  Conversely, there is no point in bisecting $EXTERNAL_FAIL problems,
+# and we should just ignore the current failure and wait for the external
+# system to come back to life.
+# Any other return code will get you a
 # traceback (including assert/assert_with_msg)
-EXPECTED_EXIT=123
+INTERNAL_FAIL=123
+EXTERNAL_FAIL=125
 print_traceback ()
 {
   local exit_status=$?
-  if [ $exit_status != $EXPECTED_EXIT ]; then
-    echo "ERROR Traceback (most recent call last):"
-    # Show most recent calls last
-    # >=1 to skip the trap handler entry
-    # Start from end-2 to skip the top level "main" entry
-    # which isn't useful
-    for (( i=${#FUNCNAME[@]}-2 ; i>=1 ; i-- )) ; do
-      source_file=${BASH_SOURCE[$i+1]}
-      line_no=${BASH_LINENO[$i]}
-      echo "  File: $source_file, line $line_no"
-      # Remove leading whitespace to keep indentation readable
-      echo "    $(sed -e "${line_no}!d" -e 's/^[[:space:]]*//' "$source_file")"
-    done
-    # We don't know the line number of the exit itself when we trap EXIT
-    echo "  File: ${BASH_SOURCE[0]}, line ${BASH_LINENO[0]}"
-    echo "    (trap handler, exit line unknown, exit status was $exit_status)"
-  fi
+  case $exit_status in
+      $INTERNAL_FAIL|$EXTERNAL_FAIL) ;;
+      *)
+	  echo "ERROR Traceback (most recent call last):"
+	  # Show most recent calls last
+	  # >=1 to skip the trap handler entry
+	  # Start from end-2 to skip the top level "main" entry
+	  # which isn't useful
+	  for (( i=${#FUNCNAME[@]}-2 ; i>=1 ; i-- )) ; do
+	      source_file=${BASH_SOURCE[$i+1]}
+	      line_no=${BASH_LINENO[$i]}
+	      echo "  File: $source_file, line $line_no"
+	      # Remove leading whitespace to keep indentation readable
+	      echo "    $(sed -e "${line_no}!d" -e 's/^[[:space:]]*//' "$source_file")"
+	  done
+	  # We don't know the line number of the exit itself when we trap EXIT
+	  echo "  File: ${BASH_SOURCE[0]}, line ${BASH_LINENO[0]}"
+	  echo "    (trap handler, exit line unknown, exit status was $exit_status)"
+	  ;;
+  esac
 }
diff --git a/round-robin.sh b/round-robin.sh
index e8aba4d8..1ffe7574 100644
--- a/round-robin.sh
+++ b/round-robin.sh
@@ -528,14 +528,14 @@ check_regression ()
     local score
     score=$(grep -v "^#" ${rr[top_artifacts]}/results | tail -n1)
 
-    if [ x"$score" = x"-125" ]; then
+    if [ x"$score" = x"-$EXTERNAL_FAIL" ]; then
 	echo "ERROR: We have encountered some infrastructure problem (e.g.,"
 	echo "       benchmarking boards are offline), andso we can't finish"
 	echo "       the build."
 	# Exit now and don't update baseline artifacts.
 	# By not creating trigger-build-* files, we signal
 	# round-robin-bisect.sh to skip this build/revision.
-	exit 125
+	exit $EXTERNAL_FAIL
     fi
 
     if [ x"${rr[update_baseline]}" = x"rebase" ]; then
diff --git a/tcwg-benchmark.sh b/tcwg-benchmark.sh
index 04d63d62..cc533d51 100755
--- a/tcwg-benchmark.sh
+++ b/tcwg-benchmark.sh
@@ -235,7 +235,7 @@ if ! ssh "$boardname" true; then
         nvidia-power-cycle.sh "${boardname%.tcwglab}"
         wait_for_ssh_server "$boardname" 22 100
     ) &
-    wait $! || exit 125
+    wait $! || exit $EXTERNAL_FAIL
     echo "Successfull powered-cycled $boardname"
     reboot=false
 fi
diff --git a/tcwg_bmk-build.sh b/tcwg_bmk-build.sh
index 51e33616..73fde092 100755
--- a/tcwg_bmk-build.sh
+++ b/tcwg_bmk-build.sh
@@ -268,7 +268,7 @@ benchmark ()
 	        ;;
 	    "Finished: "*)
 		echo "# Benchmarking infra is offline:" >> ${rr[top_artifacts]}/results
-		echo "-125" >> ${rr[top_artifacts]}/results
+		echo "-$EXTERNAL_FAIL" >> ${rr[top_artifacts]}/results
 	        build_ret=1
 	        break
 	        ;;