#!/usr/bin/env bash
#
# Runs a check script against every host of a slave list and prints a
# SUCCESS/FAILURE summary.
#
# Usage: <this script> SLAVES_LIST_FILE SLAVES_CHECK SLAVES_REPORT
#
# Set DEBUG=true in the environment to trace every executed command.

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

if [[ ${DEBUG:-} == true ]]; then
  set -x
fi

# Directory part of the path this script was invoked with; the helper scripts
# it calls are addressed relative to it.
dirname=$(dirname "$0")
# Prints "ERROR: <message>" on stderr and terminates the script with status 1.
# Arguments: $1 - message text
error() {
  printf 'ERROR: %s\n' "$1" >&2
  exit 1
}
# Prints "INFO: <message>" on stdout.
# Arguments: $1 - message text
info() {
  printf 'INFO: %s\n' "$1"
}
# Runs a command, echoing its combined stdout/stderr to stdout while also
# appending it to a log file.
# Arguments:
#   $1   - log file path (mandatory); missing parent directories are created
#   $2.. - the command to run and its arguments
# Returns: the status of the "command | tee" pipeline. A failure of the
#   command itself is only reported when "set -o pipefail" is in effect.
log() {
  local logfile="${1?}"
  shift
  # Quoted so that a log path containing whitespace stays a single path.
  mkdir -p -- "$(dirname -- "$logfile")"
  "$@" 2>&1 | tee -a -- "$logfile"
}
# Settings that may be overridden from the environment. JENKINS_SERVER and
# JENKINS_PORT are only expanded when JENKINS_URL is not already provided.
: "${JENKINS_URL:=http://$JENKINS_SERVER:$JENKINS_PORT/}"
: "${JOB_SET_NODES_OFFLINE:=false}"
: "${JOB_SET_NODES_ONLINE:=false}"
: "${JENKINS_WGET:=wget --auth-no-challenge}"

# Positional arguments: slave list file, check script, report name.
# All three are mandatory; the first missing one aborts the script.
SLAVES_LIST_FILE=${1:-}
if [[ -z $SLAVES_LIST_FILE ]]; then
  error "missing slave list argument"
fi

SLAVES_CHECK=${2:-}
if [[ -z $SLAVES_CHECK ]]; then
  error "missing slave check script"
fi

SLAVES_REPORT=${3:-}
if [[ -z $SLAVES_REPORT ]]; then
  error "missing slave report name"
fi
# Publish a progress message through the helper script located next to this
# one (presumably it updates the Jenkins build description - confirm in the
# helper itself).
"$dirname/set-build-description.sh" "Checking all slaves..."

# Output tree for this report: WORKSPACE comes from the environment when set,
# otherwise it defaults to ./workspace under the current directory.
WORKSPACE=${WORKSPACE:-$PWD/workspace}
ARTIFACTSDIR=${WORKSPACE}/artifacts/${SLAVES_REPORT}
LOGSDIR=${ARTIFACTSDIR}/logs

# Start from an empty report directory. Every path is quoted so a workspace
# containing whitespace is deleted/created as ONE path (unquoted, "rm -rf"
# would act on each whitespace-separated fragment); ":?" refuses to run with
# an empty path and "--" stops option parsing.
rm -rf -- "${ARTIFACTSDIR:?}" "${LOGSDIR:?}"
mkdir -p -- "$WORKSPACE" "$ARTIFACTSDIR" "$LOGSDIR"

# Counters used for the final summary: failed hosts and checked hosts.
declare -i errors=0
declare -i total=0
# Check every host of the slave list and reconcile the state of its Jenkins
# nodes.
#
# Each useful line of SLAVES_LIST_FILE has the form "host,node1 node2 ...";
# text starting at '#' and empty lines are ignored. For every host the check
# script is run through remote-exec.sh with its output logged under LOGSDIR;
# a non-zero status counts the host as failed. Then, for each node of the host:
#   - check passed, node was set offline by a job: put it back online
#     (only when JOB_SET_NODES_ONLINE=true);
#   - check passed, but the agent failed to launch or the connection is
#     broken: abort the whole script through error();
#   - check failed: set the node offline (only when
#     JOB_SET_NODES_OFFLINE=true, and only if it has no offline cause yet or
#     was set offline by a job).
#
# The list is read on file descriptor 3 instead of stdin so that commands run
# in the body cannot consume the remaining lines (remote-exec.sh presumably
# runs ssh, which reads stdin - NOTE(review): confirm). The
# "|| [[ -n $line ]]" part keeps a last line that lacks a trailing newline.
while read -r line <&3 || [[ -n $line ]]; do
  status=0

  # Strip the comment part and leading blanks, then skip empty lines.
  line=$(sed -e 's/ *#.*//' -e 's/^ *//' <<<"$line")
  [[ -n $line ]] || continue

  # Field 1 is the host, field 2 the space-separated node names. cut echoes a
  # line without delimiter unchanged, so an entry with no comma yields the
  # same text for both the host and its node list.
  # Blanks left between the host and the comma are dropped: they used to split
  # the log file name and the remote-exec.sh arguments.
  host=$(cut -f1 -d, <<<"$line" | sed -e 's/[[:space:]]*$//')
  nodes=$(cut -f2 -d, <<<"$line")
  [[ -n $host ]] || continue
  total=$((total + 1))

  # remote-exec.sh and the check script are addressed relative to the script
  # directory, hence the temporary directory change.
  pushd "$dirname" >/dev/null
  # SLAVES_CHECK stays unquoted so that a value carrying extra arguments keeps
  # splitting as before - NOTE(review): quote it if it is always a bare name.
  # shellcheck disable=SC2086
  log "$LOGSDIR/check_${host}.log.txt" ./remote-exec.sh "$host" ./$SLAVES_CHECK || status=$?
  popd >/dev/null

  if [[ $status != 0 ]]; then
    errors=$((errors + 1))
    info "$SLAVES_CHECK on $host returned with status $status"
  fi

  # Word splitting of the node list is intentional here.
  # shellcheck disable=SC2086
  for node in $nodes; do
    # FIXME: We need to authenticate to Jenkins for this to work
    # Best effort: a failing query leaves offline_cause empty, i.e. the node
    # is treated as online. JENKINS_WGET is unquoted because it holds a
    # command plus options. The trailing slash of JENKINS_URL is removed so
    # the request path does not start with "//" (an empty path segment, which
    # some servers reject).
    offline_cause=$($JENKINS_WGET -q -O- "${JENKINS_URL%/}/computer/$node/api/xml/?xpath=//offlineCauseReason" | sed 's|<[^>]*>||g') || true
    # Text between "Set offline by " and the next ':'; the messages built
    # further down make it "job" or "user".
    offline_by_job=$(grep "Set offline by" <<<"$offline_cause" | sed 's|^.*Set offline by \([^:]*\):.*$|\1|' || true)
    # Non-empty when the reported cause is an agent launch failure.
    offline_agent=$(grep "This agent is offline because Jenkins failed to launch the agent process on it." <<<"$offline_cause" || true)

    if [[ $status == 0 ]]; then
      # Healthy host: only nodes that are currently offline need attention.
      if [[ -n $offline_cause ]]; then
        if [[ $offline_by_job == job ]]; then
          if [[ $JOB_SET_NODES_ONLINE == true ]]; then
            info "setting node back online: $node"
            "$dirname/set-node-online.sh" "$node"
          else
            info "skip setting of node back online: $node (JOB_SET_NODES_ONLINE=$JOB_SET_NODES_ONLINE)"
          fi
        else
          # Offline for a reason not set by a job: fail hard on the two
          # causes recognised below, ignore any other cause.
          if [[ -n $offline_agent ]]; then
            error "Node $node is offline, the agent could not start"
          fi
          if [[ $offline_cause == *"Connection was broken"* ]]; then
            error "Node $node is offline, connexion broken"
          fi
        fi
      fi
    elif [[ $JOB_SET_NODES_OFFLINE == true ]]; then
      # Failed host: never override an offline cause that was not set by a job.
      if [[ -z $offline_cause || $offline_by_job == job ]]; then
        info "setting node offline: $node"
        nl=$'\n'
        message="node offline due to missing capacities."
        # With JENKINS_SERVER_COOKIE set the action is attributed to the job,
        # otherwise to the invoking user. The "Set offline by job:" wording is
        # what the offline_by_job extraction above matches on later runs.
        if [[ -n ${JENKINS_SERVER_COOKIE:-} ]]; then
          message+="${nl}Set offline by job: $JOB_NAME / $BUILD_ID."
          message+="${nl}Console: slave check output log."
        else
          # USER may be unset (which would abort under "set -u"); fall back
          # to the effective user name.
          message+="${nl}Set offline by user: ${USER:-$(id -un)}, hostname: $(hostname), pid: $$."
        fi
        "$dirname/set-node-offline.sh" "$node" "$message" || true # Avoid abort if can't be set offline
      fi
    else
      info "skip setting of node offline: $node (JOB_SET_NODES_OFFLINE=$JOB_SET_NODES_OFFLINE)"
    fi
  done
done 3< "$SLAVES_LIST_FILE"
# Generate the report, then publish and print the overall verdict.
# Helper paths and arguments are quoted so that a script directory, list file
# or report name containing whitespace is passed as a single word.
"$dirname/set-build-description.sh" "Generating report..."
"$dirname/postbuild-report.sh" "$SLAVES_LIST_FILE" "$SLAVES_REPORT"

# The same summary line goes to the build description and to stdout.
if ((errors > 0)); then
  summary="FAILURE: $errors / $total hosts KO"
else
  summary="SUCCESS: $total hosts OK"
fi
"$dirname/set-build-description.sh" "$summary"
echo "$summary"

# Exit with status 1 as soon as one host failed its check.
if ((errors > 0)); then
  exit 1
fi