#!/usr/bin/env bash
#
# Run a check script on every host of a slave list and, depending on the
# result, optionally set the associated Jenkins nodes offline or back online.
#
# Usage: <script> SLAVES_LIST_FILE SLAVES_CHECK SLAVES_REPORT
#
#   SLAVES_LIST_FILE  text file, one entry per line: "host,node1 node2 ...".
#                     Anything after '#' is ignored; blank lines are skipped.
#   SLAVES_CHECK      name of the check script, run as ./SLAVES_CHECK through
#                     remote-exec.sh (located next to this script).
#   SLAVES_REPORT     report name; artifacts are written to
#                     $WORKSPACE/artifacts/SLAVES_REPORT.
#
# Environment:
#   DEBUG                   "true" enables shell tracing (set -x).
#   JENKINS_URL             Jenkins base URL; defaults to
#                           http://$JENKINS_SERVER:$JENKINS_PORT/.
#   JENKINS_WGET            wget command line used to query Jenkins.
#   JOB_SET_NODES_OFFLINE   "true" to set nodes of failing hosts offline.
#   JOB_SET_NODES_ONLINE    "true" to set nodes back online on success.
#   WORKSPACE               output root; defaults to $PWD/workspace.
#   JENKINS_SERVER_COOKIE   when non-empty, the offline message refers to
#                           $JOB_NAME / $BUILD_ID instead of $USER.
#
# Exit status: 0 when every host check succeeded, 1 otherwise (or on a
# fatal error reported through error()).

set -eu
set -o pipefail
[[ ${DEBUG:-} != true ]] || set -x

dirname=$(dirname "$0")

# Print an error message on stderr and abort the script with status 1.
function error() {
  echo "ERROR: $1" >&2
  exit 1
}

# Print an informational message on stdout.
function info() {
  echo "INFO: $1"
}

# log LOGFILE CMD [ARGS...]
# Run CMD, appending its combined stdout/stderr to LOGFILE while still
# echoing it. The parent directory of LOGFILE is created when missing.
# With pipefail enabled, a failure of CMD is reflected in the return status.
function log() {
  local log="${1?}"
  shift
  mkdir -p "$(dirname "$log")"
  "$@" 2>&1 | tee -a "$log"
}

JENKINS_URL="${JENKINS_URL:-http://$JENKINS_SERVER:$JENKINS_PORT/}"
JOB_SET_NODES_OFFLINE=${JOB_SET_NODES_OFFLINE:-false}
JOB_SET_NODES_ONLINE=${JOB_SET_NODES_ONLINE:-false}
JENKINS_WGET="${JENKINS_WGET:-wget --auth-no-challenge}"

SLAVES_LIST_FILE="${1:-}"
[[ -n $SLAVES_LIST_FILE ]] || error "missing slave list argument"
SLAVES_CHECK="${2:-}"
[[ -n $SLAVES_CHECK ]] || error "missing slave check script"
SLAVES_REPORT="${3:-}"
[[ -n $SLAVES_REPORT ]] || error "missing slave report name"

"$dirname/set-build-description.sh" "Checking all slaves..."

WORKSPACE=${WORKSPACE:-$PWD/workspace}
ARTIFACTSDIR=${WORKSPACE}/artifacts/${SLAVES_REPORT}
LOGSDIR=${ARTIFACTSDIR}/logs

# Start from a clean artifacts tree; ':?' aborts instead of running
# 'rm -rf' on an empty path.
rm -rf -- "${ARTIFACTSDIR:?}" "${LOGSDIR:?}"
mkdir -p "$WORKSPACE" "$ARTIFACTSDIR" "$LOGSDIR"

declare -i errors=0
declare -i total=0

# The list is read on fd 3 so that commands run inside the loop cannot
# consume the remaining entries through stdin (remote-exec.sh is not
# visible from here; presumably it runs a remote shell that may read stdin).
# '|| [[ -n $line ]]' keeps a last line that lacks a trailing newline.
# IFS is left at its default on purpose: read then trims surrounding blanks.
while read -r line <&3 || [[ -n $line ]]; do
  status=0

  # Drop trailing comments and leading spaces.
  line=$(printf '%s\n' "$line" | sed -e 's/ *#.*//' -e 's/^ *//')
  [[ -n $line ]] || continue

  # NOTE: when the entry has no comma, cut prints the whole line for any
  # field, so "nodes" then equals "host".
  host=$(printf '%s\n' "$line" | cut -f1 -d,)
  nodes=$(printf '%s\n' "$line" | cut -f2 -d,)
  [[ -n $host ]] || continue

  total=$((total + 1))

  # remote-exec.sh is invoked relative to this script's directory.
  pushd "$dirname" >/dev/null
  log "$LOGSDIR/check_$host.log.txt" \
    ./remote-exec.sh "$host" "./$SLAVES_CHECK" || status=$?
  popd >/dev/null

  if [[ $status != 0 ]]; then
    errors=$((errors + 1))
    info "$SLAVES_CHECK on $host returned with status $status"
  fi

  # $nodes is a space-separated list: word splitting is intended here.
  # shellcheck disable=SC2086
  for node in $nodes; do
    # FIXME: We need to authenticate to Jenkins for this to work
    # $JENKINS_WGET holds a command plus options: splitting is intended.
    # The trailing '/' of JENKINS_URL is dropped to avoid a '//' in the path.
    # shellcheck disable=SC2086
    offline_cause=$($JENKINS_WGET -q -O- \
      "${JENKINS_URL%/}/computer/$node/api/xml/?xpath=//offlineCauseReason" \
      | sed 's|<[^>]*>||g') || true
    # Who set the node offline, taken from a "Set offline by <who>:" marker
    # (the messages written further down yield "job" or "user").
    offline_by_job=$(printf '%s\n' "$offline_cause" \
      | grep "Set offline by" \
      | sed 's|^.*Set offline by \([^:]*\):.*$|\1|' || true)
    offline_agent=$(printf '%s\n' "$offline_cause" \
      | grep "This agent is offline because Jenkins failed to launch the agent process on it." \
      || true)

    if [[ $status = 0 ]]; then
      # Host check passed: only act on nodes that are currently offline.
      if [[ $offline_cause != "" ]]; then
        if [[ $offline_by_job = job ]]; then
          if [[ $JOB_SET_NODES_ONLINE = true ]]; then
            info "setting node back online: $node"
            "$dirname/set-node-online.sh" "$node"
          else
            info "skip setting of node back online: $node (JOB_SET_NODES_ONLINE=$JOB_SET_NODES_ONLINE)"
          fi
        else
          # Not set offline by a job: abort on the known broken states.
          if [[ -n $offline_agent ]]; then
            error "Node $node is offline, the agent could not start"
          fi
          if [[ $offline_cause == *"Connection was broken"* ]]; then
            error "Node $node is offline, connexion broken"
          fi
        fi
      fi
    else
      # Host check failed: optionally take the node offline, unless it is
      # already offline for a reason other than a previous job run.
      if [[ $JOB_SET_NODES_OFFLINE = true ]]; then
        if [[ $offline_cause = "" || $offline_by_job = job ]]; then
          info "setting node offline: $node"
          message="node offline due to missing capacities."
          if [[ -n ${JENKINS_SERVER_COOKIE:-} ]]; then
            message+=$'\n'"Set offline by job: $JOB_NAME / $BUILD_ID."
            message+=$'\n'"Console: slave check output log."
          else
            message+=$'\n'"Set offline by user: $USER, hostname: $(hostname), pid: $$."
          fi
          # Avoid abort if can't be set offline
          "$dirname/set-node-offline.sh" "$node" "$message" || true
        fi
      else
        info "skip setting of node offline: $node (JOB_SET_NODES_OFFLINE=$JOB_SET_NODES_OFFLINE)"
      fi
    fi
  done
done 3< "$SLAVES_LIST_FILE"

"$dirname/set-build-description.sh" "Generating report..."
"$dirname/postbuild-report.sh" "$SLAVES_LIST_FILE" "$SLAVES_REPORT"

if ((errors > 0)); then
  "$dirname/set-build-description.sh" "FAILURE: $errors / $total hosts KO"
  echo "FAILURE: $errors / $total hosts KO"
  exit 1
else
  "$dirname/set-build-description.sh" "SUCCESS: $total hosts OK"
  echo "SUCCESS: $total hosts OK"
fi