#!/bin/bash

set -e -o pipefail

# shellcheck source=jenkins-helpers.sh
. "$(dirname "$0")"/jenkins-helpers.sh

# Start a local docker instance with the requested arch and distro.
# This script is meant to be executed from Jenkins jobs inside the TCWG
# lab.  It prints shell commands meant to be executed in the parent
# shell, consisting of:
# - definition of ${CONTAINER}, used to prefix commands that you want
#   to run inside the container.
# - definition of ${CONTAINER_CLEANUP}, a cleanup statement that removes
#   the container, for instance on exit.
# - definition of ${session_host}, ${session_port}, and ${session_opts[@]},
#   which can be used for a remote connection to the container.

# Print the command-line help and exit with status 1.
usage()
{
    echo "Usage: $0 [--arch container-arch] --distro flavour [--dryrun true/false] [--label label] [--newuser username:[uid]] [--node node] [--prefix prefix] [--secondary true/false] [--session-host host] [--session-name name] [--ssh_info true/false] [--task {build|test|bench}] [--user user] [--weight weight] [--verbose true/false] [--security options]"
    echo
    echo " container-arch: architecture (eg: amd64, i386, arm64, armhf)"
    echo " distro: distribution (eg: lts_1)"
    echo " dryrun: boolean, just print commands if true"
    echo " label: jenkins label; container is started on least-busy node; also sets container architecture"
    echo " newuser: new user to create inside container, <username>:[<uid>] specification."
    echo " node: jenkins node; container is started on host mapped to the node"
    echo " prefix: prefix to prepend to output variables and functions"
    echo " secondary: create a secondary container that will use the same workspace"
    echo " session-host: hostname where the container will run, defaults to localhost"
    echo " useful if the name resolution does not work correctly"
    echo " session-name: session, in case the default '\$BUILD_NUMBER-\$JOB_NAME' is not suitable"
    echo " ssh_info: set \$ssh_host and \$ssh_port env variables in the container"
    echo " task: type of container (build, test or bench, default=build)"
    echo " user: remote user to use in the container."
    echo " weight: container weight, reserves resources. Default=1"
    echo " security: override the default container security options (currently CAP_SYS_PTRACE and unconfined seccomp)."
    echo " verbose: whether enable verbose output. Default=false"
    exit 1
}

# Report a failed ssh/docker invocation and retry a verbose ssh connection
# to $session_host to collect debug output.
# $1: exit code of the failed command.  It is also this function's return
# status, so under "set -e" an unguarded call terminates the script.
ssh_error()
{
    echo "ERROR: ssh returned with code: $1, date: $(date). Trying another ssh connexion to $session_host to get debug logs:"
    ssh -v "$session_host" true
    return "$1"
}

# Call usage() unless $1 is literally "true" or "false".
# Boolean options are later executed as commands (e.g., "if $verbose; ..."),
# so any other value would run an arbitrary program.
require_boolean()
{
    case "$1" in
        true|false) ;;
        *) usage ;;
    esac
}

# Save stdout/stderr file descriptors
exec 3>&1 4>&2

# Make sure all output goes to stderr
exec 1>&2

container_arch="default"
distro="default"
dryrun=false
label=
node=
newuser=
prefix=
secondary=false
session_host=
session_name=
ssh_info=false
task="build"
weight=1
user=
verbose="false"
security=""

while [ $# -ge 1 ]; do
    case "$1" in
        --arch)
            container_arch=$2
            [ -n "$container_arch" ] || usage
            shift 2
            ;;
        --distro)
            distro=$2
            [ -n "$distro" ] || usage
            shift 2
            ;;
        --dryrun)
            dryrun=$2
            require_boolean "$dryrun"
            shift 2
            ;;
        --label)
            label=$2
            [ -n "$label" ] || usage
            shift 2
            ;;
        --node)
            node=$2
            [ -n "$node" ] || usage
            shift 2
            ;;
        --newuser)
            newuser="$2"
            [ -n "$newuser" ] || usage
            shift 2
            ;;
        --prefix)
            prefix=$2
            [ -n "$prefix" ] || usage
            shift 2
            ;;
        --secondary)
            secondary=$2
            require_boolean "$secondary"
            shift 2
            ;;
        --session-host)
            session_host=$2
            [ -n "$session_host" ] || usage
            shift 2
            ;;
        --session-name)
            session_name=$2
            [ -n "$session_name" ] || usage
            shift 2
            ;;
        --ssh_info)
            ssh_info=$2
            require_boolean "$ssh_info"
            shift 2
            ;;
        --task)
            task=$2
            case "${task}" in
                build|precommit|bench|test) ;;
                *) usage ;;
            esac
            shift 2
            ;;
        --user)
            # Keep the bare login name here.  The "@" separator is appended
            # later, once the name has been used for chown/sudo and for
            # /home/$user paths; storing "name@" at this point would break
            # those uses and end up as "name@@host" in the ssh target.
            user="$2"
            [ -n "$user" ] || usage
            shift 2
            ;;
        --weight)
            weight=$2
            [ -n "$weight" ] || usage
            shift 2
            ;;
        --verbose)
            verbose=$2
            require_boolean "$verbose"
            shift 2
            ;;
        --security)
            # An empty value is accepted and selects the default options,
            # but the value argument itself must be present.
            [ $# -ge 2 ] || usage
            security="$2"
            shift 2
            ;;
        *)
            echo "Unsupported option: $1"
            usage
            ;;
    esac
done

if $verbose; then set -x; fi

# In dry-run mode docker/ssh commands are prefixed with "echo".
dryruncmd=""
if $dryrun; then
    dryruncmd="echo"
fi

# With only --label given, pick the node that currently runs the fewest
# containers for that label.
if [ -z "$node" ] && [ -z "$session_host" ] && [ -n "$label" ]; then
    node=$(print_node_with_least_containers "$label")
    if [ -z "$node" ]; then
        echo "ERROR: Cannot find node for $label"
        exit 1
    fi
fi

if [ -n "$node" ]; then
    if [ -n "$session_host" ]; then
        echo "--session-host conflicts with --node"
        usage
    fi
    session_host=$(print_host_for_node "$node")
fi

if [ -z "$session_host" ]; then
    # Get first FQDN.  This name needs to have .tcwglab suffix for VPN'ed
    # machines and entries in .ssh/config for external machines.
    session_host=$(hostname -A | cut -d" " -f 1)
    if [ -z "$session_host" ]; then
        # WSL environment returns an empty string for "hostname -A", but
        # outputs a proper hostname (set in /etc/wsl.conf) for "hostname".
        session_host=$(hostname)
        assert_with_msg "Cannot get hostname" \
                        [ x"$session_host" != x"" ]
    fi
    arch_host="localhost"
else
    arch_host="$session_host"
fi

if [ x"${container_arch}" = x"default" ]; then
    if [ -n "$label" ]; then
        container_arch=$(print_arch_for_label "$label")
    else
        container_arch=$(print_arch_for_host "$arch_host")
    fi
elif [ -n "$label" ]; then
    echo "--arch conflicts with --label"
    usage
fi

if [ -z "$session_name" ]; then
    # Set the default session_name, using BUILD_NUMBER and JOB_NAME,
    # as set by Jenkins.
    # shellcheck disable=SC2153
    if [ "x$BUILD_NUMBER" != "x" ] && [ "x$JOB_NAME" != "x" ]; then
        session_name="$BUILD_NUMBER-$JOB_NAME-$task"
    else
        session_name="$USER-$(date +%Y%m%d-%H_%M_%S)"
    fi
    session_name=$(print_docker_name "$session_name")
fi

# Resolve LTS and LTS-1 values to Ubuntu distros.
case "$distro" in
    lts_1) distro=focal ;;
    lts|default) distro=jammy ;;
esac

image=linaro/ci-${container_arch}-tcwg-build-ubuntu:${distro}

# Avoid connection sharing because of race conditions with parallel builds.
# Also, we don't really need ssh agent forwarding here, so, since precommit
# testing takes this path, disable it for extra caution.  Also see note
# about ssh agent forward at wait_for_ssh_server below.
SSH="ssh -Snone -oForwardAgent=no"

# Filter command applied to mount destinations; identity by default.
pwd_translate=(cat)

# Configure container for precommit testing:
# - use tcwg-build user instead of tcwg-buildslave;
# - disable ssh agent forwarding;
# - use scratch docker volume for $WORKSPACE instead of bind-mounting from host;
# -- use container_rsync() to transfer data to and from precommit container;
# - mount everything else as read-only (e.g., ccache, snapshots-ref, etc.);
# - translate absolute /home/* paths
if [ "$task" = "precommit" ]; then
    if [ "$newuser" = "" ]; then
        newuser=tcwg-build
    fi
    if [ "$user" = "" ]; then
        user="$newuser"
    fi
    if [ "${WORKSPACE+set}" = "set" ]; then
        # Translate $WORKSPACE/* paths from $USER to $user.  Or, specifically,
        # from /home/tcwg-buildslave/workspace/* to
        # /home/tcwg-build/workspace/*.
        dst_workspace=$(echo "$WORKSPACE" | sed -e "s#^$HOME#/home/$user#")
        pwd_translate=(sed -e "s#^$WORKSPACE#$dst_workspace#")
    fi
fi

assert_with_msg "user and USER variables should not be set to the same value" \
                [ x"$user" != x"$USER" ]

# Note that when we use this we *want* it to split on spaces
# So that the shell runs:
# foo bar docker <...>
# Instead of:
# "foo bar docker" <...>
# Note: use "ssh -n" to avoid consuming stdin.
# This is especially important
# in "while read $i;" loops we use to cleanup containers.  Without this
# we cleanup the first container, and entries for all other containers
# are swallowed by ssh.
DOCKER="$dryruncmd $SSH -n $session_host docker-wrapper"
$DOCKER maybepull $image || ssh_error $?

# Fall back to the default security options unless the caller supplied some.
if [[ -z "$security" ]]; then
    # SYS_PTRACE is added as a capability.  Unconfined seccomp is needed
    # because of a bug in libgo's configure script: it would crash when
    # testing "whether setcontext clobbers TLS variables", and report neither
    # "no" nor "yes", later making configure fail.
    # Also, the sanitizers need to disable ASLR during the tests, and docker
    # needs to explicitly enable the process to do that on all architectures.
    security="--cap-add=SYS_PTRACE --security-opt seccomp:unconfined"

    # To run armhf focal images on old docker we need to disable
    # seccomp via --privileged option.  We can't upgrade docker to
    # a newer version on TK1s as we will lose bridge network (presumably,
    # due to incompatibility with old 3.10 kernel), which we use in
    # jenkins CI builds.
    if [[ "$container_arch:$distro:$($DOCKER --version | cut -d" " -f3)" \
              == armhf:focal:18* ]]; then
        security="--privileged"
    fi
fi

# Reserve resources according to weight and task
nproc=$($SSH $session_host nproc --all)
memlimit=$($SSH $session_host free -m | awk '/^Mem/ { print $2 }')
pids=$(print_pids_limit "$task" "$weight")
cpus=$(print_cpu_shares "$task" "$weight")
memory=$(print_memory_limit "$task" "$weight" "$nproc" "$memlimit")

# No --memory option at all when the limit is reported as "unlimited".
memory_opt=""
[[ "$memory" == "unlimited" ]] || memory_opt="--memory=${memory}M"

if [[ -n "${JOB_NAME:-}" ]]; then
    job_name="$JOB_NAME"
fi

# Detect WSL2 hosts from the kernel release string.
wsl=false
if [[ "$($SSH $session_host uname -r)" == *-WSL2 ]]; then
    wsl=true
fi

lock_workspace=false
mounts_opt=()
chown_mounts=()
git_mounts=()
force_port=()

readarray -t mounts < <(print_mounts "$task" "$job_name" \
                                     "-$container_arch-$distro" \
                                     $SSH "$session_host")
if $wsl; then
    # Enable WSL-Interop inside containers.  This allows us to build
    # toolchains inside docker containers inside WSL2 environments, and still
    # have ability to run generated win32 executables.
    # If this doesn't work, make sure WSL2 version is 2.0.14 or later;
    # WSL 2.0.9 has a bug preventing interop outside of the "main init"
    # process tree.
    mounts+=(/init:/init:ro /run/WSL:/run/WSL)
    # FIXME: WSL VM is on a private network, and we have several
    # ports -- 22, 2222, and 32768 -- proxied inside it.  I couldn't
    # figure out how to proxy a port range, so it's simpler to configure
    # docker to use a fixed port.
    # We should try to configure bridged network for WSL VM, so that no port
    # forwarding is necessary.
    force_port=(-p 32768:22)
fi

echo "MOUNTS: ${mounts[*]}"

for mount in "${mounts[@]}"; do
    # Disassemble the mount into its colon-separated fields.  "cut -s"
    # prints nothing when the entry has no ":" at all.
    src=$(echo "$mount" | cut -s -d: -f 1)
    dst=$(echo "$mount" | cut -s -d: -f 2)
    ro=$(echo "$mount" | cut -s -d: -f 3)

    if [[ -n "$ro" ]]; then
        # This is a read-only bind-mount or volume mount, e.g.,
        # - ssh host keys,
        # - $WORKSPACE/base-artifacts/ for task==precommit,
        # - ccache-* for task==precommit.
        assert [ "$ro" = "ro" ]
    elif [[ -n "$dst" ]]; then
        # This is a read-write bind-mount or volume mount, e.g.,
        # - $WORKSPACE for task==build,
        # - ccache-* for task==build.
        assert_with_msg "Non-readonly mount for precommit task" \
                        [ "$task" != "precommit" -o "$wsl" = "true" ]
    else
        # This is a read-write scratch mount, e.g.,
        # - $WORKSPACE for task==precommit.
        dst="$mount"
        src=""
    fi

    if [[ -n "${WORKSPACE+set}" && "$dst" == "$WORKSPACE" ]]; then
        lock_workspace=true
    fi

    dst=$(echo "$dst" | "${pwd_translate[@]}")

    # ccache-* volumes are owned by tcwg-buildslave, so don't let
    # anyone else write into them.  It's fine, though, to use them
    # as read-only ccache for other users and for precommit testing.
    # Also see round-robin.sh:setup_ccache().
    if [[ "$src" == ccache-* && -n "$user" \
              && "$user" != "tcwg-buildslave" ]]; then
        dst="/home/$user/.ccache"
        ro="ro"
    fi

    if [[ "$src" == /home/* ]]; then
        # Make sure all bind-mount /home/* directories exist.
        # If a host bind-mount dir doesn't exist, then docker creates
        # it on the host with root:root owner, which can't be removed
        # by cleanup job.
        $dryruncmd $SSH $session_host mkdir -p "$src"
    elif [[ -z "$src" ]]; then
        # Similarly to above "mkdir -p", remember writable scratch volumes
        # under /home so that they can be chown'ed to $user later.
        # Read-only mounts are left alone.
        if [[ "$dst:$ro" != *:ro && "$dst:$ro" == /home/* ]]; then
            chown_mounts+=("$dst")
        fi
    fi

    # See processing of git_mounts below.
    if [[ -n "$src" ]] && git -C "$src" status >/dev/null 2>&1; then
        git_mounts+=("$dst")
    fi

    # Re-assemble the mount
    if [[ -z "$src" ]]; then
        mount="$dst"
    elif [[ -z "$ro" ]]; then
        mount="$src:$dst"
    else
        mount="$src:$dst:ro"
    fi
    mounts_opt+=(-v "$mount")
done

# For CI builds make sure to kill previous build, which might have been
# aborted by jenkins, but processes could have survived.  Otherwise old
# build can start writing to files of the current build.
# Only the primary container cleans up; secondary containers share the
# workspace with it and must not remove its siblings.
if $lock_workspace && ! $secondary; then
    # We may have several containers (one primary and several secondary)
    # sharing the same workspace, and we list these in $WORKSPACE/.lock.
    # Helpers stop_all_containers() and clean_all_containers() use this .lock
    # file to stop/cleanup all containers created in the current session.
    # We keep the .lock file, so that the primary container in the next
    # build can confirm that all containers are indeed removed.
    #
    # When cleanup routine is triggered by aborted jenkins build, we
    # often no longer have access to ssh-agent.  Therefore, $SSH command
    # will likely fail.  However, sometimes $SSH just hangs indefinitely,
    # which causes problems in tcwg-benchmark_backend job -- the shell hanging
    # in "trap" waits on $SSH and does not release the board lock file.
    # To avoid this we put a "timeout 10m" on the container cleanup.
    # Docker daemon will finish removing the container even if the caller
    # docker client is killed by timeout.
    # NOTE(review): no timeout is applied to the commands in this block.
    #
    # "read -r" keeps backslashes in lock-file entries literal.
    while read -r prev_container; do
        # A blank entry cannot name a container; skip the remote lookup.
        [ -n "$prev_container" ] || continue

        # Container may have been cleaned up by something else
        if $DOCKER stats --no-stream "$prev_container" &>/dev/null; then
            echo "NOTE: Removing previous container for $WORKSPACE"
            $DOCKER rm -vf "$prev_container" \
                || echo "WARNING: Could not remove $prev_container"
        fi
    done < <($SSH "$session_host" flock "$WORKSPACE/.lock" \
                  cat "$WORKSPACE/.lock" || true)

    $SSH "$session_host" rm -f "$WORKSPACE/.lock"
fi

# Give access to all CPUs to container.
# This happens by default on most machines, but on machines that can put
# unused cores offline (TK1s and TX1s) it can happen that docker cpuset
# cgroup gets configured to use only a subset of cores.  If this happens,
# then we have a bug in our HW scripts, and here is the best place
# to detect this problem -- docker run will fail if it can't provide
# any of the CPUs to the new container.
cpuset_opt="--cpuset-cpus 0-$(($nproc - 1))"

echo "DEBUG: starting docker on $session_host from $(hostname), date $(date)"
# Several elements below are deliberately left unquoted so that they split
# into separate words (or vanish when empty).
# shellcheck disable=SC2206
docker_run=($DOCKER run --name $session_name -dtP "${force_port[@]}" \
            "${mounts_opt[@]}" \
            ${memory_opt} \
            "--pids-limit=${pids}" \
            "--cpu-shares=${cpus}" \
            $cpuset_opt \
            ${security} \
            $image)
echo "${docker_run[@]}"

# FIXME: It seems in some cases $DOCKER run generates a session_id but
# returns an error code.  Try to get more information
ret=0
session_id=$("${docker_run[@]}") || ret=$?

if [ "$ret" -ne 0 ]; then
    # ssh_error returns the code it is given.  Under "set -e" an unguarded
    # call would terminate the script right here, before the cleanup trap
    # is installed and before the "continue nonetheless" path below can run.
    ssh_error "$ret" || true
    if [ "$ret" -eq 255 ]; then
        echo "WARNING: $SSH $session_host returned an error ($ret). Trying another ssh connexion to get debug logs"
        # Debug aid only; its own failure must not abort the script.
        $SSH -v "$session_host" true || true
    else
        echo "WARNING: docker run returned an error: $ret, trying to continue nonetheless..."
    fi
fi

if [ -z "$session_id" ]; then
    echo "ERROR: could not create session_id"
    exit 1
fi

# Remove the docker instance we have just created in case something
# goes wrong.
CONTAINER_CLEANUP="$DOCKER rm -fv ${session_id}"
# The trap string is expanded now on purpose, so that it captures the
# current value of CONTAINER_CLEANUP.
# shellcheck disable=SC2064
trap "exec 1>&3 2>&4 ; ${CONTAINER_CLEANUP}" EXIT

if [ -n "$newuser" ]; then
    $DOCKER exec "$session_id" \
            new-user.sh --user "$newuser" --verbose "$verbose"
fi

if [ "$user" != "" ]; then
    # Hand the writable scratch volumes collected earlier over to $user.
    for dir in "${chown_mounts[@]}"; do
        $DOCKER exec "$session_id" chown "$user" "$dir"
    done

    # Mark git mounts as safe git directories, so that git does not complain
    # about dubious ownership.  This is important for get_git_history()
    # fetching sumfiles/flaky.xfail files.
    for dir in "${git_mounts[@]}"; do
        $DOCKER exec "$session_id" sudo -i -u "$user" \
                git config --global --add safe.directory "$dir"
    done

    if [ "$user" = "tcwg-build" ]; then
        # FIXME: Hack -- use tcwg-buildslave's key while tcwg-build's
        # is unavailable.
        $DOCKER exec "$session_id" cp \
                /home/tcwg-buildslave/.ssh/authorized_keys \
                /home/tcwg-build/.ssh/authorized_keys
        $DOCKER exec "$session_id" chown "$user" \
                /home/tcwg-build/.ssh/authorized_keys
    fi

    # Below $user is used as a prefix for $session_host
    user="$user@"
fi

# Host port that docker mapped to the container's port 22.
session_port=$($DOCKER port "$session_id" 22 | cut -d: -f 2) || ssh_error $?
session_opts=("-p$session_port")

# SECURITY NOTE: this is the first time we are establishing ssh connection
# to the container, and, provided we are using connection sharing, settings
# specified for this connection may affect many or all of the subsequent
# connections.  In particular, if ssh agent forwarding is enabled for the
# below connection, then it will be part of the master connection, and it
# may persist through the whole lifetime of the container.
if [ "$task" = "precommit" ]; then
    # FIXME: We should be OK to disable agent forwarding for most of our
    # jobs, but, for now, keep the previous state as the default.
    # Disable agent forwarding for precommit testing.
    session_opts+=("-oForwardAgent=no")
fi

# Wait until the ssh server is ready to serve connexions
# Make sure connexion messages go to stderr, so that in case of
# success stdout contains only the connexion info expected by the
# caller.
ret=0
$dryruncmd wait_for_ssh_server ${user}$session_host "" "${session_opts[@]}" \
    || ret=$?

if [ "$ret" != 0 ]; then
    echo "SSH server did not respond, exiting"
    exit "$ret"
fi

# Create the lock for the workspace, which will allow subsequent builds
# to remove our container in case it's not cleaned up gracefully.
# Register this container in the workspace lock file on the session host.
if $lock_workspace; then
    # The whole pipeline is passed as a single quoted string so that it runs
    # on the remote side: make sure $WORKSPACE exists, then append our
    # session id to $WORKSPACE/.lock while holding flock on that file.
    $SSH $session_host bash -c "\"mkdir -p $WORKSPACE && echo $session_id | flock $WORKSPACE/.lock tee -a $WORKSPACE/.lock\""
fi

# Do not remove the container upon exit: it is now ready
# ("trap" with a single signal name and no action resets that trap.)
trap EXIT

# Extra "ssh_host=... ssh_port=..." settings, filled in only on request.
ssh_info_opt=""
if $ssh_info; then
    assert_with_msg "ssh_info is not supported for task==precommit" \
                    [ "$task" != "precommit" ]
    # FIXME: One of the things to fix for precommit benchmarking is to pass
    # all ${session_opts[@]} to the benchmarking container.  Benchmarking
    # precommit workflow is tricky because precommit container needs to trigger
    # benchmarking job on ci.linaro.org, and then the benchmarking will
    # connect via ssh to the [precommit] build container.  This connection
    # needs to happen with ssh agent DISABLED, so that processes inside
    # precommit container can't escape.  To disable ssh agent forwarding
    # when connecting to this container -- we need to pass all
    # ${session_opts[@]}, which have -oForwardAgent=no.
    #
    # Triggering of the benchmarking job on ci.linaro.org can be arranged
    # by allowing tcwg-build's ssh key to trigger tcwg-benchmark job.  This
    # relies on Jenkins ssh interface to be robust against
    # privilege-escalation attacks.
    #
    # Additionally, we should disable ssh agent forwarding by default,
    # and enable it only by request.
    ssh_info_opt="ssh_host=${user}$session_host ssh_port=$session_port"
fi

# Restore stdout/stderr
exec 1>&3 2>&4

cat <