tcwg-start-container.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

#!/bin/bash

# Start container "$container" from docker image "$image", replacing an
# already existing container of the same name when required.
#
# Settings are read from variables once convert_args_to_variables has
# processed the command line (presumably "--name value" pairs; see
# jenkins-helpers.sh to confirm).  Arguments remaining after the shift are
# forwarded to the start script provided by the image.
#
# Obligatory variables: container, image
# Optional variables (default when unset):
#   dryrun (false)          - exit with $EXTERNAL_FAIL instead of starting
#                             or replacing a container
#   keep_existing (true)    - "false" and "keep_if_same_image" are also
#                             handled
#   verbose (true)          - enable "set -x" tracing
#   additional_options ()   - passed on to the image's start script
#   test_docker (false)     - smoke-test docker before fetching the image

set -euf -o pipefail

# "$0" is quoted so that a script path containing whitespace still works.
# shellcheck source=jenkins-helpers.sh
. "$(dirname "$0")"/jenkins-helpers.sh
convert_args_to_variables "$@"
shift "$SHIFT_CONVERTED_ARGS"

obligatory_variables container image
# Bare declaration with no value; presumably only tells shellcheck that
# these variables exist -- TODO confirm.
declare container image

# "${var-default}" applies the default only when the variable is unset;
# an explicitly empty value is kept as is.
dryrun="${dryrun-false}"
keep_existing="${keep_existing-true}"
verbose="${verbose-true}"
additional_options="${additional_options-}"
test_docker="${test_docker-false}"

# The value of $verbose is executed as a command (true/false).
if $verbose; then set -x; fi

# Check that docker can start a container.
#
# Globals:  image (read) - image used for the smoke test.
# Outputs:  a NOTE line on stdout when every step succeeded.
# Returns:  0 when "docker-wrapper ps" answered within 30 seconds, the
#           wrapper's "maybepull" of the image succeeded and a throw-away
#           container ran /bin/sh; otherwise the exit status of the first
#           step that failed.
test_docker()
{
    # Propagate failures explicitly instead of relying on "set -e": bash
    # ignores errexit while a function runs as part of a condition
    # (if, while, &&, ||, !), which would let this function report success
    # even though a step failed.
    timeout 30s /root/docker-wrapper ps || return
    /root/docker-wrapper maybepull "$image" || return
    /root/docker-wrapper run --rm --entrypoint=/bin/sh "$image" || return
    echo "NOTE: Docker seems to be OK"
}

# Make sure "$image" is available locally, optionally repairing docker first.
#
# With /root/docker-wrapper present:
#   - when test_docker is true, run the test_docker smoke test and, if it
#     fails, try to bring the docker service back (see below);
#   - fetch the image with the wrapper's "maybepull", unless
#     keep_existing is "false", in which case "docker pull" is used.
# Without the wrapper the image is always fetched with "docker pull".
if [ -f /root/docker-wrapper ]; then
    # /root/docker-wrapper is created by dockerfiles/tcwg-base/tcwg-host/run.sh;
    # on benchmarking boards /root is bind-mounted inside "host" container.
    if $test_docker; then
	# The fact that we are here implies that we are running as root on
	# a bare machine.
	#
	# The check runs in the background and its status is collected with
	# "wait" -- presumably because calling the function directly in the
	# "if" condition would make bash ignore "set -e" inside it.
	test_docker &
	if ! wait $!; then
	    # Third field of the "Storage Driver" line of "docker info".
	    # "|| true" keeps a failing or timed-out pipeline from aborting
	    # the script; storage_driver is then whatever was printed,
	    # typically nothing.
	    storage_driver=$(timeout 30s /root/docker-wrapper info \
				 | grep "Storage Driver" | awk '{print $3}' \
				 || true)
	    if [ x"$storage_driver" = x"" ] \
		   || [ x"$storage_driver" = x"devicemapper" ]; then
		# With the TK1's old kernel the only way to run docker
		# is to use devicemapper storage driver with loopback
		# backend, which is unfit for production usage.
		# Every few months the loopback file gets corrupted and
		# docker can't start.
		# To solve this we go nuclear on docker.
		timeout 30s /usr/sbin/service docker stop || true
		rm -rf /var/lib/docker/
		# If below hangs, then we'll just wait for the eventual
		# power-cycle.  If docker still doesn't work from a clean
		# state, then we need to investigate manually.
		# NOTE(review): docker was already asked to stop above, so
		# this second "stop" (without a timeout) looks redundant
		# before the restart below -- confirm it is intended.
		/usr/sbin/service docker stop || true
	    fi
	    /usr/sbin/service docker restart
	    # Second and last attempt: give up if docker still fails.
	    test_docker &
	    if ! wait $!; then
		echo "ERROR: Cannot make docker work on the system"
		exit 1
	    fi
	fi
    fi

    if [ x"$keep_existing" != x"false" ]; then
	# We have docker-wrapper available, so use it to workaround dockerhub's
	# limits on pull requests.  This is important for benchmarking boards,
	# which call tcwg-update-bmk-containers.sh for every build.
	/root/docker-wrapper maybepull "$image"
    else
	# We are asked to update the container unconditionally.
	# Make sure we will use latest image.
	docker pull "$image"
    fi
else
    # No wrapper on this machine: pull directly.
    docker pull "$image"
fi

# Decide what to do about an already existing container "$container".
#
# On completion rm_cnt is either empty (nothing to clean up later) or holds
# the backup name the old container was renamed to.  The script exits here
# with status 0 when the existing container can be kept, and with
# $EXTERNAL_FAIL in dry-run mode when a container would have to be
# (re)started.
rm_cnt=""
if docker stats --no-stream "$container" >/dev/null 2>&1; then
    is_running=$(docker container inspect -f "{{.State.Running}}" "$container")
    if [ x"$is_running" = x"true" ]; then
	if [ x"$keep_existing" = x"true" ]; then
	    # A running container is all that was asked for.
	    exit 0
	elif [ x"$keep_existing" = x"keep_if_same_image" ]; then
	    # Keep the running container only when the image ID it reports
	    # equals the ID that "$image" resolves to now.
	    current_id=$(docker container inspect -f "{{.Image}}" "$container")
	    wanted_id=$(docker image inspect -f "{{.Id}}" "$image")
	    if [ x"$current_id" = x"$wanted_id" ]; then
		exit 0
	    fi
	fi
    fi

    # The container has to be replaced; in dry-run mode stop before
    # touching anything.
    if $dryrun; then
	exit $EXTERNAL_FAIL
    fi

    # Move the current container aside so that its name is free for the
    # new one.  The backup name starts with a number (the current date,
    # YYYY-MM-DD) -- presumably so that it gets cleaned up elsewhere even
    # if something goes wrong here.
    rm_cnt="$(date +%Y-%m-%d)-$container.bak"
    docker rename "$container" "$rm_cnt" &
    rename_status=0
    wait $! || rename_status=$?
    if [ "$rename_status" -ne 0 ]; then
	# A failed rename is usually caused by a container stuck in a
	# restart loop.  Such a container can't be the current one, so
	# simply delete it: gracefully if possible, by force otherwise.
	docker stop "$container" || true
	docker rm -v "$container" || docker rm -vf "$container"
	rm_cnt=""
    fi
fi

# Reached only when a container needs to be started; never do that in
# dry-run mode.
if $dryrun; then
    exit $EXTERNAL_FAIL
fi

# Start the new container.
#
# The image provides its own launcher: "docker run ... start.sh" prints a
# script on stdout, which is saved to a temporary file and then executed
# here with our options and the remaining command-line arguments.
#
# Both temporary files are removed on every path, including failures, and
# the exit status of the failing step is preserved.
qemu_bin=$(mktemp -p "$HOME")
if ! start_sh=$(mktemp); then
    rm -f "$qemu_bin"
    exit 1
fi
start_res=0

# "docker run" arguments are collected in an array so that the image name
# and the mount specification are passed as single words whatever they
# contain.
run_args=(run --rm)
case "$(uname -m):$image" in
    x86_64:*-arm64-tcwg-llvmbot-*)
	# See dockerfiles.git/tcwg-base/tcwg-llvmbot/start.sh for details
	# on how this works.
	cp "$(command -v qemu-aarch64-static)" "$qemu_bin" \
	    && chmod +x "$qemu_bin" \
	    || start_res=$?
	run_args+=(-v "$qemu_bin:/bin/qemu-aarch64-static")
	;;
esac

if [ "$start_res" -eq 0 ]; then
    docker "${run_args[@]}" "$image" start.sh > "$start_sh" || start_res=$?
fi

if [ "$start_res" -eq 0 ]; then
    bash "$start_sh" --verbose "$verbose" \
	 --additional_options "$additional_options" -- "$@" \
	|| start_res=$?
fi

rm -f "$start_sh" "$qemu_bin"

# Behave like "set -e" would have: stop here, with the original status,
# so that the old container (if any) is not removed after a failed start.
if [ "$start_res" -ne 0 ]; then
    exit "$start_res"
fi

# The new container is up: dispose of the old one, if it was renamed out of
# the way earlier (rm_cnt is non-empty only in that case).
#
# Should both containers need the same exclusive resource (e.g., a tcp port
# or the connection to jenkins), the new container might have to restart a
# couple of times until the old one is gone.
if [ -n "$rm_cnt" ]; then
    # Ask for a graceful shutdown first; failure to stop is not fatal.
    docker stop "$rm_cnt" || true
    # Plain removal, falling back to a forced one (SIGKILL) only when
    # necessary.
    docker rm -v "$rm_cnt" || docker rm -fv "$rm_cnt"
fi