tcwg-cleanup-stale-containers.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240

#!/bin/bash

set -e

# Print the list of supported command-line options to stdout and
# terminate the script with exit status 1.
usage() {
    # The delimiter is quoted: the help text is literal and needs no
    # expansion.
    cat <<'HELP'
Options:
  --cleanup-running-hours HOURS
  --cleanup-stopped-hours HOURS
	Cleanup running/stopped containers that have been created more
	than HOURS ago.  Setting HOURS to negative values will
	run the cleanup in dry-run mode.  Value "0" disables the cleanup.

  --cleanup-ssh-agent-hours HOURS
	Cleanup stale ssh-agent processes that were started more than
	HOURS ago.  Value "0" disables the cleanup.

  --cleanup-images true/false
	Whether to cleanup untagged images

  --cleanup-volumes true/false
	Whether to cleanup dangling volumes

  --max-containers N
	Check that number of containers after cleanups doesn't exceed N.
	Value "0" disables the check.

  --verbose true/false
	Whether to run in verbose mode
HELP
    exit 1
}

# Default settings; each one can be overridden from the command line.
# The HOURS defaults are negative, which the cleanups below treat as
# "dry-run" (containers) or "disabled" (ssh-agent).
cleanup_running_hours="-10"
cleanup_stopped_hours="-240"
cleanup_ssh_agent_hours="-48"
cleanup_images=false
cleanup_volumes=false
verbose=false
max_containers=0

# Parse command-line options into the global settings above.
# Arguments: the script's command line ("$@"); every option takes
# exactly one value.
# An unknown option, or an option without a value, prints an error and
# terminates the script through usage().
parse_args ()
{
    while [ $# -gt 0 ]; do
	case "$1" in
	    --cleanup-running-hours) cleanup_running_hours="$2" ;;
	    --cleanup-stopped-hours) cleanup_stopped_hours="$2" ;;
	    --cleanup-ssh-agent-hours) cleanup_ssh_agent_hours="$2" ;;
	    --cleanup-images) cleanup_images="$2" ;;
	    --cleanup-volumes) cleanup_volumes="$2" ;;
	    --max-containers) max_containers="$2" ;;
	    --verbose) verbose="$2" ;;
	    *) echo "ERROR: Wrong option: $1"; usage ;;
	esac
	# Without this check a trailing option with no value would make
	# "shift" fail, and "set -e" would then terminate the script
	# without any diagnostic.
	if [ $# -lt 2 ]; then
	    echo "ERROR: Missing value for option: $1"
	    usage
	fi
	shift 2
    done
}

parse_args "$@"

# $verbose holds "true" or "false" and is executed as a command.
if $verbose; then
    set -x
fi

DOCKER="docker"

# Apply a docker action to containers created more than HOURS ago.
#
# Arguments:
#   $1 - HOURS threshold.  "0" disables the cleanup.  A negative value
#        selects dry-run mode with the absolute value as the threshold.
#   $2 - extra options for "docker ps" (may be empty, e.g. "-a")
#   $3 - docker sub-command, with its options, applied to each selected
#        container (e.g. "stop" or "rm -fv")
#   $4 - verb used in the log messages (e.g. "stop", "remove")
#   $5 - headline message
#
# Reads global $DOCKER.
#
# NOTE: this function finishes with "exit", not "return", so it has to
# be run in a subshell; the callers in this script start it in
# background and "wait" for it.
#
# Exit status:
#   0 - cleanup disabled, nothing to do, or every action succeeded
#   1 - at least one action failed, or dry-run found stale containers
do_cleanup_containers ()
{
    local hours="$1"
    local docker_ps_opts="$2"
    local action="$3"
    local action_msg="$4"
    local msg="$5"

    local cleanup_containers=true
    local dryrun_msg=""
    # Act only on containers started by Jenkins jobs.  Set to false to
    # consider every container.
    local only_jenkins_containers=true

    if [ "$hours" -eq "0" ]; then
	exit 0
    elif [ "$hours" -lt "0" ]; then
	hours="${hours#-}"
	cleanup_containers=false
	dryrun_msg=" (DRYRUN)"
    fi

    echo "$msg (more than ${hours}h)${dryrun_msg}"
    echo "Container report before:"
    # $docker_ps_opts is deliberately unquoted: it may be empty or hold
    # several options.
    $DOCKER ps $docker_ps_opts

    local -a rm_containers=()
    local container container_info container_date container_name
    local curdate max_seconds
    curdate=$(date +%s)
    max_seconds=$((hours * 3600))
    for container in $($DOCKER ps $docker_ps_opts --format "{{.ID}}"); do
	# A container can disappear between "docker ps" and this point.
	# Skip it instead of letting "set -e" abort the whole cleanup.
	if ! container_info=$($DOCKER inspect --format "{{.Created}} {{.Name}}" "$container") \
		|| ! container_date=$(date +%s --date="${container_info%% *}"); then
	    echo "WARNING: cannot determine age of container $container -- skipping"
	    continue
	fi
	container_name="${container_info#* }"

	# Compare in seconds.  Comparing truncated whole hours would
	# select a container only once it is at least HOURS+1 old.
	if [ "$((curdate - container_date))" -le "$max_seconds" ]; then
	    continue
	fi

	# Do we want to act on all containers, or only on those
	# started by Jenkins jobs?
	if ${only_jenkins_containers}; then
	    # Containers started by our Jenkins jobs have names that
	    # start with a number.
	    case ${container_name} in
		/[0-9]*) ;;
		*) continue ;;
	    esac
	fi
	rm_containers+=("$container")
    done

    local res
    local status="0"
    if [ ${#rm_containers[@]} != 0 ]; then
	echo "Containers to ${action_msg}: ${rm_containers[*]}"
	if $cleanup_containers; then
	    for container in "${rm_containers[@]}"; do
		echo "Container to ${action_msg}: $container"
		# $action is deliberately unquoted: it may carry options
		# (e.g. "rm -fv").  The command runs in background and
		# its status is collected with "wait", so a failure is
		# recorded instead of terminating the script.
		$DOCKER $action "$container" &
		res=0; wait $! || res=$?
		if [ $res != 0 ]; then
		    echo "WARNING: $DOCKER $action $container -- exit status: $res"
		    status="1"
		fi
	    done
	else
	    echo "DRY_RUN: NOT ACTING ON CONTAINERS"
	    echo "Increasing exit code to indicate stale containers"
	    status="1"
	fi

	echo "Containers report after:"
	$DOCKER ps $docker_ps_opts
    else
	echo "Found no container to ${action_msg}"
    fi

    exit $status
}

# Stop long-running containers, then remove containers that have been
# around for too long.  do_cleanup_containers finishes with "exit", so
# each call runs in a background subshell and its result is collected
# with "wait".
#
# $status is a bit mask.  Each result is normalised to a single bit so
# that an unexpected exit status (e.g. 2 or 127) cannot leak into the
# bits used by the later checks:
#   1 - stopping running containers failed / found stale containers
#   2 - removing stopped containers failed / found stale containers
status=0

res="0"
do_cleanup_containers "$cleanup_running_hours" "" "stop" "stop" "Stopping long-running containers" &
wait $! || res=$?
if [ "$res" != "0" ]; then
    status=$((status|1))
fi

res="0"
do_cleanup_containers "$cleanup_stopped_hours" "-a" "rm -fv" "remove" "Removing containers stopped long ago" &
wait $! || res=$?
if [ "$res" != "0" ]; then
    status=$((status|2))
fi

# Remove dangling volumes when $cleanup_volumes is "true"; otherwise
# only report that nothing is removed.  A failed removal sets bit 4 of
# $status.
if ! $cleanup_volumes; then
    echo "DRY_RUN: NOT REMOVING DANGLING VOLUMES"
else
    # Keep only volumes named like a 64-digit hex hash; this filters
    # out named volumes like host-home and home-$USER.
    mapfile -t rm_volumes < <($DOCKER volume ls -q -f dangling=true \
				  | grep -E '^[a-f0-9]{64}$')

    if (( ${#rm_volumes[@]} > 0 )); then
	echo "Removing dangling volumes"
	for volume in "${rm_volumes[@]}"; do
	    $DOCKER volume rm $volume &
	    res=0
	    wait $! || {
		res=$?
		echo "WARNING: $DOCKER volume rm $volume -- exit status: $res"
		status=$(($status|4))
	    }
	done
    fi
fi

# Print the stamp-file path (without the ".use" suffix) of an image.
#   $1 - stamp directory
#   $2 - image tuple "ID:REPOSITORY:TAG"
# The image reference is everything after the first ":", so a
# repository that contains a registry port ("host:5000/name") is kept
# whole.  "/" and ":" in the reference are replaced with "_".
# NOTE(review): this is presumed to match how docker-wrapper names its
# stamp files -- confirm against docker-wrapper.
image_stamp_path ()
{
    local image="${2#*:}"
    echo "$1/$(echo "$image" | tr "/:" "_")"
}

# Untag and prune unused images when $cleanup_images is "true";
# otherwise only report that nothing is removed.
if $cleanup_images; then
    # See dockerfiles.git/tcwg-base/tcwg-host/docker-wrapper for background
    # on image stamp files.
    stamp_dir=/home/shared/docker

    # Untag and prune images that haven't been used for 3 days or more.
    remove_if_not_used_since=$(($(date +%s) - 3*24*60*60))
    for image_tuple in $($DOCKER images --format "{{.ID}}:{{.Repository}}:{{.Tag}}"); do
	image_id="${image_tuple%%:*}"
	image_stamp=$(image_stamp_path "$stamp_dir" "$image_tuple")
	# Use negative comparison to handle non-existent stamp files.
	if ! [ "$(stat -c %Z "$image_stamp.use" 2>/dev/null)" \
		   -gt $remove_if_not_used_since ] 2>/dev/null; then
	    # Untag the image.  Use $image_id to handle previously-untagged
	    # images and other cases when we have no repo or tag reference.
	    $DOCKER rmi -f "$image_id" 2>/dev/null \
		|| echo "NOTE: Cannot remove $image_tuple"
	fi
    done
    # Prune untagged images.
    $DOCKER image prune -f
else
    echo "DRY_RUN: NOT REMOVING UNUSED IMAGES"
fi

# Workaround for https://issues.jenkins-ci.org/browse/JENKINS-49097:
# matrix jobs leave stale ssh-agent processes behind, and these
# accumulate into hundreds.  This cleanup is unrelated to docker
# containers, but a separate script/job for a one-liner doesn't seem
# to be worth it.
if [ "$cleanup_ssh_agent_hours" -gt "0" ]; then
    killall --older-than ${cleanup_ssh_agent_hours}h -u $USER ssh-agent &
    res=0
    wait $! || res=$?
    # How the killall result is interpreted:
    #   0   - stale processes were killed; nothing to report.
    #   127 - command not found.  The cleanup job must fail so that we
    #         know killall needs to be installed.
    #   1   - in general means that no ssh-agent process was found.
    #         This is OK, unless killall failed for another reason;
    #         assume OK for now.
    # Any other status is ignored.
    case $res in
	0)
	    ;;
	127)
	    echo "WARNING: could not kill stale ssh-agent processes (killall command not found)"
	    echo "Increasing exit code to indicate killall is missing"
	    status=$(($status|16))
	    ;;
	1)
	    echo "WARNING: could not kill stale ssh-agent processes or there was no stale ssh-agent older than ${cleanup_ssh_agent_hours}h"
	    ;;
    esac
fi

# Check if we have more containers than max_containers ("0" disables
# the check).  Exceeding the limit sets bit 32 of $status.
#
# Count with "ps -a -q", which prints one container ID per line.  The
# default table output starts with a header line, which would be
# counted as an extra container.
nb_containers=$($DOCKER ps -a -q | wc -l)

if [ "${max_containers}" -gt 0 ] && [ "${nb_containers}" -gt "${max_containers}" ]; then
    echo "ERROR: Too many containers left after cleanup: ${nb_containers} (max: ${max_containers})"
    status=$((status|32))
fi

# The exit status is the bit mask accumulated by the cleanups above.
exit $status