blob: e160337ce8755ddea6bb8a586d57b88b620169fd (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
#!/bin/bash
set -e
# Print the summary of supported command-line options on stdout and
# terminate the script with exit status 1.
usage ()
{
    printf '%s\n' \
        'Options:' \
        '--cleanup-running-hours HOURS' \
        '--cleanup-stopped-hours HOURS' \
        'Cleanup running/stopped containers that have been created more' \
        'than HOURS ago. Setting HOURS to negative values will' \
        'run the cleanup in dry-run mode. Value "0" disables the cleanup.' \
        '--cleanup-ssh-agent-hours HOURS' \
        'Cleanup stale ssh-agent processes that were started more than' \
        'HOURS ago. Value "0" disables the cleanup.' \
        '--cleanup-images true/false' \
        'Whether to cleanup untagged images' \
        '--cleanup-volumes true/false' \
        'Whether to cleanup dangling volumes' \
        '--max-containers N' \
        "Check that number of containers after cleanups doesn't exceed N." \
        'Value "0" disables the check.' \
        '--verbose true/false' \
        'Whether to run in verbose mode'
    exit 1
}
# Default settings; each one can be overridden from the command line.
# For the two container cleanups a negative HOURS value selects dry-run
# mode and "0" disables the cleanup.  The ssh-agent cleanup only runs
# when its HOURS value is greater than zero.
cleanup_running_hours="-10"
cleanup_stopped_hours="-240"
cleanup_ssh_agent_hours="-48"
cleanup_images=false
cleanup_volumes=false
verbose=false
max_containers=0

# Parse "--option VALUE" pairs into the global settings above.
#
# Arguments: the script's command-line arguments.
# Globals written: cleanup_running_hours, cleanup_stopped_hours,
#   cleanup_ssh_agent_hours, cleanup_images, cleanup_volumes,
#   max_containers, verbose.
# An unknown option, or a known option that is not followed by a value,
# prints an error and calls usage (which terminates the script).
parse_args ()
{
    local opt
    while [ $# -gt 0 ]; do
        opt="$1"
        case "$opt" in
            --cleanup-running-hours|--cleanup-stopped-hours|\
            --cleanup-ssh-agent-hours|--cleanup-images|--cleanup-volumes|\
            --max-containers|--verbose)
                # Without this check a trailing option with no value made
                # the second "shift" fail, and "set -e" then terminated
                # the script without any diagnostic.
                if [ $# -lt 2 ]; then
                    echo "ERROR: Missing value for option: $opt"
                    usage
                fi
                ;;
            *)
                echo "ERROR: Wrong option: $opt"
                usage
                ;;
        esac
        case "$opt" in
            --cleanup-running-hours) cleanup_running_hours="$2" ;;
            --cleanup-stopped-hours) cleanup_stopped_hours="$2" ;;
            --cleanup-ssh-agent-hours) cleanup_ssh_agent_hours="$2" ;;
            --cleanup-images) cleanup_images="$2" ;;
            --cleanup-volumes) cleanup_volumes="$2" ;;
            --max-containers) max_containers="$2" ;;
            --verbose) verbose="$2" ;;
        esac
        shift 2
    done
}

parse_args "$@"

# Compare against the literal "true" instead of executing the option value
# as a command: an empty value would otherwise count as success and turn
# tracing on, and arbitrary values would be run as programs.
if [ "$verbose" = "true" ]; then
    set -x
fi

# Docker client command used by the cleanups below.
DOCKER="docker"
# Apply a docker action (e.g. stop or remove) to containers that were
# created more than a given number of hours ago.
#
# Arguments:
#   $1 - HOURS threshold.  "0" disables the cleanup; a negative value runs
#        in dry-run mode, using the absolute value as the threshold.
#   $2 - extra options for "$DOCKER ps"; word-split on purpose (e.g. "-a").
#   $3 - docker sub-command applied to every selected container;
#        word-split on purpose (e.g. "rm -fv").
#   $4 - verb used in log messages (e.g. "stop", "remove").
#   $5 - headline printed before the container report.
#
# Only containers whose name starts with a digit are selected (see
# only_jenkins_containers below).
#
# NOTE: this function finishes with "exit", not "return", so it has to be
# run in a subshell or as a background job.
# Exit status: 0 when nothing was stale or every action succeeded;
#   1 when an action failed, when a creation date could not be parsed, or
#   when stale containers were found in dry-run mode.
do_cleanup_containers ()
{
    local hours="$1"
    local docker_ps_opts="$2"
    local action="$3"
    local action_msg="$4"
    local msg="$5"
    local cleanup_containers=true
    local dryrun_msg=""
    local only_jenkins_containers=true

    if [ "$hours" -eq "0" ]; then
        exit 0
    elif [ "$hours" -lt "0" ]; then
        hours="$((0-hours))"
        cleanup_containers=false
        dryrun_msg=" (DRYRUN)"
    fi

    echo "$msg (more than ${hours}h)${dryrun_msg}"
    echo "Container report before:"
    $DOCKER ps $docker_ps_opts

    local -a rm_containers=()
    local status="0"
    local res
    local curdate container info created name created_seconds age_seconds
    curdate=$(date +%s)

    for container in $($DOCKER ps $docker_ps_opts --format "{{.ID}}"); do
        # Fetch creation date and name in a single call.  A container may
        # disappear between "ps" and "inspect"; skip it instead of letting
        # an empty date (which "date" would read as today's midnight) or
        # "set -e" decide the outcome.
        if ! info=$($DOCKER inspect --format "{{.Created}} {{.Name}}" "$container"); then
            echo "NOTE: Cannot inspect container $container, skipping"
            continue
        fi
        created="${info%% *}"
        name="${info#* }"

        if [ -z "$created" ] \
            || ! created_seconds=$(date +%s --date="$created"); then
            echo "WARNING: Cannot parse creation date '$created' of container $container"
            status="1"
            continue
        fi

        age_seconds=$((curdate-created_seconds))
        if [ "$((age_seconds/3600))" -le "$hours" ]; then
            continue
        fi

        # Do we want to remove all containers, or only those started by
        # Jenkins jobs?  Containers started by our Jenkins jobs have names
        # that start with a number.
        if ${only_jenkins_containers}; then
            case "$name" in
                /[0-9]*) ;;
                *) continue ;;
            esac
        fi

        rm_containers+=("$container")
    done

    if [ ${#rm_containers[@]} != 0 ]; then
        echo "Containers to ${action_msg}: ${rm_containers[*]}"
        if $cleanup_containers; then
            for container in "${rm_containers[@]}"; do
                echo "Container to ${action_msg}: $container"
                # $action is deliberately unquoted: it may carry options.
                $DOCKER $action "$container" &
                res=0; wait $! || res=$?
                if [ $res != 0 ]; then
                    echo "WARNING: $DOCKER $action $container -- exit status: $res"
                    status="1"
                fi
            done
        else
            echo "DRY_RUN: NOT ACTING ON CONTAINERS"
            echo "Increasing exit code to indicate stale containers"
            status="1"
        fi

        echo "Containers report after:"
        $DOCKER ps $docker_ps_opts
    else
        echo "Found no container to ${action_msg}"
    fi

    exit $status
}
# Run both container cleanups and record their outcome in the exit-status
# bit mask: bit value 1 for the running-containers cleanup, bit value 2 for
# the stopped-containers cleanup.
#
# do_cleanup_containers finishes via "exit", so each call is started as a
# background job and reaped with "wait".  Presumably this form was also
# chosen so that "set -e" stays effective inside the function, which it
# would not be if the call itself were the left-hand side of "||".
#
# The job's exit status is reduced to success/failure before it is merged:
# a status other than 0 or 1 (e.g. 127) would otherwise spill into the bits
# used by the later cleanup steps.
status=0

res="0"
do_cleanup_containers "$cleanup_running_hours" "" "stop" "stop" "Stopping long-running containers" &
wait $! || res=$?
if [ "$res" != "0" ]; then
    status=$((status|1))
fi

res="0"
do_cleanup_containers "$cleanup_stopped_hours" "-a" "rm -fv" "remove" "Removing containers stopped long ago" &
wait $! || res=$?
if [ "$res" != "0" ]; then
    status=$((status|2))
fi
# Remove dangling docker volumes when --cleanup-volumes is "true".
#
# The flag is compared against the literal "true" rather than executed as a
# command: executing it meant that an empty value counted as success and
# silently enabled this destructive cleanup.
# A failed removal adds bit value 4 to the exit-status bit mask.
if [ "$cleanup_volumes" = "true" ]; then
    # Filter-out named volumes like host-home and home-$USER.  Leave only
    # volumes named like a 64-digit hex hash.
    mapfile -t rm_volumes < <($DOCKER volume ls -q -f dangling=true \
        | grep "^[a-f0-9]\{64\}\$")

    if [ ${#rm_volumes[@]} != 0 ]; then
        echo "Removing dangling volumes"
        for volume in "${rm_volumes[@]}"; do
            $DOCKER volume rm "$volume" &
            res=0; wait $! || res=$?
            if [ $res != 0 ]; then
                echo "WARNING: $DOCKER volume rm $volume -- exit status: $res"
                status=$((status|4))
            fi
        done
    fi
else
    echo "DRY_RUN: NOT REMOVING DANGLING VOLUMES"
fi
# Untag and prune docker images when --cleanup-images is "true".
#
# The flag is compared against the literal "true" rather than executed as a
# command: executing it meant that an empty value counted as success and
# silently enabled this destructive cleanup.
if [ "$cleanup_images" = "true" ]; then
    # See dockerfiles.git/tcwg-base/tcwg-host/docker-wrapper for background
    # on image stamp files.
    stamp_dir=/home/shared/docker

    # Untag and prune images that haven't been used for 3 days or more.
    # The cut-off is the same for every image, so compute it once.
    remove_if_not_used_since=$(($(date +%s) - 3*24*60*60))

    for image_tuple in $($DOCKER images --format "{{.ID}}:{{.Repository}}:{{.Tag}}"); do
        # Split on the first colon only: the repository part may itself
        # contain a colon (registry host:port), which a fixed
        # "cut -d: -f 2,3" would truncate.
        image_id="${image_tuple%%:*}"
        image="${image_tuple#*:}"
        # NOTE(review): assumes stamp files are named after the full
        # "repository:tag" reference with "/" and ":" replaced by "_" --
        # confirm against docker-wrapper.
        image_stamp="$stamp_dir/$(echo "$image" | tr "/:" "_")"

        # Use negative comparison to handle non-existent stamp files.
        if ! [ "$(stat -c %Z "$image_stamp.use" 2>/dev/null)" \
                   -gt "$remove_if_not_used_since" ] 2>/dev/null; then
            # Untag the image.  Use $image_id to handle previously-untagged
            # images and other cases when we have no repo or tag reference.
            $DOCKER rmi -f "$image_id" 2>/dev/null \
                || echo "NOTE: Cannot remove $image_tuple"
        fi
    done

    # Prune untagged images.
    $DOCKER image prune -f
else
    echo "DRY_RUN: NOT REMOVING UNUSED IMAGES"
fi
# Workaround for https://issues.jenkins-ci.org/browse/JENKINS-49097:
# matrix jobs leave stale ssh-agent processes behind, and these pile up
# into the hundreds.  The cleanup is unrelated to docker containers, but a
# separate script/job for a one-liner does not seem worth it.
if [ "$cleanup_ssh_agent_hours" -gt "0" ]; then
    res=0
    killall --older-than ${cleanup_ssh_agent_hours}h -u $USER ssh-agent &
    wait $! || res=$?

    # killall can fail for several reasons:
    #   127: command not found -- the cleanup job should fail so that we
    #        notice that killall has to be installed;
    #   1:   in general, no matching ssh-agent process was found -- this
    #        is fine unless killall failed for some other reason; assume
    #        OK for now.
    # Any other status (including success) needs no action.
    case $res in
        127)
            echo "WARNING: could not kill stale ssh-agent processes (killall command not found)"
            echo "Increasing exit code to indicate killall is missing"
            status=$(($status|16))
            ;;
        1)
            echo "WARNING: could not kill stale ssh-agent processes or there was no stale ssh-agent older than ${cleanup_ssh_agent_hours}h"
            ;;
    esac
fi
# Check if we have more containers than max_containers.
#
# Globals read:    max_containers ("0" disables the check), DOCKER.
# Globals written: status (bit value 32 is added when the limit is exceeded).
#
# Containers are counted with "ps -a -q", which prints one ID per line and
# no header.  Counting the lines of plain "ps -a" output included the
# header line and therefore reported one container too many.
check_max_containers ()
{
    local nb_containers

    # Nothing to do (and no docker call needed) when the check is disabled.
    [ "${max_containers}" -gt 0 ] || return 0

    nb_containers=$($DOCKER ps -a -q | wc -l)
    if [ "${nb_containers}" -gt "${max_containers}" ]; then
        echo "ERROR: Too many containers left after cleanup: ${nb_containers} (max: ${max_containers})"
        status=$((status|32))
    fi
}

check_max_containers
# Exit with the status accumulated by the steps above:
#   - the exit status of the running-containers cleanup,
#   - OR-ed with twice the exit status of the stopped-containers cleanup,
#   - OR-ed with 4 if removing a dangling volume failed,
#   - OR-ed with 16 if killall was not found for the ssh-agent cleanup,
#   - OR-ed with 32 if more than max_containers containers are left.
exit $status
|