summaryrefslogtreecommitdiff
path: root/tcwg-cleanup-stale-containers.sh
blob: d5d999d93950f798b149c4ddac6212286aae86fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/bin/bash
#
# Clean up stale docker resources and report what was found:
#   - stop long-running containers and remove old containers,
#   - remove dangling volumes and unused dangling images,
#   - kill stale ssh-agent processes,
#   - check that the number of remaining containers is within a limit.
#
# The exit status is built as a bitmask by the individual steps:
#    1 - the "stop" pass over containers reported a problem
#    2 - the "remove" pass over containers reported a problem
#    4 - dangling volumes were found (dry-run) or could not be removed
#    8 - unused images were found (dry-run) or could not be removed
#   16 - killall is not available
#   32 - more containers are left than --max-containers allows
# Running the script with an unknown option prints the usage text.

# Abort on the first command failure that is not explicitly handled.
set -e

# Print the list of supported command-line options to stdout and
# terminate the script with exit status 1.
usage ()
{
    # The delimiter is quoted: the help text is emitted verbatim, without
    # any expansion.
    cat <<'USAGE_TEXT'
Options:
  --cleanup-running-hours HOURS
  --cleanup-stopped-hours HOURS
	Cleanup running/stopped containers that have been created more
	than HOURS ago.  Setting HOURS to negative values will
	run the cleanup in dry-run mode.  Value "0" disables the cleanup.

  --cleanup-ssh-agent-hours HOURS
	Cleanup stale ssh-agent processes that were started more than
	HOURS ago.  Value "0" disables the cleanup.

  --cleanup-images true/false
	Whether to cleanup untagged images

  --cleanup-volumes true/false
	Whether to cleanup dangling volumes

  --max-containers N
	Check that number of containers after cleanups doesn't exceed N.
	Value "0" disables the check.

  --verbose true/false
	Whether to run in verbose mode
USAGE_TEXT
    exit 1
}

# Defaults, overridable from the command line (see usage).  Negative hour
# values make the container cleanups run in dry-run mode.
cleanup_running_hours="-10"
cleanup_stopped_hours="-240"
cleanup_ssh_agent_hours="-48"
cleanup_images=false
cleanup_volumes=false
verbose=false
max_containers=0

# Every option takes exactly one value: map the option to the variable it
# sets, check that the value is present, then consume both words.
while [ $# -gt 0 ]; do
    case "$1" in
	--cleanup-running-hours) opt_var=cleanup_running_hours ;;
	--cleanup-stopped-hours) opt_var=cleanup_stopped_hours ;;
	--cleanup-ssh-agent-hours) opt_var=cleanup_ssh_agent_hours ;;
	--cleanup-images) opt_var=cleanup_images ;;
	--cleanup-volumes) opt_var=cleanup_volumes ;;
	--max-containers) opt_var=max_containers ;;
	--verbose) opt_var=verbose ;;
	*) echo "ERROR: Wrong option: $1"; usage ;;
    esac
    if [ $# -lt 2 ]; then
	# Without this check a trailing option without value makes
	# "shift" fail, and "set -e" aborts the script without any
	# message.
	echo "ERROR: Missing value for option: $1"
	usage
    fi
    printf -v "$opt_var" '%s' "$2"
    shift 2
done
unset opt_var

# Compare instead of executing "$verbose" as a command, so that only the
# literal value "true" turns tracing on.
if [ "$verbose" = "true" ]; then
    set -x
fi

# Docker command used by all the cleanup steps.
DOCKER="docker"

# Stop or remove containers created more than a given number of hours ago.
#
# This function terminates the calling shell with "exit", so it has to be
# run in a subshell or as a background job whose status is collected.
#
# Arguments:
#   $1 - age threshold in hours.  "0" disables the cleanup; a negative
#        value -N only reports containers older than N hours (dry-run).
#   $2 - extra options for "docker ps" selecting the candidate containers
#        (e.g., "-a"); may be empty.
#   $3 - docker sub-command, with its options, applied to every stale
#        container (e.g., "stop" or "rm -fv").
#   $4 - verb used in the log messages (e.g., "stop", "remove").
#   $5 - headline printed before the report.
# Globals:
#   DOCKER - docker command to run (read).
# Exit status:
#   0 - cleanup disabled, nothing to do, or all actions succeeded;
#   1 - an action failed, or stale containers were found in dry-run mode.
do_cleanup_containers ()
{
    local hours="$1"
    local docker_ps_opts="$2"
    local action="$3"
    local action_msg="$4"
    local msg="$5"

    local cleanup_containers=true
    local dryrun_msg=""
    local only_jenkins_containers=true

    if [ "$hours" -eq 0 ]; then
	exit 0
    elif [ "$hours" -lt 0 ]; then
	hours=$((-hours))
	cleanup_containers=false
	dryrun_msg=" (DRYRUN)"
    fi

    echo "$msg (more than ${hours}h)${dryrun_msg}"
    echo "Container report before:"
    # $docker_ps_opts is unquoted on purpose: it holds zero or more
    # options and must disappear completely when empty.
    $DOCKER ps $docker_ps_opts

    local -a rm_containers=()
    local container container_name container_created container_date
    local curdate max_age_seconds
    curdate=$(date +%s)
    max_age_seconds=$((hours * 3600))

    for container in $($DOCKER ps $docker_ps_opts --format "{{.ID}}"); do
	# The container may have vanished since "docker ps".  Skip it
	# instead of feeding an empty string to "date", which would be
	# read as today's midnight.
	if ! container_created=$($DOCKER inspect --format "{{.Created}}" "$container") \
		|| [ -z "$container_created" ] \
		|| ! container_date=$(date +%s --date="$container_created"); then
	    echo "WARNING: cannot determine creation date of container $container -- skipping"
	    continue
	fi

	# Compare in seconds.  With whole-hour division a container that
	# is, e.g., 10h59m old would not count as "more than 10h" old.
	if [ "$((curdate - container_date))" -le "$max_age_seconds" ]; then
	    continue
	fi

	# Do we want to remove all containers, or only those
	# started by Jenkins jobs?
	if $only_jenkins_containers; then
	    if ! container_name=$($DOCKER inspect --format "{{.Name}}" "$container"); then
		echo "WARNING: cannot determine name of container $container -- skipping"
		continue
	    fi
	    # Containers started by our Jenkins jobs start with a
	    # number
	    case "$container_name" in
		/[0-9]*) ;;
		*) continue ;;
	    esac
	fi
	rm_containers+=("$container")
    done

    if [ "${#rm_containers[@]}" -eq 0 ]; then
	echo "Found no container to ${action_msg}"
	exit 0
    fi

    local res
    local status=0
    echo "Containers to ${action_msg}: ${rm_containers[*]}"
    if $cleanup_containers; then
	for container in "${rm_containers[@]}"; do
	    echo "Container to ${action_msg}: $container"
	    # $action is unquoted on purpose: it may carry options
	    # ("rm -fv").  "wait ... || res=$?" records a failure
	    # without tripping "set -e".
	    $DOCKER $action "$container" &
	    res=0; wait $! || res=$?
	    if [ "$res" -ne 0 ]; then
		echo "WARNING: $DOCKER $action $container -- exit status: $res"
		status=1
	    fi
	done
    else
	echo "DRY_RUN: NOT ACTING ON CONTAINERS"
	echo "Increasing exit code to indicate stale containers"
	status=1
    fi

    echo "Containers report after:"
    $DOCKER ps $docker_ps_opts

    exit "$status"
}

# Stop containers that have been running for too long.
# do_cleanup_containers terminates its shell with "exit", hence the
# background job; "wait ... || res=$?" collects the result without
# tripping "set -e".
res="0"
do_cleanup_containers "$cleanup_running_hours" "" "stop" "stop" "Stopping long-running containers" &
wait $! || res=$?
# Any failure sets bit 1, whatever the actual exit status was, so that
# unexpected values (e.g., 128+N after a signal) cannot leak into the bits
# used by the other checks.
status=0
if [ "$res" -ne 0 ]; then
    status=1
fi

# Remove old containers; "-a" makes "docker ps" list all of them.
res="0"
do_cleanup_containers "$cleanup_stopped_hours" "-a" "rm -fv" "remove" "Removing containers stopped long ago" &
wait $! || res=$?
# Same normalization as above, on bit 2.
if [ "$res" -ne 0 ]; then
    status=$((status | 2))
fi

# Collect the dangling volumes to remove.  Named volumes like host-home
# and home-$USER are filtered out; only volumes whose name is 64 hex
# digits, like a hash, are kept.
volume_hash_re='^[a-f0-9]{64}$'
rm_volumes=()
for volume in $($DOCKER volume ls -q -f dangling=true); do
    if [[ "$volume" =~ $volume_hash_re ]]; then
	rm_volumes+=("$volume")
    fi
done

if [ "${#rm_volumes[@]}" -gt 0 ]; then
    echo "Removing dangling volumes"
    if $cleanup_volumes; then
	for volume in "${rm_volumes[@]}"; do
	    # Background job plus "wait ... ||": a failed removal is
	    # recorded in bit 4 instead of aborting under "set -e".
	    $DOCKER volume rm "$volume" &
	    res=0
	    wait $! || res=$?
	    if [ "$res" -ne 0 ]; then
		echo "WARNING: $DOCKER volume rm $volume -- exit status: $res"
		status=$((status | 4))
	    fi
	done
    else
	# Dry-run: only flag that dangling volumes exist.
	echo "Increasing exit code to indicate dangling volumes"
	status=$((status | 4))
	echo "DRY_RUN: NOT REMOVING DANGLING VOLUMES"
    fi
fi

# Collect the dangling images that are not referenced by any container,
# running or not.
rm_images=()
for image in $($DOCKER images -q -f dangling=true); do
    if $DOCKER ps -a --format "{{.Image}}" | grep -q "$image"; then
	# Still in use by a container: keep it.
	continue
    fi
    rm_images+=("$image")
done

if [ "${#rm_images[@]}" -gt 0 ]; then
    echo "Removing unused images"
    if $cleanup_images; then
	for image in "${rm_images[@]}"; do
	    # Background job plus "wait ... ||": a failed removal is
	    # recorded in bit 8 instead of aborting under "set -e".
	    $DOCKER rmi "$image" &
	    res=0
	    wait $! || res=$?
	    if [ "$res" -ne 0 ]; then
		echo "WARNING: $DOCKER rmi $image -- exit status: $res"
		status=$((status | 8))
	    fi
	done
    else
	# Dry-run: only flag that unused images exist.
	echo "Increasing exit code to indicate unused images"
	status=$((status | 8))
	echo "DRY_RUN: NOT REMOVING UNTAGGED IMAGES"
    fi
fi

# Workaround for https://issues.jenkins-ci.org/browse/JENKINS-49097:
# matrix jobs leave stale ssh-agent processes behind, and these accumulate
# into hundreds.  This cleanup has nothing to do with docker containers,
# but a separate script/job for a one-liner doesn't seem worth it.
if [ "$cleanup_ssh_agent_hours" -gt 0 ]; then
    killall --older-than "${cleanup_ssh_agent_hours}h" -u "$USER" ssh-agent &
    res=0
    wait $! || res=$?
    # killall can fail for several reasons; two of them are told apart:
    #   127 - command not found: the cleanup job has to fail, so that we
    #         know killall needs to be installed;
    #     1 - in general, no ssh-agent process was found: this is OK,
    #         unless killall failed for another reason (assumed OK for
    #         now).
    # Any other status, including success, needs no handling.
    case "$res" in
	127)
	    echo "WARNING: could not kill stale ssh-agent processes (killall command not found)"
	    echo "Increasing exit code to indicate killall is missing"
	    status=$((status | 16))
	    ;;
	1)
	    echo "WARNING: could not kill stale ssh-agent processes or there was no stale ssh-agent older than ${cleanup_ssh_agent_hours}h"
	    ;;
    esac
fi

# Check if we have more containers than max_containers.
# "-q" prints one container ID per line without the table header, so the
# line count is the exact number of containers.
nb_containers=$($DOCKER ps -a -q | wc -l)

# A limit of 0 (or an empty one) disables the check.
if [ "${max_containers:-0}" -gt 0 ] && [ "$nb_containers" -gt "$max_containers" ]; then
    echo "ERROR: Too many containers left after cleanup: ${nb_containers} (max: ${max_containers})"
    status=$((status | 32))
fi

# Exit with the accumulated status.  The steps above OR their own bit into
# it: 4 - dangling volumes, 8 - unused images, 16 - killall not found,
# 32 - too many containers; the lower bits come from the two
# do_cleanup_containers passes.
exit $status