tcwg-cleanup-stale-containers.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213

#!/bin/bash

set -e

# Print the command-line help on stdout and terminate the script with
# exit status 1.
usage ()
{
    # One array element per output line; description lines start with
    # a tab.
    local -a help_text=(
	'Options:'
	'  --cleanup-running-hours HOURS'
	'  --cleanup-stopped-hours HOURS'
	$'\tCleanup running/stopped containers that have been created more'
	$'\tthan HOURS ago.  Setting HOURS to negative values will'
	$'\trun the cleanup in dry-run mode.  Value "0" disables the cleanup.'
	''
	'  --cleanup-ssh-agent-hours HOURS'
	$'\tCleanup stale ssh-agent processes that were started more than'
	$'\tHOURS ago.  Value "0" disables the cleanup.'
	''
	'  --cleanup-images true/false'
	$'\tWhether to cleanup untagged images'
	''
	'  --cleanup-volumes true/false'
	$'\tWhether to cleanup dangling volumes'
	''
	'  --max-containers N'
	$'\tCheck that number of containers after cleanups doesn\'t exceed N.'
	$'\tValue "0" disables the check.'
	''
	'  --verbose true/false'
	$'\tWhether to run in verbose mode'
    )

    printf '%s\n' "${help_text[@]}"
    exit 1
}

# Default settings.  The container cleanups default to dry-run mode
# (negative HOURS); the ssh-agent cleanup only runs for positive HOURS.
cleanup_running_hours="-10"
cleanup_stopped_hours="-240"
cleanup_ssh_agent_hours="-48"
cleanup_images=false
cleanup_volumes=false
verbose=false
max_containers=0

# Report a command-line error, print the usage text and terminate.
# usage() exits with status 1 on its own; the explicit exit is only a
# safeguard.
option_error ()
{
    echo "ERROR: $*"
    usage
    exit 1
}

# Check that option $1 is followed by an integer value ($2).
require_integer ()
{
    local integer_re='^[-+]?[0-9]+$'

    if [ $# -lt 2 ]; then
	option_error "Option $1 requires a value"
    fi
    if ! [[ "$2" =~ $integer_re ]]; then
	option_error "Option $1 expects an integer value, got: $2"
    fi
}

# Check that option $1 is followed by the value "true" or "false" ($2).
# These values are later executed as commands ("if $verbose; then"),
# so nothing else may be accepted.
require_boolean ()
{
    if [ $# -lt 2 ]; then
	option_error "Option $1 requires a value"
    fi
    case "$2" in
	true|false) ;;
	*) option_error "Option $1 expects true or false, got: $2" ;;
    esac
}

# Parse the command line ("$@") into the global settings above.
# Every option takes exactly one value; an unknown option, a missing
# value or a malformed value ends the script via option_error.
parse_options ()
{
    while [ $# -gt 0 ]; do
	case "$1" in
	    --cleanup-running-hours)
		require_integer "$@"; cleanup_running_hours="$2" ;;
	    --cleanup-stopped-hours)
		require_integer "$@"; cleanup_stopped_hours="$2" ;;
	    --cleanup-ssh-agent-hours)
		require_integer "$@"; cleanup_ssh_agent_hours="$2" ;;
	    --cleanup-images)
		require_boolean "$@"; cleanup_images="$2" ;;
	    --cleanup-volumes)
		require_boolean "$@"; cleanup_volumes="$2" ;;
	    --max-containers)
		require_integer "$@"; max_containers="$2" ;;
	    --verbose)
		require_boolean "$@"; verbose="$2" ;;
	    *)
		option_error "Wrong option: $1" ;;
	esac
	# The require_* helpers guarantee that the value is present.
	shift 2
    done
}

parse_options "$@"
# All options have been consumed; leave no positional parameters behind.
set --

if $verbose; then
    set -x
fi

# Docker client command used for all docker invocations below.
DOCKER="docker"

# Stop or remove containers that were created more than HOURS ago.
#
# Arguments:
#   $1 - HOURS: age threshold in whole hours.  "0" disables the cleanup;
#        a negative value selects dry-run mode with threshold -HOURS.
#   $2 - extra options for "$DOCKER ps" (e.g. "-a"); may be empty.
#   $3 - docker sub-command, with its options, that is applied to every
#        stale container (e.g. "stop" or "rm -fv").
#
# Only containers whose name starts with a digit are touched.
#
# NOTE: this function terminates with "exit", so it has to be run in a
# subshell (e.g. in the background).  Exit status: 0 when there was
# nothing to do or every action succeeded; 1 when stale containers were
# found in dry-run mode or at least one action failed.
do_cleanup_containers ()
{
    local hours="$1"
    local docker_ps_opts="$2"
    local action="$3"
    local cleanup_containers=true
    local only_jenkins_containers=true

    if [ "$hours" -eq "0" ]; then
	exit 0
    elif [ "$hours" -lt "0" ]; then
	hours="$((0-hours))"
	cleanup_containers=false
    fi

    echo "Container report before:"
    # $docker_ps_opts is deliberately unquoted: it may be empty or hold
    # several options.
    $DOCKER ps $docker_ps_opts

    local -a rm_containers=()
    local container container_info container_name
    local container_date container_seconds
    local curdate
    curdate=$(date +%s)
    for container in $($DOCKER ps $docker_ps_opts --format "{{.ID}}"); do
	# A container may disappear between "ps" and "inspect".  An empty
	# creation time must not reach "date --date", which would not
	# fail but yield a bogus timestamp, so skip such containers.
	if ! container_info=$($DOCKER inspect --format "{{.Created}} {{.Name}}" "$container") \
		|| [ -z "${container_info%% *}" ] \
		|| ! container_date=$(date +%s --date="${container_info%% *}"); then
	    echo "WARNING: cannot determine age of container $container -- skipping it"
	    continue
	fi
	container_name="${container_info#* }"
	container_seconds=$((curdate-container_date))

	if [ "$((container_seconds/3600))" -le "$hours" ]; then
	    continue
	fi

	# Do we want to remove all containers, or only those
	# started by Jenkins jobs?
	if ${only_jenkins_containers}; then
	    # Containers started by our Jenkins jobs start with a
	    # number
	    case ${container_name} in
		/[0-9]*)
		    ;;
		*)
		    continue
		    ;;
	    esac
	fi
	rm_containers+=("$container")
    done

    local res
    local status="0"
    if [ ${#rm_containers[@]} != 0 ]; then
	echo "Removing containers: ${rm_containers[*]}"
	if $cleanup_containers; then
	    for container in "${rm_containers[@]}"; do
		echo "Removing container $container"
		# $action is deliberately unquoted: it may carry options.
		# Run in the background and "wait" so that a failure is
		# recorded in $res instead of tripping "set -e".
		$DOCKER $action "$container" &
		res=0; wait $! || res=$?
		if [ $res != 0 ]; then
		    echo "WARNING: $DOCKER $action $container -- exit status: $res"
		    status="1"
		fi
	    done
	else
	    echo "DRY_RUN: NOT REMOVING CONTAINERS"
	    echo "Increasing exit code to indicate stale containers"
	    status="1"
	fi

	echo "Containers report after:"
	$DOCKER ps $docker_ps_opts
    else
	echo "Found no container to remove"
    fi

    exit $status
}

# Run the two container cleanup passes.  do_cleanup_containers ends
# with "exit", so each pass runs as a background job and its exit
# status is collected with "wait" ("|| res=$?" keeps "set -e" from
# aborting the script).
#
# $status is a bit mask: bit 1 is set when the pass over running
# containers failed, bit 2 when the pass over all containers failed.
# A pass can end with a status other than 0 or 1 (e.g. when a command
# fails under "set -e"), so the status is reduced to "failed or not"
# before it is merged; otherwise it would set bits of unrelated checks.
res="0"
do_cleanup_containers "$cleanup_running_hours" "" "stop" &
wait $! || res=$?
status=$(( res != 0 ? 1 : 0 ))

res="0"
do_cleanup_containers "$cleanup_stopped_hours" "-a" "rm -fv" &
wait $! || res=$?
status=$(( status | (res != 0 ? 2 : 0) ))

# Print, one per line, those of the given volume names that consist of
# exactly 64 lowercase hexadecimal digits.  Each name is checked
# individually, so the result does not depend on how many names are
# given.
filter_anonymous_volumes ()
{
    local name
    local hash_re='^[a-f0-9]{64}$'

    for name in "$@"; do
	if [[ "$name" =~ $hash_re ]]; then
	    echo "$name"
	fi
    done
}

rm_volumes=($($DOCKER volume ls -q -f dangling=true))
# Filter-out named volumes like host-home and home-$USER.  Leave only volumes
# named like a sha1 hash.
rm_volumes=($(filter_anonymous_volumes "${rm_volumes[@]}"))

if [ ${#rm_volumes[@]} != 0 ]; then
    echo "Removing dangling volumes"
    if $cleanup_volumes; then
	for volume in "${rm_volumes[@]}"; do
	    # Background + "wait" records a failure in $res instead of
	    # tripping "set -e".
	    $DOCKER volume rm "$volume" &
	    res=0; wait $! || res=$?
	    if [ $res != 0 ]; then
		echo "WARNING: $DOCKER volume rm $volume -- exit status: $res"
		status=$((status|4))
	    fi
	done
    else
	echo "Increasing exit code to indicate dangling volumes"
	status=$((status|4))
	echo "DRY_RUN: NOT REMOVING DANGLING VOLUMES"
    fi
fi

# Collect the dangling images that do not show up in the image column
# of any container listed by "ps -a".
rm_images=()
for image in $($DOCKER images -q -f dangling=true); do
    if $DOCKER ps -a --format "{{.Image}}" | grep -q $image; then
	continue
    fi
    rm_images+=($image)
done

if [ ${#rm_images[@]} != 0 ]; then
    echo "Removing unused images"
    if ! $cleanup_images; then
	# Dry run: only flag the unused images in the exit status.
	echo "Increasing exit code to indicate unused images"
	status=$(($status|8))
	echo "DRY_RUN: NOT REMOVING UNTAGGED IMAGES"
    else
	for image in "${rm_images[@]}"; do
	    res=0
	    # Background + "wait" records a failure in $res instead of
	    # tripping "set -e".
	    $DOCKER rmi $image &
	    wait $! || res=$?
	    if [ $res != 0 ]; then
		echo "WARNING: $DOCKER rmi $image -- exit status: $res"
		status=$(($status|8))
	    fi
	done
    fi
fi

# Workaround for https://issues.jenkins-ci.org/browse/JENKINS-49097:
# matrix jobs leave stale ssh-agent processes behind, and these pile up
# into the hundreds.  This cleanup has nothing to do with docker
# containers, but creating a separate script/job for a one-liner does
# not seem worth it.
# The status of killall is captured in $res only so that a failure does
# not trip "set -e"; it does not influence $status.
if [ "$cleanup_ssh_agent_hours" -gt "0" ]; then
    res=0
    killall --older-than "${cleanup_ssh_agent_hours}h" -u $USER ssh-agent &
    wait $! || res=$?
fi

# Check if we have more containers than max_containers.
#
# Sets the global nb_containers to the number of existing containers
# (running or not).  When max_containers is greater than 0 and that
# number exceeds it, an error is printed and bit 16 is set in $status.
check_max_containers ()
{
    # "-q" prints container IDs only.  The default table output starts
    # with a header line, which "wc -l" would count as a container.
    nb_containers=$($DOCKER ps -a -q | wc -l)

    if [ "${max_containers}" -gt 0 ] \
	   && [ "${nb_containers}" -gt "${max_containers}" ]; then
	echo "ERROR: Too many containers left after cleanup: ${nb_containers} (max: ${max_containers})"
	status=$((status|16))
    fi
}

check_max_containers

# The exit status is the bit mask accumulated above (assuming each
# container cleanup pass ended with status 0 or 1):
#    1 - cleanup of running containers: stale containers found in
#        dry-run mode, or the cleanup failed
#    2 - same, for the cleanup pass run with "-a" (all containers)
#    4 - dangling volumes found in dry-run mode, or a removal failed
#    8 - unused images found in dry-run mode, or a removal failed
#   16 - more than max_containers containers left after the cleanups
exit $status