diff options
Diffstat (limited to 'tcwg-cleanup-stale-results.sh')
-rwxr-xr-x | tcwg-cleanup-stale-results.sh | 193 |
1 files changed, 123 insertions, 70 deletions
# NOTE(review): this is the post-change content of a hunk that starts at
# line 8 of tcwg-cleanup-stale-results.sh.  The script preamble (shebang,
# shell options, helper sourcing) sits above it and is not visible here.
# convert_args_to_variables, assert_with_msg and get_git_history are not
# defined in this fragment -- presumably they come from a helper file
# sourced in the preamble; confirm.

convert_args_to_variables "$@"

# Settings; each keeps a value that is already set when this point is reached.
#   results_top    directory searched for "*.git" dirs and the "annex" tree
#   verbose        "true" turns on shell tracing
#   cleanup_gc     "false" skips git gc, "dryrun" only prints the commands
#   cleanup_annex  "false" skips annex cleanup, "dryrun" only prints removals
results_top="${results_top-$HOME/base-artifacts}"
verbose="${verbose-false}"
cleanup_gc="${cleanup_gc-true}"
cleanup_annex="${cleanup_annex-true}"

# "<current_host>:" prefixes are stripped from referenced annex entries.
current_host="bkp-01.tcwglab"

if $verbose; then
    set -x
fi

# Use the caller's WORKSPACE when provided, otherwise a throw-away directory
# that is deleted at the end of the script.
WORKSPACE="${WORKSPACE-}"
if [ "$WORKSPACE" = "" ]; then
    WORKSPACE=$(mktemp -d) || { echo "ERROR: mktemp failed" >&2; exit 1; }
    workspace_is_temp=true
else
    workspace_is_temp=false
fi

# Print, one per line, every "*.git" directory located exactly two levels
# below $results_top.
list_results_git_dirs()
{
    find "$results_top" -mindepth 2 -maxdepth 2 -type d -name '*.git'
}

# Run "git gc" in every repository reported by list_results_git_dirs.
# With cleanup_gc=dryrun the commands are printed instead of executed.
perform_cleanup_gc()
{
    local gitdir

    ### CLEANUP THE GIT REPOSITORIES
    echo "=== CLEANUP THE GIT REPOSITORIES"
    while IFS= read -r gitdir; do
        if [ "$cleanup_gc" = "dryrun" ]; then
            echo "DRYRUN: git -C $gitdir gc"
        else
            echo "# git -C $gitdir gc"
            git -C "$gitdir" gc
        fi
    done < <(list_results_git_dirs)
}

# Remove files under $results_top/annex that are neither referenced by any
# remote branch of the tcwg_bmk-* repositories nor modified within the last
# 30 days.  With cleanup_annex=dryrun the removals are only printed.
# Work files (list_annex.*.txt) are written to $WORKSPACE; the temporary
# "base-artifacts" clone is created in the current directory, which the
# script has already changed to $WORKSPACE.
# Returns non-zero, before deleting anything, when a repository cannot be
# cloned or a branch cannot be checked out.
perform_cleanup_annex()
{
    local existing_annex_file="$WORKSPACE/list_annex.existing.txt"
    local used_annex_file="$WORKSPACE/list_annex.used.txt"
    local recent_annex_file="$WORKSPACE/list_annex.recent.txt"
    local missing_annex="$WORKSPACE/list_annex.referenced_but_not_exist.txt"
    local useless_annex="$WORKSPACE/list_annex.exist_but_not_referenced.txt"
    local gitdir ci_project_config ref br file
    local -a remote_refs all_bmk_datas

    ### CLEANUP THE ANNEX FILES
    echo "=== CLEANUP THE ANNEX FILES"
    rm -f "$used_annex_file" "$existing_annex_file" "$recent_annex_file"
    # The used list is only ever appended to; create it so that it exists
    # even when no repository contributes an entry.
    : > "$used_annex_file"

    # List all existing annex
    echo "# existing annex results"
    assert_with_msg "ERROR: $results_top/annex doesnot exist" [ -d "$results_top/annex" ]
    # Sorted in the C locale because comm (below) needs both inputs ordered
    # the same way.
    find "$results_top/annex/" -type f | LC_ALL=C sort -u > "$existing_annex_file"

    echo " => $(wc -l < "$existing_annex_file") existing annex"

    # List all used annex
    echo "# referenced annex results"
    # The repository list is read on fd 3 so that no command in the loop body
    # can consume it through stdin.
    while IFS= read -r gitdir <&3; do
        ci_project_config=${gitdir#"$results_top"/}
        ci_project_config=${ci_project_config%.git}

        # annex are tcwg_bmk only
        if ! [[ $ci_project_config =~ tcwg_bmk- ]]; then
            continue
        fi

        rm -rf base-artifacts
        if ! git clone -q --reference "$gitdir" "$gitdir" \
                --branch "linaro-local/ci/$ci_project_config" \
                base-artifacts; then
            echo "ERROR: cannot clone $gitdir; no annex file was removed" >&2
            return 1
        fi

        # Full ref names are listed so the symbolic <remote>/HEAD entry can be
        # recognised and skipped; "git branch -r" prints it as
        # "origin/HEAD -> origin/<branch>", which does not word-split cleanly.
        mapfile -t remote_refs < <(git -C base-artifacts for-each-ref \
            --format='%(refname)' refs/remotes)
        for ref in "${remote_refs[@]}"; do
            case "$ref" in
                */HEAD) continue ;;
            esac
            br=${ref#refs/remotes/}

            if ! git -C base-artifacts checkout -q "$br"; then
                echo "ERROR: cannot checkout $br in $gitdir; no annex file was removed" >&2
                return 1
            fi

            # NOTE(review): judging by how the result is used, get_git_history
            # prints a temporary directory first and then one file per
            # revision -- confirm against its definition.
            readarray -t all_bmk_datas < <(set +x; get_git_history 0 base-artifacts "annex/bmk-data")
            # Without file operands cat would read stdin instead, so only run
            # it when there is at least one file to read.
            if [ "${#all_bmk_datas[@]}" -gt 1 ]; then
                cat "${all_bmk_datas[@]:1}" | sed -e "s|^$current_host:||" >> "$used_annex_file"
            fi
            printf " => %s referenced annex -- %-200s\n" \
                "$(wc -l < "$used_annex_file")" \
                "[processed $ci_project_config ($br)]"
            if [ "${#all_bmk_datas[@]}" -gt 0 ]; then
                rm -rf "${all_bmk_datas[0]}"
            fi
        done
    done 3< <(list_results_git_dirs)

    echo ""
    echo " => $(wc -l < "$used_annex_file") referenced annex"

    # recent annex
    find "$results_top/annex/" -type f -mtime -30 > "$recent_annex_file"
    echo " => $(wc -l < "$recent_annex_file") recent annex (less than 1-month old)"

    # include recent annex in the referenced ones
    LC_ALL=C sort -u "$used_annex_file" "$recent_annex_file" > "$used_annex_file.tmp"
    mv "$used_annex_file.tmp" "$used_annex_file"

    ### compare and remove useless annex
    # comm -13: lines only in the used list      (referenced, not existing)
    # comm -23: lines only in the existing list  (existing, not referenced)
    LC_ALL=C comm -13 "$existing_annex_file" "$used_annex_file" > "$missing_annex"
    LC_ALL=C comm -23 "$existing_annex_file" "$used_annex_file" > "$useless_annex"

    if [ -s "$missing_annex" ]; then
        echo "WARNING: these annex are referenced, but not exists"
        sed -e 's|^| |' "$missing_annex"
    else
        echo "NOTE: All referenced annex files exist"
    fi

    if [ -s "$useless_annex" ]; then
        echo "REMOVING: About to remove $(wc -l < "$useless_annex") files."
    else
        echo "NOTE: No annex file to remove."
    fi

    # One path per line; read line-wise so paths containing blanks stay intact.
    while IFS= read -r file; do
        if [ -z "$file" ]; then
            continue
        fi
        if [ "$cleanup_annex" = "dryrun" ]; then
            echo "DRYRUN: rm -rf $file"
        else
            rm -rf -- "$file"
        fi
    done < "$useless_annex"
}

cd "$WORKSPACE" || exit 1

# free 10Gb disk space. This will be necessary to cleanup the git repositories
rm -f empty-10Gbfile.tmp

if [ "$cleanup_gc" != "false" ]; then
    perform_cleanup_gc
fi

if [ "$cleanup_annex" != "false" ]; then
    perform_cleanup_annex
fi

if $workspace_is_temp; then
    # The reservation file would be deleted together with the temporary
    # workspace, so writing it would only waste I/O.  Leave the directory
    # before removing it.
    cd /
    rm -rf -- "$WORKSPACE"
else
    # Create a big file to reserve 10Gb for next cleanup.
    dd if=/dev/zero of=empty-10Gbfile.tmp bs=1G count=10
fi