summary refs log tree commit diff
path: root/tcwg-cleanup-stale-results.sh
diff options
context:
space:
mode:
Diffstat (limited to 'tcwg-cleanup-stale-results.sh')
-rwxr-xr-x tcwg-cleanup-stale-results.sh 191
1 files changed, 122 insertions, 69 deletions
diff --git a/tcwg-cleanup-stale-results.sh b/tcwg-cleanup-stale-results.sh
index fa90a06d..d47a57e5 100755
--- a/tcwg-cleanup-stale-results.sh
+++ b/tcwg-cleanup-stale-results.sh
@@ -8,85 +8,138 @@ scripts=$(dirname "$0")
convert_args_to_variables "$@"
-days="${days-30}"
-refs_url="${refs_url-https://git.linaro.org/toolchain/ci/base-artifacts}"
-refs_pattern="${refs_pattern-refs/heads/linaro-local/ci/tcwg_bmk*}"
-results_top="${results_top-/home/tcwg-benchmark/results}"
-dryrun="${dryrun-true}"
+results_top="${results_top-$HOME/base-artifacts}"  # tree searched for annex/ and for */*.git repositories
verbose="${verbose-false}"
+cleanup_gc="${cleanup_gc-true}"        # "false" skips the gc step, "dryrun" only prints the git commands
+cleanup_annex="${cleanup_annex-true}"  # "false" skips the annex step, "dryrun" only prints the rm commands
+# "<host>:" prefix stripped from annex/bmk-data entries; overridable like the settings above.
+current_host="${current_host-bkp-01.tcwglab}"
if $verbose; then
set -x
fi
-# Delete "used_by" markers older than $days days.
-(set +f; find $results_top-* -name used_by -mtime "+$days" -delete)
-
-# Initialize base-artifacts repo (by cloning its "empty" branch).
-refs_repo=$(basename "$refs_url" .git)
-clone_or_update_repo_no_checkout "$refs_repo" "$refs_url" none empty origin
-git -C "$refs_repo" reset --hard
-
-# Walk through all commits of all tcwg_bmk* branches and mark results
-# referenced in those results with "used_by" file.
-while IFS= read -r ref; do
- git -C "$refs_repo" fetch origin "$ref" >/dev/null 2>&1
- git -C "$refs_repo" reset --hard FETCH_HEAD >/dev/null 2>&1
- depth=0
- # Walk all commits of just-fetched branch (i.e., until HEAD^ can't
- # be parsed by git rev-parse).
- while true; do
- for results_id in "$refs_repo/results_id" \
- "$refs_repo/results_id-1" \
- "$refs_repo/results_id-2"; do
- if [ -f "$results_id" ]; then
- results_dir="$results_top-$(cat "$results_id")"
- used_by="$refs_url/$ref~$depth"
- if [ ! -d "$results_dir" ]; then
- echo "WARNING: $used_by is missing $results_dir"
- else
- echo "$used_by" > "$results_dir/used_by"
- fi
- fi
- done
- if ! git -C "$refs_repo" rev-parse HEAD^ >/dev/null 2>&1; then
- break
- fi
- git -C "$refs_repo" reset --hard HEAD^ >/dev/null 2>&1
- depth=$(($depth+1))
- done
-done < <(git ls-remote "$refs_url" "$refs_pattern" | awk '{ print $2 }')
+WORKSPACE="${WORKSPACE-}"  # normalise "unset" to "empty" so one test covers both cases
+if [ -n "$WORKSPACE" ]; then
+    rm_workspace="true"  # caller-provided workspace: the final cleanup command is a no-op
+else
+    WORKSPACE=$(mktemp -d)  # no workspace given: work in a throw-away directory
+    rm_workspace="rm -rf $WORKSPACE"  # command string run as the last step of the script
+fi
-while IFS= read -r -d '' dir; do
- # Skip already-deleted dirs (e.g., $dir's parent was deleted).
- if [ ! -d "$dir" ]; then
- continue
- fi
+perform_cleanup_gc()
+{
+    # Run "git gc" in every $results_top/<dir>/<name>.git directory; with cleanup_gc=dryrun only print the commands.
+    echo "=== CLEANUP THE GIT REPOSITORIES"
+    while IFS= read -r gitdir; do  # IFS= and -r: take each path exactly as find printed it
+        if [ "$cleanup_gc" = "dryrun" ]; then
+            echo "DRYRUN: git -C $gitdir gc"
+        else
+            echo "# git -C $gitdir gc"
+            git -C "$gitdir" gc  # quoted so a path with blanks or glob characters stays one argument
+        fi
+    done < <(find "$results_top" -mindepth 2 -maxdepth 2 -type d -name '*.git')
+}
- # Don't delete "used_by" dirs and dirs that have recent files
- # (i.e., "-mtime -$days"). E.g., in-progress benchmark might have uploaded
- # partial results.
- if [ x"$(find "$dir" -name used_by -o -mtime "-$days" | head -n1)" != x"" ]; then
- continue
- fi
+perform_cleanup_annex()
+{
+    # Delete files under $results_top/annex that are neither listed in the annex/bmk-data entries collected from the tcwg_bmk-* repositories nor modified within the last 30 days; cleanup_annex=dryrun only prints the rm commands.
+    echo "=== CLEANUP THE ANNEX FILES"
+    existing_annex_file=$WORKSPACE/list_annex.existing.txt
+    used_annex_file=$WORKSPACE/list_annex.used.txt
+    recent_annex_file=$WORKSPACE/list_annex.recent.txt
+    rm -f "$used_annex_file" "$existing_annex_file" "$recent_annex_file"
- # Don't delete subdirectories of a "used_by" parent.
- parent="$dir"
- used=false
- while [ x"$parent" != x"/home/tcwg-benchmark" ] && ! $used; do
- parent=$(dirname "$parent")
- if [ -f "$parent/used_by" ]; then
- used=true
- fi
- done
- if $used; then
- continue
+    # List all existing annex
+    echo "# existing annex results"
+    assert_with_msg "ERROR: $results_top/annex doesnot exist" [ -d "$results_top/annex" ]
+    find "$results_top/annex/" -type f > "$existing_annex_file"
+    # Byte-wise (LC_ALL=C) order: the same order is used for the referenced list and by comm below.
+    LC_ALL=C sort -u "$existing_annex_file" > "$existing_annex_file.tmp"
+    mv "$existing_annex_file.tmp" "$existing_annex_file"
+
+    echo " => $(cat "$existing_annex_file" | wc -l) existing annex"
+
+    # List all used annex
+    echo "# referenced annex results"
+    while IFS= read -r gitdir; do
+        ci_project_config=${gitdir#"$results_top"/}  # quoted: strip the prefix literally, not as a glob pattern
+        ci_project_config=${ci_project_config%.git}
+
+        # annex are tcwg_bmk only
+        if ! [[ $ci_project_config =~ tcwg_bmk- ]]; then
+            continue
+        fi
+
+        rm -rf base-artifacts
+        git clone -q --reference "$gitdir" "$gitdir" \
+            --branch "linaro-local/ci/$ci_project_config" \
+            base-artifacts
+
+        for br in $(git -C base-artifacts/ branch -r); do  # word-splitting intended: one branch name per word
+            git -C base-artifacts checkout -q "$br"
+            readarray -t all_bmk_datas < <(set +x; get_git_history 0 base-artifacts "annex/bmk-data")
+            cat "${all_bmk_datas[@]:1}" </dev/null | sed -e "s|^$current_host:||" >> "$used_annex_file"  # </dev/null: with no file arguments cat must not consume the repository list feeding this loop
+            printf " => %s referenced annex -- %-200s\n" "$(cat "$used_annex_file" | wc -l)" "[processed $ci_project_config ($br)]"
+            rm -rf "${all_bmk_datas[0]}"  # element 0 is removed here, elements 1.. were the files read above
+        done
+    done < <(find "$results_top" -mindepth 2 -maxdepth 2 -type d -name '*.git')
+
+    echo ""
+    echo " => $(cat "$used_annex_file" | wc -l) referenced annex"
+
+    # recent annex
+    find "$results_top/annex/" -type f -mtime -30 > "$recent_annex_file"
+    echo " => $(cat "$recent_annex_file" | wc -l) recent annex (less than 1-month old)"
+
+    # include recent annex in the referenced ones
+    LC_ALL=C sort -u "$used_annex_file" "$recent_annex_file" > "$used_annex_file.tmp"
+    mv "$used_annex_file.tmp" "$used_annex_file"
+
+    ### compare and remove useless annex
+    missing_annex=list_annex.referenced_but_not_exist.txt
+    useless_annex=list_annex.exist_but_not_referenced.txt
+
+    # Exact set differences of the two sorted lists; comm exits 0 whether or not they differ.
+    LC_ALL=C comm -13 "$existing_annex_file" "$used_annex_file" > "$missing_annex"  # only in the referenced list
+    LC_ALL=C comm -23 "$existing_annex_file" "$used_annex_file" > "$useless_annex"  # only in the existing list
+
+    if [ -s "$missing_annex" ]; then
+        echo "WARNING: these annex are referenced, but not exists"
+        sed -e 's|^| |' "$missing_annex"
+    else
+        echo "NOTE: All referenced annex files exist"
fi
- echo "DELETE: $dir is not used"
- if $dryrun; then
- echo "DRYRUN: rm -rf $dir"
+    if [ -s "$useless_annex" ]; then
+        echo "REMOVING: About to remove $(cat "$useless_annex" | wc -l) files."
else
- rm -rf "$dir"
+        echo "NOTE: No annex file to remove."
fi
-done < <(set +f; find $results_top-* -type d -print0)
+
+    while IFS= read -r file; do  # one path per line, so blanks inside a path do not split it
+        if [ "$cleanup_annex" = "dryrun" ]; then
+            echo "DRYRUN: rm -rf $file"
+        else
+            rm -rf -- "$file"
+        fi
+    done < "$useless_annex"
+}
+
+cd "$WORKSPACE" || exit 1  # the steps below use relative paths; never run them from another directory
+
+# Free the 10Gb reserved by the previous run; the space is needed to clean up the git repositories.
+rm -f empty-10Gbfile.tmp
+
+if [ "$cleanup_gc" != "false" ]; then  # any value but "false" (e.g. "true", "dryrun") runs the step
+    perform_cleanup_gc
+fi
+
+if [ "$cleanup_annex" != "false" ]; then  # any value but "false" (e.g. "true", "dryrun") runs the step
+    perform_cleanup_annex
+fi
+
+# Create a big file to reserve 10Gb for next cleanup.
+dd if=/dev/zero of=empty-10Gbfile.tmp bs=1G count=10
+
+$rm_workspace  # deliberately unquoted: holds a command string ("true" or "rm -rf <dir>")