#!/bin/bash
#
# growth.sh
#
# Copyright (C) 2015 Linaro, Ltd
# Andy Green
# Licensed under GPL2.1
#
# Please run the script with no args to get comprehensive help
#
# Note on sqlite3 usage
#
# The sqlite3 db generated here is just caching analysis the script
# generated itself.  You can delete it and the script will recreate an
# empty one automatically, but you will have to regenerate the runs
# that were stored in it.
#
# Having the data cached there is helpful both in making complex queries
# that are difficult to reproduce using cut, sed, sort etc and in allowing
# quick development of new graphs and queries without the cost of generating
# the data each time.
#
# External tools used: sqlite3, git, gnuplot, convert (ImageMagick), awk.
# Plot mode tests paths with "-d", so it expects to be run from the top of
# the git tree that was analysed.

DB=growth.sq3
SCHEMA_VER=1
# basis point commit; set by basis_point() / make_stat()
BP=
# NOTE(review): DIRCOL and OFFSET_COLS are not referenced by any code in
# this file; they look like leftovers of a removed column-lookup helper.
DIRCOL=0

rm -f .cols.tmp
touch .cols.tmp
rm -f .first-phase

# Run one SQL statement against $DB, query results go to stdout.
# $1: SQL text
# On failure the statement is reported on stderr and the shell exits 1.
# Beware: inside $( ) that only terminates the substitution subshell, so
# callers that must not continue have to test the status themselves.
function sq3()
{
	if ! sqlite3 "$DB" "$1" ; then
		>&2 echo "sqlite error"
		>&2 echo "$1"
		exit 1
	fi
}

# Print $1 as a single-quoted SQL string literal (embedded ' doubled).
# Double-quoted literals are a legacy sqlite extension that current
# sqlite3 shells reject by default.
function sq3_str()
{
	local q="'"

	printf "'%s'" "${1//$q/$q$q}"
}

# Print the number of '/' characters in $1
function slash_count()
{
	local stripped=${1//\//}

	echo $(( ${#1} - ${#stripped} ))
}

# number of preset columns before dir ones
OFFSET_COLS=8

# Find the newest commit shared by the first-parent histories of the two
# refs and leave it in the global BP (empty if there is none).
# $1: basis branch, $2: starting tree
function basis_point()
{
	echo "basis_point $1 $2"
	# unchanged context lines of the diff are the commits common to both
	BP=$(diff -u <(git rev-list --first-parent "$2") \
		     <(git rev-list --first-parent "$1") | \
	     sed -ne 's/^ //p' | head -1)
	echo "result $BP"
}

# Record one snapshot (diff of a ref against its basis point) in the db:
# one row in snapshots plus one dir_summary row per changed text file.
# $1: basis branch, $2: comparison ref, $3: basis point if known (else empty)
# $4: index in sequence (currently unused), $5: run_key we are attached to
# Returns 1 if no basis point can be found for the ref.
function make_stat()
{
	local CB BASIS COMP DS DS_BASIS STAT SKEY
	local FILES=0 ADD=0 REM=0 F DEPTH
	local RE_FILES='([0-9]+) file'
	local RE_ADD='([0-9]+) insertion'
	local RE_REM='([0-9]+) deletion'

	CB="$2"
	if [ -z "$3" ] ; then
		basis_point "$1" "$CB"
	else
		BP=$3
	fi
	if [ -z "$BP" ] ; then
		>&2 echo "no basis point found for $CB against $1"
		return 1
	fi

	# --always: fall back to the abbreviated hash when no tag is reachable
	BASIS=$(git describe --always "$BP")
	COMP=$(git describe --always "$CB")
	DS=$(git log -n 1 "$CB" --format=format:%ct)
	DS_BASIS=$(git log -n 1 "$BP" --format=format:%ct)

	echo "git diff $BP..$CB --shortstat"
	STAT=$(git diff "$BP..$CB" --shortstat)

	# git leaves out the insertions or the deletions clause when it is
	# zero (and prints nothing at all for an empty diff), so pick each
	# number by its label rather than by position
	if [[ $STAT =~ $RE_FILES ]] ; then
		FILES=${BASH_REMATCH[1]}
	fi
	if [[ $STAT =~ $RE_ADD ]] ; then
		ADD=${BASH_REMATCH[1]}
	fi
	if [[ $STAT =~ $RE_REM ]] ; then
		REM=${BASH_REMATCH[1]}
	fi

	sq3 "insert into snapshots (run_idx, ref_name, ref_date, \
			basis_name, basis_date, files_changed, \
			loc_added, loc_removed) \
		values ($5, $(sq3_str "$COMP"), $DS, \
			$(sq3_str "$BASIS"), $DS_BASIS, \
			$FILES, $ADD, $REM);"

	SKEY=$(sq3 "select seq from sqlite_sequence where \
			name='snapshots'") || exit 1

	# numstat lines are "<added>\t<removed>\t<path>"; binary files show
	# "-" for both counts and are skipped, as is anything gitignore-ish.
	# NOTE(review): renamed files appear in git's "{old => new}" notation
	git diff "$BP..$CB" --numstat | \
	while IFS=$'\t' read -r ADD REM F ; do
		if [ "$ADD" != "-" ] && [[ "$F" != *gitignore* ]] ; then
			# depth 1 == file at the top of the tree
			DEPTH=$(( $(slash_count "$F") + 1 ))
			sq3 "insert into dir_summary (snap_idx, run_key, \
					dir_name, dir_depth, \
					loc_added, loc_removed) \
				values ($SKEY, $5, $(sq3_str "$F"), \
					$DEPTH, $ADD, $REM);"
		fi
	done
}

# Plot something against snapshot date.
# $1: gnuplot plot clause, $2: output png, $3: title
#
# NOTE(review): the gnuplot script body of this function was missing from
# the copy of the file this was rebuilt from, so the settings below are a
# reconstruction based on the callers (x is an epoch timestamp column, the
# data file starts with a header row) - please check against the original.
function issue_plot_time()
{
	cat >plot.tmp <<EOF
set terminal png size 1200,480
set output "$2"
set title "$3"
set key autotitle columnhead
set xdata time
set timefmt "%s"
set format x "%Y-%m"
set grid
plot $1
EOF
	>&2 echo "creating $2"
	gnuplot plot.tmp
}

# Plot the per-directory distribution of changes for one snapshot.
# $1: gnuplot plot clause, $2: y axis maximum, $3: output png, $4: title,
# $5: dimensions as W,H, $6: label text ("\n" separates lines)
#
# NOTE(review): as with issue_plot_time, the gnuplot script body was missing
# and has been reconstructed from the callers - please check.
function issue_plot_file_dist()
{
	local yrange="[*:*]"

	# a zero / missing maximum would give gnuplot an empty range
	if [ -n "$2" ] && [ "$2" -gt 0 ] 2>/dev/null ; then
		yrange="[*:$2]"
	fi

	cat >plot.tmp <<EOF
set terminal png size ${5:-640,480}
set output "$3"
set title "$4"
set key autotitle columnhead
set style fill solid 0.5
set boxwidth 0.8
set xtics rotate
set yrange $yrange
set grid y
set label 1 "$6" at graph 0.5,0.95 center
plot $1
EOF
	>&2 echo "creating $3"
	gnuplot plot.tmp
}

# Succeeds if .plot.cols lists a column underneath directory $1, ie, some
# line starts with "$1/".  The comparison is literal and anchored.
function has_subcols()
{
	P="$1/" awk 'index($0, ENVIRON["P"]) == 1 { found = 1; exit }
		     END { exit !found }' .plot.cols
}

# Print the SQL condition selecting the dir_summary rows that belong to
# plot column $1 (a directory, or "." for files at the top of the tree).
function col_condition()
{
	local col="$1" cond

	if [ "$col" = "." ] ; then
		echo "dir_depth=1"
		return
	fi

	# match on "dir/" so that "net" does not also pick up "netfoo/..."
	cond="substr(dir_name, 1, $(( ${#col} + 1 )))=$(sq3_str "$col/")"

	if has_subcols "$col" ; then
		# it's truncated, so only files directly in this dir
		# eg arch, but arch/arm is handled by its own column
		cond="$cond and dir_depth=$(( $(slash_count "$col") + 2 ))"
	fi
	echo "$cond"
}

# Print the sum of column $1 over the data rows of .plot.tmp
function col_total()
{
	awk -v c="$1" 'NR > 1 { s += $c } END { printf "%d\n", s }' .plot.tmp
}

# Print the header of column $1 of .plot.tmp
function col_title()
{
	head -n1 .plot.tmp | cut -d' ' -f"$1"
}

# create schema

# one of these for each comparison run
sq3 "create table if not exists runs (\
	run_key integer primary key autoincrement, \
	basis_hash varchar(50), \
	comp_hash varchar(50), \
	tags integer, \
	schema_ver integer \
);"

# one of these for each snapshot compared
sq3 "create table if not exists snapshots (\
	snap_idx integer primary key autoincrement, \
	run_idx integer, \
	ref_name varchar(50), \
	ref_date integer, \
	basis_name varchar(50), \
	basis_date integer, \
	files_changed integer, \
	loc_added integer, \
	loc_removed integer \
);"

# one of these for each dir changed in the snapshot
# we have run_key here as well since it simplifies finding all paths
sq3 "create table if not exists dir_summary (\
	key integer primary key autoincrement, \
	snap_idx integer, \
	run_key integer, \
	dir_name varchar(150), \
	dir_depth integer, \
	loc_added integer, \
	loc_removed integer \
);"

if [ -z "$1" ] ; then
	cat >&2 <<EOF
Usage: $0 <basis branch> --tags <tag regexp>
       $0 --plot <run> [ subdir ]
       $0 --plot <run a> < - | subdir > <run b>

$0 can be run in two modes, either create a 'run' in
the sqlite3 db cache, or create graphs about one or
comparing two runs already in the db cache

Creating a 'run' from one or more tags
--------------------------------------

A 'run' is created by studying one or more tags against
a 'basis branch' to isolate the patches on top of the
tag's basis point.  So if you have a kernel branch that
is tracking mainline, the various tags you have on that
kernel branch may be based on different mainline versions.
$0 can autodiscover for each tag where the basis point is
if you just give him the basis branch name, eg, 'mainline'.

 \$ $0 mainline --tags mybranch-tagname-regexp

Notice that the tag name to analyze on one 'run' is a regexp.
It's fine to have many tags analyzed in one 'run'.

When the run starts, the run number is reported and you
should make a note of it


Plotting graphs from one or two runs
------------------------------------

After the analysis for the tags you are interested in has
been captured into 'runs' in the sqlite3 db cache, you can
run the script to produce png and gif graphs showing or
comparing the data from different runs.

There's no requirement at all that the different runs have
anything in common in their history, basis or content,
giving a lot of flexibility in the comparisons.

To produce graphs about one run itself:

 \$ $0 --plot <run> [ subdir ]

If subdir is missing, the whole tree is analysed, if given
the analysis is restricted to the subdirectory given.

To produce graphs comparing two runs:

 \$ $0 --plot <run a> < - | subdir > <run b>

If there is no subdir restriction, - must be given.
<run b> must contain only one tag in this case.

Graphs will be produced with the union of information
from run a and run b, showing run a in blue and run b in red.
EOF
	exit 1
fi

# The deeper the subdir filter, the more path components we keep when
# grouping changed files into plot columns.
LEVELS=1,2
FILTER=$3
if [ "$FILTER" = "-" ] ; then
	FILTER=
fi
FILTERLEN=${#FILTER}
F_DEPTH=$(( $(slash_count "$FILTER") + 1 ))
if [ -n "$FILTER" ] ; then
	case "$F_DEPTH" in
	1) LEVELS=1,2,3 ;;
	2) LEVELS=1,2,3,4 ;;
	3) LEVELS=1,2,3,4,5 ;;
	*) LEVELS=1,2,3,4,5,6 ;;
	esac
fi

#
# plot mode
#

if [ "$1" = "--plot" ] ; then

	PLOT_RUN=$2
	COMP_RUN=$4

	# run numbers go straight into SQL, so insist they are numbers
	case "$PLOT_RUN" in
	''|*[!0-9]*)
		>&2 echo "--plot needs a numeric run number"
		exit 1
		;;
	esac
	case "$COMP_RUN" in
	*[!0-9]*)
		>&2 echo "Comparison run must be a numeric run number"
		exit 1
		;;
	esac

	# R names the run(s) in output filenames and plot titles
	R=$(sq3 "select comp_hash,basis_hash from runs \
			where run_key=$PLOT_RUN" | tr '|' '-')

	#
	# restrict directory lookups to both runs combined
	#
	if [ -n "$COMP_RUN" ] ; then
		RUN_KEY_COMP="(run_key=$PLOT_RUN or run_key=$COMP_RUN)"
		CR=$(sq3 "select comp_hash,basis_hash from runs where \
				run_key=$COMP_RUN" | tr '|' '-')
		R="$R-VS-$CR"
	else
		RUN_KEY_COMP="run_key=$PLOT_RUN"
	fi

	echo "$R"

	# our snapshots
	SN=$(sq3 "select snap_idx from snapshots where run_idx=$PLOT_RUN")
	# shellcheck disable=SC2206 -- whitespace separated integers
	SN_LIST=($SN)
	N=${#SN_LIST[@]}
	if [ "$N" -eq 0 ] ; then
		>&2 echo "No snapshots found for run $PLOT_RUN"
		exit 1
	fi

	# there's a comparison snapshot?
	SNC=
	SNC_LIST=()
	if [ -n "$COMP_RUN" ] ; then
		SNC=$(sq3 "select snap_idx from snapshots \
				where run_idx=$COMP_RUN")
		# shellcheck disable=SC2206 -- whitespace separated integers
		SNC_LIST=($SNC)
		if [ "${#SNC_LIST[@]}" -gt 1 ] ; then
			>&2 echo "Must be single comparison snapshot"
			exit 1
		fi
	fi

	rm -f .plot.tmp .plot.dist
	# start from empty files so nothing below trips over a missing one
	: > .plot.cols
	: > .plot.cols1
	: > .plot.tmp1

	#
	# using both runs if two given,
	# create the column header row, and fill .plot.cols with the
	# list of files / dirs changed in this view of the diff
	#
	printf '%s' "basis_name basis_date ref_name ref_date files add del " \
		> .plot.tmp
	sq3 "select dir_name from dir_summary where \
			$RUN_KEY_COMP and \
			substr(dir_name, 1, $FILTERLEN)=$(sq3_str "$FILTER") \
			order by loc_added,loc_removed asc" | \
	cut -d'/' -f"$LEVELS" | while read -r i ; do
		case "$i" in
		Documentation/*)
			echo "Documentation"
			;;
		.)
			;;
		*)
			# don't allow individual files
			if [ ! -d "$i" ] ; then
				dirname -- "$i"
			else
				printf '%s\n' "$i"
			fi
			;;
		esac
	done >> .plot.cols1

	#
	# put the column titles in place and write out the
	# filtered list of files/dirs we will care about
	#
	sort -u .plot.cols1 > .plot.cols
	tr '\n' ' ' < .plot.cols >> .plot.tmp
	echo >> .plot.tmp

	# work out once per column which dir_summary rows it covers
	COLS=()
	CONDS=()
	while IFS= read -r j ; do
		COLS+=("$j")
		CONDS+=("$(col_condition "$j")")
	done < .plot.cols

	# how many files were changed
	CHANGEDFILES=${#COLS[@]}

	>&2 echo "Studying $N snapshots"
	>&2 echo "Total $CHANGEDFILES files changed"

	#
	# For each snapshot, go through the list of changed files/dirs and
	# find out how much changed there in that snapshot
	#
	T=1
	for i in "${SN_LIST[@]}" ; do
		L=$(sq3 "select basis_name, basis_date, ref_name, \
				ref_date, files_changed, loc_added, \
				loc_removed \
			from snapshots where snap_idx=$i" | tr '|' ' ')
		read -r -a FIELDS <<<"$L"
		# every field, the last one included, is followed by a space
		# so it cannot run into the first directory column
		printf '%s ' "${FIELDS[@]}" >> .plot.tmp
		>&2 printf 'Snapshot %s/%s: %s \r' "$T" "$N" "${FIELDS[2]}"
		T=$(( T + 1 ))

		for cond in "${CONDS[@]}" ; do
			A=$(sq3 "select coalesce(sum(loc_added), 0) \
				from dir_summary where \
				snap_idx=$i and $cond")
			printf '%s ' "${A:-0}" >> .plot.tmp
		done
		echo >> .plot.tmp
	done
	>&2 echo

	#
	# for plots related to changes over time, we can do them now
	#
	issue_plot_time "'.plot.tmp' using \
		4:6 notitle with filledcurve y1=0 lc rgb \"#0000ff\"" \
		"growth-$R-LOC.png" "$R growth in LOC"
	issue_plot_time "'.plot.tmp' using \
		4:( (\$4-\$2)/(24 * 3600) ) notitle \
		with filledcurve y1=0 lc rgb \"#0000ff\"" \
		"growth-$R-basis-age.png" "$R growth basis age (days)"

	#
	# for each file / dir that has changes in any snapshot, for each
	# snapshot calculate its net change (added - removed) and create a
	# unified plot data file, one row per file / dir
	#
	BIGGEST=0
	for (( c = 0; c < CHANGEDFILES; c++ )) ; do
		>&2 printf 'File %s/%s \r' "$(( c + 1 ))" "$CHANGEDFILES"
		printf '%s ' "${COLS[c]}" >> .plot.tmp1

		for i in "${SN_LIST[@]}" "${SNC_LIST[@]}" ; do
			V=$(sq3 "select coalesce(\
					sum(loc_added) - sum(loc_removed), 0) \
				from dir_summary where \
				snap_idx=$i and ${CONDS[c]}")
			V=${V:-0}
			printf '%s ' "$V" >> .plot.tmp1
			if [ "$V" -gt "$BIGGEST" ] ; then
				BIGGEST=$V
			fi
		done
		echo >> .plot.tmp1
	done
	# kept on disk as before, although nothing in this file reads it back
	echo "$BIGGEST" > .biggest
	>&2 echo

	# header: idx dir <ref name of each snapshot> [<comparison ref name>]
	printf '%s' "idx dir " > .plot.tmp
	for i in "${SN_LIST[@]}" "${SNC_LIST[@]}" ; do
		V=$(sq3 "select ref_name \
			from snapshots where snap_idx=$i")
		printf '%s ' "$V" >> .plot.tmp
	done
	echo >> .plot.tmp

	# rows ordered by the last snapshot of our run, biggest first, and
	# numbered so gnuplot has an x position for each of them
	C=0
	sort -k"$(( N + 1 ))" -nr .plot.tmp1 | while read -r i ; do
		echo "$C $i"
		C=$(( C + 1 ))
	done >> .plot.tmp

	WIDTH=$(( 16 * CHANGEDFILES ))
	if [ "$WIDTH" -lt 640 ] ; then
		WIDTH=640
	fi

	# the same for every plot, so work it out just once
	_FILTER=${FILTER//\//_}
	if [ -n "$_FILTER" ] ; then
		_FILTER=$_FILTER-
	fi
	COMP_PL=
	COMP_TOT=
	if [ -n "$SNC" ] ; then
		# the comparison snapshot sits after our N snapshot columns
		COMP_COL=$(( N + 3 ))
		COMP_PL="'.plot.tmp' using 1:$COMP_COL:xtic(2) \
			w boxes lc rgb \"#ff0000\","
		# \n stays literal here, gnuplot turns it into a line break
		COMP_TOT="Total LOC $(col_title "$COMP_COL"): "
		COMP_TOT="$COMP_TOT$(col_total "$COMP_COL")\n"
	fi

	# plot each snapshot in turn
	C=1
	while [ "$C" -le "$N" ] ; do
		COL=$(( C + 2 ))
		_TOT="Total LOC $(col_title "$COL"): $(col_total "$COL")"
		echo "$_TOT"

		PL="'.plot.tmp' using 1:$COL:xtic(2) \
			w boxes lc rgb \"#0000ff\" "

		issue_plot_file_dist \
			"$COMP_PL$PL" "$BIGGEST" \
			"growth-$R-dist-$_FILTER$(printf %04d "$C").png" \
			"$R patch distribution (LOC) $3" \
			"$WIDTH,480" "$COMP_TOT$_TOT"

		C=$(( C + 1 ))
	done

	>&2 echo "Converting gif"
	# the frames carry the filter prefix, so the glob has to as well
	convert -delay 50 -loop 0 "growth-$R-dist-$_FILTER"????.png \
		"growth-$R-dist.gif"

	exit 0
fi

#
# Tagged rebase tree mode
#

if [ "$2" = "--tags" ] ; then

	if [ -z "$3" ] ; then
		>&2 echo "Need tag regexp filter with --tags"
		exit 1
	fi

	sq3 "insert into runs ( \
			run_key, basis_hash, comp_hash, schema_ver) \
		values (NULL, $(sq3_str "$1"), $(sq3_str "$3"), \
			$SCHEMA_VER);"
	RUNKEY=$(sq3 "select seq from sqlite_sequence where \
			name='runs'") || exit 1

	>&2 echo "tags mode -- run $RUNKEY"

	index=0
	# tag names cannot contain whitespace, so word splitting is safe
	for i in $(git tag | grep -- "$3") ; do
		>&2 echo "$i"
		make_stat "$1" "$i" "" "$index" "$RUNKEY"
		index=$(( index + 1 ))
	done

	exit 0
fi

exit 0

# ---> untested
#
# Everything below is unreachable because of the exit above.  Note that it
# calls make_stat without the index and run_key arguments that function
# needs for its inserts.

#
# History tree mode
#

if [ ! -z "$2" ] ; then
	COMP=$2
else
	COMP=`git rev-parse --abbrev-ref HEAD`
fi
basis_point $1 $COMP

git log $BP.. --oneline | \
	cut -d' ' -f1 | \
	tac > .patches.tmp

TODO=`wc -l .patches.tmp | cut -d' ' -f 1`
C=1
cat .patches.tmp | while read i ; do
	>&2 echo "Patch $C/$TODO"
	make_stat $1 $i $BP
	C=$(( $C + 1 ))
done

exit 0