#!/bin/bash
#
# growth.sh
#
# Copyright (C) 2015 Linaro, Ltd
# Andy Green <andy.green@linaro.org>
# Licensed under GPL2.1
#
# Please run the script with no args to get comprehensive help
#
# Note on sqlite3 usage
#
# The sqlite3 db generated here is just caching analysis the script
# generated itself.  You can delete it and the script will recreate an
# empty one automatically, but you will have to regenerate the runs
# that were stored in it.
#
# Having the data cached there is helpful both in making complex queries
# that are difficult to reproduce using cut, sed, sort etc. and in allowing
# quick development of new graphs and queries without the cost of generating
# the data each time.


# sqlite3 cache file and the schema version recorded with each run
DB=growth.sq3
SCHEMA_VER=1

# BP receives the basis point found by basis_point / make_stat
BP=
DIRCOL=0

# start from a clean slate: drop scratch files left by an earlier
# invocation, then recreate .cols.tmp as an empty file
rm -f .cols.tmp .first-phase
touch .cols.tmp

# Run one SQL statement against the cache database.
#
# $1: SQL text
#
# Reads the global DB (path of the sqlite3 file).  Query results go to
# stdout.  If sqlite3 fails, "sqlite error" and the offending SQL are
# written to stderr and the shell exits with status 1.  When sq3 is used
# inside $(...) or a pipeline, that exit only ends the subshell.

function sq3()
{
	if ! sqlite3 "$DB" "$1" ; then
		# diagnostics go to stderr so that callers capturing our
		# stdout never mistake them for query results; printf keeps
		# the SQL verbatim (an unquoted echo would glob "select *")
		>&2 echo "sqlite error"
		>&2 printf '%s\n' "$1"
		exit 1
	fi
}

# number of preset columns before dir ones
# NOTE(review): OFFSET_COLS is not referenced anywhere else in the visible
# script
OFFSET_COLS=8

# NOTE(review): the two comment lines below describe a helper that is not
# defined in the visible script (DIRCOL is only ever initialised to 0);
# they look like leftovers from a removed function
# return col num of dirname in DIRCOL
# $1: dirname

# Find where the first-parent history of $2 meets that of $1.
#
# $1: basis branch, $2: starting tree
#
# The result is left in the global BP.  BP is empty when diff reports no
# line common to both histories (for instance when the two lists are
# identical and diff prints nothing).  Progress is echoed to stdout.

function basis_point()
{
	echo basis_point "$1" "$2"

	# Diff the two first-parent commit lists; the first unchanged
	# (context) line of the unified diff is the first commit that diff
	# reports as common to both.  The refs are quoted so a revision
	# expression containing spaces reaches git as one argument.
	BP=$(diff -u <(git rev-list --first-parent "$2") \
		     <(git rev-list --first-parent "$1") | \
		sed -ne 's/^ //p' | head -n 1)

	echo result "$BP"
}

# Record one snapshot (a comparison ref measured against its basis point)
# in the db.
#
# $1: basis branch, $2: comparison ref, $3: basis point if known (when
# empty it is discovered with basis_point), $4: index in sequence (not
# used by this function), $5: run_key we are attached to
#
# Adds one row to snapshots, then one row to dir_summary for every changed
# path that is not binary and does not contain "gitignore".  Sets the
# global BP.  Relies on sq3, which exits the script on a sqlite failure.

function make_stat()
{
	local CB
	local BASIS
	local COMP
	local DS
	local DS_BASIS
	local STAT FILES ADD REM SKEY F DEPTH SLASHES
	local Q="'"
	local RE_FILES='([0-9]+) files? changed'
	local RE_ADD='([0-9]+) insertion'
	local RE_REM='([0-9]+) deletion'

	CB="$2"
	if [ -z "$3" ] ; then
		basis_point "$1" "$CB"
	else
		BP=$3
	fi

	BASIS=$(git describe "$BP")
	COMP=$(git describe "$CB")
	DS=$(git log -n 1 "$CB" --format=format:%ct)
	DS_BASIS=$(git show "$BP" --format=format:%ct | head -n1)

	echo "git diff $BP..$CB --shortstat"

	# --shortstat leaves out the insertions or the deletions clause
	# when that count is zero, and prints nothing for an empty diff, so
	# each number is picked out by its label instead of by position.
	# LC_ALL=C keeps the labels in English whatever the user's locale.
	STAT=$(LC_ALL=C git diff "$BP..$CB" --shortstat)
	FILES=0
	ADD=0
	REM=0
	if [[ $STAT =~ $RE_FILES ]] ; then
		FILES=${BASH_REMATCH[1]}
	fi
	if [[ $STAT =~ $RE_ADD ]] ; then
		ADD=${BASH_REMATCH[1]}
	fi
	if [[ $STAT =~ $RE_REM ]] ; then
		REM=${BASH_REMATCH[1]}
	fi

	# SQL strings use single quotes (with ' doubled); double-quoted
	# literals are only accepted by sqlite as a legacy quirk
	sq3 "insert into snapshots (run_idx, ref_name, ref_date, \
			basis_name, basis_date, files_changed, \
			loc_added, loc_removed) \
		values ($5, '${COMP//$Q/$Q$Q}', $DS, '${BASIS//$Q/$Q$Q}', $DS_BASIS, \
			$FILES, $ADD, $REM);"
	SKEY=$(sq3 "select seq from sqlite_sequence where name='snapshots'")

	# --numstat prints "added<TAB>removed<TAB>path"; splitting on tabs
	# only keeps paths that contain spaces in one piece
	while IFS=$'\t' read -r ADD REM F ; do

		# binary files report "-" instead of a line count
		if [ "$ADD" = "-" ] ; then
			continue
		fi
		if [[ $F == *gitignore* ]] ; then
			continue
		fi

		# depth is the number of '/' in the path plus one
		SLASHES=${F//[^\/]/}
		DEPTH=$(( ${#SLASHES} + 1 ))

		sq3 "insert into dir_summary (snap_idx, run_key, dir_name, \
				dir_depth, loc_added, loc_removed) \
			values ($SKEY, $5, '${F//$Q/$Q$Q}', $DEPTH, $ADD, $REM);"
	done < <(git diff "$BP..$CB" --numstat)

}

# Write a gnuplot script for a graph over time to plot.tmp and run it.
#
# $1: gnuplot plot expression (placed verbatim after "plot")
# $2: output png filename
# $3: graph title
#
# The x axis is read as time in epoch seconds (timefmt "%s") and the y
# range starts at 0.  The name of the file being created is reported on
# stderr.

function issue_plot_time()
{
	cat >plot.tmp <<EOF
set terminal pngcairo notransparent enhanced font "arial,10" size 660, 320
set output '$2'
set title "$3"
set yrange [ 0 : ] noreverse nowriteback
set lmargin  9
set rmargin  2
set autoscale xfixmin
set xdata time
set termoption dash
set timefmt "%s"
set key autotitle columnhead
plot $1
EOF

	# report the output file ($2), as issue_plot_file_dist does
	>&2 echo "creating $2"
	gnuplot plot.tmp
}

# Write a gnuplot histogram script to plot.tmp and run it.
#
# $1: gnuplot plot expression (placed verbatim after "plot")
# $2: upper bound of the y range
# $3: output png filename
# $4: graph title
# $5: image dimensions, as given to "size"
# $6: label text placed near the top right of the graph

function issue_plot_file_dist()
{
	local plot_expr="$1"
	local y_max="$2"
	local png_name="$3"
	local title="$4"
	local dimensions="$5"
	local label="$6"

	cat >plot.tmp <<EOF
set terminal pngcairo notransparent enhanced font "arial,10" size $dimensions
set output '$png_name'
set title "$title" 
set yrange [  : $y_max ] noreverse nowriteback
set lmargin  9
set rmargin  2
set autoscale xfixmin
#set xdata time
set termoption dash
set timefmt "%s"
set key autotitle columnhead
set style fill transparent solid 0.5 noborder
set xtics rotate
set style data histogram
set label "$label" at graph 0.99,0.8 right
plot $plot_expr
EOF

	echo "creating $png_name" >&2
	gnuplot plot.tmp
}


# create schema
#
# This runs on every invocation, before the arguments are examined.  Each
# statement uses "create table if not exists", so tables already present
# in the cache db are not recreated.

# one of these for each comparison run
# (tags mode stores its basis branch argument in basis_hash and its tag
# regexp in comp_hash; the tags column is not written by the visible code)

sq3 "create table if not exists runs (\
	run_key integer primary key autoincrement, \
	basis_hash varchar(50), \
	comp_hash varchar(50), \
	tags integer, \
	schema_ver integer \
);"

# one of these for each snapshot compared
# (make_stat fills it: run_idx is the run_key of the owning run, ref_name
# and basis_name are "git describe" output, ref_date and basis_date are
# %ct commit timestamps, and the last three columns are parsed from
# "git diff --shortstat")

sq3 "create table if not exists snapshots (\
	snap_idx integer primary key autoincrement, \
	run_idx integer, \
	ref_name varchar(50), \
	ref_date integer, \
	basis_name varchar(50), \
	basis_date integer, \
	files_changed integer, \
	loc_added integer, \
	loc_removed integer \
);"

# one of these for each dir changed in the snapshot
# we have run_key here as well since it simplifies finding all paths
# (as written by make_stat, dir_name is the path of one changed file from
# "git diff --numstat" and dir_depth is the number of '/' in it plus one)

sq3 "create table if not exists dir_summary (\
	key integer primary key autoincrement, \
	snap_idx integer, \
	run_key integer, \
	dir_name varchar(150), \
	dir_depth integer, \
	loc_added integer, \
	loc_removed integer \
);"


# Called with no arguments: print the comprehensive help on stderr and
# exit with status 1.

if [ -z "$1" ] ; then
	# The delimiter is unquoted so that $0 expands to the script name;
	# \$ prints a literal shell-prompt sign (a bare $$ would print the
	# process id instead).
	cat >&2 <<EOF
Usage: $0 <basis branch> --tags <tag regexp>
       $0 --plot <run #> [ subdir ]
       $0 --plot <run # a> < - | subdir > <run # b>

$0 can be run in two modes, either create a 'run' in
the sqlite3 db cache, or create graphs about one or
comparing two runs already in the db cache

Creating a 'run' from one or more tags
--------------------------------------

A 'run' is created by studying one or more tags against
a 'basis branch' to isolate the patches on top of the
tag's basis point.  So if you have a kernel branch that
is tracking mainline, the various tags you have on that
kernel branch may be based on different mainline versions.
$0 can autodiscover for each tag where the basis point is
if you just give him the basis branch name, eg, 'mainline'.

 \$ $0 mainline --tags mybranch-tagname-regexp

Notice that the tag name to analyze on one 'run' is a regexp.
It's fine to have many tags analyzed in one 'run'.

When the run starts, the run number is reported and you
should make a note of it


Plotting graphs from one or two runs
------------------------------------

After the analysis for the tags you are interested in has
been captured into 'runs' in the sqlite3 db cache, you can
run the script to produce png and gif graphs showing or
comparing the data from different runs.

There's no requirement at all that the different runs have
anything in common in their history, basis or content,
giving a lot of flexibility in the comparisons.

To produce graphs about one run itself:

 \$ $0 --plot <run #> [ subdir ]

If subdir is missing, the whole tree is analysed, if given
the analysis is restricted to the subdirectory given.

To produce graphs comparing two runs:

 \$ $0 --plot <run # a> < - | subdir > <run # b>

If there is no subdir restriction, - must be given.
<run # b> must contain only one tag in this case.

Graphs will be produced with the union of information
from run a and run b, showing run a in blue and run b in red.
EOF

	exit 1
fi

# The third argument is the optional subdir filter; a lone "-" stands for
# "no restriction".  (This runs in every mode, so in tags mode FILTER
# simply holds the tag regexp.)
FILTER=$3
if [ "$FILTER" = "-" ] ; then
	FILTER=
fi

FILTERLEN=${#FILTER}

# number of '/' in the filter plus one (wc also counts echo's newline)
F_DEPTH=$(echo "$FILTER" | sed 's|[^/]||g' | wc -c)

# LEVELS is the cut -f field list choosing how many leading path
# components make up a column name; a deeper filter keeps more of them
if [ -z "$FILTER" ] ; then
	LEVELS=1,2
else
	case $(( F_DEPTH )) in
	1)
		LEVELS=1,2,3
		;;
	2)
		LEVELS=1,2,3,4
		;;
	3)
		LEVELS=1,2,3,4,5
		;;
	*)
		LEVELS=1,2,3,4,5,6
		;;
	esac
fi

#
# plot mode
#
# $2: run to plot, $3: subdir filter or "-" (already parsed into FILTER and
# LEVELS above), $4: optional comparison run
#

# Print $1 as a single-quoted SQL string literal, doubling any embedded
# single quote.  Double-quoted literals are avoided because sqlite only
# accepts them as a legacy quirk.
function sql_str()
{
	local q="'"
	printf "'%s'" "${1//$q/$q$q}"
}

# Succeed when .plot.cols lists a column lying beneath directory $1.
function has_sub_cols()
{
	local c

	while IFS= read -r c ; do
		if [[ $c == "$1/"* ]] ; then
			return 0
		fi
	done < .plot.cols
	return 1
}

# Print sum($1) over the dir_summary rows of snapshot $2 that belong to
# column $3 (a directory listed in .plot.cols, or "." for files at the
# top of the tree).  Prints an empty line when no row matches.
function col_sum()
{
	local where prefix slashes

	if [ "$3" = "." ] ; then
		# dirname yields "." for files that sit at the top of the
		# tree; those are exactly the rows with dir_depth 1
		prefix=$(sql_str "$FILTER")
		where="dir_depth=1 and \
			substr(dir_name, 1, length($prefix))=$prefix"
	else
		# match on "dir/" so that arch/arm does not also collect
		# arch/arm64
		prefix=$(sql_str "$3/")
		where="substr(dir_name, 1, length($prefix))=$prefix"
		if has_sub_cols "$3" ; then
			# its subdirs have columns of their own, so only
			# count files directly in this dir
			# eg arch, but arch/arm is handled elsewhere
			slashes=${3//[^\/]/}
			where="$where and dir_depth=$(( ${#slashes} + 2 ))"
		fi
	fi

	sq3 "select sum($1) from dir_summary where \
		snap_idx=$2 and $where" | head -n1
}

if [ "$1" = "--plot" ] ; then

	PLOT_RUN=$2
	COMP_RUN=$4

	# run numbers are pasted into SQL, so insist on plain integers
	case "$PLOT_RUN" in
	""|*[!0-9]*)
		>&2 echo "Need a numeric run # with --plot"
		exit 1
		;;
	esac
	case "$COMP_RUN" in
	*[!0-9]*)
		>&2 echo "Comparison run # must be numeric"
		exit 1
		;;
	esac

	R=$(sq3 "select comp_hash,basis_hash from runs where \
		run_key=$PLOT_RUN" | tr '|' '-')
	if [ -z "$R" ] ; then
		>&2 echo "No run $PLOT_RUN in $DB"
		exit 1
	fi

	#
	# get a list of snapshot idxs for both runs combined
	#

	if [ -n "$COMP_RUN" ] ; then
		RUN_IDX_COMP="(run_idx=$PLOT_RUN or run_idx=$COMP_RUN)"
		RUN_KEY_COMP="(run_key=$PLOT_RUN or run_key=$COMP_RUN)"
		CR=$(sq3 "select comp_hash,basis_hash from runs where \
			run_key=$COMP_RUN" | tr '|' '-')
		if [ -z "$CR" ] ; then
			>&2 echo "No run $COMP_RUN in $DB"
			exit 1
		fi
		R="$R-VS-$CR"
	else
		RUN_IDX_COMP="run_idx=$PLOT_RUN"
		RUN_KEY_COMP="run_key=$PLOT_RUN"
	fi
	# quoted: R holds the tag regexp, which must not be glob-expanded
	echo "$R"

	# our snapshots
	SN=$(sq3 "select snap_idx from snapshots where run_idx=$PLOT_RUN")

	# there's a comparison snapshot?
	SNC=
	if [ -n "$COMP_RUN" ] ; then
		SNC=$(sq3 "select snap_idx from snapshots where \
			run_idx=$COMP_RUN")
		COUNT=
		for i in $SNC ; do
			if [ -n "$COUNT" ] ; then
				>&2 echo "Must be single comparison snapshot"
				exit 1
			fi
			COUNT=x
		done
	fi

	rm -f .plot.tmp
	rm -f .plot.cols .plot.cols1
	rm -f .plot.dist
	# make sure both lists exist even when the query matches nothing
	touch .plot.cols .plot.cols1

	#
	# using both runs if two given,
	# create the column header row, and fill .plot.cols with the
	# list of files / dirs changed in this view of the diff
	#

	echo -n "basis_name basis_date ref_name ref_date files add del " \
								> .plot.tmp
	FILTER_SQL=$(sql_str "$FILTER")
	sq3 "select dir_name from dir_summary where \
		$RUN_KEY_COMP and \
		substr(dir_name, 1, length($FILTER_SQL))=$FILTER_SQL \
		order by loc_added,loc_removed asc" | \
		cut -d'/' -f"$LEVELS" | while IFS= read -r i ; do
		if [[ $i == Documentation/* ]] ; then
			echo "Documentation" >> .plot.cols1
		elif [ "$i" != "." ] ; then
			# don't allow individual files
			# (-d looks at the tree in the current directory)
			if [ ! -d "$i" ] ; then
				dirname "$i" >> .plot.cols1
			else
				printf '%s\n' "$i" >> .plot.cols1
			fi
		fi
	done

	#
	# put the column titles in place and write out the
	# filtered list of files/dirs we will care about
	#
	sort -u .plot.cols1 | while IFS= read -r i ; do
		echo -n "$i " >> .plot.tmp
		printf '%s\n' "$i" >> .plot.cols
	done
	echo >> .plot.tmp

	#
	# find out how many snapshots created by the run he's using
	# it doesn't include any comparison snapshot
	#
	N=0
	for i in $SN ; do
		N=$(( N + 1 ))
	done

	# how many files were changed
	CHANGEDFILES=$(( $(wc -l < .plot.cols) ))

	>&2 echo "Studying $N snapshots"
	>&2 echo "Total $CHANGEDFILES files changed"

	#
	# For each snapshot, go through the list of changed files/dirs and
	# find out how much changed there in that snapshot
	#
	T=1
	for i in $SN ; do
		L=$(sq3 "select basis_name, basis_date, ref_name, \
			ref_date, files_changed, loc_added, loc_removed \
			from snapshots where snap_idx=$i" | tr '|' ' ')
		# the trailing space keeps the last fixed column apart from
		# the first per-dir value appended below
		printf '%s ' "$L" >> .plot.tmp

		>&2 echo -n -e "Snapshot $T/$N: $(cut -d' ' -f3 <<< "$L")      \r"
		T=$(( T + 1 ))

		while IFS= read -r j ; do
			A=$(col_sum loc_added "$i" "$j")
			echo -n "${A:-0} " >> .plot.tmp
		done < .plot.cols
		echo >> .plot.tmp
	done

	>&2 echo

	#
	# for plots related to changes over time, we can do them now
	#

	issue_plot_time "'.plot.tmp' using \
		4:6 notitle  with filledcurve y1=0 lc rgb \"#0000ff\"" \
		"growth-$R-LOC.png" "$R growth in LOC"

	issue_plot_time "'.plot.tmp' using \
		4:( (\$4-\$2)/(24 * 3600) ) notitle \
		with filledcurve y1=0 lc rgb \"#0000ff\"" \
		   "growth-$R-basis-age.png" "$R growth basis age (days)"

	rm -f .plot.tmp1
	touch .plot.tmp1

	#
	# for each file / dir that has changes in any snapshot, for each
	# snapshot calculate its changes and create a unified plot data file
	#
	# BIGGEST tracks the largest net growth seen; it becomes the top
	# of the y range of every distribution plot
	#
	BIGGEST=0

	T=1
	while IFS= read -r j ; do

		>&2 echo -n -e "File $T/$CHANGEDFILES    \r"
		T=$(( T + 1 ))

		echo -n "$j " >> .plot.tmp1

		for i in $SN $SNC ; do
			A=$(col_sum loc_added "$i" "$j")
			D=$(col_sum loc_removed "$i" "$j")

			if [ -n "$A" ] && [ -n "$D" ] ; then
				V=$(( A - D ))

				echo -n "$V " >> .plot.tmp1

				if [ "$V" -gt "$BIGGEST" ] ; then
					BIGGEST=$V
				fi
			else
				echo -n "0 " >> .plot.tmp1
			fi
		done

		echo >> .plot.tmp1
	done < .plot.cols
	echo "$BIGGEST" > .biggest

	>&2 echo

	echo -n "idx dir " > .plot.tmp
	for i in $SN $SNC ; do
		V=$(sq3 "select ref_name \
			from snapshots where snap_idx=$i")

		echo -n "$V " >> .plot.tmp
	done
	echo >> .plot.tmp

	# rows ordered by the value for the last snapshot of the run
	C=0
	sort -k$(( N + 1 )) -nr .plot.tmp1 | while read -r i ; do
		echo "$C $i" >> .plot.tmp
		C=$(( C + 1 ))
	done

	WIDTH=$(( 16 * $(wc -l < .plot.cols) ))
	if [ "$WIDTH" -lt 640 ] ; then
		WIDTH=640
	fi

	# filter as it appears in output file names ("arch/arm" -> "arch_arm-")
	_FILTER=${FILTER//\//_}
	if [ -n "$_FILTER" ] ; then
		_FILTER=$_FILTER-
	fi

	# plot each snapshot in turn

	C=1
	while [ "$C" -le "$N" ] ; do

		TOT=$(tail -n+2 .plot.tmp | \
			cut -d' ' -f$(( C + 2 )) | paste -sd+ - | bc)

		PL="'.plot.tmp' using 1:$(( C + 2 )):xtic(2) \
                        w boxes lc rgb \"#0000ff\" "

		Q=$(head -n1 .plot.tmp | cut -d' ' -f$(( C + 2 )))

		_TOT="Total LOC $Q: $TOT"

		echo "$_TOT"

		if [ -n "$SNC" ] ; then
			# the comparison snapshot is the column after the
			# N columns of the plotted run
			PL="'.plot.tmp' using 1:$(( N + 3 )):xtic(2) \
                        w boxes lc rgb \"#ff0000\",$PL"
			TOTC=$(tail -n+2 .plot.tmp | \
				cut -d' ' -f$(( N + 3 )) | paste -sd+ - | bc)
			Q=$(head -n1 .plot.tmp | cut -d' ' -f$(( N + 3 )))

			_TOT="Total LOC $Q: $TOTC\n$_TOT"
		fi

		issue_plot_file_dist \
			"$PL" "$BIGGEST" \
			"growth-$R-dist-$_FILTER$(printf %04d "$C").png" \
			"$R patch distribution (LOC) $3" \
			"$WIDTH,480" "$_TOT"
		C=$(( C + 1 ))
	done

	# The frames carry the filter in their name, so the glob must too.
	# With a filter the animation gets its own name and no longer
	# overwrites the whole-tree one.
	GIF="growth-$R-dist.gif"
	if [ -n "$_FILTER" ] ; then
		GIF="growth-$R-dist-${_FILTER%-}.gif"
	fi

	>&2 echo "Converting gif"
	convert -delay 50 -loop 0 "growth-$R-dist-$_FILTER"????.png "$GIF"

	exit 0
fi

#
# Tagged rebase tree mode
#

# Create a new run and record one snapshot for every tag matching a regexp.
#
# $1: basis branch, $2: tag regexp (as understood by grep)
#
# Returns 1 without touching the db when no tag matches.  The run number
# and each tag name are reported on stderr.

function tags_mode()
{
	local q="'"
	local tags tag index runkey

	# look for the tags first so that a regexp matching nothing does
	# not leave an empty run behind in the db
	tags=$(git tag | grep -e "$2")
	if [ -z "$tags" ] ; then
		>&2 echo "No tags match '$2'"
		return 1
	fi

	# SQL strings use single quotes (with ' doubled); double-quoted
	# literals are only accepted by sqlite as a legacy quirk
	sq3 "insert into runs ( \
			run_key, basis_hash, comp_hash, schema_ver) \
			values (NULL, '${1//$q/$q$q}', '${2//$q/$q$q}', $SCHEMA_VER); \
		"
	runkey=$(sq3 "select seq from sqlite_sequence where name='runs'")

	>&2 echo "tags mode -- run $runkey"
	index=0
	# the tag list arrives on fd 3 so that nothing run by make_stat
	# can swallow it from stdin
	while IFS= read -r tag <&3 ; do
		>&2 echo "$tag"
		make_stat "$1" "$tag" "" "$index" "$runkey"
		index=$(( index + 1 ))
	done 3<<< "$tags"

	return 0
}

if [ "$2" = "--tags" ] ; then
	if [ -z "$3" ] ; then
		>&2 echo "Need tag regexp filter with --tags"
		exit 1
	fi

	tags_mode "$1" "$3" || exit 1

	exit 0
fi

# The usage, --plot and --tags paths above all end with their own exit, so
# this point is reached only when none of them applied.  The script stops
# here with status 0; nothing below this line is ever executed.
exit 0

# ---> untested

#
# History tree mode
#
# Intended to record one snapshot per commit made since the basis point.
# $1: basis branch, $2: optional comparison ref
#

# compare against $2 when given, otherwise against the current branch name
if [ ! -z "$2" ] ; then
	COMP=$2
else
	COMP=`git rev-parse --abbrev-ref HEAD`
fi

# leaves the basis point in BP
basis_point $1 $COMP

# abbreviated hashes of the commits after BP, reversed by tac
# NOTE(review): "$BP.." has no right-hand side, which git reads as HEAD, so
# COMP is not used here even when $2 names another ref -- confirm intent
git log $BP.. --oneline | \
	cut -d' ' -f1 | \
	tac > .patches.tmp

TODO=`wc -l .patches.tmp | cut -d' ' -f 1`
C=1

# NOTE(review): make_stat is given only three arguments here, so its $4
# (index) and $5 (run_key) are empty, and no row is added to the runs
# table in this mode; this would need fixing before the code is enabled
cat .patches.tmp | while read i ; do
	>&2 echo "Patch $C/$TODO"
	make_stat $1 $i $BP
	C=$(( $C + 1 ))
done

exit 0