aboutsummaryrefslogtreecommitdiff
path: root/analyse-logs.sh
diff options
context:
space:
mode:
authorPaul Sokolovsky <paul.sokolovsky@linaro.org>2014-02-17 18:58:39 +0000
committerPaul Sokolovsky <paul.sokolovsky@linaro.org>2014-02-17 18:58:39 +0000
commit82d30707ae69285981970af87193ea6cdb954ba1 (patch)
tree8f0307b646e969457ef9867e35cb6a49115a1f3d /analyse-logs.sh
parent017dc19af71052394989085e11273be3bd8608e2 (diff)
Capture latest changes as of 2014-02-17.
Diffstat (limited to 'analyse-logs.sh')
-rwxr-xr-xanalyse-logs.sh112
1 files changed, 73 insertions, 39 deletions
diff --git a/analyse-logs.sh b/analyse-logs.sh
index 6b98489..338f78c 100755
--- a/analyse-logs.sh
+++ b/analyse-logs.sh
@@ -40,16 +40,24 @@ fi
#WEB_NAME="releases.linaro.org"
#WEB_NAME="www.linaro.org"
-# Which tools are we running
-#AWFFULL=$TRUE
-#WEBALIZER=$TRUE
-#WEBDRUID=$TRUE
-#VISITORS=$TRUE
+# Which tools are we running, if not set at all set to false
+if [ -z "$AWFFULL" ] ; then
+ AWFFULL=$FALSE
+fi
+if [ -z "$WEBALIZER" ] ; then
+ WEBALIZER=$FALSE
+fi
+if [ -z "$WEBDRUID" ] ; then
+ WEBDRUID=$FALSE
+fi
+if [ -z "$VISITORS" ] ; then
+ VISITORS=$FALSE
+fi
# this allows an external script to set DEBUG, or if it's not set,
# then set it to false here so the script is run quietly
if [ -z "$DEBUG" ] ; then
- DEBUG=$TRUE
+ DEBUG=$FALSE
fi
# do we want to extract file info and run the log analyzers on only that data
@@ -88,11 +96,12 @@ TMP_LOG_NAME="tmp.$PROCESSED_LOG_NAME"
FILTERED_LOG_NAME="filtered.$PROCESSED_LOG_NAME"
TOOLCHAIN_LOG_NAME="toolchain.$PROCESSED_LOG_NAME"
-if [ $WEB_NAME = "snapshots.linaro.org" ] || [ $WEB_NAME = "releases.linaro.org" ] ; then
- RAW_LOG_NAME="$WEB_NAME-$PROCESSED_LOG_NAME"
-elif [ $WEB_NAME = "www.linaro.org" ] ; then
- RAW_LOG_NAME="$PROCESSED_LOG_NAME"
-fi
+RAW_LOG_NAME="*access.log-2014*"
+#if [ $WEB_NAME = "snapshots.linaro.org" ] || [ $WEB_NAME = "releases.linaro.org" ] ; then
+# RAW_LOG_NAME="$WEB_NAME-$PROCESSED_LOG_NAME"
+#elif [ $WEB_NAME = "www.linaro.org" ] ; then
+# RAW_LOG_NAME="$PROCESSED_LOG_NAME"
+#fi
#########################################################################
# #
@@ -420,34 +429,48 @@ cleanup ()
extract_logs ()
{
+ # Now in 2014 we can just preprocess all 2012 and 2013 files and save processing time for all 3 web servers
+ # then just grab all of the 2014 files to process
+ # *access.log-2014*
+ # preprocessed-*-2013-access.log.gz
+ x=`ls $CLOG/$RAW_LOG_NAME | wc -l`
+ if [ x > 0 ] ; then
+ zcat $CLOG/$RAW_LOG_NAME > $PPATH/$TMP_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "$WEB_NAME making access.log by zcat $CLOG/$RAW_LOG_NAME"
+ fi
+ fi
+ # 2013 logs preprocessed into a single compressed file to save processing time.
+ zcat $CLOG/preprocessed*access.log.gz > $PPATH/$PROCESSED_LOG_NAME
+
# make sure there is not an existing access.log file
# Build a single log file that is not gzipped.
- cat /dev/null > $PPATH/$TMP_LOG_NAME
- if [ $WEB_NAME = "www.linaro.org" ] ; then
+# cat /dev/null > $PPATH/$TMP_LOG_NAME
+# if [ $WEB_NAME = "www.linaro.org" ] ; then
# This is because the logs were stored differently for part of the year
# on www.linaro.org
# zcat the daily files that are stored by date access.log-yyyymmdd on to
# the tail of the new access.log file.
- x=`ls $CLOG/$PROCESSED_LOG_NAME-*.gz | wc -l`
- if [ x > 0 ] ; then
- zcat $CLOG/$PROCESSED_LOG_NAME-*.gz >> $PPATH/$TMP_LOG_NAME
- if [ $DEBUG -eq $TRUE ] ; then
- echo "$WEB_NAME making access.log by zcat $CLOG/$PROCESSED_LOG_NAME-*.gz"
- fi
- fi
+# x=`ls $CLOG/$PROCESSED_LOG_NAME-*.gz | wc -l`
+# if [ x > 0 ] ; then
+# zcat $CLOG/$PROCESSED_LOG_NAME-*.gz >> $PPATH/$TMP_LOG_NAME
+# if [ $DEBUG -eq $TRUE ] ; then
+# echo "$WEB_NAME making access.log by zcat $CLOG/$PROCESSED_LOG_NAME-*.gz"
+# fi
+# fi
# zcat the preprocessed log into the file log, 9 Gig's worth
- zcat $CLOG/preprocessed-$PROCESSED_LOG_NAME-*.gz > $PPATH/$PROCESSED_LOG_NAME
+# zcat $CLOG/preprocessed-$PROCESSED_LOG_NAME-*.gz > $PPATH/$PROCESSED_LOG_NAME
# releases and snapshots are stored with a "slightly" different naming convention
# releases.linaro.org-access.log-yyyymmdd.gz & snapshots.linaro.org-access.log-yyyymmdd.gz
- elif [ $WEB_NAME = "releases.linaro.org" ] || [ $WEB_NAME = "snapshots.linaro.org" ] ; then
- x=`ls $CLOG/$RAW_LOG_NAME-*.gz | wc -l`
- if [ x > 0 ] ; then
- zcat $CLOG/$RAW_LOG_NAME-*.gz > $PPATH/$TMP_LOG_NAME
- if [ $DEBUG -eq $TRUE ] ; then
- echo "$WEB_NAME making access.log by zcat $CLOG/$RAW_LOG_NAME-*.gz"
- fi
- fi
- fi
+# elif [ $WEB_NAME = "releases.linaro.org" ] || [ $WEB_NAME = "snapshots.linaro.org" ] ; then
+# x=`ls $CLOG/$RAW_LOG_NAME-*.gz | wc -l`
+# if [ x > 0 ] ; then
+# zcat $CLOG/$RAW_LOG_NAME-*.gz > $PPATH/$TMP_LOG_NAME
+# if [ $DEBUG -eq $TRUE ] ; then
+# echo "$WEB_NAME making access.log by zcat $CLOG/$RAW_LOG_NAME-*.gz"
+# fi
+# fi
+# fi
if [ $DO_REV_DNS_LOOKUP -eq $TRUE ] || [ $DO_GEOIP_LOOKUP -eq $TRUE ] ; then
# If it's www.linaro.org build the DNS database
@@ -463,7 +486,8 @@ extract_logs ()
# Now translate ip addresses to DNS names for all log files
if [ $DO_GEOIP_LOOKUP -eq $TRUE ] ; then
- # if GEOIP LOOKUP requested do that first and then do reverse DNS
+ # if GEOIP LOOKUP is desired do both GEOIP and reverse DNS lookup at the same time
+ # the iploc.py program was modified to read both databases and do both in one pass.
if [ $DEBUG -eq $TRUE ] ; then
echo "About to do GEOIP LOOKUP and dnshistory replace"
fi
@@ -482,15 +506,24 @@ extract_logs ()
cat $PPATH/$TMP_LOG_NAME >> $PPATH/$PROCESSED_LOG_NAME
fi
- # now make a new file with only .gz and bz2 files downloaded
+ # now make a new file with only .gz, bz2, xz,exe, and zip files downloaded
# this grep can take some time to run, it's using a regular expression to extract compressed files
if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
- cat $PPATH/$PROCESSED_LOG_NAME | grep -E '\<*\.(bz2|gz|xz|exe|zip)\>' | grep -v -E '\<*\.asc\>' | grep -v HEAD | grep -v "gcc-linaro\ " > $PPATH/$TMP_LOG_NAME
+ cat $PPATH/$PROCESSED_LOG_NAME | grep -E '\<*\.(bz2|gz|xz|exe|zip)\>' | grep -v "gcc-linaro\ " > $PPATH/$TMP_LOG_NAME
if [ $DEBUG -eq $TRUE ] ; then
echo "creating filtered log"
fi
- # strip out our known IP's
- cat $PPATH/$TMP_LOG_NAME | grep -v validation.linaro.org > $PPATH/$FILTERED_LOG_NAME
+ # strip out our known IP's and some standard extra junk we don't need or care about
+ cat $PPATH/$TMP_LOG_NAME \
+ | grep -v .asc \
+ | grep -v HEAD \
+ | grep -v OPTIONS \
+ | grep -v .png \
+ | grep -v .ico \
+ | grep -v .css \
+ | grep -v .js \
+ | grep -v validation.linaro.org \
+ > $PPATH/$FILTERED_LOG_NAME
if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
if [ $DEBUG -eq $TRUE ] ; then
echo "creating toochain log"
@@ -626,9 +659,10 @@ extract_logs
process_logs
# cleanup the extra files and stuff
-cleanup
-
-# change back to where we were called from
-cd $STARTING_LOCATION
+if [ $DEBUG -eq $FALSE ] ; then
+ cleanup
+ # change back to where we were called from
+ cd $STARTING_LOCATION
+fi
# done, out of here