aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Sokolovsky <paul.sokolovsky@linaro.org>2016-01-15 14:04:57 +0200
committerPaul Sokolovsky <paul.sokolovsky@linaro.org>2016-01-15 14:04:57 +0200
commit67eba98d03d6341b3961ee64005eaf0343cee99e (patch)
treeabdfbe69a6e79fc6fd7427c78dd7e7a0f625b031
parent4bcaad6be4c8fd4789c1b7a3a179892c1e473ba6 (diff)
analyse-logs-old.sh: Copy of analyse-logs.sh, before applying more changes.
analyse-logs.sh is going to be switched to use incrementally resolved logs as produced by resolve-logs-incremental.sh script. Change-Id: I18d80f6994b74f13763f1a1722e3febd0235369c
-rwxr-xr-xanalyse-logs-old.sh632
1 files changed, 632 insertions, 0 deletions
diff --git a/analyse-logs-old.sh b/analyse-logs-old.sh
new file mode 100755
index 0000000..c3e1fd5
--- /dev/null
+++ b/analyse-logs-old.sh
@@ -0,0 +1,632 @@
+#!/bin/bash
+#########################################################################
+# #
+# This script processes linaro web logs, producing different reports #
+# Reports that can be run: #
+# 1) Awffull (webalizer fork) #
+# 2) Webalizer #
+# 3) Webdruid (webalizer fork) #
+# 4) Visitors #
+# #
+# You can run those tools on the full log or a processed log #
+# The full log shows path traveled and other info, the processed log #
+# really only shows what files were downloaded, for releases and #
+# snapshots.linaro.org this is important information. #
+# #
+# 1) full log analysis of an unaltered web log #
+# 2) filtered analyses of the web log of only the .gz and .bz2 files #
+# and everything else stripped out #
+# #
+# To speed things up we use dnshistory to do reverse DNS #
+# as a time cheat we only run the lookup on www.linaro.org log so #
+# you should always process that log first so your DNS data is fresh #
+# #
+#########################################################################
+
+# first save where we are
+STARTING_LOCATION=`pwd`
+
+# house keeping
+if [ -z "$WEB_NAME" ] || [ -z "$TRUE" ] ; then
+ echo "WEB SITE NAME or other variables NOT SET"
+ echo "This script is not designed to be called directly"
+ exit 1
+fi
+
+#TRUE=1
+#FALSE=0
+
+#WEB_NAME="snapshots.linaro.org"
+#WEB_NAME="releases.linaro.org"
+#WEB_NAME="www.linaro.org"
+
+# Which tools are we running, if not set at all set to false
+if [ -z "$AWFFULL" ] ; then
+ AWFFULL=$FALSE
+fi
+if [ -z "$WEBALIZER" ] ; then
+ WEBALIZER=$FALSE
+fi
+if [ -z "$WEBDRUID" ] ; then
+ WEBDRUID=$FALSE
+fi
+if [ -z "$VISITORS" ] ; then
+ VISITORS=$FALSE
+fi
+
+# this allows an external script to set DEBUG, or if it's not set,
+# then set it to false here so the script is run quietly
+if [ -z "$DEBUG" ] ; then
+ DEBUG=$FALSE
+fi
+
+# do we want to extract file info and run the log analyzers on only that data
+# quite handy for snapshots and releases .linaro.org so we get a better
+# picture of what is downloaded.
+# 1 = true, 0 = false
+if [ $WEB_NAME = "snapshots.linaro.org" ] || [ $WEB_NAME = "releases.linaro.org" ] || [ $WEB_NAME = "builds.96boards.org" ]; then
+ EXTRACT_GZ_BZ2_FILES=$TRUE
+ EXTRACT_TOOLCHAIN_LOG=$TRUE
+else
+ EXTRACT_GZ_BZ2_FILES=$FALSE
+ EXTRACT_TOOLCHAIN_LOG=$FALSE
+fi
+
+# this allows an external script to set DO_GEOIP_LOOKUP or DO_REV_DNS_LOOKUP
+# as desired but by default they are set to true so the log files have
+# as much data in them as possible.
+if [ -z "$DO_GEOIP_LOOKUP" ] ; then
+ DO_GEOIP_LOOKUP=$TRUE
+fi
+if [ -z "$DO_REV_DNS_LOOKUP" ] ; then
+ DO_REV_DNS_LOOKUP=$TRUE
+fi
+
+# Load config
+CONFIG=${1:-config}
+source $CONFIG
+
+#if [ $WEB_NAME = "snapshots.linaro.org" ] || [ $WEB_NAME = "releases.linaro.org" ] ; then
+# RAW_LOG_NAME="$WEB_NAME-$PROCESSED_LOG_NAME"
+#elif [ $WEB_NAME = "www.linaro.org" ] ; then
+# RAW_LOG_NAME="$PROCESSED_LOG_NAME"
+#fi
+
+#########################################################################
+# #
+# only have old logs on www.linaro.org as it's hard to get stuff done #
+# on that machine #
+# #
+#########################################################################
+# Only www.linaro.org keeps its old logs in a 2013 sub-directory of
+# INPUT_PATH; every other site uses INPUT_PATH itself and has no old log
+# name.  WEB_NAME is quoted so odd values cannot break the test.
+if [ "$WEB_NAME" = "www.linaro.org" ] ; then
+    OLD_LOG_PATH="$INPUT_PATH/2013"
+    OLD_LOG_NAME="$PROCESSED_LOG_NAME"
+else
+    OLD_LOG_PATH="$INPUT_PATH"
+    OLD_LOG_NAME=""
+fi
+
+AWFFULL_FULL_PATH_NAME="awffull.full"
+AWFFULL_CONF_FULL_NAME="awffull.conf.full"
+
+AWFFULL_FILTERED_PATH_NAME="awffull.filtered"
+AWFFULL_CONF_FILTERED_NAME="awffull.conf.filtered"
+
+AWFFULL_TOOLCHAIN_PATH_NAME="awffull.toolchain"
+AWFFULL_CONF_TOOLCHAIN_NAME="awffull.conf.toolchain"
+
+AWFFULL_EXTRA_CMD_LINE_OPTIONS=""
+
+WEBALIZER_FULL_PATH_NAME="webalizer.full"
+WEBALIZER_CONF_FULL_NAME="webalizer.conf.full"
+
+WEBALIZER_FILTERED_PATH_NAME="webalizer.filtered"
+WEBALIZER_CONF_FILTERED_NAME="webalizer.conf.filtered"
+
+WEBALIZER_TOOLCHAIN_PATH_NAME="webalizer.toolchain"
+WEBALIZER_CONF_TOOLCHAIN_NAME="webalizer.conf.toolchain"
+
+WEBALIZER_EXTRA_CMD_LINE_OPTIONS=""
+
+WEBDRUID_FULL_PATH_NAME="webdruid.full"
+WEBDRUID_CONF_FULL_NAME="webdruid.conf.full"
+
+WEBDRUID_FILTERED_PATH_NAME="webdruid.filtered"
+WEBDRUID_CONF_FILTERED_NAME="webdruid.conf.filtered"
+
+WEBDRUID_TOOLCHAIN_PATH_NAME="webdruid.toolchain"
+WEBDRUID_CONF_TOOLCHAIN_NAME="webdruid.conf.toolchain"
+
+WEBDRUID_EXTRA_CMD_LINE_OPTIONS=""
+
+write.conf.file ()
+{
+# Write (overwriting) an analyzer configuration file.
+#
+# $1 = File to write the conf into
+# $2 = Output directory for the analysis to be written into
+# $3 = Info to be written into the web analysis to id what you are looking at
+#
+# Reads the globals WEB_NAME, WORK_PATH and WEBDRUID_CONF_FULL_NAME.
+
+    local conf_file=$1
+    local output_dir=$2
+    local log_label=$3
+
+    # One grouped redirection: the file is opened once, and the quoted
+    # target survives a path containing whitespace.
+    {
+        echo "# DO NOT EDIT THIS FILE, modify analyse-snapshot.sh as it rewrites this file everytime it's run"
+        echo "OutputDir $output_dir"
+        echo "HTMLPost $log_label"
+        echo "ReportTitle \"Usage Statistics for the $log_label of \""
+        echo "HostName $WEB_NAME"
+        echo "ReallyQuiet yes"
+        echo "TopSites 100"
+        echo "AllSites yes"
+        echo "TopURLs 100"
+        echo "AllURLs yes"
+        echo "GeoIP no"
+#       echo "GeoIPDatabase $GEO_IP_DB"
+        echo "IgnoreURL /get-remote-static"
+        echo "IgnoreURL /linaro-openid/login"
+        echo "IgnoreURL /get-textile-files"
+        echo "IgnoreURL /css/*"
+        echo "IgnoreURL /static/*"
+        echo "IgnoreURL /js/*"
+        echo "IgnoreURL /license"
+
+        if [ "$WEB_NAME" = "cards.linaro.org" ] && [ "$conf_file" = "$WORK_PATH/$WEBDRUID_CONF_FULL_NAME" ]; then
+            # Graphs take too long to generate, being killed if tried
+            echo "PathGraph no"
+            echo "UsersFlow no"
+        fi
+    } > "$conf_file"
+}
+
+write_config_files ()
+{
+ if [ $AWFFULL -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$AWFFULL_CONF_FULL_NAME $OUTPUT_PATH/$AWFFULL_FULL_PATH_NAME $PROCESSED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.awffull.conf.full"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$AWFFULL_CONF_FILTERED_NAME $OUTPUT_PATH/$AWFFULL_FILTERED_PATH_NAME $FILTERED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.awffull.conf.filtered"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$AWFFULL_CONF_TOOLCHAIN_NAME $OUTPUT_PATH/$AWFFULL_TOOLCHAIN_PATH_NAME $TOOLCHAIN_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.awffull.conf.toolchain"
+ fi
+ fi
+ fi
+ fi
+
+ if [ $WEBALIZER -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBALIZER_CONF_FULL_NAME $OUTPUT_PATH/$WEBALIZER_FULL_PATH_NAME $PROCESSED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webalizer.conf.full"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBALIZER_CONF_FILTERED_NAME $OUTPUT_PATH/$WEBALIZER_FILTERED_PATH_NAME $FILTERED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webalizer.conf.filtered"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBALIZER_CONF_TOOLCHAIN_NAME $OUTPUT_PATH/$WEBALIZER_TOOLCHAIN_PATH_NAME $TOOLCHAIN_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webalizer.conf.toolchain"
+ fi
+ fi
+ fi
+ fi
+
+ if [ $WEBDRUID -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBDRUID_CONF_FULL_NAME $OUTPUT_PATH/$WEBDRUID_FULL_PATH_NAME $PROCESSED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webdruid.conf.full"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBDRUID_CONF_FILTERED_NAME $OUTPUT_PATH/$WEBDRUID_FILTERED_PATH_NAME $FILTERED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webdruid.conf.filtered"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ write.conf.file $WORK_PATH/$WEBDRUID_CONF_TOOLCHAIN_NAME $OUTPUT_PATH/$WEBDRUID_TOOLCHAIN_PATH_NAME $TOOLCHAIN_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "write.webdruid.conf.toolchain"
+ fi
+ fi
+ fi
+ fi
+}
+
+prep_directories ()
+{
+ if [ ! -d "$WORK_PATH" ]; then
+ mkdir -p "$WORK_PATH"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $WORK_PATH"
+ fi
+ fi
+ if [ $AWFFULL -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$AWFFULL_FULL_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$AWFFULL_FULL_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$AWFFULL_FULL_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$AWFFULL_FILTERED_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$AWFFULL_FILTERED_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$AWFFULL_FILTERED_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$AWFFULL_TOOLCHAIN_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$AWFFULL_TOOLCHAIN_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$AWFFULL_TOOLCHAIN_PATH_NAME"
+ fi
+ fi
+ fi
+ fi
+ fi
+ if [ $WEBALIZER -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBALIZER_FULL_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBALIZER_FULL_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBALIZER_FULL_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBALIZER_FILTERED_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBALIZER_FILTERED_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBALIZER_FILTER_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBALIZER_TOOLCHAIN_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBALIZER_TOOLCHAIN_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBALIZER_TOOLCHAIN_PATH_NAME"
+ fi
+ fi
+ fi
+ fi
+ fi
+ if [ $WEBDRUID -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBDRUID_FULL_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBDRUID_FULL_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBDRUID_FULL_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBDRUID_FILTERED_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBDRUID_FILTERED_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBDRUID_FILTERED_PATH_NAME"
+ fi
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ if [ ! -d "$OUTPUT_PATH/$WEBDRUID_TOOLCHAIN_PATH_NAME" ]; then
+ mkdir -p "$OUTPUT_PATH/$WEBDRUID_TOOLCHAIN_PATH_NAME"
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "mkdir -p $OUTPUT_PATH/$WEBDRUID_TOOLCHAIN_PATH_NAME"
+ fi
+ fi
+ fi
+ fi
+ fi
+
+ cd "$WORK_PATH"
+}
+
+cleanup ()
+{
+ # now delete the temp log file.
+ rm -f $WORK_PATH/$TMP_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$TMP_LOG_NAME"
+ fi
+
+ rm -f $WORK_PATH/$PROCESSED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$PROCESSED_LOG_NAME"
+ fi
+
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$FILTERED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$FILTERED_LOG_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$TOOLCHAIN_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$TOOLCHAIN_LOG_NAME"
+ fi
+ fi
+ fi
+
+ if [ $AWFFULL -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$AWFFULL_CONF_FULL_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$AWFFULL_CONF_FULL_NAME"
+ fi
+
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$AWFFULL_CONF_FILTERED_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$AWFFULL_CONF_FILTERED_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$AWFFULL_CONF_TOOLCHAIN_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$AWFFULL_CONF_TOOLCHAIN_NAME"
+ fi
+ fi
+ fi
+ fi
+
+ if [ $WEBALIZER -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$WEBALIZER_CONF_FULL_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBALIZER_CONF_FULL_NAME"
+ fi
+
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ if [ -f "$WORK_PATH/$WEBALIZER_CONF_FILTERED_NAME" ] ; then
+ rm -f $WORK_PATH/$WEBALIZER_CONF_FILTERED_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBALIZER_CONF_FILTERED_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$WEBALIZER_CONF_TOOLCHAIN_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBALIZER_CONF_TOOLCHAIN_NAME"
+ fi
+ fi
+ fi
+ fi
+ fi
+
+ if [ $WEBDRUID -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$WEBDRUID_CONF_FULL_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBDRUID_CONF_FULL_NAME"
+ fi
+
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$WEBDRUID_CONF_FILTERED_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBDRUID_CONF_FILTERED_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ rm -f $WORK_PATH/$WEBDRUID_CONF_TOOLCHAIN_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "rm -f $WORK_PATH/$WEBDRUID_CONF_TOOLCHAIN_NAME"
+ fi
+ fi
+ fi
+ fi
+}
+
+extract_logs ()
+{
+ # Build a single log file that is not gzipped.
+
+ # Now in 2014 we can just preprocess all 2012 and 2013 files and save processing time for all 3 web servers
+ # then just grab all of the 2014 files to process
+ # *access.log-2014*
+ # preprocessed-*-2013-access.log.gz
+ x=`ls $INPUT_PATH/$RAW_LOG_NAME | wc -l`
+ if [ x > 0 ] ; then
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "$WEB_NAME making access.log by zcat $INPUT_PATH/$RAW_LOG_NAME"
+ fi
+ zcat $INPUT_PATH/$RAW_LOG_NAME | grep -v "::1" > $WORK_PATH/$TMP_LOG_NAME
+ fi
+ # Previous years logs preprocessed into a single compressed file to save processing time.
+ if [ $DEBUG -eq $TRUE ] ; then
+ zcat $INPUT_PATH/preprocessed*access.log.gz > $WORK_PATH/$PROCESSED_LOG_NAME || true
+ else
+ zcat $INPUT_PATH/preprocessed*access.log.gz > $WORK_PATH/$PROCESSED_LOG_NAME 2>/dev/null || true
+ fi
+
+ if [ $DO_REV_DNS_LOOKUP -eq $TRUE ] || [ $DO_GEOIP_LOOKUP -eq $TRUE ] ; then
+ # If it's www.linaro.org build the DNS database
+ # This is a tad risky as we could have differnt folks coming directly
+ # into releases or snapshots then the main site, that said the risk is
+ # low and the speedup huge so it's worth it.
+ if [ $WEB_NAME = "www.linaro.org" ] ; then
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "About to do dnshistory lookup"
+ fi
+ if [ $DEBUG -eq $TRUE ] ; then
+ /usr/bin/dnshistory -L $DNSHISTORY_OPTS -d $DNSHISTORY_DB -f $WORK_PATH/$TMP_LOG_NAME
+ else
+ /usr/bin/dnshistory -L $DNSHISTORY_OPTS -d $DNSHISTORY_DB -f $WORK_PATH/$TMP_LOG_NAME > /dev/null
+ fi
+ fi
+
+ # Now translate ip addresses to DNS names for all log files
+ if [ $DO_GEOIP_LOOKUP -eq $TRUE ] ; then
+ # if GEOIP LOOKUP is desired do both GEOIP and reverse DNS lookup at the sametime
+ # the iploc.py program was modified to read both databases and do both in one pass.
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "About to do GEOIP LOOKUP and dnshistory replace"
+ fi
+ python $STARTING_LOCATION/iploc.py --config=$STARTING_LOCATION/$CONFIG \
+ $WORK_PATH/$TMP_LOG_NAME >> $WORK_PATH/$PROCESSED_LOG_NAME
+ else
+ # GEOIP info not requested so do the reverse DNS only
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "About to do dnshistory replace only"
+ fi
+ /usr/bin/dnshistory -T --logtype=www -d $DNSHISTORY_DB -f $WORK_PATH/$TMP_LOG_NAME >> $WORK_PATH/$PROCESSED_LOG_NAME
+ fi
+ else
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "No GEOIP LOOKUP or Reverse DNS"
+ fi
+ cat $WORK_PATH/$TMP_LOG_NAME >> $WORK_PATH/$PROCESSED_LOG_NAME
+ fi
+
+ # now make a new file with only .gz, bz2, xz,exe, and zip files downloaded
+ # this grep can take some time to run, it's using a regular expression to extract compressed files
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ cat $WORK_PATH/$PROCESSED_LOG_NAME | grep -E '\<*\.(bz2|gz|xz|exe|zip)\>' | grep -v "gcc-linaro\ " > $WORK_PATH/$TMP_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "creating filtered log"
+ fi
+ # strip out our known IP's and some standard extra junk we don't need or care about
+ cat $WORK_PATH/$TMP_LOG_NAME \
+ | grep -v .asc \
+ | grep -v HEAD \
+ | grep -v OPTIONS \
+ | grep -v .png \
+ | grep -v .ico \
+ | grep -v .css \
+ | grep -v .js \
+ | grep -v validation.linaro.org \
+ > $WORK_PATH/$FILTERED_LOG_NAME
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "creating toochain log"
+ fi
+ cat $WORK_PATH/$FILTERED_LOG_NAME | grep -E '\<*gcc-linaro' > $WORK_PATH/$TOOLCHAIN_LOG_NAME
+ fi
+ fi
+}
+
+process_logs ()
+{
+ ## Awffull
+ # use all the data in the file
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "`date`"
+ fi
+ if [ $AWFFULL -eq $TRUE ] ; then
+ awffull $AWFFULL_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$AWFFULL_CONF_FULL_NAME $WORK_PATH/$PROCESSED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "awffull processing $WORK_PATH/$PROCESSED_LOG_NAME"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ # Now use the filtered log that has only the .bz2 or .gz files and look at those patterns
+ awffull $AWFFULL_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$AWFFULL_CONF_FILTERED_NAME $WORK_PATH/$FILTERED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "awffull processing $WORK_PATH/$FILTERED_LOG_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ awffull $AWFFULL_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$AWFFULL_CONF_TOOLCHAIN_NAME $WORK_PATH/$TOOLCHAIN_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "awffull processing $WORK_PATH/$TOOLCHAIN_LOG_NAME"
+ fi
+ fi
+ fi
+ fi
+
+ ## Webalizer
+ # use all the data in the file
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "`date`"
+ fi
+ if [ $WEBALIZER -eq $TRUE ] ; then
+ webalizer $WEBALIZER_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBALIZER_CONF_FULL_NAME $WORK_PATH/$PROCESSED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webalizer processing $WORK_PATH/$PROCESSED_LOG_NAME"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ # Now use the filtered log that has only the .bz2 or .gz files and look at those patterns
+ webalizer $WEBALIZER_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBALIZER_CONF_FILTERED_NAME $WORK_PATH/$FILTERED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webalizer processing $WORK_PATH/$FILTERED_LOG_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ webalizer $WEBALIZER_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBALIZER_CONF_TOOLCHAIN_NAME $WORK_PATH/$TOOLCHAIN_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webalizer processing $WORK_PATH/$TOOLCHAIN_LOG_NAME"
+ fi
+ fi
+ fi
+ fi
+
+ ## Visitors
+ # use all the data in the file
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "`date`"
+ fi
+ if [ $VISITORS -eq $TRUE ] ; then
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "visitors processing $WORK_PATH/$PROCESSED_LOG_NAME"
+ echo visitors --ignore-404 -A --prefix $WEB_NAME -o html --trails --output-file $OUTPUT_PATH/visitors-all.html $WORK_PATH/$PROCESSED_LOG_NAME
+ fi
+ visitors --ignore-404 -A --prefix $WEB_NAME -o html --trails --output-file $OUTPUT_PATH/visitors-all.html $WORK_PATH/$PROCESSED_LOG_NAME &> /dev/null
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ # Now use the filtered log that has only the .bz2 or .gz files and look at those patterns
+ visitors --ignore-404 -A --prefix $WEB_NAME -o html --trails --output-file $OUTPUT_PATH/visitors-filtered.html $WORK_PATH/$FILTERED_LOG_NAME &> /dev/null
+ cp $WORK_PATH/$FILTERED_LOG_NAME $OUTPUT_PATH/.
+ gzip -f -9 $OUTPUT_PATH/$FILTERED_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "visitors processing $WORK_PATH/$FILTERED_LOG_NAME and then creating $OUTPUT_PATH/$FILTERED_LOG_NAME.gz"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ visitors --ignore-404 -A --prefix $WEB_NAME -o html --trails --output-file $OUTPUT_PATH/visitors-toolchain.html $WORK_PATH/$TOOLCHAIN_LOG_NAME &> /dev/null
+ cp $WORK_PATH/$TOOLCHAIN_LOG_NAME $OUTPUT_PATH/.
+ gzip -f -9 $OUTPUT_PATH/$TOOLCHAIN_LOG_NAME
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "visitors processing $WORK_PATH/$TOOLCHAIN_LOG_NAME and then creating $OUTPUT_PATH/$TOOLCHAIN_LOG_NAME.gz"
+ fi
+ fi
+ fi
+ fi
+
+ ## Webdruid
+ # use all the data in the file
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "`date`"
+ fi
+ if [ $WEBDRUID -eq $TRUE ] ; then
+ webdruid $WEBDRUID_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBDRUID_CONF_FULL_NAME $WORK_PATH/$PROCESSED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webdruid processing $WORK_PATH/$PROCESSED_LOG_NAME"
+ fi
+ if [ $EXTRACT_GZ_BZ2_FILES -eq $TRUE ] ; then
+ # Now use the filtered log that has only the .bz2 or .gz files and look at those patterns
+ webdruid $WEBDRUID_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBDRUID_CONF_FILTERED_NAME $WORK_PATH/$FILTERED_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webdruid processing $WORK_PATH/$FILTERED_LOG_NAME"
+ fi
+ if [ $EXTRACT_TOOLCHAIN_LOG -eq $TRUE ] ; then
+ webdruid $WEBDRUID_EXTRA_CMD_LINE_OPTIONS -c $WORK_PATH/$WEBDRUID_CONF_TOOLCHAIN_NAME $WORK_PATH/$TOOLCHAIN_LOG_NAME > /dev/null
+ if [ $DEBUG -eq $TRUE ] ; then
+ echo "webdruid processing $WORK_PATH/$TOOLCHAIN_LOG_NAME"
+ fi
+ fi
+ fi
+ fi
+
+}
+
+# OK, the actual section to do work, prep, write out config files as needed,
+# extract log files, process the log files, cleanup and exit.
+
+# verify and make if necessary all directories needed by this script
+prep_directories
+cleanup
+
+# write out the config files, rememnber they are changed here in this file
+write_config_files
+
+#/usr/bin/touch ~/bin/starttime
+# combine all logs into a single file, then filter the log into a second file
+extract_logs
+
+# analyse the logs...
+process_logs
+
+# cleanup the extra files and stuff
+if [ $DEBUG -ne $TRUE ] ; then
+ cleanup
+else
+ echo "WARNING: Not cleaning up temporary files, beware of running out of disk space."
+fi
+
+# change back to where we were called from
+cd $STARTING_LOCATION
+
+# done, out of here