#!/bin/sh

#
# utility to count certain kinds of error conditions and report log messages
# which aren't filtered below.  this could probably be a bit more
# comprehensive, but it provides an idea about the overall health of the
# system
#
# in our deployment it gets run shortly after midnight on yesterday's logs and
# the output is emailed via cron.  here's an example crontab entry:
#
# MAILTO=me@my.net
# 15 0 * * * /path/to/script/esmond_logwatch
#


#
# analyze FILE
#
# Summarize one esmond log file on stdout.  FILE is decompressed according
# to its suffix (*bz2 -> bzip2, *gz -> gzip, anything else is read as plain
# text) and fed to an awk program that:
#
#   - drops lines matching a list of known-noise patterns,
#   - counts recognized error conditions, keyed by a field of the line,
#   - prints every remaining (unrecognized) line verbatim, and
#   - at end of input prints one section per non-empty counter, with a
#     per-key count and a TOTAL line.
#
# The exit status is that of awk (the last command in the pipeline).
#
# NOTE: the awk program is a single-quoted shell string, so nothing inside
# it -- comments included -- may contain an apostrophe.
#
analyze() {
    # Run the decompressor directly instead of building a command string:
    # an unquoted "$cmd" would word-split and glob the file name.
    # "gzip -cd" is used rather than gzcat, which is missing on many systems.
    case "$1" in
        *bz2)
            bzip2 -cd -- "$1"
            ;;
        *gz)
            gzip -cd -- "$1"
            ;;
        *)
            cat -- "$1"
            ;;
    esac | awk '

# Increment the counter for key in the array dict.
function update_count(dict, key)
{
    dict[key]++;
}

# Print a section for dict: a blank line, the title, one "count key" line
# per key and a final "sum TOTAL" line.  Prints nothing when dict is empty.
# key, total and seen are locals (extra parameters), so the function does
# not clobber globals.  Emptiness is detected with a for-in probe because
# length(array) is not available in every awk.
function report_count(title, dict,    key, total, seen)
{
    seen = 0;
    for (key in dict) {
        seen = 1;
        break;
    }
    if (!seen)
        return;

    print "\n" title;
    total = 0;
    for (key in dict) {
        print dict[key], key;
        total += dict[key];
    }
    print total, "TOTAL";
}

# these lines are basically noise, ignore them
/logfile turned over/ { next }
# NOTE(review): alternation binds loosest here, so this also drops any line
# that merely contains e.g. IpAddress or INTEGER -- confirm that is intended.
/have a TSDBRow for DisplayString|IpAddress|INTEGER|PhysAddress|IANAifType|Integer32/ { next }
/shutting down|exiting|starting|run|stop$/ { next }
/espersistd.ifref|aluifref|alusapref|infifref|lspopstatus.*processed [0-9]+ vars/ { next }
/reloading devices and oidsets/ { next }
/started thread persist_thread/ { next }
/add mib/ { next }
/draining/ { next }
/worker assigned/ { next }
/espolld.*added/ { next }
/stopping/ { next }
/espolld.manager.*killing/ { next }
/espersistd.manager.*killing/ { next }
/last message repeated/ { next }
/grabbing data/ { next }
/stored [0-9]+ vars/ { next }
/grabbed [0-9]+ vars/ { next }
/records written/ { next }
/creating TSDBVar/ { next }
/summarize.*loop/ { next }

# Jun 13 00:05:39 localhost espersistd.tsdb_5 [91882] bad metadata for kans-asw1/FastPollHC/ifHCOutOctets/ge-0_0_2
/bad metadata for/ {
    update_count(bad_metadata, $NF);
    next;
}
# Jun 13 09:57:45 localhost espersistd.manager [1032] child died: pid 48131, tsdb_6
/espersistd.manager.*child died/ {
    update_count(deaths, $NF);
    next;
}
# Jun 13 00:37:06 localhost espersistd.tsdb_5 [97930] var needs repair, skipping: ameslab-rt2/JnxFirewall/counter/ge-1/0/0.0-i/bogons-source-ge-1/0/0.0-i
/var needs repair/ {
    update_count(needs_repair, $NF);
    next;
}
# Jun 13 20:20:41 localhost espersistd.tsdb_4 [75229] Error aggregating: forr-rt1 FastPollHC: 1308016537 is greater than the maximum slot 1307889029
# keyed by fields 9 and 10 (in the sample above: "forr-rt1 FastPollHC:")
/Error aggregating.*greater than/ {
    k = $9 " " $10;
    update_count(agg_greater, k);
    next;
}
# Jun 13 00:15:35 localhost espersistd.manager [5363] pid 93875: Exception TypeError: "NoneType object is not callable" in <function <lambda> at 0x803f95488> ignored
# (the real message quotes NoneType; the quotes are left out of this comment
# because an apostrophe would terminate the enclosing shell string)
/Exception.*NoneType.*lambda/ { next }
# other exceptions: keyed by field 5 plus whatever follows "=" in field 7
/exception/ {
    proc = $5;
    split($7, parts, /=/);
    e = parts[2];
    k = proc " " e;
    update_count(exceptions, k);
    next;
}
# keyed by the second dot-separated component of field 5
/timeout/ {
    split($5, parts, /\./);
    update_count(timeouts, parts[2]);
    next;
}
/bad data for/ {
    update_count(bad_data, $10);
    next;
}
/failed to deserialize/ {
    update_count(deserialize, $5);
    next;
}
/unable to resolve OID/ {
    update_count(unresolveable, $NF);
    next;
}
# only surface rsync runs whose field 8 exceeds 15000; drop the rest
/migrate-tsdb-chunks: rsync took/ {
    if ($8 > 15000) { print };
    next
}
/migrate-tsdb-chunks/ { next }
# keyed by the second dot-separated component of field 5 plus the
# next-to-last field
/unknown ifIndex/ {
    split($5, parts, /\./);
    host = parts[2];
    oid = $(NF-1);
    k = host " " oid;
    update_count(unknowns, k);
    next;
}
/summarize.*no var/ {
    update_count(summarize_no_var, $NF);
    next;
}
/summarize.*var has invalid metadata/ {
    update_count(summarize_invalid, $NF);
    next;
}
/summarize.*range error/ {
    update_count(summarize_range_error, $NF);
    next;
}
/summarize.*not valid/ {
    update_count(summarize_not_valid, $NF);
    next;
}

# anything that was neither filtered nor counted is passed through verbatim
{ print }

END {
    report_count("TIMEOUTS", timeouts);
    report_count("EXCEPTIONS", exceptions);
    report_count("DESERIALIZE", deserialize);
    report_count("UNRESOLVABLE", unresolveable);
    report_count("UNKNOWNS", unknowns);
    report_count("DEATHS", deaths);
    report_count("AGGREGATION ERRORS", agg_greater);
    report_count("NEEDS REPAIR", needs_repair);
    report_count("SUMMARIZE NO VAR", summarize_no_var);
    # must be the same array the "var has invalid metadata" rule fills
    report_count("SUMMARIZE INVALID METADATA", summarize_invalid);
    report_count("SUMMARIZE RANGE ERROR", summarize_range_error);
    report_count("SUMMARIZE NOT VALID", summarize_not_valid);
    report_count("BAD METADATA", bad_metadata);
    report_count("BAD DATA", bad_data);
}
'
}

# With no arguments, analyze yesterday's rotated log; otherwise analyze each
# file named on the command line, in order.  "$@" and "$f" are quoted so that
# paths containing whitespace or glob characters reach analyze intact.
if [ "$#" -eq 0 ]; then
    analyze /var/log/esmond.log.0.bz2
else
    for f in "$@"; do
        analyze "$f"
    done
fi
