#!/usr/bin/env python

"""
Generates secondary aggregation statistics from esmond data base rates.

When run with no arguments, it will update min/max/etc 'statistics' from 
the base rates with a slight delay on the end so the base rates can catch 
up.  

If run with the optional --start and --end arguments, it will only update 
over the custom time range.
"""
import calendar
import datetime
import fcntl
import json
import os
import sys
import time

from optparse import OptionParser

from django.conf import settings

from esmond.config import get_config, get_config_path
from esmond.cassandra import CASSANDRA_DB, RawData

from pycassa.columnfamily import NotFoundException

# This sets the seconds previous to time.time() to stop
# reading base rates since the most recent ones are probably
# incomplete.
RATE_TAIL_DELAY = 180

def program_lock(lockfile):
    """Try to take an exclusive, non-blocking lock on lockfile.

    The file is created if it does not exist (and truncated if it does).

    Returns True if the lock was acquired, False if the lock attempt
    failed with IOError (typically because another process holds it).

    On success the file descriptor is deliberately left open and never
    returned: closing it would release the lock, so it stays open for
    the lifetime of the process.  On failure the descriptor is closed
    so it is not leaked.
    """
    fd = os.open(lockfile, os.O_CREAT | os.O_TRUNC | os.O_WRONLY)
    try:
        fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # We did not get the lock, so nothing needs this descriptor.
        os.close(fd)
        return False

    return True

def _get_rollup_freqs(oid):
    # XXX(mmg): going to need to be able to get my hands on the aggregation
    # values for the base rate and rollups.
    return [300, 1800, 7200, 86400]
    
def _agg_timestamp(data, freq):
    return datetime.datetime.utcfromtimestamp((data.ts_to_unixtime() / freq) * freq)

# Exit quietly if another instance of this script already holds the lock.
have_lock = program_lock(os.path.join(settings.ESMOND_ROOT, 'statgen.lock'))
if have_lock is False:
    sys.exit()
    
# Command line interface.  The module docstring, collapsed onto a single
# line, doubles as the description shown in --help.
usage = '%prog | -v | --start | --end'
desc = ' '.join(__doc__.split())
parser = OptionParser(usage=usage, description=desc)

# Both range bounds are integer options that default to -1 when not given.
for short_opt, long_opt, opt_dest, opt_metavar, opt_help in (
        ('-s', '--start', 'start', 'START_TS',
         "Starting unix timestamp if generating over a custom time range."),
        ('-e', '--end', 'end', 'END_TS',
         "Ending unix timestamp if generating over a custom time range."),
    ):
    parser.add_option(short_opt, long_opt, dest=opt_dest, type='int',
                      metavar=opt_metavar, default=-1, help=opt_help)

parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                  default=False, help="Print output when running.")

(options, args) = parser.parse_args()

# Sanity check the optional timestamp arguments.
using_custom_range = False
ts_arg_count = 0
if options.start > -1 : ts_arg_count += 1
if options.end > -1 : ts_arg_count += 1

if ts_arg_count:
    if ts_arg_count == 1:
        print 'Please specify both --start and --end flags.'
        sys.exit(-1)
    elif ts_arg_count == 2:
        if options.start > options.end:
            print 'The --start flag must have a smaller timestamp than the --end flag ',
            print '(--start=%s and --end=%s given).' % (options.start, options.end)
            sys.exit(-1)
        else:
            print 'Using custom time range.'
            using_custom_range = True

config = get_config(get_config_path())
db = CASSANDRA_DB(config, clear_on_test=False)

# Maps each base rate row key to the lowest rollup frequency seen for it;
# populated while the keys are processed.
agg_freqs = {}

# Sorted list of every row key in the base rate column family.  Only the
# key of each row is used; no columns are requested.
keys = sorted(
    row[0] for row in
    db.rates._column_family.get_range(column_count=0, filter_empty=False)
)

for key in keys:
    if options.verbose:
        print 'Processing:', key
    device,path,oid,base_freq,year = key.split(RawData._key_delimiter)
    # XXX: need the frequencies for the rollup the "right way" here.
    for freq in _get_rollup_freqs(oid):
    # This first loop will generate an entire row of statistics rollups
    # if a given row of base rates do not have a corresponding row of 
    # stat rollups.  This should happen on initial execution or the 
    # addition of new devices/frequencies.
    #
    # Otherwise, it will just generate a dict of row keys and frequencies
    # for the more usual "incremental updates."
        agg_key = '%s%s%s%s%s%s%s%s%s' % \
            (device, RawData._key_delimiter,
            path, RawData._key_delimiter,
            oid, RawData._key_delimiter,
            freq, RawData._key_delimiter,
            year)
        try:
            # See if there is a corresponding aggregation row for the 
            # base rate rows.  The exception should trigger when
            # the script is first run/initialized, if a new device/path
            # is added, or if a new rollup frequency has been added to 
            # an oid.
            ret = db.stat_agg._column_family.get(agg_key)
        except NotFoundException:
            # This will go through an entire row/year's worth of data and
            # generate the rollups at the approprite frequency. Will be
            # skipped if the user has defined a custom range on the command line.
            #
            # This operation is idempotent because there is only one min/max.
            if not using_custom_range:
                if options.verbose:
                    print 'No key', agg_key, 'found in stat aggregations.'
                for c in db.rates._column_family.xget(key, 
                            column_finish=int(time.time()) - RATE_TAIL_DELAY):
                    ts = c[0]
                    is_valid = c[1]['is_valid']
                    val = c[1]['val']
                    if is_valid == 0:
                        continue
                    data = RawData(device, None, oid, path, ts, val, base_freq)
                    db.update_stat_aggregation(data, _agg_timestamp(data, freq), freq)
        # Compare the "main" base rate key agianst its associated rollups
        # to determine the lowest rollup frequency.  This will be used to 
        # determine a "starting point" to generate ongoing stat rollups.
        if not agg_freqs.has_key(key):
            agg_freqs[key] = int(freq)
        else:
            if agg_freqs[key] > int(freq):
                agg_freqs[key] = int(freq)
        pass
    if options.verbose:
        print 'updating aggs for', key
    # Get the timestamp of the last finest grained aggregate written - it will
    # be before the last base rate actually processed due to the rounding.  There
    # will be a small overlap with the previous base rates processed, but doesn't
    # matter since updating the min/max is idempotent.
    #
    # This behavior can be overridden on the commandline with a custom range.
    agg_key = '%s%s%s%s%s%s%s%s%s' % \
        (device, RawData._key_delimiter,
        path, RawData._key_delimiter,
        oid, RawData._key_delimiter,
        agg_freqs[key], RawData._key_delimiter,
        year)
    if not using_custom_range:
        ret = db.stat_agg._column_family.get(agg_key, column_count=1, 
                    column_reversed=True)
        starting_ts = ret.keys()[0]
        ending_ts = int(time.time()) - RATE_TAIL_DELAY
    else:
        starting_ts = options.start
        ending_ts = options.end
    # Read all the values from the base rate row starting with that timestamp,
    # and generate a list of data objects from the results.
    ret = db.rates._column_family.xget(key, column_start=starting_ts,
                column_finish=ending_ts)
    value_objects = []
    for r in ret:
        ts = r[0]
        is_valid = r[1]['is_valid']
        val = r[1]['val']
        if is_valid == 0:
            continue
        data = RawData(device, None, oid, path, ts, val, base_freq)
        value_objects.append(data)
    
    # Iterate over the rollup freqencies and then iterate through list of data
    # objects to do the appropriate updates.
    for freq in _get_rollup_freqs(oid):
        for vo in value_objects:
            db.update_stat_aggregation(vo, _agg_timestamp(vo, freq), freq)
    
# All keys have been processed: close the database handle, then ask its
# stats object for the 'all' report.
# NOTE(review): presumably this prints accumulated run statistics - confirm
# against the CASSANDRA_DB implementation.
db.close()
db.stats.report('all')
        
