#!/usr/bin/python
#
# bigbrother
# http://snarfed.org/space/bigbrother
# Copyright 2003, 2004 Ryan Barrett <bigbrother@ryanb.org>
#
# File: bigbrother.py
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA
#

# Help text: printed to stdout for -h, and used as the exit message when the
# command line is malformed (see parse_args).
USAGE = """Usage: bigbrother.py [OPTIONS] LOGFILE

Parses a log of AIM away messages and timestamps, such as is generated by the
bigbrother plugin for Gaim, and generates a report. If LOGFILE is -, the log
file will be read from stdin. The log line format is:

  [timestamp] [away message]

The timestamp should be ISO 8601 w/o time zone, e.g. Tue 2003-11-04 16:08:11.
Also, the bigbrother.conf file contains user-configurable options.

Options:
  -d <dir>    Output the html and images to <dir>. Defaults to ./
  -o <file>   Write the index html file to <file>. Defaults to index.html
  -p          Just prints the statistics to stdout
  -s <date>   Start date, YYYY-MM-DD
  -e <date>   End date, YYYY-MM-DD
  -V          Print version information and exit
  -h          Print this message"""

# Release version (printed by -V) and the project's home page.
VERSION      = '0.5.1'
URL          = 'http://snarfed.org/space/bigbrother'

import sys
import os.path

# this is the dir where bigbrother.py is, resolved to an absolute path from
# sys.argv[0]. bigbrother.conf, index.html.template, etc. should be in this
# dir.
BASE_DIR     = os.path.dirname(os.path.abspath(sys.argv[0]))

import string
import re
import types
import time
import getopt
import ConfigParser
import graph
import html


# constants

# path of the user configuration file, which lives next to the script
CONFIG_FILE  = os.path.join(BASE_DIR, 'bigbrother.conf')
# length of a day in seconds (24 hours; DST transition days are not special
# cased)
SECS_IN_DAY  = 24 * 60 * 60
# abbreviated day names, Monday first. get_day_of_week indexes DAYS_OF_WEEK
# with struct_time.tm_wday, so the order must stay Monday..Sunday.
WEEKDAYS     = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
WEEKEND      = ['Sat', 'Sun']
DAYS_OF_WEEK = WEEKDAYS + WEEKEND
# strptime format of the start_date and end_date options (YYYY-MM-DD)
DATE_FORMAT  = '%Y-%m-%d'



#
# GLOBAL VARS
#
# shared configuration object. main() fills it from set_defaults(), then the
# config file, then the command-line options (later sources override earlier
# ones).
config = ConfigParser.ConfigParser()
html.config = config  # HACK: this sucks. how else could i allow these modules
graph.config = config # to get at config options?

# list of (unix time, msg), ordered by unix time. rebuilt by read_awaylog().
msgs_by_time = []

# dictionary with msg as key and, as value, a list of (unix time, length in
# seconds) tuples: one tuple per recorded occurrence of that message. rebuilt
# by read_awaylog().
times_by_msg = {}


def main():
  """ Program entry point. Loads the configuration (defaults, then the config
  file, then command-line options), parses the away log, and then either dumps
  the statistics to stdout (dump_to_stdout option) or generates the graphs,
  the raw data pages, and the index page.

  Exits with an error message if the log yields no usable messages.
  """
  # get configuration settings
  # (can't initialize at top because of command-line options)
  set_defaults()
  config.read(CONFIG_FILE)
  filename = parse_args()

  # read the log. the file is closed explicitly, even if reading fails, rather
  # than left for the garbage collector.
  if filename == '-':
    lines = sys.stdin.readlines()
  else:
    logfile = open(filename, 'r')
    try:
      lines = logfile.readlines()
    finally:
      logfile.close()

  read_awaylog(lines,
               config_getdate('bigbrother', 'start_date'),
               config_getdate('bigbrother', 'end_date'))

  if not msgs_by_time:
    sys.exit('No log lines found to process!')

  if config.getboolean('bigbrother', 'dump_to_stdout'):
    dump_to_stdout()
    sys.exit(0)


  # calculate stats (we want them in *descending* order)
  freqs_map = calc_freqs()
  freqs = reverse_sort_by_value(freqs_map)
  lengths = reverse_sort_by_value(calc_lengths())
  avgs = reverse_sort_by_value(calc_avg_lengths())

  small_size = config.getint('bigbrother', 'small_tier_size')
  small_tier = get_tier(lengths, freqs_map, small_size)
  large_size = config.getint('bigbrother', 'large_tier_size')
  large_tier = get_tier(lengths, freqs_map, large_size)

  freqs_tier = [x for x in freqs if x[0] in large_tier]
  avgs_tier = [x for x in avgs if x[0] in large_tier]


  # generate graphs for frequency, length, and avg length
  # NOTE(review): unlike the other graph calls below, these pass a bare
  # filename instead of create_path(...) -- confirm that the graph module
  # applies the output directory itself for these.
  graph.pygd_piechart('lengths.png', lengths)
  graph.pygd_barchart('frequencies.png', freqs_tier)
  graph.pygd_barchart('averages.png', avgs_tier, unixtimes=True)

  # generate raw data pages for frequency, length, and avg length
  for (values, name, description) in [
    (freqs, 'frequencies', 'Frequencies of each message'),
    (lengths, 'lengths', 'Total length of time spent on each message'),
    (avgs, 'averages', 'Average length each away message is on')]:

    # generate html page. frequencies are plain counts; the other two tables
    # hold seconds, which are shown as HH:MM.
    htmlvalues = values
    if name != 'frequencies':
      htmlvalues = [(msg, secs_to_time(secs)) for (msg, secs) in values]
    page = html.table_page(htmlvalues, name, description)
    html.write_html_page(create_path(name + '.html'), page)

  # generate 24-hr distribution graphs
  day_dists = dict([(msg, calc_day_distribution(msg)) for msg in times_by_msg])
  for msg in large_tier:
    graph.pygd_timeplot(create_path(msg + '.png'), day_dists[msg].items())

  # generate a few 7-day distribution graphs
  week_dists = dict([(msg, calc_week_distribution(msg))
                     for msg in times_by_msg])
  for msg in small_tier:
    labelled = zip(DAYS_OF_WEEK, week_dists[msg])
    week_file = create_path('week_' + msg + '.png')
    graph.pygd_barchart(week_file, labelled, big_size=False, unixtimes=True)


  # generate layered graphs
  num_layers = config.getint('bigbrother', 'num_layers')
  assert num_layers >= 1
  layers = {}   # map of graph name => [layer messages]

  for days, graphname in ((WEEKDAYS, 'weekday'), (WEEKEND, 'weekend')):
    dists = dict([(msg, calc_day_distribution(msg, days))
                  for msg in times_by_msg])
    # rank the messages by how many intervals they cover on these days
    layer_lengths = [(msg, sum(dists[msg].values())) for msg in times_by_msg]
    layers[graphname] = get_tier(layer_lengths, freqs_map, num_layers)
    data = [dists[msg].items() for msg in layers[graphname]]
    graph.pygd_stack_timeplot(create_path(graphname + '_layers.png'), *data)

  # generate the index file
  index_file = create_path(config.get('bigbrother', 'output_filename'))
  html.write_index_page(index_file, large_tier, small_tier,
                        layers['weekday'], layers['weekend'])


def set_defaults():
  """ Resets the 'bigbrother' config section to the built-in defaults. Be
  careful! The section is removed and recreated, so any options it already
  held are discarded.
  """
  section = 'bigbrother'

  # (option, default value) pairs; ConfigParser stores values as strings
  defaults = [
    ('output_directory', ''),
    ('output_filename', 'index.html'),
    ('discard_blips', 'true'),
    ('enforce_order', 'false'),
    ('sample_interval', '60'),
    ('small_tier_size', '6'),
    ('large_tier_size', '20'),
    ('minimum_tier_freq', '4'),
    ('dump_to_stdout', 'false'),
    ('start_date', ''),
    ('end_date', ''),
    ('smoothing_factor', '5'),
    ('num_layers', '5'),
  ]

  config.remove_section(section)
  config.add_section(section)
  for option, value in defaults:
    config.set(section, option, value)
  


def parse_args():
  """ Uses getopt to parse the command-line options and records them in the
  'bigbrother' config section. Returns the log file name.

  -h (or --help) and -V print to stdout and exit. The program exits with an
  error message if the options are malformed, if both a start and an end date
  are set and the start is not before the end, or if there is not exactly one
  LOGFILE argument.
  """
  try:
    # 'help' is registered as a long option so that the '--help' check below
    # is actually reachable
    opts, args = getopt.getopt(sys.argv[1:], 'hVpo:d:s:e:', ['help'])
  except getopt.GetoptError:
    # sys.exc_info() retrieves the error without version-specific except
    # syntax
    sys.stderr.write('%s\n' % sys.exc_info()[1])
    sys.exit(USAGE)

  # flags whose argument is stored verbatim under the named config option
  value_options = {
    '-s': 'start_date',
    '-e': 'end_date',
    '-d': 'output_directory',
    '-o': 'output_filename'}

  for opt, val in opts:
    if opt in ('-h', '--help'):
      sys.stdout.write(USAGE + '\n')
      sys.exit()
    elif opt == '-V':
      sys.stdout.write('bigbrother ' + VERSION + '\n')
      sys.exit()
    elif opt == '-p':
      config.set('bigbrother', 'dump_to_stdout', 'true')
    elif opt in value_options:
      config.set('bigbrother', value_options[opt], val)

  # sanity checks. the dates may come from the config file as well as from the
  # command line, so they are read back from the config.
  start = config.get('bigbrother', 'start_date')
  end = config.get('bigbrother', 'end_date')
  # format is YYYY-MM-DD, so lexicographic comparison works
  if start != '' and end != '' and start >= end:
    sys.exit('Start date %s is not before end date %s.' % (start, end))

  if len(args) != 1:
    sys.exit(USAGE)

  return args[0]


def dump_to_stdout():
  """ Writes a pretty-printed table of the calculated statistics to stdout:
  a header row, then one row per away message with its frequency, total
  length, and average duration. Rows are sorted alphabetically by message.
  """
  def sorted_items(stats):
    # (msg, value) pairs, ordered by msg
    pairs = list(stats.items())
    pairs.sort()
    return pairs

  by_length = sorted_items(calc_lengths())
  by_avg = sorted_items(calc_avg_lengths())
  by_freq = sorted_items(calc_freqs())

  # the three tables must cover the same messages, in the same order, for the
  # rows below to line up
  msgs = [pair[0] for pair in by_length]
  assert msgs == [pair[0] for pair in by_avg]
  assert msgs == [pair[0] for pair in by_freq]

  rows = [('Away message', 'Frequency', 'Length', 'Avg. duration')]
  for i in range(len(msgs)):
    rows.append((msgs[i], by_freq[i][1], by_length[i][1], by_avg[i][1]))

  sys.stdout.write(prettyprint(rows) + '\n')


class Bunch:
  """ A minimal record type: every keyword argument given to the constructor
  becomes an attribute of the instance. Taken from O'Reilly's Python Cookbook,
  by Alex Martelli and David Ascher, p. 13.
  """
  def __init__(self, **kwds):
    # write straight into the instance dictionary, one name at a time
    attrs = self.__dict__
    for name in kwds.keys():
      attrs[name] = kwds[name]

    
def read_awaylog(lines, start=None, end=None):
  """ Parses each of the lines as an away message and (re)populates the
  msgs_by_time and times_by_msg data structures.

  lines is a list of log lines of the form 'YYYY-MM-DD HH:MM:SS message',
  optionally prefixed with a (deprecated) day of week tag; blank lines are
  ignored. start and end are unix times, each optional. An entry is recorded
  when the next entry's time is at or after start and the entry's own time is
  at or before end.

  An entry's length is the time until the next entry, so the final entry of
  the log is never recorded. If the enforce_order option is set, the program
  exits when an entry is earlier than its predecessor; otherwise the entries
  are sorted by time first. Zero-length entries are dropped unless the
  discard_blips option is off. An empty log simply leaves both structures
  empty.
  """
  global msgs_by_time, times_by_msg
  msgs_by_time, times_by_msg = [], {}

  if not start:
    start = 0
  if not end:
    end = sys.maxint
  assert start < end

  # pair each non-blank line with its index in lines, so that error messages
  # can cite the real line even when blank lines were skipped
  numbered = [(i, lines[i].strip()) for i in range(len(lines))]
  numbered = [(i, text) for (i, text) in numbered if text]

  # nothing to parse; the caller sees the empty data structures
  if not numbered:
    return

  # check for day of week tags
  if numbered[0][1][:3] in DAYS_OF_WEEK:
    numbered = [(i, text.split(None, 1)[1]) for (i, text) in numbered]
    sys.stderr.write(
"""WARNING: this log file contains day of week tags, which are deprecated.
Please convert your log file with strip_days.sh, then switch to bigbrother.pl
0.5 or higher.""" + '\n')

  # convert to a list of (unix time, msg, line index)
  parsed = []
  for lineno, text in numbered:
    date, timestr, msg = text.split(None, 2)
    tm = time.strptime(date + ' ' + timestr, '%Y-%m-%d %H:%M:%S')
    # set tm_isdst to -1 to handle daylight savings time correctly!!!
    tm = tm[0:-1] + (-1,)
    parsed.append((int(time.mktime(tm)), msg, lineno))

  # do we need to sort it?
  enforce_order = config.getboolean('bigbrother', 'enforce_order')
  if not enforce_order:
    parsed.sort()

  discard_blips = config.getboolean('bigbrother', 'discard_blips')

  # populate data structures. last is the previous entry; its length is only
  # known once the following entry has been seen.
  last = None
  for unixtime, msg, lineno in parsed:
    if last and unixtime >= start and last.unixtime <= end:
      # check that this message occurred after the last message?
      if enforce_order and unixtime < last.unixtime:
        sys.exit('Log file is not strictly increasing by time, line %d:\n%s'
                 % (lineno + 1, lines[lineno]))

      length = unixtime - last.unixtime

      # only record it if it's not a blip (unless they want blips)
      if length > 0 or not discard_blips:
        msgs_by_time.append((last.unixtime, last.msg))
        times_by_msg.setdefault(last.msg, []).append((last.unixtime, length))

    last = Bunch(unixtime=unixtime, msg=msg)

    
def calc_freqs():
  """ Counts the recorded occurrences of each away message. Returns a
  dictionary mapping away msg => number of entries in times_by_msg.
  """
  return dict([(msg, len(times_by_msg[msg])) for msg in times_by_msg.keys()])


def calc_lengths():
  """ Totals the time spent on each away message. Returns a dictionary mapping
  away msg => sum of the lengths (in seconds) of all its occurrences.
  """
  totals = {}

  for msg, occurrences in times_by_msg.items():
    total = 0
    for began, duration in occurrences:
      total = total + duration
    totals[msg] = total

  return totals


def calc_avg_lengths():
  """ Computes the mean duration of each away message. Returns a dictionary
  mapping away msg => total time on that message divided by its number of
  occurrences, in seconds, as a float.
  """
  counts = calc_freqs()
  totals = calc_lengths()

  return dict([(msg, float(totals[msg]) / counts[msg])
               for msg in counts.keys()])


def log_length():
  """ Returns the number of seconds between the start of the first recorded
  message and the end of the last recorded message. Only looks at the first
  and last entries, so it does not scan the log.
  """
  first_time = msgs_by_time[0][0]

  # the end of the log is the last entry's start plus its length. this relies
  # on the occurrences stored per message in times_by_msg being in time order.
  last_time, last_msg = msgs_by_time[-1]
  last_duration = times_by_msg[last_msg][-1][1]

  return (last_time + last_duration) - first_time


def get_tier(lengths, freqs, tier_size):
  """ Builds a tier: the messages with the most total time, longest first,
  limited to tier_size entries. Messages that occurred fewer times than the
  minimum_tier_freq option are left out.

  lengths is a dictionary or list of (message, time spent on that message)
  pairs, and freqs is a dictionary mapping each message to its frequency.
  """
  min_freq = config.getint('bigbrother', 'minimum_tier_freq')
  assert min_freq >= 1

  tier = []
  for msg, total in reverse_sort_by_value(lengths):
    if freqs[msg] >= min_freq:
      tier.append(msg)

  return tier[:tier_size]


def calc_day_distribution(msg, which_days=DAYS_OF_WEEK):
  """ Breaks the day (from 12am to 11:59:59pm) into intervals and maps the
  given away message to the number of times it was on during each interval.
  Only the days in which_days (which should be a subset of DAYS_OF_WEEK) are
  counted. The interval is given by the sample_interval configuration setting.
  (An assertion is raised if this does not divide SECS_IN_DAY.) Returns a
  dictionary with interval start time (in seconds since midnight) as key,
  number of times the away msg was on as value. If an away message was never on
  during a particular interval, that interval will not be present in the
  dictionary.
  """

  # sanity checks
  for day in which_days:
    assert day in DAYS_OF_WEEK

  # interval must divide secs_in_day (for now)
  interval = config.getint('bigbrother', 'sample_interval')
  assert SECS_IN_DAY % interval == 0

  distribution = {}
  prev_unixtime = -1

  for unixtime, length in times_by_msg[msg]:
    # find the start. if the message occurs multiple times within an interval,
    # skip ahead two intervals past the previous occurrence's start, so that
    # each interval is marked at most once per day
    start_unixtime = max(unixtime,
                         (prev_unixtime // interval + 2) * interval)
    tm = time.localtime(start_unixtime)
    start = secs_since_midnight(tm.tm_hour, tm.tm_min, tm.tm_sec)

    # find the end, measured from the same midnight as start. (when no
    # skipping happened this is simply the message's time of day plus its
    # length.)
    end = start + (unixtime + length - start_unixtime)

    # round the start and end to the outer containing intervals
    end = (end // interval + 1) * interval + 1
    start = start // interval * interval

    # mark the rest of the intervals for this msg. the day is taken from the
    # same instant as start, so the two agree even if the skip above crossed
    # midnight.
    cur_day = get_day_of_week(start_unixtime)
    for secs in range(start, end, interval):
      point = secs % SECS_IN_DAY
      # start is less than SECS_IN_DAY, so point only wraps to 0 at secs > 0
      # when the message runs past midnight. a message that merely *begins*
      # in the day's first interval (secs == 0) has not changed days.
      if point == 0 and secs > 0:
        cur_day = tomorrow(cur_day)
      if cur_day in which_days:
        distribution[point] = distribution.get(point, 0) + 1

    prev_unixtime = unixtime

  return distribution


def calc_week_distribution(msg):
  """ Calculates how much time the given away message was on during each day
  of the week. Returns a seven-item list of seconds, one total per day of the
  week, starting with Monday. An occurrence that runs past midnight is split
  at each day boundary, where the boundary is taken to be strip_time() of the
  current position plus SECS_IN_DAY.
  """
  per_day = [0] * 7

  for began, length in times_by_msg[msg]:
    # walk from the start of the occurrence to its end, one day-sized chunk at
    # a time, crediting each chunk to the weekday it starts in
    cursor = began
    stop = began + length
    while cursor != stop:
      next_midnight = strip_time(cursor) + SECS_IN_DAY
      chunk = min(next_midnight - cursor, stop - cursor)
      weekday = time.localtime(cursor).tm_wday
      per_day[weekday] += chunk
      cursor += chunk
      assert cursor <= stop

  return per_day


#
# utilities
#
def config_getdate(section, option):
  """ Looks up the given config option, which holds a date in YYYY-MM-DD
  format, and returns it as a unix time (via time.mktime, i.e. local
  midnight). Returns None if the option is the empty string.
  """
  text = config.get(section, option)
  if text != '':
    return time.mktime(time.strptime(text, DATE_FORMAT))
  return None


def prettyprint(array):
  """ Takes an array, i.e. a list of rows of arbitrary values, pretty-prints
  them, and returns the pretty-printed string. Strings are enclosed in double
  quotes, floats are shown with two decimal places, and anything else uses
  repr(). Fields are left-justified and padded with spaces so that the columns
  line up; fields are separated by one space and rows by a newline. Returns ''
  for an empty array. An assertion is raised if the rows don't all have the
  same number of fields.
  """
  if not array:
    return ''

  numcols = len(array[0])
  for row in array:
    assert len(row) == numcols

  # convert the values to strings. isinstance (rather than an exact type
  # comparison) means subclasses of str and float are formatted like their
  # base types; type(u'') is the unicode string type.
  def stringify(x):
    if isinstance(x, (str, type(u''))):
      return '"%s"' % x
    elif isinstance(x, float):
      return '%.2f' % x
    else:
      return repr(x)

  stringified = [[stringify(x) for x in row] for row in array]

  # each column is as wide as its widest field
  widths = [max([len(row[col]) for row in stringified])
            for col in range(numcols)]

  padded = [[x.ljust(width) for (x, width) in zip(row, widths)]
            for row in stringified]

  # join them, and we're done!
  return '\n'.join([' '.join(row) for row in padded])


def tomorrow(day):
  """ Given an abbreviated day of the week (one of DAYS_OF_WEEK), returns the
  abbreviation of the following day, wrapping from the last entry back to the
  first.
  """
  assert day in DAYS_OF_WEEK

  position = DAYS_OF_WEEK.index(day)
  if position == 6:
    return DAYS_OF_WEEK[0]
  return DAYS_OF_WEEK[position + 1]


def get_day_of_week(unixtime):
  """ Returns the abbreviated local day of the week for a unix time: one of
  'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'.
  """
  # field 6 of a time tuple is the weekday, with Monday as 0
  weekday = time.localtime(unixtime)[6]
  return DAYS_OF_WEEK[weekday]

  
# filename sanitization table used by create_path: every occurrence of a key
# character is replaced with the mapped string ('?' is simply dropped). the
# original note describes these as characters that aren't allowed in *nix
# filenames.
# NOTE(review): create_path applies these in dictionary iteration order, and
# the ':' replacement introduces a space that the ' ' rule rewrites only if it
# happens to run later -- confirm which result is intended.
SPECIAL_CHARS = {
    ' ': '_',
    '/': '-',
    '\\': '-',
    '"': "'",
    '?': '',
    '<': '(',
    '>': ')',
    ':': ' -' }

def create_path(filename):
  """ Builds the output path for a filename. The characters listed in
  SPECIAL_CHARS are replaced in the filename, and (except for '/') in the
  configured output directory as well. The output directory is created if it
  is non-empty and does not exist yet. Returns the directory joined with the
  sanitized filename.
  """
  outdir = config.get('bigbrother', 'output_directory')

  for fromchar in SPECIAL_CHARS.keys():
    tostr = SPECIAL_CHARS[fromchar]
    filename = filename.replace(fromchar, tostr)
    # '/' separates the components of the directory, so it is kept there
    if fromchar == '/':
      continue
    outdir = outdir.replace(fromchar, tostr)

  if outdir != '' and not os.path.exists(outdir):
    os.makedirs(outdir)

  return os.path.join(outdir, filename)


def secs_since_midnight(hour, min, sec):
  """ Converts a time of day, given as hour, minute, and second, into the
  number of seconds elapsed since midnight.
  """
  minutes = hour * 60 + min
  return minutes * 60 + sec


def strip_time(unixtime):
  """ Takes a unix time and returns the unix time for the start of the same
  local day, computed by subtracting the local wall-clock time of day (hours,
  minutes, and seconds from time.localtime) from unixtime.
  """
  # fields 3..5 of a time tuple are hour, minute, second
  hour, minute, second = time.localtime(unixtime)[3:6]
  return unixtime - secs_since_midnight(hour, minute, second)
  

def secs_to_time(seconds):
  """ Formats a number of seconds as zero-padded hours and minutes, in HH:MM
  format. Leftover seconds are dropped, and the hours are not wrapped at 24.
  """
  hours = seconds / 3600
  minutes = (seconds / 60) % 60
  return '%02d:%02d' % (hours, minutes)


def sum(values):
  """ Adds up the items of the given list, starting from 0, and returns the
  total. An empty list sums to 0.
  """
  total = 0
  for value in values:
    total = total + value
  return total


def ceiling_normalize(values, ceiling = None):
  """ A modified normalization function. Takes a list of values and an optional
  ceiling. First, if the ceiling is specified (i.e. is not None), any values
  greater than the ceiling are replaced with the ceiling. Then, each number is
  replaced with the fraction of the maximum it comprises. (The maximum is
  calculated post-ceiling replacement.)

  Returns a list of floats, or [] if values is empty. If the maximum is 0, the
  values are divided by 1 instead, to avoid dividing by zero.
  """
  if not values:
    return []

  # compare against None explicitly, so that a ceiling of 0 is still applied
  if ceiling is not None:
    values = [min(val, ceiling) for val in values]

  denom = max(values)
  if denom == 0:
    denom = 1
  return [float(val) / denom for val in values]

def reverse_sort_by_value(to_sort):
  """ Reverse sorts a dictionary (or list of (key, value) tuples) by value, and
  returns the sorted list of (key, value) pairs, largest value first. Pairs
  with equal values are ordered by key, descending. A list is converted to a
  dictionary first, so if a key is repeated only its last value is kept.
  """
  # isinstance also accepts dict subclasses, unlike an exact type comparison
  if not isinstance(to_sort, dict):
    to_sort = dict(to_sort)

  # sort on (value, key) so that the value is the primary sort key
  flipped = [(value, key) for (key, value) in to_sort.items()]
  flipped.sort()
  flipped.reverse()
  return [(key, value) for (value, key) in flipped]


# generate the report only when run as a script, not when imported as a module
if __name__ == '__main__':
  main()
