#!/usr/bin/perl

use warnings;
use strict;
use File::Basename;

my %opt = ();
$opt{percents} = 1;

# ---------------------------------------------------------------------------

my $date_lut = create_date_lookup_table();

my $pairs = { };
foreach my $f (@ARGV) {
  my ($class, $who, $daterev);
  if ($f =~ m,LOGS\.\S+?-(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) {
    # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz
    ($class, $who, $daterev) = ($1, $2, $3);
  }
  elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) {
    # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz
    ($class, $who, $daterev) = ($1, $2, $3);
  }
  elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.log,) { # ham-jm.log
    ($class, $who) = ($1, $2);
  }
  elsif ($f =~ m,(ham|nonspam|spam),) {
    ($class) = ($1);
    $who = 'unknown';
  }
  else {
    die "cannot parse filename: $f\n";
  }

  $class = 'ham' if $class eq 'nonspam';
  push @{$pairs->{$who}}, [ $f, $class ];
}

my $byuser = {};
my $total_counts = {};
foreach my $who (keys %{$pairs}) {
  my $buckets = {};
  foreach my $file (@{$pairs->{$who}}) {
    my ($f, $class) = @{$file};
    load_log($buckets, $total_counts, $f, $class, $who);
  }
  $byuser->{$who}->{buckets} = $buckets;
}

my $all_tspam = 0;
my $all_tham = 0;

foreach my $who (sort keys %{$byuser}) {
  report($byuser->{$who}->{buckets}, $total_counts, $who);
}

final_report($total_counts);
exit;

# ---------------------------------------------------------------------------

sub load_log {
  my ($buckets, $total_counts, $f, $class, $who) = @_;

  my ($caught, $score, $restofline);

  if ($f =~ /\.gz$/) {
    open (IN, "gunzip -cd $f|") or die "cannot read $f";
  } else {
    open (IN, "<$f") or die "cannot read $f";
  }
  while (<IN>) {
    ($caught, $score, $restofline) = split(' ', $_, 3);
    next unless ($caught =~ /^[Y\.]$/ && $restofline);
    next unless ($restofline =~ /(?: |,)time=(\d+)(?:\D|$)/);
    my $t = $1;

    my $tbucket = time_to_bucket($t);
    if (!exists $buckets->{$tbucket}) {
      $buckets->{$tbucket} ||= { };
    }

    if (!exists $buckets->{$tbucket}->{$class}) {
      $buckets->{$tbucket}->{$class} = {
        count => 0,
        range_lo => undef,
        range_hi => undef,
      };
    }

    $total_counts->{$class}++;
    my $b = $buckets->{$tbucket}->{$class};
    $b->{count}++;
    update_range_lo(\$b->{range_lo}, $score);
    update_range_hi(\$b->{range_hi}, $score);
  }
  close IN;
}

# ---------------------------------------------------------------------------

# bb-jhardin       Spam messages    Score range    Ham messages     Score range
#   in 2009-06          39   (0%)   [0,29]               0
#   in 2009-07           8   (0%)   [1,24]               2   (0%)   [1,4]
#   TOTAL:              73   (0%)   [0,29]               2   (0%)   [1,4]

sub report {
  my ($buckets, $total_counts, $who) = @_;


  printf "%-16s %-15s  %-14s %-15s  %-14s\n",
            $who, "Spam messages", "Score range", "Ham messages", "Score range";
                
  my $tspam = 0;
  my $tham = 0;
  my ($trslo, $trshi, $trhlo, $trhhi);

  foreach my $tbucket (sort keys %{$buckets}) {
    my $buck = $buckets->{$tbucket};
    my $nspam = $buck->{spam}->{count} || 0;
    my $nham = $buck->{ham}->{count} || 0;
    printf "%-16s %7s %6s   %-14s %7s %6s   %-14s\n",
            "  in $tbucket",
            $nspam, as_percent($nspam, $total_counts->{spam}),
            format_score_range($buck->{spam}->{range_lo}, $buck->{spam}->{range_hi}),
            $nham, as_percent($nham, $total_counts->{ham}),
            format_score_range($buck->{ham}->{range_lo}, $buck->{ham}->{range_hi});

    $tspam += $nspam;
    $tham += $nham;
    update_range_lo(\$trslo, $buck->{spam}->{range_lo});
    update_range_hi(\$trshi, $buck->{spam}->{range_hi});
    update_range_lo(\$trhlo, $buck->{ham}->{range_lo});
    update_range_hi(\$trhhi, $buck->{ham}->{range_hi});
  }

  printf "%-16s %7s %6s   %-14s %7s %6s   %-14s\n",
            "  TOTAL:",
            $tspam, as_percent($tspam, $total_counts->{spam}),
            format_score_range($trslo, $trshi),
            $tham, as_percent($tham, $total_counts->{ham}),
            format_score_range($trhlo, $trhhi);

  $all_tspam += $tspam;
  $all_tham += $tham;
  print "\n";
}

# ---------------------------------------------------------------------------

sub final_report {
  my ($total_counts) = @_;

  printf "%-16s %7s %6s   %-14s %7s %6s   %-14s\n",
            "OVERALL TOTAL:",
            $all_tspam, '', '',
            $all_tham, '', '';
}

# ---------------------------------------------------------------------------

use Time::Local;
sub create_date_lookup_table {

  my ($sec,$min,$hour,$mday,$cmon,$cyear,$x) = gmtime time;

  my @month_starts = ();
  my ($year, $mon);
  for ($year = $cyear; $year >= 70; $year--) {
    for ($mon = 11; $mon >= 0; $mon--) {
      next if ($year == $cyear && $mon > $cmon);        # in the future

      if ($year < $cyear-1 || ($year==$cyear-1 && $mon < $cmon)) {
        # just record January 1 for times over a year ago
        next unless ($mon == 0);
        push @month_starts, [
          timegm(0,0,0,1,$mon,$year), $year+1900
        ];
      }
      else {
        push @month_starts, [
          timegm(0,0,0,1,$mon,$year), sprintf("%04d-%02d", $year+1900, $mon+1) 
        ];
      }
    }
  }
  return \@month_starts;
}

sub time_to_bucket {
  my ($t) = @_;
  # could binary-search here, but the win is probably not worth it
  foreach my $pair (@{$date_lut}) {
    if ($pair->[0] < $t) {
      return $pair->[1];
    }
  }
  return "1970";
}

# ---------------------------------------------------------------------------

sub as_percent {
  my ($num, $total) = @_;
  if (!$opt{percents} || !$num) { return ''; }
  if (!$total) { return '(100%)'; }
  return sprintf("(%d%%)", (($num||0) *100.0) / $total);
}

# ---------------------------------------------------------------------------

sub format_score_range {
  my ($rlo, $rhi) = @_;
  if (!defined $rlo && !defined $rhi) { return ''; }
  if (!defined $rlo) { $rlo = ''; }
  if (!defined $rhi) { $rhi = ''; }
  return "[$rlo,$rhi]";
}

# ---------------------------------------------------------------------------

sub update_range_lo {
  my ($rloref, $score) = @_;
  return unless defined $score;
  if (!defined $$rloref || $score < $$rloref) { $$rloref = $score; }
}

# ---------------------------------------------------------------------------

sub update_range_hi {
  my ($rhiref, $score) = @_;
  return unless defined $score;
  if (!defined $$rhiref || $score > $$rhiref) { $$rhiref = $score; }
}

