#!/usr/bin/perl -w

use strict;

my $ID = '$Id: colastats.pl,v 1.1 2007/01/13 10:17:52 mrloy Exp mrloy $';

my $NewsDir = "/var/spool/news/comp/os/linux/advocacy";

######my $DataDir = "[directory where lastread & colastats.data file are located";

my $Headers = "From|Message\-ID|Message\-Id|NNTP\-Posting\-Hosts*";
$Headers   .= "|Newsgroups|Path|References|Subject|User\-Agent";
$Headers   .= "|X\-Newsreader|X\-No\-Archive|X\-no\-archive";

my $FromExceptions = "(bailo|Kadaitcha Man|Eelf)";
my $ReaderExceptions = "^X\$";

my ($ArticleCnt, $SubjectCnt, $CurrFrom, $CurrSubject, %XPosters);
my (@Article, @Files, $File, %FromName, %FromAddr, %Reader, %Subject);
my (%FromOrig, %FromQuote, %FromTotal, %SubjectBytes, %FromXNoA);
my (%MsgIDFrom, %MsgIDNg, %LastRef, $CurrRef, $CurrNg, $CurrMsgID);
my (%TrollMsgID, $TrollPost, %TrollCnt, %PopUserCnt, %DirReply, %XPost);
my (%XPostQ, %QuoteFrom, %TrollCntFrom, %FromTroll, %Trolls, $Trolls);
my $NewLastread = 0;
my $MaxLines = 30;

opendir(DIR, $NewsDir) or die "$NewsDir: $!\n";
@Files = grep (/\d+/, readdir(DIR));
closedir(DIR);
$ArticleCnt = 0;

my $Lastread = 0;
my $LastreadFile = "$DataDir/lastread";

if (open(IN, $LastreadFile)) {
    while (<IN>) {
        chomp;
        next unless /(\d+)/;
        $Lastread = $1;
    }
    close(IN);
}
chdir($NewsDir) or die "$NewsDir: $_\n";

foreach $File (@Files) {
    next unless $File > $Lastread;
    $NewLastread = $File if $File > $NewLastread;
    $ArticleCnt++;
    open(IN, $File) or die "$File: $!\n";
    @Article = <IN>;
    close(IN);
    GetHdrs();
    BodyCnts();
}

if ($NewLastread) {
    open (OUT, "> $LastreadFile") or die "$LastreadFile: $!\n";
    print OUT "$NewLastread\n";
    close(OUT);
}

PrintHeader();
Separator();
UserStats();
Separator();
SubjectStats();
Separator();
PopUsers();
Separator();
TrollFeeders();
Separator();
QualityPosters();
Separator();
XNoAStats();
Separator();
ReaderStats();
Separator();
PrintTrailer();

sub BodyCnts {
    my $Line = shift @Article;
    my $OrigBytes = 0;
    my $QuoteBytes = 0;
    my $QuoteRegex = ">|:|<";
    while ($Line) {
        last if $Line =~ /^-- /; # ignore .sigs
        last if $Line =~ /^-----BEGIN PGP SIGNATURE/; # ignore pgp sigs
        if ($Line =~ /^($QuoteRegex|\s+\S+>)/) {
            $QuoteBytes += length $Line;
        }
        else {
            $OrigBytes += length $Line;
        }
        $Line = shift @Article;
    }
    $FromOrig{$CurrFrom}        += $OrigBytes;
    $FromQuote{$CurrFrom}       += $QuoteBytes;
    $FromTotal{$CurrFrom}       += $OrigBytes + $QuoteBytes;
    $SubjectBytes{$CurrSubject} += $OrigBytes + $QuoteBytes;
}

sub PrintHeader {
    my $Ver;
    if ($ID =~ /^\$Id: (.+),v (\d+\.\d+)/) {
        $Ver = $1 . " " . "version " . $2;
    }
    else {
        $Ver = $ID;
    }
    print "$Ver\n\n";
    print "This report covers $ArticleCnt articles received by this ";
    print "system to\nnewsgroup comp.os.linux.advocacy [1]\n";
}

sub PrintTrailer {
    print "[1]: Posts via mail to news gateways are dropped by this server\n";
    print "     and hence not included in the stats.\n";
    print "     Other attention seeking posters may not appear as well.\n";
    print "[2]: Troll feeder %'age is (troll feeding posts / total posts)";
    print " * 100\n";
    print "[3]: The poster 'quality' stats is based on:\n";
    print "       a) quoting: 100 - %'age quoted\n";
    print "       b) cross-posting: 100 - %'age cross-posted\n";
    print "       c) number of direct followups posters articles get\n";
    print "       d) troll feeding: 100 - 2.0 * %'age troll followups\n";
    print "       e) 75 deducted for known trolls\n";
    print "       f) for the humour impaired the quotes around 'quality'\n";
    print "          are intended to convey they are not to be taken too\n";
    print "          seriously :-)\n";
    print "       g) for Erik: it is highly unlikely, although perhaps\n";
    print "          possible, for a troll to appear in the 'quality' stats\n";
    print "[4]: Erik Funkenbusch is a MS apologist, frequent liar and FUD\n";
    print "     spreader and seriously lacking in ethical values. He often\n";
    print "     tries to change the subject when his argument has been torn\n";
    print "     to shreds or, more likely, just runs away.\n";
    print "\n";
    print "     weasel n. [Cambridge] A naive user, one who deliberately\n";
    print "     or accidentally does things that are stupid or ill-advised.\n";
    print "     Roughly synonymous with loser\n";
}

sub Separator {
    my $Separator = '-' x 73;
    print "$Separator\n";
}

sub SubjectStats {
    my $Sub;
    my $Cnt = 1;
    my $LastSubjectCnt = 0;
    print "Toplist of Subjects\n\n Pos  Subject";
    print "                                                Msgs   Bytes\n";
    my $CurrLine = 1;
    foreach my $Subject (sort DescendSubjectSort (keys(%Subject))) {
        $Sub = substr($Subject, 0, 50);
        if ($Subject{$Subject} != $LastSubjectCnt) {
            printf "%4d. %-52s   %4d %7d\n", $Cnt, $Sub, $Subject{$Subject},
                                       $SubjectBytes{$Subject};
        }
        else {
            printf "      %-52s   %4d %7d\n", $Sub, $Subject{$Subject},
                                       $SubjectBytes{$Subject};
        }
        $LastSubjectCnt = $Subject{$Subject};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub ReaderStats {
    my $Cnt = 1;
    my $LastReaderCnt = 0;
    print "Toplist of Newsreaders\n\n Pos  Newsreader";
    print "                                                     Msgs\n";
    my $CurrLine = 1;
    foreach my $Reader (sort DescendReaderSort (keys(%Reader))) {
        next if $Reader =~ /$ReaderExceptions/; # skip arseholes readers
        if ($Reader{$Reader} != $LastReaderCnt) {
            printf "%4d. %-62s %4d\n", $Cnt, $Reader, $Reader{$Reader};
        }
        else {
            printf "      %-62s %4d\n", $Reader, $Reader{$Reader};
        }
        $LastReaderCnt = $Reader{$Reader};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub DescendSubjectSort {
    $Subject{$b} <=> $Subject{$a}
}

sub DescendFromSort {
    $FromName{$b} <=> $FromName{$a}
}

sub DescendXNoASort {
    $FromXNoA{$b} <=> $FromXNoA{$a}
}

sub DescendXPostersSort {
    $XPosters{$b} <=> $XPosters{$a}
}

sub DescendReaderSort {
    $Reader{$b} <=> $Reader{$a}
}

sub DescendTrollSort {
    $TrollCnt{$b} <=> $TrollCnt{$a}
}

sub DescendDirReplySort {
    $DirReply{$b} <=> $DirReply{$a}
}

sub DescendXPostQSort {
    $XPostQ{$b} <=> $XPostQ{$a}
}

sub UserStats {
    my $Cnt = 1;
    my $Quote;
    my $LastUserCnt = 0;
    print "Toplist of Posters\n\n Pos  Poster";
    print "                                          Msgs   Bytes Quoted\n";
    my $CurrLine = 1;
    foreach my $Key (sort DescendFromSort (keys(%FromName))) {
        my $FullFrom;
        next if $Key =~ /$FromExceptions/; # drop arseholes
        if (exists $FromAddr{$Key}) {
            if ($FromAddr{$Key} =~ /^</) {
                $FullFrom = $Key . " " . $FromAddr{$Key};
            }
            else {
                $FullFrom = $Key . " " . "<" . $FromAddr{$Key} . ">"
            }
        }
        else {
            $FullFrom = $Key;
        }
        $Quote = int(($FromQuote{$Key}/$FromTotal{$Key})*100);
        $QuoteFrom{$Key} = $Quote;
        $FullFrom = substr($FullFrom, 0, 47);
        if ($FromName{$Key} != $LastUserCnt) {
            printf "%4d. %-47s %4d %7d    %2d%%\n", $Cnt, $FullFrom,
                    $FromName{$Key}, $FromTotal{$Key}, $Quote;
        }
        else {
            printf "      %-47s %4d %7d    %2d%%\n", $FullFrom,
                   $FromName{$Key}, $FromTotal{$Key}, $Quote;
        }
        $LastUserCnt = $FromName{$Key};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub XNoAStats {
    my $Cnt = 1;
    my $LastXNoACnt = 0;
    print "Toplist of X-No-Archive Posters\n\n Pos  Poster";
    print "                                                         Msgs\n";
    my $CurrLine = 1;
    foreach my $Key (sort DescendXNoASort (keys(%FromXNoA))) {
        my $FullFrom;
        next if $Key =~ /$FromExceptions/; # drop arseholes
        if (exists $FromAddr{$Key}) {
            if ($FromAddr{$Key} =~ /^</) {
                $FullFrom = $Key . " " . $FromAddr{$Key};
            }
            else {
                $FullFrom = $Key . " " . "<" . $FromAddr{$Key} . ">"
            }
        }
        else {
            $FullFrom = $Key;
        }
        $FullFrom = substr($FullFrom, 0, 60);
        if ($FromXNoA{$Key} != $LastXNoACnt) {
            printf "%4d. %-62s %4d\n", $Cnt, $FullFrom,
                    $FromXNoA{$Key};
        }
        else {
            printf "      %-62s %4d\n", $FullFrom,
                   $FromXNoA{$Key};
        }
        $LastXNoACnt = $FromXNoA{$Key};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub GetHdrs {
    my $Line;
    $CurrRef   = "";
    $CurrNg    = "";
    $CurrMsgID = "";

    while (1) {
        $Line = shift @Article;
        my $Tmp;
        chomp $Line;
        last if $Line =~ /^\s*$/;
        if ($Line =~ /^From: (.+)$/) {
            SplitName($1);
        }
        elsif ($Line =~ /^Subject:\s*([Re][Ee]:)*\s*(.*)$/) {
            $Tmp = $2;
            if (length($Tmp) > 50) {
                $Tmp = substr($Tmp, 0, 50);
            }
            $Subject{$Tmp}++;
            $CurrSubject = $Tmp;
        }
        elsif ($Line =~ /^(User\-Agent|X\-Newsreader): (.+)/) {
            my $Tmp = $2;
            my $Readers = "40tude_Dialog|Forte Free Agent|Forte Agent";
            $Readers   .= "|slrn|tin|trn|Mozilla|[Gg]nus";
            $Readers   .= "|KNode|Pan|Xnews|Microsoft Outlook Express";
            $Readers   .= "|Gigi's Own NewsReader|News Xpress|knews";
            $Readers   .= "|Sylpheed|Turnpike Integrated|ProNews/2";
            $Readers   .= "|MR/2 Internet Cruiser|SupraNews newsreader";
            $Readers   .= "|News Xpress|Microhard Lookout|MacSOUP";
            $Readers   .= "|Thunderbird|Turnpike/6.04-U";
            $Readers   .= "|Mariozilla/3.1g";
            if ($Tmp =~ /^\s*($Readers)/) {
                $Reader{$1}++;
            }
            else {
                $Reader{$Tmp}++;
            }
        }
        elsif ($Line =~ /x-no-archive:/i) {
            $FromXNoA{$CurrFrom}++;
        }
        elsif ($Line =~ /^References:\s*(.+)/) {
            $CurrRef = $1;
            while (1) { # Check for multi-line References
                if ($Article[0] =~ /^\s+(<.*)$/) {
                    $CurrRef .= " " . $1;
                    shift @Article;
                }
                else {
                    last;
                }
            }
        }
        elsif ($Line =~ /^Newsgroups:\s*(.+)/) {
            $CurrNg = $1;
        }
        elsif ($Line =~ /^Message-I[dD]:\s*(.+)/) {
            $CurrMsgID = $1;
        }
    }
    $MsgIDFrom{$CurrMsgID}  = $CurrFrom;
    $MsgIDNg{$CurrMsgID}    = $CurrNg;
    $XPost{$CurrFrom}++ if $CurrNg =~ /,/;
    $XPost{$CurrFrom} = 0 unless exists $XPost{$CurrFrom};
    $TrollMsgID{$CurrMsgID} = $CurrFrom if $TrollPost;
    $FromTroll{$CurrFrom} = 1 if $TrollPost;
    if ($CurrRef) {
        my @Tmp = split(/ +/, $CurrRef);
        $LastRef{$CurrMsgID} = $Tmp[$#Tmp];
        my $LRef = $LastRef{$CurrMsgID};
        $PopUserCnt{$LRef}++;
    }
}

sub SplitName {
    my $From = shift @_;
    my ($Tmp1, $Tmp2);
    my $TrollsFile = "$DataDir/colastats.data";
    if (open(IN, $TrollsFile)) {
        while (<IN>) {
            chomp;
            ($Tmp1, $Tmp2) = split(':::');
            $Trolls{$Tmp1} = $Tmp2;
        }
        close(IN);
    }
    else {
        %Trolls = ();
    }
    $TrollPost = 0;
    foreach my $Troll (keys %Trolls) {
        if ($From =~ /$Troll/i) {
            $From = $Trolls{$Troll};
            $TrollPost = 1;
        }
    }
    ($From = $From) =~ s/"//g;
    ($From = $From) =~ s/  / /g;
    # Lin?nut K?hlmann - I'm useless at unicode stuff :-(
#    $From = "Lin?nut <lin?nut\@bone.com>" if $From =~ /Lin.+nut/;
    $From = "Peter K?hlmann <Peter.Koehlmann\@t-online.de>"
        if $From =~ /Peter .+mann/;

    if ($From =~ /^(.+) \((.+)\)/) {
        $FromName{$2}++;
        $FromAddr{$2} = $1;
        $CurrFrom = $2;
    }
    elsif ($From =~ /^(.*) (<.*>)/) {
        $FromName{$1}++;
        $FromAddr{$1} = $2;
        $CurrFrom = $1;
    }
    else {
        $FromName{$From}++;
        $CurrFrom = $From;
    }
}

sub TrollFeeders  {
    my $Cnt = 1;
    my $Total = 0;
    my $LastTrollFeederCnt = 0;

    foreach my $MsgID (keys (%MsgIDFrom)) {
        if (exists $LastRef{$MsgID}) {
            if (exists $TrollMsgID{$LastRef{$MsgID}}) {
                $TrollCnt{$MsgIDFrom{$MsgID}}++;
                $Total++;
            }
        }
    }

    print "Toplist of Troll Feeders [2]\n\n Pos  Poster";
    print "                                                     Msgs   %\n";
    my $CurrLine = 1;
    my $FullFrom;
    my $Percent;
    foreach my $From (sort DescendTrollSort (keys %TrollCnt)) {
        next if $From =~ /$FromExceptions/; # drop arseholes
        if (exists $FromAddr{$From}) {
            if ($FromAddr{$From} =~ /^</) {
                $FullFrom = $From . " " . $FromAddr{$From};
            }
            else {
                $FullFrom = $From . " " . "<" . $FromAddr{$From} . ">"
            }
        }
        else {
            $FullFrom = $From;
        }
        # +0.5 contributed by Mark Kent
        $Percent = int((($TrollCnt{$From}/$FromName{$From})*100)+0.5);
        $TrollCntFrom{$From} = $Percent;
        if ($TrollCnt{$From} != $LastTrollFeederCnt) {
            printf "%4d. %-58s %4d %3d\n", $Cnt, $FullFrom, $TrollCnt{$From},
                    $Percent;
        }
        else {
            printf "      %-58s %4d %3d\n", $FullFrom, $TrollCnt{$From},
                    $Percent;
        }
        $LastTrollFeederCnt = $TrollCnt{$From};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub PopUsers  {
    my $Cnt = 1;
    my $Total = 0;
    my $LastDirReply = 0;
    my ($FullFrom, $From, %SumRef);

    foreach my $Ref (keys %PopUserCnt) {
        if (exists $MsgIDFrom{$Ref}) {
            $From = $MsgIDFrom{$Ref};
            $SumRef{$From} = $Ref;
            if ($DirReply{$From}) {
                $DirReply{$From} += $PopUserCnt{$Ref};
            }
            else {
                $DirReply{$From} = $PopUserCnt{$Ref};
            }
            $Total += $PopUserCnt{$Ref};
        }
    }

    print "Toplist of Most Replied To Posters\n\n Pos  Poster";
    print "                                                         Msgs\n";
    my $CurrLine = 1;

    foreach my $Key (sort DescendDirReplySort (keys %DirReply)) {
        if (exists $MsgIDFrom{$SumRef{$Key}}) {
            $From = $MsgIDFrom{$SumRef{$Key}};
            next if $From =~ /$FromExceptions/; # drop arseholes
            if (exists $FromAddr{$From}) {
                if ($FromAddr{$From} =~ /^</) {
                    $FullFrom = $From . " " . $FromAddr{$From};
                }
                else {
                    $FullFrom = $From . " " . "<" . $FromAddr{$From} . ">"
                    }
            }
            else {
                $FullFrom = $From;
            }
        }

        if ($DirReply{$Key} != $LastDirReply) {
            printf "%4d. %-62s %4d\n", $Cnt, $FullFrom, $DirReply{$Key};
        }
        else {
            printf "      %-62s %4d\n", $FullFrom, $DirReply{$Key},;
        }
        $LastDirReply = $DirReply{$Key};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

sub QualityPosters {
    my($XP, $Q, $F, $T, $FullFrom, $LastXPostQ);
    my $Cnt = 1;

    foreach my $Poster (keys %FromName) {
        next if $Poster =~ /(relf|bailo)/i;
        next if $FromName{$Poster} < 6;
        $XP = int((($FromName{$Poster} - $XPost{$Poster}) /
                      $FromName{$Poster}) * 100);
        if ($QuoteFrom{$Poster}) {
            $Q = 100 - $QuoteFrom{$Poster};
        }
        else {
            $Q = 0;
        }
        if ($DirReply{$Poster} > $FromName{$Poster}) {
            $F = $DirReply{$Poster} - $FromName{$Poster} + 50;
        }
        else {
            $F = $FromName{$Poster} - $DirReply{$Poster} ;
        }
        if (exists $TrollCntFrom{$Poster}) {
            $T = (100 - (2.0 * $TrollCntFrom{$Poster}));
        }
        else {
            $T = 100;
        }
            
        $XPostQ{$Poster} = int(($XP + $Q + $F + $T) / 4);
        $XPostQ{$Poster} -= 75 if exists $FromTroll{$Poster};
####        warn "$Poster $FromName{$Poster} $XP $Q $F ($DirReply{$Poster}) $T\n";
    }

    print "Toplist of 'Quality' Posters [3]\n\n Pos  Poster";
    print "                                                        Index\n";
    my $CurrLine = 1;


    foreach my $Poster (sort DescendXPostQSort (keys %XPostQ)) {
        next if $Poster =~ /$FromExceptions/; # drop arseholes
        if (exists $FromAddr{$Poster}) {
            if ($FromAddr{$Poster} =~ /^</) {
                $FullFrom = $Poster . " " . $FromAddr{$Poster};
            }
            else {
                $FullFrom = $Poster . " " . "<" . $FromAddr{$Poster} . ">"
                }
        }
        else {
            $FullFrom = $Poster;
        }

        if ($XPostQ{$Poster} != $LastXPostQ) {
            printf "%4d. %-62s %4d\n", $Cnt, $FullFrom, $XPostQ{$Poster};
        }
        else {
            printf "      %-62s %4d\n", $FullFrom, $XPostQ{$Poster},;
        }
        $LastXPostQ = $XPostQ{$Poster};
        $Cnt++;
        last if ++$CurrLine > $MaxLines;
    }
}

