#!/usr/bin/env perl use strict; use warnings; #use diagnostics; use utf8; use Encode; use constant DEBUG => 0; ## Print out all text to STDOUT UTF-8 encoded binmode STDOUT, ':encoding(UTF-8)'; binmode STDERR, ':encoding(UTF-8)'; # FIXME: Automatically include resolved '.' in @INC # print join "\n", @INC; use locale ':not_characters'; use Locale::TextDomain ('de.pointedears.newsstat'); use POSIX ('locale_h'); use Locale::Messages qw (bind_textdomain_filter bind_textdomain_codeset turn_utf_8_on); #setlocale( LC_MESSAGES, '' ); bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on; bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8'; require Mail::Message; require DateTime; require DateTime::Format::Mail; ###################### USER CONFIGURATIONS ############################ ## The name of the group to do stats for my $newsgroup_name = $ARGV[0]; $newsgroup_name or usage(); ## Check for removal flags my $ix; my $j; my %skipSec; my @skiplist; my $args = @ARGV; for ( $ix = 1 ; $ix < $args ; $ix++ ) { $j = $ix + 1; if ( $ARGV[$ix] eq "-x" ) { @skiplist = split( ",", $ARGV[$j] ); } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { @skiplist = split( ",", $1 ); } } foreach (@skiplist) { $skipSec{$_} = 1; } ## Leafnode users will want /var/spool/news for this variable. my $news = "/var/spool/news/"; ## Number of top or bottom posters to show my $topposters = 20; ## Number of threads we want to know about my $topthreads = 20; ## Number of cross-posted threads to show my $topcrossposts = 10; ## Number of agents we list my $topagents = 10; ## Number of time zones to show my $toptz = 10; ###################### DATA STRUCTURES ###################### my $group = $newsgroup_name; $group =~ s!\.!/!g; my %data; # name, count, agent, total, orig, quoted my $totsize = 0; # holds total sizes of all files my %crossposts; # group, count my %threads; # subject, count my $replies = 0; # total no. of replies my $origposts = 0; # total no. of original posts my %tz; # timezones by count my $earliest; # earliest article we have found my $latest; # latest article we have found my $totheader = 0; # total size of header material my $totbody = 0; # total size of body material my $totsig = 0; # total size of sig material my $totquoted = 0; # total size of quoted material my $totorig = 0; # total size of original material my $totalposts; # total no. of posts considered my %distinct_agent; ## Used to hold counts of User Agents used my %agents = ( "Compuserver" => 0, "Foorum" => 0, "Forte Agent" => 0, "Forte Free Agent" => 0, "Gnus" => 0, "KNode" => 0, "MacSOUP" => 0, "MT-NewsWatcher" => 0, "MicroPlanet Gravity" => 0, "Microsoft Outlook Express" => 0, "Microsoft Windows Mail" => 0, "Mozilla" => 0, "News Rover" => 0, "NN" => 0, "Pan" => 0, "rn" => 0, "slrn" => 0, "Sylpheed" => 0, "tin" => 0, "VSoup" => 0, "WebTV" => 0, "Xnews" => 0, ); my $datetime_parser = DateTime::Format::Mail->new(); $datetime_parser->loose(); my $today = DateTime->today( time_zone => 'UTC' ); my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1); my $start = int $prev_month->strftime('%s'); my $numdays = int DateTime->last_day_of_month( year => $prev_month->year(), month => $prev_month->month(), time_zone => $prev_month->time_zone(), )->day(); my $end = int $today->clone()->set_day(1)->strftime('%s'); dmsg( $start, " to ", $end ) if DEBUG; chdir("$news$group") or die __x( "Can't cd to {newsgroup}: {error}\n", newsgroup => "$news$group", error => $! ); opendir( DIR, "." ) or die __x( "Can't open {newsgroup}: {error}\n", newsgroup => "$news$group", error => $! ); while ( defined( my $filename = readdir(DIR) ) ) { next unless -f $filename; # only want real files next if ( $filename eq ".overview" ); # real articles only get_article($filename); # read in the article } closedir(DIR); # finished with the directory dmsg("\nearliest: $earliest\nlatest: $latest") if DEBUG; ## Post-processing count_agents(); # count agents, collapsing versions fix_percent(); write_data(); display_results(); ######################################## ## Get current article's header and body ######################################## sub get_article { my $filename = shift; open( my $FILE, '<', $filename ) or die __x( "Can't open {file}: {error}\n", file => $filename, error => $! ); my $msg = Mail::Message->read($FILE); my $timestamp = $msg->timestamp(); my $date = $msg->study('Date'); ## Disregard article if timestamp is not in range dmsg($timestamp) if DEBUG; if ( $timestamp < $start or $timestamp >= $end ) { dmsg("Posting on $date ignored.") if DEBUG; return; } $totalposts++; # bump count of articles considered ## DEBUG dmsg($date) if DEBUG; ## get stats about the file itself my $filesize = -s $filename; # get total size of file $totsize += $filesize; # bump total sizes of all files if ( ( not defined $earliest ) or $timestamp < $earliest ) { $earliest = $timestamp; } elsif ( ( not defined $latest ) or $timestamp > $latest ) { $latest = $timestamp; } #print "timestamp: $timestamp\n"; ## count header size $totheader += $msg->head()->size(); ## get the poster's name (MIME-decoded, in UTF-8) my $poster = $msg->study('From'); if ( defined $poster ) { ## Convert old to new format $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; ## Collapse whitespace $poster =~ s/\s+/ /g; ## Remove outer quotes; TODO: observe RFC 5322 strictly $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x; ## DEBUG dmsg($poster) if DEBUG; ## seen this one before? if ( !defined( $data{$poster} ) ) { $data{$poster}{'agent'} = __ 'unknown'; # comes after For: field $data{$poster}{'orig'} = 0; $data{$poster}{'quoted'} = 0; } $data{$poster}{'count'}++; # bump count for this poster $data{$poster}{'size'} += $filesize; # total size of file ## The User-Agent and/or X-Newsreader fields ## for User-Agent by poster my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader'); if ( defined $ua ) { $data{$poster}{'agent'} = $ua; ## DEBUG dmsg($ua) if DEBUG; } ## The User Agent for User-Agent by number of posts get_agent($msg); ## Get all cross-posted newsgroups for ( split( /,/, $msg->study('Newsgroups') ) ) { $crossposts{$_}++; # bump count for each } ## Get threads my $thread = $msg->study('Subject'); $thread =~ s/^re:\s+//i; # Remove Re: or re: at start $thread =~ s/\s+/ /g; # collapse whitespace $threads{$thread}{'count'}++; # bump count of this subject $threads{$thread}{'size'} += $filesize; # bump bytes for this thread ## Is this an original post or a reply? if ( defined $msg->study('References') ) { $replies++; } else { $origposts++; } ## Get the time zone my $datetime = $datetime_parser->parse_datetime($date); my $tz = $datetime->strftime('%z'); $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o; $tz{$tz}++; ## DEBUG dmsg($tz) if DEBUG; #### Now analyse the body text #### my $body = $msg->body(); my $insig = 0; my @body = $body->lines; for (@body) { $totbody += length($_); # bump total body size next if (m{^$>}o); # don't count blank lines in body if ( $insig == 1 ) { # bump total sig size $totsig += length($_); } ## are we in a quote line? ## Bill Unruh uses ] quotes, and another poster uses :: elsif ( m{^\s*[>\]]}o or m{^\s*::}o ) { ## bump count of quoted chrs $data{$poster}{'quoted'} += length($_); $totquoted += length($_); } elsif (/^-- $/) { $insig = 1; } else { ## We must be processing an original line $data{$poster}{'orig'} += length($_); # bump count of original chrs $totorig += length($_); } } # end for (@body) } close($FILE); } sub get_agent { my $msg = shift; my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader') or $msg->study('X-Mailer'); if ( not defined $ua ) { my $org = $msg->study('Organization'); if ( defined $org and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ ) { $ua = $org; } elsif ( $msg->study('Message-ID') =~ /pine/i ) { $ua = "Pine"; } } ## Hopefully found UA, else set to unknown if ( not defined $ua ) { $ua = __ "unknown"; } $ua = clean($ua); my $raw = $ua; my $agent = $raw; ## strip http if ( $raw =~ /.*http.*/ ) { $raw =~ s!posted via!!i; $raw =~ s!http://!!g; $raw =~ s!/!!g; $raw =~ s! !!g; } ## Fix Outlook from Mac if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } ## Pick out the popular agents if ( $raw =~ /(outlook express)/i || $raw =~ /(windows mail)/i || $raw =~ /(microplanet gravity)/i || $raw =~ /(news rover)/i || $raw =~ /(forte agent)/i || $raw =~ /(forte free agent)/i ) { $agent = $1; } elsif ( $raw =~ /^( pan |sylpheed |slrn |mozilla |knode |tin |hamster |xrn |xnews |aol |gnus |krn |macsoup |messenger |openxp |pine |thoth |turnpike |winvn |vsoup |google |supernews |nn |rn |007 |webtv |compuserve )/ix ) { $agent = $1; } else { ## Clean up unknown agents if ( $raw =~ m!^(.*?)/! ) { $agent = $1; } elsif ( $raw =~ /^(\w*)\d.*/ ) { $agent = $1; } } $distinct_agent{$agent}++; return $agent; } ## get_agent ######################################### ## Count the User-Agents used, collapsing ## different versions into one per agent. ######################################### sub count_agents { POSTER: foreach my $poster ( keys %data ) { foreach my $agent_name ( keys %distinct_agent ) { # check against known ones if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ ) { $agents{$agent_name}++; next POSTER; } } $agents{ $data{$poster}{'agent'} }++; } } # count_agents ############################################# ## Set orig/total percentages for all posters ############################################# sub fix_percent { foreach my $poster ( keys %data ) { my $percent = 100; if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) ) { $percent = $data{$poster}{'orig'} * 100 / ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} ); #/ } elsif ( $data{$poster}{'orig'} == 0 ) { $percent = 0; } $data{$poster}{'percent'} = $percent; } } ## fix_percent ################################## ## Write data structures to a file ################################## sub write_data { open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" ) or die __x( "Can't create XDATA: {error}\n", error => $! ); print $OUTF "Data collected from $newsgroup_name\n\n"; print $OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; foreach my $name ( keys %data ) { print $OUTF "$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n"; } print $OUTF "============================================================================\n"; print $OUTF "Thread subjects\n"; print $OUTF "----------------------------------------------------------------------------\n"; foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) { print $OUTF "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n"; } print $OUTF "============================================================================\n"; print $OUTF "Cross-posts\n"; print $OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %crossposts ) { print $OUTF "$name : $crossposts{$name}\n"; } print $OUTF "============================================================================\n"; print $OUTF "User agents\n"; print $OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %agents ) { print $OUTF "$name : $agents{$name}\n"; } print $OUTF "============================================================================\n"; print $OUTF "Time zones\n"; print $OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %tz ) { print $OUTF "$name : $tz{$name}\n"; } close $OUTF; } # write_data sub display_results { #################### DISPLAY RESULTS ##################### print "=" x 76, "\n"; printf "%s\n", centred( __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ), 76 ); print "=" x 76, "\n"; printf "%s\n", centred( __ "(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)", 76 ); print "\n\n"; printf __"Total posts considered: %s over %d days" . "\n", commify($totalposts), $numdays; my $time_locale = setlocale(LC_TIME); my $earliest_datetime = DateTime->from_epoch( epoch => $earliest, locale => $time_locale, time_zone => 'UTC', ); my $latest_datetime = DateTime->from_epoch( epoch => $latest, locale => $time_locale, time_zone => 'UTC', ); my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z'; printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format); printf __"Latest article" . ": %s\n", $latest_datetime->strftime($datetime_format); printf __"Original articles: %s; replies" . ": %s\n", commify($origposts), commify($replies); printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n", commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576; # printf __ "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, commify( int( $totsize / $totalposts ) ); my $count = keys %data; printf __"Total headers: %s KiB; bodies: %s KiB\n", commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); printf __ "Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n", commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), ( $totorig * 100 ) / ( $totorig + $totquoted ), commify( int( $totsig / 1024 ) ); printf __"Total number of posters: %s, average %s bytes per poster\n", commify($count), commify( int( $totsize / $count ) ); #/ $count = keys %threads; printf __"Total number of threads: %s, average %s bytes per thread\n", commify($count), commify( int( $totsize / $count ) ); #/ printf __"Total number of user agents: %d\n", scalar keys %agents; print "\n", "=" x 76, "\n"; ######################################## ## Show posters by article count Sec 1; ######################################## unless ( $skipSec{1} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", centred( __x( "Top {count} posters by number of articles", count => $topposters ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) { my $name = substr( $poster, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{count}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ###################################### ## Show posters by size in KiB Sec 2; ###################################### unless ( $skipSec{2} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", centred( __x( "Top {count} posters by article size in KiB", count => $topposters ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) { my $name = substr( $poster, 0, 62 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{size} / 1024; #/ last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ##################################### ## Show top posters for original text ##################################### unless ( $skipSec{3} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", centred( __x( "Top {count} responders by original text (> 5 posts)", count => $topposters ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data ) { next if $data{$poster}{quoted} == 0; next if $data{$poster}{count} < 5; my $name = substr( $poster, 0, 63 ); printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{percent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ######################################## ## Show bottom posters for original text ######################################## unless ( $skipSec{4} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", centred( __x( "Bottom {count} responders by original text (> 5 posts)", count => $topposters ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data ) { next if $data{$poster}{quoted} == 0; next if $data{$poster}{count} < 5; my $name = substr( $poster, 0, 63 ); printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{percent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ##################################### ## Show threads by number of articles ##################################### unless ( $skipSec{5} ) { if ( keys %threads < $topthreads ) { $count = keys %threads; } else { $count = $topthreads; } printf "%s\n", centred( __x( "Top {count} threads by no. of articles", count => $topthreads ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $thread ( sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} } keys %threads ) { my $name = substr( $thread, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $threads{$thread}{'count'}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ############################## ## Show threads by size in KiB ############################## unless ( $skipSec{6} ) { if ( keys %threads < $topthreads ) { $count = keys %threads; } else { $count = $topthreads; } printf "%s\n", centred( __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $thread ( sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} } keys %threads ) { my $name = substr( $thread, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $threads{$thread}{'size'} / 1024; #/ last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ################################## ## Show top 10 cross-posted groups ################################## unless ( $skipSec{7} ) { delete $crossposts{"$newsgroup_name"}; # don't include ours if ( keys %crossposts < $topcrossposts ) { $count = keys %crossposts; } else { $count = $topcrossposts; } printf "%s\n", centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $crossposts{$name}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ######################### ## Show agents and counts ######################### unless ( $skipSec{8} ) { if ( keys %agents < $topagents ) { $count = keys %agents; } else { $count = $topagents; } printf "%s\n", centred( __x( "Top {count} user agents by poster", count => $topagents ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), $agents{$agent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ####################### ## Show distinct agents ####################### unless ( $skipSec{9} ) { if ( keys %distinct_agent < $topagents ) { $count = keys %distinct_agent; } else { $count = $topagents; } printf "%s\n", centred( __x( "Top {count} user agents by number of posts", count => $topagents ), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent ) { printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), $distinct_agent{$agent}, ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ############################ ## Show timezones and counts ############################ unless ( $skipSec{10} ) { if ( keys %tz < $toptz ) { $count = keys %tz; } else { $count = $toptz; } printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 ); print "=" x 76, "\n"; my $i = 0; foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } } ## helper subs ############################### ## Right pad a string with '.'s ############################### sub rpad { ## Get text to pad, length to pad, pad chr my ( $text, $pad_len, $pad_chr ) = @_; ## DEBUG printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1; if ( length($text) > $pad_len ) { $text = substr( $text, 0, $pad_len ); } my $padded = $text . $pad_chr x ( $pad_len - length($text) ); return $padded; } ################## ## Centre a string ################## sub centred { my ( $text, $width ) = @_; # text to centre, size of field to centre in my $pad_len = ( $width - length($text) ) / 2; #/ my $centred = " " x $pad_len . $text; return $centred; } ########################### ## Put commas into a number ########################### # Get some of locale's numeric formatting parameters my ($thousands_sep, $grouping) = @{localeconv()}{'thousands_sep', 'grouping'}; # Apply defaults if values are missing $thousands_sep = ',' unless $thousands_sep; # grouping and mon_grouping are packed lists # of small integers (characters) telling the # grouping (thousand_seps and mon_thousand_seps # being the group dividers) of numbers and # monetary quantities. The integers' meanings: # 255 means no more grouping, 0 means repeat # the previous grouping, 1-254 means use that # as the current grouping. Grouping goes from # right to left (low to high digits). In the # below we cheat slightly by never using anything # else than the first grouping (whatever that is). my @grouping; if ($grouping) { @grouping = unpack("C*", $grouping); } else { @grouping = (3); } sub commify { local $_ = shift; #1 while s/^([-+]?\d+)(\d{3})/$1,$2/; $_ = int; # Chop non-integer part 1 while s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/; return $_; } ################################################################ ## Returns a string with leading and trailing whitespace removed ################################################################ sub clean { my $dirty = shift; my $clean = $dirty; $clean =~ s/^\s+|\s+$//g; return $clean; } sub usage { print __"usage: newsstat.pl NEWS.GROUP\n"; exit 1; } sub dmsg { print STDERR @_, "\n"; } sub dmsg2 { my ( $level, @msg ) = @_; print STDERR @msg, "\n" if $level >= DEBUG; }