| 1,4 → 1,4 |
| #!/usr/bin/perl -w |
| #!/usr/bin/perl |
| use strict; |
| use warnings; |
| use utf8; |
| 9,12 → 9,12 |
| |
| ############################################################################ |
| # Collect statistics about a newsgroup (specified by first argument) in |
| # the local news spool. Check all articles in the last 30-day period. |
| # the local news spool. Check all articles in the last 30-day period. |
| # Rank posters by number of posts and by volume of posts, report on top and |
| # bottom 20 posters. Show their name, number of posts, size of posts, |
| # percentage of quoted lines. Rank user-agents used, by poster rather than |
| # by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
| # |
| # |
| # (Numbers and paths can be configured below. -- PE) |
| ############################################################################ |
| |
| 21,7 → 21,7 |
| ############################################################################ |
| # RECENT CHANGES # |
| # 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
| # - Use warnings, utf8 (just in case) |
| # - Use warnings, utf8 (just in case) |
| # - Documentation update |
| # N/A NN - Take newsgroup name as argument |
| # 2004-06-19 NN - newsgroup name is $ARGV[0] |
| 39,7 → 39,7 |
| # - introduced the $newsgroup_name variable which |
| # later becomes $news$group |
| # - changed $name to $agent_name in countagents() |
| # |
| # |
| # Contributors |
| # ------------- |
| # NN Nomen nominandum (name to be determined later) |
| 75,15 → 75,20 |
| my %skipSec; |
| my @skiplist; |
| my $args = @ARGV; |
| for ( $ix = 1 ; $ix < $args ; $ix++ ) { |
| $j = $ix + 1; |
| if ( $ARGV[$ix] eq "-x" ) { |
| @skiplist = split(",",$ARGV[$j]); |
| } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { |
| @skiplist = split(",",$1); |
| } |
| for ( $ix = 1 ; $ix < $args ; $ix++ ) |
| { |
| $j = $ix + 1; |
| if ( $ARGV[$ix] eq "-x" ) |
| { |
| @skiplist = split( ",", $ARGV[$j] ); |
| } |
| elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
| { |
| @skiplist = split( ",", $1 ); |
| } |
| } |
| foreach(@skiplist) { |
| foreach (@skiplist) |
| { |
| $skipSec{$_} = 1; |
| } |
| |
| 108,108 → 113,113 |
| # no. of time zones to show |
| my $toptz = 10; |
| |
| |
| |
| ###################### DATA STRUCTURES ###################### |
| my $group = $newsgroup_name; |
| $group =~ s!\.!/!g; |
| my %data; # name, count, agent, total, orig, quoted |
| my %threads; # subject, count |
| my %crossposts; # group, count |
| my %tz; # timezones by count |
| my %headers; # holds header of current article |
| my %lcheader; # holds lowercase headers |
| my @body; # holds body of current article |
| my @sig; # holds sig text; |
| my $totalposts; # total no. of posts considered |
| my $filename; # name of current article file |
| my $filesize; # size of current article file |
| my $earliest; # earliest article we have found |
| my $latest; # latest article we have found |
| my $poster; # poster we are dealing with |
| my $totsize = 0; # holds total sizes of all files |
| my $totheader = 0; # total size of header material |
| my $totbody = 0; # total size of body material |
| my $totsig = 0; # total size of sig material |
| my $totorig = 0; # total size of original material |
| my $totquoted = 0; # total size of quoted material |
| my $origposts = 0; # total no. of original posts |
| my $replies = 0; # total no. of replies |
| my $i; # general purpose |
| my %data; # name, count, agent, total, orig, quoted |
| my %threads; # subject, count |
| my %crossposts; # group, count |
| my %tz; # timezones by count |
| my %headers; # holds header of current article |
| my %lcheader; # holds lowercase headers |
| my @body; # holds body of current article |
| my @sig; # holds sig text; |
| my $totalposts; # total no. of posts considered |
| my $filename; # name of current article file |
| my $filesize; # size of current article file |
| my $earliest; # earliest article we have found |
| my $latest; # latest article we have found |
| my $poster; # poster we are dealing with |
| my $totsize = 0; # holds total sizes of all files |
| my $totheader = 0; # total size of header material |
| my $totbody = 0; # total size of body material |
| my $totsig = 0; # total size of sig material |
| my $totorig = 0; # total size of original material |
| my $totquoted = 0; # total size of quoted material |
| my $origposts = 0; # total no. of original posts |
| my $replies = 0; # total no. of replies |
| my $i; # general purpose |
| my %distinct_agent; |
| my %agents = # used to hold counts of User Agents used |
| ( "KNode" => 0, |
| "Pan" => 0, |
| "Mozilla" => 0, |
| "Sylpheed" => 0, |
| "Gnus" => 0, |
| "Forte Agent" => 0, |
| "Forte Free Agent" => 0, |
| "MicroPlanet Gravity" => 0, |
| "Microsoft Outlook Express" => 0, |
| "Xnews" => 0, |
| "slrn" => 0, |
| "tin" => 0, |
| "rn" => 0, |
| "NN" => 0, |
| "MacSOUP" => 0, |
| "Foorum" => 0, |
| "MT-NewsWatcher" => 0, |
| "News Rover" => 0, |
| "WebTV" => 0, |
| "Compuserver" => 0, |
| "VSoup" => 0); |
| my %agents = # used to hold counts of User Agents used |
| ( |
| "KNode" => 0, |
| "Pan" => 0, |
| "Mozilla" => 0, |
| "Sylpheed" => 0, |
| "Gnus" => 0, |
| "Forte Agent" => 0, |
| "Forte Free Agent" => 0, |
| "MicroPlanet Gravity" => 0, |
| "Microsoft Outlook Express" => 0, |
| "Xnews" => 0, |
| "slrn" => 0, |
| "tin" => 0, |
| "rn" => 0, |
| "NN" => 0, |
| "MacSOUP" => 0, |
| "Foorum" => 0, |
| "MT-NewsWatcher" => 0, |
| "News Rover" => 0, |
| "WebTV" => 0, |
| "Compuserver" => 0, |
| "VSoup" => 0 |
| ); |
| |
| ######################## MAIN CODE ######################## |
| $! = 1; |
| |
| chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
| opendir(DIR, ".") or die "Can't open $news$group directory: $!\n"; |
| while (defined($filename = readdir(DIR))) { |
| %lcheader = (); |
| next unless -f $filename; # only want real files |
| next if ($filename eq ".overview"); # real articles only |
| next if (-M $filename > $numdays); # only want articles <= a certain age |
| $earliest = (stat $filename)[9] unless defined ($earliest); |
| $latest = (stat $filename)[9] unless defined ($latest); |
| &getarticle($filename); # read in the article |
| &getdata; # grab the data from the article |
| $totalposts++; # bump count of articles considered |
| opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
| while ( defined( $filename = readdir(DIR) ) ) |
| { |
| %lcheader = (); |
| next unless -f $filename; # only want real files |
| next if ( $filename eq ".overview" ); # real articles only |
| next if ( -M $filename > $numdays ); # only want articles <= a certain age |
| $earliest = ( stat $filename )[9] unless defined($earliest); |
| $latest = ( stat $filename )[9] unless defined($latest); |
| &getarticle($filename); # read in the article |
| &getdata; # grab the data from the article |
| $totalposts++; # bump count of articles considered |
| } |
| closedir(DIR); # finished with the directory |
| closedir(DIR); # finished with the directory |
| |
| # post-processing |
| &countagents; # count agents, collapsing versions |
| &fixpercent; # check percentages orig/total for posters |
| &countagents; # count agents, collapsing versions |
| &fixpercent; # check percentages orig/total for posters |
| |
| &writedata; |
| |
| #################### DISPLAY RESULTS ##################### |
| print "=" x 76, "\n"; |
| printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
| printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
| print "=" x 76, "\n"; |
| printf "%s\n", ¢red("(stats compiled with a script by Garry Knight et al.)", 76); |
| printf "%s\n", |
| ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
| print "\n\n"; |
| printf "Total posts considered: %s over %d days\n", |
| commify($totalposts), $numdays; |
| printf "Earliest article: %s\n", scalar localtime($earliest); |
| printf "Latest article: %s\n", scalar localtime($latest); |
| printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies); |
| printf "Total posts considered: %s over %d days\n", commify($totalposts), |
| $numdays; |
| printf "Earliest article: %s\n", scalar localtime($earliest); |
| printf "Latest article: %s\n", scalar localtime($latest); |
| printf "Original articles: %s, replies: %s\n", commify($origposts), |
| commify($replies); |
| printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
| commify(int($totsize / 1024)), $totsize / 1048576; # |
| commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
| printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
| commify(int($totalposts / $numdays)), |
| $totsize / $numdays / 1048576, commify(int($totsize / $totalposts)); |
| commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
| commify( int( $totsize / $totalposts ) ); |
| my $count = keys %data; |
| printf "Total headers: %s KB bodies: %s KB\n", |
| commify(int($totheader / 1024)), commify(int($totbody / 1024)); |
| commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
| printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
| commify(int($totquoted / 1024)), commify(int($totorig / 1024)), |
| ($totorig * 100) / ($totorig + $totquoted), commify(int($totsig / 1024)); |
| printf "Total number of posters: %s, average %s bytes per poster\n", commify($count), |
| commify(int($totsize / $count)); #/ |
| commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
| ( $totorig * 100 ) / ( $totorig + $totquoted ), |
| commify( int( $totsig / 1024 ) ); |
| printf "Total number of posters: %s, average %s bytes per poster\n", |
| commify($count), commify( int( $totsize / $count ) ); #/ |
| $count = keys %threads; |
| printf "Total number of threads: %s, average %s bytes per thread\n", commify($count), |
| commify(int($totsize / $count)); #/ |
| printf "Total number of threads: %s, average %s bytes per thread\n", |
| commify($count), commify( int( $totsize / $count ) ); #/ |
| printf "Total number of User-Agents: %d\n", scalar keys %agents; |
| print "\n", "=" x 76, "\n"; |
| |
| 216,39 → 226,51 |
| ############################### |
| # show posters by article count Sec 1; |
| ############################### |
| unless ( $skipSec{1} ) { |
| if (keys %data < $topposters) { |
| $count = keys %data; |
| } else { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red("Top $count posters by number of articles", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) { |
| my $name = substr($poster, 0, 65); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count}; |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| unless ( $skipSec{1} ) |
| { |
| if ( keys %data < $topposters ) |
| { |
| $count = keys %data; |
| } |
| else |
| { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
| { |
| my $name = substr( $poster, 0, 65 ); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
| $data{$poster}{count}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ################################ |
| # show posters by size in Kbytes Sec 2; |
| ################################ |
| unless ( $skipSec{2} ) { |
| if (keys %data < $topposters) { |
| unless ( $skipSec{2} ) |
| { |
| if ( keys %data < $topposters ) |
| { |
| $count = keys %data; |
| } else { |
| } |
| else |
| { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red("Top $count posters by article size in Kbytes", 76); |
| printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) { |
| my $name = substr($poster, 0, 62); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024; #/ |
| last if (++$i == $count); |
| foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
| { |
| my $name = substr( $poster, 0, 62 ); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
| $data{$poster}{size} / 1024; #/ |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| 256,43 → 278,59 |
| #################################### |
| # show top posters for original text |
| #################################### |
| unless ( $skipSec{3} ) { |
| if (keys %data < $topposters) { |
| $count = keys %data; |
| } else { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red("Top $count responders by original text (> 5 posts)", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) { |
| next if $data{$poster}{quoted} == 0; |
| next if $data{$poster}{count} < 5; |
| my $name = substr($poster, 0, 63); |
| printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| unless ( $skipSec{3} ) |
| { |
| if ( keys %data < $topposters ) |
| { |
| $count = keys %data; |
| } |
| else |
| { |
| $count = $topposters; |
| } |
| printf "%s\n", |
| ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} } |
| keys %data ) |
| { |
| next if $data{$poster}{quoted} == 0; |
| next if $data{$poster}{count} < 5; |
| my $name = substr( $poster, 0, 63 ); |
| printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
| $data{$poster}{percent}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ####################################### |
| # show bottom posters for original text |
| ####################################### |
| unless ( $skipSec{4} ) { |
| if (keys %data < $topposters) { |
| unless ( $skipSec{4} ) |
| { |
| if ( keys %data < $topposters ) |
| { |
| $count = keys %data; |
| } else { |
| } |
| else |
| { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red("Bottom $count responders by original text (> 5 posts)", 76); |
| printf "%s\n", |
| ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) { |
| foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} } |
| keys %data ) |
| { |
| next if $data{$poster}{quoted} == 0; |
| next if $data{$poster}{count} < 5; |
| my $name = substr($poster, 0, 63); |
| printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
| last if (++$i == $count); |
| my $name = substr( $poster, 0, 63 ); |
| printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
| $data{$poster}{percent}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| 300,19 → 338,26 |
| #################################### |
| # show threads by number of articles |
| #################################### |
| unless ( $skipSec{5} ) { |
| if (keys %threads < $topthreads) { |
| unless ( $skipSec{5} ) |
| { |
| if ( keys %threads < $topthreads ) |
| { |
| $count = keys %threads; |
| } else { |
| } |
| else |
| { |
| $count = $topthreads; |
| } |
| printf "%s\n", ¢red("Top $count threads by no. of articles", 76); |
| printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $thread (sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads) { |
| my $name = substr($thread, 0, 65); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{count}; |
| last if (++$i == $count); |
| foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} } |
| keys %threads ) |
| { |
| my $name = substr( $thread, 0, 65 ); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
| $threads{$thread}{count}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| 319,19 → 364,26 |
| ################################ |
| # show threads by size in Kbytes |
| ################################ |
| unless ( $skipSec{6} ) { |
| if (keys %threads < $topthreads) { |
| unless ( $skipSec{6} ) |
| { |
| if ( keys %threads < $topthreads ) |
| { |
| $count = keys %threads; |
| } else { |
| } |
| else |
| { |
| $count = $topthreads; |
| } |
| printf "%s\n", ¢red("Top $count threads by size in KB", 76); |
| printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $thread (sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads) { |
| my $name = substr($thread, 0, 65); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{size} / 1024; #/ |
| last if (++$i == $count); |
| foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} } |
| keys %threads ) |
| { |
| my $name = substr( $thread, 0, 65 ); |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
| $threads{$thread}{size} / 1024; #/ |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| 339,198 → 391,236 |
| ################################# |
| # show top 10 cross-posted groups |
| ################################# |
| unless ( $skipSec{7} ) { |
| delete $crossposts{"$newsgroup_name"}; # don't include ours |
| if (keys %crossposts < $topcrossposts) { |
| $count = keys %crossposts; |
| } else { |
| $count = $topcrossposts; |
| unless ( $skipSec{7} ) |
| { |
| delete $crossposts{"$newsgroup_name"}; # don't include ours |
| if ( keys %crossposts < $topcrossposts ) |
| { |
| $count = keys %crossposts; |
| } |
| else |
| { |
| $count = $topcrossposts; |
| } |
| printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach |
| my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
| { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
| $crossposts{$name}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| printf "%s\n", ¢red("Top $count cross-posted groups", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $name (sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts) { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $crossposts{$name}; |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| ####################### |
| #show agents and counts |
| ####################### |
| unless ( $skipSec{8} ) { |
| if (keys %agents < $topagents) { |
| $count = keys %agents; |
| } else { |
| $count = $topagents; |
| unless ( $skipSec{8} ) |
| { |
| if ( keys %agents < $topagents ) |
| { |
| $count = keys %agents; |
| } |
| else |
| { |
| $count = $topagents; |
| } |
| printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
| { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
| $agents{$agent}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| printf "%s\n", ¢red("Top $count User Agents by poster", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $agent (sort {$agents{$b} <=> $agents{$a}} keys %agents) { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($agent, 63, "."), $agents{$agent}; |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ####################### |
| #show distinct agents |
| ####################### |
| unless ( $skipSec{9} ) { |
| if (keys %distinct_agent < $topagents) { |
| $count = keys %distinct_agent; |
| } else { |
| $count = $topagents; |
| unless ( $skipSec{9} ) |
| { |
| if ( keys %distinct_agent < $topagents ) |
| { |
| $count = keys %distinct_agent; |
| } |
| else |
| { |
| $count = $topagents; |
| } |
| printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
| keys %distinct_agent ) |
| { |
| printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
| $distinct_agent{$agent}, |
| ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| printf "%s\n", ¢red("Top $count User Agents by number of posts", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $agent (sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent) { |
| printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($agent, 58, "."), $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100); |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ########################## |
| #show timezones and counts |
| ########################## |
| unless ( $skipSec{10} ) { |
| if (keys %tz < $toptz) { |
| $count = keys %tz; |
| } else { |
| $count = $toptz; |
| unless ( $skipSec{10} ) |
| { |
| if ( keys %tz < $toptz ) |
| { |
| $count = keys %tz; |
| } |
| else |
| { |
| $count = $toptz; |
| } |
| printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
| { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
| last if ( ++$i == $count ); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| printf "%s\n", ¢red("Top 10 time zones", 76); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) { |
| printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone}; |
| last if (++$i == $count); |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| |
| ################################ SUBROUTINES ################################ |
| |
| |
| ####################################### |
| # get current article's header and body |
| ####################################### |
| sub getarticle { |
| %headers = (); # dump old headers |
| my $filename = shift; # get the name of the file |
| # get stats about the file itself |
| $filesize = -s $filename; # get total size of file |
| $totsize += $filesize; # bump total sizes of all files |
| sub getarticle |
| { |
| %headers = (); # dump old headers |
| my $filename = shift; # get the name of the file |
| |
| my $mtime = (stat $filename)[9]; |
| if ( $mtime < $earliest ) { |
| # get stats about the file itself |
| $filesize = -s $filename; # get total size of file |
| $totsize += $filesize; # bump total sizes of all files |
| |
| my $mtime = ( stat $filename )[9]; |
| if ( $mtime < $earliest ) |
| { |
| $earliest = $mtime; |
| } elsif ( $mtime > $latest ) { |
| } |
| elsif ( $mtime > $latest ) |
| { |
| $latest = $mtime; |
| } |
| } |
| |
| # now read the file |
| open(FILE, $filename) or die "Can't open $filename: $!\n"; |
| while (<FILE>) { |
| $totheader += length($_); # bump total header size |
| last if (/^\s*$/); # end of header? |
| if (/^([^:\s]*):\s+(.*)/) { |
| my($key,$val) = ($1,$2); |
| $headers{$key} = decode('MIME-Header', $val); |
| $lcheader{clean(lc($key))} = clean($val); |
| # now read the file |
| open( FILE, $filename ) or die "Can't open $filename: $!\n"; |
| while (<FILE>) |
| { |
| $totheader += length($_); # bump total header size |
| last if (/^\s*$/); # end of header? |
| if (/^([^:\s]*):\s+(.*)/) |
| { |
| my ( $key, $val ) = ( $1, $2 ); |
| $headers{$key} = decode( 'MIME-Header', $val ); |
| $lcheader{ clean( lc($key) ) } = clean($val); |
| } |
| } |
| } |
| @body = <FILE>; # slurp up body |
| close(FILE); |
| } # getarticle |
| @body = <FILE>; # slurp up body |
| close(FILE); |
| } # getarticle |
| |
| ################################### |
| # get data from the current article |
| ################################### |
| sub getdata { |
| sub getdata |
| { |
| #### First, analyse header fields #### |
| |
| # Set up this poster if not defined, get counts, sizes |
| $poster = encode('UTF-8', $headers{From}); # get the poster's name |
| if (!defined($data{$poster})) { # seen this one before? |
| $data{$poster}{agent} = 'Unknown'; # comes after For: field |
| $data{$poster}{orig} = 0; |
| $data{$poster}{quoted} = 0; |
| } |
| $data{$poster}{count}++; # bump count for this poster |
| $data{$poster}{size} += $filesize; # total size of file |
| # Set up this poster if not defined, get counts, sizes |
| $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name |
| if ( !defined( $data{$poster} ) ) |
| { # seen this one before? |
| $data{$poster}{agent} = 'Unknown'; # comes after For: field |
| $data{$poster}{orig} = 0; |
| $data{$poster}{quoted} = 0; |
| } |
| $data{$poster}{count}++; # bump count for this poster |
| $data{$poster}{size} += $filesize; # total size of file |
| |
| # The User-Agent and/or X-Newsreader fields |
| # for User-Agent by poster |
| if (defined $lcheader{"user-agent"}) { |
| $data{$poster}{agent} = $lcheader{"user-agent"}; |
| } |
| if (defined $lcheader{"x-newsreader"}) { |
| $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
| } |
| # The User-Agent and/or X-Newsreader fields |
| # for User-Agent by poster |
| if ( defined $lcheader{"user-agent"} ) |
| { |
| $data{$poster}{agent} = $lcheader{"user-agent"}; |
| } |
| if ( defined $lcheader{"x-newsreader"} ) |
| { |
| $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
| } |
| |
| # The User Agent for User-Agent by number of posts |
| my $UA = "unknown"; |
| foreach my $keys ( keys %lcheader ) |
| { |
| if (defined $lcheader{'user-agent'}) |
| { |
| $UA = $lcheader{'user-agent'}; |
| } |
| elsif (defined $lcheader{"x-newsreader"}) |
| { |
| $UA = $lcheader{"x-newsreader"}; |
| } |
| elsif (defined $lcheader{'x-mailer'}) |
| { |
| $UA = $lcheader{'x-mailer'}; |
| } |
| elsif ((defined $lcheader{'organization'}) && |
| ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) |
| { |
| $UA = $lcheader{'organization'}; |
| } |
| elsif ( $lcheader{'message-id'} =~ /pine/i ) |
| { |
| $UA = "Pine"; |
| } ## Hopefully found UA, else set to unknown |
| } |
| # The User Agent for User-Agent by number of posts |
| my $UA = "unknown"; |
| foreach my $keys ( keys %lcheader ) |
| { |
| if ( defined $lcheader{'user-agent'} ) |
| { |
| $UA = $lcheader{'user-agent'}; |
| } |
| elsif ( defined $lcheader{"x-newsreader"} ) |
| { |
| $UA = $lcheader{"x-newsreader"}; |
| } |
| elsif ( defined $lcheader{'x-mailer'} ) |
| { |
| $UA = $lcheader{'x-mailer'}; |
| } |
| elsif ( |
| ( defined $lcheader{'organization'} ) |
| && ( $lcheader{'organization'} =~ |
| /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
| ) |
| { |
| $UA = $lcheader{'organization'}; |
| } |
| elsif ( $lcheader{'message-id'} =~ /pine/i ) |
| { |
| $UA = "Pine"; |
| } ## Hopefully found UA, else set to unknown |
| } |
| |
| $UA = clean($UA); |
| $UA = get_agent($UA); |
| |
| $UA = clean($UA); |
| $UA = get_agent($UA); |
| sub get_agent |
| { |
| my $raw = shift; |
| my $agent = $raw; |
| |
| ## strip http |
| if ( $raw =~ /.*http.*/ ) |
| { |
| $raw =~ s!posted via!!i; |
| $raw =~ s!http://!!g; |
| $raw =~ s!/!!g; |
| $raw =~ s! !!g; |
| } |
| |
| sub get_agent { |
| my $raw = shift; |
| my $agent = $raw; |
| ## Fix Outlook from Mac |
| if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
| |
| ## strip http |
| if ( $raw =~ /.*http.*/ ) { |
| $raw =~ s!posted via!!i; |
| $raw =~ s!http://!!g; |
| $raw =~ s!/!!g; |
| $raw =~ s! !!g; |
| } |
| |
| ## Fix Outlook from Mac |
| if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;} |
| |
| ## Pick out the popular agents |
| if ( $raw =~ /(outlook express)/i || |
| $raw =~ /(microplanet gravity)/i || |
| $raw =~ /(news rover)/i || |
| $raw =~ /(forte agent)/i || |
| $raw =~ /(forte free agent)/i |
| ) |
| { |
| $agent = $1; |
| } |
| elsif ( $raw =~ /^( |
| ## Pick out the popular agents |
| if ( $raw =~ /(outlook express)/i |
| || $raw =~ /(microplanet gravity)/i |
| || $raw =~ /(news rover)/i |
| || $raw =~ /(forte agent)/i |
| || $raw =~ /(forte free agent)/i ) |
| { |
| $agent = $1; |
| } |
| elsif ( |
| $raw =~ /^( |
| pan |
| |sylpheed |
| |slrn |
| 558,193 → 648,240 |
| |007 |
| |webtv |
| |compuserve |
| )/ix ) |
| { |
| $agent = $1; |
| } |
| else |
| { |
| ## Clean up unknown agents |
| if ( $raw =~ m!^(.*?)/! ) { |
| $agent = $1; |
| } |
| elsif ( $raw =~ /^(\w*)\d.*/ ) |
| { |
| $agent = $1; |
| } |
| )/ix |
| ) |
| { |
| $agent = $1; |
| } |
| else |
| { |
| ## Clean up unknown agents |
| if ( $raw =~ m!^(.*?)/! ) |
| { |
| $agent = $1; |
| } |
| elsif ( $raw =~ /^(\w*)\d.*/ ) |
| { |
| $agent = $1; |
| } |
| } |
| |
| $distinct_agent{$agent}++; |
| return $agent; |
| } |
| |
| $distinct_agent{$agent}++; |
| return $agent; |
| } |
| # Get all cross-posted newsgroups |
| for ( split /,/, $headers{"Newsgroups"} ) |
| { |
| $crossposts{$_}++; # bump count for each |
| } |
| |
| # Get threads |
| my $thread = encode( 'UTF-8', $headers{"Subject"} ); |
| $thread =~ s/^re: //i; # Remove Re: or re: at start |
| $thread =~ s/\s+/ /g; # collapse whitespace |
| $threads{$thread}{count} += 1; # bump count of this subject |
| $threads{$thread}{size} += $filesize; # bump bytes for this thread |
| |
| # Get all cross-posted newsgroups |
| for (split /,/, $headers{"Newsgroups"}) { |
| $crossposts{$_}++; # bump count for each |
| } |
| # Is this an original post or a reply? |
| if ( defined $headers{"References"} ) |
| { |
| $replies++; |
| } |
| else |
| { |
| $origposts++; |
| } |
| |
| # Get threads |
| my $thread = encode('UTF-8', $headers{"Subject"}); |
| $thread =~ s/^re: //i; # Remove Re: or re: at start |
| $thread =~ s/\s+/ /g; # collapse whitespace |
| $threads{$thread}{count} += 1; # bump count of this subject |
| $threads{$thread}{size} += $filesize; # bump bytes for this thread |
| # Get the time zone |
| $_ = $headers{"Date"}; |
| my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
| if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
| { |
| $tz = "UTC"; |
| } |
| $tz{$tz}++; |
| |
| # Is this an original post or a reply? |
| if (defined $headers{"References"}) { |
| $replies++; |
| } else { |
| $origposts++; |
| } |
| #### Now analyse the body text #### |
| my $insig = 0; |
| for (@body) |
| { |
| $totbody += length($_); # bump total body size |
| next if (/^$>/); # don't count blank lines in body |
| if ( $insig == 1 ) |
| { |
| $totsig += length($_); # bump total sig size |
| |
| # Get the time zone |
| $_ = $headers{"Date"}; |
| my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
| if (($tz =~ /UTC/) or ($tz =~ /GMT/) or ($tz =~ /0000/)) { |
| $tz = "UTC"; |
| } |
| $tz{$tz}++; |
| # Bill Unruh uses ] quotes, and another poster uses :: |
| } |
| elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
| { # are we in a quote line? |
| $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
| $totquoted += length($_); |
| } |
| elsif (/-- /) |
| { |
| $insig = 1; |
| } |
| else |
| { |
| |
| #### Now analyse the body text #### |
| my $insig = 0; |
| for (@body) { |
| $totbody += length($_); # bump total body size |
| next if (/^$>/); # don't count blank lines in body |
| if ($insig == 1) { |
| $totsig += length($_); # bump total sig size |
| # Bill Unruh uses ] quotes, and another poster uses :: |
| } elsif (/^\s*[>\]]/ or /^\s*::/) { # are we in a quote line? |
| $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
| $totquoted += length($_); |
| } elsif (/-- /) { |
| $insig = 1; |
| } else { |
| # we must be processing an original line |
| $data{$poster}{orig} += length($_); # bump count of original chrs |
| $totorig += length($_); |
| } |
| } # end for (@body) |
| # we must be processing an original line |
| $data{$poster}{orig} += length($_); # bump count of original chrs |
| $totorig += length($_); |
| } |
| } # end for (@body) |
| |
| } # getdata |
| } # getdata |
| |
| ######################################## |
| # Count the User-Agents used, collapsing |
| # different versions into one per agent. |
| ######################################## |
| sub countagents { |
| sub countagents |
| { |
| POSTER: |
| foreach $poster (keys %data) { |
| foreach my $agent_name (keys %distinct_agent) { # check against known ones |
| if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) { |
| $agents{$agent_name}++; |
| next POSTER; |
| } |
| } |
| $agents{$data{$poster}{agent}}++; |
| } |
| } # countagents |
| foreach $poster ( keys %data ) |
| { |
| foreach my $agent_name ( keys %distinct_agent ) |
| { # check against known ones |
| if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
| { |
| $agents{$agent_name}++; |
| next POSTER; |
| } |
| } |
| $agents{ $data{$poster}{agent} }++; |
| } |
| } # countagents |
| |
| ############################################ |
| # set orig/total percentages for all posters |
| ############################################ |
| sub fixpercent { |
| foreach $poster (keys %data) { |
| my $percent = 100; |
| if (($data{$poster}{orig} != 0) and ($data{$poster}{quoted} != 0)) { |
| $percent = $data{$poster}{orig} * 100 / ($data{$poster}{quoted} + $data{$poster}{orig}); #/ |
| } elsif ($data{$poster}{orig} == 0) { |
| $percent = 0; |
| } |
| $data{$poster}{percent} = $percent; |
| } |
| sub fixpercent |
| { |
| foreach $poster ( keys %data ) |
| { |
| my $percent = 100; |
| if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
| { |
| $percent = $data{$poster}{orig} * 100 / |
| ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
| } |
| elsif ( $data{$poster}{orig} == 0 ) |
| { |
| $percent = 0; |
| } |
| $data{$poster}{percent} = $percent; |
| } |
| } |
| |
| ############################## |
| # right pad a string with '.'s |
| ############################## |
| sub rpad { |
| # get text to pad, length to pad, pad chr |
| my ($text, $pad_len, $pad_chr) = @_; |
| if (length($text) > $pad_len) { |
| $text = substr($text, 0, $pad_len); |
| } |
| my $padded = $text . $pad_chr x ( $pad_len - length( $text ) ); |
| return $padded; |
| sub rpad |
| { |
| |
| # get text to pad, length to pad, pad chr |
| my ( $text, $pad_len, $pad_chr ) = @_; |
| if ( length($text) > $pad_len ) |
| { |
| $text = substr( $text, 0, $pad_len ); |
| } |
| my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
| return $padded; |
| } |
| |
| ################# |
| # centre a string |
| ################# |
| sub centred { |
| my ($text, $width) = @_; # text to centre, size of field to centre in |
| my $pad_len = ($width - length($text)) / 2; #/ |
| my $centred = " " x $pad_len . $text; |
| return $centred; |
| sub centred |
| { |
| my ( $text, $width ) = @_; # text to centre, size of field to centre in |
| my $pad_len = ( $width - length($text) ) / 2; #/ |
| my $centred = " " x $pad_len . $text; |
| return $centred; |
| } |
| |
| ########################## |
| # put commas into a number |
| ########################## |
| sub commify { |
| $_ = shift; |
| 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
| return $_; |
| sub commify |
| { |
| $_ = shift; |
| 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
| return $_; |
| } |
| |
| ######################### |
| # clean |
| ######################### |
| sub clean { |
| my $dirty = shift; |
| my $clean = $dirty; |
| $clean =~ s/^\s*//; |
| $clean =~ s/\s*$//; |
| sub clean |
| { |
| my $dirty = shift; |
| my $clean = $dirty; |
| $clean =~ s/^\s*//; |
| $clean =~ s/\s*$//; |
| |
| return $clean; |
| return $clean; |
| } |
| |
| sub usage |
| { |
| |
| sub usage { |
| |
| print "usage: newstat.pl newsgroupname\n"; |
| exit 1; |
| print "usage: newstat.pl newsgroupname\n"; |
| exit 1; |
| } |
| |
| ################################### |
| # Write data structures to a file # |
| ################################### |
| sub writedata { |
| open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
| print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
| print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
| foreach my $name (keys %data) { |
| print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
| } |
| print OUTF "============================================================================\n"; |
| print OUTF "Thread subjects\n"; |
| print OUTF "----------------------------------------------------------------------------\n"; |
| foreach my $thread (sort {"\L$a" cmp "\L$b"} keys %threads) { |
| print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
| } |
| print OUTF "============================================================================\n"; |
| print OUTF "Cross-posts\n"; |
| print OUTF "----------------------------------------------------------------------------\n"; |
| foreach my $name (sort keys %crossposts) { |
| print OUTF "$name : $crossposts{$name}\n"; |
| } |
| print OUTF |
| print OUTF "============================================================================\n"; |
| print OUTF "User agents\n"; |
| print OUTF "----------------------------------------------------------------------------\n"; |
| foreach my $name (sort keys %agents) { |
| print OUTF "$name : $agents{$name}\n"; |
| } |
| print OUTF "============================================================================\n"; |
| print OUTF "Time zones\n"; |
| print OUTF "----------------------------------------------------------------------------\n"; |
| foreach my $name (sort keys %tz) { |
| print OUTF "$name : $tz{$name}\n"; |
| } |
| close OUTF; |
| } # writedata |
| sub writedata |
| { |
| open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
| print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
| print OUTF |
| "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
| foreach my $name ( keys %data ) |
| { |
| print OUTF |
| "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
| } |
| print OUTF |
| "============================================================================\n"; |
| print OUTF "Thread subjects\n"; |
| print OUTF |
| "----------------------------------------------------------------------------\n"; |
| foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
| { |
| print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
| } |
| print OUTF |
| "============================================================================\n"; |
| print OUTF "Cross-posts\n"; |
| print OUTF |
| "----------------------------------------------------------------------------\n"; |
| foreach my $name ( sort keys %crossposts ) |
| { |
| print OUTF "$name : $crossposts{$name}\n"; |
| } |
| print OUTF print OUTF |
| "============================================================================\n"; |
| print OUTF "User agents\n"; |
| print OUTF |
| "----------------------------------------------------------------------------\n"; |
| foreach my $name ( sort keys %agents ) |
| { |
| print OUTF "$name : $agents{$name}\n"; |
| } |
| print OUTF |
| "============================================================================\n"; |
| print OUTF "Time zones\n"; |
| print OUTF |
| "----------------------------------------------------------------------------\n"; |
| foreach my $name ( sort keys %tz ) |
| { |
| print OUTF "$name : $tz{$name}\n"; |
| } |
| close OUTF; |
| } # writedata |