| 1,80 → 1,35 |
| #!/usr/bin/env perl |
| use strict; |
| use warnings; |
| use diagnostics; |
| use utf8; |
| use encoding 'utf-8'; |
| use Encode; |
| |
| ########################### |
| # newsstat.pl version 0.4.2 |
| ## Print out all text to STDOUT UTF-8 encoded |
| binmode STDOUT, ':encoding(UTF-8)'; |
| |
| ############################################################################ |
| # Collect statistics about a newsgroup (specified by first argument) in |
| # the local news spool. Check all articles in the last 30-day period. |
| # Rank posters by number of posts and by volume of posts, report on top and |
| # bottom 20 posters. Show their name, number of posts, size of posts, |
| # percentage of quoted lines. Rank user-agents used, by poster rather than |
| # by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
| # |
| # (Numbers and paths can be configured below. -- PE) |
| ############################################################################ |
| ############################ |
| ## newsstat.pl version 0.4.3 |
| |
| ############################################################################ |
| # RECENT CHANGES # |
| # 2011-10-03 PE - Use more compatible shebang |
| # - Fixed some Perl::Critic-ized code |
| # - Fixed wrong indent for non-ASCII names |
| # - Formatted source code |
| # 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
| # - Use warnings, utf8 (just in case) |
| # - Documentation update |
| # N/A NN - Take newsgroup name as argument |
| # 2004-06-19 NN - newsgroup name is $ARGV[0] |
| # - Allow command line flags for subtracting |
| # output if not pertinent for a group |
| # 2002-11-09 NN - Put Garry's writedata() function back in. |
| # - added "rn" to my list of UA's |
| # - Started using %distinct_agent for both User agent |
| # sections |
| # - named it newsstat.pl version 0.3 |
| # 2002-11-06 NN - Fixed the earliest/latest file problem by using |
| # mtime rather than ctime, and simplifying the logic |
| # 2002-11-05 NN - moved user configurations to the top |
| # - fixed the cross-posting section |
| # - introduced the $newsgroup_name variable which |
| # later becomes $news$group |
| # - changed $name to $agent_name in countagents() |
| # |
| # Contributors |
| # ------------- |
| # NN Nomen nominandum (name to be determined later) |
| # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de> |
| ########################################################################### |
| ## Collect statistics about a newsgroup (specified by first argument) |
| ## in the local news spool. Check all articles in the last 30-day period. |
| ## Rank posters by number of posts and by volume of posts, report on top |
| ## and bottom 20 posters. Show their name, number of posts, size of posts, |
| ## percentage of quoted lines. Rank user-agents used, by poster rather |
| ## than by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
| ## |
| ## Numbers and paths can be configured below. See ChangeLog and TODO |
| ## for more. -- PE |
| ########################################################################### |
| |
| ########### TODO ############# |
| # Commas in bottom section of report |
| # Show date the figures were compiled |
| # No. of HTML articles (Content-Type: text/html) |
| # No. of quoted sigs (/>\s*-- /) |
| # Per cent of top-posted articles |
| # Top 10 cross-posters |
| # Top 20 news posting hosts (from Path) |
| # Count of certain subject words: newbie, kde, burner, sendmail, etc. |
| # Count *all* User Agents that each poster uses |
| # What do we do about Bill Unruh's ] quote style? |
| # Change the way dates/times are checked |
| # include % share in posters by no. of arts |
| # include % share in posters by size |
| # Total, orig & quoted lines by user agent with per cent |
| # Take more arguments |
| ####################################################### |
| |
| ###################### USER CONFIGURATIONS ############################ |
| |
| # The name of the group to do stats for |
| ## The name of the group to do stats for |
| my $newsgroup_name = $ARGV[0]; |
| $newsgroup_name or &usage; |
| |
| # Check for removal flags |
| ## Check for removal flags |
| my $ix; |
| my $j; |
| my %skipSec; |
| 97,25 → 52,25 |
| $skipSec{$_} = 1; |
| } |
| |
| # Leafnode users will want /var/spool/news for this variable. |
| ## Leafnode users will want /var/spool/news for this variable. |
| my $news = "/var/spool/news/"; |
| |
| # How many days are we doing statistics for? |
| ## How many days are we doing statistics for? |
| my $numdays = 30; |
| |
| # no. of agents we list |
| ## Number of agents we list |
| my $topagents = 10; |
| |
| # no. of threads we want to know about |
| ## Number of threads we want to know about |
| my $topthreads = 20; |
| |
| # no. of top or bottom posters to show |
| ## Number of top or bottom posters to show |
| my $topposters = 20; |
| |
| # no. of cross-posted threads to show |
| ## Number of cross-posted threads to show |
| my $topcrossposts = 10; |
| |
| # no. of time zones to show |
| ## Number of time zones to show |
| my $toptz = 10; |
| |
| ###################### DATA STRUCTURES ###################### |
| 145,30 → 100,32 |
| my $replies = 0; # total no. of replies |
| my $i; # general purpose |
| my %distinct_agent; |
| my %agents = # used to hold counts of User Agents used |
| ( |
| "KNode" => 0, |
| "Pan" => 0, |
| "Mozilla" => 0, |
| "Sylpheed" => 0, |
| "Gnus" => 0, |
| |
| ## Used to hold counts of User Agents used |
| my %agents = ( |
| "Compuserver" => 0, |
| "Foorum" => 0, |
| "Forte Agent" => 0, |
| "Forte Free Agent" => 0, |
| "Gnus" => 0, |
| "KNode" => 0, |
| "MacSOUP" => 0, |
| "MT-NewsWatcher" => 0, |
| "MicroPlanet Gravity" => 0, |
| "Microsoft Outlook Express" => 0, |
| "Xnews" => 0, |
| "Microsoft Windows Mail" => 0, |
| "Mozilla" => 0, |
| "News Rover" => 0, |
| "NN" => 0, |
| "Pan" => 0, |
| "rn" => 0, |
| "slrn" => 0, |
| "Sylpheed" => 0, |
| "tin" => 0, |
| "rn" => 0, |
| "NN" => 0, |
| "MacSOUP" => 0, |
| "Foorum" => 0, |
| "MT-NewsWatcher" => 0, |
| "News Rover" => 0, |
| "VSoup" => 0, |
| "WebTV" => 0, |
| "Compuserver" => 0, |
| "VSoup" => 0 |
| ); |
| "Xnews" => 0 |
| ); |
| |
| ######################## MAIN CODE ######################## |
| $! = 1; |
| 178,22 → 135,22 |
| while ( defined( $filename = readdir(DIR) ) ) |
| { |
| %lcheader = (); |
| next unless -f $filename; # only want real files |
| next if ( $filename eq ".overview" ); # real articles only |
| next if ( -M $filename > $numdays ); # only want articles <= a certain age |
| next unless -f $filename; # only want real files |
| next if ( $filename eq ".overview" ); # real articles only |
| next if ( -M $filename > $numdays ); # only want articles <= a certain age |
| $earliest = ( stat $filename )[9] unless defined($earliest); |
| $latest = ( stat $filename )[9] unless defined($latest); |
| &getarticle($filename); # read in the article |
| &getdata; # grab the data from the article |
| $totalposts++; # bump count of articles considered |
| &get_article($filename); # read in the article |
| &get_data; # grab the data from the article |
| $totalposts++; # bump count of articles considered |
| } |
| closedir(DIR); # finished with the directory |
| closedir(DIR); # finished with the directory |
| |
| # post-processing |
| &countagents; # count agents, collapsing versions |
| &fixpercent; # check percentages orig/total for posters |
| ## Post-processing |
| &count_agents; # count agents, collapsing versions |
| &fix_percent; # check percentages orig/total for posters |
| |
| &writedata; |
| &write_data; |
| |
| #################### DISPLAY RESULTS ##################### |
| print "=" x 76, "\n"; |
| 208,15 → 165,15 |
| printf "Latest article: %s\n", scalar localtime($latest); |
| printf "Original articles: %s, replies: %s\n", commify($origposts), |
| commify($replies); |
| printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
| printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
| commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
| printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
| printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
| commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
| commify( int( $totsize / $totalposts ) ); |
| my $count = keys %data; |
| printf "Total headers: %s KB bodies: %s KB\n", |
| printf "Total headers: %s KiB bodies: %s KiB\n", |
| commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
| printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
| printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
| commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
| ( $totorig * 100 ) / ( $totorig + $totquoted ), |
| commify( int( $totsig / 1024 ) ); |
| 225,12 → 182,12 |
| $count = keys %threads; |
| printf "Total number of threads: %s, average %s bytes per thread\n", |
| commify($count), commify( int( $totsize / $count ) ); #/ |
| printf "Total number of User-Agents: %d\n", scalar keys %agents; |
| printf "Total number of user agents: %d\n", scalar keys %agents; |
| print "\n", "=" x 76, "\n"; |
| |
| ############################### |
| # show posters by article count Sec 1; |
| ############################### |
| ######################################## |
| ## Show posters by article count Sec 1; |
| ######################################## |
| unless ( $skipSec{1} ) |
| { |
| if ( keys %data < $topposters ) |
| 255,9 → 212,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ################################ |
| # show posters by size in Kbytes Sec 2; |
| ################################ |
| ###################################### |
| ## Show posters by size in KiB Sec 2; |
| ###################################### |
| unless ( $skipSec{2} ) |
| { |
| if ( keys %data < $topposters ) |
| 268,7 → 225,7 |
| { |
| $count = $topposters; |
| } |
| printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
| printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
| 281,9 → 238,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| #################################### |
| # show top posters for original text |
| #################################### |
| ##################################### |
| ## Show top posters for original text |
| ##################################### |
| unless ( $skipSec{3} ) |
| { |
| if ( keys %data < $topposters ) |
| 313,9 → 270,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ####################################### |
| # show bottom posters for original text |
| ####################################### |
| ######################################## |
| ## Show bottom posters for original text |
| ######################################## |
| unless ( $skipSec{4} ) |
| { |
| if ( keys %data < $topposters ) |
| 345,9 → 302,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| #################################### |
| # show threads by number of articles |
| #################################### |
| ##################################### |
| ## Show threads by number of articles |
| ##################################### |
| unless ( $skipSec{5} ) |
| { |
| if ( keys %threads < $topthreads ) |
| 373,9 → 330,10 |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| ################################ |
| # show threads by size in Kbytes |
| ################################ |
| |
| ############################## |
| ## Show threads by size in KiB |
| ############################## |
| unless ( $skipSec{6} ) |
| { |
| if ( keys %threads < $topthreads ) |
| 386,7 → 344,7 |
| { |
| $count = $topthreads; |
| } |
| printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
| printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
| print "=" x 76, "\n"; |
| $i = 0; |
| foreach my $thread ( |
| 402,9 → 360,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ################################# |
| # show top 10 cross-posted groups |
| ################################# |
| ################################## |
| ## Show top 10 cross-posted groups |
| ################################## |
| unless ( $skipSec{7} ) |
| { |
| delete $crossposts{"$newsgroup_name"}; # don't include ours |
| 428,9 → 386,10 |
| } |
| print "\n", "=" x 76, "\n"; |
| } |
| ####################### |
| #show agents and counts |
| ####################### |
| |
| ######################### |
| ## Show agents and counts |
| ######################### |
| unless ( $skipSec{8} ) |
| { |
| if ( keys %agents < $topagents ) |
| 454,7 → 413,7 |
| } |
| |
| ####################### |
| #show distinct agents |
| ## Show distinct agents |
| ####################### |
| unless ( $skipSec{9} ) |
| { |
| 482,9 → 441,9 |
| print "\n", "=" x 76, "\n"; |
| } |
| |
| ########################## |
| #show timezones and counts |
| ########################## |
| ############################ |
| ## Show timezones and counts |
| ############################ |
| unless ( $skipSec{10} ) |
| { |
| if ( keys %tz < $toptz ) |
| 508,15 → 467,15 |
| |
| ################################ SUBROUTINES ################################ |
| |
| ####################################### |
| # get current article's header and body |
| ####################################### |
| sub getarticle |
| ######################################## |
| ## Get current article's header and body |
| ######################################## |
| sub get_article |
| { |
| %headers = (); # dump old headers |
| my $filename = shift; # get the name of the file |
| |
| # get stats about the file itself |
| ## get stats about the file itself |
| $filesize = -s $filename; # get total size of file |
| $totsize += $filesize; # bump total sizes of all files |
| |
| 530,13 → 489,13 |
| $latest = $mtime; |
| } |
| |
| # now read the file |
| open( my $FILE, $filename ) or die "Can't open $filename: $!\n"; |
| ## now read the file |
| open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
| while (<$FILE>) |
| { |
| $totheader += length($_); # bump total header size |
| last if (/^\s*$/); # end of header? |
| if (/^([^:\s]*):\s+(.*)/) |
| if (/^([^:\s]*):\s*(.*)/) |
| { |
| my ( $key, $val ) = ( $1, $2 ); |
| $headers{$key} = decode( 'MIME-Header', $val ); |
| 545,19 → 504,29 |
| } |
| @body = <$FILE>; # slurp up body |
| close($FILE); |
| } # getarticle |
| } # get_article |
| |
| ################################### |
| # get data from the current article |
| ################################### |
| sub getdata |
| #################################### |
| ## Get data from the current article |
| #################################### |
| sub get_data |
| { |
| #### First, analyse header fields #### |
| |
| # Set up this poster if not defined, get counts, sizes |
| ## Set up this poster if not defined, get counts, sizes |
| my $poster = $headers{From}; # get the poster's name |
| |
| # Convert old to new format |
| $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
| |
| # Collapse whitespace |
| $poster =~ s/\s+/ /; |
| |
| # Remove outer quotes |
| $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
| |
| if ( !defined( $data{$poster} ) ) |
| { # seen this one before? |
| { # seen this one before? |
| $data{$poster}{agent} = 'Unknown'; # comes after For: field |
| $data{$poster}{orig} = 0; |
| $data{$poster}{quoted} = 0; |
| 565,8 → 534,8 |
| $data{$poster}{count}++; # bump count for this poster |
| $data{$poster}{size} += $filesize; # total size of file |
| |
| # The User-Agent and/or X-Newsreader fields |
| # for User-Agent by poster |
| ## The User-Agent and/or X-Newsreader fields |
| ## for User-Agent by poster |
| if ( defined $lcheader{"user-agent"} ) |
| { |
| $data{$poster}{agent} = $lcheader{"user-agent"}; |
| 576,7 → 545,7 |
| $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
| } |
| |
| # The User Agent for User-Agent by number of posts |
| ## The User Agent for User-Agent by number of posts |
| my $UA = "unknown"; |
| foreach my $keys ( keys %lcheader ) |
| { |
| 627,11 → 596,14 |
| if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
| |
| ## Pick out the popular agents |
| if ( $raw =~ /(outlook express)/i |
| || $raw =~ /(microplanet gravity)/i |
| || $raw =~ /(news rover)/i |
| || $raw =~ /(forte agent)/i |
| || $raw =~ /(forte free agent)/i ) |
| if ( |
| $raw =~ /(outlook express)/i |
| || $raw =~ /(windows mail)/i |
| || $raw =~ /(microplanet gravity)/i |
| || $raw =~ /(news rover)/i |
| || $raw =~ /(forte agent)/i |
| || $raw =~ /(forte free agent)/i |
| ) |
| { |
| $agent = $1; |
| } |
| 686,13 → 658,13 |
| return $agent; |
| } |
| |
| # Get all cross-posted newsgroups |
| ## Get all cross-posted newsgroups |
| for ( split /,/, $headers{"Newsgroups"} ) |
| { |
| $crossposts{$_}++; # bump count for each |
| } |
| |
| # Get threads |
| ## Get threads |
| my $thread = $headers{"Subject"}; |
| $thread =~ s/^re: //i; # Remove Re: or re: at start |
| $thread =~ s/\s+/ /g; # collapse whitespace |
| 699,7 → 671,7 |
| $threads{$thread}{count} += 1; # bump count of this subject |
| $threads{$thread}{size} += $filesize; # bump bytes for this thread |
| |
| # Is this an original post or a reply? |
| ## Is this an original post or a reply? |
| if ( defined $headers{"References"} ) |
| { |
| $replies++; |
| 709,9 → 681,9 |
| $origposts++; |
| } |
| |
| # Get the time zone |
| ## Get the time zone |
| $_ = $headers{"Date"}; |
| my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
| my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
| if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
| { |
| $tz = "UTC"; |
| 728,7 → 700,7 |
| { |
| $totsig += length($_); # bump total sig size |
| |
| # Bill Unruh uses ] quotes, and another poster uses :: |
| ## Bill Unruh uses ] quotes, and another poster uses :: |
| } |
| elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
| { # are we in a quote line? |
| 742,19 → 714,19 |
| else |
| { |
| |
| # we must be processing an original line |
| ## We must be processing an original line |
| $data{$poster}{orig} += length($_); # bump count of original chrs |
| $totorig += length($_); |
| } |
| } # end for (@body) |
| |
| } # getdata |
| } # get_data |
| |
| ######################################## |
| # Count the User-Agents used, collapsing |
| # different versions into one per agent. |
| ######################################## |
| sub countagents |
| ######################################### |
| ## Count the User-Agents used, collapsing |
| ## different versions into one per agent. |
| ######################################### |
| sub count_agents |
| { |
| POSTER: |
| foreach my $poster ( keys %data ) |
| 769,12 → 741,12 |
| } |
| $agents{ $data{$poster}{agent} }++; |
| } |
| } # countagents |
| } # count_agents |
| |
| ############################################ |
| # set orig/total percentages for all posters |
| ############################################ |
| sub fixpercent |
| ############################################# |
| ## Set orig/total percentages for all posters |
| ############################################# |
| sub fix_percent |
| { |
| foreach my $poster ( keys %data ) |
| { |
| 793,16 → 765,16 |
| } |
| } |
| |
| ############################## |
| # right pad a string with '.'s |
| ############################## |
| ############################### |
| ## Right pad a string with '.'s |
| ############################### |
| sub rpad |
| { |
| # get text to pad, length to pad, pad chr |
| ## Get text to pad, length to pad, pad chr |
| my ( $text, $pad_len, $pad_chr ) = @_; |
| |
| ### DEBUG |
| # printf "|%s| = %d\n", $text, length($text); |
| ## DEBUG |
| #printf "|%s| = %d\n", $text, length($text); |
| |
| if ( length($text) > $pad_len ) |
| { |
| 812,9 → 784,9 |
| return $padded; |
| } |
| |
| ################# |
| # centre a string |
| ################# |
| ################## |
| ## Centre a string |
| ################## |
| sub centred |
| { |
| my ( $text, $width ) = @_; # text to centre, size of field to centre in |
| 823,25 → 795,24 |
| return $centred; |
| } |
| |
| ########################## |
| # put commas into a number |
| ########################## |
| ########################### |
| ## Put commas into a number |
| ########################### |
| sub commify |
| { |
| $_ = shift; |
| 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
| local $_ = shift; |
| 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
| return $_; |
| } |
| |
| ######################### |
| # clean |
| ######################### |
| ################################################################ |
| ## Returns a string with leading and trailing whitespace removed |
| ################################################################ |
| sub clean |
| { |
| my $dirty = shift; |
| my $clean = $dirty; |
| $clean =~ s/^\s*//; |
| $clean =~ s/\s*$//; |
| $clean =~ s/^\s*|\s*$//g; |
| |
| return $clean; |
| } |
| 848,18 → 819,18 |
| |
| sub usage |
| { |
| |
| print "usage: newstat.pl newsgroupname\n"; |
| exit 1; |
| } |
| |
| ################################### |
| # Write data structures to a file # |
| ################################### |
| sub writedata |
| ################################## |
| ## Write data structures to a file |
| ################################## |
| sub write_data |
| { |
| open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
| print $OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
| open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
| or die "Can't create XDATA: $!\n"; |
| print $OUTF "Data collected from $newsgroup_name\n\n"; |
| print $OUTF |
| "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
| foreach my $name ( keys %data ) |
| 886,7 → 857,7 |
| { |
| print $OUTF "$name : $crossposts{$name}\n"; |
| } |
| print $OUTF print $OUTF |
| print $OUTF |
| "============================================================================\n"; |
| print $OUTF "User agents\n"; |
| print $OUTF |
| 905,4 → 876,4 |
| print $OUTF "$name : $tz{$name}\n"; |
| } |
| close $OUTF; |
| } # writedata |
| } # write_data |