#!/usr/bin/perl use strict; use warnings; use utf8; use Encode; ######################### # newsstat.pl version 0.4 ############################################################################ # Collect statistics about a newsgroup (specified by first argument) in # the local news spool. Check all articles in the last 30-day period. # Rank posters by number of posts and by volume of posts, report on top and # bottom 20 posters. Show their name, number of posts, size of posts, # percentage of quoted lines. Rank user-agents used, by poster rather than # by post. Rank top 20 threads. Rank top 10 cross-posted groups. # # (Numbers and paths can be configured below. -- PE) ############################################################################ ############################################################################ # RECENT CHANGES # # 2011-07-03 PE - Use Encode to decode/encode MIME encodings # - Use warnings, utf8 (just in case) # - Documentation update # N/A NN - Take newsgroup name as argument # 2004-06-19 NN - newsgroup name is $ARGV[0] # - Allow command line flags for subtracting # output if not pertinent for a group # 2002-11-09 NN - Put Garry's writedata() function back in. # - added "rn" to my list of UA's # - Started using %distinct_agent for both User agent # sections # - named it newsstat.pl version 0.3 # 2002-11-06 NN - Fixed the earliest/latest file problem by using # mtime rather than ctime, and simplifying the logic # 2002-11-05 NN - moved user configurations to the top # - fixed the cross-posting section # - introduced the $newsgroup_name variable which # later becomes $news$group # - changed $name to $agent_name in countagents() # # Contributors # ------------- # NN Nomen nominandum (name to be determined later) # PE Thomas 'PointedEars' Lahn ########### TODO ############# # Commas in bottom section of report # Show date the figures were compiled # No. of HTML articles (Content-Type: text/html) # No. of quoted sigs (/>\s*-- /) # Per cent of top-posted articles # Top 10 cross-posters # Top 20 news posting hosts (from Path) # Count of certain subject words: newbie, kde, burner, sendmail, etc. # Count *all* User Agents that each poster uses # What do we do about Bill Unruh's ] quote style? # Change the way dates/times are checked # include % share in posters by no. of arts # include % share in posters by size # Total, orig & quoted lines by user agent with per cent # Take more arguments ####################################################### ###################### USER CONFIGURATIONS ############################ # The name of the group to do stats for my $newsgroup_name = $ARGV[0]; $newsgroup_name or &usage; # Check for removal flags my $ix; my $j; my %skipSec; my @skiplist; my $args = @ARGV; for ( $ix = 1 ; $ix < $args ; $ix++ ) { $j = $ix + 1; if ( $ARGV[$ix] eq "-x" ) { @skiplist = split( ",", $ARGV[$j] ); } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { @skiplist = split( ",", $1 ); } } foreach (@skiplist) { $skipSec{$_} = 1; } # Leafnode users will want /var/spool/news for this variable. my $news = "/var/spool/news/"; # How many days are we doing statistics for? my $numdays = 30; # no. of agents we list my $topagents = 10; # no. of threads we want to know about my $topthreads = 20; # no. of top or bottom posters to show my $topposters = 20; # no. of cross-posted threads to show my $topcrossposts = 10; # no. of time zones to show my $toptz = 10; ###################### DATA STRUCTURES ###################### my $group = $newsgroup_name; $group =~ s!\.!/!g; my %data; # name, count, agent, total, orig, quoted my %threads; # subject, count my %crossposts; # group, count my %tz; # timezones by count my %headers; # holds header of current article my %lcheader; # holds lowercase headers my @body; # holds body of current article my @sig; # holds sig text; my $totalposts; # total no. of posts considered my $filename; # name of current article file my $filesize; # size of current article file my $earliest; # earliest article we have found my $latest; # latest article we have found my $poster; # poster we are dealing with my $totsize = 0; # holds total sizes of all files my $totheader = 0; # total size of header material my $totbody = 0; # total size of body material my $totsig = 0; # total size of sig material my $totorig = 0; # total size of original material my $totquoted = 0; # total size of quoted material my $origposts = 0; # total no. of original posts my $replies = 0; # total no. of replies my $i; # general purpose my %distinct_agent; my %agents = # used to hold counts of User Agents used ( "KNode" => 0, "Pan" => 0, "Mozilla" => 0, "Sylpheed" => 0, "Gnus" => 0, "Forte Agent" => 0, "Forte Free Agent" => 0, "MicroPlanet Gravity" => 0, "Microsoft Outlook Express" => 0, "Xnews" => 0, "slrn" => 0, "tin" => 0, "rn" => 0, "NN" => 0, "MacSOUP" => 0, "Foorum" => 0, "MT-NewsWatcher" => 0, "News Rover" => 0, "WebTV" => 0, "Compuserver" => 0, "VSoup" => 0 ); ######################## MAIN CODE ######################## $! = 1; chdir("$news$group") or die "Can't cd to $news$group: $!\n"; opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; while ( defined( $filename = readdir(DIR) ) ) { %lcheader = (); next unless -f $filename; # only want real files next if ( $filename eq ".overview" ); # real articles only next if ( -M $filename > $numdays ); # only want articles <= a certain age $earliest = ( stat $filename )[9] unless defined($earliest); $latest = ( stat $filename )[9] unless defined($latest); &getarticle($filename); # read in the article &getdata; # grab the data from the article $totalposts++; # bump count of articles considered } closedir(DIR); # finished with the directory # post-processing &countagents; # count agents, collapsing versions &fixpercent; # check percentages orig/total for posters &writedata; #################### DISPLAY RESULTS ##################### print "=" x 76, "\n"; printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); print "=" x 76, "\n"; printf "%s\n", ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); print "\n\n"; printf "Total posts considered: %s over %d days\n", commify($totalposts), $numdays; printf "Earliest article: %s\n", scalar localtime($earliest); printf "Latest article: %s\n", scalar localtime($latest); printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies); printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576; # printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, commify( int( $totsize / $totalposts ) ); my $count = keys %data; printf "Total headers: %s KB bodies: %s KB\n", commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), ( $totorig * 100 ) / ( $totorig + $totquoted ), commify( int( $totsig / 1024 ) ); printf "Total number of posters: %s, average %s bytes per poster\n", commify($count), commify( int( $totsize / $count ) ); #/ $count = keys %threads; printf "Total number of threads: %s, average %s bytes per thread\n", commify($count), commify( int( $totsize / $count ) ); #/ printf "Total number of User-Agents: %d\n", scalar keys %agents; print "\n", "=" x 76, "\n"; ############################### # show posters by article count Sec 1; ############################### unless ( $skipSec{1} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); print "=" x 76, "\n"; $i = 0; foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) { my $name = substr( $poster, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{count}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ################################ # show posters by size in Kbytes Sec 2; ################################ unless ( $skipSec{2} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); print "=" x 76, "\n"; $i = 0; foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) { my $name = substr( $poster, 0, 62 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{size} / 1024; #/ last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } #################################### # show top posters for original text #################################### unless ( $skipSec{3} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", ¢red( "Top $count responders by original text (> 5 posts)", 76 ); print "=" x 76, "\n"; $i = 0; foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data ) { next if $data{$poster}{quoted} == 0; next if $data{$poster}{count} < 5; my $name = substr( $poster, 0, 63 ); printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{percent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ####################################### # show bottom posters for original text ####################################### unless ( $skipSec{4} ) { if ( keys %data < $topposters ) { $count = keys %data; } else { $count = $topposters; } printf "%s\n", ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); print "=" x 76, "\n"; $i = 0; foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data ) { next if $data{$poster}{quoted} == 0; next if $data{$poster}{count} < 5; my $name = substr( $poster, 0, 63 ); printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), $data{$poster}{percent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } #################################### # show threads by number of articles #################################### unless ( $skipSec{5} ) { if ( keys %threads < $topthreads ) { $count = keys %threads; } else { $count = $topthreads; } printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} } keys %threads ) { my $name = substr( $thread, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $threads{$thread}{count}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ################################ # show threads by size in Kbytes ################################ unless ( $skipSec{6} ) { if ( keys %threads < $topthreads ) { $count = keys %threads; } else { $count = $topthreads; } printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} } keys %threads ) { my $name = substr( $thread, 0, 65 ); printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $threads{$thread}{size} / 1024; #/ last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ################################# # show top 10 cross-posted groups ################################# unless ( $skipSec{7} ) { delete $crossposts{"$newsgroup_name"}; # don't include ours if ( keys %crossposts < $topcrossposts ) { $count = keys %crossposts; } else { $count = $topcrossposts; } printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), $crossposts{$name}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ####################### #show agents and counts ####################### unless ( $skipSec{8} ) { if ( keys %agents < $topagents ) { $count = keys %agents; } else { $count = $topagents; } printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), $agents{$agent}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ####################### #show distinct agents ####################### unless ( $skipSec{9} ) { if ( keys %distinct_agent < $topagents ) { $count = keys %distinct_agent; } else { $count = $topagents; } printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent ) { printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), $distinct_agent{$agent}, ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ########################## #show timezones and counts ########################## unless ( $skipSec{10} ) { if ( keys %tz < $toptz ) { $count = keys %tz; } else { $count = $toptz; } printf "%s\n", ¢red( "Top 10 time zones", 76 ); print "=" x 76, "\n"; $i = 0; foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) { printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; last if ( ++$i == $count ); } print "\n", "=" x 76, "\n"; } ################################ SUBROUTINES ################################ ####################################### # get current article's header and body ####################################### sub getarticle { %headers = (); # dump old headers my $filename = shift; # get the name of the file # get stats about the file itself $filesize = -s $filename; # get total size of file $totsize += $filesize; # bump total sizes of all files my $mtime = ( stat $filename )[9]; if ( $mtime < $earliest ) { $earliest = $mtime; } elsif ( $mtime > $latest ) { $latest = $mtime; } # now read the file open( FILE, $filename ) or die "Can't open $filename: $!\n"; while () { $totheader += length($_); # bump total header size last if (/^\s*$/); # end of header? if (/^([^:\s]*):\s+(.*)/) { my ( $key, $val ) = ( $1, $2 ); $headers{$key} = decode( 'MIME-Header', $val ); $lcheader{ clean( lc($key) ) } = clean($val); } } @body = ; # slurp up body close(FILE); } # getarticle ################################### # get data from the current article ################################### sub getdata { #### First, analyse header fields #### # Set up this poster if not defined, get counts, sizes $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name if ( !defined( $data{$poster} ) ) { # seen this one before? $data{$poster}{agent} = 'Unknown'; # comes after For: field $data{$poster}{orig} = 0; $data{$poster}{quoted} = 0; } $data{$poster}{count}++; # bump count for this poster $data{$poster}{size} += $filesize; # total size of file # The User-Agent and/or X-Newsreader fields # for User-Agent by poster if ( defined $lcheader{"user-agent"} ) { $data{$poster}{agent} = $lcheader{"user-agent"}; } if ( defined $lcheader{"x-newsreader"} ) { $data{$poster}{agent} = $lcheader{"x-newsreader"}; } # The User Agent for User-Agent by number of posts my $UA = "unknown"; foreach my $keys ( keys %lcheader ) { if ( defined $lcheader{'user-agent'} ) { $UA = $lcheader{'user-agent'}; } elsif ( defined $lcheader{"x-newsreader"} ) { $UA = $lcheader{"x-newsreader"}; } elsif ( defined $lcheader{'x-mailer'} ) { $UA = $lcheader{'x-mailer'}; } elsif ( ( defined $lcheader{'organization'} ) && ( $lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ ) ) { $UA = $lcheader{'organization'}; } elsif ( $lcheader{'message-id'} =~ /pine/i ) { $UA = "Pine"; } ## Hopefully found UA, else set to unknown } $UA = clean($UA); $UA = get_agent($UA); sub get_agent { my $raw = shift; my $agent = $raw; ## strip http if ( $raw =~ /.*http.*/ ) { $raw =~ s!posted via!!i; $raw =~ s!http://!!g; $raw =~ s!/!!g; $raw =~ s! !!g; } ## Fix Outlook from Mac if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } ## Pick out the popular agents if ( $raw =~ /(outlook express)/i || $raw =~ /(microplanet gravity)/i || $raw =~ /(news rover)/i || $raw =~ /(forte agent)/i || $raw =~ /(forte free agent)/i ) { $agent = $1; } elsif ( $raw =~ /^( pan |sylpheed |slrn |mozilla |knode |tin |hamster |xrn |xnews |aol |gnus |krn |macsoup |messenger |openxp |pine |thoth |turnpike |winvn |vsoup |google |supernews |nn |rn |007 |webtv |compuserve )/ix ) { $agent = $1; } else { ## Clean up unknown agents if ( $raw =~ m!^(.*?)/! ) { $agent = $1; } elsif ( $raw =~ /^(\w*)\d.*/ ) { $agent = $1; } } $distinct_agent{$agent}++; return $agent; } # Get all cross-posted newsgroups for ( split /,/, $headers{"Newsgroups"} ) { $crossposts{$_}++; # bump count for each } # Get threads my $thread = encode( 'UTF-8', $headers{"Subject"} ); $thread =~ s/^re: //i; # Remove Re: or re: at start $thread =~ s/\s+/ /g; # collapse whitespace $threads{$thread}{count} += 1; # bump count of this subject $threads{$thread}{size} += $filesize; # bump bytes for this thread # Is this an original post or a reply? if ( defined $headers{"References"} ) { $replies++; } else { $origposts++; } # Get the time zone $_ = $headers{"Date"}; my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) { $tz = "UTC"; } $tz{$tz}++; #### Now analyse the body text #### my $insig = 0; for (@body) { $totbody += length($_); # bump total body size next if (/^$>/); # don't count blank lines in body if ( $insig == 1 ) { $totsig += length($_); # bump total sig size # Bill Unruh uses ] quotes, and another poster uses :: } elsif ( /^\s*[>\]]/ or /^\s*::/ ) { # are we in a quote line? $data{$poster}{quoted} += length($_); # bump count of quoted chrs $totquoted += length($_); } elsif (/-- /) { $insig = 1; } else { # we must be processing an original line $data{$poster}{orig} += length($_); # bump count of original chrs $totorig += length($_); } } # end for (@body) } # getdata ######################################## # Count the User-Agents used, collapsing # different versions into one per agent. ######################################## sub countagents { POSTER: foreach $poster ( keys %data ) { foreach my $agent_name ( keys %distinct_agent ) { # check against known ones if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) { $agents{$agent_name}++; next POSTER; } } $agents{ $data{$poster}{agent} }++; } } # countagents ############################################ # set orig/total percentages for all posters ############################################ sub fixpercent { foreach $poster ( keys %data ) { my $percent = 100; if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) { $percent = $data{$poster}{orig} * 100 / ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ } elsif ( $data{$poster}{orig} == 0 ) { $percent = 0; } $data{$poster}{percent} = $percent; } } ############################## # right pad a string with '.'s ############################## sub rpad { # get text to pad, length to pad, pad chr my ( $text, $pad_len, $pad_chr ) = @_; if ( length($text) > $pad_len ) { $text = substr( $text, 0, $pad_len ); } my $padded = $text . $pad_chr x ( $pad_len - length($text) ); return $padded; } ################# # centre a string ################# sub centred { my ( $text, $width ) = @_; # text to centre, size of field to centre in my $pad_len = ( $width - length($text) ) / 2; #/ my $centred = " " x $pad_len . $text; return $centred; } ########################## # put commas into a number ########################## sub commify { $_ = shift; 1 while s/^(-?\d+)(\d{3})/$1,$2/; return $_; } ######################### # clean ######################### sub clean { my $dirty = shift; my $clean = $dirty; $clean =~ s/^\s*//; $clean =~ s/\s*$//; return $clean; } sub usage { print "usage: newstat.pl newsgroupname\n"; exit 1; } ################################### # Write data structures to a file # ################################### sub writedata { open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; print OUTF "Data collected from alt.os.linux.mandrake\n\n"; print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; foreach my $name ( keys %data ) { print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; } print OUTF "============================================================================\n"; print OUTF "Thread subjects\n"; print OUTF "----------------------------------------------------------------------------\n"; foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) { print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; } print OUTF "============================================================================\n"; print OUTF "Cross-posts\n"; print OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %crossposts ) { print OUTF "$name : $crossposts{$name}\n"; } print OUTF print OUTF "============================================================================\n"; print OUTF "User agents\n"; print OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %agents ) { print OUTF "$name : $agents{$name}\n"; } print OUTF "============================================================================\n"; print OUTF "Time zones\n"; print OUTF "----------------------------------------------------------------------------\n"; foreach my $name ( sort keys %tz ) { print OUTF "$name : $tz{$name}\n"; } close OUTF; } # writedata