WebSVN - LCARS - Path Comparison - /trunk/tools/network/news Rev 22 and /trunk/tools/network/news Rev 14

Regard whitespace Rev 22 → Rev 14

/trunk/tools/network/news/newsstat/po/POTFILES.in
File deleted
\ No newline at end of file

/trunk/tools/network/news/newsstat/po/PACKAGE
File deleted

/trunk/tools/network/news/newsstat/po/de.pointedears.newsstat.pot
File deleted

/trunk/tools/network/news/newsstat/po/Makefile
File deleted

/trunk/tools/network/news/newsstat/po/de.po
File deleted

 /trunk/tools/network/news/newsstat/newsstat.pl
 ,39 → 1,33
 #!/usr/bin/env perl
 use strict;
 use warnings;
-#use diagnostics;
+use diagnostics;
 use utf8;
 use Encode;
-use constant DEBUG => 0;
 ## Print out all text to STDOUT UTF-8 encoded
 binmode STDOUT, ':encoding(UTF-8)';
-binmode STDERR, ':encoding(UTF-8)';
-# FIXME: Automatically include resolved '.' in @INC
-# print join "\n", @INC;
+##############################
+## newsstat.pl version 0.4.3.1
-use locale ':not_characters';
-use Locale::TextDomain ('de.pointedears.newsstat');
-use POSIX ('locale_h');
-use Locale::Messages qw (bind_textdomain_filter
-                         bind_textdomain_codeset
-                         turn_utf_8_on);
-#setlocale( LC_MESSAGES, '' );
-bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;
-bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
+###########################################################################
+## Collect statistics about a newsgroup (specified by first argument)
+## in the local news spool. Check all articles in the last 30-day period.
+## Rank posters by number of posts and by volume of posts, report on top
+## and bottom 20 posters. Show their name, number of posts, size of posts,
+## percentage of quoted lines. Rank user-agents used, by poster rather
+## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
+##
+## Numbers and paths can be configured below.  See ChangeLog and TODO
+## for more.  -- PE
+###########################################################################
-require Mail::Message;
-require DateTime;
-require DateTime::Format::Mail;
 ###################### USER CONFIGURATIONS ############################
 ## The name of the group to do stats for
 my $newsgroup_name = $ARGV[0];
-$newsgroup_name or usage();
+$newsgroup_name or &usage;
 ## Check for removal flags
 my $ix;
 ,18 → 55,21
 ## Leafnode users will want /var/spool/news for this variable.
 my $news = "/var/spool/news/";
-## Number of top or bottom posters to show
-my $topposters = 20;
+## How many days are we doing statistics for?
+my $numdays = 30;
+## Number of agents we list
+my $topagents = 10;
 ## Number of threads we want to know about
 my $topthreads = 20;
+## Number of top or bottom posters to show
+my $topposters = 20;
 ## Number of cross-posted threads to show
 my $topcrossposts = 10;
-## Number of agents we list
-my $topagents = 10;
 ## Number of time zones to show
 my $toptz = 10;
 ,20 → 77,28
 my $group = $newsgroup_name;
 $group =~ s!\.!/!g;
 my %data;    # name, count, agent, total, orig, quoted
-my $totsize = 0;    # holds total sizes of all files
+my %threads;       # subject, count
 my %crossposts;     # group, count
-my %threads;        # subject, count
-my $replies   = 0;  # total no. of replies
-my $origposts = 0;  # total no. of original posts
 my %tz;             # timezones by count
+my %headers;       # holds header of current article
+my %lcheader;      # holds lowercase headers
+my @body;          # holds body of current article
+my @sig;           # holds sig text;
+my $totalposts;    # total no. of posts considered
+my $filename;      # name of current article file
+my $filesize;      # size of current article file
 my $earliest;       # earliest article we have found
 my $latest;         # latest article we have found
+my $poster;        # poster we are dealing with
+my $totsize   = 0; # holds total sizes of all files
 my $totheader = 0;  # total size of header material
 my $totbody   = 0;  # total size of body material
 my $totsig    = 0;  # total size of sig material
+my $totorig   = 0; # total size of original material
 my $totquoted = 0;  # total size of quoted material
-my $totorig   = 0;  # total size of original material
-my $totalposts;     # total no. of posts considered
+my $origposts = 0; # total no. of original posts
+my $replies   = 0; # total no. of replies
+my $i;             # general purpose
 my %distinct_agent;
 ## Used to hold counts of User Agents used
 ,478 → 124,65
   "tin"                       => 0,
   "VSoup"                     => 0,
   "WebTV"                     => 0,
-  "Xnews"                     => 0,
+  "Xnews"                     => 0
 );
-my $datetime_parser = DateTime::Format::Mail->new();
-$datetime_parser->loose();
+######################## MAIN CODE ########################
+$! = 1;
-my $today = DateTime->today( time_zone => 'UTC' );
-my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
-my $start      = int $prev_month->strftime('%s');
-my $numdays    = int DateTime->last_day_of_month(
-  year      => $prev_month->year(),
-  month     => $prev_month->month(),
-  time_zone => $prev_month->time_zone(),
-)->day();
-my $end = int $today->clone()->set_day(1)->strftime('%s');
-dmsg( $start, " to ", $end ) if DEBUG;
-chdir("$news$group")
-  or die __x(
-  "Can't cd to {newsgroup}: {error}\n",
-  newsgroup => "$news$group",
-  error     => $!
-  );
-opendir( DIR, "." )
-  or die __x(
-  "Can't open {newsgroup}: {error}\n",
-  newsgroup => "$news$group",
-  error     => $!
-  );
-while ( defined( my $filename = readdir(DIR) ) )
+chdir("$news$group") or die "Can't cd to $news$group: $!\n";
+opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
+while ( defined( $filename = readdir(DIR) ) )
 {
+  %lcheader = ();
   next unless -f $filename;    # only want real files
   next if ( $filename eq ".overview" );    # real articles only
-  get_article($filename);                  # read in the article
+  next if ( -M $filename > $numdays );    # only want articles <= a certain age
+  $earliest = ( stat $filename )[9] unless defined($earliest);
+  $latest   = ( stat $filename )[9] unless defined($latest);
+  &get_article($filename);                 # read in the article
+  &get_data;                               # grab the data from the article
+  $totalposts++;                          # bump count of articles considered
 }
 closedir(DIR);                             # finished with the directory
-dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
 ## Post-processing
-count_agents();                            # count agents, collapsing versions
-fix_percent();
+&count_agents;    # count agents, collapsing versions
+&fix_percent;     # check percentages orig/total for posters
-write_data();
-display_results();
+&write_data;
-########################################
-## Get current article's header and body
-########################################
-sub get_article
-{
-  my $filename = shift;
-  open( my $FILE, '<', $filename )
-    or
-    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
-  my $msg       = Mail::Message->read($FILE);
-  my $timestamp = $msg->timestamp();
-  my $date      = $msg->study('Date');
-  ## Disregard article if timestamp is not in range
-  dmsg($timestamp) if DEBUG;
-  if ( $timestamp < $start or $timestamp >= $end )
-  {
-    dmsg("Posting on $date ignored.") if DEBUG;
-    return;
-  }
-  $totalposts++;    # bump count of articles considered
-  ## DEBUG
-  dmsg($date) if DEBUG;
-  ## get stats about the file itself
-  my $filesize = -s $filename;    # get total size of file
-  $totsize += $filesize;          # bump total sizes of all files
-  if ( ( not defined $earliest ) or $timestamp < $earliest )
-  {
-    $earliest = $timestamp;
-  }
-  elsif ( ( not defined $latest ) or $timestamp > $latest )
-  {
-    $latest = $timestamp;
-  }
-  #print "timestamp: $timestamp\n";
-  ## count header size
-  $totheader += $msg->head()->size();
-  ## get the poster's name (MIME-decoded, in UTF-8)
-  my $poster = $msg->study('From');
-  if ( defined $poster )
-  {
-    ## Convert old to new format
-    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
-    ## Collapse whitespace
-    $poster =~ s/\s+/ /g;
-    ## Remove outer quotes; TODO: observe RFC 5322 strictly
-    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
-    ## DEBUG
-    dmsg($poster) if DEBUG;
-    ## seen this one before?
-    if ( !defined( $data{$poster} ) )
-    {
-      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
-      $data{$poster}{'orig'}   = 0;
-      $data{$poster}{'quoted'} = 0;
-    }
-    $data{$poster}{'count'}++;                    # bump count for this poster
-    $data{$poster}{'size'} += $filesize;          # total size of file
-    ## The User-Agent and/or X-Newsreader fields
-    ## for User-Agent by poster
-    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
-    if ( defined $ua )
-    {
-      $data{$poster}{'agent'} = $ua;
-      ## DEBUG
-      dmsg($ua) if DEBUG;
-    }
-    ## The User Agent for User-Agent by number of posts
-    get_agent($msg);
-    ## Get all cross-posted newsgroups
-    for ( split( /,/, $msg->study('Newsgroups') ) )
-    {
-      $crossposts{$_}++;    # bump count for each
-    }
-    ## Get threads
-    my $thread = $msg->study('Subject');
-    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
-    $thread =~ s/\s+/ /g;       # collapse whitespace
-    $threads{$thread}{'count'}++;    # bump count of this subject
-    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
-    ## Is this an original post or a reply?
-    if ( defined $msg->study('References') )
-    {
-      $replies++;
-    }
-    else
-    {
-      $origposts++;
-    }
-    ## Get the time zone
-    my $datetime = $datetime_parser->parse_datetime($date);
-    my $tz       = $datetime->strftime('%z');
-    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
-    $tz{$tz}++;
-    ## DEBUG
-    dmsg($tz) if DEBUG;
-#### Now analyse the body text ####
-    my $body = $msg->body();
-    my $insig = 0;
-    my @body  = $body->lines;
-    for (@body)
-    {
-      $totbody += length($_);    # bump total body size
-      next if (m{^$>}o);         # don't count blank lines in body
-      if ( $insig == 1 )
-      {
-        # bump total sig size
-        $totsig += length($_);
-      }
-      ## are we in a quote line?
-      ## Bill Unruh uses ] quotes, and another poster uses ::
-      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
-      {
-        ## bump count of quoted chrs
-        $data{$poster}{'quoted'} += length($_);
-        $totquoted += length($_);
-      }
-      elsif (/^-- $/)
-      {
-        $insig = 1;
-      }
-      else
-      {
-        ## We must be processing an original line
-        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
-        $totorig += length($_);
-      }
-    }
-    # end for (@body)
-  }
-  close($FILE);
-}
-sub get_agent
-{
-  my $msg = shift;
-  my $ua =
-       $msg->study('User-Agent')
-    or $msg->study('X-Newsreader')
-    or $msg->study('X-Mailer');
-  if ( not defined $ua )
-  {
-    my $org = $msg->study('Organization');
-    if ( defined $org
-      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
-    {
-      $ua = $org;
-    }
-    elsif ( $msg->study('Message-ID') =~ /pine/i )
-    {
-      $ua = "Pine";
-    }
-  }
-  ## Hopefully found UA, else set to unknown
-  if ( not defined $ua )
-  {
-    $ua = __ "unknown";
-  }
-  $ua = clean($ua);
-  my $raw   = $ua;
-  my $agent = $raw;
-  ## strip http
-  if ( $raw =~ /.*http.*/ )
-  {
-    $raw =~ s!posted via!!i;
-    $raw =~ s!http://!!g;
-    $raw =~ s!/!!g;
-    $raw =~ s! !!g;
-  }
-  ## Fix Outlook from Mac
-  if ( $raw =~ /^microsoft/i )
-  {
-    $raw =~ s/-/ /g;
-  }
-  ## Pick out the popular agents
-  if ( $raw =~ /(outlook express)/i
-    || $raw =~ /(windows mail)/i
-    || $raw =~ /(microplanet gravity)/i
-    || $raw =~ /(news rover)/i
-    || $raw =~ /(forte agent)/i
-    || $raw =~ /(forte free agent)/i )
-  {
-    $agent = $1;
-  }
-  elsif (
-    $raw =~ /^(
-        pan
-       |sylpheed
-       |slrn
-       |mozilla
-       |knode
-       |tin
-       |hamster
-       |xrn
-       |xnews
-       |aol
-       |gnus
-       |krn
-       |macsoup
-       |messenger
-       |openxp
-       |pine
-       |thoth
-       |turnpike
-       |winvn
-       |vsoup
-       |google
-       |supernews
-       |nn
-       |rn
-       |007
-       |webtv
-       |compuserve
-       )/ix
-    )
-  {
-    $agent = $1;
-  }
-  else
-  {
-    ## Clean up unknown agents
-    if ( $raw =~ m!^(.*?)/! )
-    {
-      $agent = $1;
-    }
-    elsif ( $raw =~ /^(\w*)\d.*/ )
-    {
-      $agent = $1;
-    }
-  }
-  $distinct_agent{$agent}++;
-  return $agent;
-}
-## get_agent
-#########################################
-## Count the User-Agents used, collapsing
-## different versions into one per agent.
-#########################################
-sub count_agents
-{
-POSTER:
-  foreach my $poster ( keys %data )
-  {
-    foreach my $agent_name ( keys %distinct_agent )
-    {    # check against known ones
-      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
-      {
-        $agents{$agent_name}++;
-        next POSTER;
-      }
-    }
-    $agents{ $data{$poster}{'agent'} }++;
-  }
-}    # count_agents
-#############################################
-## Set orig/total percentages for all posters
-#############################################
-sub fix_percent
-{
-  foreach my $poster ( keys %data )
-  {
-    my $percent = 100;
-    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
-    {
-      $percent =
-        $data{$poster}{'orig'} * 100 /
-        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
-    }
-    elsif ( $data{$poster}{'orig'} == 0 )
-    {
-      $percent = 0;
-    }
-    $data{$poster}{'percent'} = $percent;
-  }
-}
-## fix_percent
-##################################
-## Write data structures to a file
-##################################
-sub write_data
-{
-  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
-    or die __x( "Can't create XDATA: {error}\n", error => $! );
-  print $OUTF "Data collected from $newsgroup_name\n\n";
-  print $OUTF
-    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
-  foreach my $name ( keys %data )
-  {
-    print $OUTF
-"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
-  }
-  print $OUTF
-"============================================================================\n";
-  print $OUTF "Thread subjects\n";
-  print $OUTF
-"----------------------------------------------------------------------------\n";
-  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
-  {
-    print $OUTF
-      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
-  }
-  print $OUTF
-"============================================================================\n";
-  print $OUTF "Cross-posts\n";
-  print $OUTF
-"----------------------------------------------------------------------------\n";
-  foreach my $name ( sort keys %crossposts )
-  {
-    print $OUTF "$name : $crossposts{$name}\n";
-  }
-  print $OUTF
-"============================================================================\n";
-  print $OUTF "User agents\n";
-  print $OUTF
-"----------------------------------------------------------------------------\n";
-  foreach my $name ( sort keys %agents )
-  {
-    print $OUTF "$name : $agents{$name}\n";
-  }
-  print $OUTF
-"============================================================================\n";
-  print $OUTF "Time zones\n";
-  print $OUTF
-"----------------------------------------------------------------------------\n";
-  foreach my $name ( sort keys %tz )
-  {
-    print $OUTF "$name : $tz{$name}\n";
-  }
-  close $OUTF;
-}    # write_data
-sub display_results
-{
   #################### DISPLAY RESULTS #####################
   print "=" x 76, "\n";
-  printf "%s\n",
-    centred(
-    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
-);
+printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
   print "=" x 76, "\n";
   printf "%s\n",
-    centred(
-    __
-"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)",
-    );
+  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
   print "\n\n";
-  printf __"Total posts considered: %s over %d days" . "\n",
-    commify($totalposts),
+printf "Total posts considered: %s over %d days\n", commify($totalposts),
     $numdays;
-  my $time_locale = setlocale(LC_TIME);
-  my $earliest_datetime = DateTime->from_epoch(
-    epoch => $earliest,
-    locale => $time_locale,
-    time_zone => 'UTC',
-  );
-  my $latest_datetime = DateTime->from_epoch(
-    epoch => $latest,
-    locale => $time_locale,
-    time_zone => 'UTC',
-  );
-  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
-  printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format);
-  printf __"Latest article" . ":   %s\n", $latest_datetime->strftime($datetime_format);
-  printf __"Original articles: %s; replies" . ": %s\n",
-    commify($origposts),
+printf "Earliest article: %s\n",               scalar localtime($earliest);
+printf "Latest article:   %s\n",               scalar localtime($latest);
+printf "Original articles: %s, replies: %s\n", commify($origposts),
     commify($replies);
-  printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n",
-    commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576;  #
-  printf __
-    "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
+printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
+  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
+printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
     commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
     commify( int( $totsize / $totalposts ) );
   my $count = keys %data;
-  printf __"Total headers: %s KiB; bodies: %s KiB\n",
+printf "Total headers: %s KiB  bodies: %s KiB\n",
     commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
-  printf __
-    "Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n",
+printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
     commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
     ( $totorig * 100 ) / ( $totorig + $totquoted ),
     commify( int( $totsig / 1024 ) );
-  printf __"Total number of posters: %s, average %s bytes per poster\n",
+printf "Total number of posters: %s, average %s bytes per poster\n",
     commify($count), commify( int( $totsize / $count ) );    #/
   $count = keys %threads;
-  printf __"Total number of threads: %s, average %s bytes per thread\n",
+printf "Total number of threads: %s, average %s bytes per thread\n",
     commify($count), commify( int( $totsize / $count ) );    #/
-  printf __"Total number of user agents: %d\n", scalar keys %agents;
+printf "Total number of user agents: %d\n", scalar keys %agents;
   print "\n", "=" x 76, "\n";
   ########################################
 ,11 → 198,9
     {
       $count = $topposters;
     }
-    printf "%s\n",
-      centred(
-      __x( "Top {count} posters by number of articles", count => $topposters ), 76 );
+  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach
       my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
     {
 ,14 → 225,10
     {
       $count = $topposters;
     }
-    printf "%s\n",
-      centred(
-      __x( "Top {count} posters by article size in KiB", count => $topposters ),
-);
+  printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
-    foreach
-      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
+  $i = 0;
+  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
     {
       my $name = substr( $poster, 0, 62 );
       printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
 ,15 → 252,9
       $count = $topposters;
     }
     printf "%s\n",
-      centred(
-      __x(
-        "Top {count} responders by original text (> 5 posts)",
-        count => $topposters
-      ),
-      );
+    &centred( "Top $count responders by original text (> 5 posts)", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $poster (
       sort { $data{$b}{percent} <=> $data{$a}{percent} }
       keys %data
 ,15 → 284,9
       $count = $topposters;
     }
     printf "%s\n",
-      centred(
-      __x(
-        "Bottom {count} responders by original text  (> 5 posts)",
-        count => $topposters
-      ),
-      );
+    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $poster (
       sort { $data{$a}{percent} <=> $data{$b}{percent} }
       keys %data
 ,19 → 315,17
     {
       $count = $topthreads;
     }
-    printf "%s\n",
-      centred( __x( "Top {count} threads by no. of articles", count => $topthreads ),
-);
+  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $thread (
-      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
+    sort { $threads{$b}{count} <=> $threads{$a}{count} }
       keys %threads
       )
     {
       my $name = substr( $thread, 0, 65 );
       printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-        $threads{$thread}{'count'};
+      $threads{$thread}{count};
       last if ( ++$i == $count );
     }
     print "\n", "=" x 76, "\n";
 ,19 → 344,17
     {
       $count = $topthreads;
     }
-    printf "%s\n",
-      centred( __x( "Top {count} threads by size in KiB", count => $topthreads ),
-);
+  printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $thread (
-      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
+    sort { $threads{$b}{size} <=> $threads{$a}{size} }
       keys %threads
       )
     {
       my $name = substr( $thread, 0, 65 );
       printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-        $threads{$thread}{'size'} / 1024;    #/
+      $threads{$thread}{size} / 1024;    #/
       last if ( ++$i == $count );
     }
     print "\n", "=" x 76, "\n";
 ,10 → 374,9
     {
       $count = $topcrossposts;
     }
-    printf "%s\n",
-      centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
+  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach
       my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
     {
 ,11 → 400,9
     {
       $count = $topagents;
     }
-    printf "%s\n",
-      centred( __x( "Top {count} user agents by poster", count => $topagents ),
-);
+  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
     {
       printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
 ,12 → 425,9
     {
       $count = $topagents;
     }
-    printf "%s\n",
-      centred(
-      __x( "Top {count} user agents by number of posts", count => $topagents ),
-);
+  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $agent (
       sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
       keys %distinct_agent
 ,9 → 454,9
     {
       $count = $toptz;
     }
-    printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 );
+  printf "%s\n", &centred( "Top 10 time zones", 76 );
     print "=" x 76, "\n";
-    my $i = 0;
+  $i = 0;
     foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
     {
       printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
 ,10 → 464,307
     }
     print "\n", "=" x 76, "\n";
   }
+################################ SUBROUTINES ################################
+########################################
+## Get current article's header and body
+########################################
+sub get_article
+{
+  %headers = ();    # dump old headers
+  my $filename = shift;    # get the name of the file
+  ## get stats about the file itself
+  $filesize = -s $filename;    # get total size of file
+  $totsize += $filesize;       # bump total sizes of all files
+  my $mtime = ( stat $filename )[9];
+  if ( $mtime < $earliest )
+  {
+    $earliest = $mtime;
 }
+  elsif ( $mtime > $latest )
+  {
+    $latest = $mtime;
+  }
-## helper subs
+  ## now read the file
+  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
+  while (<$FILE>)
+  {
+    $totheader += length($_);    # bump total header size
+    last if (/^\s*$/);           # end of header?
+    if (/^([^:\s]*):\s*(.*)/)
+    {
+      my ( $key, $val ) = ( $1, $2 );
+      $headers{$key} = decode( 'MIME-Header', $val );
+      $lcheader{ clean( lc($key) ) } = clean($val);
+    }
+  }
+  @body = <$FILE>;               # slurp up body
+  close($FILE);
+}    # get_article
+####################################
+## Get data from the current article
+####################################
+sub get_data
+{
+#### First, analyse header fields ####
+  ## Set up this poster if not defined, get counts, sizes
+  my $poster = $headers{From};    # get the poster's name
+  # Convert old to new format
+  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
+  # Collapse whitespace
+  $poster =~ s/\s+/ /;
+  # Remove outer quotes
+  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
+  if ( !defined( $data{$poster} ) )
+  {                               # seen this one before?
+    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
+    $data{$poster}{orig}   = 0;
+    $data{$poster}{quoted} = 0;
+  }
+  $data{$poster}{count}++;                 # bump count for this poster
+  $data{$poster}{size} += $filesize;       # total size of file
+  ## The User-Agent and/or X-Newsreader fields
+  ## for User-Agent by poster
+  if ( defined $lcheader{"user-agent"} )
+  {
+    $data{$poster}{agent} = $lcheader{"user-agent"};
+  }
+  if ( defined $lcheader{"x-newsreader"} )
+  {
+    $data{$poster}{agent} = $lcheader{"x-newsreader"};
+  }
+  ## The User Agent for User-Agent by number of posts
+  my $UA = "unknown";
+  foreach my $keys ( keys %lcheader )
+  {
+    if ( defined $lcheader{'user-agent'} )
+    {
+      $UA = $lcheader{'user-agent'};
+    }
+    elsif ( defined $lcheader{"x-newsreader"} )
+    {
+      $UA = $lcheader{"x-newsreader"};
+    }
+    elsif ( defined $lcheader{'x-mailer'} )
+    {
+      $UA = $lcheader{'x-mailer'};
+    }
+    elsif (
+      ( defined $lcheader{'organization'} )
+      && ( $lcheader{'organization'} =~
+        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
+      )
+    {
+      $UA = $lcheader{'organization'};
+    }
+    elsif ( $lcheader{'message-id'} =~ /pine/i )
+    {
+      $UA = "Pine";
+    }    ## Hopefully found UA, else set to unknown
+  }
+  $UA = clean($UA);
+  $UA = get_agent($UA);
+  sub get_agent
+  {
+    my $raw   = shift;
+    my $agent = $raw;
+    ## strip http
+    if ( $raw =~ /.*http.*/ )
+    {
+      $raw =~ s!posted via!!i;
+      $raw =~ s!http://!!g;
+      $raw =~ s!/!!g;
+      $raw =~ s! !!g;
+    }
+    ## Fix Outlook from Mac
+    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
+    ## Pick out the popular agents
+    if (
+           $raw =~ /(outlook express)/i
+        || $raw =~ /(windows mail)/i
+        || $raw =~ /(microplanet gravity)/i
+        || $raw =~ /(news rover)/i
+        || $raw =~ /(forte agent)/i
+        || $raw =~ /(forte free agent)/i
+      )
+    {
+      $agent = $1;
+    }
+    elsif (
+      $raw =~ /^(
+        pan
+       |sylpheed
+       |slrn
+       |mozilla
+       |knode
+       |tin
+       |hamster
+       |xrn
+       |xnews
+       |aol
+       |gnus
+       |krn
+       |macsoup
+       |messenger
+       |openxp
+       |pine
+       |thoth
+       |turnpike
+       |winvn
+       |vsoup
+       |google
+       |supernews
+       |nn
+       |rn
+       |007
+       |webtv
+       |compuserve
+       )/ix
+      )
+    {
+      $agent = $1;
+    }
+    else
+    {
+      ## Clean up unknown agents
+      if ( $raw =~ m!^(.*?)/! )
+      {
+        $agent = $1;
+      }
+      elsif ( $raw =~ /^(\w*)\d.*/ )
+      {
+        $agent = $1;
+      }
+    }
+    $distinct_agent{$agent}++;
+    return $agent;
+  }
+  ## Get all cross-posted newsgroups
+  for ( split /,/, $headers{"Newsgroups"} )
+  {
+    $crossposts{$_}++;    # bump count for each
+  }
+  ## Get threads
+  my $thread = $headers{"Subject"};
+  $thread =~ s/^re: //i;    # Remove Re: or re: at start
+  $thread =~ s/\s+/ /g;     # collapse whitespace
+  $threads{$thread}{count} += 1;            # bump count of this subject
+  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
+  ## Is this an original post or a reply?
+  if ( defined $headers{"References"} )
+  {
+    $replies++;
+  }
+  else
+  {
+    $origposts++;
+  }
+  ## Get the time zone
+  $_ = $headers{"Date"};
+  my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
+  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
+  {
+    $tz = "UTC";
+  }
+  $tz{$tz}++;
+#### Now analyse the body text ####
+  my $insig = 0;
+  for (@body)
+  {
+    $totbody += length($_);    # bump total body size
+    next if (/^$>/);           # don't count blank lines in body
+    if ( $insig == 1 )
+    {
+      $totsig += length($_);    # bump total sig size
+      ## Bill Unruh uses ] quotes, and another poster uses ::
+    }
+    elsif ( /^\s*[>\]]/ or /^\s*::/ )
+    {                           # are we in a quote line?
+      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
+      $totquoted += length($_);
+    }
+    elsif (/-- /)
+    {
+      $insig = 1;
+    }
+    else
+    {
+      ## We must be processing an original line
+      $data{$poster}{orig} += length($_);      # bump count of original chrs
+      $totorig += length($_);
+    }
+  }    # end for (@body)
+}    # get_data
+#########################################
+## Count the User-Agents used, collapsing
+## different versions into one per agent.
+#########################################
+sub count_agents
+{
+POSTER:
+  foreach my $poster ( keys %data )
+  {
+    foreach my $agent_name ( keys %distinct_agent )
+    {    # check against known ones
+      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
+      {
+        $agents{$agent_name}++;
+        next POSTER;
+      }
+    }
+    $agents{ $data{$poster}{agent} }++;
+  }
+}    # count_agents
+#############################################
+## Set orig/total percentages for all posters
+#############################################
+sub fix_percent
+{
+  foreach my $poster ( keys %data )
+  {
+    my $percent = 100;
+    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
+    {
+      $percent =
+        $data{$poster}{orig} * 100 /
+        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
+    }
+    elsif ( $data{$poster}{orig} == 0 )
+    {
+      $percent = 0;
+    }
+    $data{$poster}{percent} = $percent;
+  }
+}
 ###############################
 ## Right pad a string with '.'s
 ###############################
 ,7 → 774,7
   my ( $text, $pad_len, $pad_chr ) = @_;
   ## DEBUG
-  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
+#printf "|%s| = %d\n", $text, length($text);
   if ( length($text) > $pad_len )
   {
 ,36 → 798,10
 ###########################
 ## Put commas into a number
 ###########################
-# Get some of locale's numeric formatting parameters
-my ($thousands_sep, $grouping) =
-        @{localeconv()}{'thousands_sep', 'grouping'};
-# Apply defaults if values are missing
-$thousands_sep = ',' unless $thousands_sep;
-# grouping and mon_grouping are packed lists
-# of small integers (characters) telling the
-# grouping (thousand_seps and mon_thousand_seps
-# being the group dividers) of numbers and
-# monetary quantities.  The integers' meanings:
-# 255 means no more grouping, 0 means repeat
-# the previous grouping, 1-254 means use that
-# as the current grouping.  Grouping goes from
-# right to left (low to high digits).  In the
-# below we cheat slightly by never using anything
-# else than the first grouping (whatever that is).
-my @grouping;
-if ($grouping) {
-    @grouping = unpack("C*", $grouping);
-} else {
-    @grouping = (3);
-}
 sub commify
 {
   local $_ = shift;
-  #1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
-  $_ = int;    # Chop non-integer part
-while
-  s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/;
+while s/^([-+]?\d+)(\d{3})/$1,$2/;
   return $_;
 }
 ,17 → 819,61
 sub usage
 {
-  print __"usage: newsstat.pl NEWS.GROUP\n";
+  print "usage: newstat.pl newsgroupname\n";
   exit 1;
 }
-sub dmsg
+##################################
+## Write data structures to a file
+##################################
+sub write_data
 {
-  print STDERR @_, "\n";
+  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
+    or die "Can't create XDATA: $!\n";
+  print $OUTF "Data collected from $newsgroup_name\n\n";
+  print $OUTF
+    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
+  foreach my $name ( keys %data )
+  {
+    print $OUTF
+"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
 }
-sub dmsg2
+  print $OUTF
+"============================================================================\n";
+  print $OUTF "Thread subjects\n";
+  print $OUTF
+"----------------------------------------------------------------------------\n";
+  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
 {
-  my ( $level, @msg ) = @_;
-  print STDERR @msg, "\n" if $level >= DEBUG;
+    print $OUTF
+      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
 }
+  print $OUTF
+"============================================================================\n";
+  print $OUTF "Cross-posts\n";
+  print $OUTF
+"----------------------------------------------------------------------------\n";
+  foreach my $name ( sort keys %crossposts )
+  {
+    print $OUTF "$name : $crossposts{$name}\n";
+  }
+  print $OUTF
+"============================================================================\n";
+  print $OUTF "User agents\n";
+  print $OUTF
+"----------------------------------------------------------------------------\n";
+  foreach my $name ( sort keys %agents )
+  {
+    print $OUTF "$name : $agents{$name}\n";
+  }
+  print $OUTF
+"============================================================================\n";
+  print $OUTF "Time zones\n";
+  print $OUTF
+"----------------------------------------------------------------------------\n";
+  foreach my $name ( sort keys %tz )
+  {
+    print $OUTF "$name : $tz{$name}\n";
+  }
+  close $OUTF;
+}    # write_data

Subversion Repositories LCARS

Compare Revisions

Last modification

Regard whitespace Rev 22 → Rev 14