WebSVN - LCARS - Diff - Rev 13 and 14 - /trunk/tools/network/news/newsstat/newsstat.pl


#!/usr/bin/env perl
use strict;
use warnings;
use diagnostics;
use utf8;
use Encode;

## Print out all text to STDOUT UTF-8 encoded
binmode STDOUT, ':encoding(UTF-8)';

##############################
## newsstat.pl version 0.4.3.1

###########################################################################
## Collect statistics about a newsgroup (specified by first argument)
## in the local news spool. Check all articles in the last 30-day period.
## Rank posters by number of posts and by volume of posts, report on top
## and bottom 20 posters. Show their name, number of posts, size of posts,
## percentage of quoted lines. Rank user-agents used, by poster rather
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
##
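## Numbers and paths can be configured below.  Individual report sections
## can be left out with "-x N[,N...]" (or "-xN[,N...]"), where N is the
## section number, 1-10 in output order.  See ChangeLog and TODO for more.
##   -- PE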
###########################################################################

###################### USER CONFIGURATIONS ############################

## The name of the group to do stats for
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();

## Check for removal flags: "-x 1,3" or "-x1,3" lists the report sections
## (by number) that should be left out.  A later -x replaces an earlier one.
my %skipSec;     # section number => 1 for every section to skip
my @skiplist;    # section numbers from the last -x seen
for my $ix ( 1 .. $#ARGV )
{
  my $arg = $ARGV[$ix];
  if ( $arg eq "-x" )
  {
    ## The list is the next argument; a trailing bare "-x" means "nothing"
    ## rather than handing undef to split.
    my $value = $ARGV[ $ix + 1 ];
    @skiplist = defined $value ? split( ",", $value ) : ();
  }
  elsif ( $arg =~ /^-x(\d.*)/ )
  {
    ## Anchored so only a real "-xN..." option matches, not any argument
    ## that merely contains that text.
    @skiplist = split( ",", $1 );
  }
}
$skipSec{$_} = 1 foreach @skiplist;

## Root of the news spool.  The group's directory is found by appending the
## group path directly to this string, so keep the trailing slash.
## Leafnode users will want /var/spool/news for this variable.
my $news = "/var/spool/news/";

## How many days are we doing statistics for?  Articles whose age in days
## (-M) exceeds this are skipped; it is also the divisor for the per-day
## averages in the summary.
my $numdays = 30;

## Number of agents we list (used by both user-agent rankings)
my $topagents = 10;

## Number of threads we want to know about (used by both thread rankings)
my $topthreads = 20;

## Number of top or bottom posters to show
my $topposters = 20;

## Number of cross-posted groups to show
my $topcrossposts = 10;

## Number of time zones to show
my $toptz = 10;

###################### DATA STRUCTURES ######################
## Spool sub-path of the group: every dot in the group name becomes a
## directory separator (comp.lang.perl -> comp/lang/perl).
my $group = $newsgroup_name;
$group =~ s!\.!/!g;
my %data;          # per poster: agent, count, size, orig, quoted, percent
my %threads;       # per subject: count, size
my %crossposts;    # group name => number of articles naming it
my %tz;            # time zone string => number of articles
my %headers;       # headers of current article (original case, MIME-decoded)
my %lcheader;      # headers of current article (lowercase names, raw values)
my @body;          # holds body lines of current article
my @sig;           # holds sig text; not referenced elsewhere in this file
my $totalposts;    # total no. of posts considered (undef until the first one)
my $filename;      # name of current article file
my $filesize;      # size of current article file
my $earliest;      # mtime of the earliest article we have found
my $latest;        # mtime of the latest article we have found
my $poster;        # poster we are dealing with (get_data uses its own lexical)
my $totsize   = 0; # holds total sizes of all files
my $totheader = 0; # total size of header material
my $totbody   = 0; # total size of body material
my $totsig    = 0; # total size of sig material
my $totorig   = 0; # total size of original material
my $totquoted = 0; # total size of quoted material
my $origposts = 0; # total no. of original posts (no References header)
my $replies   = 0; # total no. of replies (References header present)
my $i;             # general purpose row counter for the report sections
my %distinct_agent;    # normalised agent name => number of posts using it

## Used to hold counts of User Agents used, one per poster.  The names
## below start at zero; count_agents increments entries and adds any
## agent name that is not listed here.
my %agents = (
  "Compuserver"               => 0,
  "Foorum"                    => 0,
  "Forte Agent"               => 0,
  "Forte Free Agent"          => 0,
  "Gnus"                      => 0,
  "KNode"                     => 0,
  "MacSOUP"                   => 0,
  "MT-NewsWatcher"            => 0,
  "MicroPlanet Gravity"       => 0,
  "Microsoft Outlook Express" => 0,
  "Microsoft Windows Mail"    => 0,
  "Mozilla"                   => 0,
  "News Rover"                => 0,
  "NN"                        => 0,
  "Pan"                       => 0,
  "rn"                        => 0,
  "slrn"                      => 0,
  "Sylpheed"                  => 0,
  "tin"                       => 0,
  "VSoup"                     => 0,
  "WebTV"                     => 0,
  "Xnews"                     => 0
);

######################## MAIN CODE ########################
## Unbuffer STDOUT.  The original line assigned 1 to $! (errno), which does
## nothing useful; $| is the autoflush switch and is presumably what was
## meant.
$| = 1;

## Walk the group's spool directory and feed every sufficiently recent
## article file to get_article / get_data.
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir( my $spool_dh, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir($spool_dh) ) )
{
  %lcheader = ();                           # forget the previous article
  next if $filename eq ".overview";         # real articles only
  next unless -f $filename;                 # only want real files
  next if -M $filename > $numdays;          # only want articles <= a certain age

  ## Seed the date range with the first article; get_article widens it.
  my $mtime = ( stat $filename )[9];
  $earliest = $mtime unless defined $earliest;
  $latest   = $mtime unless defined $latest;

  get_article($filename);                   # read in the article
  get_data();                               # grab the data from the article
  $totalposts++;                            # bump count of articles considered
}
closedir($spool_dh);                        # finished with the directory

## Post-processing
count_agents();    # count agents, collapsing versions
fix_percent();     # check percentages orig/total for posters

write_data();

#################### DISPLAY RESULTS #####################

## When no article was in range the counters are zero (or were never set).
## Let every average fall back to 0 instead of dying with "Illegal division
## by zero" half-way through the summary.
$totalposts = 0 unless defined $totalposts;
my $safe_div = sub
{
  my ( $dividend, $divisor ) = @_;
  return $divisor ? $dividend / $divisor : 0;
};

print "=" x 76, "\n";
printf "%s\n", centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n",
  centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";
printf "Total posts considered: %s over %d days\n", commify($totalposts),
  $numdays;

## The date range is only known once at least one article has been read.
printf "Earliest article: %s\n",
  defined $earliest ? scalar localtime($earliest) : "n/a";
printf "Latest article:   %s\n",
  defined $latest ? scalar localtime($latest) : "n/a";
printf "Original articles: %s, replies: %s\n", commify($origposts),
  commify($replies);
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
  commify( int( $safe_div->( $totalposts, $numdays ) ) ),
  $safe_div->( $totsize, $numdays ) / 1048576,
  commify( int( $safe_div->( $totsize, $totalposts ) ) );

## $count is file-scoped on purpose: the report sections below reuse it.
my $count = keys %data;
printf "Total headers: %s KiB  bodies: %s KiB\n",
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
  $safe_div->( $totorig * 100, $totorig + $totquoted ),
  commify( int( $totsig / 1024 ) );
printf "Total number of posters: %s, average %s bytes per poster\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
printf "Total number of user agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";

########################################
## Show posters by article count  Sec 1;
########################################
unless ( $skipSec{1} )
{
  ## Every poster, the one with the most articles first.
  my @by_posts = sort { $data{$b}{count} <=> $data{$a}{count} } keys %data;

  ## List at most $topposters rows, fewer when there are fewer posters.
  $count = ( @by_posts < $topposters ) ? scalar(@by_posts) : $topposters;

  my $heading = centred( "Top $count posters by number of articles", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $who (@by_posts)
  {
    $i++;
    printf "%2d: %-63s : %6d\n", $i, rpad( $who, 63, "." ),
      $data{$who}{count};
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

######################################
## Show posters by size in KiB  Sec 2;
######################################
unless ( $skipSec{2} )
{
  ## Every poster, the one with the largest total article size first.
  my @by_bytes = sort { $data{$b}{size} <=> $data{$a}{size} } keys %data;

  ## List at most $topposters rows, fewer when there are fewer posters.
  $count = ( @by_bytes < $topposters ) ? scalar(@by_bytes) : $topposters;

  my $heading = centred( "Top $count posters by article size in KiB", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $who (@by_bytes)
  {
    $i++;

    ## %6d drops the fraction, so sizes are shown in whole KiB.
    printf "%2d: %-63s : %6d\n", $i, rpad( $who, 63, "." ),
      $data{$who}{size} / 1024;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

#############################################
## Show top posters for original text  Sec 3;
#############################################
unless ( $skipSec{3} )
{
  ## Only posters who quoted something and have at least 5 articles take
  ## part in this ranking.
  ## NOTE(review): the heading says "> 5 posts" while the test admits
  ## posters with exactly 5 -- confirm which one is intended.
  my @eligible =
    grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;

  ## Highest share of original text first.
  my @ranked = sort { $data{$b}{percent} <=> $data{$a}{percent} } @eligible;

  ## Count the rows after filtering, so the heading never promises more
  ## rows than are actually listed.
  $count = ( @ranked < $topposters ) ? scalar(@ranked) : $topposters;

  printf "%s\n",
    centred( "Top $count responders by original text (> 5 posts)", 76 );
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $who (@ranked)
  {
    last if $i == $count;
    $i++;
    printf "%2d: %-63s : %02.2f%%\n", $i, rpad( $who, 63, "." ),
      $data{$who}{percent};
  }
  print "\n", "=" x 76, "\n";
}

################################################
## Show bottom posters for original text  Sec 4;
################################################
unless ( $skipSec{4} )
{
  ## Only posters who quoted something and have at least 5 articles take
  ## part in this ranking.
  ## NOTE(review): the heading says "> 5 posts" while the test admits
  ## posters with exactly 5 -- confirm which one is intended.
  my @eligible =
    grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;

  ## Lowest share of original text first.
  my @ranked = sort { $data{$a}{percent} <=> $data{$b}{percent} } @eligible;

  ## Count the rows after filtering, so the heading never promises more
  ## rows than are actually listed.
  $count = ( @ranked < $topposters ) ? scalar(@ranked) : $topposters;

  printf "%s\n",
    centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $who (@ranked)
  {
    last if $i == $count;
    $i++;
    printf "%2d: %-63s : %02.2f%%\n", $i, rpad( $who, 63, "." ),
      $data{$who}{percent};
  }
  print "\n", "=" x 76, "\n";
}

#############################################
## Show threads by number of articles  Sec 5;
#############################################
unless ( $skipSec{5} )
{
  ## Every subject, the one with the most articles first.
  my @by_posts =
    sort { $threads{$b}{count} <=> $threads{$a}{count} } keys %threads;

  ## List at most $topthreads rows, fewer when there are fewer threads.
  $count = ( @by_posts < $topthreads ) ? scalar(@by_posts) : $topthreads;

  my $heading = centred( "Top $count threads by no. of articles", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $subject (@by_posts)
  {
    $i++;
    printf "%2d: %-63s : %6d\n", $i,
      rpad( substr( $subject, 0, 65 ), 63, "." ),
      $threads{$subject}{count};
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

######################################
## Show threads by size in KiB  Sec 6;
######################################
unless ( $skipSec{6} )
{
  ## Every subject, the one with the largest total size first.
  my @by_bytes =
    sort { $threads{$b}{size} <=> $threads{$a}{size} } keys %threads;

  ## List at most $topthreads rows, fewer when there are fewer threads.
  $count = ( @by_bytes < $topthreads ) ? scalar(@by_bytes) : $topthreads;

  my $heading = centred( "Top $count threads by size in KiB", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $subject (@by_bytes)
  {
    $i++;

    ## %6d drops the fraction, so sizes are shown in whole KiB.
    printf "%2d: %-63s : %6d\n", $i,
      rpad( substr( $subject, 0, 65 ), 63, "." ),
      $threads{$subject}{size} / 1024;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

##########################################
## Show top 10 cross-posted groups  Sec 7;
##########################################
unless ( $skipSec{7} )
{
  ## The group being analysed appears in every article; leave it out.
  delete $crossposts{$newsgroup_name};

  ## Remaining groups, the most frequently named first.
  my @by_hits = sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts;

  ## List at most $topcrossposts rows, fewer when there are fewer groups.
  $count = ( @by_hits < $topcrossposts ) ? scalar(@by_hits) : $topcrossposts;

  my $heading = centred( "Top $count cross-posted groups", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $other_group (@by_hits)
  {
    $i++;
    printf "%2d: %-63s : %6d\n", $i, rpad( $other_group, 63, "." ),
      $crossposts{$other_group};
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

#################################
## Show agents and counts  Sec 8;
#################################
unless ( $skipSec{8} )
{
  ## Every agent name in %agents, the one with the most posters first.
  my @by_posters = sort { $agents{$b} <=> $agents{$a} } keys %agents;

  ## List at most $topagents rows, fewer when there are fewer agents.
  $count = ( @by_posters < $topagents ) ? scalar(@by_posters) : $topagents;

  my $heading = centred( "Top $count User Agents by poster", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $ua (@by_posters)
  {
    $i++;
    printf "%2d: %-63s : %6d\n", $i, rpad( $ua, 63, "." ), $agents{$ua};
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

###############################
## Show distinct agents  Sec 9;
###############################
unless ( $skipSec{9} )
{
  ## Every normalised agent name, the one used for the most posts first.
  my @by_posts =
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent;

  ## List at most $topagents rows, fewer when there are fewer agents.
  $count = ( @by_posts < $topagents ) ? scalar(@by_posts) : $topagents;

  my $heading = centred( "Top $count User Agents by number of posts", 76 );
  print $heading, "\n";
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $ua (@by_posts)
  {
    $i++;
    my $posts = $distinct_agent{$ua};

    ## Last column: this agent's share of all posts considered.
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i, rpad( $ua, 58, "." ), $posts,
      ( $posts / $totalposts ) * 100;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

#####################################
## Show timezones and counts  Sec 10;
#####################################
unless ( $skipSec{10} )
{
  ## Every time zone seen, the most common first.
  my @by_posts = sort { $tz{$b} <=> $tz{$a} } keys %tz;

  ## List at most $toptz rows, fewer when there are fewer zones.
  $count = ( @by_posts < $toptz ) ? scalar(@by_posts) : $toptz;

  ## The heading reports the number of rows actually listed, like every
  ## other section; it used to say "Top 10" whatever $toptz or the data were.
  printf "%s\n", centred( "Top $count time zones", 76 );
  print "=" x 76, "\n";

  $i = 0;    # rows printed so far
  for my $zone (@by_posts)
  {
    $i++;
    printf "%2d: %-63s : %6d\n", $i, rpad( $zone, 63, "." ), $tz{$zone};
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}

################################ SUBROUTINES ################################

########################################
## Get current article's header and body
########################################
## get_article($filename)
##
## Reads one article file from the current directory.  Fills the file-level
## %headers (original header names, MIME-decoded values), %lcheader
## (lowercased names, raw trimmed values) and @body (the remaining lines),
## sets $filesize, and updates $totsize, $totheader, $earliest and $latest.
## The caller must have seeded $earliest/$latest and emptied %lcheader.
## Dies if the file cannot be opened.
sub get_article
{
  %headers = ();    # dump old headers
  my $filename = shift;    # get the name of the file

  ## get stats about the file itself.  stat gives 0 for an empty file,
  ## where -s would give an empty string.
  my ( $size, $mtime ) = ( stat $filename )[ 7, 9 ];
  $filesize = $size;         # total size of file
  $totsize += $filesize;     # bump total sizes of all files

  if ( $mtime < $earliest )
  {
    $earliest = $mtime;
  }
  elsif ( $mtime > $latest )
  {
    $latest = $mtime;
  }

  ## now read the file
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";

  my $field;    # name of the header field being read
  my $raw;      # its raw value, including folded continuation lines
  while ( my $line = <$FILE> )
  {
    $totheader += length($line);    # bump total header size
    last if $line =~ /^\s*$/;       # end of header?

    if ( $line =~ /^([^:\s]*):\s*(.*)/ )
    {
      ( $field, $raw ) = ( $1, $2 );    # a new header field starts here
    }
    elsif ( defined $field and $line =~ /^\s+(\S.*)/ )
    {
      ## A line starting with whitespace continues the previous field
      ## (header folding); these lines used to be dropped, truncating long
      ## Subject, Newsgroups or User-Agent values.
      $raw .= " $1";
    }
    else
    {
      next;                             # not a header line we understand
    }

    ## (Re)store the field; a later field of the same name replaces it.
    $headers{$field} = decode( 'MIME-Header', $raw );
    $lcheader{ clean( lc($field) ) } = clean($raw);
  }
  @body = <$FILE>;                  # slurp up body
  close($FILE);
}    # get_article

####################################
## Get data from the current article
####################################
## get_data()
##
## Folds the article currently held in %headers, %lcheader, @body and
## $filesize into the running statistics: per-poster data (%data), agents
## (%distinct_agent via get_agent), cross-posted groups (%crossposts),
## threads (%threads), original/reply counts, time zones (%tz) and the
## body size totals.  Takes no arguments and returns nothing useful.
sub get_data
{
#### First, analyse header fields ####

  ## Set up this poster if not defined, get counts, sizes.  A missing From
  ## header is filed under the empty name.
  my $poster = defined $headers{From} ? $headers{From} : '';

  # Convert old "address (Name)" format to "Name <address>"
  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;

  # Collapse whitespace -- every run, not just the first one
  $poster =~ s/\s+/ /g;

  # Remove outer quotes
  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;

  if ( !defined( $data{$poster} ) )
  {                                        # first article by this poster
    $data{$poster}{agent}  = 'Unknown';
    $data{$poster}{orig}   = 0;
    $data{$poster}{quoted} = 0;
  }
  $data{$poster}{count}++;                 # bump count for this poster
  $data{$poster}{size} += $filesize;       # total size of file

  ## The User-Agent and/or X-Newsreader fields
  ## for User-Agent by poster (X-Newsreader wins when both are present)
  if ( defined $lcheader{"user-agent"} )
  {
    $data{$poster}{agent} = $lcheader{"user-agent"};
  }
  if ( defined $lcheader{"x-newsreader"} )
  {
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
  }

  ## The User Agent for User-Agent by number of posts.  The first header
  ## that identifies the software wins; the chain only depends on the
  ## article, so it is evaluated once (not once per header).
  my $UA = "unknown";
  if ( defined $lcheader{'user-agent'} )
  {
    $UA = $lcheader{'user-agent'};
  }
  elsif ( defined $lcheader{"x-newsreader"} )
  {
    $UA = $lcheader{"x-newsreader"};
  }
  elsif ( defined $lcheader{'x-mailer'} )
  {
    $UA = $lcheader{'x-mailer'};
  }
  elsif (
    ( defined $lcheader{'organization'} )
    && ( $lcheader{'organization'} =~
      /groups\.google|AOL|Supernews|WebTV|compuserve/ )
    )
  {
    $UA = $lcheader{'organization'};
  }
  elsif ( defined $lcheader{'message-id'}
    and $lcheader{'message-id'} =~ /pine/i )
  {
    $UA = "Pine";
  }    ## Hopefully found UA, else left as unknown

  ## Called for its side effect: it counts the post in %distinct_agent.
  get_agent( clean($UA) );

  ## Get all cross-posted newsgroups
  my $newsgroups = defined $headers{"Newsgroups"} ? $headers{"Newsgroups"} : '';
  for my $listed ( split /,/, $newsgroups )
  {
    my $ng = clean($listed);    # "a.b, c.d" must not yield " c.d"
    next if $ng eq '';
    $crossposts{$ng}++;         # bump count for each
  }

  ## Get threads
  my $thread = defined $headers{"Subject"} ? $headers{"Subject"} : '';
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
  $thread =~ s/\s+/ /g;     # collapse whitespace
  $threads{$thread}{count} += 1;            # bump count of this subject
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread

  ## Is this an original post or a reply?
  if ( defined $headers{"References"} )
  {
    $replies++;
  }
  else
  {
    $origposts++;
  }

  ## Get the time zone: whatever follows the hh:mm[:ss] part of the Date
  ## header.  A lexical is used so that the caller's $_ is left alone.
  my $date = defined $headers{"Date"} ? $headers{"Date"} : '';
  my ($zone) = $date =~ /\d\d:\d\d(?::\d\d)?\s+(.*)/;
  $zone = '' unless defined $zone;
  if ( $zone =~ /UTC|GMT|0000/ )
  {
    $zone = "UTC";
  }
  $tz{$zone}++;

#### Now analyse the body text ####
  my $insig = 0;    # set once the signature separator has been seen
  for my $line (@body)
  {
    my $len = length($line);
    $totbody += $len;                # bump total body size

    ## Don't count blank lines in body.  The old pattern /^$>/ interpolated
    ## $> (the effective user id), so it never matched a blank line.
    next if $line =~ /^\s*$/;

    if ( $insig == 1 )
    {
      $totsig += $len;               # bump total sig size
    }
    ## Bill Unruh uses ] quotes, and another poster uses ::
    elsif ( $line =~ /^\s*[>\]]/ or $line =~ /^\s*::/ )
    {                                # are we in a quote line?
      $data{$poster}{quoted} += $len;    # bump count of quoted chrs
      $totquoted += $len;
    }
    elsif ( $line =~ /^-- \r?$/ )
    {
      ## Signature separator: "-- " alone on a line.  Matching "-- "
      ## anywhere turned ordinary text such as "a -- b" into the start of
      ## the signature.
      $insig = 1;
    }
    else
    {

      ## We must be processing an original line
      $data{$poster}{orig} += $len;      # bump count of original chrs
      $totorig += $len;
    }
  }    # end for body lines

}    # get_data

## get_agent($raw)
##
## Reduces a raw User-Agent style string to a short agent name, counts one
## post for that name in %distinct_agent and returns the name.  A few
## popular agents are recognised by name; anything else is cut at the first
## "/" or, failing that, before its first digit.  A string that fits none
## of these is returned unchanged.
sub get_agent
{
  my $raw   = shift;
  my $agent = $raw;

  ## strip http
  if ( $raw =~ /http/ )
  {
    $raw =~ s!posted via!!i;
    $raw =~ s!http://!!g;
    $raw =~ s!/!!g;
    $raw =~ s! !!g;
  }

  ## Fix Outlook from Mac
  if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }

  ## Pick out the popular agents
  if (
         $raw =~ /(outlook express)/i
      || $raw =~ /(windows mail)/i
      || $raw =~ /(microplanet gravity)/i
      || $raw =~ /(news rover)/i
      || $raw =~ /(forte agent)/i
      || $raw =~ /(forte free agent)/i
    )
  {
    $agent = $1;
  }
  elsif (
    $raw =~ /^(
      pan
     |sylpheed
     |slrn
     |mozilla
     |knode
     |tin
     |hamster
     |xrn
     |xnews
     |aol
     |gnus
     |krn
     |macsoup
     |messenger
     |openxp
     |pine
     |thoth
     |turnpike
     |winvn
     |vsoup
     |google
     |supernews
     |nn
     |rn
     |007
     |webtv
     |compuserve
     )/ix
    )
  {
    $agent = $1;
  }
  else
  {
    ## Clean up unknown agents
    if ( $raw =~ m!^(.*?)/! )
    {
      $agent = $1;    # "Name/1.2.3" -> "Name"
    }
    elsif ( $raw =~ /^(\w*)\d.*/ )
    {
      $agent = $1;    # drop a trailing version number
    }
  }

  $distinct_agent{$agent}++;
  return $agent;
}    # get_agent

#########################################
## Count the User-Agents used, collapsing
## different versions into one per agent.
#########################################
## count_agents()
##
## Counts, per poster, which agent the poster uses.  The poster's raw agent
## string is matched against the agent names collected in %distinct_agent;
## the first name found inside it gets the count in %agents.  A string
## that contains none of the names is counted under itself.
sub count_agents
{
  ## Try the longest names first so that the most specific one wins
  ## ("Turnpike" before the "rn" it contains), and break ties by name so
  ## that the outcome no longer depends on hash ordering.  An empty name
  ## would match every poster, so it is left out.
  my @known_names =
    sort { length($b) <=> length($a) or $a cmp $b }
    grep { length($_) } keys %distinct_agent;

POSTER:
  foreach my $poster ( keys %data )
  {
    my $used = $data{$poster}{agent};
    foreach my $agent_name (@known_names)
    {    # check against known ones
      if ( $used =~ /\Q$agent_name\E/ )
      {
        $agents{$agent_name}++;
        next POSTER;
      }
    }
    $agents{$used}++;    # nothing known matched
  }
}    # count_agents

#############################################
## Set orig/total percentages for all posters
#############################################
## fix_percent()
##
## Stores in every poster's record the share of original text in what the
## poster wrote: orig * 100 / (orig + quoted).  Nothing original gives 0,
## nothing quoted (but something original) gives 100.
sub fix_percent
{
  for my $stats ( values %data )
  {
    my ( $own, $cited ) = @{$stats}{qw(orig quoted)};
    $stats->{percent} =
        $own == 0   ? 0
      : $cited == 0 ? 100
      :               $own * 100 / ( $cited + $own );
  }
}

###############################
## Right pad a string with '.'s
###############################
## rpad($text, $pad_len, $pad_chr)
##
## Returns $text made exactly $pad_len characters long: longer text is cut
## off on the right, shorter text is filled up on the right with $pad_chr.
sub rpad
{
  my ( $text, $pad_len, $pad_chr ) = @_;

  ## Cut first, so the fill count below can never be negative.
  my $fitted =
    length($text) > $pad_len ? substr( $text, 0, $pad_len ) : $text;
  my $missing = $pad_len - length($fitted);

  return $fitted . ( $pad_chr x $missing );
}

##################
## Centre a string
##################
## centred($text, $width)
##
## Returns $text with enough leading spaces to centre it in a field of
## $width characters.  Nothing is added on the right, and text wider than
## the field is returned unchanged.
sub centred
{
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in

  ## Whole number of spaces, never negative: a negative repeat count only
  ## produces a warning when the text is wider than the field.
  my $pad_len = int( ( $width - length($text) ) / 2 );
  $pad_len = 0 if $pad_len < 0;

  return ( " " x $pad_len ) . $text;
}

###########################
## Put commas into a number
###########################
## commify($number)
##
## Returns $number with a comma between every group of three digits of its
## integer part; an optional leading sign is kept.
sub commify
{
  my $number = shift;

  ## Each pass splits the last three digits off the leading digit run;
  ## repeat until no run of four or more digits is left at the front.
  while ( $number =~ s/^([-+]?\d+)(\d{3})/$1,$2/ )
  {
  }
  return $number;
}

################################################################
## Returns a string with leading and trailing whitespace removed
################################################################
## clean($string)
##
## Returns a copy of $string without its leading and trailing whitespace;
## whitespace inside the string is kept.
sub clean
{
  my $trimmed = shift;
  $trimmed =~ s/^\s+//;    # leading
  $trimmed =~ s/\s+$//;    # trailing
  return $trimmed;
}

## usage()
##
## Prints the command-line synopsis on STDERR and exits with status 1.
## Called when no newsgroup name was given.
sub usage
{
  print STDERR "usage: newsstat.pl newsgroupname [-x N[,N...]]\n";
  exit 1;
}

##################################
## Write data structures to a file
##################################
## write_data()
##
## Dumps the collected data structures (%data, %threads, %crossposts,
## %agents, %tz) as UTF-8 text to /tmp/XDATA, one record per line with
## " : " between fields.  Dies if the file cannot be created or written.
sub write_data
{
  ## NOTE(review): fixed, predictable file name in a shared directory --
  ## consider making the path configurable.
  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
    or die "Can't create XDATA: $!\n";

  my $heavy_rule =
"============================================================================\n";
  my $light_rule =
"----------------------------------------------------------------------------\n";

  print {$OUTF} "Data collected from $newsgroup_name\n\n";
  print {$OUTF}
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";

  ## Sorted like the other lists below, so two runs over the same spool
  ## give the same file.
  foreach my $name ( sort keys %data )
  {
    my $p = $data{$name};
    print {$OUTF}
"$name : $p->{agent} : $p->{count} : $p->{size} : $p->{orig} : $p->{quoted} : $p->{percent}\n";
  }

  print {$OUTF} $heavy_rule;
  print {$OUTF} "Thread subjects\n";
  print {$OUTF} $light_rule;
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
  {
    print {$OUTF}
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
  }

  print {$OUTF} $heavy_rule;
  print {$OUTF} "Cross-posts\n";
  print {$OUTF} $light_rule;
  foreach my $name ( sort keys %crossposts )
  {
    print {$OUTF} "$name : $crossposts{$name}\n";
  }

  print {$OUTF} $heavy_rule;
  print {$OUTF} "User agents\n";
  print {$OUTF} $light_rule;
  foreach my $name ( sort keys %agents )
  {
    print {$OUTF} "$name : $agents{$name}\n";
  }

  print {$OUTF} $heavy_rule;
  print {$OUTF} "Time zones\n";
  print {$OUTF} $light_rule;
  foreach my $name ( sort keys %tz )
  {
    print {$OUTF} "$name : $tz{$name}\n";
  }

  ## Buffered write errors (disk full, ...) only show up at close.
  close $OUTF or die "Can't close XDATA: $!\n";
}    # write_data
 

Rev 13	Rev 14
1	#!/usr/bin/env perl	1	#!/usr/bin/env perl
2	use strict;	2	use strict;
3	use warnings;	3	use warnings;
4	use diagnostics;	4	use diagnostics;
5	use utf8;	5	use utf8;
6	use Encode;	6	use Encode;
7		7
8	## Print out all text to STDOUT UTF-8 encoded	8	## Print out all text to STDOUT UTF-8 encoded
9	binmode STDOUT, ':encoding(UTF-8)';	9	binmode STDOUT, ':encoding(UTF-8)';
10		10
11	############################	11	##############################
12	## newsstat.pl version 0.4.3	12	## newsstat.pl version 0.4.3.1
13		13
14	###########################################################################	14	###########################################################################
15	## Collect statistics about a newsgroup (specified by first argument)	15	## Collect statistics about a newsgroup (specified by first argument)
16	## in the local news spool. Check all articles in the last 30-day period.	16	## in the local news spool. Check all articles in the last 30-day period.
17	## Rank posters by number of posts and by volume of posts, report on top	17	## Rank posters by number of posts and by volume of posts, report on top
18	## and bottom 20 posters. Show their name, number of posts, size of posts,	18	## and bottom 20 posters. Show their name, number of posts, size of posts,
19	## percentage of quoted lines. Rank user-agents used, by poster rather	19	## percentage of quoted lines. Rank user-agents used, by poster rather
20	## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.	20	## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
21	##	21	##
22	## Numbers and paths can be configured below. See ChangeLog and TODO	22	## Numbers and paths can be configured below. See ChangeLog and TODO
23	## for more. -- PE	23	## for more. -- PE
24	###########################################################################	24	###########################################################################
25		25
26	###################### USER CONFIGURATIONS ############################	26	###################### USER CONFIGURATIONS ############################
27		27
28	## The name of the group to do stats for	28	## The name of the group to do stats for
29	my $newsgroup_name = $ARGV[0];	29	my $newsgroup_name = $ARGV[0];
30	$newsgroup_name or &usage;	30	$newsgroup_name or &usage;
31		31
32	## Check for removal flags	32	## Check for removal flags
33	my $ix;	33	my $ix;
34	my $j;	34	my $j;
35	my %skipSec;	35	my %skipSec;
36	my @skiplist;	36	my @skiplist;
37	my $args = @ARGV;	37	my $args = @ARGV;
38	for ( $ix = 1 ; $ix < $args ; $ix++ )	38	for ( $ix = 1 ; $ix < $args ; $ix++ )
39	{	39	{
40	$j = $ix + 1;	40	$j = $ix + 1;
41	if ( $ARGV[$ix] eq "-x" )	41	if ( $ARGV[$ix] eq "-x" )
42	{	42	{
43	@skiplist = split( ",", $ARGV[$j] );	43	@skiplist = split( ",", $ARGV[$j] );
44	}	44	}
45	elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )	45	elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
46	{	46	{
47	@skiplist = split( ",", $1 );	47	@skiplist = split( ",", $1 );
48	}	48	}
49	}	49	}
50	foreach (@skiplist)	50	foreach (@skiplist)
51	{	51	{
52	$skipSec{$_} = 1;	52	$skipSec{$_} = 1;
53	}	53	}
54		54
55	## Leafnode users will want /var/spool/news for this variable.	55	## Leafnode users will want /var/spool/news for this variable.
56	my $news = "/var/spool/news/";	56	my $news = "/var/spool/news/";
57		57
58	## How many days are we doing statistics for?	58	## How many days are we doing statistics for?
59	my $numdays = 30;	59	my $numdays = 30;
60		60
61	## Number of agents we list	61	## Number of agents we list
62	my $topagents = 10;	62	my $topagents = 10;
63		63
64	## Number of threads we want to know about	64	## Number of threads we want to know about
65	my $topthreads = 20;	65	my $topthreads = 20;
66		66
67	## Number of top or bottom posters to show	67	## Number of top or bottom posters to show
68	my $topposters = 20;	68	my $topposters = 20;
69		69
70	## Number of cross-posted threads to show	70	## Number of cross-posted threads to show
71	my $topcrossposts = 10;	71	my $topcrossposts = 10;
72		72
73	## Number of time zones to show	73	## Number of time zones to show
74	my $toptz = 10;	74	my $toptz = 10;
75		75
76	###################### DATA STRUCTURES ######################	76	###################### DATA STRUCTURES ######################
77	my $group = $newsgroup_name;	77	my $group = $newsgroup_name;
78	$group =~ s!\.!/!g;	78	$group =~ s!\.!/!g;
79	my %data; # name, count, agent, total, orig, quoted	79	my %data; # name, count, agent, total, orig, quoted
80	my %threads; # subject, count	80	my %threads; # subject, count
81	my %crossposts; # group, count	81	my %crossposts; # group, count
82	my %tz; # timezones by count	82	my %tz; # timezones by count
83	my %headers; # holds header of current article	83	my %headers; # holds header of current article
84	my %lcheader; # holds lowercase headers	84	my %lcheader; # holds lowercase headers
85	my @body; # holds body of current article	85	my @body; # holds body of current article
86	my @sig; # holds sig text;	86	my @sig; # holds sig text;
87	my $totalposts; # total no. of posts considered	87	my $totalposts; # total no. of posts considered
88	my $filename; # name of current article file	88	my $filename; # name of current article file
89	my $filesize; # size of current article file	89	my $filesize; # size of current article file
90	my $earliest; # earliest article we have found	90	my $earliest; # earliest article we have found
91	my $latest; # latest article we have found	91	my $latest; # latest article we have found
92	my $poster; # poster we are dealing with	92	my $poster; # poster we are dealing with
93	my $totsize = 0; # holds total sizes of all files	93	my $totsize = 0; # holds total sizes of all files
94	my $totheader = 0; # total size of header material	94	my $totheader = 0; # total size of header material
95	my $totbody = 0; # total size of body material	95	my $totbody = 0; # total size of body material
96	my $totsig = 0; # total size of sig material	96	my $totsig = 0; # total size of sig material
97	my $totorig = 0; # total size of original material	97	my $totorig = 0; # total size of original material
98	my $totquoted = 0; # total size of quoted material	98	my $totquoted = 0; # total size of quoted material
99	my $origposts = 0; # total no. of original posts	99	my $origposts = 0; # total no. of original posts
100	my $replies = 0; # total no. of replies	100	my $replies = 0; # total no. of replies
101	my $i; # general purpose	101	my $i; # general purpose
102	my %distinct_agent;	102	my %distinct_agent;
103		103
104	## Used to hold counts of User Agents used	104	## Used to hold counts of User Agents used
105	my %agents = (	105	my %agents = (
106	"Compuserver" => 0,	106	"Compuserver" => 0,
107	"Foorum" => 0,	107	"Foorum" => 0,
108	"Forte Agent" => 0,	108	"Forte Agent" => 0,
109	"Forte Free Agent" => 0,	109	"Forte Free Agent" => 0,
110	"Gnus" => 0,	110	"Gnus" => 0,
111	"KNode" => 0,	111	"KNode" => 0,
112	"MacSOUP" => 0,	112	"MacSOUP" => 0,
113	"MT-NewsWatcher" => 0,	113	"MT-NewsWatcher" => 0,
114	"MicroPlanet Gravity" => 0,	114	"MicroPlanet Gravity" => 0,
115	"Microsoft Outlook Express" => 0,	115	"Microsoft Outlook Express" => 0,
116	"Microsoft Windows Mail" => 0,	116	"Microsoft Windows Mail" => 0,
117	"Mozilla" => 0,	117	"Mozilla" => 0,
118	"News Rover" => 0,	118	"News Rover" => 0,
119	"NN" => 0,	119	"NN" => 0,
120	"Pan" => 0,	120	"Pan" => 0,
121	"rn" => 0,	121	"rn" => 0,
122	"slrn" => 0,	122	"slrn" => 0,
123	"Sylpheed" => 0,	123	"Sylpheed" => 0,
124	"tin" => 0,	124	"tin" => 0,
125	"VSoup" => 0,	125	"VSoup" => 0,
126	"WebTV" => 0,	126	"WebTV" => 0,
127	"Xnews" => 0	127	"Xnews" => 0
128	);	128	);
129		129
130	######################## MAIN CODE ########################	130	######################## MAIN CODE ########################
## Unbuffer STDOUT so each report line is written as soon as it is printed.
## ($| is the autoflush switch; $! is errno and must not be assigned here.)
$| = 1;

## Walk the group's spool directory and feed every sufficiently recent
## article file through get_article() and get_data().
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir( my $spool_dh, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir($spool_dh) ) )
{
  %lcheader = ();                         # reset before looking at this entry
  next unless -f $filename;               # only want real files
  next if ( $filename eq ".overview" );   # real articles only
  next if ( -M $filename > $numdays );    # only want articles <= a certain age

  ## Seed the earliest/latest timestamps from the first article considered;
  ## one stat() call serves both.
  my $mtime = ( stat $filename )[9];
  $earliest = $mtime unless defined($earliest);
  $latest   = $mtime unless defined($latest);

  get_article($filename);                 # read in the article
  get_data();                             # grab the data from the article
  $totalposts++;                          # bump count of articles considered
}
closedir($spool_dh);                      # finished with the directory

## Post-processing
count_agents();    # count agents, collapsing versions
fix_percent();     # check percentages orig/total for posters

write_data();
154		154
155	#################### DISPLAY RESULTS #####################	155	#################### DISPLAY RESULTS #####################
## Report banner followed by the overall totals for the period.
##
## When no article qualified (empty group, or nothing newer than $numdays)
## the counters used as divisors are zero and the timestamps are undefined.
## Perl aborts on division by zero, so every ratio goes through
## $ratio_or_zero and the dates fall back to "n/a".
my $ratio_or_zero = sub
{
  my ( $numerator, $denominator ) = @_;
  return $denominator ? $numerator / $denominator : 0;
};

print "=" x 76, "\n";
printf "%s\n", centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n",
  centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";
printf "Total posts considered: %s over %d days\n", commify($totalposts),
  $numdays;
printf "Earliest article: %s\n",
  defined($earliest) ? scalar( localtime($earliest) ) : "n/a";
printf "Latest article: %s\n",
  defined($latest) ? scalar( localtime($latest) ) : "n/a";
printf "Original articles: %s, replies: %s\n", commify($origposts),
  commify($replies);
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
  commify( int( $ratio_or_zero->( $totalposts, $numdays ) ) ),
  $ratio_or_zero->( $totsize, $numdays ) / 1048576,
  commify( int( $ratio_or_zero->( $totsize, $totalposts ) ) );

## $count is reused by every report section further down, so it is
## declared here at file scope.
my $count = keys %data;
printf "Total headers: %s KiB bodies: %s KiB\n",
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n",
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
  $ratio_or_zero->( $totorig * 100, $totorig + $totquoted ),
  commify( int( $totsig / 1024 ) );
printf "Total number of posters: %s, average %s bytes per poster\n",
  commify($count), commify( int( $ratio_or_zero->( $totsize, $count ) ) );
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n",
  commify($count), commify( int( $ratio_or_zero->( $totsize, $count ) ) );
printf "Total number of user agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
187		187
188	########################################	188	########################################
189	## Show posters by article count Sec 1;	189	## Show posters by article count Sec 1;
190	########################################	190	########################################
## Section 1: posters ranked by number of articles, most prolific first.
## Omitted entirely when section 1 is listed in %skipSec.
unless ( $skipSec{1} )
{
  ## List at most $topposters rows, fewer when there are fewer posters
  $count = ( keys %data < $topposters ) ? scalar( keys %data ) : $topposters;

  printf "%s\n", centred( "Top $count posters by number of articles", 76 );
  print "=" x 76, "\n";
  $i = 0;
  foreach
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
  {
    ## rpad() is handed the full poster name
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
      $data{$poster}{count};
    last if ( ++$i == $count );
  }
  print "\n", "=" x 76, "\n";
}
214		214
215	######################################	215	######################################
216	## Show posters by size in KiB Sec 2;	216	## Show posters by size in KiB Sec 2;
217	######################################	217	######################################
## Section 2: posters ranked by the total size of their articles, in KiB.
## Omitted entirely when section 2 is listed in %skipSec.
unless ( $skipSec{2} )
{
  ## List at most $topposters rows, fewer when there are fewer posters
  $count = ( keys %data < $topposters ) ? scalar( keys %data ) : $topposters;

  printf "%s\n", centred( "Top $count posters by article size in KiB", 76 );
  print "=" x 76, "\n";
  $i = 0;
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
  {
    ## Sizes are kept in bytes; %6d truncates the KiB value to an integer
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
      $data{$poster}{size} / 1024;
    last if ( ++$i == $count );
  }
  print "\n", "=" x 76, "\n";
}
240		240
241	#####################################	241	#####################################
242	## Show top posters for original text	242	## Show top posters for original text
243	#####################################	243	#####################################
## Section 3: posters with the highest percentage of original text.
## Omitted entirely when section 3 is listed in %skipSec.
unless ( $skipSec{3} )
{
  ## List at most $topposters rows, fewer when there are fewer posters
  $count = ( keys %data < $topposters ) ? scalar( keys %data ) : $topposters;

  printf "%s\n",
    centred( "Top $count responders by original text (> 5 posts)", 76 );
  print "=" x 76, "\n";
  $i = 0;
  foreach my $poster (
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
    keys %data
    )
  {
    ## Only posters who quoted something are ranked here
    next if $data{$poster}{quoted} == 0;

    ## NOTE(review): the heading says "> 5 posts", but this test lets
    ## posters with exactly 5 posts through -- confirm which is intended.
    next if $data{$poster}{count} < 5;

    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
      $data{$poster}{percent};
    last if ( ++$i == $count );
  }
  print "\n", "=" x 76, "\n";
}
272		272
273	########################################	273	########################################
274	## Show bottom posters for original text	274	## Show bottom posters for original text
275	########################################	275	########################################
## Section 4: posters with the lowest percentage of original text.
## Omitted entirely when section 4 is listed in %skipSec.
unless ( $skipSec{4} )
{
  ## List at most $topposters rows, fewer when there are fewer posters
  $count = ( keys %data < $topposters ) ? scalar( keys %data ) : $topposters;

  printf "%s\n",
    centred( "Bottom $count responders by original text (> 5 posts)", 76 );
  print "=" x 76, "\n";
  $i = 0;
  foreach my $poster (
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
    keys %data
    )
  {
    ## Only posters who quoted something are ranked here
    next if $data{$poster}{quoted} == 0;

    ## NOTE(review): the heading says "> 5 posts", but this test lets
    ## posters with exactly 5 posts through -- confirm which is intended.
    next if $data{$poster}{count} < 5;

    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
      $data{$poster}{percent};
    last if ( ++$i == $count );
  }
  print "\n", "=" x 76, "\n";
}
304		304
305	#####################################	305	#####################################
306	## Show threads by number of articles	306	## Show threads by number of articles
307	#####################################	307	#####################################
## Section 5: threads ranked by how many articles they contain.
## Omitted entirely when section 5 is listed in %skipSec.
unless ( $skipSec{5} )
{
  ## Start from the configured limit and shrink it if there are fewer threads
  $count = $topthreads;
  $count = keys %threads if keys %threads < $topthreads;

  printf "%s\n", centred( "Top $count threads by no. of articles", 76 );
  print "=" x 76, "\n";

  my @by_articles =
    sort { $threads{$b}{count} <=> $threads{$a}{count} } keys %threads;
  $i = 0;
  for my $thread_key (@by_articles)
  {
    ## Thread names are cut to 65 characters before being handed to rpad()
    my $label = rpad( substr( $thread_key, 0, 65 ), 63, "." );
    printf "%2d: %-63s : %6d\n", $i + 1, $label, $threads{$thread_key}{count};
    $i++;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}
333		333
334	##############################	334	##############################
335	## Show threads by size in KiB	335	## Show threads by size in KiB
336	##############################	336	##############################
## Section 6: threads ranked by their total size, shown in KiB.
## Omitted entirely when section 6 is listed in %skipSec.
unless ( $skipSec{6} )
{
  ## Start from the configured limit and shrink it if there are fewer threads
  $count = $topthreads;
  $count = keys %threads if keys %threads < $topthreads;

  printf "%s\n", centred( "Top $count threads by size in KiB", 76 );
  print "=" x 76, "\n";

  my @by_size =
    sort { $threads{$b}{size} <=> $threads{$a}{size} } keys %threads;
  $i = 0;
  for my $thread_key (@by_size)
  {
    ## Thread names are cut to 65 characters before being handed to rpad()
    my $label = rpad( substr( $thread_key, 0, 65 ), 63, "." );
    my $kib   = $threads{$thread_key}{size} / 1024;
    printf "%2d: %-63s : %6d\n", $i + 1, $label, $kib;
    $i++;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}
362		362
363	##################################	363	##################################
364	## Show top 10 cross-posted groups	364	## Show top 10 cross-posted groups
365	##################################	365	##################################
## Section 7: the groups most often cross-posted to.
## Omitted entirely when section 7 is listed in %skipSec.
unless ( $skipSec{7} )
{
  ## The group being analysed is removed so it does not rank itself
  delete $crossposts{$newsgroup_name};

  $count = $topcrossposts;
  $count = keys %crossposts if keys %crossposts < $topcrossposts;

  printf "%s\n", centred( "Top $count cross-posted groups", 76 );
  print "=" x 76, "\n";

  my @by_hits = sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts;
  $i = 0;
  for my $group_name (@by_hits)
  {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $group_name, 63, "." ),
      $crossposts{$group_name};
    $i++;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}
389		389
390	#########################	390	#########################
391	## Show agents and counts	391	## Show agents and counts
392	#########################	392	#########################
## Section 8: user agents ranked by the counts held in %agents.
## Omitted entirely when section 8 is listed in %skipSec.
unless ( $skipSec{8} )
{
  ## Start from the configured limit and shrink it if there are fewer agents
  $count = $topagents;
  $count = keys %agents if keys %agents < $topagents;

  printf "%s\n", centred( "Top $count User Agents by poster", 76 );
  print "=" x 76, "\n";

  my @by_users = sort { $agents{$b} <=> $agents{$a} } keys %agents;
  $i = 0;
  for my $agent_name (@by_users)
  {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent_name, 63, "." ),
      $agents{$agent_name};
    $i++;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}
414		414
415	#######################	415	#######################
416	## Show distinct agents	416	## Show distinct agents
417	#######################	417	#######################
## Section 9: user agents ranked by the counts held in %distinct_agent,
## each with its share of all posts considered.
## Omitted entirely when section 9 is listed in %skipSec.
unless ( $skipSec{9} )
{
  ## Start from the configured limit and shrink it if there are fewer agents
  $count = $topagents;
  $count = keys %distinct_agent if keys %distinct_agent < $topagents;

  printf "%s\n", centred( "Top $count User Agents by number of posts", 76 );
  print "=" x 76, "\n";

  my @by_posts =
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent;
  $i = 0;
  for my $agent_name (@by_posts)
  {
    my $posts = $distinct_agent{$agent_name};
    my $share = ( $posts / $totalposts ) * 100;    # percentage of all posts
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1,
      rpad( $agent_name, 58, "." ), $posts, $share;
    $i++;
    last if $i == $count;
  }
  print "\n", "=" x 76, "\n";
}
443		443
444	############################	444	############################
445	## Show timezones and counts	445	## Show timezones and counts
446	############################	446	############################
## Section 10: time zones ranked by the counts held in %tz.
## Omitted entirely when section 10 is listed in %skipSec.
unless ( $skipSec{10} )
{
  ## List at most $toptz rows, fewer when fewer zones were seen
  $count = ( keys %tz < $toptz ) ? scalar( keys %tz ) : $toptz;

  ## The heading reports the number of rows actually listed, like every
  ## other section, instead of a fixed "10".
  printf "%s\n", centred( "Top $count time zones", 76 );
  print "=" x 76, "\n";
  $i = 0;
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
  {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
    last if ( ++$i == $count );
  }
  print "\n", "=" x 76, "\n";
}
467		467
468	################################ SUBROUTINES ################################	468	################################ SUBROUTINES ################################
469		469
470	########################################	470	########################################
471	## Get current article's header and body	471	## Get current article's header and body
472	########################################	472	########################################
473	sub get_article	473	sub get_article
474	{	474	{
475	%headers = (); # dump old headers	475	%headers = (); # dump old headers
476	my $filename = shift; # get the name of the file	476	my $filename = shift; # get the name of the file
477		477
478	## get stats about the file itself	478	## get stats about the file itself
479	$filesize = -s $filename; # get total size of file	479	$filesize = -s $filename; # get total size of file
480	$totsize += $filesize; # bump total sizes of all files	480	$totsize += $filesize; # bump total sizes of all files
481		481
482	my $mtime = ( stat $filename )[9];	482	my $mtime = ( stat $filename )[9];
483	if ( $mtime < $earliest )	483	if ( $mtime < $earliest )
484	{	484	{
485	$earliest = $mtime;	485	$earliest = $mtime;
486	}	486	}
487	elsif ( $mtime > $latest )	487	elsif ( $mtime > $latest )
488	{	488	{
489	$latest = $mtime;	489	$latest = $mtime;
490	}	490	}
491		491
492	## now read the file	492	## now read the file
493	open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";	493	open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
494	while (<$FILE>)	494	while (<$FILE>)
495	{	495	{
496	$totheader += length($_); # bump total header size	496	$totheader += length($_); # bump total header size
497	last if (/^\s*$/); # end of header?	497	last if (/^\s*$/); # end of header?
498	if (/^([^:\s]):\s(.*)/)	498	if (/^([^:\s]):\s(.*)/)
499	{	499	{
500	my ( $key, $val ) = ( $1, $2 );	500	my ( $key, $val ) = ( $1, $2 );

Subversion Repositories LCARS

(root)/trunk/tools/network/news/newsstat/newsstat.pl - Rev 13 → 14