WebSVN - LCARS - Diff - Rev 7 and 8 - /trunk/tools/network/news/newsstat/newsstat.pl


#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use Encode;

###########################
# newsstat.pl version 0.4.1

############################################################################
# Collect statistics about a newsgroup (specified by first argument) in
# the local news spool. Check all articles in the last 30-day period.
# Rank posters by number of posts and by volume of posts, report on top and
# bottom 20 posters. Show their name, number of posts, size of posts,
# percentage of quoted lines. Rank user-agents used, by poster rather than
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
#
# (Numbers and paths can be configured below.  -- PE)
############################################################################

############################################################################
#                       RECENT CHANGES                                     #
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
#                 - Use warnings, utf8 (just in case)
#                 - Documentation update
# N/A         NN  - Take newsgroup name as argument
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
#                 - Allow command line flags for subtracting
#                   output if not pertinent for a group
# 2002-11-09  NN  - Put Garry's writedata() function back in.
#                 - added "rn" to my list of UA's
#                 - Started using %distinct_agent for both User agent
#                   sections
#                 - named it newsstat.pl version 0.3
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
#                   mtime rather than ctime, and simplifying the logic
# 2002-11-05  NN  - moved user configurations to the top
#                 - fixed the cross-posting section
#                 - introduced the $newsgroup_name variable which
#                   later becomes $news$group
#                 - changed $name to $agent_name in countagents()
#
# Contributors
# -------------
# NN  Nomen nominandum (name to be determined later)
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>

########### TODO #############
# Commas in bottom section of report
# Show date the figures were compiled
# No. of HTML articles (Content-Type: text/html)
# No. of quoted sigs (/>\s*-- /)
# Per cent of top-posted articles
# Top 10 cross-posters
# Top 20 news posting hosts (from Path)
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
# Count *all* User Agents that each poster uses
# What do we do about Bill Unruh's ] quote style?
# Change the way dates/times are checked
# include % share in posters by no. of arts
# include % share in posters by size
# Total, orig & quoted lines by user agent with per cent
# Take more arguments
#######################################################

###################### USER CONFIGURATIONS ############################

# The name of the group to do stats for (first command-line argument).
# Without it there is nothing to do, so show the usage text and exit.
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();

# Check for removal flags.  Report sections are suppressed by number with
# either "-x 1,3" or "-x1,3"; the numbers become keys of %skipSec.  When the
# flag is given more than once, the last occurrence wins.
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ )
{
  $j = $ix + 1;
  if ( $ARGV[$ix] eq "-x" )
  {

    # A trailing "-x" has no list behind it; treat that as "skip nothing"
    # instead of splitting an undefined value.
    @skiplist = defined $ARGV[$j] ? split( ",", $ARGV[$j] ) : ();
  }
  elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ )
  {

    # Anchored so that only an argument that really starts with -x counts.
    @skiplist = split( ",", $1 );
  }
}
foreach my $section (@skiplist)
{
  $skipSec{$section} = 1;
}

# Root directory of the local news spool.  Keep the trailing slash: the
# group path is appended directly ("$news$group") before chdir().
# Leafnode users will want /var/spool/news for this variable.
my $news = "/var/spool/news/";

# How many days are we doing statistics for?  Article files older than this
# (by -M file age) are ignored, and the per-day averages divide by it.
my $numdays = 30;

# no. of agents we list (applies to both User Agent sections)
my $topagents = 10;

# no. of threads we want to know about (by article count and by size)
my $topthreads = 20;

# no. of top or bottom posters to show
my $topposters = 20;

# no. of cross-posted groups to show
my $topcrossposts = 10;

# no. of time zones to show
my $toptz = 10;

###################### DATA STRUCTURES ######################
# File-scope state shared by the main loop, the subroutines and the report.

# Spool sub-directory of the group: dots in the name become path separators.
my $group = $newsgroup_name;
$group =~ s!\.!/!g;
my %data;          # per poster: agent, count, size, orig, quoted, percent
my %threads;       # per subject: count, size
my %crossposts;    # group, count
my %tz;            # timezones by count
my %headers;       # holds header of current article
my %lcheader;      # holds lowercase headers
my @body;          # holds body of current article
my @sig;           # holds sig text;
my $totalposts = 0;    # total no. of posts considered; starts at 0 so that a
                       # run that finds no article still reports a number
my $filename;      # name of current article file
my $filesize;      # size of current article file
my $earliest;      # earliest article we have found
my $latest;        # latest article we have found
my $poster;        # poster we are dealing with
my $totsize   = 0; # holds total sizes of all files
my $totheader = 0; # total size of header material
my $totbody   = 0; # total size of body material
my $totsig    = 0; # total size of sig material
my $totorig   = 0; # total size of original material
my $totquoted = 0; # total size of quoted material
my $origposts = 0; # total no. of original posts
my $replies   = 0; # total no. of replies
my $i;             # general purpose
my %distinct_agent;    # normalised agent name => number of posts
my %agents =       # used to hold counts of User Agents used (by poster);
                   # pre-seeded so well-known agents are listed even at 0
  (
  "KNode"                     => 0,
  "Pan"                       => 0,
  "Mozilla"                   => 0,
  "Sylpheed"                  => 0,
  "Gnus"                      => 0,
  "Forte Agent"               => 0,
  "Forte Free Agent"          => 0,
  "MicroPlanet Gravity"       => 0,
  "Microsoft Outlook Express" => 0,
  "Xnews"                     => 0,
  "slrn"                      => 0,
  "tin"                       => 0,
  "rn"                        => 0,
  "NN"                        => 0,
  "MacSOUP"                   => 0,
  "Foorum"                    => 0,
  "MT-NewsWatcher"            => 0,
  "News Rover"                => 0,
  "WebTV"                     => 0,
  "Compuserver"               => 0,
  "VSoup"                     => 0
  );

######################## MAIN CODE ########################
# Unbuffered output.  ($| is the autoflush switch; assigning to $! would only
# overwrite the current errno value.)
$| = 1;

# Work inside the group's spool directory so that the bare names returned by
# readdir() can be used directly for the file tests and for open().
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir( my $spool_dir, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir($spool_dir) ) )
{
  %lcheader = ();
  next unless -f $filename;    # only want real files
  next if ( $filename eq ".overview" );    # real articles only
  next if ( -M $filename > $numdays );     # only want articles <= a certain age

  # Seed the date range with the first article; getarticle() widens it.
  my $mtime = ( stat $filename )[9];
  $earliest = $mtime unless defined($earliest);
  $latest   = $mtime unless defined($latest);

  getarticle($filename);                   # read in the article
  getdata();                               # grab the data from the article
  $totalposts++;                           # bump count of articles considered
}
closedir($spool_dir);                      # finished with the directory

# post-processing
countagents();    # count agents, collapsing versions
fixpercent();     # check percentages orig/total for posters

# dump the raw tables to a file before the report is printed
writedata();

#################### DISPLAY RESULTS #####################
# Every average below divides by a total that is zero when no article in the
# spool directory is recent enough.  All divisions therefore go through this
# helper, which yields 0 instead of dying with "Illegal division by zero".
my $safe_div = sub
{
  my ( $numerator, $denominator ) = @_;
  return $denominator ? $numerator / $denominator : 0;
};
my $posts_seen = defined $totalposts ? $totalposts : 0;

print "=" x 76, "\n";
printf "%s\n", centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n",
  centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";
printf "Total posts considered: %s over %d days\n", commify($posts_seen),
  $numdays;

# The date range is only known once at least one article has been read.
printf "Earliest article: %s\n",
  defined $earliest ? scalar localtime($earliest) : "n/a";
printf "Latest article:   %s\n",
  defined $latest ? scalar localtime($latest) : "n/a";
printf "Original articles: %s, replies: %s\n", commify($origposts),
  commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
  commify( int( $safe_div->( $posts_seen, $numdays ) ) ),
  $safe_div->( $totsize, $numdays ) / 1048576,
  commify( int( $safe_div->( $totsize, $posts_seen ) ) );

# $count is reused for the poster total, then the thread total.
my $count = keys %data;
printf "Total headers: %s KB  bodies: %s KB\n",
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
  $safe_div->( $totorig * 100, $totorig + $totquoted ),
  commify( int( $totsig / 1024 ) );
printf "Total number of posters: %s, average %s bytes per poster\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";

#############################################################################
# Ranked report sections.  Each section prints a centred heading, a rule,
# up to N ranked rows and a closing rule, and can be suppressed with the
# -x flag by its number:
#    1 posters by article count        6 threads by size
#    2 posters by size                 7 cross-posted groups
#    3 top posters by original text    8 user agents by poster
#    4 bottom posters by orig. text    9 user agents by number of posts
#    5 threads by article count       10 time zones
#############################################################################

# Print one ranked table.  Named arguments:
#   title  - sprintf pattern; its single %s receives the row limit in effect
#   limit  - configured maximum number of rows
#   ranked - array ref of keys, already sorted best-first
#   format - printf pattern for one row: rank, padded label, value(s)
#   width  - width the label is padded or cut to
#   value  - code ref returning the value(s) to print for a key
#   skip   - optional code ref; a true result leaves the key out of the table
sub show_ranking
{
  my %arg    = @_;
  my @ranked = @{ $arg{ranked} };

  # Never announce more rows than there are keys.
  my $rows = @ranked < $arg{limit} ? scalar @ranked : $arg{limit};

  printf "%s\n", centred( sprintf( $arg{title}, $rows ), 76 );
  print "=" x 76, "\n";
  my $rank = 0;
  foreach my $key (@ranked)
  {
    next if $arg{skip} and $arg{skip}->($key);
    print sprintf( $arg{format},
      $rank + 1,
      rpad( $key, $arg{width}, "." ),
      $arg{value}->($key) );
    last if ( ++$rank == $rows );
  }
  print "\n", "=" x 76, "\n";
  return;
}

# Posters need at least this many articles to appear in sections 3 and 4.
# NOTE(review): the headings of those sections say "> 5 posts" although
# posters with exactly 5 are included; the heading text is kept as it was.
my $min_posts_for_ratio = 5;
my $skip_for_ratio      = sub {
  my $who = shift;
  return ( $data{$who}{quoted} == 0
      or $data{$who}{count} < $min_posts_for_ratio );
};

# Sec 1: posters by article count
unless ( $skipSec{1} )
{
  show_ranking(
    title  => "Top %s posters by number of articles",
    limit  => $topposters,
    ranked => [ sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $data{ $_[0] }{count} },
  );
}

# Sec 2: posters by size in Kbytes
unless ( $skipSec{2} )
{
  show_ranking(
    title  => "Top %s posters by article size in Kbytes",
    limit  => $topposters,
    ranked => [ sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $data{ $_[0] }{size} / 1024 },
  );
}

# Sec 3: top posters for original text
unless ( $skipSec{3} )
{
  show_ranking(
    title  => "Top %s responders by original text (> 5 posts)",
    limit  => $topposters,
    ranked =>
      [ sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data ],
    format => "%2d: %-63s : %02.2f%%\n",
    width  => 63,
    value  => sub { $data{ $_[0] }{percent} },
    skip   => $skip_for_ratio,
  );
}

# Sec 4: bottom posters for original text
unless ( $skipSec{4} )
{
  show_ranking(
    title  => "Bottom %s responders by original text  (> 5 posts)",
    limit  => $topposters,
    ranked =>
      [ sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data ],
    format => "%2d: %-63s : %02.2f%%\n",
    width  => 63,
    value  => sub { $data{ $_[0] }{percent} },
    skip   => $skip_for_ratio,
  );
}

# Sec 5: threads by number of articles
unless ( $skipSec{5} )
{
  show_ranking(
    title  => "Top %s threads by no. of articles",
    limit  => $topthreads,
    ranked => [ sort { $threads{$b}{count} <=> $threads{$a}{count} }
        keys %threads ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $threads{ $_[0] }{count} },
  );
}

# Sec 6: threads by size in Kbytes
unless ( $skipSec{6} )
{
  show_ranking(
    title  => "Top %s threads by size in KB",
    limit  => $topthreads,
    ranked => [ sort { $threads{$b}{size} <=> $threads{$a}{size} }
        keys %threads ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $threads{ $_[0] }{size} / 1024 },
  );
}

# Sec 7: cross-posted groups
unless ( $skipSec{7} )
{
  delete $crossposts{"$newsgroup_name"};    # don't include ours
  show_ranking(
    title  => "Top %s cross-posted groups",
    limit  => $topcrossposts,
    ranked =>
      [ sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $crossposts{ $_[0] } },
  );
}

# Sec 8: agents and counts (one count per poster)
unless ( $skipSec{8} )
{
  show_ranking(
    title  => "Top %s User Agents by poster",
    limit  => $topagents,
    ranked => [ sort { $agents{$b} <=> $agents{$a} } keys %agents ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $agents{ $_[0] } },
  );
}

# Sec 9: distinct agents (one count per post, with share of all posts)
unless ( $skipSec{9} )
{
  show_ranking(
    title  => "Top %s User Agents by number of posts",
    limit  => $topagents,
    ranked => [ sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
        keys %distinct_agent ],
    format => "%2d: %-58s : %5d (%2.f%%)\n",
    width  => 58,
    value  => sub {
      my $posts = $distinct_agent{ $_[0] };
      return ( $posts, ( $posts / $totalposts ) * 100 );
    },
  );
}

# Sec 10: timezones and counts.  The heading now reports the row limit in
# effect, like every other section, instead of a fixed "Top 10".
unless ( $skipSec{10} )
{
  show_ranking(
    title  => "Top %s time zones",
    limit  => $toptz,
    ranked => [ sort { $tz{$b} <=> $tz{$a} } keys %tz ],
    format => "%2d: %-63s : %6d\n",
    width  => 63,
    value  => sub { $tz{ $_[0] } },
  );
}

################################ SUBROUTINES ################################

#######################################
# get current article's header and body
#######################################
# Loads one article file into the shared per-article state.
#   argument: name of the article file, relative to the current directory
# Side effects: empties and refills %headers (MIME-decoded values); adds to
# %lcheader (lower-cased field names, trimmed raw values; the main loop
# empties it before each file); stores the body lines in @body; sets
# $filesize; adds to $totsize and $totheader; widens $earliest/$latest.
# Dies if the file cannot be opened.
sub getarticle
{
  %headers = ();    # dump old headers
  my $filename = shift;    # get the name of the file

  # get stats about the file itself
  $filesize = -s $filename;    # get total size of file
  $totsize += $filesize;       # bump total sizes of all files

  my $mtime = ( stat $filename )[9];
  if ( $mtime < $earliest )
  {
    $earliest = $mtime;
  }
  elsif ( $mtime > $latest )
  {
    $latest = $mtime;
  }

  # now read the file.  Three-argument open: the name comes from readdir(),
  # and the two-argument form would interpret a name such as ">x" or "cmd |"
  # as a mode or a command instead of a file to read.
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
  while (<$FILE>)
  {
    $totheader += length($_);    # bump total header size
    last if (/^\s*$/);           # end of header?

    # Only "Name: value" lines are recorded; any other header line
    # (for example a folded continuation line) is counted but not stored.
    if (/^([^:\s]*):\s+(.*)/)
    {
      my ( $key, $val ) = ( $1, $2 );
      $headers{$key} = decode( 'MIME-Header', $val );
      $lcheader{ clean( lc($key) ) } = clean($val);
    }
  }
  @body = <$FILE>;                # slurp up body
  close($FILE);
}    # getarticle

###################################
# get data from the current article
###################################
# Folds the article most recently loaded by getarticle() into the running
# statistics.  Reads %headers, %lcheader, @body and $filesize; updates
# %data, %threads, %crossposts, %tz, the $tot* byte counters, $origposts
# and $replies, and (through get_agent) %distinct_agent.  Takes no
# arguments and returns nothing useful.
sub getdata
{
#### First, analyse header fields ####

  # Set up this poster if not defined, get counts, sizes.  A missing From:
  # field is filed under the empty name.
  $poster = encode( 'UTF-8', defined $headers{From} ? $headers{From} : '' );
  if ( !defined( $data{$poster} ) )
  {                                               # first time we see them
    $data{$poster}{agent}  = 'Unknown';
    $data{$poster}{orig}   = 0;
    $data{$poster}{quoted} = 0;
  }
  $data{$poster}{count}++;                        # bump count for this poster
  $data{$poster}{size} += $filesize;              # total size of file

  # The User-Agent and/or X-Newsreader fields, for User-Agent by poster.
  # X-Newsreader wins when both are present; the agent of the article
  # processed last is the one that is kept for the poster.
  if ( defined $lcheader{"user-agent"} )
  {
    $data{$poster}{agent} = $lcheader{"user-agent"};
  }
  if ( defined $lcheader{"x-newsreader"} )
  {
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
  }

  # The User Agent for User-Agent by number of posts: first field that is
  # present, in order of reliability.
  my $UA = "unknown";
  if ( defined $lcheader{'user-agent'} )
  {
    $UA = $lcheader{'user-agent'};
  }
  elsif ( defined $lcheader{"x-newsreader"} )
  {
    $UA = $lcheader{"x-newsreader"};
  }
  elsif ( defined $lcheader{'x-mailer'} )
  {
    $UA = $lcheader{'x-mailer'};
  }
  elsif (
    ( defined $lcheader{'organization'} )
    && ( $lcheader{'organization'} =~
      /groups\.google|AOL|Supernews|WebTV|compuserve/ )
    )
  {
    $UA = $lcheader{'organization'};
  }
  elsif ( defined $lcheader{'message-id'}
    and $lcheader{'message-id'} =~ /pine/i )
  {
    $UA = "Pine";
  }    ## Hopefully found UA, else left as unknown

  # Called for its side effect: it bumps %distinct_agent for this post.
  get_agent( clean($UA) );

  # Get all cross-posted newsgroups
  if ( defined $headers{"Newsgroups"} )
  {
    for my $crossposted ( split /,/, $headers{"Newsgroups"} )
    {
      $crossposts{$crossposted}++;    # bump count for each
    }
  }

  # Get threads
  my $thread =
    encode( 'UTF-8', defined $headers{"Subject"} ? $headers{"Subject"} : '' );
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
  $thread =~ s/\s+/ /g;     # collapse whitespace
  $threads{$thread}{count} += 1;            # bump count of this subject
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread

  # Is this an original post or a reply?
  if ( defined $headers{"References"} )
  {
    $replies++;
  }
  else
  {
    $origposts++;
  }

  # Get the time zone: whatever follows the hh:mm:ss part of the Date field.
  # A missing or unparsable Date is filed under the empty zone name.
  my $date = defined $headers{"Date"} ? $headers{"Date"} : '';
  my ($zone) = $date =~ /\d\d:\d\d:\d\d\s+(.*)/;
  $zone = '' unless defined $zone;
  if ( ( $zone =~ /UTC/ ) or ( $zone =~ /GMT/ ) or ( $zone =~ /0000/ ) )
  {
    $zone = "UTC";
  }
  $tz{$zone}++;

#### Now analyse the body text ####
  my $insig = 0;
  for my $line (@body)
  {
    $totbody += length($line);    # bump total body size

    # Don't count blank lines in body.  (The pattern used to be /^$>/, in
    # which $> is interpolated as the effective user id.)
    next if ( $line =~ /^$/ );
    if ( $insig == 1 )
    {
      $totsig += length($line);    # bump total sig size
    }

    # Bill Unruh uses ] quotes, and another poster uses ::
    elsif ( $line =~ /^\s*[>\]]/ or $line =~ /^\s*::/ )
    {                              # are we in a quote line?
      $data{$poster}{quoted} += length($line);    # bump count of quoted chrs
      $totquoted             += length($line);
    }

    # The signature separator is a line consisting of exactly "-- ";
    # matching "-- " anywhere would turn ordinary text into a signature.
    elsif ( $line =~ /^-- $/ )
    {
      $insig = 1;
    }
    else
    {

      # we must be processing an original line
      $data{$poster}{orig} += length($line);      # bump count of original chrs
      $totorig             += length($line);
    }
  }    # end for (@body)

}    # getdata

#############################################################
# Reduce a raw User-Agent string to a short agent name and
# count one post for that name in %distinct_agent.
#   argument: raw (already trimmed) agent string
#   returns:  the shortened agent name
#############################################################
sub get_agent
{
  my $raw   = shift;
  my $agent = $raw;

  ## strip http
  if ( $raw =~ /.*http.*/ )
  {
    $raw =~ s!posted via!!i;
    $raw =~ s!http://!!g;
    $raw =~ s!/!!g;
    $raw =~ s! !!g;
  }

  ## Fix Outlook from Mac
  if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }

  ## Pick out the popular agents
  if ( $raw =~ /(outlook express)/i
    || $raw =~ /(microplanet gravity)/i
    || $raw =~ /(news rover)/i
    || $raw =~ /(forte agent)/i
    || $raw =~ /(forte free agent)/i )
  {
    $agent = $1;
  }
  elsif (
    $raw =~ /^(
      pan
     |sylpheed
     |slrn
     |mozilla
     |knode
     |tin
     |hamster
     |xrn
     |xnews
     |aol
     |gnus
     |krn
     |macsoup
     |messenger
     |openxp
     |pine
     |thoth
     |turnpike
     |winvn
     |vsoup
     |google
     |supernews
     |nn
     |rn
     |007
     |webtv
     |compuserve
     )/ix
    )
  {
    $agent = $1;
  }
  else
  {
    ## Clean up unknown agents: keep what precedes the first "/", or
    ## failing that the word characters in front of a digit
    if ( $raw =~ m!^(.*?)/! )
    {
      $agent = $1;
    }
    elsif ( $raw =~ /^(\w*)\d.*/ )
    {
      $agent = $1;
    }
  }

  $distinct_agent{$agent}++;
  return $agent;
}    # get_agent

########################################
# Count the User-Agents used, collapsing
# different versions into one per agent.
########################################
# For every poster in %data, bumps %agents once: under the first known
# agent name (a key of %distinct_agent) contained in the poster's agent
# string, or under the full agent string when no known name is contained.
sub countagents
{

  # Try longer names first so that, for instance, "slrn" is preferred over
  # the "rn" it contains; equal lengths fall back to alphabetical order.
  # Without a fixed order the result would depend on hash ordering.
  # Empty names are left out: they would be contained in every string.
  my @known = sort { length($b) <=> length($a) or $a cmp $b }
    grep { length } keys %distinct_agent;

POSTER:
  foreach my $poster ( keys %data )
  {
    foreach my $agent_name (@known)
    {    # check against known ones (literal substring test)
      if ( index( $data{$poster}{agent}, $agent_name ) >= 0 )
      {
        $agents{$agent_name}++;
        next POSTER;
      }
    }
    $agents{ $data{$poster}{agent} }++;
  }
}    # countagents

############################################
# set orig/total percentages for all posters
############################################
# Stores, for each poster in %data, the share of original text among
# original plus quoted text as {percent}: 0 when nothing original was
# written, 100 when nothing was quoted.
sub fixpercent
{
  for my $who ( keys %data )
  {
    my $stats = $data{$who};
    my ( $own, $cited ) = ( $stats->{orig}, $stats->{quoted} );

    my $share;
    if ( $own == 0 )
    {
      $share = 0;
    }
    elsif ( $cited == 0 )
    {
      $share = 100;
    }
    else
    {
      $share = $own * 100 / ( $cited + $own );
    }
    $stats->{percent} = $share;
  }
}

##############################
# right pad a string with '.'s
##############################
# Arguments: text, field length, pad character.
# Returns the text cut to the field length if it is longer, otherwise
# filled up on the right with the pad character to exactly that length.
sub rpad
{
  my ( $text, $pad_len, $pad_chr ) = @_;

  my $fitted =
    length($text) > $pad_len ? substr( $text, 0, $pad_len ) : $text;
  my $missing = $pad_len - length($fitted);

  return $fitted . ( $pad_chr x $missing );
}

#################
# centre a string
#################
# Arguments: text to centre, width of the field to centre it in.
# Returns the text with leading spaces only (nothing is appended on the
# right).  Text wider than the field is returned unchanged.
sub centred
{
  my ( $text, $width ) = @_;

  # Whole number of spaces, rounded down; never negative, because a
  # negative repeat count triggers a warning on recent perls.
  my $pad_len = int( ( $width - length($text) ) / 2 );
  $pad_len = 0 if $pad_len < 0;

  return ( " " x $pad_len ) . $text;
}

##########################
# put commas into a number
##########################
# Argument: a number.  Returns it as a string with a comma between each
# group of three digits of the integer part, e.g. 1234567 -> "1,234,567".
# Works on a private copy, so the caller's $_ is left alone.
sub commify
{
  my $number = shift;

  # Each pass splits off the last three digits of the leading digit run.
  1 while $number =~ s/^(-?\d+)(\d{3})/$1,$2/;
  return $number;
}

#########################
# clean
#########################
# Returns a copy of the argument without leading and trailing whitespace.
sub clean
{
  my ($text) = @_;

  for ($text)
  {
    s/^\s*//;
    s/\s*$//;
  }

  return $text;
}

# Print a one-line usage summary and exit with status 1.  Called when no
# newsgroup name is given on the command line.
sub usage
{

  print "usage: newsstat.pl newsgroupname [-x section[,section...]]\n";
  exit 1;
}

###################################
# Write data structures to a file #
###################################
# Dumps %data, %threads, %crossposts, %agents and %tz as plain text to
# /tmp/XDATA, replacing any previous dump.  Dies if the file cannot be
# created or fully written.
# NOTE(review): a fixed file name in /tmp can be pre-created or symlinked by
# another local user; the path is kept as it was for compatibility.
sub writedata
{
  my $heavy_rule =
"============================================================================\n";
  my $light_rule =
"----------------------------------------------------------------------------\n";

  open my $OUTF, '>', "/tmp/XDATA" or die "Can't create XDATA: $!\n";

  # Name the group that was actually analysed.
  print $OUTF "Data collected from $newsgroup_name\n\n";
  print $OUTF
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
  foreach my $name ( keys %data )
  {
    print $OUTF
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Thread subjects\n";
  print $OUTF $light_rule;
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
  {
    print $OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Cross-posts\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %crossposts )
  {
    print $OUTF "$name : $crossposts{$name}\n";
  }

  # This rule used to be written with a doubled "print $OUTF", which put
  # the inner print's return value ("1") into the file after the rule.
  print $OUTF $heavy_rule;
  print $OUTF "User agents\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %agents )
  {
    print $OUTF "$name : $agents{$name}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Time zones\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %tz )
  {
    print $OUTF "$name : $tz{$name}\n";
  }

  # Buffered write errors (e.g. a full disk) only show up at close.
  close $OUTF or die "Can't write XDATA: $!\n";
}    # writedata
 

Rev 7	Rev 8
1	#!/usr/bin/perl	1	#!/usr/bin/env perl
2	use strict;	2	use strict;
3	use warnings;	3	use warnings;
4	use utf8;	4	use utf8;
5	use Encode;	5	use Encode;
6		6
7	#########################	7	###########################
8	# newsstat.pl version 0.4	8	# newsstat.pl version 0.4.1
9		9
10	############################################################################	10	############################################################################
11	# Collect statistics about a newsgroup (specified by first argument) in	11	# Collect statistics about a newsgroup (specified by first argument) in
12	# the local news spool. Check all articles in the last 30-day period.	12	# the local news spool. Check all articles in the last 30-day period.
13	# Rank posters by number of posts and by volume of posts, report on top and	13	# Rank posters by number of posts and by volume of posts, report on top and
14	# bottom 20 posters. Show their name, number of posts, size of posts,	14	# bottom 20 posters. Show their name, number of posts, size of posts,
15	# percentage of quoted lines. Rank user-agents used, by poster rather than	15	# percentage of quoted lines. Rank user-agents used, by poster rather than
16	# by post. Rank top 20 threads. Rank top 10 cross-posted groups.	16	# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
17	#	17	#
18	# (Numbers and paths can be configured below. -- PE)	18	# (Numbers and paths can be configured below. -- PE)
19	############################################################################	19	############################################################################
20		20
21	############################################################################	21	############################################################################
22	# RECENT CHANGES #	22	# RECENT CHANGES #
23	# 2011-07-03 PE - Use Encode to decode/encode MIME encodings	23	# 2011-07-03 PE - Use Encode to decode/encode MIME encodings
24	# - Use warnings, utf8 (just in case)	24	# - Use warnings, utf8 (just in case)
25	# - Documentation update	25	# - Documentation update
26	# N/A NN - Take newsgroup name as argument	26	# N/A NN - Take newsgroup name as argument
27	# 2004-06-19 NN - newsgroup name is $ARGV[0]	27	# 2004-06-19 NN - newsgroup name is $ARGV[0]
28	# - Allow command line flags for subtracting	28	# - Allow command line flags for subtracting
29	# output if not pertinent for a group	29	# output if not pertinent for a group
30	# 2002-11-09 NN - Put Garry's writedata() function back in.	30	# 2002-11-09 NN - Put Garry's writedata() function back in.
31	# - added "rn" to my list of UA's	31	# - added "rn" to my list of UA's
32	# - Started using %distinct_agent for both User agent	32	# - Started using %distinct_agent for both User agent
33	# sections	33	# sections
34	# - named it newsstat.pl version 0.3	34	# - named it newsstat.pl version 0.3
35	# 2002-11-06 NN - Fixed the earliest/latest file problem by using	35	# 2002-11-06 NN - Fixed the earliest/latest file problem by using
36	# mtime rather than ctime, and simplifying the logic	36	# mtime rather than ctime, and simplifying the logic
37	# 2002-11-05 NN - moved user configurations to the top	37	# 2002-11-05 NN - moved user configurations to the top
38	# - fixed the cross-posting section	38	# - fixed the cross-posting section
39	# - introduced the $newsgroup_name variable which	39	# - introduced the $newsgroup_name variable which
40	# later becomes $news$group	40	# later becomes $news$group
41	# - changed $name to $agent_name in countagents()	41	# - changed $name to $agent_name in countagents()
42	#	42	#
43	# Contributors	43	# Contributors
44	# -------------	44	# -------------
45	# NN Nomen nominandum (name to be determined later)	45	# NN Nomen nominandum (name to be determined later)
46	# PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>	46	# PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
47		47
48	########### TODO #############	48	########### TODO #############
49	# Commas in bottom section of report	49	# Commas in bottom section of report
50	# Show date the figures were compiled	50	# Show date the figures were compiled
51	# No. of HTML articles (Content-Type: text/html)	51	# No. of HTML articles (Content-Type: text/html)
52	# No. of quoted sigs (/>\s*-- /)	52	# No. of quoted sigs (/>\s*-- /)
53	# Per cent of top-posted articles	53	# Per cent of top-posted articles
54	# Top 10 cross-posters	54	# Top 10 cross-posters
55	# Top 20 news posting hosts (from Path)	55	# Top 20 news posting hosts (from Path)
56	# Count of certain subject words: newbie, kde, burner, sendmail, etc.	56	# Count of certain subject words: newbie, kde, burner, sendmail, etc.
57	# Count all User Agents that each poster uses	57	# Count all User Agents that each poster uses
58	# What do we do about Bill Unruh's ] quote style?	58	# What do we do about Bill Unruh's ] quote style?
59	# Change the way dates/times are checked	59	# Change the way dates/times are checked
60	# include % share in posters by no. of arts	60	# include % share in posters by no. of arts
61	# include % share in posters by size	61	# include % share in posters by size
62	# Total, orig & quoted lines by user agent with per cent	62	# Total, orig & quoted lines by user agent with per cent
63	# Take more arguments	63	# Take more arguments
64	#######################################################	64	#######################################################
65		65
66	###################### USER CONFIGURATIONS ############################	66	###################### USER CONFIGURATIONS ############################
67		67
68	# The name of the group to do stats for	68	# The name of the group to do stats for
69	my $newsgroup_name = $ARGV[0];	69	my $newsgroup_name = $ARGV[0];
70	$newsgroup_name or &usage;	70	$newsgroup_name or &usage;
71		71
72	# Check for removal flags	72	# Check for removal flags
73	my $ix;	73	my $ix;
74	my $j;	74	my $j;
75	my %skipSec;	75	my %skipSec;
76	my @skiplist;	76	my @skiplist;
77	my $args = @ARGV;	77	my $args = @ARGV;
78	for ( $ix = 1 ; $ix < $args ; $ix++ )	78	for ( $ix = 1 ; $ix < $args ; $ix++ )
79	{	79	{
80	$j = $ix + 1;	80	$j = $ix + 1;
81	if ( $ARGV[$ix] eq "-x" )	81	if ( $ARGV[$ix] eq "-x" )
82	{	82	{
83	@skiplist = split( ",", $ARGV[$j] );	83	@skiplist = split( ",", $ARGV[$j] );
84	}	84	}
85	elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )	85	elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
86	{	86	{
87	@skiplist = split( ",", $1 );	87	@skiplist = split( ",", $1 );
88	}	88	}
89	}	89	}
90	foreach (@skiplist)	90	foreach (@skiplist)
91	{	91	{
92	$skipSec{$_} = 1;	92	$skipSec{$_} = 1;
93	}	93	}
94		94
# Root of the news spool; the group's directory path is appended directly,
# so the trailing slash matters.
# Leafnode users will want /var/spool/news for this variable.
my $news = '/var/spool/news/';

# Length of the reporting period in days; older articles are ignored.
my $numdays = 30;

# How many user agents to list.
my $topagents = 10;

# How many threads to list.
my $topthreads = 20;

# How many posters to list in each top/bottom ranking.
my $topposters = 20;

# How many cross-posted groups to list.
my $topcrossposts = 10;

# How many time zones to list.
my $toptz = 10;
115		115
116	###################### DATA STRUCTURES ######################	116	###################### DATA STRUCTURES ######################
# Spool directory of the group relative to $news: dots become slashes.
( my $group = $newsgroup_name ) =~ tr{.}{/};

# Per-run accumulators filled while the articles are read.
my %data;          # name, count, agent, total, orig, quoted
my %threads;       # subject, count
my %crossposts;    # group, count
my %tz;            # timezones by count

# State of the article currently being processed.
my %headers;       # holds header of current article
my %lcheader;      # holds lowercase headers
my @body;          # holds body of current article
my @sig;           # holds sig text
my $totalposts;    # total no. of posts considered
my $filename;      # name of current article file
my $filesize;      # size of current article file
my $earliest;      # earliest article we have found
my $latest;        # latest article we have found
my $poster;        # poster we are dealing with

# Running totals, all starting at zero.
my $totsize   = 0;    # holds total sizes of all files
my $totheader = 0;    # total size of header material
my $totbody   = 0;    # total size of body material
my $totsig    = 0;    # total size of sig material
my $totorig   = 0;    # total size of original material
my $totquoted = 0;    # total size of quoted material
my $origposts = 0;    # total no. of original posts
my $replies   = 0;    # total no. of replies

my $i;                # general purpose
my %distinct_agent;

# Used to hold counts of User Agents used; every known agent name starts
# with a count of zero.
my %agents = map { ( $_ => 0 ) } (
    "KNode",
    "Pan",
    "Mozilla",
    "Sylpheed",
    "Gnus",
    "Forte Agent",
    "Forte Free Agent",
    "MicroPlanet Gravity",
    "Microsoft Outlook Express",
    "Xnews",
    "slrn",
    "tin",
    "rn",
    "NN",
    "MacSOUP",
    "Foorum",
    "MT-NewsWatcher",
    "News Rover",
    "WebTV",
    "Compuserver",
    "VSoup",
);
167		167
168	######################## MAIN CODE ########################	168	######################## MAIN CODE ########################
# Unbuffered STDOUT, so the report appears as it is produced.
# (This used to assign to $!, which only sets errno and has no effect on
# output buffering.)
$| = 1;

# Walk the group's spool directory and feed every recent article file
# through getarticle() / getdata().
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir( my $spool_dir, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir($spool_dir) ) )
{
    %lcheader = ();
    next unless -f $filename;                # only want real files
    next if ( $filename eq ".overview" );    # real articles only
    next if ( -M $filename > $numdays );     # only want articles <= a certain age

    # Seed earliest/latest with the mtime of the first accepted article;
    # they are only assigned here while still undefined.
    unless ( defined($earliest) && defined($latest) )
    {
        my $mtime = ( stat $filename )[9];
        $earliest = $mtime unless defined($earliest);
        $latest   = $mtime unless defined($latest);
    }

    getarticle($filename);                   # read in the article
    getdata();                               # grab the data from the article
    $totalposts++;                           # bump count of articles considered
}
closedir($spool_dir);                        # finished with the directory

# post-processing
countagents();    # count agents, collapsing versions
fixpercent();     # check percentages orig/total for posters

writedata();
192		192
193	#################### DISPLAY RESULTS #####################	193	#################### DISPLAY RESULTS #####################
# Report banner followed by the overall totals and averages.
print "=" x 76, "\n";
printf "%s\n", centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n",
  centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";

# Division that yields 0 instead of dying when the divisor is zero or
# undefined (no articles, posters, threads or body text found).
my $safe_div = sub {
    my ( $num, $den ) = @_;
    return $den ? $num / $den : 0;
};

# $totalposts stays undefined when no article matched; report that as 0.
my $posts = defined($totalposts) ? $totalposts : 0;

printf "Total posts considered: %s over %d days\n", commify($posts),
  $numdays;
printf "Earliest article: %s\n",
  ( defined($earliest) ? scalar localtime($earliest) : "n/a" );
printf "Latest article: %s\n",
  ( defined($latest) ? scalar localtime($latest) : "n/a" );
printf "Original articles: %s, replies: %s\n", commify($origposts),
  commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
  commify( int( $safe_div->( $posts, $numdays ) ) ),
  $safe_div->( $totsize, $numdays ) / 1048576,
  commify( int( $safe_div->( $totsize, $posts ) ) );

# $count is reused by the report sections that follow.
my $count = keys %data;
printf "Total headers: %s KB bodies: %s KB\n",
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n",
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
  $safe_div->( $totorig * 100, $totorig + $totquoted ),
  commify( int( $totsig / 1024 ) );
printf "Total number of posters: %s, average %s bytes per poster\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n",
  commify($count), commify( int( $safe_div->( $totsize, $count ) ) );
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
225		225
###############################
# show posters by article count Sec 1;
###############################
# Lists the posters with the most articles; suppressed when section 1 was
# given to -x.
unless ( $skipSec{1} )
{
    # Never announce more rows than there are posters.
    my $posters = keys %data;
    $count = $posters < $topposters ? $posters : $topposters;

    printf "%s\n", centred( "Top $count posters by number of articles", 76 );
    print "=" x 76, "\n";
    $i = 0;
    foreach my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
    {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
          $data{$poster}{count};
        last if ( ++$i == $count );
    }
    print "\n", "=" x 76, "\n";
}
251		251
################################
# show posters by size in Kbytes Sec 2;
################################
# Lists the posters with the largest total article size (shown in units of
# 1024 bytes); suppressed when section 2 was given to -x.
unless ( $skipSec{2} )
{
    # Never announce more rows than there are posters.
    my $posters = keys %data;
    $count = $posters < $topposters ? $posters : $topposters;

    printf "%s\n", centred( "Top $count posters by article size in Kbytes", 76 );
    print "=" x 76, "\n";
    $i = 0;
    foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
    {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
          $data{$poster}{size} / 1024;
        last if ( ++$i == $count );
    }
    print "\n", "=" x 76, "\n";
}
277		277
####################################
# show top posters for original text
####################################
# Ranks posters by their share of original (non-quoted) text, highest first.
# Only posters who quoted something and have at least 5 posts are ranked.
# Suppressed when section 3 was given to -x.
unless ( $skipSec{3} )
{
    # Filter first, so the heading states the number of rows really shown.
    my @ranked =
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
      grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;
    $count = scalar(@ranked) < $topposters ? scalar(@ranked) : $topposters;

    printf "%s\n",
      centred( "Top $count responders by original text (>= 5 posts)", 76 );
    print "=" x 76, "\n";
    $i = 0;
    foreach my $poster (@ranked)
    {
        last if ( $i >= $count );
        printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
          $data{$poster}{percent};
        $i++;
    }
    print "\n", "=" x 76, "\n";
}
307		307
#######################################
# show bottom posters for original text
#######################################
# Ranks posters by their share of original (non-quoted) text, lowest first.
# Only posters who quoted something and have at least 5 posts are ranked.
# Suppressed when section 4 was given to -x.
unless ( $skipSec{4} )
{
    # Filter first, so the heading states the number of rows really shown.
    my @ranked =
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
      grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;
    $count = scalar(@ranked) < $topposters ? scalar(@ranked) : $topposters;

    printf "%s\n",
      centred( "Bottom $count responders by original text (>= 5 posts)", 76 );
    print "=" x 76, "\n";
    $i = 0;
    foreach my $poster (@ranked)
    {
        last if ( $i >= $count );
        printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
          $data{$poster}{percent};
        $i++;
    }
    print "\n", "=" x 76, "\n";
}
337		337
####################################
# show threads by number of articles
####################################
# Lists the threads with the most articles; suppressed when section 5 was
# given to -x.
unless ( $skipSec{5} )
{
    # Heading count: number of threads, capped at $topthreads.
    $count =
      scalar( keys %threads ) < $topthreads
      ? scalar( keys %threads )
      : $topthreads;

    printf "%s\n", centred( "Top $count threads by no. of articles", 76 );
    print "=" x 76, "\n";

    my @by_count =
      sort { $threads{$b}{count} <=> $threads{$a}{count} } keys %threads;
    $i = 0;
    foreach my $subject (@by_count)
    {
        # Subject is cut to 65 characters before padding.
        printf "%2d: %-63s : %6d\n", $i + 1,
          rpad( substr( $subject, 0, 65 ), 63, "." ),
          $threads{$subject}{count};
        $i++;
        last if ( $i == $count );
    }
    print "\n", "=" x 76, "\n";
}
################################
# show threads by size in Kbytes
################################
# Lists the threads with the largest total size (shown in units of 1024
# bytes); suppressed when section 6 was given to -x.
unless ( $skipSec{6} )
{
    # Heading count: number of threads, capped at $topthreads.
    $count =
      scalar( keys %threads ) < $topthreads
      ? scalar( keys %threads )
      : $topthreads;

    printf "%s\n", centred( "Top $count threads by size in KB", 76 );
    print "=" x 76, "\n";

    my @by_size =
      sort { $threads{$b}{size} <=> $threads{$a}{size} } keys %threads;
    $i = 0;
    foreach my $subject (@by_size)
    {
        # Subject is cut to 65 characters before padding.
        printf "%2d: %-63s : %6d\n", $i + 1,
          rpad( substr( $subject, 0, 65 ), 63, "." ),
          $threads{$subject}{size} / 1024;
        $i++;
        last if ( $i == $count );
    }
    print "\n", "=" x 76, "\n";
}
390		390
#################################
# show top 10 cross-posted groups
#################################
# Lists the groups most often cross-posted to; suppressed when section 7
# was given to -x.
unless ( $skipSec{7} )
{
    # The analysed group itself is not a cross-post target: drop it.
    delete $crossposts{$newsgroup_name};

    # Heading count: number of groups, capped at $topcrossposts.
    $count =
      scalar( keys %crossposts ) < $topcrossposts
      ? scalar( keys %crossposts )
      : $topcrossposts;

    printf "%s\n", centred( "Top $count cross-posted groups", 76 );
    print "=" x 76, "\n";

    my @by_hits = sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts;
    $i = 0;
    foreach my $target (@by_hits)
    {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad( $target, 63, "." ),
          $crossposts{$target};
        $i++;
        last if ( $i == $count );
    }
    print "\n", "=" x 76, "\n";
}
#######################
#show agents and counts
#######################
# Lists user agents ranked by the counts held in %agents; suppressed when
# section 8 was given to -x.
unless ( $skipSec{8} )
{
    # Heading count: number of agents, capped at $topagents.
    $count =
      scalar( keys %agents ) < $topagents
      ? scalar( keys %agents )
      : $topagents;

    printf "%s\n", centred( "Top $count User Agents by poster", 76 );
    print "=" x 76, "\n";

    my @by_users = sort { $agents{$b} <=> $agents{$a} } keys %agents;
    $i = 0;
    foreach my $ua (@by_users)
    {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad( $ua, 63, "." ),
          $agents{$ua};
        $i++;
        last if ( $i == $count );
    }
    print "\n", "=" x 76, "\n";
}
441		441
#######################
#show distinct agents
#######################
# Lists user agents ranked by the counts held in %distinct_agent, each with
# its percentage of $totalposts; suppressed when section 9 was given to -x.
unless ( $skipSec{9} )
{
    # Heading count: number of distinct agents, capped at $topagents.
    $count =
      scalar( keys %distinct_agent ) < $topagents
      ? scalar( keys %distinct_agent )
      : $topagents;

    printf "%s\n", centred( "Top $count User Agents by number of posts", 76 );
    print "=" x 76, "\n";

    my @by_posts =
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent;
    $i = 0;
    foreach my $ua (@by_posts)
    {
        my $hits  = $distinct_agent{$ua};
        my $share = ( ( $hits / $totalposts ) * 100 );
        printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $ua, 58, "." ),
          $hits, $share;
        $i++;
        last if ( $i == $count );
    }
    print "\n", "=" x 76, "\n";
}
468		468
##########################
#show timezones and counts
##########################
# Lists the most frequent time zones by the counts held in %tz; suppressed
# when section 10 was given to -x.
unless ( $skipSec{10} )
{
    # Heading count: number of zones, capped at $toptz.
    my $zones = keys %tz;
    $count = $zones < $toptz ? $zones : $toptz;

    # The heading uses the computed count rather than a fixed "10", so it
    # follows $toptz and the number of zones actually found.
    printf "%s\n", centred( "Top $count time zones", 76 );
    print "=" x 76, "\n";
    $i = 0;
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
    {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
        last if ( ++$i == $count );
    }
    print "\n", "=" x 76, "\n";
}
492		492
493	################################ SUBROUTINES ################################	493	################################ SUBROUTINES ################################
494		494
495	#######################################	495	#######################################
496	# get current article's header and body	496	# get current article's header and body
497	#######################################	497	#######################################
498	sub getarticle	498	sub getarticle
499	{	499	{
500	%headers = (); # dump old headers	500	%headers = (); # dump old headers

Subversion Repositories LCARS

(root)/trunk/tools/network/news/newsstat/newsstat.pl - Rev 7 → 8