WebSVN - LCARS - Diff - Rev 5 and 6 - /trunk/tools/network/news/newsstat/newsstat.pl


#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use Encode;

#########################
# newsstat.pl version 0.4

############################################################################
# Collect statistics about a newsgroup (specified by first argument) in
# the local news spool. Check all articles in the last 30-day period. 
# Rank posters by number of posts and by volume of posts, report on top and
# bottom 20 posters. Show their name, number of posts, size of posts,
# percentage of quoted lines. Rank user-agents used, by poster rather than
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
# 
# (Numbers and paths can be configured below.  -- PE)
############################################################################

############################################################################
#                       RECENT CHANGES                                     #
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
#                 - Use warnings, utf8 (just in case) 
#                 - Documentation update
# N/A         NN  - Take newsgroup name as argument
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
#                 - Allow command line flags for subtracting
#                   output if not pertinent for a group
# 2002-11-09  NN  - Put Garry's writedata() function back in.
#                 - added "rn" to my list of UA's
#                 - Started using %distinct_agent for both User agent
#                   sections
#                 - named it newsstat.pl version 0.3
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
#                   mtime rather than ctime, and simplifying the logic
# 2002-11-05  NN  - moved user configurations to the top
#                 - fixed the cross-posting section
#                 - introduced the $newsgroup_name variable which
#                   later becomes $news$group
#                 - changed $name to $agent_name in countagents()
# 
# Contributors
# -------------
# NN  Nomen nominandum (name to be determined later)
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>

########### TODO #############
# Commas in bottom section of report
# Show date the figures were compiled
# No. of HTML articles (Content-Type: text/html)
# No. of quoted sigs (/>\s*-- /)
# Per cent of top-posted articles
# Top 10 cross-posters
# Top 20 news posting hosts (from Path)
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
# Count *all* User Agents that each poster uses
# What do we do about Bill Unruh's ] quote style?
# Change the way dates/times are checked
# include % share in posters by no. of arts
# include % share in posters by size
# Total, orig & quoted lines by user agent with per cent
# Take more arguments
#######################################################

###################### USER CONFIGURATIONS ############################

# The name of the group to do stats for
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();

# Check for removal flags.  Report sections are suppressed with either
# "-x 3,7" or "-x3,7"; several -x options accumulate.
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ ) {
       $j = $ix + 1;
       if ( $ARGV[$ix] eq "-x" ) {
               if ( $j < $args ) {
                       push @skiplist, split(/,/, $ARGV[$j]);
                       $ix = $j;       # the list is consumed; do not re-examine it
               } else {
                       # "-x" was the last argument: there is no list to read
                       warn "-x needs a comma-separated list of section numbers; ignored\n";
               }
       } elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ ) {
               push @skiplist, split(/,/, $1);
       }
}
foreach my $section (@skiplist) {
  $skipSec{$section} = 1;
}

# Root directory of the news spool.  The group's path is appended to it
# verbatim, so the trailing slash is required.
# Leafnode users will want /var/spool/news for this variable.
my $news = '/var/spool/news/';

# Articles older than this many days are ignored; the same figure is the
# divisor for the per-day averages.
my $numdays = 30;

# Row limits for the individual report sections
my $topposters    = 20;   # top or bottom posters
my $topthreads    = 20;   # threads
my $topagents     = 10;   # user agents
my $topcrossposts = 10;   # cross-posted groups
my $toptz         = 10;   # time zones



###################### DATA STRUCTURES ######################
my $group = $newsgroup_name;
$group =~ s!\.!/!g;                     # comp.os.linux -> comp/os/linux
my %data;                               # per poster: agent, count, size, orig, quoted, percent
my %threads;                            # per subject: count, size
my %crossposts;                         # group, count
my %tz;                                 # timezones by count
my %headers;                            # holds header of current article
my %lcheader;                           # holds lowercase headers
my @body;                               # holds body of current article
my @sig;                                # holds sig text;
my $totalposts = 0;                     # total no. of posts considered
my $filename;                           # name of current article file
my $filesize;                           # size of current article file
my $earliest;                           # earliest article we have found
my $latest;                             # latest article we have found
my $poster;                             # poster we are dealing with
my $totsize = 0;                        # holds total sizes of all files
my $totheader = 0;                      # total size of header material
my $totbody = 0;                        # total size of body material
my $totsig = 0;                         # total size of sig material
my $totorig = 0;                        # total size of original material
my $totquoted = 0;                      # total size of quoted material
my $origposts = 0;                      # total no. of original posts
my $replies = 0;                        # total no. of replies
my $i;                                  # general purpose
my %distinct_agent;                     # agent name => no. of posts made with it
my %agents =                            # used to hold counts of User Agents used
    map { $_ => 0 } (
               "KNode",
               "Pan",
               "Mozilla",
               "Sylpheed",
               "Gnus",
               "Forte Agent",
               "Forte Free Agent",
               "MicroPlanet Gravity",
               "Microsoft Outlook Express",
               "Xnews",
               "slrn",
               "tin",
               "rn",
               "NN",
               "MacSOUP",
               "Foorum",
               "MT-NewsWatcher",
               "News Rover",
               "WebTV",
               "Compuserver",
               "VSoup",
    );

######################## MAIN CODE ########################
$| = 1;                                 # autoflush STDOUT (was "$! = 1", which only set errno)

chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir(my $spool_dir, ".") or die "Can't open $news$group directory: $!\n";
while (defined($filename = readdir($spool_dir))) {
 %lcheader = ();                       # forget the previous article's headers
 next unless -f $filename;             # only want real files
 next if ($filename eq ".overview");   # real articles only
 next if (-M $filename > $numdays);    # only want articles <= a certain age
 # seed earliest/latest from the first article seen; getarticle() then
 # widens the range as further articles are read
 my $mtime = (stat $filename)[9];
 $earliest = $mtime unless defined ($earliest);
 $latest   = $mtime unless defined ($latest);
 getarticle($filename);                # read in the article
 getdata();                            # grab the data from the article
 $totalposts++;                        # bump count of articles considered
}
closedir($spool_dir);                   # finished with the directory
# post-processing
countagents();                          # count agents, collapsing versions
fixpercent();                           # check percentages orig/total for posters

writedata();

#################### DISPLAY RESULTS #####################
print "=" x 76, "\n";
printf "%s\n", centred("Analysis of posts to $newsgroup_name", 76);
print "=" x 76, "\n";
printf "%s\n", centred("(stats compiled with a script by Garry Knight et al.)", 76);
print "\n\n";

# Without any articles every average below would divide by zero, so there
# is nothing meaningful left to report.
unless ($totalposts) {
  print "No articles found in $news$group for the last $numdays days.\n";
  exit 0;
}

printf "Total posts considered: %s over %d days\n",
       commify($totalposts), $numdays;
printf "Earliest article: %s\n", scalar localtime($earliest);
printf "Latest article:   %s\n", scalar localtime($latest);
printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
       commify(int($totsize / 1024)), $totsize / 1048576;
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
       commify(int($totalposts / $numdays)),
       $totsize / $numdays / 1048576, commify(int($totsize / $totalposts));
# getdata() adds a poster and a thread entry for every article, so both
# hashes are non-empty once $totalposts > 0.
my $count = keys %data;
printf "Total headers: %s KB  bodies: %s KB\n",
       commify(int($totheader / 1024)), commify(int($totbody / 1024));
# Articles may consist of blank lines and signatures only, in which case
# there is neither original nor quoted text to take a percentage of.
my $bodytext = $totorig + $totquoted;
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
       commify(int($totquoted / 1024)), commify(int($totorig / 1024)),
       ($bodytext ? ($totorig * 100) / $bodytext : 0), commify(int($totsig / 1024));
printf "Total number of posters: %s, average %s bytes per poster\n", commify($count),
       commify(int($totsize / $count));
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n", commify($count),
       commify(int($totsize / $count));
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";

###############################
# show posters by article count  Sec 1;
###############################
unless ( $skipSec{1} ) {
    # Never advertise more rows than there are posters.
    my $posters = keys %data;
    $count = $posters < $topposters ? $posters : $topposters;
    printf "%s\n", centred("Top $count posters by number of articles", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach my $who (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) {
        # rpad() cuts over-long names itself, so no pre-trimming is needed
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($who, 63, "."), $data{$who}{count};
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}

################################
# show posters by size in Kbytes Sec 2;
################################
unless ( $skipSec{2} ) {
  # Never advertise more rows than there are posters.
  my $posters = keys %data;
  $count = $posters < $topposters ? $posters : $topposters;
  printf "%s\n", centred("Top $count posters by article size in Kbytes", 76);
  print "=" x 76, "\n";
  $i = 0;
  foreach my $who (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) {
    # sizes are kept in bytes; %d drops the fraction of the Kbyte figure
    printf "%2d: %-63s : %6d\n", $i + 1, rpad($who, 63, "."), $data{$who}{size} / 1024;
    last if (++$i == $count);
  }
  print "\n", "=" x 76, "\n";
}

####################################
# show top posters for original text
####################################
unless ( $skipSec{3} ) {
   # Only posters who quoted something and wrote at least 5 articles are
   # ranked (NOTE: the heading says "> 5", the filter admits exactly 5 too).
   # Counting them first keeps the heading in step with the rows printed.
   my @eligible = grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;
   $count = @eligible < $topposters ? scalar(@eligible) : $topposters;
   printf "%s\n", centred("Top $count responders by original text (> 5 posts)", 76);
   print "=" x 76, "\n";
   $i = 0;
   foreach my $who (sort { $data{$b}{percent} <=> $data{$a}{percent} } @eligible) {
     printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($who, 63, "."), $data{$who}{percent};
     last if (++$i == $count);
   }
   print "\n", "=" x 76, "\n";
}

#######################################
# show bottom posters for original text
#######################################
unless ( $skipSec{4} ) {
  # Only posters who quoted something and wrote at least 5 articles are
  # ranked (NOTE: the heading says "> 5", the filter admits exactly 5 too).
  # Counting them first keeps the heading in step with the rows printed.
  my @eligible = grep { $data{$_}{quoted} != 0 && $data{$_}{count} >= 5 } keys %data;
  $count = @eligible < $topposters ? scalar(@eligible) : $topposters;
  printf "%s\n", centred("Bottom $count responders by original text  (> 5 posts)", 76);
  print "=" x 76, "\n";
  $i = 0;
  foreach my $who (sort { $data{$a}{percent} <=> $data{$b}{percent} } @eligible) {
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($who, 63, "."), $data{$who}{percent};
    last if (++$i == $count);
  }
  print "\n", "=" x 76, "\n";
}

####################################
# show threads by number of articles
####################################
unless ( $skipSec{5} ) {
  # show at most $topthreads rows, fewer if there are fewer threads
  my $available = keys %threads;
  $count = $available < $topthreads ? $available : $topthreads;
  printf "%s\n", centred("Top $count threads by no. of articles", 76);
  print "=" x 76, "\n";
  my @ranked = sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads;
  $i = 0;
  for my $subject (@ranked) {
    printf "%2d: %-63s : %6d\n",
           $i + 1, rpad(substr($subject, 0, 65), 63, "."), $threads{$subject}{count};
    $i++;
    last if ($i == $count);
  }
  print "\n", "=" x 76, "\n";
}
################################
# show threads by size in Kbytes
################################
unless ( $skipSec{6} ) {
  # show at most $topthreads rows, fewer if there are fewer threads
  my $available = keys %threads;
  $count = $available < $topthreads ? $available : $topthreads;
  printf "%s\n", centred("Top $count threads by size in KB", 76);
  print "=" x 76, "\n";
  my @ranked = sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads;
  $i = 0;
  for my $subject (@ranked) {
    # thread sizes are accumulated in bytes
    printf "%2d: %-63s : %6d\n",
           $i + 1, rpad(substr($subject, 0, 65), 63, "."), $threads{$subject}{size} / 1024;
    $i++;
    last if ($i == $count);
  }
  print "\n", "=" x 76, "\n";
}

#################################
# show top 10 cross-posted groups
#################################
unless ( $skipSec{7} ) {
  # the analysed group is named in every article, so leave it out
  delete $crossposts{$newsgroup_name};
  my $available = keys %crossposts;
  $count = $available < $topcrossposts ? $available : $topcrossposts;
  printf "%s\n", centred("Top $count cross-posted groups", 76);
  print "=" x 76, "\n";
  my @ranked = sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts;
  $i = 0;
  for my $other_group (@ranked) {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad($other_group, 63, "."), $crossposts{$other_group};
    $i++;
    last if ($i == $count);
  }
  print "\n", "=" x 76, "\n";
}
#######################
#show agents and counts
#######################
unless ( $skipSec{8} ) {
  # show at most $topagents rows, fewer if fewer agents are known
  my $available = keys %agents;
  $count = $available < $topagents ? $available : $topagents;
  printf "%s\n", centred("Top $count User Agents by poster", 76);
  print "=" x 76, "\n";
  my @ranked = sort {$agents{$b} <=> $agents{$a}} keys %agents;
  $i = 0;
  for my $reader (@ranked) {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad($reader, 63, "."), $agents{$reader};
    $i++;
    last if ($i == $count);
  }
  print "\n", "=" x 76, "\n";
}

#######################
#show distinct agents
#######################
unless ( $skipSec{9} ) {
  # show at most $topagents rows, fewer if fewer distinct agents were seen
  my $available = keys %distinct_agent;
  $count = $available < $topagents ? $available : $topagents;
  printf "%s\n", centred("Top $count User Agents by number of posts", 76);
  print "=" x 76, "\n";
  my @ranked = sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent;
  $i = 0;
  for my $reader (@ranked) {
    my $posts = $distinct_agent{$reader};
    # share of all analysed articles that were posted with this agent
    my $share = ( $posts / $totalposts ) * 100;
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($reader, 58, "."), $posts, $share;
    $i++;
    last if ($i == $count);
  }
  print "\n", "=" x 76, "\n";
}

##########################
#show timezones and counts
##########################
unless ( $skipSec{10} ) {
  # show at most $toptz rows, fewer if fewer zones were seen
  my $available = keys %tz;
  $count = $available < $toptz ? $available : $toptz;
  # the heading used to hard-code "Top 10" regardless of $toptz and of the
  # number of zones actually found
  printf "%s\n", centred("Top $count time zones", 76);
  print "=" x 76, "\n";
  $i = 0;
  foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) {
    printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone};
    last if (++$i == $count);
  }
  print "\n", "=" x 76, "\n";
}


################################ SUBROUTINES ################################


#######################################
# get current article's header and body
#######################################
sub getarticle {
# Read one article file into the file-level %headers, %lcheader and @body,
# and update the running totals ($filesize, $totsize, $totheader) as well
# as the $earliest/$latest modification times.
#   argument: name of the article file, relative to the current directory
#   dies if the file cannot be opened
 %headers = ();                        # dump old headers
 my $filename = shift;                 # get the name of the file
# get stats about the file itself
 $filesize = -s $filename;             # get total size of file
 $totsize += $filesize;                # bump total sizes of all files

 # widen the observed time range; also works when no article was seen yet
 my $mtime = (stat $filename)[9];
 $earliest = $mtime if (!defined($earliest) or $mtime < $earliest);
 $latest   = $mtime if (!defined($latest)   or $mtime > $latest);

# now read the file
 # three-argument open: a file name can never be taken for a mode or a pipe
 open(my $article, '<', $filename) or die "Can't open $filename: $!\n";
 while (my $line = <$article>) {
   $totheader += length($line);        # bump total header size
   last if ($line =~ /^\s*$/);         # end of header?
   # NOTE: folded continuation lines do not match and are skipped
   if ($line =~ /^([^:\s]*):\s+(.*)/) {
     my ($key, $val) = ($1, $2);
     $headers{$key} = decode('MIME-Header', $val);
     $lcheader{clean(lc($key))} = clean($val);   # raw value, not MIME-decoded
   }
 }
 @body = <$article>;                   # slurp up body
 close($article);
} # getarticle

###################################
# get data from the current article
###################################
sub getdata {
# Fold the article currently held in %headers, %lcheader and @body (as
# filled by getarticle) into the statistics: per-poster data, user agents,
# cross-posts, threads, original/reply counts, time zones and body totals.
# Takes no arguments and returns nothing useful.

#### First, analyse header fields ####

# Set up this poster if not defined, get counts, sizes
 my $from = defined $headers{From} ? $headers{From} : 'Unknown';
 $poster = encode('UTF-8', $from);     # get the poster's name
 if (!defined($data{$poster})) {       # seen this one before?
   $data{$poster}{agent} = 'Unknown';  # replaced below if the article names one
   $data{$poster}{orig} = 0;
   $data{$poster}{quoted} = 0;
 }
 $data{$poster}{count}++;              # bump count for this poster
 $data{$poster}{size} += $filesize;    # total size of file

# The User-Agent and/or X-Newsreader fields
# for User-Agent by poster (X-Newsreader wins if both are present)
 if (defined $lcheader{"user-agent"}) {
   $data{$poster}{agent} = $lcheader{"user-agent"};
 }
 if (defined $lcheader{"x-newsreader"}) {
   $data{$poster}{agent} = $lcheader{"x-newsreader"};
 }

# The User Agent for User-Agent by number of posts: take the first header
# that identifies the posting software
 my $UA = "unknown";
 if (defined $lcheader{'user-agent'}) {
   $UA = $lcheader{'user-agent'};
 } elsif (defined $lcheader{'x-newsreader'}) {
   $UA = $lcheader{'x-newsreader'};
 } elsif (defined $lcheader{'x-mailer'}) {
   $UA = $lcheader{'x-mailer'};
 } elsif ((defined $lcheader{'organization'}) &&
          ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) {
   $UA = $lcheader{'organization'};
 } elsif ((defined $lcheader{'message-id'}) &&
          ($lcheader{'message-id'} =~ /pine/i)) {
   $UA = "Pine";
 } ## Hopefully found UA, else it stays "unknown"

 # called for its side effect: it bumps %distinct_agent for this article
 get_agent(clean($UA));

# Get all cross-posted newsgroups
 if (defined $headers{"Newsgroups"}) {
   for my $crossgroup (split /,/, $headers{"Newsgroups"}) {
     $crossgroup = clean($crossgroup); # "a, b" must count "b", not " b"
     next if $crossgroup eq '';
     $crossposts{$crossgroup}++;       # bump count for each
   }
 }

# Get threads
 my $subject = defined $headers{"Subject"} ? $headers{"Subject"} : '(no subject)';
 my $thread = encode('UTF-8', $subject);
 $thread =~ s/^re: //i;                # Remove Re: or re: at start
 $thread =~ s/\s+/ /g;                 # collapse whitespace
 $threads{$thread}{count} += 1;        # bump count of this subject
 $threads{$thread}{size} += $filesize; # bump bytes for this thread

# Is this an original post or a reply?
 if (defined $headers{"References"}) {
   $replies++;
 } else {
   $origposts++;
 }

# Get the time zone: whatever follows the hh:mm:ss part of the Date header
 my $zone;
 if (defined $headers{"Date"} and $headers{"Date"} =~ /\d\d:\d\d:\d\d\s+(.*)/) {
   $zone = $1;
 }
 $zone = 'Unknown' unless (defined $zone and $zone ne '');
 if (($zone =~ /UTC/) or ($zone =~ /GMT/) or ($zone =~ /0000/)) {
   $zone = "UTC";
 }
 $tz{$zone}++;

#### Now analyse the body text ####
 my $insig = 0;
 for my $line (@body) {
   my $len = length($line);
   $totbody += $len;                   # bump total body size
   # don't count blank lines in body  (this test used to read /^$>/, which
   # interpolates the effective user id $> instead of matching empty lines)
   next if ($line =~ /^\r?$/);
   if ($insig == 1) {
     $totsig += $len;                  # bump total sig size
# Bill Unruh uses ] quotes, and another poster uses ::
   } elsif ($line =~ /^\s*[>\]]/ or $line =~ /^\s*::/) { # are we in a quote line?
     $data{$poster}{quoted} += $len;   # bump count of quoted chrs
     $totquoted += $len;
   } elsif ($line =~ /^-- \r?$/) {
     # the signature separator is "-- " on a line of its own; a mere
     # "-- " somewhere inside a sentence must not start the signature
     $insig = 1;
   } else {
# we must be processing an original line
     $data{$poster}{orig} += $len;     # bump count of original chrs
     $totorig += $len;
   }
 } # end for (@body)

} # getdata

#############################################
# Reduce a raw User-Agent style header value
# to a short agent name, dropping versions.
#############################################
sub get_agent {
# argument: the raw, whitespace-trimmed header value
# returns:  the short agent name; as a side effect the name's counter in
#           %distinct_agent is bumped
 my $raw = shift;
 my $agent = $raw;                     # fall-back: the value as given

 ## strip http
 if ( $raw =~ /http/ ) {
   $raw =~ s!posted via!!i;
   $raw =~ s!http://!!g;
   $raw =~ s!/!!g;
   $raw =~ s! !!g;
 }

 ## Fix Outlook from Mac
 if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;}

 ## Pick out the popular agents
 if ( $raw =~ /(outlook express)/i     ||
      $raw =~ /(microplanet gravity)/i ||
      $raw =~ /(news rover)/i          ||
      $raw =~ /(forte agent)/i         ||
      $raw =~ /(forte free agent)/i
    )
 {
       $agent = $1;
 }
 elsif ( $raw =~ /^(
        pan
       |sylpheed
       |slrn
       |mozilla
       |knode
       |tin
       |hamster
       |xrn
       |xnews
       |aol
       |gnus
       |krn
       |macsoup
       |messenger
       |openxp
       |pine
       |thoth
       |turnpike
       |winvn
       |vsoup
       |google
       |supernews
       |nn
       |rn
       |007
       |webtv
       |compuserve
       )/ix )
 {
       $agent = $1;
 }
 else
 {
 ## Clean up unknown agents
       if ( $raw =~ m!^(.*?)/! ) {     # "Name/1.2 ..." -> "Name"
             $agent = $1;
       }
       elsif ( $raw =~ /^(\w*)\d.*/ )  # "Name12 ..."   -> "Name1"-style prefix
       {
            $agent = $1;
       }
 }

 $distinct_agent{$agent}++;
 return $agent;
} # get_agent

########################################
# Count the User-Agents used, collapsing
# different versions into one per agent.
########################################
sub countagents {
# For every poster, credit the agent recorded in $data{poster}{agent} to
# the first known agent name (a key of %distinct_agent) it contains; if
# none matches, the raw agent string itself is counted in %agents.
#
# The known names are tried longest first, so that the result no longer
# depends on hash order (e.g. "slrn ..." is credited to "slrn", never to
# "rn").  Empty names are dropped because they would match every poster.
 my @known = sort { length($b) <=> length($a) or $a cmp $b }
             grep { length($_) } keys %distinct_agent;
POSTER:
 foreach my $who (keys %data) {
   foreach my $agent_name (@known) {     # check against known ones
     if ( $data{$who}{agent} =~ /\Q$agent_name\E/ ) {
       $agents{$agent_name}++;
       next POSTER;
     }
   }
   $agents{$data{$who}{agent}}++;
 }
} # countagents

############################################
# set orig/total percentages for all posters
############################################
sub fixpercent {
# Store, for each poster, original text as a percentage of original plus
# quoted text.  A poster without any original text gets 0, one with
# original text but no quotes gets 100.
 for my $who (keys %data) {
   my $own   = $data{$who}{orig};
   my $total = $own + $data{$who}{quoted};
   $data{$who}{percent} = ($total != 0) ? $own * 100 / $total : 0;
 }
}

##############################
# right pad a string with '.'s
##############################
sub rpad {
# arguments: text, field width, pad character
# returns the text cut to at most the field width and filled up on the
# right with the pad character
 my ($text, $pad_len, $pad_chr) = @_;
 my $kept = (length($text) > $pad_len) ? substr($text, 0, $pad_len) : $text;
 my $shortfall = $pad_len - length($kept);
 return $kept . ($pad_chr x $shortfall);
}

#################
# centre a string
#################
sub centred {
# arguments: text to centre, width of the field to centre it in
# returns the text preceded by enough blanks to centre it; nothing is
# appended on the right.  Text wider than the field is returned unchanged.
 my ($text, $width) = @_;
 # whole number of blanks, never negative (a negative repeat count warns)
 my $pad_len = int(($width - length($text)) / 2);
 $pad_len = 0 if $pad_len < 0;
 return (" " x $pad_len) . $text;
}

##########################
# put commas into a number
##########################
sub commify {
# argument: an integer (optionally negative)
# returns the number as a string with commas between groups of three
# digits; an undefined argument is handed back as it is.
# Works on a private copy, so the caller's $_ is left alone.
 my $number = shift;
 return $number unless defined $number;
 1 while $number =~ s/^(-?\d+)(\d{3})/$1,$2/;
 return $number;
}

#########################
# clean
#########################
sub clean {
# returns a copy of the argument without leading and trailing whitespace
 my $trimmed = shift;
 $trimmed =~ s/^\s+|\s+$//g;
 return $trimmed;
}


sub usage {
# Print a usage summary to STDERR and leave with exit status 1.
# "-x" lists the report sections to suppress, e.g. "-x 3,4" or "-x3,4".
 print STDERR "usage: newsstat.pl newsgroupname [-x section[,section...]]\n";
 exit 1;
}

###################################
# Write data structures to a file #
###################################
sub writedata {
# Dump the collected %data, %threads, %crossposts, %agents and %tz to
# /tmp/XDATA as plain text.  Dies if the file cannot be created or closed.
# NOTE(review): a fixed file name in /tmp can be pre-created or symlinked
# by another local user - consider File::Temp or a configurable path.
 my $double_rule = "============================================================================\n";
 my $single_rule = "----------------------------------------------------------------------------\n";

 open(my $out, '>', "/tmp/XDATA") or die "Can't create XDATA: $!\n";
 # name the group that was actually analysed, not a fixed one
 print $out "Data collected from $newsgroup_name\n\n";
 print $out "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
 foreach my $name (keys %data) {
   print $out "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
 }
 print $out $double_rule;
 print $out "Thread subjects\n";
 print $out $single_rule;
 foreach my $thread (sort { lc($a) cmp lc($b) } keys %threads) {   # case-insensitive order
   print $out "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
 }
 print $out $double_rule;
 print $out "Cross-posts\n";
 print $out $single_rule;
 foreach my $name (sort keys %crossposts) {
   print $out "$name : $crossposts{$name}\n";
 }
 # (a stray "print OUTF" without semicolon used to sit here; it wrote the
 # return value "1" of the following print into the file)
 print $out $double_rule;
 print $out "User agents\n";
 print $out $single_rule;
 foreach my $name (sort keys %agents) {
   print $out "$name : $agents{$name}\n";
 }
 print $out $double_rule;
 print $out "Time zones\n";
 print $out $single_rule;
 foreach my $name (sort keys %tz) {
   print $out "$name : $tz{$name}\n";
 }
 # buffered write errors only show up when the handle is closed
 close($out) or die "Can't close XDATA: $!\n";
} # writedata
 

Rev 5	Rev 6
1	#!/usr/bin/perl -w	1	#!/usr/bin/perl -w
2	use strict;	2	use strict;
3	#########################	3	use warnings;
4	# newsstat.pl version 0.3	4	use utf8;
5		5	use Encode;
6		6
-		7	#########################
-		8	# newsstat.pl version 0.4
7		9
8	###################################################################	10	############################################################################
9	# Collect statistics about the alt.os.linux.mandrake newsgroup.	11	# Collect statistics about a newsgroup (specified by first argument) in
10	# Check all articles in the last 7-day period. Rank posters by	12	# the local news spool. Check all articles in the last 30-day period.
11	# no. of posts and by volume of posts, report on top and bottom	13	# Rank posters by number of posts and by volume of posts, report on top and
12	# 20 posters. Show their name, no. posts, size of posts, percentage	14	# bottom 20 posters. Show their name, number of posts, size of posts,
13	# quoted lines. Rank user-agents used, by poster rather than by	15	# percentage of quoted lines. Rank user-agents used, by poster rather than
14	# post. Rank top 10 threads. Rank top 10 cross-posted groups.	16	# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
-		17	#
-		18	# (Numbers and paths can be configured below. -- PE)
15	###################################################################	19	############################################################################
16		20
17	##################################################################	21	############################################################################
18	# RECENT CHANGES #	22	# RECENT CHANGES #
-		23	# 2011-07-03 PE - Use Encode to decode/encode MIME encodings
-		24	# - Use warnings, utf8 (just in case)
-		25	# - Documentation update
-		26	# N/A NN - Take newsgroup name as argument
19	# 2004/06/19 - newsgroup name is $ARGV[0]	27	# 2004-06-19 NN - newsgroup name is $ARGV[0]
20	# - Allow command line flags for subtracting	28	# - Allow command line flags for subtracting
21	# output if not pertinent for a group	29	# output if not pertinent for a group
22	# 2002/11/09 - Put Garry's writedata() function back in.	30	# 2002-11-09 NN - Put Garry's writedata() function back in.
23	# - added "rn" to my list of UA's	31	# - added "rn" to my list of UA's
24	# - Started using %distinct_agent for both User agent	32	# - Started using %distinct_agent for both User agent
25	# sections	33	# sections
26	# - named it newsstat.pl version 0.3	34	# - named it newsstat.pl version 0.3
27	# 2002/11/06 - Fixed the earliest/latest file problem by using	35	# 2002-11-06 NN - Fixed the earliest/latest file problem by using
28	# mtime rather than ctime, and simplifying the logic	36	# mtime rather than ctime, and simplifying the logic
29	# 2002/11/05 - moved user configurations to the top	37	# 2002-11-05 NN - moved user configurations to the top
30	# - fixed the cross-posting section	38	# - fixed the cross-posting section
31	# - introduced the $newsgroup_name variable which	39	# - introduced the $newsgroup_name variable which
32	# later becomes $news$group	40	# later becomes $news$group
33	# - changed $name to $agent_name in countagents()	41	# - changed $name to $agent_name in countagents()
-		42	#
-		43	# Contributors
-		44	# -------------
-		45	# NN Nomen nominandum (name to be determined later)
-		46	# PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
34		47
35	########### NEXT #############	48	########### TODO #############
36	# Commas in bottom section of report	49	# Commas in bottom section of report
37	# Show date the figures were compiled	50	# Show date the figures were compiled
38	# No. of HTML articles (Content-Type: text/html)	51	# No. of HTML articles (Content-Type: text/html)
39	# No. of quoted sigs (/>\s*-- /)	52	# No. of quoted sigs (/>\s*-- /)
40	# Per cent of top-posted articles	53	# Per cent of top-posted articles
41	# Top 10 cross-posters	54	# Top 10 cross-posters
42	# Top 20 news posting hosts (from Path)	55	# Top 20 news posting hosts (from Path)
43	# Count of certain subject words: newbie, kde, burner, sendmail, etc.	56	# Count of certain subject words: newbie, kde, burner, sendmail, etc.
44	# Count all User Agents that each poster uses	57	# Count all User Agents that each poster uses
45	# What do we do about Bill Unruh's ] quote style?	58	# What do we do about Bill Unruh's ] quote style?
46	# Change the way dates/times are checked	59	# Change the way dates/times are checked
47	# include % share in posters by no. of arts	60	# include % share in posters by no. of arts
48	# include % share in posters by size	61	# include % share in posters by size
49	# Total, orig & quoted lines by user agent with per cent	62	# Total, orig & quoted lines by user agent with per cent
50	# Take arguments, i.e. newsgroup name	63	# Take more arguments
51	#######################################################	64	#######################################################
52		65
###################### USER CONFIGURATIONS ############################

# The name of the group to do stats for (first command line argument).
# Without it there is nothing to analyse, so show the usage text instead.
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();

# Check for removal flags.  Report sections are skipped by number, given
# either as "-x 1,3" (separate argument) or "-x1,3" (attached).
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ ) {
    $j = $ix + 1;
    if ( $ARGV[$ix] eq "-x" ) {
        # A trailing "-x" has no list after it; ignore it rather than
        # splitting an undefined value.
        @skiplist = split( ",", $ARGV[$j] ) if defined $ARGV[$j];
    } elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ ) {
        # Anchored so that only an argument that starts with -x counts.
        @skiplist = split( ",", $1 );
    }
}
# Turn the list into a lookup table keyed by section number.
foreach my $section (@skiplist) {
    $skipSec{$section} = 1;
}

# Leafnode users will want /var/spool/news for this variable.
my $news = "/var/spool/news/";

# How many days are we doing statistics for?
my $numdays = 30;

# no. of agents we list
my $topagents = 10;

# no. of threads we want to know about
my $topthreads = 20;

# no. of top or bottom posters to show
my $topposters = 20;

# no. of cross-posted threads to show
my $topcrossposts = 10;

# no. of time zones to show
my $toptz = 10;
98		111
99		112
###################### DATA STRUCTURES ######################
# File-level state shared by the main loop, the report code and the
# subroutines further down.
my $group = $newsgroup_name;
$group =~ s!\.!/!g;        # group name -> spool sub-directory path
my %data;                  # name, count, agent, total, orig, quoted
my %threads;               # subject, count
my %crossposts;            # group, count
my %tz;                    # timezones by count
my %headers;               # holds header of current article
my %lcheader;              # holds lowercase headers
my @body;                  # holds body of current article
my @sig;                   # holds sig text;
my $totalposts = 0;        # total no. of posts considered (starts at 0
                           # like the other counters, so an empty group
                           # does not leave it undefined)
my $filename;              # name of current article file
my $filesize;              # size of current article file
my $earliest;              # earliest article we have found
my $latest;                # latest article we have found
my $poster;                # poster we are dealing with
my $totsize = 0;           # holds total sizes of all files
my $totheader = 0;         # total size of header material
my $totbody = 0;           # total size of body material
my $totsig = 0;            # total size of sig material
my $totorig = 0;           # total size of original material
my $totquoted = 0;         # total size of quoted material
my $origposts = 0;         # total no. of original posts
my $replies = 0;           # total no. of replies
my $i;                     # general purpose
my %distinct_agent;
my %agents =               # used to hold counts of User Agents used
    ( "KNode" => 0,
      "Pan" => 0,
      "Mozilla" => 0,
      "Sylpheed" => 0,
      "Gnus" => 0,
      "Forte Agent" => 0,
      "Forte Free Agent" => 0,
      "MicroPlanet Gravity" => 0,
      "Microsoft Outlook Express" => 0,
      "Xnews" => 0,
      "slrn" => 0,
      "tin" => 0,
      "rn" => 0,
      "NN" => 0,
      "MacSOUP" => 0,
      "Foorum" => 0,
      "MT-NewsWatcher" => 0,
      "News Rover" => 0,
      "WebTV" => 0,
      "Compuserver" => 0,
      "VSoup" => 0);
149		162
######################## MAIN CODE ########################
# Unbuffer STDOUT.  (This used to assign to $!, which only sets errno.)
$| = 1;

# Work inside the group's spool directory so bare file names can be used.
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir(my $spool_dh, ".") or die "Can't open $news$group directory: $!\n";
while (defined($filename = readdir($spool_dh))) {
    %lcheader = ();                     # forget the previous article's headers
    next unless -f $filename;           # only want real files
    next if ($filename eq ".overview"); # real articles only
    next if (-M $filename > $numdays);  # only want articles <= a certain age

    # Seed earliest/latest from the first article that qualifies.
    my $mtime = (stat $filename)[9];
    $earliest = $mtime unless defined ($earliest);
    $latest   = $mtime unless defined ($latest);

    getarticle($filename);              # read in the article
    getdata();                          # grab the data from the article
    $totalposts++;                      # bump count of articles considered
}
closedir($spool_dh);                    # finished with the directory

# post-processing
countagents();                          # count agents, collapsing versions
fixpercent();                           # check percentages orig/total for posters

writedata();
172		185
#################### DISPLAY RESULTS #####################
# Report banner followed by the overall totals.  Every average goes
# through $ratio so that an empty group (no posts, posters or threads)
# prints zeros instead of dying with "Illegal division by zero".
my $ratio = sub {
    my ($numerator, $denominator) = @_;
    return $denominator ? $numerator / $denominator : 0;
};
my $posts = defined $totalposts ? $totalposts : 0;

print "=" x 76, "\n";
printf "%s\n", centred("Analysis of posts to $newsgroup_name", 76);
print "=" x 76, "\n";
printf "%s\n", centred("(stats compiled with a script by Garry Knight et al.)", 76);
print "\n\n";
printf "Total posts considered: %s over %d days\n",
    commify($posts), $numdays;
# $earliest/$latest stay undefined when no article was read.
printf "Earliest article: %s\n",
    defined $earliest ? scalar localtime($earliest) : "n/a";
printf "Latest article: %s\n",
    defined $latest ? scalar localtime($latest) : "n/a";
printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
    commify(int($totsize / 1024)), $totsize / 1048576;
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
    commify(int($ratio->($posts, $numdays))),
    $ratio->($totsize, $numdays) / 1048576,
    commify(int($ratio->($totsize, $posts)));
# $count is reused by every report section below.
my $count = keys %data;
printf "Total headers: %s KB bodies: %s KB\n",
    commify(int($totheader / 1024)), commify(int($totbody / 1024));
printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n",
    commify(int($totquoted / 1024)), commify(int($totorig / 1024)),
    $ratio->($totorig * 100, $totorig + $totquoted), commify(int($totsig / 1024));
printf "Total number of posters: %s, average %s bytes per poster\n", commify($count),
    commify(int($ratio->($totsize, $count)));
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n", commify($count),
    commify(int($ratio->($totsize, $count)));
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
202		215
###############################
# show posters by article count Sec 1;
###############################
# Lists up to $topposters posters, most articles first.
unless ( $skipSec{1} ) {
    # Never announce more rows than there are posters.
    if (keys %data < $topposters) {
        $count = keys %data;
    } else {
        $count = $topposters;
    }
    printf "%s\n", centred("Top $count posters by number of articles", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) {
        # (an unused truncated copy of the name was dropped here)
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count};
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}
222		235
################################
# show posters by size in Kbytes Sec 2;
################################
# Lists up to $topposters posters, largest total article size first.
unless ( $skipSec{2} ) {
    # Never announce more rows than there are posters.
    if (keys %data < $topposters) {
        $count = keys %data;
    } else {
        $count = $topposters;
    }
    printf "%s\n", centred("Top $count posters by article size in Kbytes", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) {
        # (an unused truncated copy of the name was dropped here)
        # Sizes are accumulated in bytes; %d truncates the KB figure.
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024;
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}
242		255
####################################
# show top posters for original text
####################################
# Ranks posters by their {percent} value, highest first.  Posters with
# no quoted text or with fewer than 5 posts are left out.
unless ( $skipSec{3} ) {
    if (keys %data < $topposters) {
        $count = keys %data;
    } else {
        $count = $topposters;
    }
    # NOTE(review): the heading says "> 5 posts" but the filter below keeps
    # posters with 5 or more - confirm which one is intended.
    printf "%s\n", centred("Top $count responders by original text (> 5 posts)", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) {
        next if $data{$poster}{quoted} == 0;
        next if $data{$poster}{count} < 5;
        # (an unused truncated copy of the name was dropped here)
        printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent};
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}
264		277
#######################################
# show bottom posters for original text
#######################################
# Ranks posters by their {percent} value, lowest first.  Posters with
# no quoted text or with fewer than 5 posts are left out.
unless ( $skipSec{4} ) {
    if (keys %data < $topposters) {
        $count = keys %data;
    } else {
        $count = $topposters;
    }
    # NOTE(review): the heading says "> 5 posts" but the filter below keeps
    # posters with 5 or more - confirm which one is intended.
    printf "%s\n", centred("Bottom $count responders by original text (> 5 posts)", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) {
        next if $data{$poster}{quoted} == 0;
        next if $data{$poster}{count} < 5;
        # (an unused truncated copy of the name was dropped here)
        printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent};
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}
286		299
####################################
# show threads by number of articles
####################################
# Prints the busiest threads, most articles first, unless section 5 was
# switched off on the command line.
unless ( $skipSec{5} ) {
    my $nthreads = keys %threads;
    $count = $nthreads < $topthreads ? $nthreads : $topthreads;
    printf "%s\n", centred("Top $count threads by no. of articles", 76);
    print "=" x 76, "\n";
    $i = 0;
    my @ranked = sort { $threads{$b}{count} <=> $threads{$a}{count} } keys %threads;
    foreach my $subject (@ranked) {
        # Subjects are cut to 65 characters before padding.
        my $label = substr($subject, 0, 65);
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($label, 63, "."), $threads{$subject}{count};
        $i++;
        last if $i == $count;
    }
    print "\n", "=" x 76, "\n";
}
################################
# show threads by size in Kbytes
################################
# Prints the largest threads, biggest total size first, unless section 6
# was switched off on the command line.
unless ( $skipSec{6} ) {
    my $nthreads = keys %threads;
    $count = $nthreads < $topthreads ? $nthreads : $topthreads;
    printf "%s\n", centred("Top $count threads by size in KB", 76);
    print "=" x 76, "\n";
    $i = 0;
    my @ranked = sort { $threads{$b}{size} <=> $threads{$a}{size} } keys %threads;
    foreach my $subject (@ranked) {
        # Subjects are cut to 65 characters before padding.
        my $label = substr($subject, 0, 65);
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($label, 63, "."), $threads{$subject}{size} / 1024;
        $i++;
        last if $i == $count;
    }
    print "\n", "=" x 76, "\n";
}
325		338
#################################
# show top cross-posted groups
#################################
# Prints up to $topcrossposts other groups that articles were
# cross-posted to, most frequent first.
unless ( $skipSec{7} ) {
    delete $crossposts{$newsgroup_name};    # don't include ours
    my $ngroups = keys %crossposts;
    $count = $ngroups < $topcrossposts ? $ngroups : $topcrossposts;
    printf "%s\n", centred("Top $count cross-posted groups", 76);
    print "=" x 76, "\n";
    $i = 0;
    my @ranked = sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts;
    foreach my $other_group (@ranked) {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($other_group, 63, "."), $crossposts{$other_group};
        $i++;
        last if $i == $count;
    }
    print "\n", "=" x 76, "\n";
}
#######################
#show agents and counts
#######################
# Prints up to $topagents entries of %agents, highest count first.
unless ( $skipSec{8} ) {
    my $nagents = keys %agents;
    $count = $nagents < $topagents ? $nagents : $topagents;
    printf "%s\n", centred("Top $count User Agents by poster", 76);
    print "=" x 76, "\n";
    $i = 0;
    my @ranked = sort { $agents{$b} <=> $agents{$a} } keys %agents;
    foreach my $reader (@ranked) {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($reader, 63, "."), $agents{$reader};
        $i++;
        last if $i == $count;
    }
    print "\n", "=" x 76, "\n";
}
363		376
#######################
#show distinct agents
#######################
# Prints up to $topagents entries of %distinct_agent, highest count
# first, each with its share of all posts considered.
unless ( $skipSec{9} ) {
    my $nagents = keys %distinct_agent;
    $count = $nagents < $topagents ? $nagents : $topagents;
    printf "%s\n", centred("Top $count User Agents by number of posts", 76);
    print "=" x 76, "\n";
    $i = 0;
    my @ranked = sort { $distinct_agent{$b} <=> $distinct_agent{$a} } keys %distinct_agent;
    foreach my $reader (@ranked) {
        my $uses = $distinct_agent{$reader};
        printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($reader, 58, "."), $uses, (( $uses / $totalposts ) * 100);
        $i++;
        last if $i == $count;
    }
    print "\n", "=" x 76, "\n";
}
382		395
##########################
#show timezones and counts
##########################
# Prints up to $toptz time zones, most frequent first.
unless ( $skipSec{10} ) {
    if (keys %tz < $toptz) {
        $count = keys %tz;
    } else {
        $count = $toptz;
    }
    # The heading reports the real number of rows, as every other section
    # does, instead of a fixed "Top 10".
    printf "%s\n", centred("Top $count time zones", 76);
    print "=" x 76, "\n";
    $i = 0;
    foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) {
        printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone};
        last if (++$i == $count);
    }
    print "\n", "=" x 76, "\n";
}
401		414
402		415
403	################################ SUBROUTINES ################################	416	################################ SUBROUTINES ################################
404		417
405		418
#######################################
# get current article's header and body
#######################################
# getarticle($filename)
#   Reads one article file and fills the file-level state for it:
#     %headers   - header fields as written, values MIME-decoded
#     %lcheader  - lower-cased field names, names and values passed
#                  through clean()
#     @body      - every line after the blank line that ends the header
#     $filesize  - size of the file; also added to $totsize
#     $totheader - grows by the length of each header line read
#     $earliest / $latest - widened to include this file's mtime
#   Dies if the file cannot be opened.
sub getarticle {
    %headers = ();                  # dump old headers
    my $filename = shift;           # get the name of the file

    # get stats about the file itself
    $filesize = -s $filename;       # get total size of file
    $totsize += $filesize;          # bump total sizes of all files

    # Track the oldest and newest article seen.  Each bound is checked on
    # its own and tolerates not having been seeded by the caller.
    my $mtime = (stat $filename)[9];
    if ( !defined($earliest) || $mtime < $earliest ) {
        $earliest = $mtime;
    }
    if ( !defined($latest) || $mtime > $latest ) {
        $latest = $mtime;
    }

    # now read the file
    open(my $article_fh, '<', $filename) or die "Can't open $filename: $!\n";
    while (<$article_fh>) {
        $totheader += length($_);   # bump total header size
        last if (/^\s*$/);          # end of header?
        # "Field-Name: value" - capture the whole name and the whole value.
        # Lines that do not have this shape are skipped.
        if (/^([^:\s]+):\s+(.*)/) {
            my ($key, $val) = ($1, $2);
            $headers{$key} = decode('MIME-Header', $val);
            $lcheader{clean(lc($key))} = clean($val);
        }
    }
    @body = <$article_fh>;          # slurp up body
    close($article_fh);
} # getarticle
437		450
438	###################################	451	###################################
439	# get data from the current article	452	# get data from the current article
440	###################################	453	###################################
441	sub getdata {	454	sub getdata {
442	#### First, analyse header fields ####	455	#### First, analyse header fields ####
443		456
444	# Set up this poster if not defined, get counts, sizes	457	# Set up this poster if not defined, get counts, sizes
445	$poster = $headers{From}; # get the poster's name	458	$poster = encode('UTF-8', $headers{From}); # get the poster's name
446	if (!defined($data{$poster})) { # seen this one before?	459	if (!defined($data{$poster})) { # seen this one before?
447	$data{$poster}{agent} = 'Unknown'; # comes after For: field	460	$data{$poster}{agent} = 'Unknown'; # comes after For: field
448	$data{$poster}{orig} = 0;	461	$data{$poster}{orig} = 0;
449	$data{$poster}{quoted} = 0;	462	$data{$poster}{quoted} = 0;
450	}	463	}
451	$data{$poster}{count}++; # bump count for this poster	464	$data{$poster}{count}++; # bump count for this poster
452	$data{$poster}{size} += $filesize; # total size of file	465	$data{$poster}{size} += $filesize; # total size of file
453		466
454	# The User-Agent and/or X-Newsreader fields	467	# The User-Agent and/or X-Newsreader fields
455	# for User-Agent by poster	468	# for User-Agent by poster
456	if (defined $lcheader{"user-agent"}) {	469	if (defined $lcheader{"user-agent"}) {
457	$data{$poster}{agent} = $lcheader{"user-agent"};	470	$data{$poster}{agent} = $lcheader{"user-agent"};
458	}	471	}
459	if (defined $lcheader{"x-newsreader"}) {	472	if (defined $lcheader{"x-newsreader"}) {
460	$data{$poster}{agent} = $lcheader{"x-newsreader"};	473	$data{$poster}{agent} = $lcheader{"x-newsreader"};
461	}	474	}
462		475
463	# The User Agent for User-Agent by number of posts	476	# The User Agent for User-Agent by number of posts
464	my $UA = "unknown";	477	my $UA = "unknown";
465	foreach my $keys ( keys %lcheader )	478	foreach my $keys ( keys %lcheader )
466	{	479	{
467	if (defined $lcheader{'user-agent'})	480	if (defined $lcheader{'user-agent'})
468	{	481	{
469	$UA = $lcheader{'user-agent'};	482	$UA = $lcheader{'user-agent'};
470	}	483	}
471	elsif (defined $lcheader{"x-newsreader"})	484	elsif (defined $lcheader{"x-newsreader"})
472	{	485	{
473	$UA = $lcheader{"x-newsreader"};	486	$UA = $lcheader{"x-newsreader"};
474	}	487	}
475	elsif (defined $lcheader{'x-mailer'})	488	elsif (defined $lcheader{'x-mailer'})
476	{	489	{
477	$UA = $lcheader{'x-mailer'};	490	$UA = $lcheader{'x-mailer'};
478	}	491	}
479	elsif ((defined $lcheader{'organization'}) &&	492	elsif ((defined $lcheader{'organization'}) &&
480	($lcheader{'organization'} =~ /groups\.google\|AOL\|Supernews\|WebTV\|compuserve/))	493	($lcheader{'organization'} =~ /groups\.google\|AOL\|Supernews\|WebTV\|compuserve/))
481	{	494	{
482	$UA = $lcheader{'organization'};	495	$UA = $lcheader{'organization'};
483	}	496	}
484	elsif ( $lcheader{'message-id'} =~ /pine/i )	497	elsif ( $lcheader{'message-id'} =~ /pine/i )
485	{	498	{
486	$UA = "Pine";	499	$UA = "Pine";
487	} ## Hopefully found UA, else set to unknown	500	} ## Hopefully found UA, else set to unknown

Subversion Repositories LCARS

(root)/trunk/tools/network/news/newsstat/newsstat.pl @ 55 - Rev 5 → 6