Subversion Repositories LCARS

Compare Revisions

Last modification

Ignore whitespace Rev 6 → Rev 7

/trunk/tools/network/news/newsstat/newsstat.pl
1,4 → 1,4
#!/usr/bin/perl -w
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
9,12 → 9,12
 
############################################################################
# Collect statistics about a newsgroup (specified by first argument) in
# the local news spool. Check all articles in the last 30-day period.
# the local news spool. Check all articles in the last 30-day period.
# Rank posters by number of posts and by volume of posts, report on top and
# bottom 20 posters. Show their name, number of posts, size of posts,
# percentage of quoted lines. Rank user-agents used, by poster rather than
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
#
#
# (Numbers and paths can be configured below. -- PE)
############################################################################
 
21,7 → 21,7
############################################################################
# RECENT CHANGES #
# 2011-07-03 PE - Use Encode to decode/encode MIME encodings
# - Use warnings, utf8 (just in case)
# - Use warnings, utf8 (just in case)
# - Documentation update
# N/A NN - Take newsgroup name as argument
# 2004-06-19 NN - newsgroup name is $ARGV[0]
39,7 → 39,7
# - introduced the $newsgroup_name variable which
# later becomes $news$group
# - changed $name to $agent_name in countagents()
#
#
# Contributors
# -------------
# NN Nomen nominandum (name to be determined later)
75,15 → 75,20
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ ) {
$j = $ix + 1;
if ( $ARGV[$ix] eq "-x" ) {
@skiplist = split(",",$ARGV[$j]);
} elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) {
@skiplist = split(",",$1);
}
for ( $ix = 1 ; $ix < $args ; $ix++ )
{
$j = $ix + 1;
if ( $ARGV[$ix] eq "-x" )
{
@skiplist = split( ",", $ARGV[$j] );
}
elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
{
@skiplist = split( ",", $1 );
}
}
foreach(@skiplist) {
foreach (@skiplist)
{
$skipSec{$_} = 1;
}
 
108,108 → 113,113
# no. of time zones to show
my $toptz = 10;
 
 
 
###################### DATA STRUCTURES ######################
my $group = $newsgroup_name;
$group =~ s!\.!/!g;
my %data; # name, count, agent, total, orig, quoted
my %threads; # subject, count
my %crossposts; # group, count
my %tz; # timezones by count
my %headers; # holds header of current article
my %lcheader; # holds lowercase headers
my @body; # holds body of current article
my @sig; # holds sig text;
my $totalposts; # total no. of posts considered
my $filename; # name of current article file
my $filesize; # size of current article file
my $earliest; # earliest article we have found
my $latest; # latest article we have found
my $poster; # poster we are dealing with
my $totsize = 0; # holds total sizes of all files
my $totheader = 0; # total size of header material
my $totbody = 0; # total size of body material
my $totsig = 0; # total size of sig material
my $totorig = 0; # total size of original material
my $totquoted = 0; # total size of quoted material
my $origposts = 0; # total no. of original posts
my $replies = 0; # total no. of replies
my $i; # general purpose
my %data; # name, count, agent, total, orig, quoted
my %threads; # subject, count
my %crossposts; # group, count
my %tz; # timezones by count
my %headers; # holds header of current article
my %lcheader; # holds lowercase headers
my @body; # holds body of current article
my @sig; # holds sig text;
my $totalposts; # total no. of posts considered
my $filename; # name of current article file
my $filesize; # size of current article file
my $earliest; # earliest article we have found
my $latest; # latest article we have found
my $poster; # poster we are dealing with
my $totsize = 0; # holds total sizes of all files
my $totheader = 0; # total size of header material
my $totbody = 0; # total size of body material
my $totsig = 0; # total size of sig material
my $totorig = 0; # total size of original material
my $totquoted = 0; # total size of quoted material
my $origposts = 0; # total no. of original posts
my $replies = 0; # total no. of replies
my $i; # general purpose
my %distinct_agent;
my %agents = # used to hold counts of User Agents used
( "KNode" => 0,
"Pan" => 0,
"Mozilla" => 0,
"Sylpheed" => 0,
"Gnus" => 0,
"Forte Agent" => 0,
"Forte Free Agent" => 0,
"MicroPlanet Gravity" => 0,
"Microsoft Outlook Express" => 0,
"Xnews" => 0,
"slrn" => 0,
"tin" => 0,
"rn" => 0,
"NN" => 0,
"MacSOUP" => 0,
"Foorum" => 0,
"MT-NewsWatcher" => 0,
"News Rover" => 0,
"WebTV" => 0,
"Compuserver" => 0,
"VSoup" => 0);
my %agents = # used to hold counts of User Agents used
(
"KNode" => 0,
"Pan" => 0,
"Mozilla" => 0,
"Sylpheed" => 0,
"Gnus" => 0,
"Forte Agent" => 0,
"Forte Free Agent" => 0,
"MicroPlanet Gravity" => 0,
"Microsoft Outlook Express" => 0,
"Xnews" => 0,
"slrn" => 0,
"tin" => 0,
"rn" => 0,
"NN" => 0,
"MacSOUP" => 0,
"Foorum" => 0,
"MT-NewsWatcher" => 0,
"News Rover" => 0,
"WebTV" => 0,
"Compuserver" => 0,
"VSoup" => 0
);
 
######################## MAIN CODE ########################
$! = 1;
 
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir(DIR, ".") or die "Can't open $news$group directory: $!\n";
while (defined($filename = readdir(DIR))) {
%lcheader = ();
next unless -f $filename; # only want real files
next if ($filename eq ".overview"); # real articles only
next if (-M $filename > $numdays); # only want articles <= a certain age
$earliest = (stat $filename)[9] unless defined ($earliest);
$latest = (stat $filename)[9] unless defined ($latest);
&getarticle($filename); # read in the article
&getdata; # grab the data from the article
$totalposts++; # bump count of articles considered
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir(DIR) ) )
{
%lcheader = ();
next unless -f $filename; # only want real files
next if ( $filename eq ".overview" ); # real articles only
next if ( -M $filename > $numdays ); # only want articles <= a certain age
$earliest = ( stat $filename )[9] unless defined($earliest);
$latest = ( stat $filename )[9] unless defined($latest);
&getarticle($filename); # read in the article
&getdata; # grab the data from the article
$totalposts++; # bump count of articles considered
}
closedir(DIR); # finished with the directory
closedir(DIR); # finished with the directory
 
# post-processing
&countagents; # count agents, collapsing versions
&fixpercent; # check percentages orig/total for posters
&countagents; # count agents, collapsing versions
&fixpercent; # check percentages orig/total for posters
 
&writedata;
 
#################### DISPLAY RESULTS #####################
print "=" x 76, "\n";
printf "%s\n", &centred("Analysis of posts to $newsgroup_name", 76);
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n", &centred("(stats compiled with a script by Garry Knight et al.)", 76);
printf "%s\n",
&centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";
printf "Total posts considered: %s over %d days\n",
commify($totalposts), $numdays;
printf "Earliest article: %s\n", scalar localtime($earliest);
printf "Latest article: %s\n", scalar localtime($latest);
printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies);
printf "Total posts considered: %s over %d days\n", commify($totalposts),
$numdays;
printf "Earliest article: %s\n", scalar localtime($earliest);
printf "Latest article: %s\n", scalar localtime($latest);
printf "Original articles: %s, replies: %s\n", commify($origposts),
commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
commify(int($totsize / 1024)), $totsize / 1048576; #
commify( int( $totsize / 1024 ) ), $totsize / 1048576; #
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
commify(int($totalposts / $numdays)),
$totsize / $numdays / 1048576, commify(int($totsize / $totalposts));
commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
commify( int( $totsize / $totalposts ) );
my $count = keys %data;
printf "Total headers: %s KB bodies: %s KB\n",
commify(int($totheader / 1024)), commify(int($totbody / 1024));
commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n",
commify(int($totquoted / 1024)), commify(int($totorig / 1024)),
($totorig * 100) / ($totorig + $totquoted), commify(int($totsig / 1024));
printf "Total number of posters: %s, average %s bytes per poster\n", commify($count),
commify(int($totsize / $count)); #/
commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
( $totorig * 100 ) / ( $totorig + $totquoted ),
commify( int( $totsig / 1024 ) );
printf "Total number of posters: %s, average %s bytes per poster\n",
commify($count), commify( int( $totsize / $count ) ); #/
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n", commify($count),
commify(int($totsize / $count)); #/
printf "Total number of threads: %s, average %s bytes per thread\n",
commify($count), commify( int( $totsize / $count ) ); #/
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
 
216,39 → 226,51
###############################
# show posters by article count Sec 1;
###############################
unless ( $skipSec{1} ) {
if (keys %data < $topposters) {
$count = keys %data;
} else {
$count = $topposters;
}
printf "%s\n", &centred("Top $count posters by number of articles", 76);
print "=" x 76, "\n";
$i = 0;
foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) {
my $name = substr($poster, 0, 65);
printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count};
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
unless ( $skipSec{1} )
{
if ( keys %data < $topposters )
{
$count = keys %data;
}
else
{
$count = $topposters;
}
printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
print "=" x 76, "\n";
$i = 0;
foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
{
my $name = substr( $poster, 0, 65 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
$data{$poster}{count};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
 
################################
# show posters by size in Kbytes Sec 2;
################################
unless ( $skipSec{2} ) {
if (keys %data < $topposters) {
unless ( $skipSec{2} )
{
if ( keys %data < $topposters )
{
$count = keys %data;
} else {
}
else
{
$count = $topposters;
}
printf "%s\n", &centred("Top $count posters by article size in Kbytes", 76);
printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
print "=" x 76, "\n";
$i = 0;
foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) {
my $name = substr($poster, 0, 62);
printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024; #/
last if (++$i == $count);
foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
{
my $name = substr( $poster, 0, 62 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
$data{$poster}{size} / 1024; #/
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
256,43 → 278,59
####################################
# show top posters for original text
####################################
unless ( $skipSec{3} ) {
if (keys %data < $topposters) {
$count = keys %data;
} else {
$count = $topposters;
}
printf "%s\n", &centred("Top $count responders by original text (> 5 posts)", 76);
print "=" x 76, "\n";
$i = 0;
foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) {
next if $data{$poster}{quoted} == 0;
next if $data{$poster}{count} < 5;
my $name = substr($poster, 0, 63);
printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent};
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
unless ( $skipSec{3} )
{
if ( keys %data < $topposters )
{
$count = keys %data;
}
else
{
$count = $topposters;
}
printf "%s\n",
&centred( "Top $count responders by original text (> 5 posts)", 76 );
print "=" x 76, "\n";
$i = 0;
foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
keys %data )
{
next if $data{$poster}{quoted} == 0;
next if $data{$poster}{count} < 5;
my $name = substr( $poster, 0, 63 );
printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
$data{$poster}{percent};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
 
#######################################
# show bottom posters for original text
#######################################
unless ( $skipSec{4} ) {
if (keys %data < $topposters) {
unless ( $skipSec{4} )
{
if ( keys %data < $topposters )
{
$count = keys %data;
} else {
}
else
{
$count = $topposters;
}
printf "%s\n", &centred("Bottom $count responders by original text (> 5 posts)", 76);
printf "%s\n",
&centred( "Bottom $count responders by original text (> 5 posts)", 76 );
print "=" x 76, "\n";
$i = 0;
foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) {
foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
keys %data )
{
next if $data{$poster}{quoted} == 0;
next if $data{$poster}{count} < 5;
my $name = substr($poster, 0, 63);
printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent};
last if (++$i == $count);
my $name = substr( $poster, 0, 63 );
printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
$data{$poster}{percent};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
300,19 → 338,26
####################################
# show threads by number of articles
####################################
unless ( $skipSec{5} ) {
if (keys %threads < $topthreads) {
unless ( $skipSec{5} )
{
if ( keys %threads < $topthreads )
{
$count = keys %threads;
} else {
}
else
{
$count = $topthreads;
}
printf "%s\n", &centred("Top $count threads by no. of articles", 76);
printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
print "=" x 76, "\n";
$i = 0;
foreach my $thread (sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads) {
my $name = substr($thread, 0, 65);
printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{count};
last if (++$i == $count);
foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
keys %threads )
{
my $name = substr( $thread, 0, 65 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
$threads{$thread}{count};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
319,19 → 364,26
################################
# show threads by size in Kbytes
################################
unless ( $skipSec{6} ) {
if (keys %threads < $topthreads) {
unless ( $skipSec{6} )
{
if ( keys %threads < $topthreads )
{
$count = keys %threads;
} else {
}
else
{
$count = $topthreads;
}
printf "%s\n", &centred("Top $count threads by size in KB", 76);
printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
print "=" x 76, "\n";
$i = 0;
foreach my $thread (sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads) {
my $name = substr($thread, 0, 65);
printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{size} / 1024; #/
last if (++$i == $count);
foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
keys %threads )
{
my $name = substr( $thread, 0, 65 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
$threads{$thread}{size} / 1024; #/
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
339,198 → 391,236
#################################
# show top 10 cross-posted groups
#################################
unless ( $skipSec{7} ) {
delete $crossposts{"$newsgroup_name"}; # don't include ours
if (keys %crossposts < $topcrossposts) {
$count = keys %crossposts;
} else {
$count = $topcrossposts;
unless ( $skipSec{7} )
{
delete $crossposts{"$newsgroup_name"}; # don't include ours
if ( keys %crossposts < $topcrossposts )
{
$count = keys %crossposts;
}
else
{
$count = $topcrossposts;
}
printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
print "=" x 76, "\n";
$i = 0;
foreach
my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
{
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
$crossposts{$name};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
printf "%s\n", &centred("Top $count cross-posted groups", 76);
print "=" x 76, "\n";
$i = 0;
foreach my $name (sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts) {
printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $crossposts{$name};
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
}
#######################
#show agents and counts
#######################
unless ( $skipSec{8} ) {
if (keys %agents < $topagents) {
$count = keys %agents;
} else {
$count = $topagents;
unless ( $skipSec{8} )
{
if ( keys %agents < $topagents )
{
$count = keys %agents;
}
else
{
$count = $topagents;
}
printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
print "=" x 76, "\n";
$i = 0;
foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
{
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
$agents{$agent};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
printf "%s\n", &centred("Top $count User Agents by poster", 76);
print "=" x 76, "\n";
$i = 0;
foreach my $agent (sort {$agents{$b} <=> $agents{$a}} keys %agents) {
printf "%2d: %-63s : %6d\n", $i + 1, rpad($agent, 63, "."), $agents{$agent};
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
}
 
#######################
#show distinct agents
#######################
unless ( $skipSec{9} ) {
if (keys %distinct_agent < $topagents) {
$count = keys %distinct_agent;
} else {
$count = $topagents;
unless ( $skipSec{9} )
{
if ( keys %distinct_agent < $topagents )
{
$count = keys %distinct_agent;
}
else
{
$count = $topagents;
}
printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
print "=" x 76, "\n";
$i = 0;
foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
keys %distinct_agent )
{
printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
$distinct_agent{$agent},
( ( $distinct_agent{$agent} / $totalposts ) * 100 );
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
printf "%s\n", &centred("Top $count User Agents by number of posts", 76);
print "=" x 76, "\n";
$i = 0;
foreach my $agent (sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent) {
printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($agent, 58, "."), $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100);
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
}
 
##########################
#show timezones and counts
##########################
unless ( $skipSec{10} ) {
if (keys %tz < $toptz) {
$count = keys %tz;
} else {
$count = $toptz;
unless ( $skipSec{10} )
{
if ( keys %tz < $toptz )
{
$count = keys %tz;
}
else
{
$count = $toptz;
}
printf "%s\n", &centred( "Top 10 time zones", 76 );
print "=" x 76, "\n";
$i = 0;
foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
{
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
}
printf "%s\n", &centred("Top 10 time zones", 76);
print "=" x 76, "\n";
$i = 0;
foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) {
printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone};
last if (++$i == $count);
}
print "\n", "=" x 76, "\n";
}
 
 
################################ SUBROUTINES ################################
 
 
#######################################
# get current article's header and body
#######################################
sub getarticle {
%headers = (); # dump old headers
my $filename = shift; # get the name of the file
# get stats about the file itself
$filesize = -s $filename; # get total size of file
$totsize += $filesize; # bump total sizes of all files
sub getarticle
{
%headers = (); # dump old headers
my $filename = shift; # get the name of the file
 
my $mtime = (stat $filename)[9];
if ( $mtime < $earliest ) {
# get stats about the file itself
$filesize = -s $filename; # get total size of file
$totsize += $filesize; # bump total sizes of all files
 
my $mtime = ( stat $filename )[9];
if ( $mtime < $earliest )
{
$earliest = $mtime;
} elsif ( $mtime > $latest ) {
}
elsif ( $mtime > $latest )
{
$latest = $mtime;
}
}
 
# now read the file
open(FILE, $filename) or die "Can't open $filename: $!\n";
while (<FILE>) {
$totheader += length($_); # bump total header size
last if (/^\s*$/); # end of header?
if (/^([^:\s]*):\s+(.*)/) {
my($key,$val) = ($1,$2);
$headers{$key} = decode('MIME-Header', $val);
$lcheader{clean(lc($key))} = clean($val);
# now read the file
open( FILE, $filename ) or die "Can't open $filename: $!\n";
while (<FILE>)
{
$totheader += length($_); # bump total header size
last if (/^\s*$/); # end of header?
if (/^([^:\s]*):\s+(.*)/)
{
my ( $key, $val ) = ( $1, $2 );
$headers{$key} = decode( 'MIME-Header', $val );
$lcheader{ clean( lc($key) ) } = clean($val);
}
}
}
@body = <FILE>; # slurp up body
close(FILE);
} # getarticle
@body = <FILE>; # slurp up body
close(FILE);
} # getarticle
 
###################################
# get data from the current article
###################################
sub getdata {
sub getdata
{
#### First, analyse header fields ####
 
# Set up this poster if not defined, get counts, sizes
$poster = encode('UTF-8', $headers{From}); # get the poster's name
if (!defined($data{$poster})) { # seen this one before?
$data{$poster}{agent} = 'Unknown'; # comes after For: field
$data{$poster}{orig} = 0;
$data{$poster}{quoted} = 0;
}
$data{$poster}{count}++; # bump count for this poster
$data{$poster}{size} += $filesize; # total size of file
# Set up this poster if not defined, get counts, sizes
$poster = encode( 'UTF-8', $headers{From} ); # get the poster's name
if ( !defined( $data{$poster} ) )
{ # seen this one before?
$data{$poster}{agent} = 'Unknown'; # comes after For: field
$data{$poster}{orig} = 0;
$data{$poster}{quoted} = 0;
}
$data{$poster}{count}++; # bump count for this poster
$data{$poster}{size} += $filesize; # total size of file
 
# The User-Agent and/or X-Newsreader fields
# for User-Agent by poster
if (defined $lcheader{"user-agent"}) {
$data{$poster}{agent} = $lcheader{"user-agent"};
}
if (defined $lcheader{"x-newsreader"}) {
$data{$poster}{agent} = $lcheader{"x-newsreader"};
}
# The User-Agent and/or X-Newsreader fields
# for User-Agent by poster
if ( defined $lcheader{"user-agent"} )
{
$data{$poster}{agent} = $lcheader{"user-agent"};
}
if ( defined $lcheader{"x-newsreader"} )
{
$data{$poster}{agent} = $lcheader{"x-newsreader"};
}
 
# The User Agent for User-Agent by number of posts
my $UA = "unknown";
foreach my $keys ( keys %lcheader )
{
if (defined $lcheader{'user-agent'})
{
$UA = $lcheader{'user-agent'};
}
elsif (defined $lcheader{"x-newsreader"})
{
$UA = $lcheader{"x-newsreader"};
}
elsif (defined $lcheader{'x-mailer'})
{
$UA = $lcheader{'x-mailer'};
}
elsif ((defined $lcheader{'organization'}) &&
($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/))
{
$UA = $lcheader{'organization'};
}
elsif ( $lcheader{'message-id'} =~ /pine/i )
{
$UA = "Pine";
} ## Hopefully found UA, else set to unknown
}
# The User Agent for User-Agent by number of posts
my $UA = "unknown";
foreach my $keys ( keys %lcheader )
{
if ( defined $lcheader{'user-agent'} )
{
$UA = $lcheader{'user-agent'};
}
elsif ( defined $lcheader{"x-newsreader"} )
{
$UA = $lcheader{"x-newsreader"};
}
elsif ( defined $lcheader{'x-mailer'} )
{
$UA = $lcheader{'x-mailer'};
}
elsif (
( defined $lcheader{'organization'} )
&& ( $lcheader{'organization'} =~
/groups\.google|AOL|Supernews|WebTV|compuserve/ )
)
{
$UA = $lcheader{'organization'};
}
elsif ( $lcheader{'message-id'} =~ /pine/i )
{
$UA = "Pine";
} ## Hopefully found UA, else set to unknown
}
 
$UA = clean($UA);
$UA = get_agent($UA);
 
$UA = clean($UA);
$UA = get_agent($UA);
sub get_agent
{
my $raw = shift;
my $agent = $raw;
 
## strip http
if ( $raw =~ /.*http.*/ )
{
$raw =~ s!posted via!!i;
$raw =~ s!http://!!g;
$raw =~ s!/!!g;
$raw =~ s! !!g;
}
 
sub get_agent {
my $raw = shift;
my $agent = $raw;
## Fix Outlook from Mac
if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
 
## strip http
if ( $raw =~ /.*http.*/ ) {
$raw =~ s!posted via!!i;
$raw =~ s!http://!!g;
$raw =~ s!/!!g;
$raw =~ s! !!g;
}
 
## Fix Outlook from Mac
if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;}
 
## Pick out the popular agents
if ( $raw =~ /(outlook express)/i ||
$raw =~ /(microplanet gravity)/i ||
$raw =~ /(news rover)/i ||
$raw =~ /(forte agent)/i ||
$raw =~ /(forte free agent)/i
)
{
$agent = $1;
}
elsif ( $raw =~ /^(
## Pick out the popular agents
if ( $raw =~ /(outlook express)/i
|| $raw =~ /(microplanet gravity)/i
|| $raw =~ /(news rover)/i
|| $raw =~ /(forte agent)/i
|| $raw =~ /(forte free agent)/i )
{
$agent = $1;
}
elsif (
$raw =~ /^(
pan
|sylpheed
|slrn
558,193 → 648,240
|007
|webtv
|compuserve
)/ix )
{
$agent = $1;
}
else
{
## Clean up unknown agents
if ( $raw =~ m!^(.*?)/! ) {
$agent = $1;
}
elsif ( $raw =~ /^(\w*)\d.*/ )
{
$agent = $1;
}
)/ix
)
{
$agent = $1;
}
else
{
## Clean up unknown agents
if ( $raw =~ m!^(.*?)/! )
{
$agent = $1;
}
elsif ( $raw =~ /^(\w*)\d.*/ )
{
$agent = $1;
}
}
 
$distinct_agent{$agent}++;
return $agent;
}
 
$distinct_agent{$agent}++;
return $agent;
}
# Get all cross-posted newsgroups
for ( split /,/, $headers{"Newsgroups"} )
{
$crossposts{$_}++; # bump count for each
}
 
# Get threads
my $thread = encode( 'UTF-8', $headers{"Subject"} );
$thread =~ s/^re: //i; # Remove Re: or re: at start
$thread =~ s/\s+/ /g; # collapse whitespace
$threads{$thread}{count} += 1; # bump count of this subject
$threads{$thread}{size} += $filesize; # bump bytes for this thread
 
# Get all cross-posted newsgroups
for (split /,/, $headers{"Newsgroups"}) {
$crossposts{$_}++; # bump count for each
}
# Is this an original post or a reply?
if ( defined $headers{"References"} )
{
$replies++;
}
else
{
$origposts++;
}
 
# Get threads
my $thread = encode('UTF-8', $headers{"Subject"});
$thread =~ s/^re: //i; # Remove Re: or re: at start
$thread =~ s/\s+/ /g; # collapse whitespace
$threads{$thread}{count} += 1; # bump count of this subject
$threads{$thread}{size} += $filesize; # bump bytes for this thread
# Get the time zone
$_ = $headers{"Date"};
my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
{
$tz = "UTC";
}
$tz{$tz}++;
 
# Is this an original post or a reply?
if (defined $headers{"References"}) {
$replies++;
} else {
$origposts++;
}
#### Now analyse the body text ####
my $insig = 0;
for (@body)
{
$totbody += length($_); # bump total body size
next if (/^$>/); # don't count blank lines in body
if ( $insig == 1 )
{
$totsig += length($_); # bump total sig size
 
# Get the time zone
$_ = $headers{"Date"};
my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
if (($tz =~ /UTC/) or ($tz =~ /GMT/) or ($tz =~ /0000/)) {
$tz = "UTC";
}
$tz{$tz}++;
# Bill Unruh uses ] quotes, and another poster uses ::
}
elsif ( /^\s*[>\]]/ or /^\s*::/ )
{ # are we in a quote line?
$data{$poster}{quoted} += length($_); # bump count of quoted chrs
$totquoted += length($_);
}
elsif (/-- /)
{
$insig = 1;
}
else
{
 
#### Now analyse the body text ####
my $insig = 0;
for (@body) {
$totbody += length($_); # bump total body size
next if (/^$>/); # don't count blank lines in body
if ($insig == 1) {
$totsig += length($_); # bump total sig size
# Bill Unruh uses ] quotes, and another poster uses ::
} elsif (/^\s*[>\]]/ or /^\s*::/) { # are we in a quote line?
$data{$poster}{quoted} += length($_); # bump count of quoted chrs
$totquoted += length($_);
} elsif (/-- /) {
$insig = 1;
} else {
# we must be processing an original line
$data{$poster}{orig} += length($_); # bump count of original chrs
$totorig += length($_);
}
} # end for (@body)
# we must be processing an original line
$data{$poster}{orig} += length($_); # bump count of original chrs
$totorig += length($_);
}
} # end for (@body)
 
} # getdata
} # getdata
 
########################################
# Count the User-Agents used, collapsing
# different versions into one per agent.
########################################
sub countagents {
sub countagents
{
POSTER:
foreach $poster (keys %data) {
foreach my $agent_name (keys %distinct_agent) { # check against known ones
if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) {
$agents{$agent_name}++;
next POSTER;
}
}
$agents{$data{$poster}{agent}}++;
}
} # countagents
foreach $poster ( keys %data )
{
foreach my $agent_name ( keys %distinct_agent )
{ # check against known ones
if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
{
$agents{$agent_name}++;
next POSTER;
}
}
$agents{ $data{$poster}{agent} }++;
}
} # countagents
 
############################################
# set orig/total percentages for all posters
############################################
sub fixpercent {
foreach $poster (keys %data) {
my $percent = 100;
if (($data{$poster}{orig} != 0) and ($data{$poster}{quoted} != 0)) {
$percent = $data{$poster}{orig} * 100 / ($data{$poster}{quoted} + $data{$poster}{orig}); #/
} elsif ($data{$poster}{orig} == 0) {
$percent = 0;
}
$data{$poster}{percent} = $percent;
}
sub fixpercent
{
foreach $poster ( keys %data )
{
my $percent = 100;
if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
{
$percent = $data{$poster}{orig} * 100 /
( $data{$poster}{quoted} + $data{$poster}{orig} ); #/
}
elsif ( $data{$poster}{orig} == 0 )
{
$percent = 0;
}
$data{$poster}{percent} = $percent;
}
}
 
##############################
# right pad a string with '.'s
##############################
sub rpad {
# get text to pad, length to pad, pad chr
my ($text, $pad_len, $pad_chr) = @_;
if (length($text) > $pad_len) {
$text = substr($text, 0, $pad_len);
}
my $padded = $text . $pad_chr x ( $pad_len - length( $text ) );
return $padded;
sub rpad
{
 
# get text to pad, length to pad, pad chr
my ( $text, $pad_len, $pad_chr ) = @_;
if ( length($text) > $pad_len )
{
$text = substr( $text, 0, $pad_len );
}
my $padded = $text . $pad_chr x ( $pad_len - length($text) );
return $padded;
}
 
#################
# centre a string
#################
sub centred {
my ($text, $width) = @_; # text to centre, size of field to centre in
my $pad_len = ($width - length($text)) / 2; #/
my $centred = " " x $pad_len . $text;
return $centred;
sub centred
{
my ( $text, $width ) = @_; # text to centre, size of field to centre in
my $pad_len = ( $width - length($text) ) / 2; #/
my $centred = " " x $pad_len . $text;
return $centred;
}
 
##########################
# put commas into a number
##########################
sub commify {
$_ = shift;
1 while s/^(-?\d+)(\d{3})/$1,$2/;
return $_;
sub commify
{
$_ = shift;
1 while s/^(-?\d+)(\d{3})/$1,$2/;
return $_;
}
 
#########################
# clean
#########################
sub clean {
my $dirty = shift;
my $clean = $dirty;
$clean =~ s/^\s*//;
$clean =~ s/\s*$//;
sub clean
{
my $dirty = shift;
my $clean = $dirty;
$clean =~ s/^\s*//;
$clean =~ s/\s*$//;
 
return $clean;
return $clean;
}
 
sub usage
{
 
sub usage {
 
print "usage: newstat.pl newsgroupname\n";
exit 1;
print "usage: newstat.pl newsgroupname\n";
exit 1;
}
 
###################################
# Write data structures to a file #
###################################
sub writedata {
open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
print OUTF "Data collected from alt.os.linux.mandrake\n\n";
print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
foreach my $name (keys %data) {
print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
}
print OUTF "============================================================================\n";
print OUTF "Thread subjects\n";
print OUTF "----------------------------------------------------------------------------\n";
foreach my $thread (sort {"\L$a" cmp "\L$b"} keys %threads) {
print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
}
print OUTF "============================================================================\n";
print OUTF "Cross-posts\n";
print OUTF "----------------------------------------------------------------------------\n";
foreach my $name (sort keys %crossposts) {
print OUTF "$name : $crossposts{$name}\n";
}
print OUTF
print OUTF "============================================================================\n";
print OUTF "User agents\n";
print OUTF "----------------------------------------------------------------------------\n";
foreach my $name (sort keys %agents) {
print OUTF "$name : $agents{$name}\n";
}
print OUTF "============================================================================\n";
print OUTF "Time zones\n";
print OUTF "----------------------------------------------------------------------------\n";
foreach my $name (sort keys %tz) {
print OUTF "$name : $tz{$name}\n";
}
close OUTF;
} # writedata
sub writedata
{
open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
print OUTF "Data collected from alt.os.linux.mandrake\n\n";
print OUTF
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
foreach my $name ( keys %data )
{
print OUTF
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
}
print OUTF
"============================================================================\n";
print OUTF "Thread subjects\n";
print OUTF
"----------------------------------------------------------------------------\n";
foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
{
print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
}
print OUTF
"============================================================================\n";
print OUTF "Cross-posts\n";
print OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %crossposts )
{
print OUTF "$name : $crossposts{$name}\n";
}
print OUTF print OUTF
"============================================================================\n";
print OUTF "User agents\n";
print OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %agents )
{
print OUTF "$name : $agents{$name}\n";
}
print OUTF
"============================================================================\n";
print OUTF "Time zones\n";
print OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %tz )
{
print OUTF "$name : $tz{$name}\n";
}
close OUTF;
} # writedata