Subversion Repositories LCARS

Compare Revisions

Last modification

Regard whitespace Rev 22 → Rev 14

/trunk/tools/network/news/newsstat/po/POTFILES.in
File deleted
\ No newline at end of file
/trunk/tools/network/news/newsstat/po/PACKAGE
File deleted
/trunk/tools/network/news/newsstat/po/de.pointedears.newsstat.pot
File deleted
/trunk/tools/network/news/newsstat/po/Makefile
File deleted
/trunk/tools/network/news/newsstat/po/de.po
File deleted
/trunk/tools/network/news/newsstat/newsstat.pl
1,39 → 1,33
#!/usr/bin/env perl
use strict;
use warnings;
 
#use diagnostics;
use diagnostics;
use utf8;
use Encode;
 
use constant DEBUG => 0;
 
## Print out all text to STDOUT UTF-8 encoded
binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';
 
# FIXME: Automatically include resolved '.' in @INC
# print join "\n", @INC;
##############################
## newsstat.pl version 0.4.3.1
 
use locale ':not_characters';
use Locale::TextDomain ('de.pointedears.newsstat');
use POSIX ('locale_h');
use Locale::Messages qw (bind_textdomain_filter
bind_textdomain_codeset
turn_utf_8_on);
#setlocale( LC_MESSAGES, '' );
bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
###########################################################################
## Collect statistics about a newsgroup (specified by first argument)
## in the local news spool. Check all articles in the last 30-day period.
## Rank posters by number of posts and by volume of posts, report on top
## and bottom 20 posters. Show their name, number of posts, size of posts,
## percentage of quoted lines. Rank user-agents used, by poster rather
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
##
## Numbers and paths can be configured below. See ChangeLog and TODO
## for more. -- PE
###########################################################################
require Mail::Message;
require DateTime;
require DateTime::Format::Mail;
###################### USER CONFIGURATIONS ############################
 
## The name of the group to do stats for
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();
$newsgroup_name or &usage;
 
## Check for removal flags
my $ix;
61,18 → 55,21
## Leafnode users will want /var/spool/news for this variable.
my $news = "/var/spool/news/";
 
## Number of top or bottom posters to show
my $topposters = 20;
## How many days are we doing statistics for?
my $numdays = 30;
 
## Number of agents we list
my $topagents = 10;
 
## Number of threads we want to know about
my $topthreads = 20;
 
## Number of top or bottom posters to show
my $topposters = 20;
 
## Number of cross-posted threads to show
my $topcrossposts = 10;
 
## Number of agents we list
my $topagents = 10;
 
## Number of time zones to show
my $toptz = 10;
 
80,20 → 77,28
my $group = $newsgroup_name;
$group =~ s!\.!/!g;
my %data; # name, count, agent, total, orig, quoted
my $totsize = 0; # holds total sizes of all files
my %threads; # subject, count
my %crossposts; # group, count
my %threads; # subject, count
my $replies = 0; # total no. of replies
my $origposts = 0; # total no. of original posts
my %tz; # timezones by count
my %headers; # holds header of current article
my %lcheader; # holds lowercase headers
my @body; # holds body of current article
my @sig; # holds sig text;
my $totalposts; # total no. of posts considered
my $filename; # name of current article file
my $filesize; # size of current article file
my $earliest; # earliest article we have found
my $latest; # latest article we have found
my $poster; # poster we are dealing with
my $totsize = 0; # holds total sizes of all files
my $totheader = 0; # total size of header material
my $totbody = 0; # total size of body material
my $totsig = 0; # total size of sig material
my $totorig = 0; # total size of original material
my $totquoted = 0; # total size of quoted material
my $totorig = 0; # total size of original material
my $totalposts; # total no. of posts considered
my $origposts = 0; # total no. of original posts
my $replies = 0; # total no. of replies
my $i; # general purpose
my %distinct_agent;
 
## Used to hold counts of User Agents used
119,478 → 124,65
"tin" => 0,
"VSoup" => 0,
"WebTV" => 0,
"Xnews" => 0,
"Xnews" => 0
);
 
my $datetime_parser = DateTime::Format::Mail->new();
$datetime_parser->loose();
######################## MAIN CODE ########################
$! = 1;
 
my $today = DateTime->today( time_zone => 'UTC' );
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
my $start = int $prev_month->strftime('%s');
my $numdays = int DateTime->last_day_of_month(
year => $prev_month->year(),
month => $prev_month->month(),
time_zone => $prev_month->time_zone(),
)->day();
my $end = int $today->clone()->set_day(1)->strftime('%s');
 
dmsg( $start, " to ", $end ) if DEBUG;
 
chdir("$news$group")
or die __x(
"Can't cd to {newsgroup}: {error}\n",
newsgroup => "$news$group",
error => $!
);
opendir( DIR, "." )
or die __x(
"Can't open {newsgroup}: {error}\n",
newsgroup => "$news$group",
error => $!
);
 
while ( defined( my $filename = readdir(DIR) ) )
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
while ( defined( $filename = readdir(DIR) ) )
{
%lcheader = ();
next unless -f $filename; # only want real files
next if ( $filename eq ".overview" ); # real articles only
 
get_article($filename); # read in the article
next if ( -M $filename > $numdays ); # only want articles <= a certain age
$earliest = ( stat $filename )[9] unless defined($earliest);
$latest = ( stat $filename )[9] unless defined($latest);
&get_article($filename); # read in the article
&get_data; # grab the data from the article
$totalposts++; # bump count of articles considered
}
closedir(DIR); # finished with the directory
 
dmsg("\nearliest: $earliest\nlatest: $latest") if DEBUG;
 
## Post-processing
count_agents(); # count agents, collapsing versions
fix_percent();
&count_agents; # count agents, collapsing versions
&fix_percent; # check percentages orig/total for posters
 
write_data();
display_results();
&write_data;
 
########################################
## Get current article's header and body
########################################
sub get_article
{
my $filename = shift;
 
open( my $FILE, '<', $filename )
or
die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
my $msg = Mail::Message->read($FILE);
my $timestamp = $msg->timestamp();
my $date = $msg->study('Date');
 
## Disregard article if timestamp is not in range
dmsg($timestamp) if DEBUG;
if ( $timestamp < $start or $timestamp >= $end )
{
dmsg("Posting on $date ignored.") if DEBUG;
return;
}
 
$totalposts++; # bump count of articles considered
 
## DEBUG
dmsg($date) if DEBUG;
 
## get stats about the file itself
my $filesize = -s $filename; # get total size of file
$totsize += $filesize; # bump total sizes of all files
 
if ( ( not defined $earliest ) or $timestamp < $earliest )
{
$earliest = $timestamp;
}
elsif ( ( not defined $latest ) or $timestamp > $latest )
{
$latest = $timestamp;
}
 
#print "timestamp: $timestamp\n";
 
## count header size
$totheader += $msg->head()->size();
 
## get the poster's name (MIME-decoded, in UTF-8)
my $poster = $msg->study('From');
if ( defined $poster )
{
## Convert old to new format
$poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
 
## Collapse whitespace
$poster =~ s/\s+/ /g;
 
## Remove outer quotes; TODO: observe RFC 5322 strictly
$poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
 
## DEBUG
dmsg($poster) if DEBUG;
 
## seen this one before?
if ( !defined( $data{$poster} ) )
{
$data{$poster}{'agent'} = __ 'unknown'; # comes after For: field
$data{$poster}{'orig'} = 0;
$data{$poster}{'quoted'} = 0;
}
$data{$poster}{'count'}++; # bump count for this poster
$data{$poster}{'size'} += $filesize; # total size of file
 
## The User-Agent and/or X-Newsreader fields
## for User-Agent by poster
my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
if ( defined $ua )
{
$data{$poster}{'agent'} = $ua;
 
## DEBUG
dmsg($ua) if DEBUG;
}
 
## The User Agent for User-Agent by number of posts
get_agent($msg);
 
## Get all cross-posted newsgroups
for ( split( /,/, $msg->study('Newsgroups') ) )
{
$crossposts{$_}++; # bump count for each
}
 
## Get threads
my $thread = $msg->study('Subject');
$thread =~ s/^re:\s+//i; # Remove Re: or re: at start
$thread =~ s/\s+/ /g; # collapse whitespace
$threads{$thread}{'count'}++; # bump count of this subject
$threads{$thread}{'size'} += $filesize; # bump bytes for this thread
 
## Is this an original post or a reply?
if ( defined $msg->study('References') )
{
$replies++;
}
else
{
$origposts++;
}
 
## Get the time zone
my $datetime = $datetime_parser->parse_datetime($date);
my $tz = $datetime->strftime('%z');
$tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
$tz{$tz}++;
 
## DEBUG
dmsg($tz) if DEBUG;
 
#### Now analyse the body text ####
my $body = $msg->body();
 
my $insig = 0;
my @body = $body->lines;
for (@body)
{
$totbody += length($_); # bump total body size
next if (m{^$>}o); # don't count blank lines in body
if ( $insig == 1 )
{
 
# bump total sig size
$totsig += length($_);
}
## are we in a quote line?
## Bill Unruh uses ] quotes, and another poster uses ::
elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
{
## bump count of quoted chrs
$data{$poster}{'quoted'} += length($_);
$totquoted += length($_);
}
elsif (/^-- $/)
{
$insig = 1;
}
else
{
## We must be processing an original line
$data{$poster}{'orig'} += length($_); # bump count of original chrs
$totorig += length($_);
}
}
 
# end for (@body)
}
 
close($FILE);
}
 
sub get_agent
{
my $msg = shift;
 
my $ua =
$msg->study('User-Agent')
or $msg->study('X-Newsreader')
or $msg->study('X-Mailer');
if ( not defined $ua )
{
my $org = $msg->study('Organization');
if ( defined $org
and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
{
$ua = $org;
}
elsif ( $msg->study('Message-ID') =~ /pine/i )
{
$ua = "Pine";
}
}
 
## Hopefully found UA, else set to unknown
if ( not defined $ua )
{
$ua = __ "unknown";
}
 
$ua = clean($ua);
 
my $raw = $ua;
my $agent = $raw;
 
## strip http
if ( $raw =~ /.*http.*/ )
{
$raw =~ s!posted via!!i;
$raw =~ s!http://!!g;
$raw =~ s!/!!g;
$raw =~ s! !!g;
}
 
## Fix Outlook from Mac
if ( $raw =~ /^microsoft/i )
{
$raw =~ s/-/ /g;
}
 
## Pick out the popular agents
if ( $raw =~ /(outlook express)/i
|| $raw =~ /(windows mail)/i
|| $raw =~ /(microplanet gravity)/i
|| $raw =~ /(news rover)/i
|| $raw =~ /(forte agent)/i
|| $raw =~ /(forte free agent)/i )
{
$agent = $1;
}
elsif (
$raw =~ /^(
pan
|sylpheed
|slrn
|mozilla
|knode
|tin
|hamster
|xrn
|xnews
|aol
|gnus
|krn
|macsoup
|messenger
|openxp
|pine
|thoth
|turnpike
|winvn
|vsoup
|google
|supernews
|nn
|rn
|007
|webtv
|compuserve
)/ix
)
{
$agent = $1;
}
else
{
## Clean up unknown agents
if ( $raw =~ m!^(.*?)/! )
{
$agent = $1;
}
elsif ( $raw =~ /^(\w*)\d.*/ )
{
$agent = $1;
}
}
 
$distinct_agent{$agent}++;
return $agent;
}
## get_agent
 
#########################################
## Count the User-Agents used, collapsing
## different versions into one per agent.
#########################################
sub count_agents
{
POSTER:
foreach my $poster ( keys %data )
{
foreach my $agent_name ( keys %distinct_agent )
{ # check against known ones
if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
{
$agents{$agent_name}++;
next POSTER;
}
}
$agents{ $data{$poster}{'agent'} }++;
}
} # count_agents
 
#############################################
## Set orig/total percentages for all posters
#############################################
sub fix_percent
{
foreach my $poster ( keys %data )
{
my $percent = 100;
if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
{
$percent =
$data{$poster}{'orig'} * 100 /
( $data{$poster}{'quoted'} + $data{$poster}{'orig'} ); #/
}
elsif ( $data{$poster}{'orig'} == 0 )
{
$percent = 0;
}
$data{$poster}{'percent'} = $percent;
}
}
## fix_percent
 
##################################
## Write data structures to a file
##################################
sub write_data
{
open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
or die __x( "Can't create XDATA: {error}\n", error => $! );
print $OUTF "Data collected from $newsgroup_name\n\n";
print $OUTF
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
foreach my $name ( keys %data )
{
print $OUTF
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "Thread subjects\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
{
print $OUTF
"$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "Cross-posts\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %crossposts )
{
print $OUTF "$name : $crossposts{$name}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "User agents\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %agents )
{
print $OUTF "$name : $agents{$name}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "Time zones\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %tz )
{
print $OUTF "$name : $tz{$name}\n";
}
close $OUTF;
} # write_data
 
sub display_results
{
#################### DISPLAY RESULTS #####################
print "=" x 76, "\n";
printf "%s\n",
centred(
__x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
76 );
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
print "=" x 76, "\n";
printf "%s\n",
centred(
__
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)",
76
);
&centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
print "\n\n";
printf __"Total posts considered: %s over %d days" . "\n",
commify($totalposts),
printf "Total posts considered: %s over %d days\n", commify($totalposts),
$numdays;
my $time_locale = setlocale(LC_TIME);
my $earliest_datetime = DateTime->from_epoch(
epoch => $earliest,
locale => $time_locale,
time_zone => 'UTC',
);
my $latest_datetime = DateTime->from_epoch(
epoch => $latest,
locale => $time_locale,
time_zone => 'UTC',
);
my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format);
printf __"Latest article" . ": %s\n", $latest_datetime->strftime($datetime_format);
printf __"Original articles: %s; replies" . ": %s\n",
commify($origposts),
printf "Earliest article: %s\n", scalar localtime($earliest);
printf "Latest article: %s\n", scalar localtime($latest);
printf "Original articles: %s, replies: %s\n", commify($origposts),
commify($replies);
printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n",
commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576; #
printf __
"Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
commify( int( $totsize / 1024 ) ), $totsize / 1048576; #
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
commify( int( $totsize / $totalposts ) );
my $count = keys %data;
printf __"Total headers: %s KiB; bodies: %s KiB\n",
printf "Total headers: %s KiB bodies: %s KiB\n",
commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
printf __
"Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n",
printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n",
commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
( $totorig * 100 ) / ( $totorig + $totquoted ),
commify( int( $totsig / 1024 ) );
printf __"Total number of posters: %s, average %s bytes per poster\n",
printf "Total number of posters: %s, average %s bytes per poster\n",
commify($count), commify( int( $totsize / $count ) ); #/
$count = keys %threads;
printf __"Total number of threads: %s, average %s bytes per thread\n",
printf "Total number of threads: %s, average %s bytes per thread\n",
commify($count), commify( int( $totsize / $count ) ); #/
printf __"Total number of user agents: %d\n", scalar keys %agents;
printf "Total number of user agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
 
########################################
606,11 → 198,9
{
$count = $topposters;
}
printf "%s\n",
centred(
__x( "Top {count} posters by number of articles", count => $topposters ), 76 );
printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach
my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
{
635,14 → 225,10
{
$count = $topposters;
}
printf "%s\n",
centred(
__x( "Top {count} posters by article size in KiB", count => $topposters ),
76 );
printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
print "=" x 76, "\n";
my $i = 0;
foreach
my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
$i = 0;
foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
{
my $name = substr( $poster, 0, 62 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
666,15 → 252,9
$count = $topposters;
}
printf "%s\n",
centred(
__x(
"Top {count} responders by original text (> 5 posts)",
count => $topposters
),
76
);
&centred( "Top $count responders by original text (> 5 posts)", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $poster (
sort { $data{$b}{percent} <=> $data{$a}{percent} }
keys %data
704,15 → 284,9
$count = $topposters;
}
printf "%s\n",
centred(
__x(
"Bottom {count} responders by original text (> 5 posts)",
count => $topposters
),
76
);
&centred( "Bottom $count responders by original text (> 5 posts)", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $poster (
sort { $data{$a}{percent} <=> $data{$b}{percent} }
keys %data
741,19 → 315,17
{
$count = $topthreads;
}
printf "%s\n",
centred( __x( "Top {count} threads by no. of articles", count => $topthreads ),
76 );
printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $thread (
sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
sort { $threads{$b}{count} <=> $threads{$a}{count} }
keys %threads
)
{
my $name = substr( $thread, 0, 65 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
$threads{$thread}{'count'};
$threads{$thread}{count};
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
772,19 → 344,17
{
$count = $topthreads;
}
printf "%s\n",
centred( __x( "Top {count} threads by size in KiB", count => $topthreads ),
76 );
printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $thread (
sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
sort { $threads{$b}{size} <=> $threads{$a}{size} }
keys %threads
)
{
my $name = substr( $thread, 0, 65 );
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
$threads{$thread}{'size'} / 1024; #/
$threads{$thread}{size} / 1024; #/
last if ( ++$i == $count );
}
print "\n", "=" x 76, "\n";
804,10 → 374,9
{
$count = $topcrossposts;
}
printf "%s\n",
centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach
my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
{
831,11 → 400,9
{
$count = $topagents;
}
printf "%s\n",
centred( __x( "Top {count} user agents by poster", count => $topagents ),
76 );
printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
{
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
858,12 → 425,9
{
$count = $topagents;
}
printf "%s\n",
centred(
__x( "Top {count} user agents by number of posts", count => $topagents ),
76 );
printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $agent (
sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
keys %distinct_agent
890,9 → 454,9
{
$count = $toptz;
}
printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 );
printf "%s\n", &centred( "Top 10 time zones", 76 );
print "=" x 76, "\n";
my $i = 0;
$i = 0;
foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
{
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
900,10 → 464,307
}
print "\n", "=" x 76, "\n";
}
 
################################ SUBROUTINES ################################
 
########################################
## Get current article's header and body
########################################
sub get_article
{
%headers = (); # dump old headers
my $filename = shift; # get the name of the file
 
## get stats about the file itself
$filesize = -s $filename; # get total size of file
$totsize += $filesize; # bump total sizes of all files
 
my $mtime = ( stat $filename )[9];
if ( $mtime < $earliest )
{
$earliest = $mtime;
}
elsif ( $mtime > $latest )
{
$latest = $mtime;
}
 
## helper subs
## now read the file
open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
while (<$FILE>)
{
$totheader += length($_); # bump total header size
last if (/^\s*$/); # end of header?
if (/^([^:\s]*):\s*(.*)/)
{
my ( $key, $val ) = ( $1, $2 );
$headers{$key} = decode( 'MIME-Header', $val );
$lcheader{ clean( lc($key) ) } = clean($val);
}
}
@body = <$FILE>; # slurp up body
close($FILE);
} # get_article
 
####################################
## Get data from the current article
####################################
sub get_data
{
#### First, analyse header fields ####
 
## Set up this poster if not defined, get counts, sizes
my $poster = $headers{From}; # get the poster's name
# Convert old to new format
$poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
# Collapse whitespace
$poster =~ s/\s+/ /;
# Remove outer quotes
$poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
if ( !defined( $data{$poster} ) )
{ # seen this one before?
$data{$poster}{agent} = 'Unknown'; # comes after For: field
$data{$poster}{orig} = 0;
$data{$poster}{quoted} = 0;
}
$data{$poster}{count}++; # bump count for this poster
$data{$poster}{size} += $filesize; # total size of file
 
## The User-Agent and/or X-Newsreader fields
## for User-Agent by poster
if ( defined $lcheader{"user-agent"} )
{
$data{$poster}{agent} = $lcheader{"user-agent"};
}
if ( defined $lcheader{"x-newsreader"} )
{
$data{$poster}{agent} = $lcheader{"x-newsreader"};
}
 
## The User Agent for User-Agent by number of posts
my $UA = "unknown";
foreach my $keys ( keys %lcheader )
{
if ( defined $lcheader{'user-agent'} )
{
$UA = $lcheader{'user-agent'};
}
elsif ( defined $lcheader{"x-newsreader"} )
{
$UA = $lcheader{"x-newsreader"};
}
elsif ( defined $lcheader{'x-mailer'} )
{
$UA = $lcheader{'x-mailer'};
}
elsif (
( defined $lcheader{'organization'} )
&& ( $lcheader{'organization'} =~
/groups\.google|AOL|Supernews|WebTV|compuserve/ )
)
{
$UA = $lcheader{'organization'};
}
elsif ( $lcheader{'message-id'} =~ /pine/i )
{
$UA = "Pine";
} ## Hopefully found UA, else set to unknown
}
 
$UA = clean($UA);
$UA = get_agent($UA);
 
sub get_agent
{
my $raw = shift;
my $agent = $raw;
 
## strip http
if ( $raw =~ /.*http.*/ )
{
$raw =~ s!posted via!!i;
$raw =~ s!http://!!g;
$raw =~ s!/!!g;
$raw =~ s! !!g;
}
 
## Fix Outlook from Mac
if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
 
## Pick out the popular agents
if (
$raw =~ /(outlook express)/i
|| $raw =~ /(windows mail)/i
|| $raw =~ /(microplanet gravity)/i
|| $raw =~ /(news rover)/i
|| $raw =~ /(forte agent)/i
|| $raw =~ /(forte free agent)/i
)
{
$agent = $1;
}
elsif (
$raw =~ /^(
pan
|sylpheed
|slrn
|mozilla
|knode
|tin
|hamster
|xrn
|xnews
|aol
|gnus
|krn
|macsoup
|messenger
|openxp
|pine
|thoth
|turnpike
|winvn
|vsoup
|google
|supernews
|nn
|rn
|007
|webtv
|compuserve
)/ix
)
{
$agent = $1;
}
else
{
## Clean up unknown agents
if ( $raw =~ m!^(.*?)/! )
{
$agent = $1;
}
elsif ( $raw =~ /^(\w*)\d.*/ )
{
$agent = $1;
}
}
 
$distinct_agent{$agent}++;
return $agent;
}
 
## Get all cross-posted newsgroups
for ( split /,/, $headers{"Newsgroups"} )
{
$crossposts{$_}++; # bump count for each
}
 
## Get threads
my $thread = $headers{"Subject"};
$thread =~ s/^re: //i; # Remove Re: or re: at start
$thread =~ s/\s+/ /g; # collapse whitespace
$threads{$thread}{count} += 1; # bump count of this subject
$threads{$thread}{size} += $filesize; # bump bytes for this thread
 
## Is this an original post or a reply?
if ( defined $headers{"References"} )
{
$replies++;
}
else
{
$origposts++;
}
 
## Get the time zone
$_ = $headers{"Date"};
my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
{
$tz = "UTC";
}
$tz{$tz}++;
 
#### Now analyse the body text ####
my $insig = 0;
for (@body)
{
$totbody += length($_); # bump total body size
next if (/^$>/); # don't count blank lines in body
if ( $insig == 1 )
{
$totsig += length($_); # bump total sig size
 
## Bill Unruh uses ] quotes, and another poster uses ::
}
elsif ( /^\s*[>\]]/ or /^\s*::/ )
{ # are we in a quote line?
$data{$poster}{quoted} += length($_); # bump count of quoted chrs
$totquoted += length($_);
}
elsif (/-- /)
{
$insig = 1;
}
else
{
 
## We must be processing an original line
$data{$poster}{orig} += length($_); # bump count of original chrs
$totorig += length($_);
}
} # end for (@body)
 
} # get_data
 
#########################################
## Count the User-Agents used, collapsing
## different versions into one per agent.
#########################################
sub count_agents
{
POSTER:
foreach my $poster ( keys %data )
{
foreach my $agent_name ( keys %distinct_agent )
{ # check against known ones
if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
{
$agents{$agent_name}++;
next POSTER;
}
}
$agents{ $data{$poster}{agent} }++;
}
} # count_agents
 
#############################################
## Set orig/total percentages for all posters
#############################################
sub fix_percent
{
foreach my $poster ( keys %data )
{
my $percent = 100;
if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
{
$percent =
$data{$poster}{orig} * 100 /
( $data{$poster}{quoted} + $data{$poster}{orig} ); #/
}
elsif ( $data{$poster}{orig} == 0 )
{
$percent = 0;
}
$data{$poster}{percent} = $percent;
}
}
 
###############################
## Right pad a string with '.'s
###############################
913,7 → 774,7
my ( $text, $pad_len, $pad_chr ) = @_;
 
## DEBUG
printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
#printf "|%s| = %d\n", $text, length($text);
 
if ( length($text) > $pad_len )
{
937,36 → 798,10
###########################
## Put commas into a number
###########################
# Get some of locale's numeric formatting parameters
my ($thousands_sep, $grouping) =
@{localeconv()}{'thousands_sep', 'grouping'};
# Apply defaults if values are missing
$thousands_sep = ',' unless $thousands_sep;
# grouping and mon_grouping are packed lists
# of small integers (characters) telling the
# grouping (thousand_seps and mon_thousand_seps
# being the group dividers) of numbers and
# monetary quantities. The integers' meanings:
# 255 means no more grouping, 0 means repeat
# the previous grouping, 1-254 means use that
# as the current grouping. Grouping goes from
# right to left (low to high digits). In the
# below we cheat slightly by never using anything
# else than the first grouping (whatever that is).
my @grouping;
if ($grouping) {
@grouping = unpack("C*", $grouping);
} else {
@grouping = (3);
}
 
sub commify
{
local $_ = shift;
#1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
$_ = int; # Chop non-integer part
1 while
s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/;
1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
return $_;
}
 
984,17 → 819,61
 
sub usage
{
print __"usage: newsstat.pl NEWS.GROUP\n";
print "usage: newstat.pl newsgroupname\n";
exit 1;
}
 
sub dmsg
##################################
## Write data structures to a file
##################################
sub write_data
{
print STDERR @_, "\n";
open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
or die "Can't create XDATA: $!\n";
print $OUTF "Data collected from $newsgroup_name\n\n";
print $OUTF
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
foreach my $name ( keys %data )
{
print $OUTF
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
}
 
sub dmsg2
print $OUTF
"============================================================================\n";
print $OUTF "Thread subjects\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
{
my ( $level, @msg ) = @_;
print STDERR @msg, "\n" if $level >= DEBUG;
print $OUTF
"$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "Cross-posts\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %crossposts )
{
print $OUTF "$name : $crossposts{$name}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "User agents\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %agents )
{
print $OUTF "$name : $agents{$name}\n";
}
print $OUTF
"============================================================================\n";
print $OUTF "Time zones\n";
print $OUTF
"----------------------------------------------------------------------------\n";
foreach my $name ( sort keys %tz )
{
print $OUTF "$name : $tz{$name}\n";
}
close $OUTF;
} # write_data