1,33 → 1,39 |
#!/usr/bin/env perl |
use strict; |
use warnings; |
use diagnostics; |
|
#use diagnostics; |
use utf8; |
use Encode; |
|
use constant DEBUG => 0; |
|
## Print out all text to STDOUT UTF-8 encoded |
binmode STDOUT, ':encoding(UTF-8)'; |
binmode STDERR, ':encoding(UTF-8)'; |
|
############################## |
## newsstat.pl version 0.4.3.1 |
# FIXME: Automatically include resolved '.' in @INC |
# print join "\n", @INC; |
|
########################################################################### |
## Collect statistics about a newsgroup (specified by first argument) |
## in the local news spool. Check all articles in the last 30-day period. |
## Rank posters by number of posts and by volume of posts, report on top |
## and bottom 20 posters. Show their name, number of posts, size of posts, |
## percentage of quoted lines. Rank user-agents used, by poster rather |
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
## |
## Numbers and paths can be configured below. See ChangeLog and TODO |
## for more. -- PE |
########################################################################### |
use locale ':not_characters'; |
use Locale::TextDomain ('de.pointedears.newsstat'); |
use POSIX ('locale_h'); |
use Locale::Messages qw (bind_textdomain_filter |
bind_textdomain_codeset |
turn_utf_8_on); |
#setlocale( LC_MESSAGES, '' ); |
bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on; |
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8'; |
|
require Mail::Message; |
require DateTime; |
require DateTime::Format::Mail; |
|
###################### USER CONFIGURATIONS ############################ |
|
## The name of the group to do stats for |
my $newsgroup_name = $ARGV[0]; |
$newsgroup_name or &usage; |
$newsgroup_name or usage(); |
|
## Check for removal flags |
my $ix; |
55,21 → 61,18 |
## Leafnode users will want /var/spool/news for this variable. |
my $news = "/var/spool/news/"; |
|
## How many days are we doing statistics for? |
my $numdays = 30; |
## Number of top or bottom posters to show |
my $topposters = 20; |
|
## Number of agents we list |
my $topagents = 10; |
|
## Number of threads we want to know about |
my $topthreads = 20; |
|
## Number of top or bottom posters to show |
my $topposters = 20; |
|
## Number of cross-posted threads to show |
my $topcrossposts = 10; |
|
## Number of agents we list |
my $topagents = 10; |
|
## Number of time zones to show |
my $toptz = 10; |
|
77,28 → 80,20 |
my $group = $newsgroup_name; |
$group =~ s!\.!/!g; |
my %data; # name, count, agent, total, orig, quoted |
my $totsize = 0; # holds total sizes of all files |
my %crossposts; # group, count |
my %threads; # subject, count |
my %crossposts; # group, count |
my $replies = 0; # total no. of replies |
my $origposts = 0; # total no. of original posts |
my %tz; # timezones by count |
my %headers; # holds header of current article |
my %lcheader; # holds lowercase headers |
my @body; # holds body of current article |
my @sig; # holds sig text; |
my $totalposts; # total no. of posts considered |
my $filename; # name of current article file |
my $filesize; # size of current article file |
my $earliest; # earliest article we have found |
my $latest; # latest article we have found |
my $poster; # poster we are dealing with |
my $totsize = 0; # holds total sizes of all files |
my $totheader = 0; # total size of header material |
my $totbody = 0; # total size of body material |
my $totsig = 0; # total size of sig material |
my $totquoted = 0; # total size of quoted material |
my $totorig = 0; # total size of original material |
my $totquoted = 0; # total size of quoted material |
my $origposts = 0; # total no. of original posts |
my $replies = 0; # total no. of replies |
my $i; # general purpose |
my $totalposts; # total no. of posts considered |
my %distinct_agent; |
|
## Used to hold counts of User Agents used |
124,65 → 119,478 |
"tin" => 0, |
"VSoup" => 0, |
"WebTV" => 0, |
"Xnews" => 0 |
"Xnews" => 0, |
); |
|
######################## MAIN CODE ######################## |
$! = 1; |
my $datetime_parser = DateTime::Format::Mail->new(); |
$datetime_parser->loose(); |
|
chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
while ( defined( $filename = readdir(DIR) ) ) |
my $today = DateTime->today( time_zone => 'UTC' ); |
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1); |
my $start = int $prev_month->strftime('%s'); |
my $numdays = int DateTime->last_day_of_month( |
year => $prev_month->year(), |
month => $prev_month->month(), |
time_zone => $prev_month->time_zone(), |
)->day(); |
my $end = int $today->clone()->set_day(1)->strftime('%s'); |
|
dmsg( $start, " to ", $end ) if DEBUG; |
|
chdir("$news$group") |
or die __x( |
"Can't cd to {newsgroup}: {error}\n", |
newsgroup => "$news$group", |
error => $! |
); |
opendir( DIR, "." ) |
or die __x( |
"Can't open {newsgroup}: {error}\n", |
newsgroup => "$news$group", |
error => $! |
); |
|
while ( defined( my $filename = readdir(DIR) ) ) |
{ |
%lcheader = (); |
next unless -f $filename; # only want real files |
next if ( $filename eq ".overview" ); # real articles only |
next if ( -M $filename > $numdays ); # only want articles <= a certain age |
$earliest = ( stat $filename )[9] unless defined($earliest); |
$latest = ( stat $filename )[9] unless defined($latest); |
&get_article($filename); # read in the article |
&get_data; # grab the data from the article |
$totalposts++; # bump count of articles considered |
|
get_article($filename); # read in the article |
} |
closedir(DIR); # finished with the directory |
|
dmsg("\nearliest: $earliest\nlatest: $latest") if DEBUG; |
|
## Post-processing |
&count_agents; # count agents, collapsing versions |
&fix_percent; # check percentages orig/total for posters |
count_agents(); # count agents, collapsing versions |
fix_percent(); |
|
&write_data; |
write_data(); |
display_results(); |
|
######################################## |
## Get current article's header and body |
######################################## |
sub get_article |
{ |
my $filename = shift; |
|
open( my $FILE, '<', $filename ) |
or |
die __x( "Can't open {file}: {error}\n", file => $filename, error => $! ); |
my $msg = Mail::Message->read($FILE); |
my $timestamp = $msg->timestamp(); |
my $date = $msg->study('Date'); |
|
## Disregard article if timestamp is not in range |
dmsg($timestamp) if DEBUG; |
if ( $timestamp < $start or $timestamp >= $end ) |
{ |
dmsg("Posting on $date ignored.") if DEBUG; |
return; |
} |
|
$totalposts++; # bump count of articles considered |
|
## DEBUG |
dmsg($date) if DEBUG; |
|
## get stats about the file itself |
my $filesize = -s $filename; # get total size of file |
$totsize += $filesize; # bump total sizes of all files |
|
if ( ( not defined $earliest ) or $timestamp < $earliest ) |
{ |
$earliest = $timestamp; |
} |
elsif ( ( not defined $latest ) or $timestamp > $latest ) |
{ |
$latest = $timestamp; |
} |
|
#print "timestamp: $timestamp\n"; |
|
## count header size |
$totheader += $msg->head()->size(); |
|
## get the poster's name (MIME-decoded, in UTF-8) |
my $poster = $msg->study('From'); |
if ( defined $poster ) |
{ |
## Convert old to new format |
$poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
|
## Collapse whitespace |
$poster =~ s/\s+/ /g; |
|
## Remove outer quotes; TODO: observe RFC 5322 strictly |
$poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x; |
|
## DEBUG |
dmsg($poster) if DEBUG; |
|
## seen this one before? |
if ( !defined( $data{$poster} ) ) |
{ |
$data{$poster}{'agent'} = __ 'unknown'; # comes after For: field |
$data{$poster}{'orig'} = 0; |
$data{$poster}{'quoted'} = 0; |
} |
$data{$poster}{'count'}++; # bump count for this poster |
$data{$poster}{'size'} += $filesize; # total size of file |
|
## The User-Agent and/or X-Newsreader fields |
## for User-Agent by poster |
my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader'); |
if ( defined $ua ) |
{ |
$data{$poster}{'agent'} = $ua; |
|
## DEBUG |
dmsg($ua) if DEBUG; |
} |
|
## The User Agent for User-Agent by number of posts |
get_agent($msg); |
|
## Get all cross-posted newsgroups |
for ( split( /,/, $msg->study('Newsgroups') ) ) |
{ |
$crossposts{$_}++; # bump count for each |
} |
|
## Get threads |
my $thread = $msg->study('Subject'); |
$thread =~ s/^re:\s+//i; # Remove Re: or re: at start |
$thread =~ s/\s+/ /g; # collapse whitespace |
$threads{$thread}{'count'}++; # bump count of this subject |
$threads{$thread}{'size'} += $filesize; # bump bytes for this thread |
|
## Is this an original post or a reply? |
if ( defined $msg->study('References') ) |
{ |
$replies++; |
} |
else |
{ |
$origposts++; |
} |
|
## Get the time zone |
my $datetime = $datetime_parser->parse_datetime($date); |
my $tz = $datetime->strftime('%z'); |
$tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o; |
$tz{$tz}++; |
|
## DEBUG |
dmsg($tz) if DEBUG; |
|
#### Now analyse the body text #### |
my $body = $msg->body(); |
|
my $insig = 0; |
my @body = $body->lines; |
for (@body) |
{ |
$totbody += length($_); # bump total body size |
next if (m{^$>}o); # don't count blank lines in body |
if ( $insig == 1 ) |
{ |
|
# bump total sig size |
$totsig += length($_); |
} |
## are we in a quote line? |
## Bill Unruh uses ] quotes, and another poster uses :: |
elsif ( m{^\s*[>\]]}o or m{^\s*::}o ) |
{ |
## bump count of quoted chrs |
$data{$poster}{'quoted'} += length($_); |
$totquoted += length($_); |
} |
elsif (/^-- $/) |
{ |
$insig = 1; |
} |
else |
{ |
## We must be processing an original line |
$data{$poster}{'orig'} += length($_); # bump count of original chrs |
$totorig += length($_); |
} |
} |
|
# end for (@body) |
} |
|
close($FILE); |
} |
|
sub get_agent |
{ |
my $msg = shift; |
|
my $ua = |
$msg->study('User-Agent') |
or $msg->study('X-Newsreader') |
or $msg->study('X-Mailer'); |
if ( not defined $ua ) |
{ |
my $org = $msg->study('Organization'); |
if ( defined $org |
and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
{ |
$ua = $org; |
} |
elsif ( $msg->study('Message-ID') =~ /pine/i ) |
{ |
$ua = "Pine"; |
} |
} |
|
## Hopefully found UA, else set to unknown |
if ( not defined $ua ) |
{ |
$ua = __ "unknown"; |
} |
|
$ua = clean($ua); |
|
my $raw = $ua; |
my $agent = $raw; |
|
## strip http |
if ( $raw =~ /.*http.*/ ) |
{ |
$raw =~ s!posted via!!i; |
$raw =~ s!http://!!g; |
$raw =~ s!/!!g; |
$raw =~ s! !!g; |
} |
|
## Fix Outlook from Mac |
if ( $raw =~ /^microsoft/i ) |
{ |
$raw =~ s/-/ /g; |
} |
|
## Pick out the popular agents |
if ( $raw =~ /(outlook express)/i |
|| $raw =~ /(windows mail)/i |
|| $raw =~ /(microplanet gravity)/i |
|| $raw =~ /(news rover)/i |
|| $raw =~ /(forte agent)/i |
|| $raw =~ /(forte free agent)/i ) |
{ |
$agent = $1; |
} |
elsif ( |
$raw =~ /^( |
pan |
|sylpheed |
|slrn |
|mozilla |
|knode |
|tin |
|hamster |
|xrn |
|xnews |
|aol |
|gnus |
|krn |
|macsoup |
|messenger |
|openxp |
|pine |
|thoth |
|turnpike |
|winvn |
|vsoup |
|google |
|supernews |
|nn |
|rn |
|007 |
|webtv |
|compuserve |
)/ix |
) |
{ |
$agent = $1; |
} |
else |
{ |
## Clean up unknown agents |
if ( $raw =~ m!^(.*?)/! ) |
{ |
$agent = $1; |
} |
elsif ( $raw =~ /^(\w*)\d.*/ ) |
{ |
$agent = $1; |
} |
} |
|
$distinct_agent{$agent}++; |
return $agent; |
} |
## get_agent |
|
######################################### |
## Count the User-Agents used, collapsing |
## different versions into one per agent. |
######################################### |
sub count_agents |
{ |
POSTER: |
foreach my $poster ( keys %data ) |
{ |
foreach my $agent_name ( keys %distinct_agent ) |
{ # check against known ones |
if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ ) |
{ |
$agents{$agent_name}++; |
next POSTER; |
} |
} |
$agents{ $data{$poster}{'agent'} }++; |
} |
} # count_agents |
|
############################################# |
## Set orig/total percentages for all posters |
############################################# |
sub fix_percent |
{ |
foreach my $poster ( keys %data ) |
{ |
my $percent = 100; |
if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) ) |
{ |
$percent = |
$data{$poster}{'orig'} * 100 / |
( $data{$poster}{'quoted'} + $data{$poster}{'orig'} ); #/ |
} |
elsif ( $data{$poster}{'orig'} == 0 ) |
{ |
$percent = 0; |
} |
$data{$poster}{'percent'} = $percent; |
} |
} |
## fix_percent |
|
################################## |
## Write data structures to a file |
################################## |
sub write_data |
{ |
open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" ) |
or die __x( "Can't create XDATA: {error}\n", error => $! ); |
print $OUTF "Data collected from $newsgroup_name\n\n"; |
print $OUTF |
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
foreach my $name ( keys %data ) |
{ |
print $OUTF |
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Thread subjects\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
{ |
print $OUTF |
"$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Cross-posts\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %crossposts ) |
{ |
print $OUTF "$name : $crossposts{$name}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "User agents\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %agents ) |
{ |
print $OUTF "$name : $agents{$name}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Time zones\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %tz ) |
{ |
print $OUTF "$name : $tz{$name}\n"; |
} |
close $OUTF; |
} # write_data |
|
sub display_results |
{ |
#################### DISPLAY RESULTS ##################### |
print "=" x 76, "\n"; |
printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
printf "%s\n", |
centred( |
__x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ), |
76 ); |
print "=" x 76, "\n"; |
printf "%s\n", |
¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
centred( |
__ |
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)", |
76 |
); |
print "\n\n"; |
printf "Total posts considered: %s over %d days\n", commify($totalposts), |
printf __"Total posts considered: %s over %d days" . "\n", |
commify($totalposts), |
$numdays; |
printf "Earliest article: %s\n", scalar localtime($earliest); |
printf "Latest article: %s\n", scalar localtime($latest); |
printf "Original articles: %s, replies: %s\n", commify($origposts), |
my $time_locale = setlocale(LC_TIME); |
my $earliest_datetime = DateTime->from_epoch( |
epoch => $earliest, |
locale => $time_locale, |
time_zone => 'UTC', |
); |
my $latest_datetime = DateTime->from_epoch( |
epoch => $latest, |
locale => $time_locale, |
time_zone => 'UTC', |
); |
my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z'; |
printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format); |
printf __"Latest article" . ": %s\n", $latest_datetime->strftime($datetime_format); |
printf __"Original articles: %s; replies" . ": %s\n", |
commify($origposts), |
commify($replies); |
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n", |
commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
printf __ |
"Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
commify( int( $totsize / $totalposts ) ); |
my $count = keys %data; |
printf "Total headers: %s KiB bodies: %s KiB\n", |
printf __"Total headers: %s KiB; bodies: %s KiB\n", |
commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
printf __ |
"Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n", |
commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
( $totorig * 100 ) / ( $totorig + $totquoted ), |
commify( int( $totsig / 1024 ) ); |
printf "Total number of posters: %s, average %s bytes per poster\n", |
printf __"Total number of posters: %s, average %s bytes per poster\n", |
commify($count), commify( int( $totsize / $count ) ); #/ |
$count = keys %threads; |
printf "Total number of threads: %s, average %s bytes per thread\n", |
printf __"Total number of threads: %s, average %s bytes per thread\n", |
commify($count), commify( int( $totsize / $count ) ); #/ |
printf "Total number of user agents: %d\n", scalar keys %agents; |
printf __"Total number of user agents: %d\n", scalar keys %agents; |
print "\n", "=" x 76, "\n"; |
|
######################################## |
198,9 → 606,11 |
{ |
$count = $topposters; |
} |
printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
printf "%s\n", |
centred( |
__x( "Top {count} posters by number of articles", count => $topposters ), 76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach |
my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
{ |
225,10 → 635,14 |
{ |
$count = $topposters; |
} |
printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
printf "%s\n", |
centred( |
__x( "Top {count} posters by article size in KiB", count => $topposters ), |
76 ); |
print "=" x 76, "\n"; |
$i = 0; |
foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
my $i = 0; |
foreach |
my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
{ |
my $name = substr( $poster, 0, 62 ); |
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
252,9 → 666,15 |
$count = $topposters; |
} |
printf "%s\n", |
¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
centred( |
__x( |
"Top {count} responders by original text (> 5 posts)", |
count => $topposters |
), |
76 |
); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $poster ( |
sort { $data{$b}{percent} <=> $data{$a}{percent} } |
keys %data |
284,9 → 704,15 |
$count = $topposters; |
} |
printf "%s\n", |
¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
centred( |
__x( |
"Bottom {count} responders by original text (> 5 posts)", |
count => $topposters |
), |
76 |
); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $poster ( |
sort { $data{$a}{percent} <=> $data{$b}{percent} } |
keys %data |
315,17 → 741,19 |
{ |
$count = $topthreads; |
} |
printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
printf "%s\n", |
centred( __x( "Top {count} threads by no. of articles", count => $topthreads ), |
76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $thread ( |
sort { $threads{$b}{count} <=> $threads{$a}{count} } |
sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} } |
keys %threads |
) |
{ |
my $name = substr( $thread, 0, 65 ); |
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
$threads{$thread}{count}; |
$threads{$thread}{'count'}; |
last if ( ++$i == $count ); |
} |
print "\n", "=" x 76, "\n"; |
344,17 → 772,19 |
{ |
$count = $topthreads; |
} |
printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
printf "%s\n", |
centred( __x( "Top {count} threads by size in KiB", count => $topthreads ), |
76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $thread ( |
sort { $threads{$b}{size} <=> $threads{$a}{size} } |
sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} } |
keys %threads |
) |
{ |
my $name = substr( $thread, 0, 65 ); |
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
$threads{$thread}{size} / 1024; #/ |
$threads{$thread}{'size'} / 1024; #/ |
last if ( ++$i == $count ); |
} |
print "\n", "=" x 76, "\n"; |
374,9 → 804,10 |
{ |
$count = $topcrossposts; |
} |
printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
printf "%s\n", |
centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach |
my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
{ |
400,9 → 831,11 |
{ |
$count = $topagents; |
} |
printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
printf "%s\n", |
centred( __x( "Top {count} user agents by poster", count => $topagents ), |
76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
{ |
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
425,9 → 858,12 |
{ |
$count = $topagents; |
} |
printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
printf "%s\n", |
centred( |
__x( "Top {count} user agents by number of posts", count => $topagents ), |
76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $agent ( |
sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
keys %distinct_agent |
454,9 → 890,9 |
{ |
$count = $toptz; |
} |
printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 ); |
print "=" x 76, "\n"; |
$i = 0; |
my $i = 0; |
foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
{ |
printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
464,307 → 900,10 |
} |
print "\n", "=" x 76, "\n"; |
} |
|
################################ SUBROUTINES ################################ |
|
######################################## |
## Get current article's header and body |
######################################## |
sub get_article |
{ |
%headers = (); # dump old headers |
my $filename = shift; # get the name of the file |
|
## get stats about the file itself |
$filesize = -s $filename; # get total size of file |
$totsize += $filesize; # bump total sizes of all files |
|
my $mtime = ( stat $filename )[9]; |
if ( $mtime < $earliest ) |
{ |
$earliest = $mtime; |
} |
elsif ( $mtime > $latest ) |
{ |
$latest = $mtime; |
} |
|
## now read the file |
open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
while (<$FILE>) |
{ |
$totheader += length($_); # bump total header size |
last if (/^\s*$/); # end of header? |
if (/^([^:\s]*):\s*(.*)/) |
{ |
my ( $key, $val ) = ( $1, $2 ); |
$headers{$key} = decode( 'MIME-Header', $val ); |
$lcheader{ clean( lc($key) ) } = clean($val); |
} |
} |
@body = <$FILE>; # slurp up body |
close($FILE); |
} # get_article |
## helper subs |
|
#################################### |
## Get data from the current article |
#################################### |
sub get_data |
{ |
#### First, analyse header fields #### |
|
## Set up this poster if not defined, get counts, sizes |
my $poster = $headers{From}; # get the poster's name |
|
# Convert old to new format |
$poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
|
# Collapse whitespace |
$poster =~ s/\s+/ /; |
|
# Remove outer quotes |
$poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
|
if ( !defined( $data{$poster} ) ) |
{ # seen this one before? |
$data{$poster}{agent} = 'Unknown'; # comes after For: field |
$data{$poster}{orig} = 0; |
$data{$poster}{quoted} = 0; |
} |
$data{$poster}{count}++; # bump count for this poster |
$data{$poster}{size} += $filesize; # total size of file |
|
## The User-Agent and/or X-Newsreader fields |
## for User-Agent by poster |
if ( defined $lcheader{"user-agent"} ) |
{ |
$data{$poster}{agent} = $lcheader{"user-agent"}; |
} |
if ( defined $lcheader{"x-newsreader"} ) |
{ |
$data{$poster}{agent} = $lcheader{"x-newsreader"}; |
} |
|
## The User Agent for User-Agent by number of posts |
my $UA = "unknown"; |
foreach my $keys ( keys %lcheader ) |
{ |
if ( defined $lcheader{'user-agent'} ) |
{ |
$UA = $lcheader{'user-agent'}; |
} |
elsif ( defined $lcheader{"x-newsreader"} ) |
{ |
$UA = $lcheader{"x-newsreader"}; |
} |
elsif ( defined $lcheader{'x-mailer'} ) |
{ |
$UA = $lcheader{'x-mailer'}; |
} |
elsif ( |
( defined $lcheader{'organization'} ) |
&& ( $lcheader{'organization'} =~ |
/groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
) |
{ |
$UA = $lcheader{'organization'}; |
} |
elsif ( $lcheader{'message-id'} =~ /pine/i ) |
{ |
$UA = "Pine"; |
} ## Hopefully found UA, else set to unknown |
} |
|
$UA = clean($UA); |
$UA = get_agent($UA); |
|
sub get_agent |
{ |
my $raw = shift; |
my $agent = $raw; |
|
## strip http |
if ( $raw =~ /.*http.*/ ) |
{ |
$raw =~ s!posted via!!i; |
$raw =~ s!http://!!g; |
$raw =~ s!/!!g; |
$raw =~ s! !!g; |
} |
|
## Fix Outlook from Mac |
if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
|
## Pick out the popular agents |
if ( |
$raw =~ /(outlook express)/i |
|| $raw =~ /(windows mail)/i |
|| $raw =~ /(microplanet gravity)/i |
|| $raw =~ /(news rover)/i |
|| $raw =~ /(forte agent)/i |
|| $raw =~ /(forte free agent)/i |
) |
{ |
$agent = $1; |
} |
elsif ( |
$raw =~ /^( |
pan |
|sylpheed |
|slrn |
|mozilla |
|knode |
|tin |
|hamster |
|xrn |
|xnews |
|aol |
|gnus |
|krn |
|macsoup |
|messenger |
|openxp |
|pine |
|thoth |
|turnpike |
|winvn |
|vsoup |
|google |
|supernews |
|nn |
|rn |
|007 |
|webtv |
|compuserve |
)/ix |
) |
{ |
$agent = $1; |
} |
else |
{ |
## Clean up unknown agents |
if ( $raw =~ m!^(.*?)/! ) |
{ |
$agent = $1; |
} |
elsif ( $raw =~ /^(\w*)\d.*/ ) |
{ |
$agent = $1; |
} |
} |
|
$distinct_agent{$agent}++; |
return $agent; |
} |
|
## Get all cross-posted newsgroups |
for ( split /,/, $headers{"Newsgroups"} ) |
{ |
$crossposts{$_}++; # bump count for each |
} |
|
## Get threads |
my $thread = $headers{"Subject"}; |
$thread =~ s/^re: //i; # Remove Re: or re: at start |
$thread =~ s/\s+/ /g; # collapse whitespace |
$threads{$thread}{count} += 1; # bump count of this subject |
$threads{$thread}{size} += $filesize; # bump bytes for this thread |
|
## Is this an original post or a reply? |
if ( defined $headers{"References"} ) |
{ |
$replies++; |
} |
else |
{ |
$origposts++; |
} |
|
## Get the time zone |
$_ = $headers{"Date"}; |
my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
{ |
$tz = "UTC"; |
} |
$tz{$tz}++; |
|
#### Now analyse the body text #### |
my $insig = 0; |
for (@body) |
{ |
$totbody += length($_); # bump total body size |
next if (/^$>/); # don't count blank lines in body |
if ( $insig == 1 ) |
{ |
$totsig += length($_); # bump total sig size |
|
## Bill Unruh uses ] quotes, and another poster uses :: |
} |
elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
{ # are we in a quote line? |
$data{$poster}{quoted} += length($_); # bump count of quoted chrs |
$totquoted += length($_); |
} |
elsif (/-- /) |
{ |
$insig = 1; |
} |
else |
{ |
|
## We must be processing an original line |
$data{$poster}{orig} += length($_); # bump count of original chrs |
$totorig += length($_); |
} |
} # end for (@body) |
|
} # get_data |
|
######################################### |
## Count the User-Agents used, collapsing |
## different versions into one per agent. |
######################################### |
sub count_agents |
{ |
POSTER: |
foreach my $poster ( keys %data ) |
{ |
foreach my $agent_name ( keys %distinct_agent ) |
{ # check against known ones |
if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
{ |
$agents{$agent_name}++; |
next POSTER; |
} |
} |
$agents{ $data{$poster}{agent} }++; |
} |
} # count_agents |
|
############################################# |
## Set orig/total percentages for all posters |
############################################# |
sub fix_percent |
{ |
foreach my $poster ( keys %data ) |
{ |
my $percent = 100; |
if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
{ |
$percent = |
$data{$poster}{orig} * 100 / |
( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
} |
elsif ( $data{$poster}{orig} == 0 ) |
{ |
$percent = 0; |
} |
$data{$poster}{percent} = $percent; |
} |
} |
|
############################### |
## Right pad a string with '.'s |
############################### |
774,7 → 913,7 |
my ( $text, $pad_len, $pad_chr ) = @_; |
|
## DEBUG |
#printf "|%s| = %d\n", $text, length($text); |
printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1; |
|
if ( length($text) > $pad_len ) |
{ |
798,10 → 937,36 |
########################### |
## Put commas into a number |
########################### |
# Get some of locale's numeric formatting parameters |
my ($thousands_sep, $grouping) = |
@{localeconv()}{'thousands_sep', 'grouping'}; |
# Apply defaults if values are missing |
$thousands_sep = ',' unless $thousands_sep; |
# grouping and mon_grouping are packed lists |
# of small integers (characters) telling the |
# grouping (thousand_seps and mon_thousand_seps |
# being the group dividers) of numbers and |
# monetary quantities. The integers' meanings: |
# 255 means no more grouping, 0 means repeat |
# the previous grouping, 1-254 means use that |
# as the current grouping. Grouping goes from |
# right to left (low to high digits). In the |
# below we cheat slightly by never using anything |
# else than the first grouping (whatever that is). |
my @grouping; |
if ($grouping) { |
@grouping = unpack("C*", $grouping); |
} else { |
@grouping = (3); |
} |
|
sub commify |
{ |
local $_ = shift; |
1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
#1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
$_ = int; # Chop non-integer part |
1 while |
s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/; |
return $_; |
} |
|
819,61 → 984,17 |
|
sub usage |
{ |
print "usage: newstat.pl newsgroupname\n"; |
print __"usage: newsstat.pl NEWS.GROUP\n"; |
exit 1; |
} |
|
################################## |
## Write data structures to a file |
################################## |
sub write_data |
sub dmsg |
{ |
open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
or die "Can't create XDATA: $!\n"; |
print $OUTF "Data collected from $newsgroup_name\n\n"; |
print $OUTF |
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
foreach my $name ( keys %data ) |
{ |
print $OUTF |
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
print STDERR @_, "\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Thread subjects\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
|
sub dmsg2 |
{ |
print $OUTF |
"$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
my ( $level, @msg ) = @_; |
print STDERR @msg, "\n" if $level >= DEBUG; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Cross-posts\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %crossposts ) |
{ |
print $OUTF "$name : $crossposts{$name}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "User agents\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %agents ) |
{ |
print $OUTF "$name : $agents{$name}\n"; |
} |
print $OUTF |
"============================================================================\n"; |
print $OUTF "Time zones\n"; |
print $OUTF |
"----------------------------------------------------------------------------\n"; |
foreach my $name ( sort keys %tz ) |
{ |
print $OUTF "$name : $tz{$name}\n"; |
} |
close $OUTF; |
} # write_data |