1,80 → 1,35 |
#!/usr/bin/env perl |
use strict; |
use warnings; |
use diagnostics; |
use utf8; |
use encoding 'utf-8'; |
use Encode; |
|
########################### |
# newsstat.pl version 0.4.2 |
## Print out all text to STDOUT UTF-8 encoded |
binmode STDOUT, ':encoding(UTF-8)'; |
|
############################################################################ |
# Collect statistics about a newsgroup (specified by first argument) in |
# the local news spool. Check all articles in the last 30-day period. |
# Rank posters by number of posts and by volume of posts, report on top and |
# bottom 20 posters. Show their name, number of posts, size of posts, |
# percentage of quoted lines. Rank user-agents used, by poster rather than |
# by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
# |
# (Numbers and paths can be configured below. -- PE) |
############################################################################ |
############################ |
## newsstat.pl version 0.4.3 |
|
############################################################################ |
# RECENT CHANGES # |
# 2011-10-03 PE - Use more compatible shebang |
# - Fixed some Perl::Critic-ized code |
# - Fixed wrong indent for non-ASCII names |
# - Formatted source code |
# 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
# - Use warnings, utf8 (just in case) |
# - Documentation update |
# N/A NN - Take newsgroup name as argument |
# 2004-06-19 NN - newsgroup name is $ARGV[0] |
# - Allow command line flags for subtracting |
# output if not pertinent for a group |
# 2002-11-09 NN - Put Garry's writedata() function back in. |
# - added "rn" to my list of UA's |
# - Started using %distinct_agent for both User agent |
# sections |
# - named it newsstat.pl version 0.3 |
# 2002-11-06 NN - Fixed the earliest/latest file problem by using |
# mtime rather than ctime, and simplifying the logic |
# 2002-11-05 NN - moved user configurations to the top |
# - fixed the cross-posting section |
# - introduced the $newsgroup_name variable which |
# later becomes $news$group |
# - changed $name to $agent_name in countagents() |
# |
# Contributors |
# ------------- |
# NN Nomen nominandum (name to be determined later) |
# PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de> |
########################################################################### |
## Collect statistics about a newsgroup (specified by first argument) |
## in the local news spool. Check all articles in the last 30-day period. |
## Rank posters by number of posts and by volume of posts, report on top |
## and bottom 20 posters. Show their name, number of posts, size of posts, |
## percentage of quoted lines. Rank user-agents used, by poster rather |
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
## |
## Numbers and paths can be configured below. See ChangeLog and TODO |
## for more. -- PE |
########################################################################### |
|
########### TODO ############# |
# Commas in bottom section of report |
# Show date the figures were compiled |
# No. of HTML articles (Content-Type: text/html) |
# No. of quoted sigs (/>\s*-- /) |
# Per cent of top-posted articles |
# Top 10 cross-posters |
# Top 20 news posting hosts (from Path) |
# Count of certain subject words: newbie, kde, burner, sendmail, etc. |
# Count *all* User Agents that each poster uses |
# What do we do about Bill Unruh's ] quote style? |
# Change the way dates/times are checked |
# include % share in posters by no. of arts |
# include % share in posters by size |
# Total, orig & quoted lines by user agent with per cent |
# Take more arguments |
####################################################### |
|
###################### USER CONFIGURATIONS ############################ |
|
# The name of the group to do stats for |
## The name of the group to do stats for |
my $newsgroup_name = $ARGV[0]; |
$newsgroup_name or &usage; |
|
# Check for removal flags |
## Check for removal flags |
my $ix; |
my $j; |
my %skipSec; |
97,25 → 52,25 |
$skipSec{$_} = 1; |
} |
|
# Leafnode users will want /var/spool/news for this variable. |
## Leafnode users will want /var/spool/news for this variable. |
my $news = "/var/spool/news/"; |
|
# How many days are we doing statistics for? |
## How many days are we doing statistics for? |
my $numdays = 30; |
|
# no. of agents we list |
## Number of agents we list |
my $topagents = 10; |
|
# no. of threads we want to know about |
## Number of threads we want to know about |
my $topthreads = 20; |
|
# no. of top or bottom posters to show |
## Number of top or bottom posters to show |
my $topposters = 20; |
|
# no. of cross-posted threads to show |
## Number of cross-posted threads to show |
my $topcrossposts = 10; |
|
# no. of time zones to show |
## Number of time zones to show |
my $toptz = 10; |
|
###################### DATA STRUCTURES ###################### |
145,30 → 100,32 |
my $replies = 0; # total no. of replies |
my $i; # general purpose |
my %distinct_agent; |
my %agents = # used to hold counts of User Agents used |
( |
"KNode" => 0, |
"Pan" => 0, |
"Mozilla" => 0, |
"Sylpheed" => 0, |
"Gnus" => 0, |
|
## Used to hold counts of User Agents used |
my %agents = ( |
"Compuserver" => 0, |
"Foorum" => 0, |
"Forte Agent" => 0, |
"Forte Free Agent" => 0, |
"Gnus" => 0, |
"KNode" => 0, |
"MacSOUP" => 0, |
"MT-NewsWatcher" => 0, |
"MicroPlanet Gravity" => 0, |
"Microsoft Outlook Express" => 0, |
"Xnews" => 0, |
"Microsoft Windows Mail" => 0, |
"Mozilla" => 0, |
"News Rover" => 0, |
"NN" => 0, |
"Pan" => 0, |
"rn" => 0, |
"slrn" => 0, |
"Sylpheed" => 0, |
"tin" => 0, |
"rn" => 0, |
"NN" => 0, |
"MacSOUP" => 0, |
"Foorum" => 0, |
"MT-NewsWatcher" => 0, |
"News Rover" => 0, |
"VSoup" => 0, |
"WebTV" => 0, |
"Compuserver" => 0, |
"VSoup" => 0 |
); |
"Xnews" => 0 |
); |
|
######################## MAIN CODE ######################## |
$! = 1; |
178,22 → 135,22 |
while ( defined( $filename = readdir(DIR) ) ) |
{ |
%lcheader = (); |
next unless -f $filename; # only want real files |
next if ( $filename eq ".overview" ); # real articles only |
next if ( -M $filename > $numdays ); # only want articles <= a certain age |
next unless -f $filename; # only want real files |
next if ( $filename eq ".overview" ); # real articles only |
next if ( -M $filename > $numdays ); # only want articles <= a certain age |
$earliest = ( stat $filename )[9] unless defined($earliest); |
$latest = ( stat $filename )[9] unless defined($latest); |
&getarticle($filename); # read in the article |
&getdata; # grab the data from the article |
$totalposts++; # bump count of articles considered |
&get_article($filename); # read in the article |
&get_data; # grab the data from the article |
$totalposts++; # bump count of articles considered |
} |
closedir(DIR); # finished with the directory |
closedir(DIR); # finished with the directory |
|
# post-processing |
&countagents; # count agents, collapsing versions |
&fixpercent; # check percentages orig/total for posters |
## Post-processing |
&count_agents; # count agents, collapsing versions |
&fix_percent; # check percentages orig/total for posters |
|
&writedata; |
&write_data; |
|
#################### DISPLAY RESULTS ##################### |
print "=" x 76, "\n"; |
208,15 → 165,15 |
printf "Latest article: %s\n", scalar localtime($latest); |
printf "Original articles: %s, replies: %s\n", commify($origposts), |
commify($replies); |
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
commify( int( $totsize / $totalposts ) ); |
my $count = keys %data; |
printf "Total headers: %s KB bodies: %s KB\n", |
printf "Total headers: %s KiB bodies: %s KiB\n", |
commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
( $totorig * 100 ) / ( $totorig + $totquoted ), |
commify( int( $totsig / 1024 ) ); |
225,12 → 182,12 |
$count = keys %threads; |
printf "Total number of threads: %s, average %s bytes per thread\n", |
commify($count), commify( int( $totsize / $count ) ); #/ |
printf "Total number of User-Agents: %d\n", scalar keys %agents; |
printf "Total number of user agents: %d\n", scalar keys %agents; |
print "\n", "=" x 76, "\n"; |
|
############################### |
# show posters by article count Sec 1; |
############################### |
######################################## |
## Show posters by article count Sec 1; |
######################################## |
unless ( $skipSec{1} ) |
{ |
if ( keys %data < $topposters ) |
255,9 → 212,9 |
print "\n", "=" x 76, "\n"; |
} |
|
################################ |
# show posters by size in Kbytes Sec 2; |
################################ |
###################################### |
## Show posters by size in KiB Sec 2; |
###################################### |
unless ( $skipSec{2} ) |
{ |
if ( keys %data < $topposters ) |
268,7 → 225,7 |
{ |
$count = $topposters; |
} |
printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
print "=" x 76, "\n"; |
$i = 0; |
foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
281,9 → 238,9 |
print "\n", "=" x 76, "\n"; |
} |
|
#################################### |
# show top posters for original text |
#################################### |
##################################### |
## Show top posters for original text |
##################################### |
unless ( $skipSec{3} ) |
{ |
if ( keys %data < $topposters ) |
313,9 → 270,9 |
print "\n", "=" x 76, "\n"; |
} |
|
####################################### |
# show bottom posters for original text |
####################################### |
######################################## |
## Show bottom posters for original text |
######################################## |
unless ( $skipSec{4} ) |
{ |
if ( keys %data < $topposters ) |
345,9 → 302,9 |
print "\n", "=" x 76, "\n"; |
} |
|
#################################### |
# show threads by number of articles |
#################################### |
##################################### |
## Show threads by number of articles |
##################################### |
unless ( $skipSec{5} ) |
{ |
if ( keys %threads < $topthreads ) |
373,9 → 330,10 |
} |
print "\n", "=" x 76, "\n"; |
} |
################################ |
# show threads by size in Kbytes |
################################ |
|
############################## |
## Show threads by size in KiB |
############################## |
unless ( $skipSec{6} ) |
{ |
if ( keys %threads < $topthreads ) |
386,7 → 344,7 |
{ |
$count = $topthreads; |
} |
printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
print "=" x 76, "\n"; |
$i = 0; |
foreach my $thread ( |
402,9 → 360,9 |
print "\n", "=" x 76, "\n"; |
} |
|
################################# |
# show top 10 cross-posted groups |
################################# |
################################## |
## Show top 10 cross-posted groups |
################################## |
unless ( $skipSec{7} ) |
{ |
delete $crossposts{"$newsgroup_name"}; # don't include ours |
428,9 → 386,10 |
} |
print "\n", "=" x 76, "\n"; |
} |
####################### |
#show agents and counts |
####################### |
|
######################### |
## Show agents and counts |
######################### |
unless ( $skipSec{8} ) |
{ |
if ( keys %agents < $topagents ) |
454,7 → 413,7 |
} |
|
####################### |
#show distinct agents |
## Show distinct agents |
####################### |
unless ( $skipSec{9} ) |
{ |
482,9 → 441,9 |
print "\n", "=" x 76, "\n"; |
} |
|
########################## |
#show timezones and counts |
########################## |
############################ |
## Show timezones and counts |
############################ |
unless ( $skipSec{10} ) |
{ |
if ( keys %tz < $toptz ) |
508,15 → 467,15 |
|
################################ SUBROUTINES ################################ |
|
####################################### |
# get current article's header and body |
####################################### |
sub getarticle |
######################################## |
## Get current article's header and body |
######################################## |
sub get_article |
{ |
%headers = (); # dump old headers |
my $filename = shift; # get the name of the file |
|
# get stats about the file itself |
## get stats about the file itself |
$filesize = -s $filename; # get total size of file |
$totsize += $filesize; # bump total sizes of all files |
|
530,13 → 489,13 |
$latest = $mtime; |
} |
|
# now read the file |
open( my $FILE, $filename ) or die "Can't open $filename: $!\n"; |
## now read the file |
open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
while (<$FILE>) |
{ |
$totheader += length($_); # bump total header size |
last if (/^\s*$/); # end of header? |
if (/^([^:\s]*):\s+(.*)/) |
if (/^([^:\s]*):\s*(.*)/) |
{ |
my ( $key, $val ) = ( $1, $2 ); |
$headers{$key} = decode( 'MIME-Header', $val ); |
545,19 → 504,29 |
} |
@body = <$FILE>; # slurp up body |
close($FILE); |
} # getarticle |
} # get_article |
|
################################### |
# get data from the current article |
################################### |
sub getdata |
#################################### |
## Get data from the current article |
#################################### |
sub get_data |
{ |
#### First, analyse header fields #### |
|
# Set up this poster if not defined, get counts, sizes |
## Set up this poster if not defined, get counts, sizes |
my $poster = $headers{From}; # get the poster's name |
|
# Convert old to new format |
$poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
|
# Collapse whitespace |
$poster =~ s/\s+/ /; |
|
# Remove outer quotes |
$poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
|
if ( !defined( $data{$poster} ) ) |
{ # seen this one before? |
{ # seen this one before? |
$data{$poster}{agent} = 'Unknown'; # comes after For: field |
$data{$poster}{orig} = 0; |
$data{$poster}{quoted} = 0; |
565,8 → 534,8 |
$data{$poster}{count}++; # bump count for this poster |
$data{$poster}{size} += $filesize; # total size of file |
|
# The User-Agent and/or X-Newsreader fields |
# for User-Agent by poster |
## The User-Agent and/or X-Newsreader fields |
## for User-Agent by poster |
if ( defined $lcheader{"user-agent"} ) |
{ |
$data{$poster}{agent} = $lcheader{"user-agent"}; |
576,7 → 545,7 |
$data{$poster}{agent} = $lcheader{"x-newsreader"}; |
} |
|
# The User Agent for User-Agent by number of posts |
## The User Agent for User-Agent by number of posts |
my $UA = "unknown"; |
foreach my $keys ( keys %lcheader ) |
{ |
627,11 → 596,14 |
if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
|
## Pick out the popular agents |
if ( $raw =~ /(outlook express)/i |
|| $raw =~ /(microplanet gravity)/i |
|| $raw =~ /(news rover)/i |
|| $raw =~ /(forte agent)/i |
|| $raw =~ /(forte free agent)/i ) |
if ( |
$raw =~ /(outlook express)/i |
|| $raw =~ /(windows mail)/i |
|| $raw =~ /(microplanet gravity)/i |
|| $raw =~ /(news rover)/i |
|| $raw =~ /(forte agent)/i |
|| $raw =~ /(forte free agent)/i |
) |
{ |
$agent = $1; |
} |
686,13 → 658,13 |
return $agent; |
} |
|
# Get all cross-posted newsgroups |
## Get all cross-posted newsgroups |
for ( split /,/, $headers{"Newsgroups"} ) |
{ |
$crossposts{$_}++; # bump count for each |
} |
|
# Get threads |
## Get threads |
my $thread = $headers{"Subject"}; |
$thread =~ s/^re: //i; # Remove Re: or re: at start |
$thread =~ s/\s+/ /g; # collapse whitespace |
699,7 → 671,7 |
$threads{$thread}{count} += 1; # bump count of this subject |
$threads{$thread}{size} += $filesize; # bump bytes for this thread |
|
# Is this an original post or a reply? |
## Is this an original post or a reply? |
if ( defined $headers{"References"} ) |
{ |
$replies++; |
709,9 → 681,9 |
$origposts++; |
} |
|
# Get the time zone |
## Get the time zone |
$_ = $headers{"Date"}; |
my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
{ |
$tz = "UTC"; |
728,7 → 700,7 |
{ |
$totsig += length($_); # bump total sig size |
|
# Bill Unruh uses ] quotes, and another poster uses :: |
## Bill Unruh uses ] quotes, and another poster uses :: |
} |
elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
{ # are we in a quote line? |
742,19 → 714,19 |
else |
{ |
|
# we must be processing an original line |
## We must be processing an original line |
$data{$poster}{orig} += length($_); # bump count of original chrs |
$totorig += length($_); |
} |
} # end for (@body) |
|
} # getdata |
} # get_data |
|
######################################## |
# Count the User-Agents used, collapsing |
# different versions into one per agent. |
######################################## |
sub countagents |
######################################### |
## Count the User-Agents used, collapsing |
## different versions into one per agent. |
######################################### |
sub count_agents |
{ |
POSTER: |
foreach my $poster ( keys %data ) |
769,12 → 741,12 |
} |
$agents{ $data{$poster}{agent} }++; |
} |
} # countagents |
} # count_agents |
|
############################################ |
# set orig/total percentages for all posters |
############################################ |
sub fixpercent |
############################################# |
## Set orig/total percentages for all posters |
############################################# |
sub fix_percent |
{ |
foreach my $poster ( keys %data ) |
{ |
793,16 → 765,16 |
} |
} |
|
############################## |
# right pad a string with '.'s |
############################## |
############################### |
## Right pad a string with '.'s |
############################### |
sub rpad |
{ |
# get text to pad, length to pad, pad chr |
## Get text to pad, length to pad, pad chr |
my ( $text, $pad_len, $pad_chr ) = @_; |
|
### DEBUG |
# printf "|%s| = %d\n", $text, length($text); |
## DEBUG |
#printf "|%s| = %d\n", $text, length($text); |
|
if ( length($text) > $pad_len ) |
{ |
812,9 → 784,9 |
return $padded; |
} |
|
################# |
# centre a string |
################# |
################## |
## Centre a string |
################## |
sub centred |
{ |
my ( $text, $width ) = @_; # text to centre, size of field to centre in |
823,25 → 795,24 |
return $centred; |
} |
|
########################## |
# put commas into a number |
########################## |
########################### |
## Put commas into a number |
########################### |
sub commify
{
  ## Insert "," thousands separators into the integer part of a number.
  ##
  ## Takes one argument, the number to format (an optional leading sign
  ## is allowed), and returns the formatted string.  Digits after a
  ## decimal point are left untouched because the pattern is anchored
  ## at the start of the string.
  ##
  ## A lexical copy is used so that the caller's $_ is never modified.
  my $number = shift;

  ## Each pass splits the last three digits off the leading digit run;
  ## repeat until no run of four or more digits remains.
  1 while $number =~ s/^([-+]?\d+)(\d{3})/$1,$2/;

  return $number;
}
|
######################### |
# clean |
######################### |
################################################################ |
## Returns a string with leading and trailing whitespace removed |
################################################################ |
sub clean
{
  ## Return a copy of the argument with leading and trailing whitespace
  ## (including a trailing newline) removed.  The caller's value is not
  ## modified; whitespace inside the string is preserved.
  my $text = shift;

  $text =~ s/\A\s+//;    # strip leading whitespace
  $text =~ s/\s+\z//;    # strip trailing whitespace

  return $text;
}
848,18 → 819,18 |
|
sub usage
{
  ## Print a one-line usage summary and terminate the script with exit
  ## status 1.  Called when no newsgroup name was given on the command
  ## line.  Does not return.
  print "usage: newsstat.pl newsgroupname\n";
  exit 1;
}
|
################################### |
# Write data structures to a file # |
################################### |
sub writedata |
################################## |
## Write data structures to a file |
################################## |
sub write_data |
{ |
open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
print $OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
or die "Can't create XDATA: $!\n"; |
print $OUTF "Data collected from $newsgroup_name\n\n"; |
print $OUTF |
"Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
foreach my $name ( keys %data ) |
886,7 → 857,7 |
{ |
print $OUTF "$name : $crossposts{$name}\n"; |
} |
print $OUTF print $OUTF |
print $OUTF |
"============================================================================\n"; |
print $OUTF "User agents\n"; |
print $OUTF |
905,4 → 876,4 |
print $OUTF "$name : $tz{$name}\n"; |
} |
close $OUTF; |
} # writedata |
} # write_data |