WebSVN - LCARS - Diff - Rev 49 and 198 - /branches/live/tools/network/news/newsstat/newsstat.pl


#!/usr/bin/env perl
use strict;
use warnings;
require 5.004;

#use diagnostics;
use utf8;

## NOTE:
## Enable and remove binmode when utf8::all has actually become lexically scoped
# use utf8:all;

use constant DEBUG => 0;

## newsstat.pl
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
## Based on work by Garry Knight et al.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

## Print out all text to STDOUT UTF-8 encoded
binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

## L10n
use locale ':not_characters';

# setlocale( LC_MESSAGES, '' );
require Number::Format;

## i18n
## FIXME: Automatically include resolved '.' in @INC
# print join "\n", @INC;

use Locale::TextDomain ('de.pointedears.newsstat');
use POSIX              ('locale_h');
use Locale::Messages qw (bind_textdomain_filter
  bind_textdomain_codeset
  turn_utf_8_on);

bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';

require Mail::Message;
require DateTime;
require DateTime::Format::Mail;

# See comments in previous example
## Number formatting conventions of the current locale.  Fields that the
## locale does not provide come back undefined or empty, hence the
## fallbacks below.
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
  'decimal_point' };

# Apply defaults if values are missing
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
$thousands_sep = ' ' unless $thousands_sep;
$decimal_point = '.' unless defined $decimal_point and length $decimal_point;

# grouping and mon_grouping are packed lists
# of small integers (characters) telling the
# grouping (thousand_seps and mon_thousand_seps
# being the group dividers) of numbers and
# monetary quantities.  The integers' meanings:
# 255 means no more grouping, 0 means repeat
# the previous grouping, 1-254 means use that
# as the current grouping.  Grouping goes from
# right to left (low to high digits).  In the
# below we cheat slightly by never using anything
# else than the first grouping (whatever that is).
my @grouping = $grouping ? unpack( "C*", $grouping ) : (3);

## FIXME: Why don't the defaults work already?
## Direct method call instead of the ambiguous indirect-object form
## "new Number::Format(...)".
my $formatter = Number::Format->new(
  -decimal_point => $decimal_point,
  -thousands_sep => $thousands_sep,
  -kibi_suffix   => ' KiB',
  -mebi_suffix   => ' MiB',
  -gibi_suffix   => ' GiB',

  # -grouping      => $grouping[0]
);

###################### USER CONFIGURATIONS ############################

## The name of the group to do stats for
my $newsgroup_name = $ARGV[0];
$newsgroup_name // usage();

## Check for removal flags.  Report sections are switched off with either
## "-x 3,4" (list in the following argument) or "-x3,4" (list attached).
## Several -x options accumulate.
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ )
{
  $j = $ix + 1;
  if ( $ARGV[$ix] eq "-x" )
  {
    ## A trailing "-x" without a list simply skips nothing
    push @skiplist, split( ",", $ARGV[$j] // '' );
  }
  elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ )
  {
    ## Anchored so that only arguments that start with -x are options
    push @skiplist, split( ",", $1 );
  }
}
foreach (@skiplist)
{
  $skipSec{$_} = 1;
}

## Root directory of the news spool.
## Leafnode users will want /var/spool/news for this variable.
## The value is concatenated directly with the group path when the script
## changes into the group directory, so it has to end with a slash.
my $news = "/var/spool/news/";

## Number of top or bottom posters to show
my $topposters = 20;

## Number of threads we want to know about
my $topthreads = 20;

## Number of cross-posted threads to show
my $topcrossposts = 10;

## Number of agents we list
my $topagents = 10;

## Number of time zones to show
my $toptz = 10;

###################### DATA STRUCTURES ######################
## Newsgroup name with every dot replaced by a slash; appended to $news
## to form the directory that is scanned.
my $group = $newsgroup_name;
$group =~ s!\.!/!g;
my %data;    # poster => { agent, count, size, orig, quoted, percent }
my $totsize = 0;    # holds total sizes of all files
my %crossposts;     # group => number of articles posted to it
my %threads;        # subject => { count, size }
my $replies   = 0;  # total no. of replies
my $origposts = 0;  # total no. of original posts
my %tz;             # timezones by count
my $earliest;       # earliest article we have found
my $latest;         # latest article we have found
my $totheader = 0;  # total size of header material
my $totbody   = 0;  # total size of body material
my $totsig    = 0;  # total size of sig material
my $totquoted = 0;  # total size of quoted material
my $totorig   = 0;  # total size of original material
my $totalposts;     # total no. of posts considered; stays undefined
                    # until the first article in range is counted
my %distinct_agent; # agent name => no. of articles, filled by get_agent()

## Used to hold counts of User Agents used.
## The names below are pre-seeded with zero; count_agents() increments
## entries and adds further names as it finds them.
my %agents = (
  "Compuserver"               => 0,
  "Foorum"                    => 0,
  "Forte Agent"               => 0,
  "Forte Free Agent"          => 0,
  "Gnus"                      => 0,
  "KNode"                     => 0,
  "MacSOUP"                   => 0,
  "MT-NewsWatcher"            => 0,
  "MicroPlanet Gravity"       => 0,
  "Microsoft Outlook Express" => 0,
  "Microsoft Windows Mail"    => 0,
  "Mozilla"                   => 0,
  "News Rover"                => 0,
  "NN"                        => 0,
  "Pan"                       => 0,
  "rn"                        => 0,
  "slrn"                      => 0,
  "Sylpheed"                  => 0,
  "tin"                       => 0,
  "VSoup"                     => 0,
  "WebTV"                     => 0,
  "Xnews"                     => 0,
);

## Lenient parser for the Date headers of the articles
my $datetime_parser = DateTime::Format::Mail->new( loose => 1 );

## The report covers the previous calendar month in UTC:
## $start is the epoch of its first day (inclusive), $end the epoch of the
## first day of the current month (exclusive).
my $today      = DateTime->today( time_zone => 'UTC' );
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
my $start      = int $prev_month->epoch();
my $end        = int $today->clone()->set_day(1)->epoch();

## Number of days in the reported month
my $numdays = int DateTime->last_day_of_month(
  year      => $prev_month->year(),
  month     => $prev_month->month(),
  time_zone => $prev_month->time_zone(),
)->day();

dmsg( $start, " to ", $end ) if DEBUG;

## Change into the spool directory of the group so that the plain file
## names returned by readdir() can be tested and opened directly.
chdir("$news$group")
  or die __x(
  "Can't cd to {newsgroup}: {error}\n",
  newsgroup => "$news$group",
  error     => $!
  );

## Lexical directory handle instead of the global bareword DIR
opendir( my $spool_dir, "." )
  or die __x(
  "Can't open {newsgroup}: {error}\n",
  newsgroup => "$news$group",
  error     => $!
  );

while ( defined( my $filename = readdir($spool_dir) ) )
{
  next unless -f $filename;    # only want real files
  next if ( $filename eq ".overview" );    # real articles only

  get_article($filename);                  # read in the article
}
closedir($spool_dir);                      # finished with the directory

dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;

## Post-processing
count_agents();                            # count agents, collapsing versions
fix_percent();

write_data();
display_results();

########################################
## Get current article's header and body
########################################
sub get_article
{
  ## Reads one article file and adds it to the global statistics.
  ##
  ## Parameter: name of the article file, relative to the current directory.
  ## Articles whose timestamp lies outside [$start, $end) are ignored.
  ## Dies if the file cannot be opened.
  my $filename = shift;

  open( my $FILE, '<', $filename )
    or
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
  my $msg = Mail::Message->read($FILE);
  if ( not defined $msg )
  {
    ## Nothing that could be parsed as a message
    close($FILE);
    return;
  }
  my $timestamp = $msg->timestamp();
  my $date      = $msg->study('Date');

  ## Disregard article if timestamp is not in range
  dmsg($timestamp) if DEBUG;
  if ( $timestamp < $start || $timestamp >= $end )
  {
    dmsg("Posting on $date ignored.") if DEBUG;
    close($FILE);    # do not leak the handle on the early return
    return;
  }

  $totalposts++;    # bump count of articles considered

  ## DEBUG
  dmsg($date) if DEBUG;

  ## get stats about the file itself
  my $filesize = -s $filename;    # get total size of file
  $totsize += $filesize;          # bump total sizes of all files

  ## Track both bounds independently; with an if/elsif chain $latest
  ## stays unset whenever every article updates $earliest.
  if ( ( not defined $earliest ) || $timestamp < $earliest )
  {
    $earliest = $timestamp;
  }
  if ( ( not defined $latest ) || $timestamp > $latest )
  {
    $latest = $timestamp;
  }

  ## count header size
  $totheader += $msg->head()->size();

  ## get the poster's name (MIME-decoded, in UTF-8)
  my $poster = $msg->study('From');
  if ( defined $poster )
  {
    ## Convert old to new format
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;

    ## Collapse whitespace
    $poster =~ s/\s+/ /g;

    ## Remove outer quotes; TODO: observe RFC 5322 strictly
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;

    ## DEBUG
    dmsg($poster) if DEBUG;

    ## seen this one before?
    if ( !defined( $data{$poster} ) )
    {
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
      $data{$poster}{'orig'}   = 0;
      $data{$poster}{'quoted'} = 0;
    }
    $data{$poster}{'count'}++;                    # bump count for this poster
    $data{$poster}{'size'} += $filesize;          # total size of file

    ## The User-Agent and/or X-Newsreader fields
    ## for User-Agent by poster
    my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader');
    if ( defined $ua )
    {
      $data{$poster}{'agent'} = $ua;

      ## DEBUG
      dmsg($ua) if DEBUG;
    }

    ## The User Agent for User-Agent by number of articles
    get_agent($msg);

    ## Get all cross-posted newsgroups.  Whitespace around the commas is
    ## removed so that "a.b, c.d" and "a.b,c.d" count the same groups.
    my $newsgroups = $msg->study('Newsgroups') // '';
    foreach my $name ( split( /,/, $newsgroups ) )
    {
      $name =~ s/^\s+|\s+$//g;
      next if $name eq '';
      $crossposts{$name}++;    # bump count for each
    }

    ## Get threads
    my $thread = $msg->study('Subject') // '';
    $thread =~ s/^\s*re:\s*//i;              # Remove Re: or re: at the start
    $thread =~ s/\s*\(was:\s*.*\)\s*$//i;    # Remove (was: ...) at the end
    $thread =~ s/\s+/ /g;                    # collapse whitespace
    $threads{$thread}{'count'}++;            # bump count of this subject
    $threads{$thread}{'size'} += $filesize;  # bump bytes for this thread

    ## Is this an original post or a reply?
    if ( defined $msg->study('References') )
    {
      $replies++;
    }
    else
    {
      $origposts++;
    }

    ## Get the time zone.  A missing or unparsable Date header must not
    ## abort the whole run; such an article is left out of the time zone
    ## statistics only.
    my $datetime =
      defined $date
      ? eval { $datetime_parser->parse_datetime($date) }
      : undef;
    if ( defined $datetime )
    {
      ## %z yields a signed offset such as "+0000"
      my $tz = $datetime->strftime('%z');
      $tz = "UTC" if $tz =~ m{^(?:GMT|[+-]?0000)$};
      $tz{$tz}++;

      ## DEBUG
      dmsg($tz) if DEBUG;
    }
    else
    {
      warn "$filename: no usable Date header, time zone not counted\n";
    }

#### Now analyse the body text ####
    my $body = $msg->body();

    my $insig = 0;
    foreach my $line ( $body->lines )
    {
      $totbody += length($line);    # bump total body size

      ## Don't count blank lines in body.  (The former pattern m{^$>}
      ## interpolated $>, the effective user ID, and so never did this.)
      next if $line =~ m{^$};

      if ( $insig == 1 )
      {

        # bump total sig size
        $totsig += length($line);
      }
      ## are we in a quote line?
      ## Bill Unruh uses ] quotes, and another poster uses ::
      elsif ( $line =~ m{^\s*[>\]]} || $line =~ m{^\s*::} )
      {
        ## bump count of quoted chrs
        $data{$poster}{'quoted'} += length($line);
        $totquoted += length($line);
      }
      elsif ( $line =~ /^-- $/ )
      {
        ## signature delimiter: everything after it is signature
        $insig = 1;
      }
      else
      {
        ## We must be processing an original line
        $data{$poster}{'orig'} += length($line);  # bump count of original chrs
        $totorig += length($line);
      }
    }
  }

  close($FILE);
  return;
}

sub get_agent
{
  ## Determines the user agent of an article, reduces it to a short agent
  ## name and counts it in %distinct_agent.
  ##
  ## Parameter: the message object of the article.
  ## Returns:   the short agent name that was counted.
  my $msg = shift;

  my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')
    // $msg->study('X-Mailer');

  if ( not defined $ua )
  {
    ## No agent header: guess from Organization or Message-ID
    my $org        = $msg->study('Organization');
    my $message_id = $msg->study('Message-ID');
    if ( defined $org
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
    {
      $ua = $org;
    }
    elsif ( defined $message_id and $message_id =~ /pine/i )
    {
      $ua = "Pine";
    }
  }

  ## Hopefully found UA, else set to unknown
  if ( not defined $ua )
  {
    $ua = __ "unknown";
  }

  $ua = clean($ua);

  my $raw   = $ua;
  my $agent = $raw;

  ## strip http
  if ( $raw =~ /http/ )
  {
    $raw =~ s!posted via!!i;
    $raw =~ s!http://!!g;
    $raw =~ s!/!!g;
    $raw =~ s! !!g;
  }

  ## Fix Outlook from Mac
  if ( $raw =~ /^microsoft/i )
  {
    $raw =~ s/-/ /g;
  }

  ## Pick out the popular agents
  if ( $raw =~ /(outlook express)/i
    || $raw =~ /(windows mail)/i
    || $raw =~ /(microplanet gravity)/i
    || $raw =~ /(news rover)/i
    || $raw =~ /(forte agent)/i
    || $raw =~ /(forte free agent)/i )
  {
    $agent = $1;
  }
  elsif (
    $raw =~ /^(
        pan
       |sylpheed
       |slrn
       |mozilla
       |knode
       |tin
       |hamster
       |xrn
       |xnews
       |aol
       |gnus
       |krn
       |macsoup
       |messenger
       |openxp
       |pine
       |thoth
       |turnpike
       |winvn
       |vsoup
       |google
       |supernews
       |nn
       |rn
       |007
       |webtv
       |compuserve
       )/ix
    )
  {
    $agent = $1;
  }
  else
  {
    ## Clean up unknown agents: keep what precedes the first slash,
    ## otherwise the word characters in front of a version digit
    if ( $raw =~ m!^(.*?)/! )
    {
      $agent = $1;
    }
    elsif ( $raw =~ /^(\w*)\d.*/ )
    {
      $agent = $1;
    }
  }

  $distinct_agent{$agent}++;
  return $agent;
}
## get_agent

#########################################
## Count the User-Agents used, collapsing
## different versions into one per agent.
#########################################
sub count_agents
{
  ## Counts, per poster, the user agent in %agents.  The poster's agent
  ## string is attributed to the first name from %distinct_agent that it
  ## contains; if none matches, the full agent string is counted.
  ##
  ## The candidate names are tried longest first (ties alphabetically) so
  ## that the result does not depend on hash order and a short name such
  ## as "rn" cannot claim "slrn".  Empty names are left out because they
  ## would match every agent string.
  my @known_agents =
    sort { length($b) <=> length($a) or $a cmp $b }
    grep { length($_) } keys %distinct_agent;

POSTER:
  foreach my $poster ( keys %data )
  {
    my $poster_agent = $data{$poster}{'agent'};

    foreach my $agent_name (@known_agents)
    {    # check against known ones
      if ( $poster_agent =~ /\Q$agent_name\E/ )
      {
        $agents{$agent_name}++;
        next POSTER;
      }
    }
    $agents{$poster_agent}++;
  }
  return;
}    # count_agents

#############################################
## Set orig/total percentages for all posters
#############################################
sub fix_percent
{
  ## Stores, for every poster in %data, the share of original text in
  ## all counted text (original plus quoted) under the key 'percent'.
  for my $stats ( values %data )
  {
    my ( $orig, $quoted ) = @{$stats}{ 'orig', 'quoted' };

    $stats->{'percent'} =
        $orig == 0   ? 0                                    # nothing original
      : $quoted == 0 ? 100                                  # nothing quoted
      :                $orig * 100 / ( $quoted + $orig );
  }
}
## fix_percent

##################################
## Write data structures to a file
##################################
sub write_data
{
  ## Dumps the collected raw data (posters, threads, cross-posts, user
  ## agents, time zones) as UTF-8 text to /tmp/XDATA.
  ## Dies if the file cannot be created or written completely.
  ##
  ## NOTE(review): the fixed name in /tmp is predictable; consider a
  ## configurable or private location.
  my $heavy_rule =
"============================================================================\n";
  my $light_rule =
"----------------------------------------------------------------------------\n";

  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
    or die __x( "Can't create XDATA: {error}\n", error => $! );
  print $OUTF "Data collected from $newsgroup_name\n\n";
  print $OUTF
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";

  ## Sorted like the other sections so that two runs give the same file
  foreach my $name ( sort keys %data )
  {
    print $OUTF
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Thread subjects\n";
  print $OUTF $light_rule;
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
  {
    print $OUTF
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Cross-posts\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %crossposts )
  {
    print $OUTF "$name : $crossposts{$name}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "User agents\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %agents )
  {
    print $OUTF "$name : $agents{$name}\n";
  }

  print $OUTF $heavy_rule;
  print $OUTF "Time zones\n";
  print $OUTF $light_rule;
  foreach my $name ( sort keys %tz )
  {
    print $OUTF "$name : $tz{$name}\n";
  }

  ## Buffered write errors only show up when the handle is closed
  close $OUTF
    or die __x( "Can't write XDATA: {error}\n", error => $! );
  return;
}    # write_data

sub display_results
{
  #################### DISPLAY RESULTS #####################
  ## Prints the report to STDOUT: a summary followed by up to ten ranked
  ## sections.  Section N is left out if $skipSec{N} is set.
  my $rule    = "=" x 76;
  my %iec_1dp = ( 'precision' => 1, 'mode' => 'iec' );

  ## $totalposts stays undefined when no article was in range
  my $posts = $totalposts // 0;

  ## Quotient that is 0 instead of a fatal error for an empty divisor
  my $ratio = sub {
    my ( $dividend, $divisor ) = @_;
    return $divisor ? $dividend / $divisor : 0;
  };

  print $rule, "\n";
  printf "%s\n",
    centred(
    __x( "Analysis of articles to {newsgroup}", newsgroup => $newsgroup_name ),
    76
    );
  print $rule, "\n";
  printf "%s\n",
    centred(
    __(
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
    ),
    76
    );
  print "\n";

  ## %d takes the plain number; a locale-formatted string is not numeric
  printf __"Total articles considered:   %s over %d days\n",
    $formatter->format_number($posts), $numdays;

  if ( $posts == 0 )
  {
    ## Nothing to report; every figure below would divide by zero
    print "\n", $rule, "\n";
    return;
  }

  my $time_locale     = setlocale(LC_TIME);
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
  my ( $earliest_text, $latest_text ) = map {
    DateTime->from_epoch(
      epoch     => $_,
      locale    => $time_locale,
      time_zone => 'UTC',
    )->strftime($datetime_format)
  } ( $earliest // $latest, $latest // $earliest );

  printf __"Earliest article:            %s" . "\n", $earliest_text;
  printf __"Latest article:              %s" . "\n", $latest_text;
  printf __"Original articles:           %s; replies: %s" . "\n",
    $formatter->format_number($origposts),
    $formatter->format_number($replies);
  printf __"Total size of articles:      %s bytes (%s)" . "\n",
    $formatter->format_number($totsize),
    $formatter->format_bytes( $totsize, %iec_1dp );
  printf __"Average %s articles per day, %s per day, %s bytes per article.\n",
    $formatter->format_number( int( $posts / $numdays ) ),
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
    $formatter->format_number( int( $totsize / $posts ) );

  my $poster_count = keys %data;
  my $thread_count = keys %threads;

  print "\n";
  printf __"Total headers:      %s; bodies: %s\n",
    $formatter->format_bytes( $totheader, %iec_1dp ),
    $formatter->format_bytes( $totbody,   %iec_1dp );
  printf __
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
    $formatter->format_bytes( $totquoted, %iec_1dp ),
    $formatter->format_bytes( $totorig,   %iec_1dp ),
    $formatter->format_number(
    $ratio->( $totorig * 100, $totorig + $totquoted )
    ),
    $formatter->format_bytes( $totsig, %iec_1dp );
  print "\n";
  printf __"Total number of posters:     %s, average %s per poster\n",
    $formatter->format_number($poster_count),
    $formatter->format_bytes( $ratio->( $totsize, $poster_count ), %iec_1dp );
  printf __"Total number of threads:     %s, average %s per thread\n",
    $formatter->format_number($thread_count),
    $formatter->format_bytes( $ratio->( $totsize, $thread_count ), %iec_1dp );
  printf __"Total number of user agents: %d\n", scalar keys %agents;
  print "\n", $rule, "\n";

  ########################################
  ## Show posters by article count  Sec 1;
  ########################################
  unless ( $skipSec{1} )
  {
    my @rows =
      map { [ $_, $data{$_}{count} ] }
      sort { $data{$b}{count} <=> $data{$a}{count} or $a cmp $b } keys %data;
    _print_ranking(
      __x( "Top {count} posters by number of articles", count => $topposters ),
      $topposters, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  ######################################
  ## Show posters by size in KiB  Sec 2;
  ######################################
  unless ( $skipSec{2} )
  {
    my @rows =
      map { [ $_, $data{$_}{size} / 1024 ] }
      sort { $data{$b}{size} <=> $data{$a}{size} or $a cmp $b } keys %data;
    _print_ranking(
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
      $topposters, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  ## Sections 3 and 4 only rank posters who quoted something and have at
  ## least 5 articles.
  my @responders =
    grep { $data{$_}{quoted} != 0 and $data{$_}{count} >= 5 } keys %data;

  #####################################
  ## Show top posters for original text
  #####################################
  unless ( $skipSec{3} )
  {
    my @rows =
      map { [ $_, $data{$_}{percent} ] }
      sort { $data{$b}{percent} <=> $data{$a}{percent} or $a cmp $b }
      @responders;
    _print_ranking(
      __x(
        "Top {count} responders by original text (> 5 articles)",
        count => $topposters
      ),
      $topposters,
      "%2d. %-63s : %02.2f%%\n",
      63,
      \@rows
    );
  }

  ########################################
  ## Show bottom posters for original text
  ########################################

  ## Unless the user decided, the bottom list is only shown when there
  ## are more qualifying responders than fit into the top list; otherwise
  ## it would merely repeat the top list in reverse.  (The former test
  ## "$topposters_real <= $topposters" was always true, so this section
  ## could never appear.)
  my $skip_bottom = $skipSec{4} // ( @responders <= $topposters );

  unless ($skip_bottom)
  {
    my @rows =
      map { [ $_, $data{$_}{percent} ] }
      sort { $data{$a}{percent} <=> $data{$b}{percent} or $a cmp $b }
      @responders;
    _print_ranking(
      __x(
        "Bottom {count} responders by original text  (> 5 articles)",
        count => $topposters
      ),
      $topposters,
      "%2d. %-63s : %02.2f%%\n",
      63,
      \@rows
    );
  }

  #####################################
  ## Show threads by number of articles
  #####################################
  unless ( $skipSec{5} )
  {
    my @rows =
      map { [ $_, $threads{$_}{'count'} ] }
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} or $a cmp $b }
      keys %threads;
    _print_ranking(
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
      $topthreads, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  ##############################
  ## Show threads by size in KiB
  ##############################
  unless ( $skipSec{6} )
  {
    my @rows =
      map { [ $_, $threads{$_}{'size'} / 1024 ] }
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} or $a cmp $b }
      keys %threads;
    _print_ranking(
      __x( "Top {count} threads by size in KiB", count => $topthreads ),
      $topthreads, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  ##################################
  ## Show top 10 cross-posted groups
  ##################################
  unless ( $skipSec{7} )
  {
    delete $crossposts{"$newsgroup_name"};    # don't include ours
    my @rows =
      map { [ $_, $crossposts{$_} ] }
      sort { $crossposts{$b} <=> $crossposts{$a} or $a cmp $b }
      keys %crossposts;
    _print_ranking(
      __x( "Top {count} cross-posted groups", count => $topcrossposts ),
      $topcrossposts, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  #########################
  ## Show agents and counts
  #########################
  unless ( $skipSec{8} )
  {
    my @rows =
      map { [ $_, $agents{$_} ] }
      sort { $agents{$b} <=> $agents{$a} or $a cmp $b } keys %agents;
    _print_ranking(
      __x( "Top {count} user agents by poster", count => $topagents ),
      $topagents, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }

  #######################
  ## Show distinct agents
  #######################
  unless ( $skipSec{9} )
  {
    my @rows =
      map {
      [ $_, $distinct_agent{$_}, ( $distinct_agent{$_} / $posts ) * 100 ]
      }
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} or $a cmp $b }
      keys %distinct_agent;
    _print_ranking(
      __x(
        "Top {count} user agents by number of articles",
        count => $topagents
      ),
      $topagents,
      "%2d. %-58s : %5d (%2.f%%)\n",
      58,
      \@rows
    );
  }

  ############################
  ## Show timezones and counts
  ############################
  unless ( $skipSec{10} )
  {
    my @rows =
      map { [ $_, $tz{$_} ] }
      sort { $tz{$b} <=> $tz{$a} or $a cmp $b } keys %tz;
    _print_ranking(
      __x( "Top {count} time zones by number of articles", count => $toptz ),
      $toptz, "%2d. %-63s : %6d\n", 63, \@rows
    );
  }
  return;
}

## Prints one ranked section: centred title, rule, at most $limit rows
## and a closing rule.
##
## Parameters: translated title; maximum number of rows; printf format
## taking rank, padded label and the row values; width the label is padded
## or cut to; reference to the sorted rows, each [ label, value, ... ].
## Returns the number of rows printed.
sub _print_ranking
{
  my ( $title, $limit, $format, $label_width, $rows ) = @_;

  printf "%s\n", centred( $title, 76 );
  print "=" x 76, "\n";

  my $rank = 0;
  foreach my $row ( @{$rows} )
  {
    last if $rank >= $limit;
    my ( $label, @values ) = @{$row};
    printf $format, ++$rank, rpad( $label, $label_width, "." ), @values;
  }

  print "\n", "=" x 76, "\n";
  return $rank;
}

## helper subs

###############################
## Right pad a string with '.'s
###############################
sub rpad
{
  ## Returns $text cut to $pad_len characters and, if shorter, filled up
  ## on the right with copies of $pad_chr.
  my ( $text, $pad_len, $pad_chr ) = @_;

  ## DEBUG
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;

  ## substr() returns the whole text when it is not longer than $pad_len
  my $clipped = substr( $text, 0, $pad_len );
  my $missing = $pad_len - length($clipped);

  return $clipped . $pad_chr x $missing;
}

##################
## Centre a string
##################
sub centred
{
  ## Centres $text in a field of $width characters by prefixing spaces.
  ## Each line of a multi-line text is centred on its own; a line that is
  ## wider than the field is returned without padding.
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in

  my @lines = split /\n/, $text, -1;
  @lines = ('') unless @lines;    # split() yields nothing for an empty text

  return join "\n", map {
    my $pad_len = int( ( $width - length($_) ) / 2 );
    $pad_len = 0 if $pad_len < 0;    # no negative repeat count
    ( " " x $pad_len ) . $_;
  } @lines;
}

###########################
## Put commas into a number
###########################
## Parameter: the number to format.
## Returns the integer part of the number with $thousands_sep inserted
## between digit groups of size $grouping[0], followed by the fractional
## part (introduced by $decimal_point) if the argument had one.
## NOTE(review): no caller of this sub is visible in this file.
sub commify
{
  local $_ = shift;
  my $number = $_;
  $_ = int;                                        # Chop non-integer part

  ## Each pass puts one separator in front of a digit group that is
  ## followed by the end of the string or by a separator; repeat until
  ## nothing matches any more.
  1 while
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
  my $int_part  = $_;
  my $real_part = '';

  ## The fractional part is taken from the unmodified argument
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
  {
    $real_part = $1;
  }
  return $int_part . $real_part;
}

################################################################
## Returns a string with leading and trailing whitespace removed
################################################################
sub clean
{
  ## Works on a copy; the caller's value is left untouched.
  my ($text) = @_;

  $text =~ s/^\s+//;    # leading whitespace
  $text =~ s/\s+$//;    # trailing whitespace

  return $text;
}

sub usage
{
  ## Prints the (translated) synopsis to STDOUT and ends the program
  ## with exit status 1.
  my $synopsis = __("usage: newsstat.pl NEWS.GROUP");
  print $synopsis, "\n";
  exit 1;
}

sub dmsg
{
  ## Writes all arguments, followed by a newline, to STDERR.
  my @parts = @_;
  return print {*STDERR} @parts, "\n";
}

sub dmsg2
{
  ## Writes a debug message to STDERR if the configured DEBUG level is at
  ## least $level.
  ##
  ## Parameters: the level of the message, followed by the message parts.
  ## (The comparison used to be "$level >= DEBUG", which printed every
  ## message while debugging was switched off.)
  my ( $level, @msg ) = @_;
  print STDERR @msg, "\n" if DEBUG >= $level;
  return;
}
 

Rev 49	Rev 198
1	#!/usr/bin/env perl	1	#!/usr/bin/env perl
2	use strict;	2	use strict;
3	use warnings;	3	use warnings;
4	require 5.004;	4	require 5.004;
5		5
6	#use diagnostics;	6	#use diagnostics;
7	use utf8;	7	use utf8;
8		8
9	## NOTE:	9	## NOTE:
10	## Enable and remove binmode when utf8::all has actually become lexically scoped	10	## Enable and remove binmode when utf8::all has actually become lexically scoped
11	# use utf8:all;	11	# use utf8:all;
12		12
13	use constant DEBUG => 0;	13	use constant DEBUG => 0;
14		14
15	## newsstat.pl	15	## newsstat.pl
16	## Copyright (C) 2011, 2012 Thomas Lahn <startrek@PointedEars.de>	16	## Copyright (C) 2011, 2012 Thomas Lahn <startrek@PointedEars.de>
17	## Based on work by Garry Knight et al.	17	## Based on work by Garry Knight et al.
18	##	18	##
19	## This program is free software: you can redistribute it and/or modify	19	## This program is free software: you can redistribute it and/or modify
20	## it under the terms of the GNU General Public License as published by	20	## it under the terms of the GNU General Public License as published by
21	## the Free Software Foundation, either version 3 of the License, or	21	## the Free Software Foundation, either version 3 of the License, or
22	## (at your option) any later version.	22	## (at your option) any later version.
23	##	23	##
24	## This program is distributed in the hope that it will be useful,	24	## This program is distributed in the hope that it will be useful,
25	## but WITHOUT ANY WARRANTY; without even the implied warranty of	25	## but WITHOUT ANY WARRANTY; without even the implied warranty of
26	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	26	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27	## GNU General Public License for more details.	27	## GNU General Public License for more details.
28	##	28	##
29	## You should have received a copy of the GNU General Public License	29	## You should have received a copy of the GNU General Public License
30	## along with this program. If not, see <http://www.gnu.org/licenses/>.	30	## along with this program. If not, see <http://www.gnu.org/licenses/>.
31		31
32	## Print out all text to STDOUT UTF-8 encoded	32	## Print out all text to STDOUT UTF-8 encoded
33	binmode STDOUT, ':encoding(UTF-8)';	33	binmode STDOUT, ':encoding(UTF-8)';
34	binmode STDERR, ':encoding(UTF-8)';	34	binmode STDERR, ':encoding(UTF-8)';
35		35
36	## L10n	36	## L10n
37	use locale ':not_characters';	37	use locale ':not_characters';
38		38
39	# setlocale( LC_MESSAGES, '' );	39	# setlocale( LC_MESSAGES, '' );
40	require Number::Format;	40	require Number::Format;
41		41
42	## i18n	42	## i18n
43	## FIXME: Automatically include resolved '.' in @INC	43	## FIXME: Automatically include resolved '.' in @INC
44	# print join "\n", @INC;	44	# print join "\n", @INC;
45		45
46	use Locale::TextDomain ('de.pointedears.newsstat');	46	use Locale::TextDomain ('de.pointedears.newsstat');
47	use POSIX ('locale_h');	47	use POSIX ('locale_h');
48	use Locale::Messages qw (bind_textdomain_filter	48	use Locale::Messages qw (bind_textdomain_filter
49	bind_textdomain_codeset	49	bind_textdomain_codeset
50	turn_utf_8_on);	50	turn_utf_8_on);
51		51
52	bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;	52	bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;
53	bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';	53	bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
54		54
55	require Mail::Message;	55	require Mail::Message;
56	require DateTime;	56	require DateTime;
57	require DateTime::Format::Mail;	57	require DateTime::Format::Mail;
58		58
## Fetch the numeric formatting conventions of the current locale.
## localeconv() may omit entries, so any of these can be undefined.
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
  'decimal_point' };

## Apply defaults if values are missing: prefer the monetary separator,
## then fall back to a plain space
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
$thousands_sep = ' '                unless $thousands_sep;

## grouping and mon_grouping are packed lists
## of small integers (characters) telling the
## grouping (thousand_seps and mon_thousand_seps
## being the group dividers) of numbers and
## monetary quantities.  The integers' meanings:
## 255 means no more grouping, 0 means repeat
## the previous grouping, 1-254 means use that
## as the current grouping.  Grouping goes from
## right to left (low to high digits).  In the
## below we cheat slightly by never using anything
## else than the first grouping (whatever that is).
my @grouping = $grouping ? unpack( "C*", $grouping ) : (3);

## FIXME: Why don't the defaults work already?
## NOTE: direct method call instead of the indirect object syntax
## ("new Number::Format(...)"), which is ambiguous to parse and is
## disabled by newer feature bundles.
my $formatter = Number::Format->new(
  -decimal_point => $decimal_point,
  -thousands_sep => $thousands_sep,
  -kibi_suffix   => ' KiB',
  -mebi_suffix   => ' MiB',
  -gibi_suffix   => ' GiB',

  # -grouping => $grouping[0]
);
99		99
###################### USER CONFIGURATIONS ############################

## The name of the group to do stats for (first command-line argument);
## usage() is called when it is missing
my $newsgroup_name = $ARGV[0];
$newsgroup_name // usage();

## Check for removal flags.  Two spellings are accepted after the group
## name: "-x LIST" and "-xLIST", where LIST is comma-separated.  Every
## list item becomes a key of %skipSec (presumably the numbers of the
## report sections to leave out -- confirm against the output code).
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ )
{
  $j = $ix + 1;
  if ( $ARGV[$ix] eq "-x" )
  {
    ## The list is the following argument; a trailing "-x" without a
    ## value is ignored instead of splitting an undefined value
    @skiplist = split( ",", $ARGV[$j] ) if $j < $args;
  }
  elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ )
  {
    ## Anchored so that only arguments that start with "-x" qualify
    @skiplist = split( ",", $1 );
  }
}
$skipSec{$_} = 1 for @skiplist;
128		128
## Leafnode users will want /var/spool/news for this variable.
## The trailing slash is required: the group path is appended directly.
my $news = "/var/spool/news/";

## Number of top or bottom posters to show
my $topposters = 20;

## Number of threads we want to know about
my $topthreads = 20;

## Number of cross-posted threads to show
my $topcrossposts = 10;

## Number of agents we list
my $topagents = 10;

## Number of time zones to show
my $toptz = 10;

###################### DATA STRUCTURES ######################

## Spool subdirectory of the group: dots become path separators
my $group = $newsgroup_name;
$group =~ s!\.!/!g;

my %data;              # name, count, agent, total, orig, quoted
my $totsize = 0;       # holds total sizes of all files
my %crossposts;        # group, count
my %threads;           # subject, count
my $replies   = 0;     # total no. of replies
my $origposts = 0;     # total no. of original posts
my %tz;                # timezones by count
my $earliest;          # earliest article we have found
my $latest;            # latest article we have found
my $totheader = 0;     # total size of header material
my $totbody   = 0;     # total size of body material
my $totsig    = 0;     # total size of sig material
my $totquoted = 0;     # total size of quoted material
my $totorig   = 0;     # total size of original material

## Starts at 0 like the other counters, so that it is still a number
## when no article falls into the reporting period
my $totalposts = 0;    # total no. of posts considered
my %distinct_agent;

## Used to hold counts of User Agents used
my %agents = (
  "Compuserver"               => 0,
  "Foorum"                    => 0,
  "Forte Agent"               => 0,
  "Forte Free Agent"          => 0,
  "Gnus"                      => 0,
  "KNode"                     => 0,
  "MacSOUP"                   => 0,
  "MT-NewsWatcher"            => 0,
  "MicroPlanet Gravity"       => 0,
  "Microsoft Outlook Express" => 0,
  "Microsoft Windows Mail"    => 0,
  "Mozilla"                   => 0,
  "News Rover"                => 0,
  "NN"                        => 0,
  "Pan"                       => 0,
  "rn"                        => 0,
  "slrn"                      => 0,
  "Sylpheed"                  => 0,
  "tin"                       => 0,
  "VSoup"                     => 0,
  "WebTV"                     => 0,
  "Xnews"                     => 0,
);
192		192
## Parser for Date header fields; loose mode accepts dates that are not
## strictly standards-conforming
my $datetime_parser = DateTime::Format::Mail->new();
$datetime_parser->loose();

## The reporting period is the previous calendar month in UTC:
## $start (inclusive) and $end (exclusive) are epoch seconds.
my $today = DateTime->today( time_zone => 'UTC' );

## Go to the first of the current month *before* stepping back one month,
## so that the result does not depend on how month arithmetic treats
## days 29 to 31
my $prev_month = $today->clone()->set_day(1)->subtract( months => 1 );
my $start      = $prev_month->epoch();

## Number of days in the reporting period
my $numdays = int DateTime->last_day_of_month(
  year      => $prev_month->year(),
  month     => $prev_month->month(),
  time_zone => $prev_month->time_zone(),
)->day();
my $end = $today->clone()->set_day(1)->epoch();

dmsg( $start, " to ", $end ) if DEBUG;
207		207
## Work inside the group's spool directory, so that the article file
## names returned by readdir() can be used as they are
chdir("$news$group")
  or die __x(
  "Can't cd to {newsgroup}: {error}\n",
  newsgroup => "$news$group",
  error     => $!
  );

{
  ## Lexical directory handle, confined to this block, instead of the
  ## global bareword handle DIR
  opendir( my $spool_dir, "." )
    or die __x(
    "Can't open {newsgroup}: {error}\n",
    newsgroup => "$news$group",
    error     => $!
    );

  while ( defined( my $filename = readdir($spool_dir) ) )
  {
    next unless -f $filename;                # only want real files
    next if ( $filename eq ".overview" );    # real articles only

    get_article($filename);                  # read in the article
  }
  closedir($spool_dir);                      # finished with the directory
}

dmsg("\nearliest: $earliest\nlatest: $latest") if DEBUG;

## Post-processing
count_agents();    # count agents, collapsing versions
fix_percent();

write_data();
display_results();
########################################
## Get current article's header and body
##
## Reads the article in the given file and, if its timestamp lies in
## the reporting period ($start inclusive, $end exclusive), adds it to
## the file-level statistics: $totalposts, $totsize, $earliest, $latest,
## $totheader, %data, %crossposts, %threads, $replies, $origposts, %tz,
## $totbody, $totsig, $totquoted and $totorig.  Per-poster, thread and
## body statistics are only collected if the article has a From field.
##
## @param $filename  Name of the article file, relative to the current
##                   working directory
## Dies if the file cannot be opened.
########################################
sub get_article
{
  my $filename = shift;

  open( my $FILE, '<', $filename )
    or
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
  my $msg = Mail::Message->read($FILE);

  ## Nothing to analyse if no message object came back
  if ( not defined $msg )
  {
    close($FILE);
    return;
  }

  my $timestamp = $msg->timestamp();
  my $date      = $msg->study('Date');

  ## Disregard article if timestamp is not in range
  dmsg($timestamp) if DEBUG;
  if ( $timestamp < $start || $timestamp >= $end )
  {
    dmsg("Posting on $date ignored.") if DEBUG;

    ## Do not leak the handle on the early exit
    close($FILE);
    return;
  }

  $totalposts++;    # bump count of articles considered

  ## DEBUG
  dmsg($date) if DEBUG;

  ## get stats about the file itself
  my $filesize = -s $filename;    # get total size of file
  $totsize += $filesize;          # bump total sizes of all files

  ## Both ends of the time span are tracked independently; with an
  ## if/elsif chain the first article considered would never be a
  ## candidate for $latest
  if ( ( not defined $earliest ) || $timestamp < $earliest )
  {
    $earliest = $timestamp;
  }
  if ( ( not defined $latest ) || $timestamp > $latest )
  {
    $latest = $timestamp;
  }

  ## count header size
  $totheader += $msg->head()->size();

  ## get the poster's name (MIME-decoded, in UTF-8)
  my $poster = $msg->study('From');
  if ( defined $poster )
  {
    ## Convert old format "address (Name)" to new format "Name <address>";
    ## whitespace around the parts is optional
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;

    ## Collapse whitespace
    $poster =~ s/\s+/ /g;

    ## Remove outer quotes; TODO: observe RFC 5322 strictly
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;

    ## DEBUG
    dmsg($poster) if DEBUG;

    ## seen this one before?
    if ( !defined( $data{$poster} ) )
    {
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after From: field
      $data{$poster}{'orig'}   = 0;
      $data{$poster}{'quoted'} = 0;
    }
    $data{$poster}{'count'}++;                    # bump count for this poster
    $data{$poster}{'size'} += $filesize;          # total size of file

    ## The User-Agent and/or X-Newsreader fields
    ## for User-Agent by poster
    my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader');
    if ( defined $ua )
    {
      $data{$poster}{'agent'} = $ua;

      ## DEBUG
      dmsg($ua) if DEBUG;
    }

    ## The User Agent for User-Agent by number of articles
    get_agent($msg);

    ## Get all cross-posted newsgroups; a missing Newsgroups field
    ## counts as an empty list, and whitespace around the commas does
    ## not become part of the group names
    my $newsgroups = $msg->study('Newsgroups') // '';
    for my $crosspost ( split( /\s*,\s*/, $newsgroups ) )
    {
      next unless length $crosspost;
      $crossposts{$crosspost}++;    # bump count for each
    }

    ## Get threads; a missing Subject field counts as an empty subject
    my $thread = $msg->study('Subject') // '';
    $thread =~ s/^\s*re:\s*//i;    # Remove Re: or re: at the start
    $thread =~ s/\s*\(was:\s*.*\)\s*$//i;    # Remove (was: ...) at the end
    $thread =~ s/\s+/ /g;                    # collapse whitespace
    $threads{$thread}{'count'}++;            # bump count of this subject
    $threads{$thread}{'size'} += $filesize;  # bump bytes for this thread

    ## Is this an original post or a reply?
    if ( defined $msg->study('References') )
    {
      $replies++;
    }
    else
    {
      $origposts++;
    }

    ## Get the time zone.  A missing or unparseable Date field must not
    ## abort the whole run; such an article is simply not counted in
    ## the time zone statistics.
    my $datetime =
      defined $date
      ? eval { $datetime_parser->parse_datetime($date) }
      : undef;
    if ( defined $datetime )
    {
      my $tz = $datetime->strftime('%z');

      ## '%z' yields a signed offset, so a zero offset is "+0000"
      $tz = "UTC" if $tz =~ m{^(?:GMT|[+-]?0000)$};
      $tz{$tz}++;

      ## DEBUG
      dmsg($tz) if DEBUG;
    }
    elsif (DEBUG)
    {
      dmsg("No usable Date field in $filename; time zone not counted.");
    }

    #### Now analyse the body text ####
    my $insig = 0;
    for my $line ( $msg->body()->lines )
    {
      my $line_length = length($line);
      $totbody += $line_length;    # bump total body size

      ## don't count blank lines in body
      next if $line =~ m{^\s*$};

      if ( $insig == 1 )
      {
        ## everything after the sig separator is sig material
        $totsig += $line_length;
      }
      ## are we in a quote line?
      ## Bill Unruh uses ] quotes, and another poster uses ::
      elsif ( $line =~ m{^\s*[>\]]} || $line =~ m{^\s*::} )
      {
        ## bump count of quoted chrs
        $data{$poster}{'quoted'} += $line_length;
        $totquoted += $line_length;
      }
      elsif ( $line =~ /^-- $/ )
      {
        ## sig separator: the separator line itself is not counted
        $insig = 1;
      }
      else
      {
        ## We must be processing an original line
        $data{$poster}{'orig'} += $line_length;  # bump count of original chrs
        $totorig += $line_length;
      }
    }
  }

  close($FILE);
}
397		397
398	sub get_agent	398	sub get_agent
399	{	399	{
400	my $msg = shift;	400	my $msg = shift;
401		401
402	my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')	402	my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')
403	// $msg->study('X-Mailer');	403	// $msg->study('X-Mailer');
404		404
405	if ( not defined $ua )	405	if ( not defined $ua )
406	{	406	{
407	my $org = $msg->study('Organization');	407	my $org = $msg->study('Organization');
408	if ( defined $org	408	if ( defined $org
409	and $org =~ /groups\.google\|AOL\|Supernews\|WebTV\|compuserve/ )	409	and $org =~ /groups\.google\|AOL\|Supernews\|WebTV\|compuserve/ )
410	{	410	{
411	$ua = $org;	411	$ua = $org;
412	}	412	}
413	elsif ( $msg->study('Message-ID') =~ /pine/i )	413	elsif ( $msg->study('Message-ID') =~ /pine/i )
414	{	414	{
415	$ua = "Pine";	415	$ua = "Pine";
416	}	416	}
417	}	417	}
418		418
419	## Hopefully found UA, else set to unknown	419	## Hopefully found UA, else set to unknown
420	if ( not defined $ua )	420	if ( not defined $ua )
421	{	421	{
422	$ua = __ "unknown";	422	$ua = __ "unknown";
423	}	423	}
424		424
425	$ua = clean($ua);	425	$ua = clean($ua);
426		426
427	my $raw = $ua;	427	my $raw = $ua;
428	my $agent = $raw;	428	my $agent = $raw;
429		429
430	## strip http	430	## strip http
431	if ( $raw =~ /.http./ )	431	if ( $raw =~ /.http./ )
432	{	432	{
433	$raw =~ s!posted via!!i;	433	$raw =~ s!posted via!!i;
434	$raw =~ s!http://!!g;	434	$raw =~ s!http://!!g;
435	$raw =~ s!/!!g;	435	$raw =~ s!/!!g;
436	$raw =~ s! !!g;	436	$raw =~ s! !!g;
437	}	437	}
438		438
439	## Fix Outlook from Mac	439	## Fix Outlook from Mac
440	if ( $raw =~ /^microsoft/i )	440	if ( $raw =~ /^microsoft/i )
441	{	441	{
442	$raw =~ s/-/ /g;	442	$raw =~ s/-/ /g;
443	}	443	}
444		444
445	## Pick out the popular agents	445	## Pick out the popular agents
446	if ( $raw =~ /(outlook express)/i	446	if ( $raw =~ /(outlook express)/i
447	\|\| $raw =~ /(windows mail)/i	447	\|\| $raw =~ /(windows mail)/i
448	\|\| $raw =~ /(microplanet gravity)/i	448	\|\| $raw =~ /(microplanet gravity)/i
449	\|\| $raw =~ /(news rover)/i	449	\|\| $raw =~ /(news rover)/i
450	\|\| $raw =~ /(forte agent)/i	450	\|\| $raw =~ /(forte agent)/i
451	\|\| $raw =~ /(forte free agent)/i )	451	\|\| $raw =~ /(forte free agent)/i )
452	{	452	{
453	$agent = $1;	453	$agent = $1;
454	}	454	}
455	elsif (	455	elsif (
456	$raw =~ /^(	456	$raw =~ /^(
457	pan	457	pan
458	\|sylpheed	458	\|sylpheed
459	\|slrn	459	\|slrn
460	\|mozilla	460	\|mozilla
461	\|knode	461	\|knode
462	\|tin	462	\|tin
463	\|hamster	463	\|hamster
464	\|xrn	464	\|xrn
465	\|xnews	465	\|xnews
466	\|aol	466	\|aol
467	\|gnus	467	\|gnus
468	\|krn	468	\|krn
469	\|macsoup	469	\|macsoup
470	\|messenger	470	\|messenger
471	\|openxp	471	\|openxp
472	\|pine	472	\|pine
473	\|thoth	473	\|thoth
474	\|turnpike	474	\|turnpike
475	\|winvn	475	\|winvn
476	\|vsoup	476	\|vsoup
477	\|google	477	\|google
478	\|supernews	478	\|supernews
479	\|nn	479	\|nn
480	\|rn	480	\|rn
481	\|007	481	\|007
482	\|webtv	482	\|webtv
483	\|compuserve	483	\|compuserve
484	)/ix	484	)/ix
485	)	485	)
486	{	486	{
487	$agent = $1;	487	$agent = $1;
488	}	488	}
489	else	489	else
490	{	490	{
491	## Clean up unknown agents	491	## Clean up unknown agents
492	if ( $raw =~ m!^(.*?)/! )	492	if ( $raw =~ m!^(.*?)/! )
493	{	493	{
494	$agent = $1;	494	$agent = $1;
495	}	495	}
496	elsif ( $raw =~ /^(\w)\d./ )	496	elsif ( $raw =~ /^(\w)\d./ )
497	{	497	{
498	$agent = $1;	498	$agent = $1;
499	}	499	}
500	}	500	}

Subversion Repositories LCARS

(root)/branches/live/tools/network/news/newsstat/newsstat.pl - Rev 49 → 198