Subversion Repositories LCARS

Rev

Rev 14 | Rev 23 | Go to most recent revision | Show entire file | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14 Rev 22
Line 1... Line 1...
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
-
 
4
4
use diagnostics;
5
#use diagnostics;
5
use utf8;
6
use utf8;
6
use Encode;
7
use Encode;
7
8
-
 
9
use constant DEBUG => 0;
-
 
10
8
## Print out all text to STDOUT UTF-8 encoded
11
## Print out all text to STDOUT UTF-8 encoded
9
binmode STDOUT, ':encoding(UTF-8)';
12
binmode STDOUT, ':encoding(UTF-8)';
-
 
13
binmode STDERR, ':encoding(UTF-8)';
10
14
11
##############################
15
# FIXME: Automatically include resolved '.' in @INC
12
## newsstat.pl version 0.4.3.1
16
# print join "\n", @INC;
13
17
-
 
18
use locale ':not_characters';
14
###########################################################################
19
use Locale::TextDomain ('de.pointedears.newsstat');
-
 
20
use POSIX ('locale_h');
15
## Collect statistics about a newsgroup (specified by first argument)
21
use Locale::Messages qw (bind_textdomain_filter
16
## in the local news spool. Check all articles in the last 30-day period.
22
                         bind_textdomain_codeset
17
## Rank posters by number of posts and by volume of posts, report on top
23
                         turn_utf_8_on);
18
## and bottom 20 posters. Show their name, number of posts, size of posts,
24
#setlocale( LC_MESSAGES, '' );
19
## percentage of quoted lines. Rank user-agents used, by poster rather
25
bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;
20
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
26
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
21
##
27
 
22
## Numbers and paths can be configured below.  See ChangeLog and TODO
28
require Mail::Message;
23
## for more.  -- PE
29
require DateTime;
24
###########################################################################
30
require DateTime::Format::Mail;
25
31
 
26
###################### USER CONFIGURATIONS ############################
32
###################### USER CONFIGURATIONS ############################
27
33
28
## The name of the group to do stats for
34
## The name of the group to do stats for
29
my $newsgroup_name = $ARGV[0];
35
my $newsgroup_name = $ARGV[0];
30
$newsgroup_name or &usage;
36
$newsgroup_name or usage();
31
37
32
## Check for removal flags
38
## Check for removal flags
33
my $ix;
39
my $ix;
34
my $j;
40
my $j;
35
my %skipSec;
41
my %skipSec;
Line 53... Line 59...
53
}
59
}
54
60
55
## Leafnode users will want /var/spool/news for this variable.
61
## Leafnode users will want /var/spool/news for this variable.
56
my $news = "/var/spool/news/";
62
my $news = "/var/spool/news/";
57
63
58
## How many days are we doing statistics for?
-
 
59
my $numdays = 30;
-
 
60
-
 
61
## Number of agents we list
64
## Number of top or bottom posters to show
62
my $topagents = 10;
65
my $topposters = 20;
63
66
64
## Number of threads we want to know about
67
## Number of threads we want to know about
65
my $topthreads = 20;
68
my $topthreads = 20;
66
69
67
## Number of top or bottom posters to show
-
 
68
my $topposters = 20;
-
 
69
-
 
70
## Number of cross-posted threads to show
70
## Number of cross-posted threads to show
71
my $topcrossposts = 10;
71
my $topcrossposts = 10;
72
72
-
 
73
## Number of agents we list
-
 
74
my $topagents = 10;
-
 
75
73
## Number of time zones to show
76
## Number of time zones to show
74
my $toptz = 10;
77
my $toptz = 10;
75
78
76
###################### DATA STRUCTURES ######################
79
###################### DATA STRUCTURES ######################
77
my $group = $newsgroup_name;
80
my $group = $newsgroup_name;
78
$group =~ s!\.!/!g;
81
$group =~ s!\.!/!g;
79
my %data;          # name, count, agent, total, orig, quoted
82
my %data;    # name, count, agent, total, orig, quoted
80
my %threads;       # subject, count
83
my $totsize = 0;    # holds total sizes of all files
81
my %crossposts;    # group, count
84
my %crossposts;     # group, count
-
 
85
my %threads;        # subject, count
-
 
86
my $replies   = 0;  # total no. of replies
-
 
87
my $origposts = 0;  # total no. of original posts
82
my %tz;            # timezones by count
88
my %tz;             # timezones by count
83
my %headers;       # holds header of current article
-
 
84
my %lcheader;      # holds lowercase headers
-
 
85
my @body;          # holds body of current article
-
 
86
my @sig;           # holds sig text;
-
 
87
my $totalposts;    # total no. of posts considered
-
 
88
my $filename;      # name of current article file
-
 
89
my $filesize;      # size of current article file
-
 
90
my $earliest;      # earliest article we have found
89
my $earliest;       # earliest article we have found
91
my $latest;        # latest article we have found
90
my $latest;         # latest article we have found
92
my $poster;        # poster we are dealing with
-
 
93
my $totsize   = 0; # holds total sizes of all files
-
 
94
my $totheader = 0; # total size of header material
91
my $totheader = 0;  # total size of header material
95
my $totbody   = 0; # total size of body material
92
my $totbody   = 0;  # total size of body material
96
my $totsig    = 0; # total size of sig material
93
my $totsig    = 0;  # total size of sig material
97
my $totorig   = 0; # total size of original material
-
 
98
my $totquoted = 0; # total size of quoted material
94
my $totquoted = 0;  # total size of quoted material
99
my $origposts = 0; # total no. of original posts
95
my $totorig   = 0;  # total size of original material
100
my $replies   = 0; # total no. of replies
96
my $totalposts;     # total no. of posts considered
101
my $i;             # general purpose
-
 
102
my %distinct_agent;
97
my %distinct_agent;
103
98
104
## Used to hold counts of User Agents used
99
## Used to hold counts of User Agents used
105
my %agents = (
100
my %agents = (
106
  "Compuserver"               => 0,
101
  "Compuserver"               => 0,
Line 122... Line 117...
122
  "slrn"                      => 0,
117
  "slrn"                      => 0,
123
  "Sylpheed"                  => 0,
118
  "Sylpheed"                  => 0,
124
  "tin"                       => 0,
119
  "tin"                       => 0,
125
  "VSoup"                     => 0,
120
  "VSoup"                     => 0,
126
  "WebTV"                     => 0,
121
  "WebTV"                     => 0,
127
  "Xnews"                     => 0
122
  "Xnews"                     => 0,
128
);
123
);
129
124
-
 
125
my $datetime_parser = DateTime::Format::Mail->new();
-
 
126
$datetime_parser->loose();
-
 
127
-
 
128
my $today = DateTime->today( time_zone => 'UTC' );
-
 
129
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
-
 
130
my $start      = int $prev_month->strftime('%s');
-
 
131
my $numdays    = int DateTime->last_day_of_month(
-
 
132
  year      => $prev_month->year(),
-
 
133
  month     => $prev_month->month(),
-
 
134
  time_zone => $prev_month->time_zone(),
-
 
135
)->day();
130
######################## MAIN CODE ########################
136
my $end = int $today->clone()->set_day(1)->strftime('%s');
-
 
137
-
 
138
dmsg( $start, " to ", $end ) if DEBUG;
-
 
139
-
 
140
chdir("$news$group")
-
 
141
  or die __x(
-
 
142
  "Can't cd to {newsgroup}: {error}\n",
-
 
143
  newsgroup => "$news$group",
-
 
144
  error     => $!
-
 
145
  );
-
 
146
opendir( DIR, "." )
-
 
147
  or die __x(
-
 
148
  "Can't open {newsgroup}: {error}\n",
-
 
149
  newsgroup => "$news$group",
-
 
150
  error     => $!
131
$! = 1;
151
  );
132
152
133
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
-
 
134
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
-
 
135
while ( defined( $filename = readdir(DIR) ) )
153
while ( defined( my $filename = readdir(DIR) ) )
136
{
154
{
137
  %lcheader = ();
-
 
138
  next unless -f $filename;               # only want real files
155
  next unless -f $filename;    # only want real files
139
  next if ( $filename eq ".overview" );   # real articles only
156
  next if ( $filename eq ".overview" );    # real articles only
140
  next if ( -M $filename > $numdays );    # only want articles <= a certain age
-
 
141
  $earliest = ( stat $filename )[9] unless defined($earliest);
-
 
142
  $latest   = ( stat $filename )[9] unless defined($latest);
-
 
-
 
157
143
  &get_article($filename);                 # read in the article
158
  get_article($filename);                  # read in the article
144
  &get_data;                               # grab the data from the article
-
 
145
  $totalposts++;                          # bump count of articles considered
-
 
146
}
159
}
147
closedir(DIR);                            # finished with the directory
160
closedir(DIR);                             # finished with the directory
148
161
-
 
162
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
-
 
163
149
## Post-processing
164
## Post-processing
150
&count_agents;    # count agents, collapsing versions
165
count_agents();                            # count agents, collapsing versions
-
 
166
fix_percent();
-
 
167
-
 
168
write_data();
-
 
169
display_results();
-
 
170
-
 
171
########################################
-
 
172
## Get current article's header and body
-
 
173
########################################
-
 
174
sub get_article
-
 
175
{
-
 
176
  my $filename = shift;
-
 
177
-
 
178
  open( my $FILE, '<', $filename )
-
 
179
    or
-
 
180
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
-
 
181
  my $msg       = Mail::Message->read($FILE);
-
 
182
  my $timestamp = $msg->timestamp();
-
 
183
  my $date      = $msg->study('Date');
-
 
184
-
 
185
  ## Disregard article if timestamp is not in range
-
 
186
  dmsg($timestamp) if DEBUG;
-
 
187
  if ( $timestamp < $start or $timestamp >= $end )
-
 
188
  {
-
 
189
    dmsg("Posting on $date ignored.") if DEBUG;
-
 
190
    return;
-
 
191
  }
-
 
192
-
 
193
  $totalposts++;    # bump count of articles considered
-
 
194
-
 
195
  ## DEBUG
-
 
196
  dmsg($date) if DEBUG;
-
 
197
-
 
198
  ## get stats about the file itself
-
 
199
  my $filesize = -s $filename;    # get total size of file
-
 
200
  $totsize += $filesize;          # bump total sizes of all files
-
 
201
-
 
202
  if ( ( not defined $earliest ) or $timestamp < $earliest )
-
 
203
  {
-
 
204
    $earliest = $timestamp;
-
 
205
  }
-
 
206
  elsif ( ( not defined $latest ) or $timestamp > $latest )
-
 
207
  {
-
 
208
    $latest = $timestamp;
-
 
209
  }
-
 
210
-
 
211
  #print "timestamp: $timestamp\n";
-
 
212
-
 
213
  ## count header size
-
 
214
  $totheader += $msg->head()->size();
-
 
215
-
 
216
  ## get the poster's name (MIME-decoded, in UTF-8)
-
 
217
  my $poster = $msg->study('From');
-
 
218
  if ( defined $poster )
-
 
219
  {
-
 
220
    ## Convert old to new format
-
 
221
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
-
 
222
-
 
223
    ## Collapse whitespace
-
 
224
    $poster =~ s/\s+/ /g;
-
 
225
-
 
226
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
-
 
227
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
-
 
228
-
 
229
    ## DEBUG
-
 
230
    dmsg($poster) if DEBUG;
-
 
231
-
 
232
    ## seen this one before?
-
 
233
    if ( !defined( $data{$poster} ) )
-
 
234
    {
-
 
235
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
-
 
236
      $data{$poster}{'orig'}   = 0;
-
 
237
      $data{$poster}{'quoted'} = 0;
-
 
238
    }
-
 
239
    $data{$poster}{'count'}++;                    # bump count for this poster
-
 
240
    $data{$poster}{'size'} += $filesize;          # total size of file
-
 
241
-
 
242
    ## The User-Agent and/or X-Newsreader fields
-
 
243
    ## for User-Agent by poster
-
 
244
    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
-
 
245
    if ( defined $ua )
-
 
246
    {
-
 
247
      $data{$poster}{'agent'} = $ua;
-
 
248
-
 
249
      ## DEBUG
-
 
250
      dmsg($ua) if DEBUG;
-
 
251
    }
-
 
252
151
&fix_percent;     # check percentages orig/total for posters
253
    ## The User Agent for User-Agent by number of posts
-
 
254
    get_agent($msg);
-
 
255
-
 
256
    ## Get all cross-posted newsgroups
-
 
257
    for ( split( /,/, $msg->study('Newsgroups') ) )
-
 
258
    {
-
 
259
      $crossposts{$_}++;    # bump count for each
-
 
260
    }
-
 
261
-
 
262
    ## Get threads
-
 
263
    my $thread = $msg->study('Subject');
-
 
264
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
-
 
265
    $thread =~ s/\s+/ /g;       # collapse whitespace
-
 
266
    $threads{$thread}{'count'}++;    # bump count of this subject
-
 
267
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
-
 
268
-
 
269
    ## Is this an original post or a reply?
-
 
270
    if ( defined $msg->study('References') )
-
 
271
    {
-
 
272
      $replies++;
-
 
273
    }
-
 
274
    else
-
 
275
    {
-
 
276
      $origposts++;
-
 
277
    }
-
 
278
-
 
279
    ## Get the time zone
-
 
280
    my $datetime = $datetime_parser->parse_datetime($date);
-
 
281
    my $tz       = $datetime->strftime('%z');
-
 
282
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
-
 
283
    $tz{$tz}++;
-
 
284
-
 
285
    ## DEBUG
-
 
286
    dmsg($tz) if DEBUG;
-
 
287
-
 
288
#### Now analyse the body text ####
-
 
289
    my $body = $msg->body();
-
 
290
-
 
291
    my $insig = 0;
-
 
292
    my @body  = $body->lines;
-
 
293
    for (@body)
-
 
294
    {
-
 
295
      $totbody += length($_);    # bump total body size
-
 
296
      next if (m{^$>}o);         # don't count blank lines in body
-
 
297
      if ( $insig == 1 )
-
 
298
      {
-
 
299
-
 
300
        # bump total sig size
-
 
301
        $totsig += length($_);
-
 
302
      }
-
 
303
      ## are we in a quote line?
-
 
304
      ## Bill Unruh uses ] quotes, and another poster uses ::
-
 
305
      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
-
 
306
      {
-
 
307
        ## bump count of quoted chrs
-
 
308
        $data{$poster}{'quoted'} += length($_);
-
 
309
        $totquoted += length($_);
-
 
310
      }
-
 
311
      elsif (/^-- $/)
-
 
312
      {
-
 
313
        $insig = 1;
-
 
314
      }
-
 
315
      else
-
 
316
      {
-
 
317
        ## We must be processing an original line
-
 
318
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
-
 
319
        $totorig += length($_);
-
 
320
      }
-
 
321
    }
-
 
322
-
 
323
    # end for (@body)
-
 
324
  }
-
 
325
-
 
326
  close($FILE);
-
 
327
}
-
 
328
-
 
329
sub get_agent
-
 
330
{
-
 
331
  my $msg = shift;
-
 
332
-
 
333
  my $ua =
-
 
334
       $msg->study('User-Agent')
-
 
335
    or $msg->study('X-Newsreader')
-
 
336
    or $msg->study('X-Mailer');
-
 
337
  if ( not defined $ua )
-
 
338
  {
-
 
339
    my $org = $msg->study('Organization');
-
 
340
    if ( defined $org
-
 
341
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
-
 
342
    {
-
 
343
      $ua = $org;
-
 
344
    }
-
 
345
    elsif ( $msg->study('Message-ID') =~ /pine/i )
-
 
346
    {
-
 
347
      $ua = "Pine";
-
 
348
    }
-
 
349
  }
-
 
350
-
 
351
  ## Hopefully found UA, else set to unknown
-
 
352
  if ( not defined $ua )
-
 
353
  {
-
 
354
    $ua = __ "unknown";
-
 
355
  }
152
356
153
&write_data;
357
  $ua = clean($ua);
154
358
-
 
359
  my $raw   = $ua;
-
 
360
  my $agent = $raw;
-
 
361
-
 
362
  ## strip http
-
 
363
  if ( $raw =~ /.*http.*/ )
-
 
364
  {
-
 
365
    $raw =~ s!posted via!!i;
-
 
366
    $raw =~ s!http://!!g;
-
 
367
    $raw =~ s!/!!g;
-
 
368
    $raw =~ s! !!g;
-
 
369
  }
-
 
370
-
 
371
  ## Fix Outlook from Mac
-
 
372
  if ( $raw =~ /^microsoft/i )
-
 
373
  {
-
 
374
    $raw =~ s/-/ /g;
-
 
375
  }
-
 
376
-
 
377
  ## Pick out the popular agents
-
 
378
  if ( $raw =~ /(outlook express)/i
-
 
379
    || $raw =~ /(windows mail)/i
-
 
380
    || $raw =~ /(microplanet gravity)/i
-
 
381
    || $raw =~ /(news rover)/i
-
 
382
    || $raw =~ /(forte agent)/i
-
 
383
    || $raw =~ /(forte free agent)/i )
-
 
384
  {
-
 
385
    $agent = $1;
-
 
386
  }
-
 
387
  elsif (
-
 
388
    $raw =~ /^(
-
 
389
        pan
-
 
390
       |sylpheed
-
 
391
       |slrn
-
 
392
       |mozilla
-
 
393
       |knode
-
 
394
       |tin
-
 
395
       |hamster
-
 
396
       |xrn
-
 
397
       |xnews
-
 
398
       |aol
-
 
399
       |gnus
-
 
400
       |krn
-
 
401
       |macsoup
-
 
402
       |messenger
-
 
403
       |openxp
-
 
404
       |pine
-
 
405
       |thoth
-
 
406
       |turnpike
-
 
407
       |winvn
-
 
408
       |vsoup
-
 
409
       |google
-
 
410
       |supernews
-
 
411
       |nn
-
 
412
       |rn
-
 
413
       |007
-
 
414
       |webtv
-
 
415
       |compuserve
-
 
416
       )/ix
-
 
417
    )
-
 
418
  {
-
 
419
    $agent = $1;
-
 
420
  }
-
 
421
  else
-
 
422
  {
-
 
423
    ## Clean up unknown agents
-
 
424
    if ( $raw =~ m!^(.*?)/! )
-
 
425
    {
-
 
426
      $agent = $1;
-
 
427
    }
-
 
428
    elsif ( $raw =~ /^(\w*)\d.*/ )
-
 
429
    {
-
 
430
      $agent = $1;
-
 
431
    }
-
 
432
  }
-
 
433
-
 
434
  $distinct_agent{$agent}++;
-
 
435
  return $agent;
-
 
436
}
-
 
437
## get_agent
-
 
438
-
 
439
#########################################
-
 
440
## Count the User-Agents used, collapsing
-
 
441
## different versions into one per agent.
-
 
442
#########################################
-
 
443
sub count_agents
-
 
444
{
-
 
445
POSTER:
-
 
446
  foreach my $poster ( keys %data )
-
 
447
  {
-
 
448
    foreach my $agent_name ( keys %distinct_agent )
-
 
449
    {    # check against known ones
-
 
450
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
-
 
451
      {
-
 
452
        $agents{$agent_name}++;
-
 
453
        next POSTER;
-
 
454
      }
-
 
455
    }
-
 
456
    $agents{ $data{$poster}{'agent'} }++;
-
 
457
  }
-
 
458
}    # count_agents
-
 
459
-
 
460
#############################################
-
 
461
## Set orig/total percentages for all posters
-
 
462
#############################################
-
 
463
sub fix_percent
-
 
464
{
-
 
465
  foreach my $poster ( keys %data )
-
 
466
  {
-
 
467
    my $percent = 100;
-
 
468
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
-
 
469
    {
-
 
470
      $percent =
-
 
471
        $data{$poster}{'orig'} * 100 /
-
 
472
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
-
 
473
    }
-
 
474
    elsif ( $data{$poster}{'orig'} == 0 )
-
 
475
    {
-
 
476
      $percent = 0;
-
 
477
    }
-
 
478
    $data{$poster}{'percent'} = $percent;
-
 
479
  }
-
 
480
}
-
 
481
## fix_percent
-
 
482
-
 
483
##################################
-
 
484
## Write data structures to a file
-
 
485
##################################
-
 
486
sub write_data
-
 
487
{
-
 
488
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
-
 
489
    or die __x( "Can't create XDATA: {error}\n", error => $! );
-
 
490
  print $OUTF "Data collected from $newsgroup_name\n\n";
-
 
491
  print $OUTF
-
 
492
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
-
 
493
  foreach my $name ( keys %data )
-
 
494
  {
-
 
495
    print $OUTF
-
 
496
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
-
 
497
  }
-
 
498
  print $OUTF
-
 
499
"============================================================================\n";
-
 
500
  print $OUTF "Thread subjects\n";
-
 
501
  print $OUTF
-
 
502
"----------------------------------------------------------------------------\n";
-
 
503
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
-
 
504
  {
-
 
505
    print $OUTF
-
 
506
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
-
 
507
  }
-
 
508
  print $OUTF
-
 
509
"============================================================================\n";
-
 
510
  print $OUTF "Cross-posts\n";
-
 
511
  print $OUTF
-
 
512
"----------------------------------------------------------------------------\n";
-
 
513
  foreach my $name ( sort keys %crossposts )
-
 
514
  {
-
 
515
    print $OUTF "$name : $crossposts{$name}\n";
-
 
516
  }
-
 
517
  print $OUTF
-
 
518
"============================================================================\n";
-
 
519
  print $OUTF "User agents\n";
-
 
520
  print $OUTF
-
 
521
"----------------------------------------------------------------------------\n";
-
 
522
  foreach my $name ( sort keys %agents )
-
 
523
  {
-
 
524
    print $OUTF "$name : $agents{$name}\n";
-
 
525
  }
-
 
526
  print $OUTF
-
 
527
"============================================================================\n";
-
 
528
  print $OUTF "Time zones\n";
-
 
529
  print $OUTF
-
 
530
"----------------------------------------------------------------------------\n";
-
 
531
  foreach my $name ( sort keys %tz )
-
 
532
  {
-
 
533
    print $OUTF "$name : $tz{$name}\n";
-
 
534
  }
-
 
535
  close $OUTF;
-
 
536
}    # write_data
-
 
537
-
 
538
sub display_results
-
 
539
{
155
#################### DISPLAY RESULTS #####################
540
  #################### DISPLAY RESULTS #####################
156
print "=" x 76, "\n";
541
  print "=" x 76, "\n";
-
 
542
  printf "%s\n",
-
 
543
    centred(
157
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
544
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
-
 
545
    76 );
158
print "=" x 76, "\n";
546
  print "=" x 76, "\n";
159
printf "%s\n",
547
  printf "%s\n",
-
 
548
    centred(
-
 
549
    __
160
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
550
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)",
-
 
551
    76
-
 
552
    );
161
print "\n\n";
553
  print "\n\n";
162
printf "Total posts considered: %s over %d days\n", commify($totalposts),
554
  printf __"Total posts considered: %s over %d days" . "\n",
-
 
555
    commify($totalposts),
163
  $numdays;
556
    $numdays;
-
 
557
  my $time_locale = setlocale(LC_TIME);
-
 
558
  my $earliest_datetime = DateTime->from_epoch(
-
 
559
    epoch => $earliest,
-
 
560
    locale => $time_locale,
-
 
561
    time_zone => 'UTC',
-
 
562
  );
-
 
563
  my $latest_datetime = DateTime->from_epoch(
-
 
564
    epoch => $latest,
-
 
565
    locale => $time_locale,
-
 
566
    time_zone => 'UTC',
-
 
567
  );
-
 
568
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
164
printf "Earliest article: %s\n",               scalar localtime($earliest);
569
  printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format);
165
printf "Latest article:   %s\n",               scalar localtime($latest);
570
  printf __"Latest article" . ":   %s\n", $latest_datetime->strftime($datetime_format);
166
printf "Original articles: %s, replies: %s\n", commify($origposts),
571
  printf __"Original articles: %s; replies" . ": %s\n",
-
 
572
    commify($origposts),
167
  commify($replies);
573
    commify($replies);
168
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
574
  printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n",
169
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
575
    commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576;  #
-
 
576
  printf __
170
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
577
    "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
171
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
578
    commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
172
  commify( int( $totsize / $totalposts ) );
579
    commify( int( $totsize / $totalposts ) );
173
my $count = keys %data;
580
  my $count = keys %data;
174
printf "Total headers: %s KiB  bodies: %s KiB\n",
581
  printf __"Total headers: %s KiB; bodies: %s KiB\n",
175
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
582
    commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
-
 
583
  printf __
176
printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
584
    "Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n",
177
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
585
    commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
178
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
586
    ( $totorig * 100 ) / ( $totorig + $totquoted ),
179
  commify( int( $totsig / 1024 ) );
587
    commify( int( $totsig / 1024 ) );
180
printf "Total number of posters: %s, average %s bytes per poster\n",
588
  printf __"Total number of posters: %s, average %s bytes per poster\n",
181
  commify($count), commify( int( $totsize / $count ) );     #/
589
    commify($count), commify( int( $totsize / $count ) );    #/
182
$count = keys %threads;
590
  $count = keys %threads;
183
printf "Total number of threads: %s, average %s bytes per thread\n",
591
  printf __"Total number of threads: %s, average %s bytes per thread\n",
184
  commify($count), commify( int( $totsize / $count ) );     #/
592
    commify($count), commify( int( $totsize / $count ) );    #/
185
printf "Total number of user agents: %d\n", scalar keys %agents;
593
  printf __"Total number of user agents: %d\n", scalar keys %agents;
186
print "\n", "=" x 76, "\n";
594
  print "\n", "=" x 76, "\n";
187
595
188
########################################
596
  ########################################
189
## Show posters by article count  Sec 1;
597
  ## Show posters by article count  Sec 1;
190
########################################
598
  ########################################
Line 196... Line 604...
196
  }
604
    }
197
  else
605
    else
198
  {
606
    {
199
    $count = $topposters;
607
      $count = $topposters;
200
  }
608
    }
-
 
609
    printf "%s\n",
-
 
610
      centred(
201
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
611
      __x( "Top {count} posters by number of articles", count => $topposters ), 76 );
202
  print "=" x 76, "\n";
612
    print "=" x 76, "\n";
203
  $i = 0;
613
    my $i = 0;
204
  foreach
614
    foreach
205
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
615
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
206
  {
616
    {
207
    my $name = substr( $poster, 0, 65 );
617
      my $name = substr( $poster, 0, 65 );
208
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
618
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
Line 223... Line 633...
223
  }
633
    }
224
  else
634
    else
225
  {
635
    {
226
    $count = $topposters;
636
      $count = $topposters;
227
  }
637
    }
-
 
638
    printf "%s\n",
-
 
639
      centred(
228
  printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
640
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
-
 
641
      76 );
229
  print "=" x 76, "\n";
642
    print "=" x 76, "\n";
230
  $i = 0;
643
    my $i = 0;
-
 
644
    foreach
231
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
645
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
232
  {
646
    {
233
    my $name = substr( $poster, 0, 62 );
647
      my $name = substr( $poster, 0, 62 );
234
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
648
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
235
      $data{$poster}{size} / 1024;    #/
649
        $data{$poster}{size} / 1024;    #/
236
    last if ( ++$i == $count );
650
      last if ( ++$i == $count );
Line 250... Line 664...
250
  else
664
    else
251
  {
665
    {
252
    $count = $topposters;
666
      $count = $topposters;
253
  }
667
    }
254
  printf "%s\n",
668
    printf "%s\n",
-
 
669
      centred(
-
 
670
      __x(
255
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
671
        "Top {count} responders by original text (> 5 posts)",
-
 
672
        count => $topposters
-
 
673
      ),
-
 
674
      76
-
 
675
      );
256
  print "=" x 76, "\n";
676
    print "=" x 76, "\n";
257
  $i = 0;
677
    my $i = 0;
258
  foreach my $poster (
678
    foreach my $poster (
259
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
679
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
260
    keys %data
680
      keys %data
261
    )
681
      )
262
  {
682
    {
Line 282... Line 702...
282
  else
702
    else
283
  {
703
    {
284
    $count = $topposters;
704
      $count = $topposters;
285
  }
705
    }
286
  printf "%s\n",
706
    printf "%s\n",
-
 
707
      centred(
-
 
708
      __x(
287
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
709
        "Bottom {count} responders by original text  (> 5 posts)",
-
 
710
        count => $topposters
-
 
711
      ),
-
 
712
      76
-
 
713
      );
288
  print "=" x 76, "\n";
714
    print "=" x 76, "\n";
289
  $i = 0;
715
    my $i = 0;
290
  foreach my $poster (
716
    foreach my $poster (
291
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
717
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
292
    keys %data
718
      keys %data
293
    )
719
      )
294
  {
720
    {
Line 313... Line 739...
313
  }
739
    }
314
  else
740
    else
315
  {
741
    {
316
    $count = $topthreads;
742
      $count = $topthreads;
317
  }
743
    }
-
 
744
    printf "%s\n",
318
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
745
      centred( __x( "Top {count} threads by no. of articles", count => $topthreads ),
-
 
746
      76 );
319
  print "=" x 76, "\n";
747
    print "=" x 76, "\n";
320
  $i = 0;
748
    my $i = 0;
321
  foreach my $thread (
749
    foreach my $thread (
322
    sort { $threads{$b}{count} <=> $threads{$a}{count} }
750
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
323
    keys %threads
751
      keys %threads
324
    )
752
      )
325
  {
753
    {
326
    my $name = substr( $thread, 0, 65 );
754
      my $name = substr( $thread, 0, 65 );
327
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
755
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
328
      $threads{$thread}{count};
756
        $threads{$thread}{'count'};
329
    last if ( ++$i == $count );
757
      last if ( ++$i == $count );
330
  }
758
    }
331
  print "\n", "=" x 76, "\n";
759
    print "\n", "=" x 76, "\n";
332
}
760
  }
333
761
Line 342... Line 770...
342
  }
770
    }
343
  else
771
    else
344
  {
772
    {
345
    $count = $topthreads;
773
      $count = $topthreads;
346
  }
774
    }
-
 
775
    printf "%s\n",
347
  printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
776
      centred( __x( "Top {count} threads by size in KiB", count => $topthreads ),
-
 
777
      76 );
348
  print "=" x 76, "\n";
778
    print "=" x 76, "\n";
349
  $i = 0;
779
    my $i = 0;
350
  foreach my $thread (
780
    foreach my $thread (
351
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
781
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
352
    keys %threads
782
      keys %threads
353
    )
783
      )
354
  {
784
    {
355
    my $name = substr( $thread, 0, 65 );
785
      my $name = substr( $thread, 0, 65 );
356
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
786
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
357
      $threads{$thread}{size} / 1024;    #/
787
        $threads{$thread}{'size'} / 1024;    #/
358
    last if ( ++$i == $count );
788
      last if ( ++$i == $count );
359
  }
789
    }
360
  print "\n", "=" x 76, "\n";
790
    print "\n", "=" x 76, "\n";
361
}
791
  }
362
792
Line 372... Line 802...
372
  }
802
    }
373
  else
803
    else
374
  {
804
    {
375
    $count = $topcrossposts;
805
      $count = $topcrossposts;
376
  }
806
    }
-
 
807
    printf "%s\n",
377
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
808
      centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
378
  print "=" x 76, "\n";
809
    print "=" x 76, "\n";
379
  $i = 0;
810
    my $i = 0;
380
  foreach
811
    foreach
381
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
812
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
382
  {
813
    {
383
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
814
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
384
      $crossposts{$name};
815
        $crossposts{$name};
Line 398... Line 829...
398
  }
829
    }
399
  else
830
    else
400
  {
831
    {
401
    $count = $topagents;
832
      $count = $topagents;
402
  }
833
    }
-
 
834
    printf "%s\n",
403
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
835
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
-
 
836
      76 );
404
  print "=" x 76, "\n";
837
    print "=" x 76, "\n";
405
  $i = 0;
838
    my $i = 0;
406
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
839
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
407
  {
840
    {
408
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
841
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
409
      $agents{$agent};
842
        $agents{$agent};
410
    last if ( ++$i == $count );
843
      last if ( ++$i == $count );
Line 423... Line 856...
423
  }
856
    }
424
  else
857
    else
425
  {
858
    {
426
    $count = $topagents;
859
      $count = $topagents;
427
  }
860
    }
-
 
861
    printf "%s\n",
-
 
862
      centred(
428
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
863
      __x( "Top {count} user agents by number of posts", count => $topagents ),
-
 
864
      76 );
429
  print "=" x 76, "\n";
865
    print "=" x 76, "\n";
430
  $i = 0;
866
    my $i = 0;
431
  foreach my $agent (
867
    foreach my $agent (
432
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
868
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
433
    keys %distinct_agent
869
      keys %distinct_agent
434
    )
870
      )
435
  {
871
    {
Line 452... Line 888...
452
  }
888
    }
453
  else
889
    else
454
  {
890
    {
455
    $count = $toptz;
891
      $count = $toptz;
456
  }
892
    }
457
  printf "%s\n", &centred( "Top 10 time zones", 76 );
893
    printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 );
458
  print "=" x 76, "\n";
894
    print "=" x 76, "\n";
459
  $i = 0;
895
    my $i = 0;
460
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
896
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
461
  {
897
    {
462
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
898
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
463
    last if ( ++$i == $count );
899
      last if ( ++$i == $count );
464
  }
900
    }
465
  print "\n", "=" x 76, "\n";
901
    print "\n", "=" x 76, "\n";
466
}
902
  }
467
-
 
468
################################ SUBROUTINES ################################
-
 
469
-
 
470
########################################
-
 
471
## Get current article's header and body
-
 
472
########################################
-
 
473
sub get_article
-
 
474
{
-
 
475
  %headers = ();    # dump old headers
-
 
476
  my $filename = shift;    # get the name of the file
-
 
477
-
 
478
  ## get stats about the file itself
-
 
479
  $filesize = -s $filename;    # get total size of file
-
 
480
  $totsize += $filesize;       # bump total sizes of all files
-
 
481
-
 
482
  my $mtime = ( stat $filename )[9];
-
 
483
  if ( $mtime < $earliest )
-
 
484
  {
-
 
485
    $earliest = $mtime;
-
 
486
  }
-
 
487
  elsif ( $mtime > $latest )
-
 
488
  {
-
 
489
    $latest = $mtime;
-
 
490
  }
-
 
491
-
 
492
  ## now read the file
-
 
493
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
-
 
494
  while (<$FILE>)
-
 
495
  {
-
 
496
    $totheader += length($_);    # bump total header size
-
 
497
    last if (/^\s*$/);           # end of header?
-
 
498
    if (/^([^:\s]*):\s*(.*)/)
-
 
499
    {
-
 
500
      my ( $key, $val ) = ( $1, $2 );
-
 
501
      $headers{$key} = decode( 'MIME-Header', $val );
-
 
502
      $lcheader{ clean( lc($key) ) } = clean($val);
-
 
503
    }
-
 
504
  }
-
 
505
  @body = <$FILE>;               # slurp up body
-
 
506
  close($FILE);
-
 
507
}    # get_article
-
 
508
-
 
509
####################################
-
 
510
## Get data from the current article
-
 
511
####################################
-
 
512
sub get_data
-
 
513
{
-
 
514
#### First, analyse header fields ####
-
 
515
-
 
516
  ## Set up this poster if not defined, get counts, sizes
-
 
517
  my $poster = $headers{From};    # get the poster's name
-
 
518
 
-
 
519
  # Convert old to new format
-
 
520
  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
-
 
521
 
-
 
522
  # Collapse whitespace
-
 
523
  $poster =~ s/\s+/ /;
-
 
524
 
-
 
525
  # Remove outer quotes
-
 
526
  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
-
 
527
 
-
 
528
  if ( !defined( $data{$poster} ) )
-
 
529
  {                               # seen this one before?
-
 
530
    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
-
 
531
    $data{$poster}{orig}   = 0;
-
 
532
    $data{$poster}{quoted} = 0;
-
 
533
  }
-
 
534
  $data{$poster}{count}++;                 # bump count for this poster
-
 
535
  $data{$poster}{size} += $filesize;       # total size of file
-
 
536
-
 
537
  ## The User-Agent and/or X-Newsreader fields
-
 
538
  ## for User-Agent by poster
-
 
539
  if ( defined $lcheader{"user-agent"} )
-
 
540
  {
-
 
541
    $data{$poster}{agent} = $lcheader{"user-agent"};
-
 
542
  }
-
 
543
  if ( defined $lcheader{"x-newsreader"} )
-
 
544
  {
-
 
545
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
-
 
546
  }
-
 
547
-
 
548
  ## The User Agent for User-Agent by number of posts
-
 
549
  my $UA = "unknown";
-
 
550
  foreach my $keys ( keys %lcheader )
-
 
551
  {
-
 
552
    if ( defined $lcheader{'user-agent'} )
-
 
553
    {
-
 
554
      $UA = $lcheader{'user-agent'};
-
 
555
    }
-
 
556
    elsif ( defined $lcheader{"x-newsreader"} )
-
 
557
    {
-
 
558
      $UA = $lcheader{"x-newsreader"};
-
 
559
    }
-
 
560
    elsif ( defined $lcheader{'x-mailer'} )
-
 
561
    {
-
 
562
      $UA = $lcheader{'x-mailer'};
-
 
563
    }
-
 
564
    elsif (
-
 
565
      ( defined $lcheader{'organization'} )
-
 
566
      && ( $lcheader{'organization'} =~
-
 
567
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
-
 
568
      )
-
 
569
    {
-
 
570
      $UA = $lcheader{'organization'};
-
 
571
    }
-
 
572
    elsif ( $lcheader{'message-id'} =~ /pine/i )
-
 
573
    {
-
 
574
      $UA = "Pine";
-
 
575
    }    ## Hopefully found UA, else set to unknown
-
 
576
  }
-
 
577
-
 
578
  $UA = clean($UA);
-
 
579
  $UA = get_agent($UA);
-
 
580
-
 
581
  sub get_agent
-
 
582
  {
-
 
583
    my $raw   = shift;
-
 
584
    my $agent = $raw;
-
 
585
-
 
586
    ## strip http
-
 
587
    if ( $raw =~ /.*http.*/ )
-
 
588
    {
-
 
589
      $raw =~ s!posted via!!i;
-
 
590
      $raw =~ s!http://!!g;
-
 
591
      $raw =~ s!/!!g;
-
 
592
      $raw =~ s! !!g;
-
 
593
    }
-
 
594
-
 
595
    ## Fix Outlook from Mac
-
 
596
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
-
 
597
-
 
598
    ## Pick out the popular agents
-
 
599
    if (
-
 
600
           $raw =~ /(outlook express)/i
-
 
601
        || $raw =~ /(windows mail)/i
-
 
602
        || $raw =~ /(microplanet gravity)/i
-
 
603
        || $raw =~ /(news rover)/i
-
 
604
        || $raw =~ /(forte agent)/i
-
 
605
        || $raw =~ /(forte free agent)/i
-
 
606
      )
-
 
607
    {
-
 
608
      $agent = $1;
-
 
609
    }
-
 
610
    elsif (
-
 
611
      $raw =~ /^(
-
 
612
        pan
-
 
613
       |sylpheed
-
 
614
       |slrn
-
 
615
       |mozilla
-
 
616
       |knode
-
 
617
       |tin
-
 
618
       |hamster
-
 
619
       |xrn
-
 
620
       |xnews
-
 
621
       |aol
-
 
622
       |gnus
-
 
623
       |krn
-
 
624
       |macsoup
-
 
625
       |messenger
-
 
626
       |openxp
-
 
627
       |pine
-
 
628
       |thoth
-
 
629
       |turnpike
-
 
630
       |winvn
-
 
631
       |vsoup
-
 
632
       |google
-
 
633
       |supernews
-
 
634
       |nn
-
 
635
       |rn
-
 
636
       |007
-
 
637
       |webtv
-
 
638
       |compuserve
-
 
639
       )/ix
-
 
640
      )
-
 
641
    {
-
 
642
      $agent = $1;
-
 
643
    }
-
 
644
    else
-
 
645
    {
-
 
646
      ## Clean up unknown agents
-
 
647
      if ( $raw =~ m!^(.*?)/! )
-
 
648
      {
-
 
649
        $agent = $1;
-
 
650
      }
-
 
651
      elsif ( $raw =~ /^(\w*)\d.*/ )
-
 
652
      {
-
 
653
        $agent = $1;
-
 
654
      }
-
 
655
    }
-
 
656
-
 
657
    $distinct_agent{$agent}++;
-
 
658
    return $agent;
-
 
659
  }
-
 
660
-
 
661
  ## Get all cross-posted newsgroups
-
 
662
  for ( split /,/, $headers{"Newsgroups"} )
-
 
663
  {
-
 
664
    $crossposts{$_}++;    # bump count for each
-
 
665
  }
-
 
666
-
 
667
  ## Get threads
-
 
668
  my $thread = $headers{"Subject"};
-
 
669
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
-
 
670
  $thread =~ s/\s+/ /g;     # collapse whitespace
-
 
671
  $threads{$thread}{count} += 1;            # bump count of this subject
-
 
672
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
-
 
673
-
 
674
  ## Is this an original post or a reply?
-
 
675
  if ( defined $headers{"References"} )
-
 
676
  {
-
 
677
    $replies++;
-
 
678
  }
-
 
679
  else
-
 
680
  {
-
 
681
    $origposts++;
-
 
682
  }
-
 
683
-
 
684
  ## Get the time zone
-
 
685
  $_ = $headers{"Date"};
-
 
686
  my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
-
 
687
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
-
 
688
  {
-
 
689
    $tz = "UTC";
-
 
690
  }
-
 
691
  $tz{$tz}++;
-
 
692
-
 
693
#### Now analyse the body text ####
-
 
694
  my $insig = 0;
-
 
695
  for (@body)
-
 
696
  {
-
 
697
    $totbody += length($_);    # bump total body size
-
 
698
    next if (/^$>/);           # don't count blank lines in body
-
 
699
    if ( $insig == 1 )
-
 
700
    {
-
 
701
      $totsig += length($_);    # bump total sig size
-
 
702
-
 
703
      ## Bill Unruh uses ] quotes, and another poster uses ::
-
 
704
    }
-
 
705
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
-
 
706
    {                           # are we in a quote line?
-
 
707
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
-
 
708
      $totquoted += length($_);
-
 
709
    }
-
 
710
    elsif (/-- /)
-
 
711
    {
-
 
712
      $insig = 1;
-
 
713
    }
-
 
714
    else
-
 
715
    {
-
 
716
-
 
717
      ## We must be processing an original line
-
 
718
      $data{$poster}{orig} += length($_);      # bump count of original chrs
-
 
719
      $totorig += length($_);
-
 
720
    }
903
}
721
  }    # end for (@body)
-
 
722
904
723
}    # get_data
-
 
724
-
 
725
#########################################
-
 
726
## Count the User-Agents used, collapsing
-
 
727
## different versions into one per agent.
-
 
728
#########################################
-
 
729
sub count_agents
-
 
730
{
-
 
731
POSTER:
-
 
732
  foreach my $poster ( keys %data )
-
 
733
  {
-
 
734
    foreach my $agent_name ( keys %distinct_agent )
-
 
735
    {    # check against known ones
-
 
736
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
-
 
737
      {
-
 
738
        $agents{$agent_name}++;
-
 
739
        next POSTER;
-
 
740
      }
-
 
741
    }
-
 
742
    $agents{ $data{$poster}{agent} }++;
-
 
743
  }
-
 
744
}    # count_agents
-
 
745
-
 
746
#############################################
-
 
747
## Set orig/total percentages for all posters
-
 
748
#############################################
-
 
749
sub fix_percent
-
 
750
{
-
 
751
  foreach my $poster ( keys %data )
-
 
752
  {
-
 
753
    my $percent = 100;
-
 
754
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
-
 
755
    {
-
 
756
      $percent =
905
## helper subs
757
        $data{$poster}{orig} * 100 /
-
 
758
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
-
 
759
    }
-
 
760
    elsif ( $data{$poster}{orig} == 0 )
-
 
761
    {
-
 
762
      $percent = 0;
-
 
763
    }
-
 
764
    $data{$poster}{percent} = $percent;
-
 
765
  }
-
 
766
}
-
 
767
906
768
###############################
907
###############################
769
## Right pad a string with '.'s
908
## Right pad a string with '.'s
770
###############################
909
###############################
771
sub rpad
910
sub rpad
772
{
911
{
773
  ## Get text to pad, length to pad, pad chr
912
  ## Get text to pad, length to pad, pad chr
774
  my ( $text, $pad_len, $pad_chr ) = @_;
913
  my ( $text, $pad_len, $pad_chr ) = @_;
775
914
776
  ## DEBUG
915
  ## DEBUG
777
#printf "|%s| = %d\n", $text, length($text);
916
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
778
917
779
  if ( length($text) > $pad_len )
918
  if ( length($text) > $pad_len )
780
  {
919
  {
781
    $text = substr( $text, 0, $pad_len );
920
    $text = substr( $text, 0, $pad_len );
782
  }
921
  }
Line 796... Line 935...
796
}
935
}
797
936
798
###########################
937
###########################
799
## Put commas into a number
938
## Put commas into a number
800
###########################
939
###########################
-
 
940
# Get some of locale's numeric formatting parameters
-
 
941
my ($thousands_sep, $grouping) =
-
 
942
        @{localeconv()}{'thousands_sep', 'grouping'};
-
 
943
# Apply defaults if values are missing
-
 
944
$thousands_sep = ',' unless $thousands_sep;
-
 
945
# grouping and mon_grouping are packed lists
-
 
946
# of small integers (characters) telling the
-
 
947
# grouping (thousand_seps and mon_thousand_seps
-
 
948
# being the group dividers) of numbers and
-
 
949
# monetary quantities.  The integers' meanings:
-
 
950
# 255 means no more grouping, 0 means repeat
-
 
951
# the previous grouping, 1-254 means use that
-
 
952
# as the current grouping.  Grouping goes from
-
 
953
# right to left (low to high digits).  In the
-
 
954
# below we cheat slightly by never using anything
-
 
955
# else than the first grouping (whatever that is).
-
 
956
my @grouping;
-
 
957
if ($grouping) {
-
 
958
    @grouping = unpack("C*", $grouping);
-
 
959
} else {
-
 
960
    @grouping = (3);
-
 
961
}
-
 
962
801
sub commify
963
sub commify
802
{
964
{
803
  local $_ = shift;
965
  local $_ = shift;
804
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
966
  #1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
-
 
967
  $_ = int;    # Chop non-integer part
-
 
968
  1 while
-
 
969
  s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/;
805
  return $_;
970
  return $_;
806
}
971
}
807
972
808
################################################################
973
################################################################
809
## Returns a string with leading and trailing whitespace removed
974
## Returns a string with leading and trailing whitespace removed
Line 817... Line 982...
817
  return $clean;
982
  return $clean;
818
}
983
}
819
984
820
sub usage
985
sub usage
821
{
986
{
822
  print "usage: newstat.pl newsgroupname\n";
987
  print __"usage: newsstat.pl NEWS.GROUP\n";
823
  exit 1;
988
  exit 1;
824
}
989
}
825
990
826
##################################
-
 
827
## Write data structures to a file
-
 
828
##################################
-
 
829
sub write_data
991
sub dmsg
830
{
992
{
831
  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
-
 
832
    or die "Can't create XDATA: $!\n";
993
  print STDERR @_, "\n";
833
  print $OUTF "Data collected from $newsgroup_name\n\n";
-
 
834
  print $OUTF
-
 
835
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
-
 
836
  foreach my $name ( keys %data )
-
 
837
  {
-
 
838
    print $OUTF
-
 
839
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
-
 
840
  }
994
}
841
  print $OUTF
-
 
842
"============================================================================\n";
-
 
843
  print $OUTF "Thread subjects\n";
-
 
844
  print $OUTF
-
 
845
"----------------------------------------------------------------------------\n";
-
 
846
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
-
 
847
  {
995
848
    print $OUTF
-
 
849
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
-
 
850
  }
-
 
851
  print $OUTF
-
 
852
"============================================================================\n";
-
 
853
  print $OUTF "Cross-posts\n";
-
 
854
  print $OUTF
-
 
855
"----------------------------------------------------------------------------\n";
-
 
856
  foreach my $name ( sort keys %crossposts )
-
 
857
  {
-
 
858
    print $OUTF "$name : $crossposts{$name}\n";
-
 
859
  }
-
 
860
  print $OUTF
-
 
861
"============================================================================\n";
-
 
862
  print $OUTF "User agents\n";
-
 
863
  print $OUTF
-
 
864
"----------------------------------------------------------------------------\n";
-
 
865
  foreach my $name ( sort keys %agents )
-
 
866
  {
-
 
867
    print $OUTF "$name : $agents{$name}\n";
-
 
868
  }
-
 
869
  print $OUTF
-
 
870
"============================================================================\n";
-
 
871
  print $OUTF "Time zones\n";
-
 
872
  print $OUTF
996
sub dmsg2
873
"----------------------------------------------------------------------------\n";
-
 
874
  foreach my $name ( sort keys %tz )
-
 
875
  {
997
{
-
 
998
  my ( $level, @msg ) = @_;
876
    print $OUTF "$name : $tz{$name}\n";
999
  print STDERR @msg, "\n" if $level >= DEBUG;
877
  }
1000
}
878
  close $OUTF;
-
 
879
}    # write_data
-