Subversion Repositories LCARS

Rev

Rev 11 | Rev 14 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
8 PointedEar 1
#!/usr/bin/env perl
5 PointedEar 2
use strict;
6 PointedEar 3
use warnings;
13 PointedEar 4
use diagnostics;
6 PointedEar 5
use utf8;
6
use Encode;
7
 
13 PointedEar 8
## Print out all text to STDOUT UTF-8 encoded
9
binmode STDOUT, ':encoding(UTF-8)';
5 PointedEar 10
 
13 PointedEar 11
############################
12
## newsstat.pl version 0.4.3
5 PointedEar 13
 
13 PointedEar 14
###########################################################################
15
## Collect statistics about a newsgroup (specified by first argument)
16
## in the local news spool. Check all articles in the last 30-day period.
17
## Rank posters by number of posts and by volume of posts, report on top
18
## and bottom 20 posters. Show their name, number of posts, size of posts,
19
## percentage of quoted lines. Rank user-agents used, by poster rather
20
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
21
##
22
## Numbers and paths can be configured below.  See ChangeLog and TODO
23
## for more.  -- PE
24
###########################################################################
5 PointedEar 25
 
26
###################### USER CONFIGURATIONS ############################
27
 
13 PointedEar 28
## The name of the group to do stats for
5 PointedEar 29
my $newsgroup_name = $ARGV[0];
6 PointedEar 30
$newsgroup_name or &usage;
5 PointedEar 31
 
13 PointedEar 32
## Check for removal flags
5 PointedEar 33
my $ix;
34
my $j;
35
my %skipSec;
36
my @skiplist;
37
my $args = @ARGV;
7 PointedEar 38
for ( $ix = 1 ; $ix < $args ; $ix++ )
39
{
40
  $j = $ix + 1;
41
  if ( $ARGV[$ix] eq "-x" )
42
  {
43
    @skiplist = split( ",", $ARGV[$j] );
44
  }
45
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
46
  {
47
    @skiplist = split( ",", $1 );
48
  }
5 PointedEar 49
}
7 PointedEar 50
foreach (@skiplist)
51
{
5 PointedEar 52
  $skipSec{$_} = 1;
53
}
54
 
13 PointedEar 55
## Leafnode users will want /var/spool/news for this variable.
5 PointedEar 56
my $news = "/var/spool/news/";
57
 
13 PointedEar 58
## How many days are we doing statistics for?
5 PointedEar 59
my $numdays = 30;
60
 
13 PointedEar 61
## Number of agents we list
5 PointedEar 62
my $topagents = 10;
63
 
13 PointedEar 64
## Number of threads we want to know about
5 PointedEar 65
my $topthreads = 20;
66
 
13 PointedEar 67
## Number of top or bottom posters to show
5 PointedEar 68
my $topposters = 20;
69
 
13 PointedEar 70
## Number of cross-posted threads to show
5 PointedEar 71
my $topcrossposts = 10;
72
 
13 PointedEar 73
## Number of time zones to show
5 PointedEar 74
my $toptz = 10;
75
 
76
###################### DATA STRUCTURES ######################
77
my $group = $newsgroup_name;
78
$group =~ s!\.!/!g;
7 PointedEar 79
my %data;          # name, count, agent, total, orig, quoted
80
my %threads;       # subject, count
81
my %crossposts;    # group, count
82
my %tz;            # timezones by count
83
my %headers;       # holds header of current article
84
my %lcheader;      # holds lowercase headers
85
my @body;          # holds body of current article
86
my @sig;           # holds sig text;
87
my $totalposts;    # total no. of posts considered
88
my $filename;      # name of current article file
89
my $filesize;      # size of current article file
90
my $earliest;      # earliest article we have found
91
my $latest;        # latest article we have found
92
my $poster;        # poster we are dealing with
93
my $totsize   = 0; # holds total sizes of all files
94
my $totheader = 0; # total size of header material
95
my $totbody   = 0; # total size of body material
96
my $totsig    = 0; # total size of sig material
97
my $totorig   = 0; # total size of original material
98
my $totquoted = 0; # total size of quoted material
99
my $origposts = 0; # total no. of original posts
100
my $replies   = 0; # total no. of replies
101
my $i;             # general purpose
5 PointedEar 102
my %distinct_agent;
13 PointedEar 103
 
104
## Used to hold counts of User Agents used
105
my %agents = (
106
  "Compuserver"               => 0,
107
  "Foorum"                    => 0,
7 PointedEar 108
  "Forte Agent"               => 0,
109
  "Forte Free Agent"          => 0,
13 PointedEar 110
  "Gnus"                      => 0,
111
  "KNode"                     => 0,
112
  "MacSOUP"                   => 0,
113
  "MT-NewsWatcher"            => 0,
7 PointedEar 114
  "MicroPlanet Gravity"       => 0,
115
  "Microsoft Outlook Express" => 0,
13 PointedEar 116
  "Microsoft Windows Mail"    => 0,
117
  "Mozilla"                   => 0,
118
  "News Rover"                => 0,
119
  "NN"                        => 0,
120
  "Pan"                       => 0,
121
  "rn"                        => 0,
7 PointedEar 122
  "slrn"                      => 0,
13 PointedEar 123
  "Sylpheed"                  => 0,
7 PointedEar 124
  "tin"                       => 0,
13 PointedEar 125
  "VSoup"                     => 0,
7 PointedEar 126
  "WebTV"                     => 0,
13 PointedEar 127
  "Xnews"                     => 0
128
);
5 PointedEar 129
 
130
######################## MAIN CODE ########################
131
$! = 1;
132
 
133
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
7 PointedEar 134
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
135
while ( defined( $filename = readdir(DIR) ) )
136
{
137
  %lcheader = ();
13 PointedEar 138
  next unless -f $filename;               # only want real files
139
  next if ( $filename eq ".overview" );   # real articles only
140
  next if ( -M $filename > $numdays );    # only want articles <= a certain age
7 PointedEar 141
  $earliest = ( stat $filename )[9] unless defined($earliest);
142
  $latest   = ( stat $filename )[9] unless defined($latest);
13 PointedEar 143
  &get_article($filename);                 # read in the article
144
  &get_data;                               # grab the data from the article
145
  $totalposts++;                          # bump count of articles considered
5 PointedEar 146
}
13 PointedEar 147
closedir(DIR);                            # finished with the directory
7 PointedEar 148
 
13 PointedEar 149
## Post-processing
150
&count_agents;    # count agents, collapsing versions
151
&fix_percent;     # check percentages orig/total for posters
5 PointedEar 152
 
13 PointedEar 153
&write_data;
5 PointedEar 154
 
155
#################### DISPLAY RESULTS #####################
156
print "=" x 76, "\n";
7 PointedEar 157
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
5 PointedEar 158
print "=" x 76, "\n";
7 PointedEar 159
printf "%s\n",
160
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
5 PointedEar 161
print "\n\n";
7 PointedEar 162
printf "Total posts considered: %s over %d days\n", commify($totalposts),
163
  $numdays;
164
printf "Earliest article: %s\n",               scalar localtime($earliest);
165
printf "Latest article:   %s\n",               scalar localtime($latest);
166
printf "Original articles: %s, replies: %s\n", commify($origposts),
167
  commify($replies);
13 PointedEar 168
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
7 PointedEar 169
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
13 PointedEar 170
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
7 PointedEar 171
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
172
  commify( int( $totsize / $totalposts ) );
5 PointedEar 173
my $count = keys %data;
13 PointedEar 174
printf "Total headers: %s KiB  bodies: %s KiB\n",
7 PointedEar 175
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
13 PointedEar 176
printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
7 PointedEar 177
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
178
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
179
  commify( int( $totsig / 1024 ) );
180
printf "Total number of posters: %s, average %s bytes per poster\n",
181
  commify($count), commify( int( $totsize / $count ) );     #/
5 PointedEar 182
$count = keys %threads;
7 PointedEar 183
printf "Total number of threads: %s, average %s bytes per thread\n",
184
  commify($count), commify( int( $totsize / $count ) );     #/
13 PointedEar 185
printf "Total number of user agents: %d\n", scalar keys %agents;
5 PointedEar 186
print "\n", "=" x 76, "\n";
187
 
13 PointedEar 188
########################################
189
## Show posters by article count  Sec 1;
190
########################################
7 PointedEar 191
unless ( $skipSec{1} )
192
{
193
  if ( keys %data < $topposters )
194
  {
195
    $count = keys %data;
196
  }
197
  else
198
  {
199
    $count = $topposters;
200
  }
201
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
202
  print "=" x 76, "\n";
203
  $i = 0;
9 PointedEar 204
  foreach
205
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
7 PointedEar 206
  {
207
    my $name = substr( $poster, 0, 65 );
208
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
209
      $data{$poster}{count};
210
    last if ( ++$i == $count );
211
  }
212
  print "\n", "=" x 76, "\n";
5 PointedEar 213
}
214
 
13 PointedEar 215
######################################
216
## Show posters by size in KiB  Sec 2;
217
######################################
7 PointedEar 218
unless ( $skipSec{2} )
219
{
220
  if ( keys %data < $topposters )
221
  {
5 PointedEar 222
    $count = keys %data;
7 PointedEar 223
  }
224
  else
225
  {
5 PointedEar 226
    $count = $topposters;
227
  }
13 PointedEar 228
  printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
5 PointedEar 229
  print "=" x 76, "\n";
230
  $i = 0;
8 PointedEar 231
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
7 PointedEar 232
  {
233
    my $name = substr( $poster, 0, 62 );
234
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
235
      $data{$poster}{size} / 1024;    #/
236
    last if ( ++$i == $count );
5 PointedEar 237
  }
238
  print "\n", "=" x 76, "\n";
239
}
240
 
13 PointedEar 241
#####################################
242
## Show top posters for original text
243
#####################################
7 PointedEar 244
unless ( $skipSec{3} )
245
{
246
  if ( keys %data < $topposters )
247
  {
248
    $count = keys %data;
249
  }
250
  else
251
  {
252
    $count = $topposters;
253
  }
254
  printf "%s\n",
255
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
256
  print "=" x 76, "\n";
257
  $i = 0;
9 PointedEar 258
  foreach my $poster (
259
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
260
    keys %data
261
    )
7 PointedEar 262
  {
263
    next if $data{$poster}{quoted} == 0;
264
    next if $data{$poster}{count} < 5;
265
    my $name = substr( $poster, 0, 63 );
266
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
267
      $data{$poster}{percent};
268
    last if ( ++$i == $count );
269
  }
270
  print "\n", "=" x 76, "\n";
5 PointedEar 271
}
272
 
13 PointedEar 273
########################################
274
## Show bottom posters for original text
275
########################################
7 PointedEar 276
unless ( $skipSec{4} )
277
{
278
  if ( keys %data < $topposters )
279
  {
5 PointedEar 280
    $count = keys %data;
7 PointedEar 281
  }
282
  else
283
  {
5 PointedEar 284
    $count = $topposters;
285
  }
7 PointedEar 286
  printf "%s\n",
287
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
5 PointedEar 288
  print "=" x 76, "\n";
289
  $i = 0;
9 PointedEar 290
  foreach my $poster (
291
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
292
    keys %data
293
    )
7 PointedEar 294
  {
5 PointedEar 295
    next if $data{$poster}{quoted} == 0;
296
    next if $data{$poster}{count} < 5;
7 PointedEar 297
    my $name = substr( $poster, 0, 63 );
298
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
299
      $data{$poster}{percent};
300
    last if ( ++$i == $count );
5 PointedEar 301
  }
302
  print "\n", "=" x 76, "\n";
303
}
304
 
13 PointedEar 305
#####################################
306
## Show threads by number of articles
307
#####################################
7 PointedEar 308
unless ( $skipSec{5} )
309
{
310
  if ( keys %threads < $topthreads )
311
  {
5 PointedEar 312
    $count = keys %threads;
7 PointedEar 313
  }
314
  else
315
  {
5 PointedEar 316
    $count = $topthreads;
317
  }
7 PointedEar 318
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
5 PointedEar 319
  print "=" x 76, "\n";
320
  $i = 0;
9 PointedEar 321
  foreach my $thread (
322
    sort { $threads{$b}{count} <=> $threads{$a}{count} }
323
    keys %threads
324
    )
7 PointedEar 325
  {
326
    my $name = substr( $thread, 0, 65 );
327
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
328
      $threads{$thread}{count};
329
    last if ( ++$i == $count );
5 PointedEar 330
  }
331
  print "\n", "=" x 76, "\n";
332
}
13 PointedEar 333
 
334
##############################
335
## Show threads by size in KiB
336
##############################
7 PointedEar 337
unless ( $skipSec{6} )
338
{
339
  if ( keys %threads < $topthreads )
340
  {
5 PointedEar 341
    $count = keys %threads;
7 PointedEar 342
  }
343
  else
344
  {
5 PointedEar 345
    $count = $topthreads;
346
  }
13 PointedEar 347
  printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
5 PointedEar 348
  print "=" x 76, "\n";
349
  $i = 0;
9 PointedEar 350
  foreach my $thread (
351
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
352
    keys %threads
353
    )
7 PointedEar 354
  {
355
    my $name = substr( $thread, 0, 65 );
356
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
357
      $threads{$thread}{size} / 1024;    #/
358
    last if ( ++$i == $count );
5 PointedEar 359
  }
360
  print "\n", "=" x 76, "\n";
361
}
362
 
13 PointedEar 363
##################################
364
## Show top 10 cross-posted groups
365
##################################
7 PointedEar 366
unless ( $skipSec{7} )
367
{
368
  delete $crossposts{"$newsgroup_name"};    # don't include ours
369
  if ( keys %crossposts < $topcrossposts )
370
  {
371
    $count = keys %crossposts;
372
  }
373
  else
374
  {
375
    $count = $topcrossposts;
376
  }
377
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
378
  print "=" x 76, "\n";
379
  $i = 0;
380
  foreach
381
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
382
  {
383
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
384
      $crossposts{$name};
385
    last if ( ++$i == $count );
386
  }
387
  print "\n", "=" x 76, "\n";
5 PointedEar 388
}
13 PointedEar 389
 
390
#########################
391
## Show agents and counts
392
#########################
7 PointedEar 393
unless ( $skipSec{8} )
394
{
395
  if ( keys %agents < $topagents )
396
  {
397
    $count = keys %agents;
398
  }
399
  else
400
  {
401
    $count = $topagents;
402
  }
403
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
404
  print "=" x 76, "\n";
405
  $i = 0;
406
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
407
  {
408
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
409
      $agents{$agent};
410
    last if ( ++$i == $count );
411
  }
412
  print "\n", "=" x 76, "\n";
5 PointedEar 413
}
414
 
415
#######################
13 PointedEar 416
## Show distinct agents
5 PointedEar 417
#######################
7 PointedEar 418
unless ( $skipSec{9} )
419
{
420
  if ( keys %distinct_agent < $topagents )
421
  {
422
    $count = keys %distinct_agent;
423
  }
424
  else
425
  {
426
    $count = $topagents;
427
  }
428
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
429
  print "=" x 76, "\n";
430
  $i = 0;
9 PointedEar 431
  foreach my $agent (
432
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
433
    keys %distinct_agent
434
    )
7 PointedEar 435
  {
436
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
437
      $distinct_agent{$agent},
438
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
439
    last if ( ++$i == $count );
440
  }
441
  print "\n", "=" x 76, "\n";
5 PointedEar 442
}
443
 
13 PointedEar 444
############################
445
## Show timezones and counts
446
############################
7 PointedEar 447
unless ( $skipSec{10} )
448
{
449
  if ( keys %tz < $toptz )
450
  {
451
    $count = keys %tz;
452
  }
453
  else
454
  {
455
    $count = $toptz;
456
  }
457
  printf "%s\n", &centred( "Top 10 time zones", 76 );
458
  print "=" x 76, "\n";
459
  $i = 0;
460
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
461
  {
462
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
463
    last if ( ++$i == $count );
464
  }
465
  print "\n", "=" x 76, "\n";
5 PointedEar 466
}
467
 
468
################################ SUBROUTINES ################################
469
 
13 PointedEar 470
########################################
471
## Get current article's header and body
472
########################################
473
sub get_article
7 PointedEar 474
{
475
  %headers = ();    # dump old headers
476
  my $filename = shift;    # get the name of the file
5 PointedEar 477
 
13 PointedEar 478
  ## get stats about the file itself
7 PointedEar 479
  $filesize = -s $filename;    # get total size of file
480
  $totsize += $filesize;       # bump total sizes of all files
481
 
482
  my $mtime = ( stat $filename )[9];
483
  if ( $mtime < $earliest )
484
  {
5 PointedEar 485
    $earliest = $mtime;
7 PointedEar 486
  }
487
  elsif ( $mtime > $latest )
488
  {
5 PointedEar 489
    $latest = $mtime;
7 PointedEar 490
  }
5 PointedEar 491
 
13 PointedEar 492
  ## now read the file
493
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
8 PointedEar 494
  while (<$FILE>)
7 PointedEar 495
  {
496
    $totheader += length($_);    # bump total header size
497
    last if (/^\s*$/);           # end of header?
13 PointedEar 498
    if (/^([^:\s]*):\s*(.*)/)
7 PointedEar 499
    {
500
      my ( $key, $val ) = ( $1, $2 );
501
      $headers{$key} = decode( 'MIME-Header', $val );
502
      $lcheader{ clean( lc($key) ) } = clean($val);
503
    }
5 PointedEar 504
  }
9 PointedEar 505
  @body = <$FILE>;               # slurp up body
8 PointedEar 506
  close($FILE);
13 PointedEar 507
}    # get_article
5 PointedEar 508
 
13 PointedEar 509
####################################
510
## Get data from the current article
511
####################################
512
sub get_data
7 PointedEar 513
{
5 PointedEar 514
#### First, analyse header fields ####
515
 
13 PointedEar 516
  ## Set up this poster if not defined, get counts, sizes
11 PointedEar 517
  my $poster = $headers{From};    # get the poster's name
13 PointedEar 518
 
519
  # Convert old to new format
520
  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
521
 
522
  # Collapse whitespace
523
  $poster =~ s/\s+/ /;
524
 
525
  # Remove outer quotes
526
  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
527
 
7 PointedEar 528
  if ( !defined( $data{$poster} ) )
13 PointedEar 529
  {                               # seen this one before?
11 PointedEar 530
    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
7 PointedEar 531
    $data{$poster}{orig}   = 0;
532
    $data{$poster}{quoted} = 0;
533
  }
11 PointedEar 534
  $data{$poster}{count}++;                 # bump count for this poster
535
  $data{$poster}{size} += $filesize;       # total size of file
5 PointedEar 536
 
13 PointedEar 537
  ## The User-Agent and/or X-Newsreader fields
538
  ## for User-Agent by poster
7 PointedEar 539
  if ( defined $lcheader{"user-agent"} )
540
  {
541
    $data{$poster}{agent} = $lcheader{"user-agent"};
542
  }
543
  if ( defined $lcheader{"x-newsreader"} )
544
  {
545
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
546
  }
5 PointedEar 547
 
13 PointedEar 548
  ## The User Agent for User-Agent by number of posts
7 PointedEar 549
  my $UA = "unknown";
550
  foreach my $keys ( keys %lcheader )
551
  {
552
    if ( defined $lcheader{'user-agent'} )
553
    {
554
      $UA = $lcheader{'user-agent'};
555
    }
556
    elsif ( defined $lcheader{"x-newsreader"} )
557
    {
558
      $UA = $lcheader{"x-newsreader"};
559
    }
560
    elsif ( defined $lcheader{'x-mailer'} )
561
    {
562
      $UA = $lcheader{'x-mailer'};
563
    }
564
    elsif (
565
      ( defined $lcheader{'organization'} )
566
      && ( $lcheader{'organization'} =~
567
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
568
      )
569
    {
570
      $UA = $lcheader{'organization'};
571
    }
572
    elsif ( $lcheader{'message-id'} =~ /pine/i )
573
    {
574
      $UA = "Pine";
575
    }    ## Hopefully found UA, else set to unknown
576
  }
5 PointedEar 577
 
7 PointedEar 578
  $UA = clean($UA);
579
  $UA = get_agent($UA);
5 PointedEar 580
 
7 PointedEar 581
  sub get_agent
582
  {
583
    my $raw   = shift;
584
    my $agent = $raw;
5 PointedEar 585
 
7 PointedEar 586
    ## strip http
587
    if ( $raw =~ /.*http.*/ )
588
    {
589
      $raw =~ s!posted via!!i;
590
      $raw =~ s!http://!!g;
591
      $raw =~ s!/!!g;
592
      $raw =~ s! !!g;
593
    }
5 PointedEar 594
 
7 PointedEar 595
    ## Fix Outlook from Mac
596
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
5 PointedEar 597
 
7 PointedEar 598
    ## Pick out the popular agents
13 PointedEar 599
    if (
600
           $raw =~ /(outlook express)/i
601
        || $raw =~ /(windows mail)/i
602
        || $raw =~ /(microplanet gravity)/i
603
        || $raw =~ /(news rover)/i
604
        || $raw =~ /(forte agent)/i
605
        || $raw =~ /(forte free agent)/i
606
      )
7 PointedEar 607
    {
608
      $agent = $1;
609
    }
610
    elsif (
611
      $raw =~ /^(
5 PointedEar 612
        pan
613
       |sylpheed
614
       |slrn
615
       |mozilla
616
       |knode
617
       |tin
618
       |hamster
619
       |xrn
620
       |xnews
621
       |aol
622
       |gnus
623
       |krn
624
       |macsoup
625
       |messenger
626
       |openxp
627
       |pine
628
       |thoth
629
       |turnpike
630
       |winvn
631
       |vsoup
632
       |google
633
       |supernews
634
       |nn
635
       |rn
636
       |007
637
       |webtv
638
       |compuserve
7 PointedEar 639
       )/ix
640
      )
641
    {
642
      $agent = $1;
643
    }
644
    else
645
    {
646
      ## Clean up unknown agents
647
      if ( $raw =~ m!^(.*?)/! )
648
      {
649
        $agent = $1;
650
      }
651
      elsif ( $raw =~ /^(\w*)\d.*/ )
652
      {
653
        $agent = $1;
654
      }
655
    }
656
 
657
    $distinct_agent{$agent}++;
658
    return $agent;
5 PointedEar 659
  }
660
 
13 PointedEar 661
  ## Get all cross-posted newsgroups
7 PointedEar 662
  for ( split /,/, $headers{"Newsgroups"} )
663
  {
664
    $crossposts{$_}++;    # bump count for each
665
  }
5 PointedEar 666
 
13 PointedEar 667
  ## Get threads
11 PointedEar 668
  my $thread = $headers{"Subject"};
7 PointedEar 669
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
670
  $thread =~ s/\s+/ /g;     # collapse whitespace
671
  $threads{$thread}{count} += 1;            # bump count of this subject
672
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
5 PointedEar 673
 
13 PointedEar 674
  ## Is this an original post or a reply?
7 PointedEar 675
  if ( defined $headers{"References"} )
676
  {
677
    $replies++;
678
  }
679
  else
680
  {
681
    $origposts++;
682
  }
5 PointedEar 683
 
13 PointedEar 684
  ## Get the time zone
7 PointedEar 685
  $_ = $headers{"Date"};
13 PointedEar 686
  my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
9 PointedEar 687
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
7 PointedEar 688
  {
689
    $tz = "UTC";
690
  }
691
  $tz{$tz}++;
5 PointedEar 692
 
7 PointedEar 693
#### Now analyse the body text ####
694
  my $insig = 0;
695
  for (@body)
696
  {
697
    $totbody += length($_);    # bump total body size
698
    next if (/^$>/);           # don't count blank lines in body
699
    if ( $insig == 1 )
700
    {
701
      $totsig += length($_);    # bump total sig size
5 PointedEar 702
 
13 PointedEar 703
      ## Bill Unruh uses ] quotes, and another poster uses ::
7 PointedEar 704
    }
705
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
706
    {                           # are we in a quote line?
707
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
9 PointedEar 708
      $totquoted += length($_);
7 PointedEar 709
    }
710
    elsif (/-- /)
711
    {
712
      $insig = 1;
713
    }
714
    else
715
    {
5 PointedEar 716
 
13 PointedEar 717
      ## We must be processing an original line
7 PointedEar 718
      $data{$poster}{orig} += length($_);      # bump count of original chrs
9 PointedEar 719
      $totorig += length($_);
7 PointedEar 720
    }
721
  }    # end for (@body)
5 PointedEar 722
 
13 PointedEar 723
}    # get_data
5 PointedEar 724
 
13 PointedEar 725
#########################################
726
## Count the User-Agents used, collapsing
727
## different versions into one per agent.
728
#########################################
729
sub count_agents
7 PointedEar 730
{
5 PointedEar 731
POSTER:
8 PointedEar 732
  foreach my $poster ( keys %data )
7 PointedEar 733
  {
734
    foreach my $agent_name ( keys %distinct_agent )
735
    {    # check against known ones
736
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
737
      {
738
        $agents{$agent_name}++;
739
        next POSTER;
740
      }
741
    }
742
    $agents{ $data{$poster}{agent} }++;
743
  }
13 PointedEar 744
}    # count_agents
5 PointedEar 745
 
13 PointedEar 746
#############################################
747
## Set orig/total percentages for all posters
748
#############################################
749
sub fix_percent
7 PointedEar 750
{
8 PointedEar 751
  foreach my $poster ( keys %data )
7 PointedEar 752
  {
753
    my $percent = 100;
754
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
755
    {
9 PointedEar 756
      $percent =
757
        $data{$poster}{orig} * 100 /
7 PointedEar 758
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
759
    }
760
    elsif ( $data{$poster}{orig} == 0 )
761
    {
762
      $percent = 0;
763
    }
764
    $data{$poster}{percent} = $percent;
765
  }
5 PointedEar 766
}
767
 
13 PointedEar 768
###############################
769
## Right pad a string with '.'s
770
###############################
7 PointedEar 771
sub rpad
772
{
13 PointedEar 773
  ## Get text to pad, length to pad, pad chr
7 PointedEar 774
  my ( $text, $pad_len, $pad_chr ) = @_;
11 PointedEar 775
 
13 PointedEar 776
  ## DEBUG
777
#printf "|%s| = %d\n", $text, length($text);
11 PointedEar 778
 
7 PointedEar 779
  if ( length($text) > $pad_len )
780
  {
781
    $text = substr( $text, 0, $pad_len );
782
  }
783
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
784
  return $padded;
5 PointedEar 785
}
786
 
13 PointedEar 787
##################
788
## Centre a string
789
##################
7 PointedEar 790
sub centred
791
{
792
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
793
  my $pad_len = ( $width - length($text) ) / 2;    #/
794
  my $centred = " " x $pad_len . $text;
795
  return $centred;
5 PointedEar 796
}
797
 
13 PointedEar 798
###########################
799
## Put commas into a number
800
###########################
7 PointedEar 801
sub commify
802
{
13 PointedEar 803
  local $_ = shift;
804
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
7 PointedEar 805
  return $_;
5 PointedEar 806
}
807
 
13 PointedEar 808
################################################################
809
## Returns a string with leading and trailing whitespace removed
810
################################################################
7 PointedEar 811
sub clean
812
{
813
  my $dirty = shift;
814
  my $clean = $dirty;
13 PointedEar 815
  $clean =~ s/^\s*|\s*$//g;
5 PointedEar 816
 
7 PointedEar 817
  return $clean;
5 PointedEar 818
}
819
 
7 PointedEar 820
sub usage
821
{
822
  print "usage: newstat.pl newsgroupname\n";
823
  exit 1;
5 PointedEar 824
}
825
 
13 PointedEar 826
##################################
827
## Write data structures to a file
828
##################################
829
sub write_data
7 PointedEar 830
{
13 PointedEar 831
  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
832
    or die "Can't create XDATA: $!\n";
833
  print $OUTF "Data collected from $newsgroup_name\n\n";
8 PointedEar 834
  print $OUTF
7 PointedEar 835
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
836
  foreach my $name ( keys %data )
837
  {
8 PointedEar 838
    print $OUTF
7 PointedEar 839
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
840
  }
8 PointedEar 841
  print $OUTF
7 PointedEar 842
"============================================================================\n";
8 PointedEar 843
  print $OUTF "Thread subjects\n";
844
  print $OUTF
7 PointedEar 845
"----------------------------------------------------------------------------\n";
846
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
847
  {
9 PointedEar 848
    print $OUTF
849
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
7 PointedEar 850
  }
8 PointedEar 851
  print $OUTF
7 PointedEar 852
"============================================================================\n";
8 PointedEar 853
  print $OUTF "Cross-posts\n";
854
  print $OUTF
7 PointedEar 855
"----------------------------------------------------------------------------\n";
856
  foreach my $name ( sort keys %crossposts )
857
  {
8 PointedEar 858
    print $OUTF "$name : $crossposts{$name}\n";
7 PointedEar 859
  }
13 PointedEar 860
  print $OUTF
7 PointedEar 861
"============================================================================\n";
8 PointedEar 862
  print $OUTF "User agents\n";
863
  print $OUTF
7 PointedEar 864
"----------------------------------------------------------------------------\n";
865
  foreach my $name ( sort keys %agents )
866
  {
8 PointedEar 867
    print $OUTF "$name : $agents{$name}\n";
7 PointedEar 868
  }
8 PointedEar 869
  print $OUTF
7 PointedEar 870
"============================================================================\n";
8 PointedEar 871
  print $OUTF "Time zones\n";
872
  print $OUTF
7 PointedEar 873
"----------------------------------------------------------------------------\n";
874
  foreach my $name ( sort keys %tz )
875
  {
8 PointedEar 876
    print $OUTF "$name : $tz{$name}\n";
7 PointedEar 877
  }
8 PointedEar 878
  close $OUTF;
13 PointedEar 879
}    # write_data