Subversion Repositories LCARS

Rev

Rev 6 | Rev 8 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
7 PointedEar 1
#!/usr/bin/perl
5 PointedEar 2
use strict;
6 PointedEar 3
use warnings;
4
use utf8;
5
use Encode;
6
 
5 PointedEar 7
#########################
6 PointedEar 8
# newsstat.pl version 0.4
5 PointedEar 9
 
6 PointedEar 10
############################################################################
11
# Collect statistics about a newsgroup (specified by first argument) in
7 PointedEar 12
# the local news spool. Check all articles in the last 30-day period.
6 PointedEar 13
# Rank posters by number of posts and by volume of posts, report on top and
14
# bottom 20 posters. Show their name, number of posts, size of posts,
15
# percentage of quoted lines. Rank user-agents used, by poster rather than
16
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
7 PointedEar 17
#
6 PointedEar 18
# (Numbers and paths can be configured below.  -- PE)
19
############################################################################
5 PointedEar 20
 
6 PointedEar 21
############################################################################
22
#                       RECENT CHANGES                                     #
23
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
7 PointedEar 24
#                 - Use warnings, utf8 (just in case)
6 PointedEar 25
#                 - Documentation update
26
# N/A         NN  - Take newsgroup name as argument
27
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
28
#                 - Allow command line flags for subtracting
29
#                   output if not pertinent for a group
30
# 2002-11-09  NN  - Put Garry's writedata() function back in.
31
#                 - added "rn" to my list of UA's
32
#                 - Started using %distinct_agent for both User agent
33
#                   sections
34
#                 - named it newsstat.pl version 0.3
35
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
36
#                   mtime rather than ctime, and simplifying the logic
37
# 2002-11-05  NN  - moved user configurations to the top
38
#                 - fixed the cross-posting section
39
#                 - introduced the $newsgroup_name variable which
40
#                   later becomes $news$group
41
#                 - changed $name to $agent_name in countagents()
7 PointedEar 42
#
6 PointedEar 43
# Contributors
44
# -------------
45
# NN  Nomen nominandum (name to be determined later)
46
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
5 PointedEar 47
 
6 PointedEar 48
########### TODO #############
5 PointedEar 49
# Commas in bottom section of report
50
# Show date the figures were compiled
51
# No. of HTML articles (Content-Type: text/html)
52
# No. of quoted sigs (/>\s*-- /)
53
# Per cent of top-posted articles
54
# Top 10 cross-posters
55
# Top 20 news posting hosts (from Path)
56
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
57
# Count *all* User Agents that each poster uses
58
# What do we do about Bill Unruh's ] quote style?
59
# Change the way dates/times are checked
60
# include % share in posters by no. of arts
61
# include % share in posters by size
62
# Total, orig & quoted lines by user agent with per cent
6 PointedEar 63
# Take more arguments
5 PointedEar 64
#######################################################
65
 
66
###################### USER CONFIGURATIONS ############################
67
 
68
# The name of the group to do stats for
69
my $newsgroup_name = $ARGV[0];
6 PointedEar 70
$newsgroup_name or &usage;
5 PointedEar 71
 
72
# Check for removal flags
73
my $ix;
74
my $j;
75
my %skipSec;
76
my @skiplist;
77
my $args = @ARGV;
7 PointedEar 78
for ( $ix = 1 ; $ix < $args ; $ix++ )
79
{
80
  $j = $ix + 1;
81
  if ( $ARGV[$ix] eq "-x" )
82
  {
83
    @skiplist = split( ",", $ARGV[$j] );
84
  }
85
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
86
  {
87
    @skiplist = split( ",", $1 );
88
  }
5 PointedEar 89
}
7 PointedEar 90
foreach (@skiplist)
91
{
5 PointedEar 92
  $skipSec{$_} = 1;
93
}
94
 
95
# Leafnode users will want /var/spool/news for this variable.
96
my $news = "/var/spool/news/";
97
 
98
# How many days are we doing statistics for?
99
my $numdays = 30;
100
 
101
# no. of agents we list
102
my $topagents = 10;
103
 
104
# no. of threads we want to know about
105
my $topthreads = 20;
106
 
107
# no. of top or bottom posters to show
108
my $topposters = 20;
109
 
110
# no. of cross-posted threads to show
111
my $topcrossposts = 10;
112
 
113
# no. of time zones to show
114
my $toptz = 10;
115
 
116
###################### DATA STRUCTURES ######################
117
my $group = $newsgroup_name;
118
$group =~ s!\.!/!g;
7 PointedEar 119
my %data;          # name, count, agent, total, orig, quoted
120
my %threads;       # subject, count
121
my %crossposts;    # group, count
122
my %tz;            # timezones by count
123
my %headers;       # holds header of current article
124
my %lcheader;      # holds lowercase headers
125
my @body;          # holds body of current article
126
my @sig;           # holds sig text;
127
my $totalposts;    # total no. of posts considered
128
my $filename;      # name of current article file
129
my $filesize;      # size of current article file
130
my $earliest;      # earliest article we have found
131
my $latest;        # latest article we have found
132
my $poster;        # poster we are dealing with
133
my $totsize   = 0; # holds total sizes of all files
134
my $totheader = 0; # total size of header material
135
my $totbody   = 0; # total size of body material
136
my $totsig    = 0; # total size of sig material
137
my $totorig   = 0; # total size of original material
138
my $totquoted = 0; # total size of quoted material
139
my $origposts = 0; # total no. of original posts
140
my $replies   = 0; # total no. of replies
141
my $i;             # general purpose
5 PointedEar 142
my %distinct_agent;
7 PointedEar 143
my %agents =       # used to hold counts of User Agents used
144
  (
145
  "KNode"                     => 0,
146
  "Pan"                       => 0,
147
  "Mozilla"                   => 0,
148
  "Sylpheed"                  => 0,
149
  "Gnus"                      => 0,
150
  "Forte Agent"               => 0,
151
  "Forte Free Agent"          => 0,
152
  "MicroPlanet Gravity"       => 0,
153
  "Microsoft Outlook Express" => 0,
154
  "Xnews"                     => 0,
155
  "slrn"                      => 0,
156
  "tin"                       => 0,
157
  "rn"                        => 0,
158
  "NN"                        => 0,
159
  "MacSOUP"                   => 0,
160
  "Foorum"                    => 0,
161
  "MT-NewsWatcher"            => 0,
162
  "News Rover"                => 0,
163
  "WebTV"                     => 0,
164
  "Compuserver"               => 0,
165
  "VSoup"                     => 0
166
  );
5 PointedEar 167
 
168
######################## MAIN CODE ########################
169
$! = 1;
170
 
171
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
7 PointedEar 172
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
173
while ( defined( $filename = readdir(DIR) ) )
174
{
175
  %lcheader = ();
176
  next unless -f $filename;    # only want real files
177
  next if ( $filename eq ".overview" );    # real articles only
178
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
179
  $earliest = ( stat $filename )[9] unless defined($earliest);
180
  $latest   = ( stat $filename )[9] unless defined($latest);
181
  &getarticle($filename);                  # read in the article
182
  &getdata;                                # grab the data from the article
183
  $totalposts++;                           # bump count of articles considered
5 PointedEar 184
}
7 PointedEar 185
closedir(DIR);                             # finished with the directory
186
 
5 PointedEar 187
# post-processing
7 PointedEar 188
&countagents;    # count agents, collapsing versions
189
&fixpercent;     # check percentages orig/total for posters
5 PointedEar 190
 
191
&writedata;
192
 
193
#################### DISPLAY RESULTS #####################
194
print "=" x 76, "\n";
7 PointedEar 195
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
5 PointedEar 196
print "=" x 76, "\n";
7 PointedEar 197
printf "%s\n",
198
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
5 PointedEar 199
print "\n\n";
7 PointedEar 200
printf "Total posts considered: %s over %d days\n", commify($totalposts),
201
  $numdays;
202
printf "Earliest article: %s\n",               scalar localtime($earliest);
203
printf "Latest article:   %s\n",               scalar localtime($latest);
204
printf "Original articles: %s, replies: %s\n", commify($origposts),
205
  commify($replies);
5 PointedEar 206
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
7 PointedEar 207
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
5 PointedEar 208
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
7 PointedEar 209
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
210
  commify( int( $totsize / $totalposts ) );
5 PointedEar 211
my $count = keys %data;
212
printf "Total headers: %s KB  bodies: %s KB\n",
7 PointedEar 213
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
5 PointedEar 214
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
7 PointedEar 215
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
216
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
217
  commify( int( $totsig / 1024 ) );
218
printf "Total number of posters: %s, average %s bytes per poster\n",
219
  commify($count), commify( int( $totsize / $count ) );     #/
5 PointedEar 220
$count = keys %threads;
7 PointedEar 221
printf "Total number of threads: %s, average %s bytes per thread\n",
222
  commify($count), commify( int( $totsize / $count ) );     #/
5 PointedEar 223
printf "Total number of User-Agents: %d\n", scalar keys %agents;
224
print "\n", "=" x 76, "\n";
225
 
226
###############################
227
# show posters by article count  Sec 1;
228
###############################
7 PointedEar 229
unless ( $skipSec{1} )
230
{
231
  if ( keys %data < $topposters )
232
  {
233
    $count = keys %data;
234
  }
235
  else
236
  {
237
    $count = $topposters;
238
  }
239
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
240
  print "=" x 76, "\n";
241
  $i = 0;
242
  foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
243
  {
244
    my $name = substr( $poster, 0, 65 );
245
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
246
      $data{$poster}{count};
247
    last if ( ++$i == $count );
248
  }
249
  print "\n", "=" x 76, "\n";
5 PointedEar 250
}
251
 
252
################################
253
# show posters by size in Kbytes Sec 2;
254
################################
7 PointedEar 255
unless ( $skipSec{2} )
256
{
257
  if ( keys %data < $topposters )
258
  {
5 PointedEar 259
    $count = keys %data;
7 PointedEar 260
  }
261
  else
262
  {
5 PointedEar 263
    $count = $topposters;
264
  }
7 PointedEar 265
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
5 PointedEar 266
  print "=" x 76, "\n";
267
  $i = 0;
7 PointedEar 268
  foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
269
  {
270
    my $name = substr( $poster, 0, 62 );
271
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
272
      $data{$poster}{size} / 1024;    #/
273
    last if ( ++$i == $count );
5 PointedEar 274
  }
275
  print "\n", "=" x 76, "\n";
276
}
277
 
278
####################################
279
# show top posters for original text
280
####################################
7 PointedEar 281
unless ( $skipSec{3} )
282
{
283
  if ( keys %data < $topposters )
284
  {
285
    $count = keys %data;
286
  }
287
  else
288
  {
289
    $count = $topposters;
290
  }
291
  printf "%s\n",
292
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
293
  print "=" x 76, "\n";
294
  $i = 0;
295
  foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
296
    keys %data )
297
  {
298
    next if $data{$poster}{quoted} == 0;
299
    next if $data{$poster}{count} < 5;
300
    my $name = substr( $poster, 0, 63 );
301
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
302
      $data{$poster}{percent};
303
    last if ( ++$i == $count );
304
  }
305
  print "\n", "=" x 76, "\n";
5 PointedEar 306
}
307
 
308
#######################################
309
# show bottom posters for original text
310
#######################################
7 PointedEar 311
unless ( $skipSec{4} )
312
{
313
  if ( keys %data < $topposters )
314
  {
5 PointedEar 315
    $count = keys %data;
7 PointedEar 316
  }
317
  else
318
  {
5 PointedEar 319
    $count = $topposters;
320
  }
7 PointedEar 321
  printf "%s\n",
322
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
5 PointedEar 323
  print "=" x 76, "\n";
324
  $i = 0;
7 PointedEar 325
  foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
326
    keys %data )
327
  {
5 PointedEar 328
    next if $data{$poster}{quoted} == 0;
329
    next if $data{$poster}{count} < 5;
7 PointedEar 330
    my $name = substr( $poster, 0, 63 );
331
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
332
      $data{$poster}{percent};
333
    last if ( ++$i == $count );
5 PointedEar 334
  }
335
  print "\n", "=" x 76, "\n";
336
}
337
 
338
####################################
339
# show threads by number of articles
340
####################################
7 PointedEar 341
unless ( $skipSec{5} )
342
{
343
  if ( keys %threads < $topthreads )
344
  {
5 PointedEar 345
    $count = keys %threads;
7 PointedEar 346
  }
347
  else
348
  {
5 PointedEar 349
    $count = $topthreads;
350
  }
7 PointedEar 351
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
5 PointedEar 352
  print "=" x 76, "\n";
353
  $i = 0;
7 PointedEar 354
  foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
355
    keys %threads )
356
  {
357
    my $name = substr( $thread, 0, 65 );
358
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
359
      $threads{$thread}{count};
360
    last if ( ++$i == $count );
5 PointedEar 361
  }
362
  print "\n", "=" x 76, "\n";
363
}
364
################################
365
# show threads by size in Kbytes
366
################################
7 PointedEar 367
unless ( $skipSec{6} )
368
{
369
  if ( keys %threads < $topthreads )
370
  {
5 PointedEar 371
    $count = keys %threads;
7 PointedEar 372
  }
373
  else
374
  {
5 PointedEar 375
    $count = $topthreads;
376
  }
7 PointedEar 377
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
5 PointedEar 378
  print "=" x 76, "\n";
379
  $i = 0;
7 PointedEar 380
  foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
381
    keys %threads )
382
  {
383
    my $name = substr( $thread, 0, 65 );
384
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
385
      $threads{$thread}{size} / 1024;    #/
386
    last if ( ++$i == $count );
5 PointedEar 387
  }
388
  print "\n", "=" x 76, "\n";
389
}
390
 
391
#################################
392
# show top 10 cross-posted groups
393
#################################
7 PointedEar 394
unless ( $skipSec{7} )
395
{
396
  delete $crossposts{"$newsgroup_name"};    # don't include ours
397
  if ( keys %crossposts < $topcrossposts )
398
  {
399
    $count = keys %crossposts;
400
  }
401
  else
402
  {
403
    $count = $topcrossposts;
404
  }
405
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
406
  print "=" x 76, "\n";
407
  $i = 0;
408
  foreach
409
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
410
  {
411
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
412
      $crossposts{$name};
413
    last if ( ++$i == $count );
414
  }
415
  print "\n", "=" x 76, "\n";
5 PointedEar 416
}
417
#######################
418
#show agents and counts
419
#######################
7 PointedEar 420
unless ( $skipSec{8} )
421
{
422
  if ( keys %agents < $topagents )
423
  {
424
    $count = keys %agents;
425
  }
426
  else
427
  {
428
    $count = $topagents;
429
  }
430
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
431
  print "=" x 76, "\n";
432
  $i = 0;
433
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
434
  {
435
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
436
      $agents{$agent};
437
    last if ( ++$i == $count );
438
  }
439
  print "\n", "=" x 76, "\n";
5 PointedEar 440
}
441
 
442
#######################
443
#show distinct agents
444
#######################
7 PointedEar 445
unless ( $skipSec{9} )
446
{
447
  if ( keys %distinct_agent < $topagents )
448
  {
449
    $count = keys %distinct_agent;
450
  }
451
  else
452
  {
453
    $count = $topagents;
454
  }
455
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
456
  print "=" x 76, "\n";
457
  $i = 0;
458
  foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
459
    keys %distinct_agent )
460
  {
461
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
462
      $distinct_agent{$agent},
463
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
464
    last if ( ++$i == $count );
465
  }
466
  print "\n", "=" x 76, "\n";
5 PointedEar 467
}
468
 
469
##########################
470
#show timezones and counts
471
##########################
7 PointedEar 472
unless ( $skipSec{10} )
473
{
474
  if ( keys %tz < $toptz )
475
  {
476
    $count = keys %tz;
477
  }
478
  else
479
  {
480
    $count = $toptz;
481
  }
482
  printf "%s\n", &centred( "Top 10 time zones", 76 );
483
  print "=" x 76, "\n";
484
  $i = 0;
485
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
486
  {
487
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
488
    last if ( ++$i == $count );
489
  }
490
  print "\n", "=" x 76, "\n";
5 PointedEar 491
}
492
 
493
################################ SUBROUTINES ################################
494
 
495
#######################################
496
# get current article's header and body
497
#######################################
7 PointedEar 498
sub getarticle
499
{
500
  %headers = ();    # dump old headers
501
  my $filename = shift;    # get the name of the file
5 PointedEar 502
 
7 PointedEar 503
  # get stats about the file itself
504
  $filesize = -s $filename;    # get total size of file
505
  $totsize += $filesize;       # bump total sizes of all files
506
 
507
  my $mtime = ( stat $filename )[9];
508
  if ( $mtime < $earliest )
509
  {
5 PointedEar 510
    $earliest = $mtime;
7 PointedEar 511
  }
512
  elsif ( $mtime > $latest )
513
  {
5 PointedEar 514
    $latest = $mtime;
7 PointedEar 515
  }
5 PointedEar 516
 
7 PointedEar 517
  # now read the file
518
  open( FILE, $filename ) or die "Can't open $filename: $!\n";
519
  while (<FILE>)
520
  {
521
    $totheader += length($_);    # bump total header size
522
    last if (/^\s*$/);           # end of header?
523
    if (/^([^:\s]*):\s+(.*)/)
524
    {
525
      my ( $key, $val ) = ( $1, $2 );
526
      $headers{$key} = decode( 'MIME-Header', $val );
527
      $lcheader{ clean( lc($key) ) } = clean($val);
528
    }
5 PointedEar 529
  }
7 PointedEar 530
  @body = <FILE>;                # slurp up body
531
  close(FILE);
532
}    # getarticle
5 PointedEar 533
 
534
###################################
535
# get data from the current article
536
###################################
7 PointedEar 537
sub getdata
538
{
5 PointedEar 539
#### First, analyse header fields ####
540
 
7 PointedEar 541
  # Set up this poster if not defined, get counts, sizes
542
  $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
543
  if ( !defined( $data{$poster} ) )
544
  {                                               # seen this one before?
545
    $data{$poster}{agent}  = 'Unknown';           # comes after For: field
546
    $data{$poster}{orig}   = 0;
547
    $data{$poster}{quoted} = 0;
548
  }
549
  $data{$poster}{count}++;                        # bump count for this poster
550
  $data{$poster}{size} += $filesize;              # total size of file
5 PointedEar 551
 
7 PointedEar 552
  # The User-Agent and/or X-Newsreader fields
553
  # for User-Agent by poster
554
  if ( defined $lcheader{"user-agent"} )
555
  {
556
    $data{$poster}{agent} = $lcheader{"user-agent"};
557
  }
558
  if ( defined $lcheader{"x-newsreader"} )
559
  {
560
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
561
  }
5 PointedEar 562
 
7 PointedEar 563
  # The User Agent for User-Agent by number of posts
564
  my $UA = "unknown";
565
  foreach my $keys ( keys %lcheader )
566
  {
567
    if ( defined $lcheader{'user-agent'} )
568
    {
569
      $UA = $lcheader{'user-agent'};
570
    }
571
    elsif ( defined $lcheader{"x-newsreader"} )
572
    {
573
      $UA = $lcheader{"x-newsreader"};
574
    }
575
    elsif ( defined $lcheader{'x-mailer'} )
576
    {
577
      $UA = $lcheader{'x-mailer'};
578
    }
579
    elsif (
580
      ( defined $lcheader{'organization'} )
581
      && ( $lcheader{'organization'} =~
582
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
583
      )
584
    {
585
      $UA = $lcheader{'organization'};
586
    }
587
    elsif ( $lcheader{'message-id'} =~ /pine/i )
588
    {
589
      $UA = "Pine";
590
    }    ## Hopefully found UA, else set to unknown
591
  }
5 PointedEar 592
 
7 PointedEar 593
  $UA = clean($UA);
594
  $UA = get_agent($UA);
5 PointedEar 595
 
7 PointedEar 596
  sub get_agent
597
  {
598
    my $raw   = shift;
599
    my $agent = $raw;
5 PointedEar 600
 
7 PointedEar 601
    ## strip http
602
    if ( $raw =~ /.*http.*/ )
603
    {
604
      $raw =~ s!posted via!!i;
605
      $raw =~ s!http://!!g;
606
      $raw =~ s!/!!g;
607
      $raw =~ s! !!g;
608
    }
5 PointedEar 609
 
7 PointedEar 610
    ## Fix Outlook from Mac
611
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
5 PointedEar 612
 
7 PointedEar 613
    ## Pick out the popular agents
614
    if ( $raw =~ /(outlook express)/i
615
      || $raw =~ /(microplanet gravity)/i
616
      || $raw =~ /(news rover)/i
617
      || $raw =~ /(forte agent)/i
618
      || $raw =~ /(forte free agent)/i )
619
    {
620
      $agent = $1;
621
    }
622
    elsif (
623
      $raw =~ /^(
5 PointedEar 624
        pan
625
       |sylpheed
626
       |slrn
627
       |mozilla
628
       |knode
629
       |tin
630
       |hamster
631
       |xrn
632
       |xnews
633
       |aol
634
       |gnus
635
       |krn
636
       |macsoup
637
       |messenger
638
       |openxp
639
       |pine
640
       |thoth
641
       |turnpike
642
       |winvn
643
       |vsoup
644
       |google
645
       |supernews
646
       |nn
647
       |rn
648
       |007
649
       |webtv
650
       |compuserve
7 PointedEar 651
       )/ix
652
      )
653
    {
654
      $agent = $1;
655
    }
656
    else
657
    {
658
      ## Clean up unknown agents
659
      if ( $raw =~ m!^(.*?)/! )
660
      {
661
        $agent = $1;
662
      }
663
      elsif ( $raw =~ /^(\w*)\d.*/ )
664
      {
665
        $agent = $1;
666
      }
667
    }
668
 
669
    $distinct_agent{$agent}++;
670
    return $agent;
5 PointedEar 671
  }
672
 
7 PointedEar 673
  # Get all cross-posted newsgroups
674
  for ( split /,/, $headers{"Newsgroups"} )
675
  {
676
    $crossposts{$_}++;    # bump count for each
677
  }
5 PointedEar 678
 
7 PointedEar 679
  # Get threads
680
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
681
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
682
  $thread =~ s/\s+/ /g;     # collapse whitespace
683
  $threads{$thread}{count} += 1;            # bump count of this subject
684
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
5 PointedEar 685
 
7 PointedEar 686
  # Is this an original post or a reply?
687
  if ( defined $headers{"References"} )
688
  {
689
    $replies++;
690
  }
691
  else
692
  {
693
    $origposts++;
694
  }
5 PointedEar 695
 
7 PointedEar 696
  # Get the time zone
697
  $_ = $headers{"Date"};
698
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
699
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
700
  {
701
    $tz = "UTC";
702
  }
703
  $tz{$tz}++;
5 PointedEar 704
 
7 PointedEar 705
#### Now analyse the body text ####
706
  my $insig = 0;
707
  for (@body)
708
  {
709
    $totbody += length($_);    # bump total body size
710
    next if (/^$>/);           # don't count blank lines in body
711
    if ( $insig == 1 )
712
    {
713
      $totsig += length($_);    # bump total sig size
5 PointedEar 714
 
7 PointedEar 715
      # Bill Unruh uses ] quotes, and another poster uses ::
716
    }
717
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
718
    {                           # are we in a quote line?
719
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
720
      $totquoted             += length($_);
721
    }
722
    elsif (/-- /)
723
    {
724
      $insig = 1;
725
    }
726
    else
727
    {
5 PointedEar 728
 
7 PointedEar 729
      # we must be processing an original line
730
      $data{$poster}{orig} += length($_);      # bump count of original chrs
731
      $totorig             += length($_);
732
    }
733
  }    # end for (@body)
5 PointedEar 734
 
7 PointedEar 735
}    # getdata
5 PointedEar 736
 
737
########################################
738
# Count the User-Agents used, collapsing
739
# different versions into one per agent.
740
########################################
7 PointedEar 741
sub countagents
742
{
5 PointedEar 743
POSTER:
7 PointedEar 744
  foreach $poster ( keys %data )
745
  {
746
    foreach my $agent_name ( keys %distinct_agent )
747
    {    # check against known ones
748
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
749
      {
750
        $agents{$agent_name}++;
751
        next POSTER;
752
      }
753
    }
754
    $agents{ $data{$poster}{agent} }++;
755
  }
756
}    # countagents
5 PointedEar 757
 
758
############################################
759
# set orig/total percentages for all posters
760
############################################
7 PointedEar 761
sub fixpercent
762
{
763
  foreach $poster ( keys %data )
764
  {
765
    my $percent = 100;
766
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
767
    {
768
      $percent = $data{$poster}{orig} * 100 /
769
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
770
    }
771
    elsif ( $data{$poster}{orig} == 0 )
772
    {
773
      $percent = 0;
774
    }
775
    $data{$poster}{percent} = $percent;
776
  }
5 PointedEar 777
}
778
 
779
##############################
780
# right pad a string with '.'s
781
##############################
7 PointedEar 782
sub rpad
783
{
784
 
785
  # get text to pad, length to pad, pad chr
786
  my ( $text, $pad_len, $pad_chr ) = @_;
787
  if ( length($text) > $pad_len )
788
  {
789
    $text = substr( $text, 0, $pad_len );
790
  }
791
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
792
  return $padded;
5 PointedEar 793
}
794
 
795
#################
796
# centre a string
797
#################
7 PointedEar 798
sub centred
799
{
800
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
801
  my $pad_len = ( $width - length($text) ) / 2;    #/
802
  my $centred = " " x $pad_len . $text;
803
  return $centred;
5 PointedEar 804
}
805
 
806
##########################
807
# put commas into a number
808
##########################
7 PointedEar 809
sub commify
810
{
811
  $_ = shift;
812
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
813
  return $_;
5 PointedEar 814
}
815
 
816
#########################
817
# clean
818
#########################
7 PointedEar 819
sub clean
820
{
821
  my $dirty = shift;
822
  my $clean = $dirty;
823
  $clean =~ s/^\s*//;
824
  $clean =~ s/\s*$//;
5 PointedEar 825
 
7 PointedEar 826
  return $clean;
5 PointedEar 827
}
828
 
7 PointedEar 829
sub usage
830
{
5 PointedEar 831
 
7 PointedEar 832
  print "usage: newstat.pl newsgroupname\n";
833
  exit 1;
5 PointedEar 834
}
835
 
836
###################################
837
# Write data structures to a file #
838
###################################
7 PointedEar 839
sub writedata
840
{
841
  open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
842
  print OUTF "Data collected from alt.os.linux.mandrake\n\n";
843
  print OUTF
844
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
845
  foreach my $name ( keys %data )
846
  {
847
    print OUTF
848
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
849
  }
850
  print OUTF
851
"============================================================================\n";
852
  print OUTF "Thread subjects\n";
853
  print OUTF
854
"----------------------------------------------------------------------------\n";
855
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
856
  {
857
    print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
858
  }
859
  print OUTF
860
"============================================================================\n";
861
  print OUTF "Cross-posts\n";
862
  print OUTF
863
"----------------------------------------------------------------------------\n";
864
  foreach my $name ( sort keys %crossposts )
865
  {
866
    print OUTF "$name : $crossposts{$name}\n";
867
  }
868
  print OUTF print OUTF
869
"============================================================================\n";
870
  print OUTF "User agents\n";
871
  print OUTF
872
"----------------------------------------------------------------------------\n";
873
  foreach my $name ( sort keys %agents )
874
  {
875
    print OUTF "$name : $agents{$name}\n";
876
  }
877
  print OUTF
878
"============================================================================\n";
879
  print OUTF "Time zones\n";
880
  print OUTF
881
"----------------------------------------------------------------------------\n";
882
  foreach my $name ( sort keys %tz )
883
  {
884
    print OUTF "$name : $tz{$name}\n";
885
  }
886
  close OUTF;
887
}    # writedata