Subversion Repositories LCARS

Rev

Rev 9 | Rev 11 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
8 PointedEar 1
#!/usr/bin/env perl
5 PointedEar 2
use strict;
6 PointedEar 3
use warnings;
4
use utf8;
9 PointedEar 5
use encoding 'utf-8';
6 PointedEar 6
use Encode;
7
 
8 PointedEar 8
###########################
10 PointedEar 9
# newsstat.pl version 0.4.2
5 PointedEar 10
 
6 PointedEar 11
############################################################################
12
# Collect statistics about a newsgroup (specified by first argument) in
7 PointedEar 13
# the local news spool. Check all articles in the last 30-day period.
6 PointedEar 14
# Rank posters by number of posts and by volume of posts, report on top and
15
# bottom 20 posters. Show their name, number of posts, size of posts,
16
# percentage of quoted lines. Rank user-agents used, by poster rather than
17
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
7 PointedEar 18
#
6 PointedEar 19
# (Numbers and paths can be configured below.  -- PE)
20
############################################################################
5 PointedEar 21
 
6 PointedEar 22
############################################################################
23
#                       RECENT CHANGES                                     #
9 PointedEar 24
# 2011-10-03  PE  - Use more compatible shebang
25
#                 - Fixed some Perl::Critic-ized code
26
#                 - Fixed wrong indent for non-ASCII names (TODO: proper dot count)
27
#                 - Formatted source code
6 PointedEar 28
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
7 PointedEar 29
#                 - Use warnings, utf8 (just in case)
6 PointedEar 30
#                 - Documentation update
31
# N/A         NN  - Take newsgroup name as argument
32
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
33
#                 - Allow command line flags for subtracting
34
#                   output if not pertinent for a group
35
# 2002-11-09  NN  - Put Garry's writedata() function back in.
36
#                 - added "rn" to my list of UA's
37
#                 - Started using %distinct_agent for both User agent
38
#                   sections
39
#                 - named it newsstat.pl version 0.3
40
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
41
#                   mtime rather than ctime, and simplifying the logic
42
# 2002-11-05  NN  - moved user configurations to the top
43
#                 - fixed the cross-posting section
44
#                 - introduced the $newsgroup_name variable which
45
#                   later becomes $news$group
46
#                 - changed $name to $agent_name in countagents()
7 PointedEar 47
#
6 PointedEar 48
# Contributors
49
# -------------
50
# NN  Nomen nominandum (name to be determined later)
51
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
5 PointedEar 52
 
6 PointedEar 53
########### TODO #############
5 PointedEar 54
# Commas in bottom section of report
55
# Show date the figures were compiled
56
# No. of HTML articles (Content-Type: text/html)
57
# No. of quoted sigs (/>\s*-- /)
58
# Per cent of top-posted articles
59
# Top 10 cross-posters
60
# Top 20 news posting hosts (from Path)
61
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
62
# Count *all* User Agents that each poster uses
63
# What do we do about Bill Unruh's ] quote style?
64
# Change the way dates/times are checked
65
# include % share in posters by no. of arts
66
# include % share in posters by size
67
# Total, orig & quoted lines by user agent with per cent
6 PointedEar 68
# Take more arguments
5 PointedEar 69
#######################################################
70
 
71
###################### USER CONFIGURATIONS ############################
72
 
73
# The name of the group to do stats for
74
my $newsgroup_name = $ARGV[0];
6 PointedEar 75
$newsgroup_name or &usage;
5 PointedEar 76
 
77
# Check for removal flags
78
my $ix;
79
my $j;
80
my %skipSec;
81
my @skiplist;
82
my $args = @ARGV;
7 PointedEar 83
for ( $ix = 1 ; $ix < $args ; $ix++ )
84
{
85
  $j = $ix + 1;
86
  if ( $ARGV[$ix] eq "-x" )
87
  {
88
    @skiplist = split( ",", $ARGV[$j] );
89
  }
90
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
91
  {
92
    @skiplist = split( ",", $1 );
93
  }
5 PointedEar 94
}
7 PointedEar 95
foreach (@skiplist)
96
{
5 PointedEar 97
  $skipSec{$_} = 1;
98
}
99
 
100
# Leafnode users will want /var/spool/news for this variable.
101
my $news = "/var/spool/news/";
102
 
103
# How many days are we doing statistics for?
104
my $numdays = 30;
105
 
106
# no. of agents we list
107
my $topagents = 10;
108
 
109
# no. of threads we want to know about
110
my $topthreads = 20;
111
 
112
# no. of top or bottom posters to show
113
my $topposters = 20;
114
 
115
# no. of cross-posted threads to show
116
my $topcrossposts = 10;
117
 
118
# no. of time zones to show
119
my $toptz = 10;
120
 
121
###################### DATA STRUCTURES ######################
122
my $group = $newsgroup_name;
123
$group =~ s!\.!/!g;
7 PointedEar 124
my %data;          # name, count, agent, total, orig, quoted
125
my %threads;       # subject, count
126
my %crossposts;    # group, count
127
my %tz;            # timezones by count
128
my %headers;       # holds header of current article
129
my %lcheader;      # holds lowercase headers
130
my @body;          # holds body of current article
131
my @sig;           # holds sig text;
132
my $totalposts;    # total no. of posts considered
133
my $filename;      # name of current article file
134
my $filesize;      # size of current article file
135
my $earliest;      # earliest article we have found
136
my $latest;        # latest article we have found
137
my $poster;        # poster we are dealing with
138
my $totsize   = 0; # holds total sizes of all files
139
my $totheader = 0; # total size of header material
140
my $totbody   = 0; # total size of body material
141
my $totsig    = 0; # total size of sig material
142
my $totorig   = 0; # total size of original material
143
my $totquoted = 0; # total size of quoted material
144
my $origposts = 0; # total no. of original posts
145
my $replies   = 0; # total no. of replies
146
my $i;             # general purpose
5 PointedEar 147
my %distinct_agent;
7 PointedEar 148
my %agents =       # used to hold counts of User Agents used
149
  (
150
  "KNode"                     => 0,
151
  "Pan"                       => 0,
152
  "Mozilla"                   => 0,
153
  "Sylpheed"                  => 0,
154
  "Gnus"                      => 0,
155
  "Forte Agent"               => 0,
156
  "Forte Free Agent"          => 0,
157
  "MicroPlanet Gravity"       => 0,
158
  "Microsoft Outlook Express" => 0,
159
  "Xnews"                     => 0,
160
  "slrn"                      => 0,
161
  "tin"                       => 0,
162
  "rn"                        => 0,
163
  "NN"                        => 0,
164
  "MacSOUP"                   => 0,
165
  "Foorum"                    => 0,
166
  "MT-NewsWatcher"            => 0,
167
  "News Rover"                => 0,
168
  "WebTV"                     => 0,
169
  "Compuserver"               => 0,
170
  "VSoup"                     => 0
171
  );
5 PointedEar 172
 
173
######################## MAIN CODE ########################
174
$! = 1;
175
 
176
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
7 PointedEar 177
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
178
while ( defined( $filename = readdir(DIR) ) )
179
{
180
  %lcheader = ();
181
  next unless -f $filename;    # only want real files
182
  next if ( $filename eq ".overview" );    # real articles only
183
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
184
  $earliest = ( stat $filename )[9] unless defined($earliest);
185
  $latest   = ( stat $filename )[9] unless defined($latest);
186
  &getarticle($filename);                  # read in the article
187
  &getdata;                                # grab the data from the article
188
  $totalposts++;                           # bump count of articles considered
5 PointedEar 189
}
7 PointedEar 190
closedir(DIR);                             # finished with the directory
191
 
5 PointedEar 192
# post-processing
7 PointedEar 193
&countagents;    # count agents, collapsing versions
194
&fixpercent;     # check percentages orig/total for posters
5 PointedEar 195
 
196
&writedata;
197
 
198
#################### DISPLAY RESULTS #####################
199
print "=" x 76, "\n";
7 PointedEar 200
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
5 PointedEar 201
print "=" x 76, "\n";
7 PointedEar 202
printf "%s\n",
203
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
5 PointedEar 204
print "\n\n";
7 PointedEar 205
printf "Total posts considered: %s over %d days\n", commify($totalposts),
206
  $numdays;
207
printf "Earliest article: %s\n",               scalar localtime($earliest);
208
printf "Latest article:   %s\n",               scalar localtime($latest);
209
printf "Original articles: %s, replies: %s\n", commify($origposts),
210
  commify($replies);
5 PointedEar 211
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
7 PointedEar 212
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
5 PointedEar 213
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
7 PointedEar 214
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
215
  commify( int( $totsize / $totalposts ) );
5 PointedEar 216
my $count = keys %data;
217
printf "Total headers: %s KB  bodies: %s KB\n",
7 PointedEar 218
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
5 PointedEar 219
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
7 PointedEar 220
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
221
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
222
  commify( int( $totsig / 1024 ) );
223
printf "Total number of posters: %s, average %s bytes per poster\n",
224
  commify($count), commify( int( $totsize / $count ) );     #/
5 PointedEar 225
$count = keys %threads;
7 PointedEar 226
printf "Total number of threads: %s, average %s bytes per thread\n",
227
  commify($count), commify( int( $totsize / $count ) );     #/
5 PointedEar 228
printf "Total number of User-Agents: %d\n", scalar keys %agents;
229
print "\n", "=" x 76, "\n";
230
 
231
###############################
232
# show posters by article count  Sec 1;
233
###############################
7 PointedEar 234
unless ( $skipSec{1} )
235
{
236
  if ( keys %data < $topposters )
237
  {
238
    $count = keys %data;
239
  }
240
  else
241
  {
242
    $count = $topposters;
243
  }
244
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
245
  print "=" x 76, "\n";
246
  $i = 0;
9 PointedEar 247
  foreach
248
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
7 PointedEar 249
  {
250
    my $name = substr( $poster, 0, 65 );
251
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
252
      $data{$poster}{count};
253
    last if ( ++$i == $count );
254
  }
255
  print "\n", "=" x 76, "\n";
5 PointedEar 256
}
257
 
258
################################
259
# show posters by size in Kbytes Sec 2;
260
################################
7 PointedEar 261
unless ( $skipSec{2} )
262
{
263
  if ( keys %data < $topposters )
264
  {
5 PointedEar 265
    $count = keys %data;
7 PointedEar 266
  }
267
  else
268
  {
5 PointedEar 269
    $count = $topposters;
270
  }
7 PointedEar 271
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
5 PointedEar 272
  print "=" x 76, "\n";
273
  $i = 0;
8 PointedEar 274
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
7 PointedEar 275
  {
276
    my $name = substr( $poster, 0, 62 );
277
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
278
      $data{$poster}{size} / 1024;    #/
279
    last if ( ++$i == $count );
5 PointedEar 280
  }
281
  print "\n", "=" x 76, "\n";
282
}
283
 
284
####################################
285
# show top posters for original text
286
####################################
7 PointedEar 287
unless ( $skipSec{3} )
288
{
289
  if ( keys %data < $topposters )
290
  {
291
    $count = keys %data;
292
  }
293
  else
294
  {
295
    $count = $topposters;
296
  }
297
  printf "%s\n",
298
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
299
  print "=" x 76, "\n";
300
  $i = 0;
9 PointedEar 301
  foreach my $poster (
302
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
303
    keys %data
304
    )
7 PointedEar 305
  {
306
    next if $data{$poster}{quoted} == 0;
307
    next if $data{$poster}{count} < 5;
308
    my $name = substr( $poster, 0, 63 );
309
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
310
      $data{$poster}{percent};
311
    last if ( ++$i == $count );
312
  }
313
  print "\n", "=" x 76, "\n";
5 PointedEar 314
}
315
 
316
#######################################
317
# show bottom posters for original text
318
#######################################
7 PointedEar 319
unless ( $skipSec{4} )
320
{
321
  if ( keys %data < $topposters )
322
  {
5 PointedEar 323
    $count = keys %data;
7 PointedEar 324
  }
325
  else
326
  {
5 PointedEar 327
    $count = $topposters;
328
  }
7 PointedEar 329
  printf "%s\n",
330
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
5 PointedEar 331
  print "=" x 76, "\n";
332
  $i = 0;
9 PointedEar 333
  foreach my $poster (
334
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
335
    keys %data
336
    )
7 PointedEar 337
  {
5 PointedEar 338
    next if $data{$poster}{quoted} == 0;
339
    next if $data{$poster}{count} < 5;
7 PointedEar 340
    my $name = substr( $poster, 0, 63 );
341
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
342
      $data{$poster}{percent};
343
    last if ( ++$i == $count );
5 PointedEar 344
  }
345
  print "\n", "=" x 76, "\n";
346
}
347
 
348
####################################
349
# show threads by number of articles
350
####################################
7 PointedEar 351
unless ( $skipSec{5} )
352
{
353
  if ( keys %threads < $topthreads )
354
  {
5 PointedEar 355
    $count = keys %threads;
7 PointedEar 356
  }
357
  else
358
  {
5 PointedEar 359
    $count = $topthreads;
360
  }
7 PointedEar 361
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
5 PointedEar 362
  print "=" x 76, "\n";
363
  $i = 0;
9 PointedEar 364
  foreach my $thread (
365
    sort { $threads{$b}{count} <=> $threads{$a}{count} }
366
    keys %threads
367
    )
7 PointedEar 368
  {
369
    my $name = substr( $thread, 0, 65 );
370
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
371
      $threads{$thread}{count};
372
    last if ( ++$i == $count );
5 PointedEar 373
  }
374
  print "\n", "=" x 76, "\n";
375
}
376
################################
377
# show threads by size in Kbytes
378
################################
7 PointedEar 379
unless ( $skipSec{6} )
380
{
381
  if ( keys %threads < $topthreads )
382
  {
5 PointedEar 383
    $count = keys %threads;
7 PointedEar 384
  }
385
  else
386
  {
5 PointedEar 387
    $count = $topthreads;
388
  }
7 PointedEar 389
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
5 PointedEar 390
  print "=" x 76, "\n";
391
  $i = 0;
9 PointedEar 392
  foreach my $thread (
393
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
394
    keys %threads
395
    )
7 PointedEar 396
  {
397
    my $name = substr( $thread, 0, 65 );
398
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
399
      $threads{$thread}{size} / 1024;    #/
400
    last if ( ++$i == $count );
5 PointedEar 401
  }
402
  print "\n", "=" x 76, "\n";
403
}
404
 
405
#################################
406
# show top 10 cross-posted groups
407
#################################
7 PointedEar 408
unless ( $skipSec{7} )
409
{
410
  delete $crossposts{"$newsgroup_name"};    # don't include ours
411
  if ( keys %crossposts < $topcrossposts )
412
  {
413
    $count = keys %crossposts;
414
  }
415
  else
416
  {
417
    $count = $topcrossposts;
418
  }
419
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
420
  print "=" x 76, "\n";
421
  $i = 0;
422
  foreach
423
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
424
  {
425
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
426
      $crossposts{$name};
427
    last if ( ++$i == $count );
428
  }
429
  print "\n", "=" x 76, "\n";
5 PointedEar 430
}
431
#######################
432
#show agents and counts
433
#######################
7 PointedEar 434
unless ( $skipSec{8} )
435
{
436
  if ( keys %agents < $topagents )
437
  {
438
    $count = keys %agents;
439
  }
440
  else
441
  {
442
    $count = $topagents;
443
  }
444
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
445
  print "=" x 76, "\n";
446
  $i = 0;
447
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
448
  {
449
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
450
      $agents{$agent};
451
    last if ( ++$i == $count );
452
  }
453
  print "\n", "=" x 76, "\n";
5 PointedEar 454
}
455
 
456
#######################
457
#show distinct agents
458
#######################
7 PointedEar 459
unless ( $skipSec{9} )
460
{
461
  if ( keys %distinct_agent < $topagents )
462
  {
463
    $count = keys %distinct_agent;
464
  }
465
  else
466
  {
467
    $count = $topagents;
468
  }
469
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
470
  print "=" x 76, "\n";
471
  $i = 0;
9 PointedEar 472
  foreach my $agent (
473
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
474
    keys %distinct_agent
475
    )
7 PointedEar 476
  {
477
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
478
      $distinct_agent{$agent},
479
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
480
    last if ( ++$i == $count );
481
  }
482
  print "\n", "=" x 76, "\n";
5 PointedEar 483
}
484
 
485
##########################
486
#show timezones and counts
487
##########################
7 PointedEar 488
unless ( $skipSec{10} )
489
{
490
  if ( keys %tz < $toptz )
491
  {
492
    $count = keys %tz;
493
  }
494
  else
495
  {
496
    $count = $toptz;
497
  }
498
  printf "%s\n", &centred( "Top 10 time zones", 76 );
499
  print "=" x 76, "\n";
500
  $i = 0;
501
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
502
  {
503
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
504
    last if ( ++$i == $count );
505
  }
506
  print "\n", "=" x 76, "\n";
5 PointedEar 507
}
508
 
509
################################ SUBROUTINES ################################
510
 
511
#######################################
512
# get current article's header and body
513
#######################################
7 PointedEar 514
sub getarticle
515
{
516
  %headers = ();    # dump old headers
517
  my $filename = shift;    # get the name of the file
5 PointedEar 518
 
7 PointedEar 519
  # get stats about the file itself
520
  $filesize = -s $filename;    # get total size of file
521
  $totsize += $filesize;       # bump total sizes of all files
522
 
523
  my $mtime = ( stat $filename )[9];
524
  if ( $mtime < $earliest )
525
  {
5 PointedEar 526
    $earliest = $mtime;
7 PointedEar 527
  }
528
  elsif ( $mtime > $latest )
529
  {
5 PointedEar 530
    $latest = $mtime;
7 PointedEar 531
  }
5 PointedEar 532
 
7 PointedEar 533
  # now read the file
9 PointedEar 534
  open( my $FILE, $filename ) or die "Can't open $filename: $!\n";
8 PointedEar 535
  while (<$FILE>)
7 PointedEar 536
  {
537
    $totheader += length($_);    # bump total header size
538
    last if (/^\s*$/);           # end of header?
539
    if (/^([^:\s]*):\s+(.*)/)
540
    {
541
      my ( $key, $val ) = ( $1, $2 );
542
      $headers{$key} = decode( 'MIME-Header', $val );
543
      $lcheader{ clean( lc($key) ) } = clean($val);
544
    }
5 PointedEar 545
  }
9 PointedEar 546
  @body = <$FILE>;               # slurp up body
8 PointedEar 547
  close($FILE);
7 PointedEar 548
}    # getarticle
5 PointedEar 549
 
550
###################################
551
# get data from the current article
552
###################################
7 PointedEar 553
sub getdata
554
{
5 PointedEar 555
#### First, analyse header fields ####
556
 
7 PointedEar 557
  # Set up this poster if not defined, get counts, sizes
9 PointedEar 558
  my $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
7 PointedEar 559
  if ( !defined( $data{$poster} ) )
9 PointedEar 560
  {                                                  # seen this one before?
561
    $data{$poster}{agent}  = 'Unknown';              # comes after For: field
7 PointedEar 562
    $data{$poster}{orig}   = 0;
563
    $data{$poster}{quoted} = 0;
564
  }
9 PointedEar 565
  $data{$poster}{count}++;    # bump count for this poster
566
  $data{$poster}{size} += $filesize;    # total size of file
5 PointedEar 567
 
7 PointedEar 568
  # The User-Agent and/or X-Newsreader fields
569
  # for User-Agent by poster
570
  if ( defined $lcheader{"user-agent"} )
571
  {
572
    $data{$poster}{agent} = $lcheader{"user-agent"};
573
  }
574
  if ( defined $lcheader{"x-newsreader"} )
575
  {
576
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
577
  }
5 PointedEar 578
 
7 PointedEar 579
  # The User Agent for User-Agent by number of posts
580
  my $UA = "unknown";
581
  foreach my $keys ( keys %lcheader )
582
  {
583
    if ( defined $lcheader{'user-agent'} )
584
    {
585
      $UA = $lcheader{'user-agent'};
586
    }
587
    elsif ( defined $lcheader{"x-newsreader"} )
588
    {
589
      $UA = $lcheader{"x-newsreader"};
590
    }
591
    elsif ( defined $lcheader{'x-mailer'} )
592
    {
593
      $UA = $lcheader{'x-mailer'};
594
    }
595
    elsif (
596
      ( defined $lcheader{'organization'} )
597
      && ( $lcheader{'organization'} =~
598
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
599
      )
600
    {
601
      $UA = $lcheader{'organization'};
602
    }
603
    elsif ( $lcheader{'message-id'} =~ /pine/i )
604
    {
605
      $UA = "Pine";
606
    }    ## Hopefully found UA, else set to unknown
607
  }
5 PointedEar 608
 
7 PointedEar 609
  $UA = clean($UA);
610
  $UA = get_agent($UA);
5 PointedEar 611
 
7 PointedEar 612
  sub get_agent
613
  {
614
    my $raw   = shift;
615
    my $agent = $raw;
5 PointedEar 616
 
7 PointedEar 617
    ## strip http
618
    if ( $raw =~ /.*http.*/ )
619
    {
620
      $raw =~ s!posted via!!i;
621
      $raw =~ s!http://!!g;
622
      $raw =~ s!/!!g;
623
      $raw =~ s! !!g;
624
    }
5 PointedEar 625
 
7 PointedEar 626
    ## Fix Outlook from Mac
627
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
5 PointedEar 628
 
7 PointedEar 629
    ## Pick out the popular agents
630
    if ( $raw =~ /(outlook express)/i
631
      || $raw =~ /(microplanet gravity)/i
632
      || $raw =~ /(news rover)/i
633
      || $raw =~ /(forte agent)/i
634
      || $raw =~ /(forte free agent)/i )
635
    {
636
      $agent = $1;
637
    }
638
    elsif (
639
      $raw =~ /^(
5 PointedEar 640
        pan
641
       |sylpheed
642
       |slrn
643
       |mozilla
644
       |knode
645
       |tin
646
       |hamster
647
       |xrn
648
       |xnews
649
       |aol
650
       |gnus
651
       |krn
652
       |macsoup
653
       |messenger
654
       |openxp
655
       |pine
656
       |thoth
657
       |turnpike
658
       |winvn
659
       |vsoup
660
       |google
661
       |supernews
662
       |nn
663
       |rn
664
       |007
665
       |webtv
666
       |compuserve
7 PointedEar 667
       )/ix
668
      )
669
    {
670
      $agent = $1;
671
    }
672
    else
673
    {
674
      ## Clean up unknown agents
675
      if ( $raw =~ m!^(.*?)/! )
676
      {
677
        $agent = $1;
678
      }
679
      elsif ( $raw =~ /^(\w*)\d.*/ )
680
      {
681
        $agent = $1;
682
      }
683
    }
684
 
685
    $distinct_agent{$agent}++;
686
    return $agent;
5 PointedEar 687
  }
688
 
7 PointedEar 689
  # Get all cross-posted newsgroups
690
  for ( split /,/, $headers{"Newsgroups"} )
691
  {
692
    $crossposts{$_}++;    # bump count for each
693
  }
5 PointedEar 694
 
7 PointedEar 695
  # Get threads
696
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
697
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
698
  $thread =~ s/\s+/ /g;     # collapse whitespace
699
  $threads{$thread}{count} += 1;            # bump count of this subject
700
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
5 PointedEar 701
 
7 PointedEar 702
  # Is this an original post or a reply?
703
  if ( defined $headers{"References"} )
704
  {
705
    $replies++;
706
  }
707
  else
708
  {
709
    $origposts++;
710
  }
5 PointedEar 711
 
7 PointedEar 712
  # Get the time zone
713
  $_ = $headers{"Date"};
714
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
9 PointedEar 715
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
7 PointedEar 716
  {
717
    $tz = "UTC";
718
  }
719
  $tz{$tz}++;
5 PointedEar 720
 
7 PointedEar 721
#### Now analyse the body text ####
722
  my $insig = 0;
723
  for (@body)
724
  {
725
    $totbody += length($_);    # bump total body size
726
    next if (/^$>/);           # don't count blank lines in body
727
    if ( $insig == 1 )
728
    {
729
      $totsig += length($_);    # bump total sig size
5 PointedEar 730
 
7 PointedEar 731
      # Bill Unruh uses ] quotes, and another poster uses ::
732
    }
733
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
734
    {                           # are we in a quote line?
735
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
9 PointedEar 736
      $totquoted += length($_);
7 PointedEar 737
    }
738
    elsif (/-- /)
739
    {
740
      $insig = 1;
741
    }
742
    else
743
    {
5 PointedEar 744
 
7 PointedEar 745
      # we must be processing an original line
746
      $data{$poster}{orig} += length($_);      # bump count of original chrs
9 PointedEar 747
      $totorig += length($_);
7 PointedEar 748
    }
749
  }    # end for (@body)
5 PointedEar 750
 
7 PointedEar 751
}    # getdata
5 PointedEar 752
 
753
########################################
754
# Count the User-Agents used, collapsing
755
# different versions into one per agent.
756
########################################
7 PointedEar 757
sub countagents
758
{
5 PointedEar 759
POSTER:
8 PointedEar 760
  foreach my $poster ( keys %data )
7 PointedEar 761
  {
762
    foreach my $agent_name ( keys %distinct_agent )
763
    {    # check against known ones
764
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
765
      {
766
        $agents{$agent_name}++;
767
        next POSTER;
768
      }
769
    }
770
    $agents{ $data{$poster}{agent} }++;
771
  }
772
}    # countagents
5 PointedEar 773
 
774
############################################
775
# set orig/total percentages for all posters
776
############################################
7 PointedEar 777
sub fixpercent
778
{
8 PointedEar 779
  foreach my $poster ( keys %data )
7 PointedEar 780
  {
781
    my $percent = 100;
782
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
783
    {
9 PointedEar 784
      $percent =
785
        $data{$poster}{orig} * 100 /
7 PointedEar 786
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
787
    }
788
    elsif ( $data{$poster}{orig} == 0 )
789
    {
790
      $percent = 0;
791
    }
792
    $data{$poster}{percent} = $percent;
793
  }
5 PointedEar 794
}
795
 
796
##############################
797
# right pad a string with '.'s
798
##############################
7 PointedEar 799
sub rpad
800
{
801
 
802
  # get text to pad, length to pad, pad chr
803
  my ( $text, $pad_len, $pad_chr ) = @_;
804
  if ( length($text) > $pad_len )
805
  {
806
    $text = substr( $text, 0, $pad_len );
807
  }
808
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
809
  return $padded;
5 PointedEar 810
}
811
 
812
#################
813
# centre a string
814
#################
7 PointedEar 815
sub centred
816
{
817
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
818
  my $pad_len = ( $width - length($text) ) / 2;    #/
819
  my $centred = " " x $pad_len . $text;
820
  return $centred;
5 PointedEar 821
}
822
 
823
##########################
824
# put commas into a number
825
##########################
7 PointedEar 826
sub commify
827
{
828
  $_ = shift;
829
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
830
  return $_;
5 PointedEar 831
}
832
 
833
#########################
834
# clean
835
#########################
7 PointedEar 836
sub clean
837
{
838
  my $dirty = shift;
839
  my $clean = $dirty;
840
  $clean =~ s/^\s*//;
841
  $clean =~ s/\s*$//;
5 PointedEar 842
 
7 PointedEar 843
  return $clean;
5 PointedEar 844
}
845
 
7 PointedEar 846
sub usage
847
{
5 PointedEar 848
 
7 PointedEar 849
  print "usage: newstat.pl newsgroupname\n";
850
  exit 1;
5 PointedEar 851
}
852
 
853
###################################
854
# Write data structures to a file #
855
###################################
7 PointedEar 856
sub writedata
857
{
8 PointedEar 858
  open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
859
  print $OUTF "Data collected from alt.os.linux.mandrake\n\n";
860
  print $OUTF
7 PointedEar 861
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
862
  foreach my $name ( keys %data )
863
  {
8 PointedEar 864
    print $OUTF
7 PointedEar 865
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
866
  }
8 PointedEar 867
  print $OUTF
7 PointedEar 868
"============================================================================\n";
8 PointedEar 869
  print $OUTF "Thread subjects\n";
870
  print $OUTF
7 PointedEar 871
"----------------------------------------------------------------------------\n";
872
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
873
  {
9 PointedEar 874
    print $OUTF
875
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
7 PointedEar 876
  }
8 PointedEar 877
  print $OUTF
7 PointedEar 878
"============================================================================\n";
8 PointedEar 879
  print $OUTF "Cross-posts\n";
880
  print $OUTF
7 PointedEar 881
"----------------------------------------------------------------------------\n";
882
  foreach my $name ( sort keys %crossposts )
883
  {
8 PointedEar 884
    print $OUTF "$name : $crossposts{$name}\n";
7 PointedEar 885
  }
8 PointedEar 886
  print $OUTF print $OUTF
7 PointedEar 887
"============================================================================\n";
8 PointedEar 888
  print $OUTF "User agents\n";
889
  print $OUTF
7 PointedEar 890
"----------------------------------------------------------------------------\n";
891
  foreach my $name ( sort keys %agents )
892
  {
8 PointedEar 893
    print $OUTF "$name : $agents{$name}\n";
7 PointedEar 894
  }
8 PointedEar 895
  print $OUTF
7 PointedEar 896
"============================================================================\n";
8 PointedEar 897
  print $OUTF "Time zones\n";
898
  print $OUTF
7 PointedEar 899
"----------------------------------------------------------------------------\n";
900
  foreach my $name ( sort keys %tz )
901
  {
8 PointedEar 902
    print $OUTF "$name : $tz{$name}\n";
7 PointedEar 903
  }
8 PointedEar 904
  close $OUTF;
7 PointedEar 905
}    # writedata