Subversion Repositories LCARS

Rev

Rev 11 | Rev 14 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 11 Rev 13
Line 1... Line 1...
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
-
 
4
use diagnostics;
4
use utf8;
5
use utf8;
5
use encoding 'utf-8';
-
 
6
use Encode;
6
use Encode;
7
7
-
 
8
## Print out all text to STDOUT UTF-8 encoded
-
 
9
binmode STDOUT, ':encoding(UTF-8)';
-
 
10
8
###########################
11
############################
9
# newsstat.pl version 0.4.2
12
## newsstat.pl version 0.4.3
10
13
11
############################################################################
14
###########################################################################
12
# Collect statistics about a newsgroup (specified by first argument) in
15
## Collect statistics about a newsgroup (specified by first argument)
13
# the local news spool. Check all articles in the last 30-day period.
16
## in the local news spool. Check all articles in the last 30-day period.
14
# Rank posters by number of posts and by volume of posts, report on top and
17
## Rank posters by number of posts and by volume of posts, report on top
15
# bottom 20 posters. Show their name, number of posts, size of posts,
18
## and bottom 20 posters. Show their name, number of posts, size of posts,
16
# percentage of quoted lines. Rank user-agents used, by poster rather than
19
## percentage of quoted lines. Rank user-agents used, by poster rather
17
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
20
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
18
#
21
##
19
# (Numbers and paths can be configured below.  -- PE)
22
## Numbers and paths can be configured below.  See ChangeLog and TODO
20
############################################################################
-
 
21
-
 
22
############################################################################
-
 
23
#                       RECENT CHANGES                                     #
-
 
24
# 2011-10-03  PE  - Use more compatible shebang
-
 
25
#                 - Fixed some Perl::Critic-ized code
-
 
26
#                 - Fixed wrong indent for non-ASCII names
-
 
27
#                 - Formatted source code
-
 
28
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
-
 
29
#                 - Use warnings, utf8 (just in case)
-
 
30
#                 - Documentation update
-
 
31
# N/A         NN  - Take newsgroup name as argument
-
 
32
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
-
 
33
#                 - Allow command line flags for subtracting
-
 
34
#                   output if not pertinent for a group
-
 
35
# 2002-11-09  NN  - Put Garry's writedata() function back in.
-
 
36
#                 - added "rn" to my list of UA's
-
 
37
#                 - Started using %distinct_agent for both User agent
-
 
38
#                   sections
-
 
39
#                 - named it newsstat.pl version 0.3
-
 
40
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
-
 
41
#                   mtime rather than ctime, and simplifying the logic
-
 
42
# 2002-11-05  NN  - moved user configurations to the top
-
 
43
#                 - fixed the cross-posting section
-
 
44
#                 - introduced the $newsgroup_name variable which
-
 
45
#                   later becomes $news$group
-
 
46
#                 - changed $name to $agent_name in countagents()
-
 
47
#
-
 
48
# Contributors
-
 
49
# -------------
-
 
50
# NN  Nomen nominandum (name to be determined later)
-
 
51
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
-
 
52
-
 
53
########### TODO #############
-
 
54
# Commas in bottom section of report
-
 
55
# Show date the figures were compiled
-
 
56
# No. of HTML articles (Content-Type: text/html)
-
 
57
# No. of quoted sigs (/>\s*-- /)
-
 
58
# Per cent of top-posted articles
-
 
59
# Top 10 cross-posters
-
 
60
# Top 20 news posting hosts (from Path)
-
 
61
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
-
 
62
# Count *all* User Agents that each poster uses
-
 
63
# What do we do about Bill Unruh's ] quote style?
-
 
64
# Change the way dates/times are checked
-
 
65
# include % share in posters by no. of arts
-
 
66
# include % share in posters by size
-
 
67
# Total, orig & quoted lines by user agent with per cent
-
 
68
# Take more arguments
23
## for more.  -- PE
69
#######################################################
24
###########################################################################
70
25
71
###################### USER CONFIGURATIONS ############################
26
###################### USER CONFIGURATIONS ############################
72
27
73
# The name of the group to do stats for
28
## The name of the group to do stats for
74
my $newsgroup_name = $ARGV[0];
29
my $newsgroup_name = $ARGV[0];
75
$newsgroup_name or &usage;
30
$newsgroup_name or &usage;
76
31
77
# Check for removal flags
32
## Check for removal flags
78
my $ix;
33
my $ix;
79
my $j;
34
my $j;
80
my %skipSec;
35
my %skipSec;
81
my @skiplist;
36
my @skiplist;
82
my $args = @ARGV;
37
my $args = @ARGV;
Line 95... Line 50...
95
foreach (@skiplist)
50
foreach (@skiplist)
96
{
51
{
97
  $skipSec{$_} = 1;
52
  $skipSec{$_} = 1;
98
}
53
}
99
54
100
# Leafnode users will want /var/spool/news for this variable.
55
## Leafnode users will want /var/spool/news for this variable.
101
my $news = "/var/spool/news/";
56
my $news = "/var/spool/news/";
102
57
103
# How many days are we doing statistics for?
58
## How many days are we doing statistics for?
104
my $numdays = 30;
59
my $numdays = 30;
105
60
106
# no. of agents we list
61
## Number of agents we list
107
my $topagents = 10;
62
my $topagents = 10;
108
63
109
# no. of threads we want to know about
64
## Number of threads we want to know about
110
my $topthreads = 20;
65
my $topthreads = 20;
111
66
112
# no. of top or bottom posters to show
67
## Number of top or bottom posters to show
113
my $topposters = 20;
68
my $topposters = 20;
114
69
115
# no. of cross-posted threads to show
70
## Number of cross-posted threads to show
116
my $topcrossposts = 10;
71
my $topcrossposts = 10;
117
72
118
# no. of time zones to show
73
## Number of time zones to show
119
my $toptz = 10;
74
my $toptz = 10;
120
75
121
###################### DATA STRUCTURES ######################
76
###################### DATA STRUCTURES ######################
122
my $group = $newsgroup_name;
77
my $group = $newsgroup_name;
123
$group =~ s!\.!/!g;
78
$group =~ s!\.!/!g;
Line 143... Line 98...
143
my $totquoted = 0; # total size of quoted material
98
my $totquoted = 0; # total size of quoted material
144
my $origposts = 0; # total no. of original posts
99
my $origposts = 0; # total no. of original posts
145
my $replies   = 0; # total no. of replies
100
my $replies   = 0; # total no. of replies
146
my $i;             # general purpose
101
my $i;             # general purpose
147
my %distinct_agent;
102
my %distinct_agent;
148
my %agents =       # used to hold counts of User Agents used
-
 
149
  (
103
150
  "KNode"                     => 0,
104
## Used to hold counts of User Agents used
151
  "Pan"                       => 0,
-
 
152
  "Mozilla"                   => 0,
105
my %agents = (
153
  "Sylpheed"                  => 0,
106
  "Compuserver"               => 0,
154
  "Gnus"                      => 0,
107
  "Foorum"                    => 0,
155
  "Forte Agent"               => 0,
108
  "Forte Agent"               => 0,
156
  "Forte Free Agent"          => 0,
109
  "Forte Free Agent"          => 0,
-
 
110
  "Gnus"                      => 0,
-
 
111
  "KNode"                     => 0,
-
 
112
  "MacSOUP"                   => 0,
-
 
113
  "MT-NewsWatcher"            => 0,
157
  "MicroPlanet Gravity"       => 0,
114
  "MicroPlanet Gravity"       => 0,
158
  "Microsoft Outlook Express" => 0,
115
  "Microsoft Outlook Express" => 0,
-
 
116
  "Microsoft Windows Mail"    => 0,
-
 
117
  "Mozilla"                   => 0,
-
 
118
  "News Rover"                => 0,
-
 
119
  "NN"                        => 0,
-
 
120
  "Pan"                       => 0,
159
  "Xnews"                     => 0,
121
  "rn"                        => 0,
160
  "slrn"                      => 0,
122
  "slrn"                      => 0,
-
 
123
  "Sylpheed"                  => 0,
161
  "tin"                       => 0,
124
  "tin"                       => 0,
162
  "rn"                        => 0,
-
 
163
  "NN"                        => 0,
-
 
164
  "MacSOUP"                   => 0,
-
 
165
  "Foorum"                    => 0,
125
  "VSoup"                     => 0,
166
  "MT-NewsWatcher"            => 0,
-
 
167
  "News Rover"                => 0,
-
 
168
  "WebTV"                     => 0,
126
  "WebTV"                     => 0,
169
  "Compuserver"               => 0,
-
 
170
  "VSoup"                     => 0
127
  "Xnews"                     => 0
171
  );
128
);
172
129
173
######################## MAIN CODE ########################
130
######################## MAIN CODE ########################
174
$! = 1;
131
$! = 1;
175
132
176
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
133
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
177
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
134
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
178
while ( defined( $filename = readdir(DIR) ) )
135
while ( defined( $filename = readdir(DIR) ) )
179
{
136
{
180
  %lcheader = ();
137
  %lcheader = ();
181
  next unless -f $filename;    # only want real files
138
  next unless -f $filename;               # only want real files
182
  next if ( $filename eq ".overview" );    # real articles only
139
  next if ( $filename eq ".overview" );   # real articles only
183
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
140
  next if ( -M $filename > $numdays );    # only want articles <= a certain age
184
  $earliest = ( stat $filename )[9] unless defined($earliest);
141
  $earliest = ( stat $filename )[9] unless defined($earliest);
185
  $latest   = ( stat $filename )[9] unless defined($latest);
142
  $latest   = ( stat $filename )[9] unless defined($latest);
186
  &getarticle($filename);                  # read in the article
143
  &get_article($filename);                 # read in the article
187
  &getdata;                                # grab the data from the article
144
  &get_data;                               # grab the data from the article
188
  $totalposts++;                           # bump count of articles considered
145
  $totalposts++;                          # bump count of articles considered
189
}
146
}
190
closedir(DIR);                             # finished with the directory
147
closedir(DIR);                            # finished with the directory
191
148
192
# post-processing
149
## Post-processing
193
&countagents;    # count agents, collapsing versions
150
&count_agents;    # count agents, collapsing versions
194
&fixpercent;     # check percentages orig/total for posters
151
&fix_percent;     # check percentages orig/total for posters
195
152
196
&writedata;
153
&write_data;
197
154
198
#################### DISPLAY RESULTS #####################
155
#################### DISPLAY RESULTS #####################
199
print "=" x 76, "\n";
156
print "=" x 76, "\n";
200
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
157
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
201
print "=" x 76, "\n";
158
print "=" x 76, "\n";
Line 206... Line 163...
206
  $numdays;
163
  $numdays;
207
printf "Earliest article: %s\n",               scalar localtime($earliest);
164
printf "Earliest article: %s\n",               scalar localtime($earliest);
208
printf "Latest article:   %s\n",               scalar localtime($latest);
165
printf "Latest article:   %s\n",               scalar localtime($latest);
209
printf "Original articles: %s, replies: %s\n", commify($origposts),
166
printf "Original articles: %s, replies: %s\n", commify($origposts),
210
  commify($replies);
167
  commify($replies);
211
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
168
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
212
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
169
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
213
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
170
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
214
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
171
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
215
  commify( int( $totsize / $totalposts ) );
172
  commify( int( $totsize / $totalposts ) );
216
my $count = keys %data;
173
my $count = keys %data;
217
printf "Total headers: %s KB  bodies: %s KB\n",
174
printf "Total headers: %s KiB  bodies: %s KiB\n",
218
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
175
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
219
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
176
printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
220
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
177
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
221
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
178
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
222
  commify( int( $totsig / 1024 ) );
179
  commify( int( $totsig / 1024 ) );
223
printf "Total number of posters: %s, average %s bytes per poster\n",
180
printf "Total number of posters: %s, average %s bytes per poster\n",
224
  commify($count), commify( int( $totsize / $count ) );     #/
181
  commify($count), commify( int( $totsize / $count ) );     #/
225
$count = keys %threads;
182
$count = keys %threads;
226
printf "Total number of threads: %s, average %s bytes per thread\n",
183
printf "Total number of threads: %s, average %s bytes per thread\n",
227
  commify($count), commify( int( $totsize / $count ) );     #/
184
  commify($count), commify( int( $totsize / $count ) );     #/
228
printf "Total number of User-Agents: %d\n", scalar keys %agents;
185
printf "Total number of user agents: %d\n", scalar keys %agents;
229
print "\n", "=" x 76, "\n";
186
print "\n", "=" x 76, "\n";
230
187
231
###############################
188
########################################
232
# show posters by article count  Sec 1;
189
## Show posters by article count  Sec 1;
233
###############################
190
########################################
234
unless ( $skipSec{1} )
191
unless ( $skipSec{1} )
235
{
192
{
236
  if ( keys %data < $topposters )
193
  if ( keys %data < $topposters )
237
  {
194
  {
238
    $count = keys %data;
195
    $count = keys %data;
Line 253... Line 210...
253
    last if ( ++$i == $count );
210
    last if ( ++$i == $count );
254
  }
211
  }
255
  print "\n", "=" x 76, "\n";
212
  print "\n", "=" x 76, "\n";
256
}
213
}
257
214
258
################################
215
######################################
259
# show posters by size in Kbytes Sec 2;
216
## Show posters by size in KiB  Sec 2;
260
################################
217
######################################
261
unless ( $skipSec{2} )
218
unless ( $skipSec{2} )
262
{
219
{
263
  if ( keys %data < $topposters )
220
  if ( keys %data < $topposters )
264
  {
221
  {
265
    $count = keys %data;
222
    $count = keys %data;
266
  }
223
  }
267
  else
224
  else
268
  {
225
  {
269
    $count = $topposters;
226
    $count = $topposters;
270
  }
227
  }
271
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
228
  printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
272
  print "=" x 76, "\n";
229
  print "=" x 76, "\n";
273
  $i = 0;
230
  $i = 0;
274
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
231
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
275
  {
232
  {
276
    my $name = substr( $poster, 0, 62 );
233
    my $name = substr( $poster, 0, 62 );
Line 279... Line 236...
279
    last if ( ++$i == $count );
236
    last if ( ++$i == $count );
280
  }
237
  }
281
  print "\n", "=" x 76, "\n";
238
  print "\n", "=" x 76, "\n";
282
}
239
}
283
240
284
####################################
241
#####################################
285
# show top posters for original text
242
## Show top posters for original text
286
####################################
243
#####################################
287
unless ( $skipSec{3} )
244
unless ( $skipSec{3} )
288
{
245
{
289
  if ( keys %data < $topposters )
246
  if ( keys %data < $topposters )
290
  {
247
  {
291
    $count = keys %data;
248
    $count = keys %data;
Line 311... Line 268...
311
    last if ( ++$i == $count );
268
    last if ( ++$i == $count );
312
  }
269
  }
313
  print "\n", "=" x 76, "\n";
270
  print "\n", "=" x 76, "\n";
314
}
271
}
315
272
316
#######################################
273
########################################
317
# show bottom posters for original text
274
## Show bottom posters for original text
318
#######################################
275
########################################
319
unless ( $skipSec{4} )
276
unless ( $skipSec{4} )
320
{
277
{
321
  if ( keys %data < $topposters )
278
  if ( keys %data < $topposters )
322
  {
279
  {
323
    $count = keys %data;
280
    $count = keys %data;
Line 343... Line 300...
343
    last if ( ++$i == $count );
300
    last if ( ++$i == $count );
344
  }
301
  }
345
  print "\n", "=" x 76, "\n";
302
  print "\n", "=" x 76, "\n";
346
}
303
}
347
304
348
####################################
305
#####################################
349
# show threads by number of articles
306
## Show threads by number of articles
350
####################################
307
#####################################
351
unless ( $skipSec{5} )
308
unless ( $skipSec{5} )
352
{
309
{
353
  if ( keys %threads < $topthreads )
310
  if ( keys %threads < $topthreads )
354
  {
311
  {
355
    $count = keys %threads;
312
    $count = keys %threads;
Line 371... Line 328...
371
      $threads{$thread}{count};
328
      $threads{$thread}{count};
372
    last if ( ++$i == $count );
329
    last if ( ++$i == $count );
373
  }
330
  }
374
  print "\n", "=" x 76, "\n";
331
  print "\n", "=" x 76, "\n";
375
}
332
}
-
 
333
376
################################
334
##############################
377
# show threads by size in Kbytes
335
## Show threads by size in KiB
378
################################
336
##############################
379
unless ( $skipSec{6} )
337
unless ( $skipSec{6} )
380
{
338
{
381
  if ( keys %threads < $topthreads )
339
  if ( keys %threads < $topthreads )
382
  {
340
  {
383
    $count = keys %threads;
341
    $count = keys %threads;
384
  }
342
  }
385
  else
343
  else
386
  {
344
  {
387
    $count = $topthreads;
345
    $count = $topthreads;
388
  }
346
  }
389
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
347
  printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
390
  print "=" x 76, "\n";
348
  print "=" x 76, "\n";
391
  $i = 0;
349
  $i = 0;
392
  foreach my $thread (
350
  foreach my $thread (
393
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
351
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
394
    keys %threads
352
    keys %threads
Line 400... Line 358...
400
    last if ( ++$i == $count );
358
    last if ( ++$i == $count );
401
  }
359
  }
402
  print "\n", "=" x 76, "\n";
360
  print "\n", "=" x 76, "\n";
403
}
361
}
404
362
405
#################################
363
##################################
406
# show top 10 cross-posted groups
364
## Show top 10 cross-posted groups
407
#################################
365
##################################
408
unless ( $skipSec{7} )
366
unless ( $skipSec{7} )
409
{
367
{
410
  delete $crossposts{"$newsgroup_name"};    # don't include ours
368
  delete $crossposts{"$newsgroup_name"};    # don't include ours
411
  if ( keys %crossposts < $topcrossposts )
369
  if ( keys %crossposts < $topcrossposts )
412
  {
370
  {
Line 426... Line 384...
426
      $crossposts{$name};
384
      $crossposts{$name};
427
    last if ( ++$i == $count );
385
    last if ( ++$i == $count );
428
  }
386
  }
429
  print "\n", "=" x 76, "\n";
387
  print "\n", "=" x 76, "\n";
430
}
388
}
-
 
389
431
#######################
390
#########################
432
#show agents and counts
391
## Show agents and counts
433
#######################
392
#########################
434
unless ( $skipSec{8} )
393
unless ( $skipSec{8} )
435
{
394
{
436
  if ( keys %agents < $topagents )
395
  if ( keys %agents < $topagents )
437
  {
396
  {
438
    $count = keys %agents;
397
    $count = keys %agents;
Line 452... Line 411...
452
  }
411
  }
453
  print "\n", "=" x 76, "\n";
412
  print "\n", "=" x 76, "\n";
454
}
413
}
455
414
456
#######################
415
#######################
457
#show distinct agents
416
## Show distinct agents
458
#######################
417
#######################
459
unless ( $skipSec{9} )
418
unless ( $skipSec{9} )
460
{
419
{
461
  if ( keys %distinct_agent < $topagents )
420
  if ( keys %distinct_agent < $topagents )
462
  {
421
  {
Line 480... Line 439...
480
    last if ( ++$i == $count );
439
    last if ( ++$i == $count );
481
  }
440
  }
482
  print "\n", "=" x 76, "\n";
441
  print "\n", "=" x 76, "\n";
483
}
442
}
484
443
485
##########################
444
############################
486
#show timezones and counts
445
## Show timezones and counts
487
##########################
446
############################
488
unless ( $skipSec{10} )
447
unless ( $skipSec{10} )
489
{
448
{
490
  if ( keys %tz < $toptz )
449
  if ( keys %tz < $toptz )
491
  {
450
  {
492
    $count = keys %tz;
451
    $count = keys %tz;
Line 506... Line 465...
506
  print "\n", "=" x 76, "\n";
465
  print "\n", "=" x 76, "\n";
507
}
466
}
508
467
509
################################ SUBROUTINES ################################
468
################################ SUBROUTINES ################################
510
469
511
#######################################
470
########################################
512
# get current article's header and body
471
## Get current article's header and body
513
#######################################
472
########################################
514
sub getarticle
473
sub get_article
515
{
474
{
516
  %headers = ();    # dump old headers
475
  %headers = ();    # dump old headers
517
  my $filename = shift;    # get the name of the file
476
  my $filename = shift;    # get the name of the file
518
477
519
  # get stats about the file itself
478
  ## get stats about the file itself
520
  $filesize = -s $filename;    # get total size of file
479
  $filesize = -s $filename;    # get total size of file
521
  $totsize += $filesize;       # bump total sizes of all files
480
  $totsize += $filesize;       # bump total sizes of all files
522
481
523
  my $mtime = ( stat $filename )[9];
482
  my $mtime = ( stat $filename )[9];
524
  if ( $mtime < $earliest )
483
  if ( $mtime < $earliest )
Line 528... Line 487...
528
  elsif ( $mtime > $latest )
487
  elsif ( $mtime > $latest )
529
  {
488
  {
530
    $latest = $mtime;
489
    $latest = $mtime;
531
  }
490
  }
532
491
533
  # now read the file
492
  ## now read the file
534
  open( my $FILE, $filename ) or die "Can't open $filename: $!\n";
493
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
535
  while (<$FILE>)
494
  while (<$FILE>)
536
  {
495
  {
537
    $totheader += length($_);    # bump total header size
496
    $totheader += length($_);    # bump total header size
538
    last if (/^\s*$/);           # end of header?
497
    last if (/^\s*$/);           # end of header?
539
    if (/^([^:\s]*):\s+(.*)/)
498
    if (/^([^:\s]*):\s*(.*)/)
540
    {
499
    {
541
      my ( $key, $val ) = ( $1, $2 );
500
      my ( $key, $val ) = ( $1, $2 );
542
      $headers{$key} = decode( 'MIME-Header', $val );
501
      $headers{$key} = decode( 'MIME-Header', $val );
543
      $lcheader{ clean( lc($key) ) } = clean($val);
502
      $lcheader{ clean( lc($key) ) } = clean($val);
544
    }
503
    }
545
  }
504
  }
546
  @body = <$FILE>;               # slurp up body
505
  @body = <$FILE>;               # slurp up body
547
  close($FILE);
506
  close($FILE);
548
}    # getarticle
507
}    # get_article
549
508
550
###################################
509
####################################
551
# get data from the current article
510
## Get data from the current article
552
###################################
511
####################################
553
sub getdata
512
sub get_data
554
{
513
{
555
#### First, analyse header fields ####
514
#### First, analyse header fields ####
556
515
557
  # Set up this poster if not defined, get counts, sizes
516
  ## Set up this poster if not defined, get counts, sizes
558
  my $poster = $headers{From};    # get the poster's name
517
  my $poster = $headers{From};    # get the poster's name
-
 
518
 
-
 
519
  # Convert old to new format
-
 
520
  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
-
 
521
 
-
 
522
  # Collapse whitespace
-
 
523
  $poster =~ s/\s+/ /;
-
 
524
 
-
 
525
  # Remove outer quotes
-
 
526
  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
-
 
527
 
559
  if ( !defined( $data{$poster} ) )
528
  if ( !defined( $data{$poster} ) )
560
  {                                   # seen this one before?
529
  {                               # seen this one before?
561
    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
530
    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
562
    $data{$poster}{orig}   = 0;
531
    $data{$poster}{orig}   = 0;
563
    $data{$poster}{quoted} = 0;
532
    $data{$poster}{quoted} = 0;
564
  }
533
  }
565
  $data{$poster}{count}++;                 # bump count for this poster
534
  $data{$poster}{count}++;                 # bump count for this poster
566
  $data{$poster}{size} += $filesize;       # total size of file
535
  $data{$poster}{size} += $filesize;       # total size of file
567
536
568
  # The User-Agent and/or X-Newsreader fields
537
  ## The User-Agent and/or X-Newsreader fields
569
  # for User-Agent by poster
538
  ## for User-Agent by poster
570
  if ( defined $lcheader{"user-agent"} )
539
  if ( defined $lcheader{"user-agent"} )
571
  {
540
  {
572
    $data{$poster}{agent} = $lcheader{"user-agent"};
541
    $data{$poster}{agent} = $lcheader{"user-agent"};
573
  }
542
  }
574
  if ( defined $lcheader{"x-newsreader"} )
543
  if ( defined $lcheader{"x-newsreader"} )
575
  {
544
  {
576
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
545
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
577
  }
546
  }
578
547
579
  # The User Agent for User-Agent by number of posts
548
  ## The User Agent for User-Agent by number of posts
580
  my $UA = "unknown";
549
  my $UA = "unknown";
581
  foreach my $keys ( keys %lcheader )
550
  foreach my $keys ( keys %lcheader )
582
  {
551
  {
583
    if ( defined $lcheader{'user-agent'} )
552
    if ( defined $lcheader{'user-agent'} )
584
    {
553
    {
Line 625... Line 594...
625
594
626
    ## Fix Outlook from Mac
595
    ## Fix Outlook from Mac
627
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
596
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
628
597
629
    ## Pick out the popular agents
598
    ## Pick out the popular agents
-
 
599
    if (
630
    if ( $raw =~ /(outlook express)/i
600
           $raw =~ /(outlook express)/i
-
 
601
        || $raw =~ /(windows mail)/i
631
      || $raw =~ /(microplanet gravity)/i
602
        || $raw =~ /(microplanet gravity)/i
632
      || $raw =~ /(news rover)/i
603
        || $raw =~ /(news rover)/i
633
      || $raw =~ /(forte agent)/i
604
        || $raw =~ /(forte agent)/i
634
      || $raw =~ /(forte free agent)/i )
605
        || $raw =~ /(forte free agent)/i
-
 
606
      )
635
    {
607
    {
636
      $agent = $1;
608
      $agent = $1;
637
    }
609
    }
638
    elsif (
610
    elsif (
639
      $raw =~ /^(
611
      $raw =~ /^(
Line 684... Line 656...
684
656
685
    $distinct_agent{$agent}++;
657
    $distinct_agent{$agent}++;
686
    return $agent;
658
    return $agent;
687
  }
659
  }
688
660
689
  # Get all cross-posted newsgroups
661
  ## Get all cross-posted newsgroups
690
  for ( split /,/, $headers{"Newsgroups"} )
662
  for ( split /,/, $headers{"Newsgroups"} )
691
  {
663
  {
692
    $crossposts{$_}++;    # bump count for each
664
    $crossposts{$_}++;    # bump count for each
693
  }
665
  }
694
666
695
  # Get threads
667
  ## Get threads
696
  my $thread = $headers{"Subject"};
668
  my $thread = $headers{"Subject"};
697
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
669
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
698
  $thread =~ s/\s+/ /g;     # collapse whitespace
670
  $thread =~ s/\s+/ /g;     # collapse whitespace
699
  $threads{$thread}{count} += 1;            # bump count of this subject
671
  $threads{$thread}{count} += 1;            # bump count of this subject
700
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
672
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
701
673
702
  # Is this an original post or a reply?
674
  ## Is this an original post or a reply?
703
  if ( defined $headers{"References"} )
675
  if ( defined $headers{"References"} )
704
  {
676
  {
705
    $replies++;
677
    $replies++;
706
  }
678
  }
707
  else
679
  else
708
  {
680
  {
709
    $origposts++;
681
    $origposts++;
710
  }
682
  }
711
683
712
  # Get the time zone
684
  ## Get the time zone
713
  $_ = $headers{"Date"};
685
  $_ = $headers{"Date"};
714
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
686
  my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
715
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
687
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
716
  {
688
  {
717
    $tz = "UTC";
689
    $tz = "UTC";
718
  }
690
  }
719
  $tz{$tz}++;
691
  $tz{$tz}++;
Line 726... Line 698...
726
    next if (/^$>/);           # don't count blank lines in body
698
    next if (/^$>/);           # don't count blank lines in body
727
    if ( $insig == 1 )
699
    if ( $insig == 1 )
728
    {
700
    {
729
      $totsig += length($_);    # bump total sig size
701
      $totsig += length($_);    # bump total sig size
730
702
731
      # Bill Unruh uses ] quotes, and another poster uses ::
703
      ## Bill Unruh uses ] quotes, and another poster uses ::
732
    }
704
    }
733
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
705
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
734
    {                           # are we in a quote line?
706
    {                           # are we in a quote line?
735
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
707
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
736
      $totquoted += length($_);
708
      $totquoted += length($_);
Line 740... Line 712...
740
      $insig = 1;
712
      $insig = 1;
741
    }
713
    }
742
    else
714
    else
743
    {
715
    {
744
716
745
      # we must be processing an original line
717
      ## We must be processing an original line
746
      $data{$poster}{orig} += length($_);      # bump count of original chrs
718
      $data{$poster}{orig} += length($_);      # bump count of original chrs
747
      $totorig += length($_);
719
      $totorig += length($_);
748
    }
720
    }
749
  }    # end for (@body)
721
  }    # end for (@body)
750
722
751
}    # getdata
723
}    # get_data
752
724
753
########################################
725
#########################################
754
# Count the User-Agents used, collapsing
726
## Count the User-Agents used, collapsing
755
# different versions into one per agent.
727
## different versions into one per agent.
756
########################################
728
#########################################
757
sub countagents
729
sub count_agents
758
{
730
{
759
POSTER:
731
POSTER:
760
  foreach my $poster ( keys %data )
732
  foreach my $poster ( keys %data )
761
  {
733
  {
762
    foreach my $agent_name ( keys %distinct_agent )
734
    foreach my $agent_name ( keys %distinct_agent )
Line 767... Line 739...
767
        next POSTER;
739
        next POSTER;
768
      }
740
      }
769
    }
741
    }
770
    $agents{ $data{$poster}{agent} }++;
742
    $agents{ $data{$poster}{agent} }++;
771
  }
743
  }
772
}    # countagents
744
}    # count_agents
773
745
774
############################################
746
#############################################
775
# set orig/total percentages for all posters
747
## Set orig/total percentages for all posters
776
############################################
748
#############################################
777
sub fixpercent
749
sub fix_percent
778
{
750
{
779
  foreach my $poster ( keys %data )
751
  foreach my $poster ( keys %data )
780
  {
752
  {
781
    my $percent = 100;
753
    my $percent = 100;
782
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
754
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
Line 791... Line 763...
791
    }
763
    }
792
    $data{$poster}{percent} = $percent;
764
    $data{$poster}{percent} = $percent;
793
  }
765
  }
794
}
766
}
795
767
796
##############################
768
###############################
797
# right pad a string with '.'s
769
## Right pad a string with '.'s
798
##############################
770
###############################
799
sub rpad
771
sub rpad
800
{
772
{
801
  # get text to pad, length to pad, pad chr
773
  ## Get text to pad, length to pad, pad chr
802
  my ( $text, $pad_len, $pad_chr ) = @_;
774
  my ( $text, $pad_len, $pad_chr ) = @_;
803
775
804
  ### DEBUG
776
  ## DEBUG
805
#  printf "|%s| = %d\n", $text, length($text);
777
#printf "|%s| = %d\n", $text, length($text);
806
778
807
  if ( length($text) > $pad_len )
779
  if ( length($text) > $pad_len )
808
  {
780
  {
809
    $text = substr( $text, 0, $pad_len );
781
    $text = substr( $text, 0, $pad_len );
810
  }
782
  }
811
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
783
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
812
  return $padded;
784
  return $padded;
813
}
785
}
814
786
815
#################
787
##################
816
# centre a string
788
## Centre a string
817
#################
789
##################
818
sub centred
790
sub centred
819
{
791
{
820
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
792
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
821
  my $pad_len = ( $width - length($text) ) / 2;    #/
793
  my $pad_len = ( $width - length($text) ) / 2;    #/
822
  my $centred = " " x $pad_len . $text;
794
  my $centred = " " x $pad_len . $text;
823
  return $centred;
795
  return $centred;
824
}
796
}
825
797
826
##########################
798
###########################
827
# put commas into a number
799
## Put commas into a number
828
##########################
800
###########################
829
sub commify
801
sub commify
830
{
802
{
831
  $_ = shift;
803
  local $_ = shift;
832
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
804
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
833
  return $_;
805
  return $_;
834
}
806
}
835
807
836
#########################
808
################################################################
837
# clean
809
## Returns a string with leading and trailing whitespace removed
838
#########################
810
################################################################
839
sub clean
811
sub clean
840
{
812
{
841
  my $dirty = shift;
813
  my $dirty = shift;
842
  my $clean = $dirty;
814
  my $clean = $dirty;
843
  $clean =~ s/^\s*//;
-
 
844
  $clean =~ s/\s*$//;
815
  $clean =~ s/^\s*|\s*$//g;
845
816
846
  return $clean;
817
  return $clean;
847
}
818
}
848
819
849
sub usage
820
sub usage
850
{
821
{
851
-
 
852
  print "usage: newstat.pl newsgroupname\n";
822
  print "usage: newstat.pl newsgroupname\n";
853
  exit 1;
823
  exit 1;
854
}
824
}
855
825
856
###################################
826
##################################
857
# Write data structures to a file #
827
## Write data structures to a file
858
###################################
828
##################################
859
sub writedata
829
sub write_data
860
{
830
{
-
 
831
  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
861
  open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
832
    or die "Can't create XDATA: $!\n";
862
  print $OUTF "Data collected from alt.os.linux.mandrake\n\n";
833
  print $OUTF "Data collected from $newsgroup_name\n\n";
863
  print $OUTF
834
  print $OUTF
864
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
835
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
865
  foreach my $name ( keys %data )
836
  foreach my $name ( keys %data )
866
  {
837
  {
867
    print $OUTF
838
    print $OUTF
Line 884... Line 855...
884
"----------------------------------------------------------------------------\n";
855
"----------------------------------------------------------------------------\n";
885
  foreach my $name ( sort keys %crossposts )
856
  foreach my $name ( sort keys %crossposts )
886
  {
857
  {
887
    print $OUTF "$name : $crossposts{$name}\n";
858
    print $OUTF "$name : $crossposts{$name}\n";
888
  }
859
  }
889
  print $OUTF print $OUTF
860
  print $OUTF
890
"============================================================================\n";
861
"============================================================================\n";
891
  print $OUTF "User agents\n";
862
  print $OUTF "User agents\n";
892
  print $OUTF
863
  print $OUTF
893
"----------------------------------------------------------------------------\n";
864
"----------------------------------------------------------------------------\n";
894
  foreach my $name ( sort keys %agents )
865
  foreach my $name ( sort keys %agents )
Line 903... Line 874...
903
  foreach my $name ( sort keys %tz )
874
  foreach my $name ( sort keys %tz )
904
  {
875
  {
905
    print $OUTF "$name : $tz{$name}\n";
876
    print $OUTF "$name : $tz{$name}\n";
906
  }
877
  }
907
  close $OUTF;
878
  close $OUTF;
908
}    # writedata
879
}    # write_data