Subversion Repositories LCARS

Rev

Rev 14 | Rev 23 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14 Rev 22
Line 1... Line 1...
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
-
 
4
4
use diagnostics;
5
#use diagnostics;
5
use utf8;
6
use utf8;
6
use Encode;
7
use Encode;
7
8
-
 
9
use constant DEBUG => 0;
-
 
10
8
## Print out all text to STDOUT UTF-8 encoded
11
## Print out all text to STDOUT UTF-8 encoded
9
binmode STDOUT, ':encoding(UTF-8)';
12
binmode STDOUT, ':encoding(UTF-8)';
-
 
13
binmode STDERR, ':encoding(UTF-8)';
10
14
11
##############################
-
 
12
## newsstat.pl version 0.4.3.1
-
 
13
-
 
14
###########################################################################
-
 
15
## Collect statistics about a newsgroup (specified by first argument)
-
 
16
## in the local news spool. Check all articles in the last 30-day period.
-
 
17
## Rank posters by number of posts and by volume of posts, report on top
-
 
18
## and bottom 20 posters. Show their name, number of posts, size of posts,
-
 
19
## percentage of quoted lines. Rank user-agents used, by poster rather
-
 
20
## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
-
 
21
##
-
 
22
## Numbers and paths can be configured below.  See ChangeLog and TODO
15
# FIXME: Automatically include resolved '.' in @INC
23
## for more.  -- PE
16
# print join "\n", @INC;
24
###########################################################################
-
 
25
17
-
 
18
use locale ':not_characters';
-
 
19
use Locale::TextDomain ('de.pointedears.newsstat');
-
 
20
use POSIX ('locale_h');
-
 
21
use Locale::Messages qw (bind_textdomain_filter
-
 
22
                         bind_textdomain_codeset
-
 
23
                         turn_utf_8_on);
-
 
24
#setlocale( LC_MESSAGES, '' );
-
 
25
bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on;
-
 
26
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
-
 
27
 
-
 
28
require Mail::Message;
-
 
29
require DateTime;
-
 
30
require DateTime::Format::Mail;
-
 
31
 
26
###################### USER CONFIGURATIONS ############################
32
###################### USER CONFIGURATIONS ############################
27
33
28
## The name of the group to do stats for
34
## The name of the group to do stats for
29
my $newsgroup_name = $ARGV[0];
35
my $newsgroup_name = $ARGV[0];
30
$newsgroup_name or &usage;
36
$newsgroup_name or usage();
31
37
32
## Check for removal flags
38
## Check for removal flags
33
my $ix;
39
my $ix;
34
my $j;
40
my $j;
35
my %skipSec;
41
my %skipSec;
Line 53... Line 59...
53
}
59
}
54
60
55
## Leafnode users will want /var/spool/news for this variable.
61
## Leafnode users will want /var/spool/news for this variable.
56
my $news = "/var/spool/news/";
62
my $news = "/var/spool/news/";
57
63
58
## How many days are we doing statistics for?
-
 
59
my $numdays = 30;
-
 
60
-
 
61
## Number of agents we list
64
## Number of top or bottom posters to show
62
my $topagents = 10;
65
my $topposters = 20;
63
66
64
## Number of threads we want to know about
67
## Number of threads we want to know about
65
my $topthreads = 20;
68
my $topthreads = 20;
66
69
67
## Number of top or bottom posters to show
-
 
68
my $topposters = 20;
-
 
69
-
 
70
## Number of cross-posted threads to show
70
## Number of cross-posted threads to show
71
my $topcrossposts = 10;
71
my $topcrossposts = 10;
72
72
-
 
73
## Number of agents we list
-
 
74
my $topagents = 10;
-
 
75
73
## Number of time zones to show
76
## Number of time zones to show
74
my $toptz = 10;
77
my $toptz = 10;
75
78
76
###################### DATA STRUCTURES ######################
79
###################### DATA STRUCTURES ######################
77
my $group = $newsgroup_name;
80
my $group = $newsgroup_name;
78
$group =~ s!\.!/!g;
81
$group =~ s!\.!/!g;
79
my %data;          # name, count, agent, total, orig, quoted
82
my %data;    # name, count, agent, total, orig, quoted
80
my %threads;       # subject, count
83
my $totsize = 0;    # holds total sizes of all files
81
my %crossposts;    # group, count
84
my %crossposts;     # group, count
82
my %tz;            # timezones by count
85
my %threads;        # subject, count
83
my %headers;       # holds header of current article
-
 
84
my %lcheader;      # holds lowercase headers
-
 
85
my @body;          # holds body of current article
86
my $replies   = 0;  # total no. of replies
86
my @sig;           # holds sig text;
-
 
87
my $totalposts;    # total no. of posts considered
87
my $origposts = 0;  # total no. of original posts
88
my $filename;      # name of current article file
-
 
89
my $filesize;      # size of current article file
88
my %tz;             # timezones by count
90
my $earliest;      # earliest article we have found
89
my $earliest;       # earliest article we have found
91
my $latest;        # latest article we have found
90
my $latest;         # latest article we have found
92
my $poster;        # poster we are dealing with
-
 
93
my $totsize   = 0; # holds total sizes of all files
-
 
94
my $totheader = 0; # total size of header material
91
my $totheader = 0;  # total size of header material
95
my $totbody   = 0; # total size of body material
92
my $totbody   = 0;  # total size of body material
96
my $totsig    = 0; # total size of sig material
93
my $totsig    = 0;  # total size of sig material
97
my $totorig   = 0; # total size of original material
-
 
98
my $totquoted = 0; # total size of quoted material
94
my $totquoted = 0;  # total size of quoted material
99
my $origposts = 0; # total no. of original posts
95
my $totorig   = 0;  # total size of original material
100
my $replies   = 0; # total no. of replies
96
my $totalposts;     # total no. of posts considered
101
my $i;             # general purpose
-
 
102
my %distinct_agent;
97
my %distinct_agent;
103
98
104
## Used to hold counts of User Agents used
99
## Used to hold counts of User Agents used
105
my %agents = (
100
my %agents = (
106
  "Compuserver"               => 0,
101
  "Compuserver"               => 0,
Line 122... Line 117...
122
  "slrn"                      => 0,
117
  "slrn"                      => 0,
123
  "Sylpheed"                  => 0,
118
  "Sylpheed"                  => 0,
124
  "tin"                       => 0,
119
  "tin"                       => 0,
125
  "VSoup"                     => 0,
120
  "VSoup"                     => 0,
126
  "WebTV"                     => 0,
121
  "WebTV"                     => 0,
127
  "Xnews"                     => 0
122
  "Xnews"                     => 0,
128
);
123
);
129
124
-
 
125
my $datetime_parser = DateTime::Format::Mail->new();
-
 
126
$datetime_parser->loose();
-
 
127
-
 
128
my $today = DateTime->today( time_zone => 'UTC' );
-
 
129
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
-
 
130
my $start      = int $prev_month->strftime('%s');
-
 
131
my $numdays    = int DateTime->last_day_of_month(
-
 
132
  year      => $prev_month->year(),
-
 
133
  month     => $prev_month->month(),
-
 
134
  time_zone => $prev_month->time_zone(),
-
 
135
)->day();
130
######################## MAIN CODE ########################
136
my $end = int $today->clone()->set_day(1)->strftime('%s');
-
 
137
-
 
138
dmsg( $start, " to ", $end ) if DEBUG;
-
 
139
-
 
140
chdir("$news$group")
-
 
141
  or die __x(
-
 
142
  "Can't cd to {newsgroup}: {error}\n",
-
 
143
  newsgroup => "$news$group",
-
 
144
  error     => $!
-
 
145
  );
-
 
146
opendir( DIR, "." )
-
 
147
  or die __x(
-
 
148
  "Can't open {newsgroup}: {error}\n",
-
 
149
  newsgroup => "$news$group",
-
 
150
  error     => $!
131
$! = 1;
151
  );
-
 
152
-
 
153
while ( defined( my $filename = readdir(DIR) ) )
-
 
154
{
-
 
155
  next unless -f $filename;    # only want real files
-
 
156
  next if ( $filename eq ".overview" );    # real articles only
132
157
133
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
-
 
134
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
-
 
135
while ( defined( $filename = readdir(DIR) ) )
-
 
136
{
-
 
137
  %lcheader = ();
-
 
138
  next unless -f $filename;               # only want real files
-
 
139
  next if ( $filename eq ".overview" );   # real articles only
-
 
140
  next if ( -M $filename > $numdays );    # only want articles <= a certain age
-
 
141
  $earliest = ( stat $filename )[9] unless defined($earliest);
-
 
142
  $latest   = ( stat $filename )[9] unless defined($latest);
-
 
143
  &get_article($filename);                 # read in the article
158
  get_article($filename);                  # read in the article
144
  &get_data;                               # grab the data from the article
-
 
145
  $totalposts++;                          # bump count of articles considered
-
 
146
}
159
}
147
closedir(DIR);                            # finished with the directory
160
closedir(DIR);                             # finished with the directory
148
161
149
## Post-processing
-
 
150
&count_agents;    # count agents, collapsing versions
162
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
151
&fix_percent;     # check percentages orig/total for posters
-
 
152
163
-
 
164
## Post-processing
-
 
165
count_agents();                            # count agents, collapsing versions
153
&write_data;
166
fix_percent();
154
167
155
#################### DISPLAY RESULTS #####################
-
 
156
print "=" x 76, "\n";
-
 
157
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
-
 
158
print "=" x 76, "\n";
-
 
159
printf "%s\n",
-
 
160
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
-
 
161
print "\n\n";
168
write_data();
162
printf "Total posts considered: %s over %d days\n", commify($totalposts),
-
 
163
  $numdays;
-
 
164
printf "Earliest article: %s\n",               scalar localtime($earliest);
-
 
165
printf "Latest article:   %s\n",               scalar localtime($latest);
-
 
166
printf "Original articles: %s, replies: %s\n", commify($origposts),
-
 
167
  commify($replies);
169
display_results();
168
printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize),
-
 
169
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
-
 
170
printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
-
 
171
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
-
 
172
  commify( int( $totsize / $totalposts ) );
-
 
173
my $count = keys %data;
-
 
174
printf "Total headers: %s KiB  bodies: %s KiB\n",
-
 
175
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
-
 
176
printf "Body text - quoted: %s KiB,  original: %s KiB = %02.2f%%, sigs: %s KiB\n",
-
 
177
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
-
 
178
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
-
 
179
  commify( int( $totsig / 1024 ) );
-
 
180
printf "Total number of posters: %s, average %s bytes per poster\n",
-
 
181
  commify($count), commify( int( $totsize / $count ) );     #/
-
 
182
$count = keys %threads;
-
 
183
printf "Total number of threads: %s, average %s bytes per thread\n",
-
 
184
  commify($count), commify( int( $totsize / $count ) );     #/
-
 
185
printf "Total number of user agents: %d\n", scalar keys %agents;
-
 
186
print "\n", "=" x 76, "\n";
-
 
187
170
188
########################################
171
########################################
189
## Show posters by article count  Sec 1;
172
## Get current article's header and body
190
########################################
173
########################################
191
unless ( $skipSec{1} )
174
sub get_article
192
{
175
{
193
  if ( keys %data < $topposters )
-
 
194
  {
-
 
195
    $count = keys %data;
176
  my $filename = shift;
196
  }
-
 
197
  else
-
 
198
  {
-
 
199
    $count = $topposters;
-
 
200
  }
-
 
201
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
-
 
202
  print "=" x 76, "\n";
-
 
203
  $i = 0;
-
 
204
  foreach
-
 
205
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
-
 
206
  {
-
 
207
    my $name = substr( $poster, 0, 65 );
-
 
208
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
-
 
209
      $data{$poster}{count};
-
 
210
    last if ( ++$i == $count );
-
 
211
  }
-
 
212
  print "\n", "=" x 76, "\n";
-
 
213
}
-
 
214
177
215
######################################
-
 
216
## Show posters by size in KiB  Sec 2;
-
 
217
######################################
-
 
218
unless ( $skipSec{2} )
-
 
219
{
-
 
220
  if ( keys %data < $topposters )
178
  open( my $FILE, '<', $filename )
221
  {
-
 
222
    $count = keys %data;
-
 
223
  }
-
 
224
  else
179
    or
225
  {
-
 
226
    $count = $topposters;
-
 
227
  }
-
 
228
  printf "%s\n", &centred( "Top $count posters by article size in KiB", 76 );
180
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
229
  print "=" x 76, "\n";
-
 
230
  $i = 0;
-
 
231
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
-
 
232
  {
-
 
233
    my $name = substr( $poster, 0, 62 );
181
  my $msg       = Mail::Message->read($FILE);
234
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
-
 
235
      $data{$poster}{size} / 1024;    #/
182
  my $timestamp = $msg->timestamp();
236
    last if ( ++$i == $count );
183
  my $date      = $msg->study('Date');
237
  }
-
 
238
  print "\n", "=" x 76, "\n";
-
 
239
}
-
 
240
184
241
#####################################
-
 
242
## Show top posters for original text
185
  ## Disregard article if timestamp is not in range
243
#####################################
-
 
244
unless ( $skipSec{3} )
186
  dmsg($timestamp) if DEBUG;
245
{
-
 
246
  if ( keys %data < $topposters )
187
  if ( $timestamp < $start or $timestamp >= $end )
247
  {
188
  {
-
 
189
    dmsg("Posting on $date ignored.") if DEBUG;
248
    $count = keys %data;
190
    return;
249
  }
191
  }
250
  else
-
 
251
  {
-
 
252
    $count = $topposters;
-
 
253
  }
-
 
254
  printf "%s\n",
-
 
255
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
-
 
256
  print "=" x 76, "\n";
-
 
257
  $i = 0;
-
 
258
  foreach my $poster (
-
 
259
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
-
 
260
    keys %data
-
 
261
    )
-
 
262
  {
-
 
263
    next if $data{$poster}{quoted} == 0;
-
 
264
    next if $data{$poster}{count} < 5;
-
 
265
    my $name = substr( $poster, 0, 63 );
-
 
266
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
-
 
267
      $data{$poster}{percent};
-
 
268
    last if ( ++$i == $count );
-
 
269
  }
-
 
270
  print "\n", "=" x 76, "\n";
-
 
271
}
-
 
272
192
273
########################################
-
 
274
## Show bottom posters for original text
-
 
275
########################################
-
 
276
unless ( $skipSec{4} )
-
 
277
{
-
 
278
  if ( keys %data < $topposters )
-
 
279
  {
-
 
280
    $count = keys %data;
-
 
281
  }
-
 
282
  else
-
 
283
  {
-
 
284
    $count = $topposters;
-
 
285
  }
-
 
286
  printf "%s\n",
-
 
287
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
-
 
288
  print "=" x 76, "\n";
-
 
289
  $i = 0;
-
 
290
  foreach my $poster (
-
 
291
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
193
  $totalposts++;    # bump count of articles considered
292
    keys %data
-
 
293
    )
-
 
294
  {
-
 
295
    next if $data{$poster}{quoted} == 0;
-
 
296
    next if $data{$poster}{count} < 5;
-
 
297
    my $name = substr( $poster, 0, 63 );
-
 
298
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
-
 
299
      $data{$poster}{percent};
-
 
300
    last if ( ++$i == $count );
-
 
301
  }
-
 
302
  print "\n", "=" x 76, "\n";
-
 
303
}
-
 
304
194
305
#####################################
-
 
306
## Show threads by number of articles
-
 
307
#####################################
-
 
308
unless ( $skipSec{5} )
-
 
309
{
-
 
310
  if ( keys %threads < $topthreads )
-
 
311
  {
-
 
312
    $count = keys %threads;
-
 
313
  }
-
 
314
  else
-
 
315
  {
-
 
316
    $count = $topthreads;
-
 
317
  }
-
 
318
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
-
 
319
  print "=" x 76, "\n";
-
 
320
  $i = 0;
195
  ## DEBUG
321
  foreach my $thread (
196
  dmsg($date) if DEBUG;
322
    sort { $threads{$b}{count} <=> $threads{$a}{count} }
-
 
323
    keys %threads
-
 
324
    )
-
 
325
  {
-
 
326
    my $name = substr( $thread, 0, 65 );
-
 
327
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-
 
328
      $threads{$thread}{count};
-
 
329
    last if ( ++$i == $count );
-
 
330
  }
-
 
331
  print "\n", "=" x 76, "\n";
-
 
332
}
-
 
333
197
334
##############################
-
 
335
## Show threads by size in KiB
198
  ## get stats about the file itself
336
##############################
-
 
337
unless ( $skipSec{6} )
-
 
338
{
-
 
339
  if ( keys %threads < $topthreads )
-
 
340
  {
-
 
341
    $count = keys %threads;
-
 
342
  }
-
 
343
  else
-
 
344
  {
-
 
345
    $count = $topthreads;
-
 
346
  }
-
 
347
  printf "%s\n", &centred( "Top $count threads by size in KiB", 76 );
-
 
348
  print "=" x 76, "\n";
-
 
349
  $i = 0;
-
 
350
  foreach my $thread (
-
 
351
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
199
  my $filesize = -s $filename;    # get total size of file
352
    keys %threads
-
 
353
    )
-
 
354
  {
-
 
355
    my $name = substr( $thread, 0, 65 );
-
 
356
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
200
  $totsize += $filesize;          # bump total sizes of all files
357
      $threads{$thread}{size} / 1024;    #/
-
 
358
    last if ( ++$i == $count );
-
 
359
  }
-
 
360
  print "\n", "=" x 76, "\n";
-
 
361
}
-
 
362
201
363
##################################
-
 
364
## Show top 10 cross-posted groups
-
 
365
##################################
-
 
366
unless ( $skipSec{7} )
-
 
367
{
-
 
368
  delete $crossposts{"$newsgroup_name"};    # don't include ours
-
 
369
  if ( keys %crossposts < $topcrossposts )
202
  if ( ( not defined $earliest ) or $timestamp < $earliest )
370
  {
203
  {
371
    $count = keys %crossposts;
204
    $earliest = $timestamp;
372
  }
205
  }
373
  else
206
  elsif ( ( not defined $latest ) or $timestamp > $latest )
374
  {
207
  {
375
    $count = $topcrossposts;
208
    $latest = $timestamp;
376
  }
209
  }
377
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
-
 
378
  print "=" x 76, "\n";
-
 
379
  $i = 0;
-
 
380
  foreach
-
 
381
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
-
 
382
  {
-
 
383
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-
 
384
      $crossposts{$name};
-
 
385
    last if ( ++$i == $count );
-
 
386
  }
-
 
387
  print "\n", "=" x 76, "\n";
-
 
388
}
-
 
389
210
390
#########################
-
 
391
## Show agents and counts
-
 
392
#########################
-
 
393
unless ( $skipSec{8} )
-
 
394
{
-
 
395
  if ( keys %agents < $topagents )
-
 
396
  {
-
 
397
    $count = keys %agents;
-
 
398
  }
-
 
399
  else
-
 
400
  {
-
 
401
    $count = $topagents;
-
 
402
  }
-
 
403
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
-
 
404
  print "=" x 76, "\n";
-
 
405
  $i = 0;
-
 
406
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
-
 
407
  {
-
 
408
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
-
 
409
      $agents{$agent};
-
 
410
    last if ( ++$i == $count );
-
 
411
  }
-
 
412
  print "\n", "=" x 76, "\n";
211
  #print "timestamp: $timestamp\n";
413
}
-
 
414
212
415
#######################
-
 
416
## Show distinct agents
-
 
417
#######################
-
 
418
unless ( $skipSec{9} )
-
 
419
{
-
 
420
  if ( keys %distinct_agent < $topagents )
-
 
421
  {
-
 
422
    $count = keys %distinct_agent;
-
 
423
  }
-
 
424
  else
-
 
425
  {
-
 
426
    $count = $topagents;
213
  ## count header size
427
  }
-
 
428
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
-
 
429
  print "=" x 76, "\n";
-
 
430
  $i = 0;
-
 
431
  foreach my $agent (
-
 
432
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
-
 
433
    keys %distinct_agent
-
 
434
    )
-
 
435
  {
-
 
436
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
-
 
437
      $distinct_agent{$agent},
-
 
438
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
-
 
439
    last if ( ++$i == $count );
214
  $totheader += $msg->head()->size();
440
  }
-
 
441
  print "\n", "=" x 76, "\n";
-
 
442
}
-
 
443
215
444
############################
-
 
445
## Show timezones and counts
216
  ## get the poster's name (MIME-decoded, in UTF-8)
446
############################
-
 
447
unless ( $skipSec{10} )
217
  my $poster = $msg->study('From');
448
{
-
 
449
  if ( keys %tz < $toptz )
218
  if ( defined $poster )
450
  {
219
  {
451
    $count = keys %tz;
220
    ## Convert old to new format
452
  }
-
 
453
  else
-
 
454
  {
-
 
455
    $count = $toptz;
-
 
456
  }
-
 
457
  printf "%s\n", &centred( "Top 10 time zones", 76 );
221
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
458
  print "=" x 76, "\n";
-
 
459
  $i = 0;
-
 
460
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
-
 
461
  {
-
 
462
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
-
 
463
    last if ( ++$i == $count );
-
 
464
  }
-
 
465
  print "\n", "=" x 76, "\n";
-
 
466
}
-
 
467
222
468
################################ SUBROUTINES ################################
-
 
469
-
 
470
########################################
-
 
471
## Get current article's header and body
223
    ## Collapse whitespace
472
########################################
-
 
473
sub get_article
-
 
474
{
-
 
475
  %headers = ();    # dump old headers
224
    $poster =~ s/\s+/ /g;
476
  my $filename = shift;    # get the name of the file
-
 
477
225
478
  ## get stats about the file itself
-
 
479
  $filesize = -s $filename;    # get total size of file
226
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
480
  $totsize += $filesize;       # bump total sizes of all files
227
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
481
228
482
  my $mtime = ( stat $filename )[9];
-
 
483
  if ( $mtime < $earliest )
-
 
484
  {
-
 
485
    $earliest = $mtime;
229
    ## DEBUG
486
  }
-
 
487
  elsif ( $mtime > $latest )
-
 
488
  {
-
 
489
    $latest = $mtime;
230
    dmsg($poster) if DEBUG;
490
  }
-
 
491
231
492
  ## now read the file
232
    ## seen this one before?
493
  open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n";
-
 
494
  while (<$FILE>)
-
 
495
  {
-
 
496
    $totheader += length($_);    # bump total header size
-
 
497
    last if (/^\s*$/);           # end of header?
-
 
498
    if (/^([^:\s]*):\s*(.*)/)
233
    if ( !defined( $data{$poster} ) )
499
    {
234
    {
500
      my ( $key, $val ) = ( $1, $2 );
235
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
501
      $headers{$key} = decode( 'MIME-Header', $val );
236
      $data{$poster}{'orig'}   = 0;
502
      $lcheader{ clean( lc($key) ) } = clean($val);
237
      $data{$poster}{'quoted'} = 0;
503
    }
238
    }
504
  }
-
 
505
  @body = <$FILE>;               # slurp up body
239
    $data{$poster}{'count'}++;                    # bump count for this poster
506
  close($FILE);
-
 
507
}    # get_article
240
    $data{$poster}{'size'} += $filesize;          # total size of file
508
241
509
####################################
242
    ## The User-Agent and/or X-Newsreader fields
510
## Get data from the current article
243
    ## for User-Agent by poster
511
####################################
244
    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
512
sub get_data
245
    if ( defined $ua )
513
{
246
    {
514
#### First, analyse header fields ####
247
      $data{$poster}{'agent'} = $ua;
515
248
516
  ## Set up this poster if not defined, get counts, sizes
-
 
517
  my $poster = $headers{From};    # get the poster's name
-
 
518
 
-
 
519
  # Convert old to new format
-
 
520
  $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
-
 
521
 
-
 
522
  # Collapse whitespace
-
 
523
  $poster =~ s/\s+/ /;
-
 
524
 
-
 
525
  # Remove outer quotes
249
      ## DEBUG
526
  $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/;
-
 
527
 
-
 
528
  if ( !defined( $data{$poster} ) )
-
 
529
  {                               # seen this one before?
-
 
530
    $data{$poster}{agent}  = 'Unknown';    # comes after For: field
-
 
531
    $data{$poster}{orig}   = 0;
250
      dmsg($ua) if DEBUG;
532
    $data{$poster}{quoted} = 0;
-
 
533
  }
251
    }
534
  $data{$poster}{count}++;                 # bump count for this poster
-
 
535
  $data{$poster}{size} += $filesize;       # total size of file
-
 
536
252
537
  ## The User-Agent and/or X-Newsreader fields
253
    ## The User Agent for User-Agent by number of posts
538
  ## for User-Agent by poster
254
    get_agent($msg);
539
  if ( defined $lcheader{"user-agent"} )
-
 
540
  {
-
 
541
    $data{$poster}{agent} = $lcheader{"user-agent"};
-
 
542
  }
-
 
543
  if ( defined $lcheader{"x-newsreader"} )
-
 
544
  {
-
 
545
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
-
 
546
  }
-
 
547
255
548
  ## The User Agent for User-Agent by number of posts
256
    ## Get all cross-posted newsgroups
549
  my $UA = "unknown";
-
 
550
  foreach my $keys ( keys %lcheader )
-
 
551
  {
-
 
552
    if ( defined $lcheader{'user-agent'} )
257
    for ( split( /,/, $msg->study('Newsgroups') ) )
553
    {
258
    {
554
      $UA = $lcheader{'user-agent'};
259
      $crossposts{$_}++;    # bump count for each
555
    }
260
    }
-
 
261
-
 
262
    ## Get threads
-
 
263
    my $thread = $msg->study('Subject');
-
 
264
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
-
 
265
    $thread =~ s/\s+/ /g;       # collapse whitespace
-
 
266
    $threads{$thread}{'count'}++;    # bump count of this subject
-
 
267
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
-
 
268
-
 
269
    ## Is this an original post or a reply?
556
    elsif ( defined $lcheader{"x-newsreader"} )
270
    if ( defined $msg->study('References') )
557
    {
271
    {
558
      $UA = $lcheader{"x-newsreader"};
272
      $replies++;
559
    }
273
    }
560
    elsif ( defined $lcheader{'x-mailer'} )
274
    else
561
    {
275
    {
562
      $UA = $lcheader{'x-mailer'};
276
      $origposts++;
563
    }
277
    }
-
 
278
-
 
279
    ## Get the time zone
-
 
280
    my $datetime = $datetime_parser->parse_datetime($date);
-
 
281
    my $tz       = $datetime->strftime('%z');
-
 
282
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
-
 
283
    $tz{$tz}++;
-
 
284
-
 
285
    ## DEBUG
-
 
286
    dmsg($tz) if DEBUG;
-
 
287
-
 
288
#### Now analyse the body text ####
-
 
289
    my $body = $msg->body();
-
 
290
-
 
291
    my $insig = 0;
-
 
292
    my @body  = $body->lines;
-
 
293
    for (@body)
-
 
294
    {
-
 
295
      $totbody += length($_);    # bump total body size
-
 
296
      next if (m{^$>}o);         # don't count blank lines in body
-
 
297
      if ( $insig == 1 )
-
 
298
      {
-
 
299
-
 
300
        # bump total sig size
-
 
301
        $totsig += length($_);
564
    elsif (
302
      }
-
 
303
      ## are we in a quote line?
-
 
304
      ## Bill Unruh uses ] quotes, and another poster uses ::
565
      ( defined $lcheader{'organization'} )
305
      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
-
 
306
      {
566
      && ( $lcheader{'organization'} =~
307
        ## bump count of quoted chrs
567
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
308
        $data{$poster}{'quoted'} += length($_);
-
 
309
        $totquoted += length($_);
-
 
310
      }
-
 
311
      elsif (/^-- $/)
-
 
312
      {
-
 
313
        $insig = 1;
568
      )
314
      }
-
 
315
      else
569
    {
316
      {
-
 
317
        ## We must be processing an original line
-
 
318
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
570
      $UA = $lcheader{'organization'};
319
        $totorig += length($_);
-
 
320
      }
571
    }
321
    }
572
    elsif ( $lcheader{'message-id'} =~ /pine/i )
-
 
573
    {
322
574
      $UA = "Pine";
323
    # end for (@body)
575
    }    ## Hopefully found UA, else set to unknown
-
 
576
  }
324
  }
577
325
578
  $UA = clean($UA);
326
  close($FILE);
579
  $UA = get_agent($UA);
-
 
-
 
327
}
580
328
581
  sub get_agent
329
sub get_agent
582
  {
330
{
583
    my $raw   = shift;
331
  my $msg = shift;
584
    my $agent = $raw;
-
 
585
332
-
 
333
  my $ua =
-
 
334
       $msg->study('User-Agent')
-
 
335
    or $msg->study('X-Newsreader')
-
 
336
    or $msg->study('X-Mailer');
586
    ## strip http
337
  if ( not defined $ua )
-
 
338
  {
-
 
339
    my $org = $msg->study('Organization');
587
    if ( $raw =~ /.*http.*/ )
340
    if ( defined $org
-
 
341
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
588
    {
342
    {
589
      $raw =~ s!posted via!!i;
-
 
590
      $raw =~ s!http://!!g;
-
 
591
      $raw =~ s!/!!g;
-
 
592
      $raw =~ s! !!g;
343
      $ua = $org;
593
    }
344
    }
594
-
 
595
    ## Fix Outlook from Mac
-
 
596
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
345
    elsif ( $msg->study('Message-ID') =~ /pine/i )
597
-
 
598
    ## Pick out the popular agents
-
 
599
    if (
-
 
600
           $raw =~ /(outlook express)/i
-
 
601
        || $raw =~ /(windows mail)/i
-
 
602
        || $raw =~ /(microplanet gravity)/i
-
 
603
        || $raw =~ /(news rover)/i
-
 
604
        || $raw =~ /(forte agent)/i
-
 
605
        || $raw =~ /(forte free agent)/i
-
 
606
      )
-
 
607
    {
346
    {
608
      $agent = $1;
347
      $ua = "Pine";
609
    }
348
    }
-
 
349
  }
-
 
350
-
 
351
  ## Hopefully found UA, else set to unknown
-
 
352
  if ( not defined $ua )
-
 
353
  {
-
 
354
    $ua = __ "unknown";
-
 
355
  }
-
 
356
-
 
357
  $ua = clean($ua);
-
 
358
-
 
359
  my $raw   = $ua;
-
 
360
  my $agent = $raw;
-
 
361
-
 
362
  ## strip http
-
 
363
  if ( $raw =~ /.*http.*/ )
-
 
364
  {
-
 
365
    $raw =~ s!posted via!!i;
-
 
366
    $raw =~ s!http://!!g;
-
 
367
    $raw =~ s!/!!g;
-
 
368
    $raw =~ s! !!g;
-
 
369
  }
-
 
370
-
 
371
  ## Fix Outlook from Mac
-
 
372
  if ( $raw =~ /^microsoft/i )
-
 
373
  {
-
 
374
    $raw =~ s/-/ /g;
-
 
375
  }
-
 
376
-
 
377
  ## Pick out the popular agents
-
 
378
  if ( $raw =~ /(outlook express)/i
-
 
379
    || $raw =~ /(windows mail)/i
-
 
380
    || $raw =~ /(microplanet gravity)/i
-
 
381
    || $raw =~ /(news rover)/i
-
 
382
    || $raw =~ /(forte agent)/i
-
 
383
    || $raw =~ /(forte free agent)/i )
-
 
384
  {
-
 
385
    $agent = $1;
-
 
386
  }
610
    elsif (
387
  elsif (
611
      $raw =~ /^(
388
    $raw =~ /^(
612
        pan
389
        pan
613
       |sylpheed
390
       |sylpheed
614
       |slrn
391
       |slrn
615
       |mozilla
392
       |mozilla
616
       |knode
393
       |knode
Line 635... Line 412...
635
       |rn
412
       |rn
636
       |007
413
       |007
637
       |webtv
414
       |webtv
638
       |compuserve
415
       |compuserve
639
       )/ix
416
       )/ix
640
      )
417
    )
-
 
418
  {
-
 
419
    $agent = $1;
-
 
420
  }
-
 
421
  else
-
 
422
  {
-
 
423
    ## Clean up unknown agents
-
 
424
    if ( $raw =~ m!^(.*?)/! )
641
    {
425
    {
642
      $agent = $1;
426
      $agent = $1;
643
    }
427
    }
644
    else
428
    elsif ( $raw =~ /^(\w*)\d.*/ )
645
    {
429
    {
646
      ## Clean up unknown agents
430
      $agent = $1;
647
      if ( $raw =~ m!^(.*?)/! )
-
 
648
      {
431
    }
-
 
432
  }
-
 
433
-
 
434
  $distinct_agent{$agent}++;
649
        $agent = $1;
435
  return $agent;
-
 
436
}
-
 
437
## get_agent
-
 
438
-
 
439
#########################################
-
 
440
## Count the User-Agents used, collapsing
-
 
441
## different versions into one per agent.
-
 
442
#########################################
-
 
443
sub count_agents
-
 
444
{
-
 
445
POSTER:
-
 
446
  foreach my $poster ( keys %data )
650
      }
447
  {
-
 
448
    foreach my $agent_name ( keys %distinct_agent )
651
      elsif ( $raw =~ /^(\w*)\d.*/ )
449
    {    # check against known ones
-
 
450
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
652
      {
451
      {
-
 
452
        $agents{$agent_name}++;
653
        $agent = $1;
453
        next POSTER;
654
      }
454
      }
655
    }
455
    }
-
 
456
    $agents{ $data{$poster}{'agent'} }++;
-
 
457
  }
-
 
458
}    # count_agents
656
459
-
 
460
#############################################
-
 
461
## Set orig/total percentages for all posters
-
 
462
#############################################
-
 
463
sub fix_percent
-
 
464
{
-
 
465
  foreach my $poster ( keys %data )
-
 
466
  {
657
    $distinct_agent{$agent}++;
467
    my $percent = 100;
-
 
468
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
-
 
469
    {
658
    return $agent;
470
      $percent =
-
 
471
        $data{$poster}{'orig'} * 100 /
-
 
472
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
-
 
473
    }
-
 
474
    elsif ( $data{$poster}{'orig'} == 0 )
-
 
475
    {
-
 
476
      $percent = 0;
-
 
477
    }
-
 
478
    $data{$poster}{'percent'} = $percent;
659
  }
479
  }
-
 
480
}
-
 
481
## fix_percent
660
482
-
 
483
##################################
-
 
484
## Write data structures to a file
-
 
485
##################################
-
 
486
sub write_data
-
 
487
{
-
 
488
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
-
 
489
    or die __x( "Can't create XDATA: {error}\n", error => $! );
-
 
490
  print $OUTF "Data collected from $newsgroup_name\n\n";
-
 
491
  print $OUTF
-
 
492
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
-
 
493
  foreach my $name ( keys %data )
-
 
494
  {
-
 
495
    print $OUTF
-
 
496
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
-
 
497
  }
-
 
498
  print $OUTF
-
 
499
"============================================================================\n";
-
 
500
  print $OUTF "Thread subjects\n";
-
 
501
  print $OUTF
-
 
502
"----------------------------------------------------------------------------\n";
-
 
503
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
-
 
504
  {
-
 
505
    print $OUTF
-
 
506
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
-
 
507
  }
-
 
508
  print $OUTF
-
 
509
"============================================================================\n";
661
  ## Get all cross-posted newsgroups
510
  print $OUTF "Cross-posts\n";
-
 
511
  print $OUTF
-
 
512
"----------------------------------------------------------------------------\n";
662
  for ( split /,/, $headers{"Newsgroups"} )
513
  foreach my $name ( sort keys %crossposts )
-
 
514
  {
-
 
515
    print $OUTF "$name : $crossposts{$name}\n";
-
 
516
  }
-
 
517
  print $OUTF
-
 
518
"============================================================================\n";
-
 
519
  print $OUTF "User agents\n";
-
 
520
  print $OUTF
-
 
521
"----------------------------------------------------------------------------\n";
-
 
522
  foreach my $name ( sort keys %agents )
-
 
523
  {
-
 
524
    print $OUTF "$name : $agents{$name}\n";
-
 
525
  }
-
 
526
  print $OUTF
-
 
527
"============================================================================\n";
-
 
528
  print $OUTF "Time zones\n";
-
 
529
  print $OUTF
-
 
530
"----------------------------------------------------------------------------\n";
-
 
531
  foreach my $name ( sort keys %tz )
663
  {
532
  {
664
    $crossposts{$_}++;    # bump count for each
533
    print $OUTF "$name : $tz{$name}\n";
665
  }
534
  }
-
 
535
  close $OUTF;
-
 
536
}    # write_data
666
537
-
 
538
sub display_results
-
 
539
{
-
 
540
  #################### DISPLAY RESULTS #####################
-
 
541
  print "=" x 76, "\n";
-
 
542
  printf "%s\n",
667
  ## Get threads
543
    centred(
-
 
544
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
-
 
545
    76 );
-
 
546
  print "=" x 76, "\n";
-
 
547
  printf "%s\n",
-
 
548
    centred(
-
 
549
    __
-
 
550
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)",
-
 
551
    76
-
 
552
    );
-
 
553
  print "\n\n";
-
 
554
  printf __"Total posts considered: %s over %d days" . "\n",
-
 
555
    commify($totalposts),
-
 
556
    $numdays;
668
  my $thread = $headers{"Subject"};
557
  my $time_locale = setlocale(LC_TIME);
669
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
558
  my $earliest_datetime = DateTime->from_epoch(
-
 
559
    epoch => $earliest,
-
 
560
    locale => $time_locale,
-
 
561
    time_zone => 'UTC',
-
 
562
  );
-
 
563
  my $latest_datetime = DateTime->from_epoch(
-
 
564
    epoch => $latest,
-
 
565
    locale => $time_locale,
-
 
566
    time_zone => 'UTC',
-
 
567
  );
-
 
568
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
-
 
569
  printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format);
-
 
570
  printf __"Latest article" . ":   %s\n", $latest_datetime->strftime($datetime_format);
-
 
571
  printf __"Original articles: %s; replies" . ": %s\n",
-
 
572
    commify($origposts),
-
 
573
    commify($replies);
-
 
574
  printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n",
-
 
575
    commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576;  #
-
 
576
  printf __
-
 
577
    "Average %s articles per day, %.2f MiB per day, %s bytes per article\n",
-
 
578
    commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
-
 
579
    commify( int( $totsize / $totalposts ) );
-
 
580
  my $count = keys %data;
-
 
581
  printf __"Total headers: %s KiB; bodies: %s KiB\n",
-
 
582
    commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
-
 
583
  printf __
-
 
584
    "Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n",
-
 
585
    commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
670
  $thread =~ s/\s+/ /g;     # collapse whitespace
586
    ( $totorig * 100 ) / ( $totorig + $totquoted ),
-
 
587
    commify( int( $totsig / 1024 ) );
-
 
588
  printf __"Total number of posters: %s, average %s bytes per poster\n",
671
  $threads{$thread}{count} += 1;            # bump count of this subject
589
    commify($count), commify( int( $totsize / $count ) );    #/
-
 
590
  $count = keys %threads;
672
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
591
  printf __"Total number of threads: %s, average %s bytes per thread\n",
-
 
592
    commify($count), commify( int( $totsize / $count ) );    #/
-
 
593
  printf __"Total number of user agents: %d\n", scalar keys %agents;
-
 
594
  print "\n", "=" x 76, "\n";
673
595
-
 
596
  ########################################
674
  ## Is this an original post or a reply?
597
  ## Show posters by article count  Sec 1;
-
 
598
  ########################################
675
  if ( defined $headers{"References"} )
599
  unless ( $skipSec{1} )
676
  {
600
  {
-
 
601
    if ( keys %data < $topposters )
-
 
602
    {
-
 
603
      $count = keys %data;
-
 
604
    }
-
 
605
    else
-
 
606
    {
-
 
607
      $count = $topposters;
-
 
608
    }
-
 
609
    printf "%s\n",
-
 
610
      centred(
-
 
611
      __x( "Top {count} posters by number of articles", count => $topposters ), 76 );
-
 
612
    print "=" x 76, "\n";
677
    $replies++;
613
    my $i = 0;
-
 
614
    foreach
-
 
615
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
-
 
616
    {
-
 
617
      my $name = substr( $poster, 0, 65 );
-
 
618
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
-
 
619
        $data{$poster}{count};
-
 
620
      last if ( ++$i == $count );
-
 
621
    }
-
 
622
    print "\n", "=" x 76, "\n";
678
  }
623
  }
-
 
624
-
 
625
  ######################################
-
 
626
  ## Show posters by size in KiB  Sec 2;
-
 
627
  ######################################
679
  else
628
  unless ( $skipSec{2} )
680
  {
629
  {
-
 
630
    if ( keys %data < $topposters )
-
 
631
    {
-
 
632
      $count = keys %data;
-
 
633
    }
-
 
634
    else
-
 
635
    {
-
 
636
      $count = $topposters;
-
 
637
    }
681
    $origposts++;
638
    printf "%s\n",
-
 
639
      centred(
-
 
640
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
-
 
641
      76 );
-
 
642
    print "=" x 76, "\n";
-
 
643
    my $i = 0;
-
 
644
    foreach
-
 
645
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
-
 
646
    {
-
 
647
      my $name = substr( $poster, 0, 62 );
-
 
648
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
-
 
649
        $data{$poster}{size} / 1024;    #/
-
 
650
      last if ( ++$i == $count );
-
 
651
    }
-
 
652
    print "\n", "=" x 76, "\n";
682
  }
653
  }
683
654
684
  ## Get the time zone
655
  #####################################
685
  $_ = $headers{"Date"};
656
  ## Show top posters for original text
686
  my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/;
657
  #####################################
687
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
658
  unless ( $skipSec{3} )
688
  {
659
  {
-
 
660
    if ( keys %data < $topposters )
-
 
661
    {
-
 
662
      $count = keys %data;
-
 
663
    }
-
 
664
    else
-
 
665
    {
-
 
666
      $count = $topposters;
-
 
667
    }
689
    $tz = "UTC";
668
    printf "%s\n",
-
 
669
      centred(
-
 
670
      __x(
-
 
671
        "Top {count} responders by original text (> 5 posts)",
-
 
672
        count => $topposters
-
 
673
      ),
-
 
674
      76
-
 
675
      );
-
 
676
    print "=" x 76, "\n";
-
 
677
    my $i = 0;
-
 
678
    foreach my $poster (
-
 
679
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
-
 
680
      keys %data
-
 
681
      )
-
 
682
    {
-
 
683
      next if $data{$poster}{quoted} == 0;
-
 
684
      next if $data{$poster}{count} < 5;
-
 
685
      my $name = substr( $poster, 0, 63 );
-
 
686
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
-
 
687
        $data{$poster}{percent};
-
 
688
      last if ( ++$i == $count );
-
 
689
    }
-
 
690
    print "\n", "=" x 76, "\n";
690
  }
691
  }
691
  $tz{$tz}++;
-
 
692
692
693
#### Now analyse the body text ####
693
  ########################################
694
  my $insig = 0;
694
  ## Show bottom posters for original text
-
 
695
  ########################################
695
  for (@body)
696
  unless ( $skipSec{4} )
696
  {
697
  {
697
    $totbody += length($_);    # bump total body size
-
 
698
    next if (/^$>/);           # don't count blank lines in body
-
 
699
    if ( $insig == 1 )
698
    if ( keys %data < $topposters )
700
    {
699
    {
-
 
700
      $count = keys %data;
-
 
701
    }
-
 
702
    else
-
 
703
    {
-
 
704
      $count = $topposters;
-
 
705
    }
-
 
706
    printf "%s\n",
-
 
707
      centred(
-
 
708
      __x(
-
 
709
        "Bottom {count} responders by original text  (> 5 posts)",
-
 
710
        count => $topposters
-
 
711
      ),
-
 
712
      76
-
 
713
      );
-
 
714
    print "=" x 76, "\n";
-
 
715
    my $i = 0;
-
 
716
    foreach my $poster (
-
 
717
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
-
 
718
      keys %data
-
 
719
      )
-
 
720
    {
701
      $totsig += length($_);    # bump total sig size
721
      next if $data{$poster}{quoted} == 0;
-
 
722
      next if $data{$poster}{count} < 5;
-
 
723
      my $name = substr( $poster, 0, 63 );
-
 
724
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
-
 
725
        $data{$poster}{percent};
-
 
726
      last if ( ++$i == $count );
-
 
727
    }
-
 
728
    print "\n", "=" x 76, "\n";
-
 
729
  }
702
730
-
 
731
  #####################################
703
      ## Bill Unruh uses ] quotes, and another poster uses ::
732
  ## Show threads by number of articles
-
 
733
  #####################################
-
 
734
  unless ( $skipSec{5} )
-
 
735
  {
-
 
736
    if ( keys %threads < $topthreads )
-
 
737
    {
-
 
738
      $count = keys %threads;
704
    }
739
    }
705
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
740
    else
706
    {                           # are we in a quote line?
741
    {
707
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
-
 
708
      $totquoted += length($_);
742
      $count = $topthreads;
709
    }
743
    }
710
    elsif (/-- /)
744
    printf "%s\n",
-
 
745
      centred( __x( "Top {count} threads by no. of articles", count => $topthreads ),
-
 
746
      76 );
-
 
747
    print "=" x 76, "\n";
-
 
748
    my $i = 0;
-
 
749
    foreach my $thread (
-
 
750
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
-
 
751
      keys %threads
-
 
752
      )
711
    {
753
    {
-
 
754
      my $name = substr( $thread, 0, 65 );
-
 
755
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-
 
756
        $threads{$thread}{'count'};
-
 
757
      last if ( ++$i == $count );
-
 
758
    }
-
 
759
    print "\n", "=" x 76, "\n";
-
 
760
  }
-
 
761
-
 
762
  ##############################
-
 
763
  ## Show threads by size in KiB
-
 
764
  ##############################
-
 
765
  unless ( $skipSec{6} )
-
 
766
  {
-
 
767
    if ( keys %threads < $topthreads )
-
 
768
    {
712
      $insig = 1;
769
      $count = keys %threads;
713
    }
770
    }
714
    else
771
    else
715
    {
772
    {
-
 
773
      $count = $topthreads;
-
 
774
    }
-
 
775
    printf "%s\n",
-
 
776
      centred( __x( "Top {count} threads by size in KiB", count => $topthreads ),
-
 
777
      76 );
-
 
778
    print "=" x 76, "\n";
-
 
779
    my $i = 0;
-
 
780
    foreach my $thread (
-
 
781
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
-
 
782
      keys %threads
-
 
783
      )
-
 
784
    {
-
 
785
      my $name = substr( $thread, 0, 65 );
-
 
786
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-
 
787
        $threads{$thread}{'size'} / 1024;    #/
-
 
788
      last if ( ++$i == $count );
-
 
789
    }
-
 
790
    print "\n", "=" x 76, "\n";
-
 
791
  }
716
792
-
 
793
  ##################################
-
 
794
  ## Show top 10 cross-posted groups
-
 
795
  ##################################
-
 
796
  unless ( $skipSec{7} )
-
 
797
  {
-
 
798
    delete $crossposts{"$newsgroup_name"};    # don't include ours
717
      ## We must be processing an original line
799
    if ( keys %crossposts < $topcrossposts )
-
 
800
    {
-
 
801
      $count = keys %crossposts;
-
 
802
    }
-
 
803
    else
-
 
804
    {
-
 
805
      $count = $topcrossposts;
-
 
806
    }
-
 
807
    printf "%s\n",
718
      $data{$poster}{orig} += length($_);      # bump count of original chrs
808
      centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
-
 
809
    print "=" x 76, "\n";
-
 
810
    my $i = 0;
-
 
811
    foreach
-
 
812
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
-
 
813
    {
-
 
814
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
-
 
815
        $crossposts{$name};
719
      $totorig += length($_);
816
      last if ( ++$i == $count );
720
    }
817
    }
721
  }    # end for (@body)
818
    print "\n", "=" x 76, "\n";
-
 
819
  }
722
820
-
 
821
  #########################
-
 
822
  ## Show agents and counts
-
 
823
  #########################
-
 
824
  unless ( $skipSec{8} )
-
 
825
  {
-
 
826
    if ( keys %agents < $topagents )
-
 
827
    {
-
 
828
      $count = keys %agents;
-
 
829
    }
-
 
830
    else
-
 
831
    {
-
 
832
      $count = $topagents;
-
 
833
    }
-
 
834
    printf "%s\n",
-
 
835
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
-
 
836
      76 );
-
 
837
    print "=" x 76, "\n";
-
 
838
    my $i = 0;
-
 
839
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
-
 
840
    {
-
 
841
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
723
}    # get_data
842
        $agents{$agent};
-
 
843
      last if ( ++$i == $count );
-
 
844
    }
-
 
845
    print "\n", "=" x 76, "\n";
-
 
846
  }
724
847
725
#########################################
848
  #######################
726
## Count the User-Agents used, collapsing
-
 
727
## different versions into one per agent.
849
  ## Show distinct agents
728
#########################################
850
  #######################
729
sub count_agents
-
 
730
{
-
 
731
POSTER:
-
 
732
  foreach my $poster ( keys %data )
851
  unless ( $skipSec{9} )
733
  {
852
  {
734
    foreach my $agent_name ( keys %distinct_agent )
-
 
735
    {    # check against known ones
-
 
736
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
853
    if ( keys %distinct_agent < $topagents )
737
      {
854
    {
738
        $agents{$agent_name}++;
855
      $count = keys %distinct_agent;
739
        next POSTER;
-
 
740
      }
-
 
741
    }
856
    }
-
 
857
    else
-
 
858
    {
-
 
859
      $count = $topagents;
-
 
860
    }
-
 
861
    printf "%s\n",
-
 
862
      centred(
-
 
863
      __x( "Top {count} user agents by number of posts", count => $topagents ),
-
 
864
      76 );
-
 
865
    print "=" x 76, "\n";
-
 
866
    my $i = 0;
-
 
867
    foreach my $agent (
-
 
868
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
-
 
869
      keys %distinct_agent
-
 
870
      )
-
 
871
    {
-
 
872
      printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
742
    $agents{ $data{$poster}{agent} }++;
873
        $distinct_agent{$agent},
-
 
874
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
-
 
875
      last if ( ++$i == $count );
-
 
876
    }
-
 
877
    print "\n", "=" x 76, "\n";
743
  }
878
  }
744
}    # count_agents
-
 
745
879
746
#############################################
880
  ############################
747
## Set orig/total percentages for all posters
881
  ## Show timezones and counts
748
#############################################
882
  ############################
749
sub fix_percent
-
 
750
{
-
 
751
  foreach my $poster ( keys %data )
883
  unless ( $skipSec{10} )
752
  {
884
  {
753
    my $percent = 100;
885
    if ( keys %tz < $toptz )
754
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
-
 
755
    {
886
    {
756
      $percent =
887
      $count = keys %tz;
757
        $data{$poster}{orig} * 100 /
-
 
758
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
-
 
759
    }
888
    }
760
    elsif ( $data{$poster}{orig} == 0 )
889
    else
761
    {
890
    {
762
      $percent = 0;
891
      $count = $toptz;
763
    }
892
    }
-
 
893
    printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 );
-
 
894
    print "=" x 76, "\n";
-
 
895
    my $i = 0;
-
 
896
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
-
 
897
    {
-
 
898
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
764
    $data{$poster}{percent} = $percent;
899
      last if ( ++$i == $count );
-
 
900
    }
-
 
901
    print "\n", "=" x 76, "\n";
765
  }
902
  }
766
}
903
}
767
904
-
 
905
## helper subs
-
 
906
768
###############################
907
###############################
769
## Right pad a string with '.'s
908
## Right pad a string with '.'s
770
###############################
909
###############################
771
sub rpad
910
sub rpad
772
{
911
{
773
  ## Get text to pad, length to pad, pad chr
912
  ## Get text to pad, length to pad, pad chr
774
  my ( $text, $pad_len, $pad_chr ) = @_;
913
  my ( $text, $pad_len, $pad_chr ) = @_;
775
914
776
  ## DEBUG
915
  ## DEBUG
777
#printf "|%s| = %d\n", $text, length($text);
916
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
778
917
779
  if ( length($text) > $pad_len )
918
  if ( length($text) > $pad_len )
780
  {
919
  {
781
    $text = substr( $text, 0, $pad_len );
920
    $text = substr( $text, 0, $pad_len );
782
  }
921
  }
Line 796... Line 935...
796
}
935
}
797
936
798
###########################
937
###########################
799
## Put commas into a number
938
## Put commas into a number
800
###########################
939
###########################
-
 
940
# Get some of locale's numeric formatting parameters
-
 
941
my ($thousands_sep, $grouping) =
-
 
942
        @{localeconv()}{'thousands_sep', 'grouping'};
-
 
943
# Apply defaults if values are missing
-
 
944
$thousands_sep = ',' unless $thousands_sep;
-
 
945
# grouping and mon_grouping are packed lists
-
 
946
# of small integers (characters) telling the
-
 
947
# grouping (thousand_seps and mon_thousand_seps
-
 
948
# being the group dividers) of numbers and
-
 
949
# monetary quantities.  The integers' meanings:
-
 
950
# 255 means no more grouping, 0 means repeat
-
 
951
# the previous grouping, 1-254 means use that
-
 
952
# as the current grouping.  Grouping goes from
-
 
953
# right to left (low to high digits).  In the
-
 
954
# below we cheat slightly by never using anything
-
 
955
# else than the first grouping (whatever that is).
-
 
956
my @grouping;
-
 
957
if ($grouping) {
-
 
958
    @grouping = unpack("C*", $grouping);
-
 
959
} else {
-
 
960
    @grouping = (3);
-
 
961
}
-
 
962
801
sub commify
963
sub commify
802
{
964
{
803
  local $_ = shift;
965
  local $_ = shift;
804
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
966
  #1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
-
 
967
  $_ = int;    # Chop non-integer part
-
 
968
  1 while
-
 
969
  s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/;
805
  return $_;
970
  return $_;
806
}
971
}
807
972
808
################################################################
973
################################################################
809
## Returns a string with leading and trailing whitespace removed
974
## Returns a string with leading and trailing whitespace removed
Line 817... Line 982...
817
  return $clean;
982
  return $clean;
818
}
983
}
819
984
820
sub usage
985
sub usage
821
{
986
{
822
  print "usage: newstat.pl newsgroupname\n";
987
  print __"usage: newsstat.pl NEWS.GROUP\n";
823
  exit 1;
988
  exit 1;
824
}
989
}
825
990
826
##################################
-
 
827
## Write data structures to a file
-
 
828
##################################
-
 
829
sub write_data
991
sub dmsg
830
{
992
{
831
  open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA"
-
 
832
    or die "Can't create XDATA: $!\n";
-
 
833
  print $OUTF "Data collected from $newsgroup_name\n\n";
-
 
834
  print $OUTF
-
 
835
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
-
 
836
  foreach my $name ( keys %data )
-
 
837
  {
-
 
838
    print $OUTF
-
 
839
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
-
 
840
  }
-
 
841
  print $OUTF
-
 
842
"============================================================================\n";
-
 
843
  print $OUTF "Thread subjects\n";
993
  print STDERR @_, "\n";
844
  print $OUTF
-
 
845
"----------------------------------------------------------------------------\n";
-
 
846
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
-
 
847
  {
-
 
848
    print $OUTF
-
 
849
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
-
 
850
  }
994
}
851
  print $OUTF
-
 
852
"============================================================================\n";
-
 
853
  print $OUTF "Cross-posts\n";
-
 
854
  print $OUTF
-
 
855
"----------------------------------------------------------------------------\n";
-
 
856
  foreach my $name ( sort keys %crossposts )
-
 
857
  {
-
 
858
    print $OUTF "$name : $crossposts{$name}\n";
-
 
859
  }
995
860
  print $OUTF
-
 
861
"============================================================================\n";
-
 
862
  print $OUTF "User agents\n";
-
 
863
  print $OUTF
996
sub dmsg2
864
"----------------------------------------------------------------------------\n";
-
 
865
  foreach my $name ( sort keys %agents )
-
 
866
  {
997
{
867
    print $OUTF "$name : $agents{$name}\n";
-
 
868
  }
-
 
869
  print $OUTF
-
 
870
"============================================================================\n";
-
 
871
  print $OUTF "Time zones\n";
998
  my ( $level, @msg ) = @_;
872
  print $OUTF
-
 
873
"----------------------------------------------------------------------------\n";
-
 
874
  foreach my $name ( sort keys %tz )
-
 
875
  {
-
 
876
    print $OUTF "$name : $tz{$name}\n";
999
  print STDERR @msg, "\n" if $level >= DEBUG;
877
  }
1000
}
878
  close $OUTF;
-
 
879
}    # write_data
-