Subversion Repositories LCARS

Rev

Rev 23 | Rev 25 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 23 Rev 24
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
4
require 5.004;
4
require 5.004;
5
5
6
#use diagnostics;
6
#use diagnostics;
7
use utf8;
7
use utf8;
8
use Encode;
8
use Encode;
9
9
10
use constant DEBUG => 0;
10
use constant DEBUG => 0;
11
11
-
 
12
## newsstat.pl
-
 
13
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
-
 
14
## Based on work by Garry Knight et al.
-
 
15
## 
-
 
16
## This program is free software: you can redistribute it and/or modify
-
 
17
## it under the terms of the GNU General Public License as published by
-
 
18
## the Free Software Foundation, either version 3 of the License, or
-
 
19
## (at your option) any later version.
-
 
20
## 
-
 
21
## This program is distributed in the hope that it will be useful,
-
 
22
## but WITHOUT ANY WARRANTY; without even the implied warranty of
-
 
23
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-
 
24
## GNU General Public License for more details.
-
 
25
## 
-
 
26
## You should have received a copy of the GNU General Public License
-
 
27
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
 
28
12
## Print out all text to STDOUT UTF-8 encoded
29
## Print out all text to STDOUT UTF-8 encoded
13
binmode STDOUT, ':encoding(UTF-8)';
30
binmode STDOUT, ':encoding(UTF-8)';
14
binmode STDERR, ':encoding(UTF-8)';
31
binmode STDERR, ':encoding(UTF-8)';
15
32
16
## L10n
33
## L10n
17
use locale ':not_characters';
34
use locale ':not_characters';
18
35
19
# setlocale( LC_MESSAGES, '' );
36
# setlocale( LC_MESSAGES, '' );
20
require Number::Format;
37
require Number::Format;
21
38
22
## i18n
39
## i18n
23
## FIXME: Automatically include resolved '.' in @INC
40
## FIXME: Automatically include resolved '.' in @INC
24
# print join "\n", @INC;
41
# print join "\n", @INC;
25
42
26
use Locale::TextDomain ('de.pointedears.newsstat');
43
use Locale::TextDomain ('de.pointedears.newsstat');
27
use POSIX              ('locale_h');
44
use POSIX              ('locale_h');
28
use Locale::Messages qw (bind_textdomain_filter
45
use Locale::Messages qw (bind_textdomain_filter
29
  bind_textdomain_codeset
46
  bind_textdomain_codeset
30
  turn_utf_8_on);
47
  turn_utf_8_on);
31
48
32
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
49
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
33
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
50
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
34
51
35
require Mail::Message;
52
require Mail::Message;
36
require DateTime;
53
require DateTime;
37
require DateTime::Format::Mail;
54
require DateTime::Format::Mail;
38
55
39
# See comments in previous example
56
# See comments in previous example
40
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
57
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
41
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
58
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
42
  'decimal_point' };
59
  'decimal_point' };
43
60
44
# Apply defaults if values are missing
61
# Apply defaults if values are missing
45
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
62
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
46
$thousands_sep = ' ' unless $thousands_sep;
63
$thousands_sep = ' ' unless $thousands_sep;
47
64
48
# grouping and mon_grouping are packed lists
65
# grouping and mon_grouping are packed lists
49
# of small integers (characters) telling the
66
# of small integers (characters) telling the
50
# grouping (thousand_seps and mon_thousand_seps
67
# grouping (thousand_seps and mon_thousand_seps
51
# being the group dividers) of numbers and
68
# being the group dividers) of numbers and
52
# monetary quantities.  The integers' meanings:
69
# monetary quantities.  The integers' meanings:
53
# 255 means no more grouping, 0 means repeat
70
# 255 means no more grouping, 0 means repeat
54
# the previous grouping, 1-254 means use that
71
# the previous grouping, 1-254 means use that
55
# as the current grouping.  Grouping goes from
72
# as the current grouping.  Grouping goes from
56
# right to left (low to high digits).  In the
73
# right to left (low to high digits).  In the
57
# below we cheat slightly by never using anything
74
# below we cheat slightly by never using anything
58
# else than the first grouping (whatever that is).
75
# else than the first grouping (whatever that is).
59
my @grouping;
76
my @grouping;
60
if ($grouping)
77
if ($grouping)
61
{
78
{
62
  @grouping = unpack( "C*", $grouping );
79
  @grouping = unpack( "C*", $grouping );
63
}
80
}
64
else
81
else
65
{
82
{
66
  @grouping = (3);
83
  @grouping = (3);
67
}
84
}
68
85
69
## FIXME: Why don't the defaults work already?
86
## FIXME: Why don't the defaults work already?
70
my $formatter = new Number::Format(
87
my $formatter = new Number::Format(
71
  -decimal_point => $decimal_point,
88
  -decimal_point => $decimal_point,
72
  -thousands_sep => $thousands_sep,
89
  -thousands_sep => $thousands_sep,
73
90
74
  # -grouping      => $grouping[0]
91
  # -grouping      => $grouping[0]
75
);
92
);
76
93
77
###################### USER CONFIGURATIONS ############################
94
###################### USER CONFIGURATIONS ############################
78
95
79
## The name of the group to do stats for
96
## The name of the group to do stats for
80
my $newsgroup_name = $ARGV[0];
97
my $newsgroup_name = $ARGV[0];
81
$newsgroup_name or usage();
98
$newsgroup_name or usage();
82
99
83
## Check for removal flags
100
## Check for removal flags
84
my $ix;
101
my $ix;
85
my $j;
102
my $j;
86
my %skipSec;
103
my %skipSec;
87
my @skiplist;
104
my @skiplist;
88
my $args = @ARGV;
105
my $args = @ARGV;
89
for ( $ix = 1 ; $ix < $args ; $ix++ )
106
for ( $ix = 1 ; $ix < $args ; $ix++ )
90
{
107
{
91
  $j = $ix + 1;
108
  $j = $ix + 1;
92
  if ( $ARGV[$ix] eq "-x" )
109
  if ( $ARGV[$ix] eq "-x" )
93
  {
110
  {
94
    @skiplist = split( ",", $ARGV[$j] );
111
    @skiplist = split( ",", $ARGV[$j] );
95
  }
112
  }
96
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
113
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
97
  {
114
  {
98
    @skiplist = split( ",", $1 );
115
    @skiplist = split( ",", $1 );
99
  }
116
  }
100
}
117
}
101
foreach (@skiplist)
118
foreach (@skiplist)
102
{
119
{
103
  $skipSec{$_} = 1;
120
  $skipSec{$_} = 1;
104
}
121
}
105
122
106
## Leafnode users will want /var/spool/news for this variable.
123
## Leafnode users will want /var/spool/news for this variable.
107
my $news = "/var/spool/news/";
124
my $news = "/var/spool/news/";
108
125
109
## Number of top or bottom posters to show
126
## Number of top or bottom posters to show
110
my $topposters = 20;
127
my $topposters = 20;
111
128
112
## Number of threads we want to know about
129
## Number of threads we want to know about
113
my $topthreads = 20;
130
my $topthreads = 20;
114
131
115
## Number of cross-posted threads to show
132
## Number of cross-posted threads to show
116
my $topcrossposts = 10;
133
my $topcrossposts = 10;
117
134
118
## Number of agents we list
135
## Number of agents we list
119
my $topagents = 10;
136
my $topagents = 10;
120
137
121
## Number of time zones to show
138
## Number of time zones to show
122
my $toptz = 10;
139
my $toptz = 10;
123
140
124
###################### DATA STRUCTURES ######################
141
###################### DATA STRUCTURES ######################
125
my $group = $newsgroup_name;
142
my $group = $newsgroup_name;
126
$group =~ s!\.!/!g;
143
$group =~ s!\.!/!g;
127
my %data;    # name, count, agent, total, orig, quoted
144
my %data;    # name, count, agent, total, orig, quoted
128
my $totsize = 0;    # holds total sizes of all files
145
my $totsize = 0;    # holds total sizes of all files
129
my %crossposts;     # group, count
146
my %crossposts;     # group, count
130
my %threads;        # subject, count
147
my %threads;        # subject, count
131
my $replies   = 0;  # total no. of replies
148
my $replies   = 0;  # total no. of replies
132
my $origposts = 0;  # total no. of original posts
149
my $origposts = 0;  # total no. of original posts
133
my %tz;             # timezones by count
150
my %tz;             # timezones by count
134
my $earliest;       # earliest article we have found
151
my $earliest;       # earliest article we have found
135
my $latest;         # latest article we have found
152
my $latest;         # latest article we have found
136
my $totheader = 0;  # total size of header material
153
my $totheader = 0;  # total size of header material
137
my $totbody   = 0;  # total size of body material
154
my $totbody   = 0;  # total size of body material
138
my $totsig    = 0;  # total size of sig material
155
my $totsig    = 0;  # total size of sig material
139
my $totquoted = 0;  # total size of quoted material
156
my $totquoted = 0;  # total size of quoted material
140
my $totorig   = 0;  # total size of original material
157
my $totorig   = 0;  # total size of original material
141
my $totalposts;     # total no. of posts considered
158
my $totalposts;     # total no. of posts considered
142
my %distinct_agent;
159
my %distinct_agent;
143
160
144
## Used to hold counts of User Agents used
161
## Used to hold counts of User Agents used
145
my %agents = (
162
my %agents = (
146
  "Compuserver"               => 0,
163
  "Compuserver"               => 0,
147
  "Foorum"                    => 0,
164
  "Foorum"                    => 0,
148
  "Forte Agent"               => 0,
165
  "Forte Agent"               => 0,
149
  "Forte Free Agent"          => 0,
166
  "Forte Free Agent"          => 0,
150
  "Gnus"                      => 0,
167
  "Gnus"                      => 0,
151
  "KNode"                     => 0,
168
  "KNode"                     => 0,
152
  "MacSOUP"                   => 0,
169
  "MacSOUP"                   => 0,
153
  "MT-NewsWatcher"            => 0,
170
  "MT-NewsWatcher"            => 0,
154
  "MicroPlanet Gravity"       => 0,
171
  "MicroPlanet Gravity"       => 0,
155
  "Microsoft Outlook Express" => 0,
172
  "Microsoft Outlook Express" => 0,
156
  "Microsoft Windows Mail"    => 0,
173
  "Microsoft Windows Mail"    => 0,
157
  "Mozilla"                   => 0,
174
  "Mozilla"                   => 0,
158
  "News Rover"                => 0,
175
  "News Rover"                => 0,
159
  "NN"                        => 0,
176
  "NN"                        => 0,
160
  "Pan"                       => 0,
177
  "Pan"                       => 0,
161
  "rn"                        => 0,
178
  "rn"                        => 0,
162
  "slrn"                      => 0,
179
  "slrn"                      => 0,
163
  "Sylpheed"                  => 0,
180
  "Sylpheed"                  => 0,
164
  "tin"                       => 0,
181
  "tin"                       => 0,
165
  "VSoup"                     => 0,
182
  "VSoup"                     => 0,
166
  "WebTV"                     => 0,
183
  "WebTV"                     => 0,
167
  "Xnews"                     => 0,
184
  "Xnews"                     => 0,
168
);
185
);
169
186
170
my $datetime_parser = DateTime::Format::Mail->new();
187
my $datetime_parser = DateTime::Format::Mail->new();
171
$datetime_parser->loose();
188
$datetime_parser->loose();
172
189
173
my $today = DateTime->today( time_zone => 'UTC' );
190
my $today = DateTime->today( time_zone => 'UTC' );
174
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
191
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
175
my $start      = int $prev_month->strftime('%s');
192
my $start      = int $prev_month->strftime('%s');
176
my $numdays    = int DateTime->last_day_of_month(
193
my $numdays    = int DateTime->last_day_of_month(
177
  year      => $prev_month->year(),
194
  year      => $prev_month->year(),
178
  month     => $prev_month->month(),
195
  month     => $prev_month->month(),
179
  time_zone => $prev_month->time_zone(),
196
  time_zone => $prev_month->time_zone(),
180
)->day();
197
)->day();
181
my $end = int $today->clone()->set_day(1)->strftime('%s');
198
my $end = int $today->clone()->set_day(1)->strftime('%s');
182
199
183
dmsg( $start, " to ", $end ) if DEBUG;
200
dmsg( $start, " to ", $end ) if DEBUG;
184
201
185
chdir("$news$group")
202
chdir("$news$group")
186
  or die __x(
203
  or die __x(
187
  "Can't cd to {newsgroup}: {error}\n",
204
  "Can't cd to {newsgroup}: {error}\n",
188
  newsgroup => "$news$group",
205
  newsgroup => "$news$group",
189
  error     => $!
206
  error     => $!
190
  );
207
  );
191
opendir( DIR, "." )
208
opendir( DIR, "." )
192
  or die __x(
209
  or die __x(
193
  "Can't open {newsgroup}: {error}\n",
210
  "Can't open {newsgroup}: {error}\n",
194
  newsgroup => "$news$group",
211
  newsgroup => "$news$group",
195
  error     => $!
212
  error     => $!
196
  );
213
  );
197
214
198
while ( defined( my $filename = readdir(DIR) ) )
215
while ( defined( my $filename = readdir(DIR) ) )
199
{
216
{
200
  next unless -f $filename;    # only want real files
217
  next unless -f $filename;    # only want real files
201
  next if ( $filename eq ".overview" );    # real articles only
218
  next if ( $filename eq ".overview" );    # real articles only
202
219
203
  get_article($filename);                  # read in the article
220
  get_article($filename);                  # read in the article
204
}
221
}
205
closedir(DIR);                             # finished with the directory
222
closedir(DIR);                             # finished with the directory
206
223
207
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
224
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
208
225
209
## Post-processing
226
## Post-processing
210
count_agents();                            # count agents, collapsing versions
227
count_agents();                            # count agents, collapsing versions
211
fix_percent();
228
fix_percent();
212
229
213
write_data();
230
write_data();
214
display_results();
231
display_results();
215
232
216
########################################
233
########################################
217
## Get current article's header and body
234
## Get current article's header and body
218
########################################
235
########################################
219
sub get_article
236
sub get_article
220
{
237
{
221
  my $filename = shift;
238
  my $filename = shift;
222
239
223
  open( my $FILE, '<', $filename )
240
  open( my $FILE, '<', $filename )
224
    or
241
    or
225
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
242
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
226
  my $msg       = Mail::Message->read($FILE);
243
  my $msg       = Mail::Message->read($FILE);
227
  my $timestamp = $msg->timestamp();
244
  my $timestamp = $msg->timestamp();
228
  my $date      = $msg->study('Date');
245
  my $date      = $msg->study('Date');
229
246
230
  ## Disregard article if timestamp is not in range
247
  ## Disregard article if timestamp is not in range
231
  dmsg($timestamp) if DEBUG;
248
  dmsg($timestamp) if DEBUG;
232
  if ( $timestamp < $start or $timestamp >= $end )
249
  if ( $timestamp < $start or $timestamp >= $end )
233
  {
250
  {
234
    dmsg("Posting on $date ignored.") if DEBUG;
251
    dmsg("Posting on $date ignored.") if DEBUG;
235
    return;
252
    return;
236
  }
253
  }
237
254
238
  $totalposts++;    # bump count of articles considered
255
  $totalposts++;    # bump count of articles considered
239
256
240
  ## DEBUG
257
  ## DEBUG
241
  dmsg($date) if DEBUG;
258
  dmsg($date) if DEBUG;
242
259
243
  ## get stats about the file itself
260
  ## get stats about the file itself
244
  my $filesize = -s $filename;    # get total size of file
261
  my $filesize = -s $filename;    # get total size of file
245
  $totsize += $filesize;          # bump total sizes of all files
262
  $totsize += $filesize;          # bump total sizes of all files
246
263
247
  if ( ( not defined $earliest ) or $timestamp < $earliest )
264
  if ( ( not defined $earliest ) or $timestamp < $earliest )
248
  {
265
  {
249
    $earliest = $timestamp;
266
    $earliest = $timestamp;
250
  }
267
  }
251
  elsif ( ( not defined $latest ) or $timestamp > $latest )
268
  elsif ( ( not defined $latest ) or $timestamp > $latest )
252
  {
269
  {
253
    $latest = $timestamp;
270
    $latest = $timestamp;
254
  }
271
  }
255
272
256
  #print "timestamp: $timestamp\n";
273
  #print "timestamp: $timestamp\n";
257
274
258
  ## count header size
275
  ## count header size
259
  $totheader += $msg->head()->size();
276
  $totheader += $msg->head()->size();
260
277
261
  ## get the poster's name (MIME-decoded, in UTF-8)
278
  ## get the poster's name (MIME-decoded, in UTF-8)
262
  my $poster = $msg->study('From');
279
  my $poster = $msg->study('From');
263
  if ( defined $poster )
280
  if ( defined $poster )
264
  {
281
  {
265
    ## Convert old to new format
282
    ## Convert old to new format
266
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
283
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
267
284
268
    ## Collapse whitespace
285
    ## Collapse whitespace
269
    $poster =~ s/\s+/ /g;
286
    $poster =~ s/\s+/ /g;
270
287
271
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
288
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
272
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
289
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
273
290
274
    ## DEBUG
291
    ## DEBUG
275
    dmsg($poster) if DEBUG;
292
    dmsg($poster) if DEBUG;
276
293
277
    ## seen this one before?
294
    ## seen this one before?
278
    if ( !defined( $data{$poster} ) )
295
    if ( !defined( $data{$poster} ) )
279
    {
296
    {
280
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
297
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
281
      $data{$poster}{'orig'}   = 0;
298
      $data{$poster}{'orig'}   = 0;
282
      $data{$poster}{'quoted'} = 0;
299
      $data{$poster}{'quoted'} = 0;
283
    }
300
    }
284
    $data{$poster}{'count'}++;                    # bump count for this poster
301
    $data{$poster}{'count'}++;                    # bump count for this poster
285
    $data{$poster}{'size'} += $filesize;          # total size of file
302
    $data{$poster}{'size'} += $filesize;          # total size of file
286
303
287
    ## The User-Agent and/or X-Newsreader fields
304
    ## The User-Agent and/or X-Newsreader fields
288
    ## for User-Agent by poster
305
    ## for User-Agent by poster
289
    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
306
    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
290
    if ( defined $ua )
307
    if ( defined $ua )
291
    {
308
    {
292
      $data{$poster}{'agent'} = $ua;
309
      $data{$poster}{'agent'} = $ua;
293
310
294
      ## DEBUG
311
      ## DEBUG
295
      dmsg($ua) if DEBUG;
312
      dmsg($ua) if DEBUG;
296
    }
313
    }
297
314
298
    ## The User Agent for User-Agent by number of posts
315
    ## The User Agent for User-Agent by number of posts
299
    get_agent($msg);
316
    get_agent($msg);
300
317
301
    ## Get all cross-posted newsgroups
318
    ## Get all cross-posted newsgroups
302
    for ( split( /,/, $msg->study('Newsgroups') ) )
319
    for ( split( /,/, $msg->study('Newsgroups') ) )
303
    {
320
    {
304
      $crossposts{$_}++;    # bump count for each
321
      $crossposts{$_}++;    # bump count for each
305
    }
322
    }
306
323
307
    ## Get threads
324
    ## Get threads
308
    my $thread = $msg->study('Subject');
325
    my $thread = $msg->study('Subject');
309
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
326
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
310
    $thread =~ s/\s+/ /g;       # collapse whitespace
327
    $thread =~ s/\s+/ /g;       # collapse whitespace
311
    $threads{$thread}{'count'}++;    # bump count of this subject
328
    $threads{$thread}{'count'}++;    # bump count of this subject
312
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
329
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
313
330
314
    ## Is this an original post or a reply?
331
    ## Is this an original post or a reply?
315
    if ( defined $msg->study('References') )
332
    if ( defined $msg->study('References') )
316
    {
333
    {
317
      $replies++;
334
      $replies++;
318
    }
335
    }
319
    else
336
    else
320
    {
337
    {
321
      $origposts++;
338
      $origposts++;
322
    }
339
    }
323
340
324
    ## Get the time zone
341
    ## Get the time zone
325
    my $datetime = $datetime_parser->parse_datetime($date);
342
    my $datetime = $datetime_parser->parse_datetime($date);
326
    my $tz       = $datetime->strftime('%z');
343
    my $tz       = $datetime->strftime('%z');
327
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
344
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
328
    $tz{$tz}++;
345
    $tz{$tz}++;
329
346
330
    ## DEBUG
347
    ## DEBUG
331
    dmsg($tz) if DEBUG;
348
    dmsg($tz) if DEBUG;
332
349
333
#### Now analyse the body text ####
350
#### Now analyse the body text ####
334
    my $body = $msg->body();
351
    my $body = $msg->body();
335
352
336
    my $insig = 0;
353
    my $insig = 0;
337
    my @body  = $body->lines;
354
    my @body  = $body->lines;
338
    for (@body)
355
    for (@body)
339
    {
356
    {
340
      $totbody += length($_);    # bump total body size
357
      $totbody += length($_);    # bump total body size
341
      next if (m{^$>}o);         # don't count blank lines in body
358
      next if (m{^$>}o);         # don't count blank lines in body
342
      if ( $insig == 1 )
359
      if ( $insig == 1 )
343
      {
360
      {
344
361
345
        # bump total sig size
362
        # bump total sig size
346
        $totsig += length($_);
363
        $totsig += length($_);
347
      }
364
      }
348
      ## are we in a quote line?
365
      ## are we in a quote line?
349
      ## Bill Unruh uses ] quotes, and another poster uses ::
366
      ## Bill Unruh uses ] quotes, and another poster uses ::
350
      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
367
      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
351
      {
368
      {
352
        ## bump count of quoted chrs
369
        ## bump count of quoted chrs
353
        $data{$poster}{'quoted'} += length($_);
370
        $data{$poster}{'quoted'} += length($_);
354
        $totquoted += length($_);
371
        $totquoted += length($_);
355
      }
372
      }
356
      elsif (/^-- $/)
373
      elsif (/^-- $/)
357
      {
374
      {
358
        $insig = 1;
375
        $insig = 1;
359
      }
376
      }
360
      else
377
      else
361
      {
378
      {
362
        ## We must be processing an original line
379
        ## We must be processing an original line
363
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
380
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
364
        $totorig += length($_);
381
        $totorig += length($_);
365
      }
382
      }
366
    }
383
    }
367
384
368
    # end for (@body)
385
    # end for (@body)
369
  }
386
  }
370
387
371
  close($FILE);
388
  close($FILE);
372
}
389
}
373
390
374
sub get_agent
391
sub get_agent
375
{
392
{
376
  my $msg = shift;
393
  my $msg = shift;
377
394
378
  my $ua =
395
  my $ua =
379
       $msg->study('User-Agent')
396
       $msg->study('User-Agent')
380
    or $msg->study('X-Newsreader')
397
    or $msg->study('X-Newsreader')
381
    or $msg->study('X-Mailer');
398
    or $msg->study('X-Mailer');
382
  if ( not defined $ua )
399
  if ( not defined $ua )
383
  {
400
  {
384
    my $org = $msg->study('Organization');
401
    my $org = $msg->study('Organization');
385
    if ( defined $org
402
    if ( defined $org
386
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
403
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
387
    {
404
    {
388
      $ua = $org;
405
      $ua = $org;
389
    }
406
    }
390
    elsif ( $msg->study('Message-ID') =~ /pine/i )
407
    elsif ( $msg->study('Message-ID') =~ /pine/i )
391
    {
408
    {
392
      $ua = "Pine";
409
      $ua = "Pine";
393
    }
410
    }
394
  }
411
  }
395
412
396
  ## Hopefully found UA, else set to unknown
413
  ## Hopefully found UA, else set to unknown
397
  if ( not defined $ua )
414
  if ( not defined $ua )
398
  {
415
  {
399
    $ua = __ "unknown";
416
    $ua = __ "unknown";
400
  }
417
  }
401
418
402
  $ua = clean($ua);
419
  $ua = clean($ua);
403
420
404
  my $raw   = $ua;
421
  my $raw   = $ua;
405
  my $agent = $raw;
422
  my $agent = $raw;
406
423
407
  ## strip http
424
  ## strip http
408
  if ( $raw =~ /.*http.*/ )
425
  if ( $raw =~ /.*http.*/ )
409
  {
426
  {
410
    $raw =~ s!posted via!!i;
427
    $raw =~ s!posted via!!i;
411
    $raw =~ s!http://!!g;
428
    $raw =~ s!http://!!g;
412
    $raw =~ s!/!!g;
429
    $raw =~ s!/!!g;
413
    $raw =~ s! !!g;
430
    $raw =~ s! !!g;
414
  }
431
  }
415
432
416
  ## Fix Outlook from Mac
433
  ## Fix Outlook from Mac
417
  if ( $raw =~ /^microsoft/i )
434
  if ( $raw =~ /^microsoft/i )
418
  {
435
  {
419
    $raw =~ s/-/ /g;
436
    $raw =~ s/-/ /g;
420
  }
437
  }
421
438
422
  ## Pick out the popular agents
439
  ## Pick out the popular agents
423
  if ( $raw =~ /(outlook express)/i
440
  if ( $raw =~ /(outlook express)/i
424
    || $raw =~ /(windows mail)/i
441
    || $raw =~ /(windows mail)/i
425
    || $raw =~ /(microplanet gravity)/i
442
    || $raw =~ /(microplanet gravity)/i
426
    || $raw =~ /(news rover)/i
443
    || $raw =~ /(news rover)/i
427
    || $raw =~ /(forte agent)/i
444
    || $raw =~ /(forte agent)/i
428
    || $raw =~ /(forte free agent)/i )
445
    || $raw =~ /(forte free agent)/i )
429
  {
446
  {
430
    $agent = $1;
447
    $agent = $1;
431
  }
448
  }
432
  elsif (
449
  elsif (
433
    $raw =~ /^(
450
    $raw =~ /^(
434
        pan
451
        pan
435
       |sylpheed
452
       |sylpheed
436
       |slrn
453
       |slrn
437
       |mozilla
454
       |mozilla
438
       |knode
455
       |knode
439
       |tin
456
       |tin
440
       |hamster
457
       |hamster
441
       |xrn
458
       |xrn
442
       |xnews
459
       |xnews
443
       |aol
460
       |aol
444
       |gnus
461
       |gnus
445
       |krn
462
       |krn
446
       |macsoup
463
       |macsoup
447
       |messenger
464
       |messenger
448
       |openxp
465
       |openxp
449
       |pine
466
       |pine
450
       |thoth
467
       |thoth
451
       |turnpike
468
       |turnpike
452
       |winvn
469
       |winvn
453
       |vsoup
470
       |vsoup
454
       |google
471
       |google
455
       |supernews
472
       |supernews
456
       |nn
473
       |nn
457
       |rn
474
       |rn
458
       |007
475
       |007
459
       |webtv
476
       |webtv
460
       |compuserve
477
       |compuserve
461
       )/ix
478
       )/ix
462
    )
479
    )
463
  {
480
  {
464
    $agent = $1;
481
    $agent = $1;
465
  }
482
  }
466
  else
483
  else
467
  {
484
  {
468
    ## Clean up unknown agents
485
    ## Clean up unknown agents
469
    if ( $raw =~ m!^(.*?)/! )
486
    if ( $raw =~ m!^(.*?)/! )
470
    {
487
    {
471
      $agent = $1;
488
      $agent = $1;
472
    }
489
    }
473
    elsif ( $raw =~ /^(\w*)\d.*/ )
490
    elsif ( $raw =~ /^(\w*)\d.*/ )
474
    {
491
    {
475
      $agent = $1;
492
      $agent = $1;
476
    }
493
    }
477
  }
494
  }
478
495
479
  $distinct_agent{$agent}++;
496
  $distinct_agent{$agent}++;
480
  return $agent;
497
  return $agent;
481
}
498
}
482
## get_agent
499
## get_agent
483
500
484
#########################################
501
#########################################
485
## Count the User-Agents used, collapsing
502
## Count the User-Agents used, collapsing
486
## different versions into one per agent.
503
## different versions into one per agent.
487
#########################################
504
#########################################
488
sub count_agents
505
sub count_agents
489
{
506
{
490
POSTER:
507
POSTER:
491
  foreach my $poster ( keys %data )
508
  foreach my $poster ( keys %data )
492
  {
509
  {
493
    foreach my $agent_name ( keys %distinct_agent )
510
    foreach my $agent_name ( keys %distinct_agent )
494
    {    # check against known ones
511
    {    # check against known ones
495
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
512
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
496
      {
513
      {
497
        $agents{$agent_name}++;
514
        $agents{$agent_name}++;
498
        next POSTER;
515
        next POSTER;
499
      }
516
      }
500
    }
517
    }
501
    $agents{ $data{$poster}{'agent'} }++;
518
    $agents{ $data{$poster}{'agent'} }++;
502
  }
519
  }
503
}    # count_agents
520
}    # count_agents
504
521
505
#############################################
522
#############################################
506
## Set orig/total percentages for all posters
523
## Set orig/total percentages for all posters
507
#############################################
524
#############################################
508
sub fix_percent
525
sub fix_percent
509
{
526
{
510
  foreach my $poster ( keys %data )
527
  foreach my $poster ( keys %data )
511
  {
528
  {
512
    my $percent = 100;
529
    my $percent = 100;
513
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
530
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
514
    {
531
    {
515
      $percent =
532
      $percent =
516
        $data{$poster}{'orig'} * 100 /
533
        $data{$poster}{'orig'} * 100 /
517
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
534
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
518
    }
535
    }
519
    elsif ( $data{$poster}{'orig'} == 0 )
536
    elsif ( $data{$poster}{'orig'} == 0 )
520
    {
537
    {
521
      $percent = 0;
538
      $percent = 0;
522
    }
539
    }
523
    $data{$poster}{'percent'} = $percent;
540
    $data{$poster}{'percent'} = $percent;
524
  }
541
  }
525
}
542
}
526
## fix_percent
543
## fix_percent
527
544
528
##################################
545
##################################
529
## Write data structures to a file
546
## Write data structures to a file
530
##################################
547
##################################
531
sub write_data
548
sub write_data
532
{
549
{
533
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
550
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
534
    or die __x( "Can't create XDATA: {error}\n", error => $! );
551
    or die __x( "Can't create XDATA: {error}\n", error => $! );
535
  print $OUTF "Data collected from $newsgroup_name\n\n";
552
  print $OUTF "Data collected from $newsgroup_name\n\n";
536
  print $OUTF
553
  print $OUTF
537
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
554
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
538
  foreach my $name ( keys %data )
555
  foreach my $name ( keys %data )
539
  {
556
  {
540
    print $OUTF
557
    print $OUTF
541
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
558
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
542
  }
559
  }
543
  print $OUTF
560
  print $OUTF
544
"============================================================================\n";
561
"============================================================================\n";
545
  print $OUTF "Thread subjects\n";
562
  print $OUTF "Thread subjects\n";
546
  print $OUTF
563
  print $OUTF
547
"----------------------------------------------------------------------------\n";
564
"----------------------------------------------------------------------------\n";
548
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
565
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
549
  {
566
  {
550
    print $OUTF
567
    print $OUTF
551
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
568
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
552
  }
569
  }
553
  print $OUTF
570
  print $OUTF
554
"============================================================================\n";
571
"============================================================================\n";
555
  print $OUTF "Cross-posts\n";
572
  print $OUTF "Cross-posts\n";
556
  print $OUTF
573
  print $OUTF
557
"----------------------------------------------------------------------------\n";
574
"----------------------------------------------------------------------------\n";
558
  foreach my $name ( sort keys %crossposts )
575
  foreach my $name ( sort keys %crossposts )
559
  {
576
  {
560
    print $OUTF "$name : $crossposts{$name}\n";
577
    print $OUTF "$name : $crossposts{$name}\n";
561
  }
578
  }
562
  print $OUTF
579
  print $OUTF
563
"============================================================================\n";
580
"============================================================================\n";
564
  print $OUTF "User agents\n";
581
  print $OUTF "User agents\n";
565
  print $OUTF
582
  print $OUTF
566
"----------------------------------------------------------------------------\n";
583
"----------------------------------------------------------------------------\n";
567
  foreach my $name ( sort keys %agents )
584
  foreach my $name ( sort keys %agents )
568
  {
585
  {
569
    print $OUTF "$name : $agents{$name}\n";
586
    print $OUTF "$name : $agents{$name}\n";
570
  }
587
  }
571
  print $OUTF
588
  print $OUTF
572
"============================================================================\n";
589
"============================================================================\n";
573
  print $OUTF "Time zones\n";
590
  print $OUTF "Time zones\n";
574
  print $OUTF
591
  print $OUTF
575
"----------------------------------------------------------------------------\n";
592
"----------------------------------------------------------------------------\n";
576
  foreach my $name ( sort keys %tz )
593
  foreach my $name ( sort keys %tz )
577
  {
594
  {
578
    print $OUTF "$name : $tz{$name}\n";
595
    print $OUTF "$name : $tz{$name}\n";
579
  }
596
  }
580
  close $OUTF;
597
  close $OUTF;
581
}    # write_data
598
}    # write_data
582
599
583
sub display_results
600
sub display_results
584
{
601
{
585
  #################### DISPLAY RESULTS #####################
602
  #################### DISPLAY RESULTS #####################
586
  println( "=" x 76 );
603
  println( "=" x 76 );
587
  printf "%s\n",
604
  printf "%s\n",
588
    centred(
605
    centred(
589
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
606
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
590
    76 );
607
    76 );
591
  println( "=" x 76 );
608
  println( "=" x 76 );
592
  printf "%s\n",
609
  printf "%s\n",
593
    centred(
610
    centred(
594
    __(
611
    __(
595
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
612
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
596
    ),
613
    ),
597
    76
614
    76
598
    );
615
    );
599
  print "\n\n";
616
  print "\n\n";
600
  printf __"Total posts considered: %s over %d days\n",
617
  printf __"Total posts considered: %s over %d days\n",
601
    $formatter->format_number($totalposts),
618
    $formatter->format_number($totalposts),
602
    $formatter->format_number($numdays);
619
    $formatter->format_number($numdays);
603
  my $time_locale       = setlocale(LC_TIME);
620
  my $time_locale       = setlocale(LC_TIME);
604
  my $earliest_datetime = DateTime->from_epoch(
621
  my $earliest_datetime = DateTime->from_epoch(
605
    epoch     => $earliest,
622
    epoch     => $earliest,
606
    locale    => $time_locale,
623
    locale    => $time_locale,
607
    time_zone => 'UTC',
624
    time_zone => 'UTC',
608
  );
625
  );
609
  my $latest_datetime = DateTime->from_epoch(
626
  my $latest_datetime = DateTime->from_epoch(
610
    epoch     => $latest,
627
    epoch     => $latest,
611
    locale    => $time_locale,
628
    locale    => $time_locale,
612
    time_zone => 'UTC',
629
    time_zone => 'UTC',
613
  );
630
  );
614
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
631
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
615
  printf __"Earliest article: %s\n",
632
  printf __"Earliest article: %s\n",
616
    $earliest_datetime->strftime($datetime_format);
633
    $earliest_datetime->strftime($datetime_format);
617
  printf __"Latest article:   %s\n",
634
  printf __"Latest article:   %s\n",
618
    $latest_datetime->strftime($datetime_format);
635
    $latest_datetime->strftime($datetime_format);
619
  printf __"Original articles: %s; replies: %s\n",
636
  printf __"Original articles: %s; replies: %s\n",
620
    $formatter->format_number($origposts),
637
    $formatter->format_number($origposts),
621
    $formatter->format_number($replies);
638
    $formatter->format_number($replies);
622
  printf __"Total size of posts: %s bytes (%s)" . "\n",
639
  printf __"Total size of posts: %s bytes (%s)" . "\n",
623
    $formatter->format_number($totsize),
640
    $formatter->format_number($totsize),
624
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
641
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
625
  printf __"Average %s articles per day, %s per day, %s bytes per article\n",
642
  printf __"Average %s articles per day, %s per day, %s bytes per article\n",
626
    $formatter->format_number( int( $totalposts / $numdays ) ),
643
    $formatter->format_number( int( $totalposts / $numdays ) ),
627
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
644
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
628
    $formatter->format_number( int( $totsize / $totalposts ) );
645
    $formatter->format_number( int( $totsize / $totalposts ) );
629
646
630
  my $count = keys %data;
647
  my $count = keys %data;
631
  printf __"Total headers: %s; bodies: %s\n",
648
  printf __"Total headers: %s; bodies: %s\n",
632
    $formatter->format_bytes(
649
    $formatter->format_bytes(
633
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
650
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
634
    ),
651
    ),
635
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
652
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
636
  printf __
653
  printf __
637
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
654
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
638
    $formatter->format_bytes(
655
    $formatter->format_bytes(
639
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
656
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
640
    ),
657
    ),
641
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
658
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
642
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) ),
659
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) ),
643
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
660
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
644
  printf __"Total number of posters: %s, average %s per poster\n",
661
  printf __"Total number of posters: %s, average %s per poster\n",
645
    $formatter->format_number($count),
662
    $formatter->format_number($count),
646
    $formatter->format_bytes( $totsize / $count,
663
    $formatter->format_bytes( $totsize / $count,
647
    ( 'precision' => 1, 'mode' => 'iec' ) );
664
    ( 'precision' => 1, 'mode' => 'iec' ) );
648
  $count = keys %threads;
665
  $count = keys %threads;
649
  printf __"Total number of threads: %s, average %s per thread\n",
666
  printf __"Total number of threads: %s, average %s per thread\n",
650
    $formatter->format_number($count),
667
    $formatter->format_number($count),
651
    $formatter->format_bytes( $totsize / $count,
668
    $formatter->format_bytes( $totsize / $count,
652
    ( 'precision' => 1, 'mode' => 'iec' ) );
669
    ( 'precision' => 1, 'mode' => 'iec' ) );
653
  printf __"Total number of user agents: %d\n",
670
  printf __"Total number of user agents: %d\n",
654
    $formatter->format_number( scalar keys %agents );
671
    $formatter->format_number( scalar keys %agents );
655
  print "\n", "=" x 76, "\n"  ;
672
  print "\n", "=" x 76, "\n"  ;
656
  ########################################
673
  ########################################
657
  ## Show posters by article count  Sec 1;
674
  ## Show posters by article count  Sec 1;
658
  ########################################
675
  ########################################
659
  unless ( $skipSec{1} )
676
  unless ( $skipSec{1} )
660
  {
677
  {
661
    if ( keys %data < $topposters )
678
    if ( keys %data < $topposters )
662
    {
679
    {
663
      $count = keys %data;
680
      $count = keys %data;
664
    }
681
    }
665
    else
682
    else
666
    {
683
    {
667
      $count = $topposters;
684
      $count = $topposters;
668
    }
685
    }
669
    printf "%s\n",
686
    printf "%s\n",
670
      centred(
687
      centred(
671
      __x( "Top {count} posters by number of articles", count => $topposters ),
688
      __x( "Top {count} posters by number of articles", count => $topposters ),
672
      76
689
      76
673
      );
690
      );
674
    print "=" x 76, "\n";
691
    print "=" x 76, "\n";
675
    my $i = 0;
692
    my $i = 0;
676
    foreach
693
    foreach
677
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
694
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
678
    {
695
    {
679
      my $name = substr( $poster, 0, 65 );
696
      my $name = substr( $poster, 0, 65 );
680
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
697
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
681
        $data{$poster}{count};
698
        $data{$poster}{count};
682
      last if ( ++$i == $count );
699
      last if ( ++$i == $count );
683
    }
700
    }
684
    print "\n", "=" x 76, "\n";
701
    print "\n", "=" x 76, "\n";
685
  }
702
  }
686
703
687
  ######################################
704
  ######################################
688
  ## Show posters by size in KiB  Sec 2;
705
  ## Show posters by size in KiB  Sec 2;
689
  ######################################
706
  ######################################
690
  unless ( $skipSec{2} )
707
  unless ( $skipSec{2} )
691
  {
708
  {
692
    if ( keys %data < $topposters )
709
    if ( keys %data < $topposters )
693
    {
710
    {
694
      $count = keys %data;
711
      $count = keys %data;
695
    }
712
    }
696
    else
713
    else
697
    {
714
    {
698
      $count = $topposters;
715
      $count = $topposters;
699
    }
716
    }
700
    printf "%s\n",
717
    printf "%s\n",
701
      centred(
718
      centred(
702
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
719
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
703
      76
720
      76
704
      );
721
      );
705
    print "=" x 76, "\n";
722
    print "=" x 76, "\n";
706
    my $i = 0;
723
    my $i = 0;
707
    foreach
724
    foreach
708
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
725
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
709
    {
726
    {
710
      my $name = substr( $poster, 0, 62 );
727
      my $name = substr( $poster, 0, 62 );
711
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
728
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
712
        $data{$poster}{size} / 1024;    #/
729
        $data{$poster}{size} / 1024;    #/
713
      last if ( ++$i == $count );
730
      last if ( ++$i == $count );
714
    }
731
    }
715
    print "\n", "=" x 76, "\n";
732
    print "\n", "=" x 76, "\n";
716
  }
733
  }
717
734
718
  #####################################
735
  #####################################
719
  ## Show top posters for original text
736
  ## Show top posters for original text
720
  #####################################
737
  #####################################
721
  unless ( $skipSec{3} )
738
  unless ( $skipSec{3} )
722
  {
739
  {
723
    if ( keys %data < $topposters )
740
    if ( keys %data < $topposters )
724
    {
741
    {
725
      $count = keys %data;
742
      $count = keys %data;
726
    }
743
    }
727
    else
744
    else
728
    {
745
    {
729
      $count = $topposters;
746
      $count = $topposters;
730
    }
747
    }
731
    printf "%s\n",
748
    printf "%s\n",
732
      centred(
749
      centred(
733
      __x(
750
      __x(
734
        "Top {count} responders by original text (> 5 posts)",
751
        "Top {count} responders by original text (> 5 posts)",
735
        count => $topposters
752
        count => $topposters
736
      ),
753
      ),
737
      76
754
      76
738
      );
755
      );
739
    print "=" x 76, "\n";
756
    print "=" x 76, "\n";
740
    my $i = 0;
757
    my $i = 0;
741
    foreach my $poster (
758
    foreach my $poster (
742
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
759
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
743
      keys %data
760
      keys %data
744
      )
761
      )
745
    {
762
    {
746
      next if $data{$poster}{quoted} == 0;
763
      next if $data{$poster}{quoted} == 0;
747
      next if $data{$poster}{count} < 5;
764
      next if $data{$poster}{count} < 5;
748
      my $name = substr( $poster, 0, 63 );
765
      my $name = substr( $poster, 0, 63 );
749
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
766
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
750
        $data{$poster}{percent};
767
        $data{$poster}{percent};
751
      last if ( ++$i == $count );
768
      last if ( ++$i == $count );
752
    }
769
    }
753
    print "\n", "=" x 76, "\n";
770
    print "\n", "=" x 76, "\n";
754
  }
771
  }
755
772
756
  ########################################
773
  ########################################
757
  ## Show bottom posters for original text
774
  ## Show bottom posters for original text
758
  ########################################
775
  ########################################
759
  unless ( $skipSec{4} )
776
  unless ( $skipSec{4} )
760
  {
777
  {
761
    if ( keys %data < $topposters )
778
    if ( keys %data < $topposters )
762
    {
779
    {
763
      $count = keys %data;
780
      $count = keys %data;
764
    }
781
    }
765
    else
782
    else
766
    {
783
    {
767
      $count = $topposters;
784
      $count = $topposters;
768
    }
785
    }
769
    printf "%s\n",
786
    printf "%s\n",
770
      centred(
787
      centred(
771
      __x(
788
      __x(
772
        "Bottom {count} responders by original text  (> 5 posts)",
789
        "Bottom {count} responders by original text  (> 5 posts)",
773
        count => $topposters
790
        count => $topposters
774
      ),
791
      ),
775
      76
792
      76
776
      );
793
      );
777
    print "=" x 76, "\n";
794
    print "=" x 76, "\n";
778
    my $i = 0;
795
    my $i = 0;
779
    foreach my $poster (
796
    foreach my $poster (
780
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
797
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
781
      keys %data
798
      keys %data
782
      )
799
      )
783
    {
800
    {
784
      next if $data{$poster}{quoted} == 0;
801
      next if $data{$poster}{quoted} == 0;
785
      next if $data{$poster}{count} < 5;
802
      next if $data{$poster}{count} < 5;
786
      my $name = substr( $poster, 0, 63 );
803
      my $name = substr( $poster, 0, 63 );
787
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
804
      printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
788
        $data{$poster}{percent};
805
        $data{$poster}{percent};
789
      last if ( ++$i == $count );
806
      last if ( ++$i == $count );
790
    }
807
    }
791
    print "\n", "=" x 76, "\n";
808
    print "\n", "=" x 76, "\n";
792
  }
809
  }
793
810
794
  #####################################
811
  #####################################
795
  ## Show threads by number of articles
812
  ## Show threads by number of articles
796
  #####################################
813
  #####################################
797
  unless ( $skipSec{5} )
814
  unless ( $skipSec{5} )
798
  {
815
  {
799
    if ( keys %threads < $topthreads )
816
    if ( keys %threads < $topthreads )
800
    {
817
    {
801
      $count = keys %threads;
818
      $count = keys %threads;
802
    }
819
    }
803
    else
820
    else
804
    {
821
    {
805
      $count = $topthreads;
822
      $count = $topthreads;
806
    }
823
    }
807
    printf "%s\n",
824
    printf "%s\n",
808
      centred(
825
      centred(
809
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
826
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
810
      76 );
827
      76 );
811
    print "=" x 76, "\n";
828
    print "=" x 76, "\n";
812
    my $i = 0;
829
    my $i = 0;
813
    foreach my $thread (
830
    foreach my $thread (
814
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
831
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
815
      keys %threads
832
      keys %threads
816
      )
833
      )
817
    {
834
    {
818
      my $name = substr( $thread, 0, 65 );
835
      my $name = substr( $thread, 0, 65 );
819
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
836
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
820
        $threads{$thread}{'count'};
837
        $threads{$thread}{'count'};
821
      last if ( ++$i == $count );
838
      last if ( ++$i == $count );
822
    }
839
    }
823
    print "\n", "=" x 76, "\n";
840
    print "\n", "=" x 76, "\n";
824
  }
841
  }
825
842
826
  ##############################
843
  ##############################
827
  ## Show threads by size in KiB
844
  ## Show threads by size in KiB
828
  ##############################
845
  ##############################
829
  unless ( $skipSec{6} )
846
  unless ( $skipSec{6} )
830
  {
847
  {
831
    if ( keys %threads < $topthreads )
848
    if ( keys %threads < $topthreads )
832
    {
849
    {
833
      $count = keys %threads;
850
      $count = keys %threads;
834
    }
851
    }
835
    else
852
    else
836
    {
853
    {
837
      $count = $topthreads;
854
      $count = $topthreads;
838
    }
855
    }
839
    printf "%s\n",
856
    printf "%s\n",
840
      centred(
857
      centred(
841
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
858
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
842
    print "=" x 76, "\n";
859
    print "=" x 76, "\n";
843
    my $i = 0;
860
    my $i = 0;
844
    foreach my $thread (
861
    foreach my $thread (
845
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
862
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
846
      keys %threads
863
      keys %threads
847
      )
864
      )
848
    {
865
    {
849
      my $name = substr( $thread, 0, 65 );
866
      my $name = substr( $thread, 0, 65 );
850
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
867
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
851
        $threads{$thread}{'size'} / 1024;    #/
868
        $threads{$thread}{'size'} / 1024;    #/
852
      last if ( ++$i == $count );
869
      last if ( ++$i == $count );
853
    }
870
    }
854
    print "\n", "=" x 76, "\n";
871
    print "\n", "=" x 76, "\n";
855
  }
872
  }
856
873
857
  ##################################
874
  ##################################
858
  ## Show top 10 cross-posted groups
875
  ## Show top 10 cross-posted groups
859
  ##################################
876
  ##################################
860
  unless ( $skipSec{7} )
877
  unless ( $skipSec{7} )
861
  {
878
  {
862
    delete $crossposts{"$newsgroup_name"};    # don't include ours
879
    delete $crossposts{"$newsgroup_name"};    # don't include ours
863
    if ( keys %crossposts < $topcrossposts )
880
    if ( keys %crossposts < $topcrossposts )
864
    {
881
    {
865
      $count = keys %crossposts;
882
      $count = keys %crossposts;
866
    }
883
    }
867
    else
884
    else
868
    {
885
    {
869
      $count = $topcrossposts;
886
      $count = $topcrossposts;
870
    }
887
    }
871
    printf "%s\n",
888
    printf "%s\n",
872
      centred(
889
      centred(
873
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
890
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
874
    print "=" x 76, "\n";
891
    print "=" x 76, "\n";
875
    my $i = 0;
892
    my $i = 0;
876
    foreach
893
    foreach
877
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
894
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
878
    {
895
    {
879
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
896
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
880
        $crossposts{$name};
897
        $crossposts{$name};
881
      last if ( ++$i == $count );
898
      last if ( ++$i == $count );
882
    }
899
    }
883
    print "\n", "=" x 76, "\n";
900
    print "\n", "=" x 76, "\n";
884
  }
901
  }
885
902
886
  #########################
903
  #########################
887
  ## Show agents and counts
904
  ## Show agents and counts
888
  #########################
905
  #########################
889
  unless ( $skipSec{8} )
906
  unless ( $skipSec{8} )
890
  {
907
  {
891
    if ( keys %agents < $topagents )
908
    if ( keys %agents < $topagents )
892
    {
909
    {
893
      $count = keys %agents;
910
      $count = keys %agents;
894
    }
911
    }
895
    else
912
    else
896
    {
913
    {
897
      $count = $topagents;
914
      $count = $topagents;
898
    }
915
    }
899
    printf "%s\n",
916
    printf "%s\n",
900
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
917
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
901
      76 );
918
      76 );
902
    print "=" x 76, "\n";
919
    print "=" x 76, "\n";
903
    my $i = 0;
920
    my $i = 0;
904
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
921
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
905
    {
922
    {
906
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
923
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
907
        $agents{$agent};
924
        $agents{$agent};
908
      last if ( ++$i == $count );
925
      last if ( ++$i == $count );
909
    }
926
    }
910
    print "\n", "=" x 76, "\n";
927
    print "\n", "=" x 76, "\n";
911
  }
928
  }
912
929
913
  #######################
930
  #######################
914
  ## Show distinct agents
931
  ## Show distinct agents
915
  #######################
932
  #######################
916
  unless ( $skipSec{9} )
933
  unless ( $skipSec{9} )
917
  {
934
  {
918
    if ( keys %distinct_agent < $topagents )
935
    if ( keys %distinct_agent < $topagents )
919
    {
936
    {
920
      $count = keys %distinct_agent;
937
      $count = keys %distinct_agent;
921
    }
938
    }
922
    else
939
    else
923
    {
940
    {
924
      $count = $topagents;
941
      $count = $topagents;
925
    }
942
    }
926
    printf "%s\n",
943
    printf "%s\n",
927
      centred(
944
      centred(
928
      __x( "Top {count} user agents by number of posts", count => $topagents ),
945
      __x( "Top {count} user agents by number of posts", count => $topagents ),
929
      76
946
      76
930
      );
947
      );
931
    print "=" x 76, "\n";
948
    print "=" x 76, "\n";
932
    my $i = 0;
949
    my $i = 0;
933
    foreach my $agent (
950
    foreach my $agent (
934
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
951
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
935
      keys %distinct_agent
952
      keys %distinct_agent
936
      )
953
      )
937
    {
954
    {
938
      printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
955
      printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
939
        $distinct_agent{$agent},
956
        $distinct_agent{$agent},
940
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
957
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
941
      last if ( ++$i == $count );
958
      last if ( ++$i == $count );
942
    }
959
    }
943
    print "\n", "=" x 76, "\n";
960
    print "\n", "=" x 76, "\n";
944
  }
961
  }
945
962
946
  ############################
963
  ############################
947
  ## Show timezones and counts
964
  ## Show timezones and counts
948
  ############################
965
  ############################
949
  unless ( $skipSec{10} )
966
  unless ( $skipSec{10} )
950
  {
967
  {
951
    if ( keys %tz < $toptz )
968
    if ( keys %tz < $toptz )
952
    {
969
    {
953
      $count = keys %tz;
970
      $count = keys %tz;
954
    }
971
    }
955
    else
972
    else
956
    {
973
    {
957
      $count = $toptz;
974
      $count = $toptz;
958
    }
975
    }
959
    printf "%s\n",
976
    printf "%s\n",
960
      centred( __x( "Top {count} time zones", count => $toptz ), 76 );
977
      centred( __x( "Top {count} time zones", count => $toptz ), 76 );
961
    print "=" x 76, "\n";
978
    print "=" x 76, "\n";
962
    my $i = 0;
979
    my $i = 0;
963
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
980
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
964
    {
981
    {
965
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
982
      printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
966
      last if ( ++$i == $count );
983
      last if ( ++$i == $count );
967
    }
984
    }
968
    print "\n", "=" x 76, "\n";
985
    print "\n", "=" x 76, "\n";
969
  }
986
  }
970
}
987
}
971
988
972
## helper subs
989
## helper subs
973
990
974
###############################
991
###############################
975
## Right pad a string with '.'s
992
## Right pad a string with '.'s
976
###############################
993
###############################
977
sub rpad
994
sub rpad
978
{
995
{
979
  ## Get text to pad, length to pad, pad chr
996
  ## Get text to pad, length to pad, pad chr
980
  my ( $text, $pad_len, $pad_chr ) = @_;
997
  my ( $text, $pad_len, $pad_chr ) = @_;
981
998
982
  ## DEBUG
999
  ## DEBUG
983
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
1000
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
984
1001
985
  if ( length($text) > $pad_len )
1002
  if ( length($text) > $pad_len )
986
  {
1003
  {
987
    $text = substr( $text, 0, $pad_len );
1004
    $text = substr( $text, 0, $pad_len );
988
  }
1005
  }
989
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
1006
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
990
  return $padded;
1007
  return $padded;
991
}
1008
}
992
1009
993
##################
1010
##################
994
## Centre a string
1011
## Centre a string
995
##################
1012
##################
996
sub centred
1013
sub centred
997
{
1014
{
998
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
1015
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
999
  my $pad_len = ( $width - length($text) ) / 2;    #/
1016
  my $pad_len = ( $width - length($text) ) / 2;    #/
1000
  my $centred = " " x $pad_len . $text;
1017
  my $centred = " " x $pad_len . $text;
1001
  return $centred;
1018
  return $centred;
1002
}
1019
}
1003
1020
1004
###########################
1021
###########################
1005
## Put commas into a number
1022
## Put commas into a number
1006
###########################
1023
###########################
1007
sub commify
1024
sub commify
1008
{
1025
{
1009
  local $_ = shift;
1026
  local $_ = shift;
1010
  my $number = $_;
1027
  my $number = $_;
1011
  $_ = int;                                        # Chop non-integer part
1028
  $_ = int;                                        # Chop non-integer part
1012
  1 while
1029
  1 while
1013
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1030
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1014
  my $int_part  = $_;
1031
  my $int_part  = $_;
1015
  my $real_part = '';
1032
  my $real_part = '';
1016
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1033
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1017
  {
1034
  {
1018
    $real_part = $1;
1035
    $real_part = $1;
1019
  }
1036
  }
1020
  return $int_part . $real_part;
1037
  return $int_part . $real_part;
1021
}
1038
}
1022
1039
1023
################################################################
1040
################################################################
1024
## Returns a string with leading and trailing whitespace removed
1041
## Returns a string with leading and trailing whitespace removed
1025
################################################################
1042
################################################################
1026
sub clean
1043
sub clean
1027
{
1044
{
1028
  my $dirty = shift;
1045
  my $dirty = shift;
1029
  my $clean = $dirty;
1046
  my $clean = $dirty;
1030
  $clean =~ s/^\s+|\s+$//g;
1047
  $clean =~ s/^\s+|\s+$//g;
1031
1048
1032
  return $clean;
1049
  return $clean;
1033
}
1050
}
1034
1051
1035
sub usage
1052
sub usage
1036
{
1053
{
1037
  println( __ "usage: newsstat.pl NEWS.GROUP" );
1054
  println( __ "usage: newsstat.pl NEWS.GROUP" );
1038
  exit 1;
1055
  exit 1;
1039
}
1056
}
1040
1057
1041
sub dmsg
1058
sub dmsg
1042
{
1059
{
1043
  print STDERR @_, "\n";
1060
  print STDERR @_, "\n";
1044
}
1061
}
1045
1062
1046
sub dmsg2
1063
sub dmsg2
1047
{
1064
{
1048
  my ( $level, @msg ) = @_;
1065
  my ( $level, @msg ) = @_;
1049
  print STDERR @msg, "\n" if $level >= DEBUG;
1066
  print STDERR @msg, "\n" if $level >= DEBUG;
1050
}
1067
}
1051
1068
1052
sub println
1069
sub println
1053
{
1070
{
1054
  print @_, "\n";
1071
  print @_, "\n";
1055
}
1072
}
1056
 
1073