Subversion Repositories LCARS

Rev

Rev 30 | Rev 33 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 30 Rev 31
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
4
require 5.004;
4
require 5.004;
5
5
6
#use diagnostics;
6
#use diagnostics;
7
use utf8;
7
use utf8;
8
8
9
## NOTE:
9
## NOTE:
10
## Enable and remove binmode when utf8::all has actually become lexically scoped
10
## Enable and remove binmode when utf8::all has actually become lexically scoped
11
# use utf8:all;
11
# use utf8:all;
12
12
13
use constant DEBUG => 0;
13
use constant DEBUG => 0;
14
14
15
## newsstat.pl
15
## newsstat.pl
16
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
16
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
17
## Based on work by Garry Knight et al.
17
## Based on work by Garry Knight et al.
18
## 
18
##
19
## This program is free software: you can redistribute it and/or modify
19
## This program is free software: you can redistribute it and/or modify
20
## it under the terms of the GNU General Public License as published by
20
## it under the terms of the GNU General Public License as published by
21
## the Free Software Foundation, either version 3 of the License, or
21
## the Free Software Foundation, either version 3 of the License, or
22
## (at your option) any later version.
22
## (at your option) any later version.
23
## 
23
##
24
## This program is distributed in the hope that it will be useful,
24
## This program is distributed in the hope that it will be useful,
25
## but WITHOUT ANY WARRANTY; without even the implied warranty of
25
## but WITHOUT ANY WARRANTY; without even the implied warranty of
26
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27
## GNU General Public License for more details.
27
## GNU General Public License for more details.
28
## 
28
##
29
## You should have received a copy of the GNU General Public License
29
## You should have received a copy of the GNU General Public License
30
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
30
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
31
31
32
## Print out all text to STDOUT UTF-8 encoded
32
## Print out all text to STDOUT UTF-8 encoded
33
binmode STDOUT, ':encoding(UTF-8)';
33
binmode STDOUT, ':encoding(UTF-8)';
34
binmode STDERR, ':encoding(UTF-8)';
34
binmode STDERR, ':encoding(UTF-8)';
35
35
36
## L10n
36
## L10n
37
use locale ':not_characters';
37
use locale ':not_characters';
38
38
39
# setlocale( LC_MESSAGES, '' );
39
# setlocale( LC_MESSAGES, '' );
40
require Number::Format;
40
require Number::Format;
41
41
42
## i18n
42
## i18n
43
## FIXME: Automatically include resolved '.' in @INC
43
## FIXME: Automatically include resolved '.' in @INC
44
# print join "\n", @INC;
44
# print join "\n", @INC;
45
45
46
use Locale::TextDomain ('de.pointedears.newsstat');
46
use Locale::TextDomain ('de.pointedears.newsstat');
47
use POSIX              ('locale_h');
47
use POSIX              ('locale_h');
48
use Locale::Messages qw (bind_textdomain_filter
48
use Locale::Messages qw (bind_textdomain_filter
49
  bind_textdomain_codeset
49
  bind_textdomain_codeset
50
  turn_utf_8_on);
50
  turn_utf_8_on);
51
51
52
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
52
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
53
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
53
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
54
54
55
require Mail::Message;
55
require Mail::Message;
56
require DateTime;
56
require DateTime;
57
require DateTime::Format::Mail;
57
require DateTime::Format::Mail;
58
58
59
# See comments in previous example
59
# See comments in previous example
60
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
60
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
61
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
61
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
62
  'decimal_point' };
62
  'decimal_point' };
63
63
64
# Apply defaults if values are missing
64
# Apply defaults if values are missing
65
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
65
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
66
$thousands_sep = ' ' unless $thousands_sep;
66
$thousands_sep = ' ' unless $thousands_sep;
67
67
68
# grouping and mon_grouping are packed lists
68
# grouping and mon_grouping are packed lists
69
# of small integers (characters) telling the
69
# of small integers (characters) telling the
70
# grouping (thousand_seps and mon_thousand_seps
70
# grouping (thousand_seps and mon_thousand_seps
71
# being the group dividers) of numbers and
71
# being the group dividers) of numbers and
72
# monetary quantities.  The integers' meanings:
72
# monetary quantities.  The integers' meanings:
73
# 255 means no more grouping, 0 means repeat
73
# 255 means no more grouping, 0 means repeat
74
# the previous grouping, 1-254 means use that
74
# the previous grouping, 1-254 means use that
75
# as the current grouping.  Grouping goes from
75
# as the current grouping.  Grouping goes from
76
# right to left (low to high digits).  In the
76
# right to left (low to high digits).  In the
77
# below we cheat slightly by never using anything
77
# below we cheat slightly by never using anything
78
# else than the first grouping (whatever that is).
78
# else than the first grouping (whatever that is).
79
my @grouping;
79
my @grouping;
80
if ($grouping)
80
if ($grouping)
81
{
81
{
82
  @grouping = unpack( "C*", $grouping );
82
  @grouping = unpack( "C*", $grouping );
83
}
83
}
84
else
84
else
85
{
85
{
86
  @grouping = (3);
86
  @grouping = (3);
87
}
87
}
88
88
89
## FIXME: Why don't the defaults work already?
89
## FIXME: Why don't the defaults work already?
90
my $formatter = new Number::Format(
90
my $formatter = new Number::Format(
91
  -decimal_point => $decimal_point,
91
  -decimal_point => $decimal_point,
92
  -thousands_sep => $thousands_sep,
92
  -thousands_sep => $thousands_sep,
93
93
94
  # -grouping      => $grouping[0]
94
  # -grouping      => $grouping[0]
95
);
95
);
96
96
97
###################### USER CONFIGURATIONS ############################
97
###################### USER CONFIGURATIONS ############################
98
98
99
## The name of the group to do stats for
99
## The name of the group to do stats for
100
my $newsgroup_name = $ARGV[0];
100
my $newsgroup_name = $ARGV[0];
101
$newsgroup_name or usage();
101
$newsgroup_name // usage();
102
102
103
## Check for removal flags
103
## Check for removal flags
104
my $ix;
104
my $ix;
105
my $j;
105
my $j;
106
my %skipSec;
106
my %skipSec;
107
my @skiplist;
107
my @skiplist;
108
my $args = @ARGV;
108
my $args = @ARGV;
109
for ( $ix = 1 ; $ix < $args ; $ix++ )
109
for ( $ix = 1 ; $ix < $args ; $ix++ )
110
{
110
{
111
  $j = $ix + 1;
111
  $j = $ix + 1;
112
  if ( $ARGV[$ix] eq "-x" )
112
  if ( $ARGV[$ix] eq "-x" )
113
  {
113
  {
114
    @skiplist = split( ",", $ARGV[$j] );
114
    @skiplist = split( ",", $ARGV[$j] );
115
  }
115
  }
116
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
116
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
117
  {
117
  {
118
    @skiplist = split( ",", $1 );
118
    @skiplist = split( ",", $1 );
119
  }
119
  }
120
}
120
}
121
foreach (@skiplist)
121
foreach (@skiplist)
122
{
122
{
123
  $skipSec{$_} = 1;
123
  $skipSec{$_} = 1;
124
}
124
}
125
125
126
## Leafnode users will want /var/spool/news for this variable.
126
## Leafnode users will want /var/spool/news for this variable.
127
my $news = "/var/spool/news/";
127
my $news = "/var/spool/news/";
128
128
129
## Number of top or bottom posters to show
129
## Number of top or bottom posters to show
130
my $topposters = 20;
130
my $topposters = 20;
131
131
132
## Number of threads we want to know about
132
## Number of threads we want to know about
133
my $topthreads = 20;
133
my $topthreads = 20;
134
134
135
## Number of cross-posted threads to show
135
## Number of cross-posted threads to show
136
my $topcrossposts = 10;
136
my $topcrossposts = 10;
137
137
138
## Number of agents we list
138
## Number of agents we list
139
my $topagents = 10;
139
my $topagents = 10;
140
140
141
## Number of time zones to show
141
## Number of time zones to show
142
my $toptz = 10;
142
my $toptz = 10;
143
143
144
###################### DATA STRUCTURES ######################
144
###################### DATA STRUCTURES ######################
145
my $group = $newsgroup_name;
145
my $group = $newsgroup_name;
146
$group =~ s!\.!/!g;
146
$group =~ s!\.!/!g;
147
my %data;    # name, count, agent, total, orig, quoted
147
my %data;    # name, count, agent, total, orig, quoted
148
my $totsize = 0;    # holds total sizes of all files
148
my $totsize = 0;    # holds total sizes of all files
149
my %crossposts;     # group, count
149
my %crossposts;     # group, count
150
my %threads;        # subject, count
150
my %threads;        # subject, count
151
my $replies   = 0;  # total no. of replies
151
my $replies   = 0;  # total no. of replies
152
my $origposts = 0;  # total no. of original posts
152
my $origposts = 0;  # total no. of original posts
153
my %tz;             # timezones by count
153
my %tz;             # timezones by count
154
my $earliest;       # earliest article we have found
154
my $earliest;       # earliest article we have found
155
my $latest;         # latest article we have found
155
my $latest;         # latest article we have found
156
my $totheader = 0;  # total size of header material
156
my $totheader = 0;  # total size of header material
157
my $totbody   = 0;  # total size of body material
157
my $totbody   = 0;  # total size of body material
158
my $totsig    = 0;  # total size of sig material
158
my $totsig    = 0;  # total size of sig material
159
my $totquoted = 0;  # total size of quoted material
159
my $totquoted = 0;  # total size of quoted material
160
my $totorig   = 0;  # total size of original material
160
my $totorig   = 0;  # total size of original material
161
my $totalposts;     # total no. of posts considered
161
my $totalposts;     # total no. of posts considered
162
my %distinct_agent;
162
my %distinct_agent;
163
163
164
## Used to hold counts of User Agents used
164
## Used to hold counts of User Agents used
165
my %agents = (
165
my %agents = (
166
  "Compuserver"               => 0,
166
  "Compuserver"               => 0,
167
  "Foorum"                    => 0,
167
  "Foorum"                    => 0,
168
  "Forte Agent"               => 0,
168
  "Forte Agent"               => 0,
169
  "Forte Free Agent"          => 0,
169
  "Forte Free Agent"          => 0,
170
  "Gnus"                      => 0,
170
  "Gnus"                      => 0,
171
  "KNode"                     => 0,
171
  "KNode"                     => 0,
172
  "MacSOUP"                   => 0,
172
  "MacSOUP"                   => 0,
173
  "MT-NewsWatcher"            => 0,
173
  "MT-NewsWatcher"            => 0,
174
  "MicroPlanet Gravity"       => 0,
174
  "MicroPlanet Gravity"       => 0,
175
  "Microsoft Outlook Express" => 0,
175
  "Microsoft Outlook Express" => 0,
176
  "Microsoft Windows Mail"    => 0,
176
  "Microsoft Windows Mail"    => 0,
177
  "Mozilla"                   => 0,
177
  "Mozilla"                   => 0,
178
  "News Rover"                => 0,
178
  "News Rover"                => 0,
179
  "NN"                        => 0,
179
  "NN"                        => 0,
180
  "Pan"                       => 0,
180
  "Pan"                       => 0,
181
  "rn"                        => 0,
181
  "rn"                        => 0,
182
  "slrn"                      => 0,
182
  "slrn"                      => 0,
183
  "Sylpheed"                  => 0,
183
  "Sylpheed"                  => 0,
184
  "tin"                       => 0,
184
  "tin"                       => 0,
185
  "VSoup"                     => 0,
185
  "VSoup"                     => 0,
186
  "WebTV"                     => 0,
186
  "WebTV"                     => 0,
187
  "Xnews"                     => 0,
187
  "Xnews"                     => 0,
188
);
188
);
189
189
190
my $datetime_parser = DateTime::Format::Mail->new();
190
my $datetime_parser = DateTime::Format::Mail->new();
191
$datetime_parser->loose();
191
$datetime_parser->loose();
192
192
193
my $today = DateTime->today( time_zone => 'UTC' );
193
my $today = DateTime->today( time_zone => 'UTC' );
194
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
194
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
195
my $start      = int $prev_month->strftime('%s');
195
my $start      = int $prev_month->strftime('%s');
196
my $numdays    = int DateTime->last_day_of_month(
196
my $numdays    = int DateTime->last_day_of_month(
197
  year      => $prev_month->year(),
197
  year      => $prev_month->year(),
198
  month     => $prev_month->month(),
198
  month     => $prev_month->month(),
199
  time_zone => $prev_month->time_zone(),
199
  time_zone => $prev_month->time_zone(),
200
)->day();
200
)->day();
201
my $end = int $today->clone()->set_day(1)->strftime('%s');
201
my $end = int $today->clone()->set_day(1)->strftime('%s');
202
202
203
dmsg( $start, " to ", $end ) if DEBUG;
203
dmsg( $start, " to ", $end ) if DEBUG;
204
204
205
chdir("$news$group")
205
chdir("$news$group")
206
  or die __x(
206
  or die __x(
207
  "Can't cd to {newsgroup}: {error}\n",
207
  "Can't cd to {newsgroup}: {error}\n",
208
  newsgroup => "$news$group",
208
  newsgroup => "$news$group",
209
  error     => $!
209
  error     => $!
210
  );
210
  );
211
opendir( DIR, "." )
211
opendir( DIR, "." )
212
  or die __x(
212
  or die __x(
213
  "Can't open {newsgroup}: {error}\n",
213
  "Can't open {newsgroup}: {error}\n",
214
  newsgroup => "$news$group",
214
  newsgroup => "$news$group",
215
  error     => $!
215
  error     => $!
216
  );
216
  );
217
217
218
while ( defined( my $filename = readdir(DIR) ) )
218
while ( defined( my $filename = readdir(DIR) ) )
219
{
219
{
220
  next unless -f $filename;    # only want real files
220
  next unless -f $filename;    # only want real files
221
  next if ( $filename eq ".overview" );    # real articles only
221
  next if ( $filename eq ".overview" );    # real articles only
222
222
223
  get_article($filename);                  # read in the article
223
  get_article($filename);                  # read in the article
224
}
224
}
225
closedir(DIR);                             # finished with the directory
225
closedir(DIR);                             # finished with the directory
226
226
227
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
227
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
228
228
229
## Post-processing
229
## Post-processing
230
count_agents();                            # count agents, collapsing versions
230
count_agents();                            # count agents, collapsing versions
231
fix_percent();
231
fix_percent();
232
232
233
write_data();
233
write_data();
234
display_results();
234
display_results();
235
235
236
########################################
236
########################################
237
## Get current article's header and body
237
## Get current article's header and body
238
########################################
238
########################################
239
sub get_article
239
sub get_article
240
{
240
{
241
  my $filename = shift;
241
  my $filename = shift;
242
242
243
  open( my $FILE, '<', $filename )
243
  open( my $FILE, '<', $filename )
244
    or
244
    or
245
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
245
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
246
  my $msg       = Mail::Message->read($FILE);
246
  my $msg       = Mail::Message->read($FILE);
247
  my $timestamp = $msg->timestamp();
247
  my $timestamp = $msg->timestamp();
248
  my $date      = $msg->study('Date');
248
  my $date      = $msg->study('Date');
249
249
250
  ## Disregard article if timestamp is not in range
250
  ## Disregard article if timestamp is not in range
251
  dmsg($timestamp) if DEBUG;
251
  dmsg($timestamp) if DEBUG;
252
  if ( $timestamp < $start or $timestamp >= $end )
252
  if ( $timestamp < $start || $timestamp >= $end )
253
  {
253
  {
254
    dmsg("Posting on $date ignored.") if DEBUG;
254
    dmsg("Posting on $date ignored.") if DEBUG;
255
    return;
255
    return;
256
  }
256
  }
257
257
258
  $totalposts++;    # bump count of articles considered
258
  $totalposts++;    # bump count of articles considered
259
259
260
  ## DEBUG
260
  ## DEBUG
261
  dmsg($date) if DEBUG;
261
  dmsg($date) if DEBUG;
262
262
263
  ## get stats about the file itself
263
  ## get stats about the file itself
264
  my $filesize = -s $filename;    # get total size of file
264
  my $filesize = -s $filename;    # get total size of file
265
  $totsize += $filesize;          # bump total sizes of all files
265
  $totsize += $filesize;          # bump total sizes of all files
266
266
267
  if ( ( not defined $earliest ) or $timestamp < $earliest )
267
  if ( ( not defined $earliest ) || $timestamp < $earliest )
268
  {
268
  {
269
    $earliest = $timestamp;
269
    $earliest = $timestamp;
270
  }
270
  }
271
  elsif ( ( not defined $latest ) or $timestamp > $latest )
271
  elsif ( ( not defined $latest ) || $timestamp > $latest )
272
  {
272
  {
273
    $latest = $timestamp;
273
    $latest = $timestamp;
274
  }
274
  }
275
275
276
  #print "timestamp: $timestamp\n";
276
  #print "timestamp: $timestamp\n";
277
277
278
  ## count header size
278
  ## count header size
279
  $totheader += $msg->head()->size();
279
  $totheader += $msg->head()->size();
280
280
281
  ## get the poster's name (MIME-decoded, in UTF-8)
281
  ## get the poster's name (MIME-decoded, in UTF-8)
282
  my $poster = $msg->study('From');
282
  my $poster = $msg->study('From');
283
  if ( defined $poster )
283
  if ( defined $poster )
284
  {
284
  {
285
    ## Convert old to new format
285
    ## Convert old to new format
286
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
286
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
287
287
288
    ## Collapse whitespace
288
    ## Collapse whitespace
289
    $poster =~ s/\s+/ /g;
289
    $poster =~ s/\s+/ /g;
290
290
291
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
291
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
292
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
292
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
293
293
294
    ## DEBUG
294
    ## DEBUG
295
    dmsg($poster) if DEBUG;
295
    dmsg($poster) if DEBUG;
296
296
297
    ## seen this one before?
297
    ## seen this one before?
298
    if ( !defined( $data{$poster} ) )
298
    if ( !defined( $data{$poster} ) )
299
    {
299
    {
300
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
300
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
301
      $data{$poster}{'orig'}   = 0;
301
      $data{$poster}{'orig'}   = 0;
302
      $data{$poster}{'quoted'} = 0;
302
      $data{$poster}{'quoted'} = 0;
303
    }
303
    }
304
    $data{$poster}{'count'}++;                    # bump count for this poster
304
    $data{$poster}{'count'}++;                    # bump count for this poster
305
    $data{$poster}{'size'} += $filesize;          # total size of file
305
    $data{$poster}{'size'} += $filesize;          # total size of file
306
306
307
    ## The User-Agent and/or X-Newsreader fields
307
    ## The User-Agent and/or X-Newsreader fields
308
    ## for User-Agent by poster
308
    ## for User-Agent by poster
309
    my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader');
309
    my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader');
310
    if ( defined $ua )
310
    if ( defined $ua )
311
    {
311
    {
312
      $data{$poster}{'agent'} = $ua;
312
      $data{$poster}{'agent'} = $ua;
313
313
314
      ## DEBUG
314
      ## DEBUG
315
      dmsg($ua) if DEBUG;
315
      dmsg($ua) if DEBUG;
316
    }
316
    }
317
317
318
    ## The User Agent for User-Agent by number of posts
318
    ## The User Agent for User-Agent by number of posts
319
    get_agent($msg);
319
    get_agent($msg);
320
320
321
    ## Get all cross-posted newsgroups
321
    ## Get all cross-posted newsgroups
322
    for ( split( /,/, $msg->study('Newsgroups') ) )
322
    for ( split( /,/, $msg->study('Newsgroups') ) )
323
    {
323
    {
324
      $crossposts{$_}++;    # bump count for each
324
      $crossposts{$_}++;    # bump count for each
325
    }
325
    }
326
326
327
    ## Get threads
327
    ## Get threads
328
    my $thread = $msg->study('Subject');
328
    my $thread = $msg->study('Subject');
329
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
329
    $thread =~ s/^re:\s+//i;    # Remove Re: or re: at start
330
    $thread =~ s/\s+/ /g;       # collapse whitespace
330
    $thread =~ s/\s+/ /g;       # collapse whitespace
331
    $threads{$thread}{'count'}++;    # bump count of this subject
331
    $threads{$thread}{'count'}++;    # bump count of this subject
332
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
332
    $threads{$thread}{'size'} += $filesize;    # bump bytes for this thread
333
333
334
    ## Is this an original post or a reply?
334
    ## Is this an original post or a reply?
335
    if ( defined $msg->study('References') )
335
    if ( defined $msg->study('References') )
336
    {
336
    {
337
      $replies++;
337
      $replies++;
338
    }
338
    }
339
    else
339
    else
340
    {
340
    {
341
      $origposts++;
341
      $origposts++;
342
    }
342
    }
343
343
344
    ## Get the time zone
344
    ## Get the time zone
345
    my $datetime = $datetime_parser->parse_datetime($date);
345
    my $datetime = $datetime_parser->parse_datetime($date);
346
    my $tz       = $datetime->strftime('%z');
346
    my $tz       = $datetime->strftime('%z');
347
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
347
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
348
    $tz{$tz}++;
348
    $tz{$tz}++;
349
349
350
    ## DEBUG
350
    ## DEBUG
351
    dmsg($tz) if DEBUG;
351
    dmsg($tz) if DEBUG;
352
352
353
#### Now analyse the body text ####
353
#### Now analyse the body text ####
354
    my $body = $msg->body();
354
    my $body = $msg->body();
355
355
356
    my $insig = 0;
356
    my $insig = 0;
357
    my @body  = $body->lines;
357
    my @body  = $body->lines;
358
    for (@body)
358
    for (@body)
359
    {
359
    {
360
      $totbody += length($_);    # bump total body size
360
      $totbody += length($_);    # bump total body size
361
      next if (m{^$>}o);         # don't count blank lines in body
361
      next if (m{^$>}o);         # don't count blank lines in body
362
      if ( $insig == 1 )
362
      if ( $insig == 1 )
363
      {
363
      {
364
364
365
        # bump total sig size
365
        # bump total sig size
366
        $totsig += length($_);
366
        $totsig += length($_);
367
      }
367
      }
368
      ## are we in a quote line?
368
      ## are we in a quote line?
369
      ## Bill Unruh uses ] quotes, and another poster uses ::
369
      ## Bill Unruh uses ] quotes, and another poster uses ::
370
      elsif ( m{^\s*[>\]]}o or m{^\s*::}o )
370
      elsif ( m{^\s*[>\]]}o || m{^\s*::}o )
371
      {
371
      {
372
        ## bump count of quoted chrs
372
        ## bump count of quoted chrs
373
        $data{$poster}{'quoted'} += length($_);
373
        $data{$poster}{'quoted'} += length($_);
374
        $totquoted += length($_);
374
        $totquoted += length($_);
375
      }
375
      }
376
      elsif (/^-- $/)
376
      elsif (/^-- $/)
377
      {
377
      {
378
        $insig = 1;
378
        $insig = 1;
379
      }
379
      }
380
      else
380
      else
381
      {
381
      {
382
        ## We must be processing an original line
382
        ## We must be processing an original line
383
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
383
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
384
        $totorig += length($_);
384
        $totorig += length($_);
385
      }
385
      }
386
    }
386
    }
387
387
388
    # end for (@body)
388
    # end for (@body)
389
  }
389
  }
390
390
391
  close($FILE);
391
  close($FILE);
392
}
392
}
393
393
394
sub get_agent
394
sub get_agent
395
{
395
{
396
  my $msg = shift;
396
  my $msg = shift;
397
397
398
  my $ua =
-
 
399
       $msg->study('User-Agent')
-
 
400
    or $msg->study('X-Newsreader')
398
  my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')
401
    or $msg->study('X-Mailer');
399
    // $msg->study('X-Mailer');
-
 
400
402
  if ( not defined $ua )
401
  if ( not defined $ua )
403
  {
402
  {
404
    my $org = $msg->study('Organization');
403
    my $org = $msg->study('Organization');
405
    if ( defined $org
404
    if ( defined $org
406
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
405
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
407
    {
406
    {
408
      $ua = $org;
407
      $ua = $org;
409
    }
408
    }
410
    elsif ( $msg->study('Message-ID') =~ /pine/i )
409
    elsif ( $msg->study('Message-ID') =~ /pine/i )
411
    {
410
    {
412
      $ua = "Pine";
411
      $ua = "Pine";
413
    }
412
    }
414
  }
413
  }
415
414
416
  ## Hopefully found UA, else set to unknown
415
  ## Hopefully found UA, else set to unknown
417
  if ( not defined $ua )
416
  if ( not defined $ua )
418
  {
417
  {
419
    $ua = __ "unknown";
418
    $ua = __ "unknown";
420
  }
419
  }
421
420
422
  $ua = clean($ua);
421
  $ua = clean($ua);
423
422
424
  my $raw   = $ua;
423
  my $raw   = $ua;
425
  my $agent = $raw;
424
  my $agent = $raw;
426
425
427
  ## strip http
426
  ## strip http
428
  if ( $raw =~ /.*http.*/ )
427
  if ( $raw =~ /.*http.*/ )
429
  {
428
  {
430
    $raw =~ s!posted via!!i;
429
    $raw =~ s!posted via!!i;
431
    $raw =~ s!http://!!g;
430
    $raw =~ s!http://!!g;
432
    $raw =~ s!/!!g;
431
    $raw =~ s!/!!g;
433
    $raw =~ s! !!g;
432
    $raw =~ s! !!g;
434
  }
433
  }
435
434
436
  ## Fix Outlook from Mac
435
  ## Fix Outlook from Mac
437
  if ( $raw =~ /^microsoft/i )
436
  if ( $raw =~ /^microsoft/i )
438
  {
437
  {
439
    $raw =~ s/-/ /g;
438
    $raw =~ s/-/ /g;
440
  }
439
  }
441
440
442
  ## Pick out the popular agents
441
  ## Pick out the popular agents
443
  if ( $raw =~ /(outlook express)/i
442
  if ( $raw =~ /(outlook express)/i
444
    || $raw =~ /(windows mail)/i
443
    || $raw =~ /(windows mail)/i
445
    || $raw =~ /(microplanet gravity)/i
444
    || $raw =~ /(microplanet gravity)/i
446
    || $raw =~ /(news rover)/i
445
    || $raw =~ /(news rover)/i
447
    || $raw =~ /(forte agent)/i
446
    || $raw =~ /(forte agent)/i
448
    || $raw =~ /(forte free agent)/i )
447
    || $raw =~ /(forte free agent)/i )
449
  {
448
  {
450
    $agent = $1;
449
    $agent = $1;
451
  }
450
  }
452
  elsif (
451
  elsif (
453
    $raw =~ /^(
452
    $raw =~ /^(
454
        pan
453
        pan
455
       |sylpheed
454
       |sylpheed
456
       |slrn
455
       |slrn
457
       |mozilla
456
       |mozilla
458
       |knode
457
       |knode
459
       |tin
458
       |tin
460
       |hamster
459
       |hamster
461
       |xrn
460
       |xrn
462
       |xnews
461
       |xnews
463
       |aol
462
       |aol
464
       |gnus
463
       |gnus
465
       |krn
464
       |krn
466
       |macsoup
465
       |macsoup
467
       |messenger
466
       |messenger
468
       |openxp
467
       |openxp
469
       |pine
468
       |pine
470
       |thoth
469
       |thoth
471
       |turnpike
470
       |turnpike
472
       |winvn
471
       |winvn
473
       |vsoup
472
       |vsoup
474
       |google
473
       |google
475
       |supernews
474
       |supernews
476
       |nn
475
       |nn
477
       |rn
476
       |rn
478
       |007
477
       |007
479
       |webtv
478
       |webtv
480
       |compuserve
479
       |compuserve
481
       )/ix
480
       )/ix
482
    )
481
    )
483
  {
482
  {
484
    $agent = $1;
483
    $agent = $1;
485
  }
484
  }
486
  else
485
  else
487
  {
486
  {
488
    ## Clean up unknown agents
487
    ## Clean up unknown agents
489
    if ( $raw =~ m!^(.*?)/! )
488
    if ( $raw =~ m!^(.*?)/! )
490
    {
489
    {
491
      $agent = $1;
490
      $agent = $1;
492
    }
491
    }
493
    elsif ( $raw =~ /^(\w*)\d.*/ )
492
    elsif ( $raw =~ /^(\w*)\d.*/ )
494
    {
493
    {
495
      $agent = $1;
494
      $agent = $1;
496
    }
495
    }
497
  }
496
  }
498
497
499
  $distinct_agent{$agent}++;
498
  $distinct_agent{$agent}++;
500
  return $agent;
499
  return $agent;
501
}
500
}
502
## get_agent
501
## get_agent
503
502
504
#########################################
503
#########################################
505
## Count the User-Agents used, collapsing
504
## Count the User-Agents used, collapsing
506
## different versions into one per agent.
505
## different versions into one per agent.
507
#########################################
506
#########################################
508
sub count_agents
507
sub count_agents
509
{
508
{
510
POSTER:
509
POSTER:
511
  foreach my $poster ( keys %data )
510
  foreach my $poster ( keys %data )
512
  {
511
  {
513
    foreach my $agent_name ( keys %distinct_agent )
512
    foreach my $agent_name ( keys %distinct_agent )
514
    {    # check against known ones
513
    {    # check against known ones
515
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
514
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
516
      {
515
      {
517
        $agents{$agent_name}++;
516
        $agents{$agent_name}++;
518
        next POSTER;
517
        next POSTER;
519
      }
518
      }
520
    }
519
    }
521
    $agents{ $data{$poster}{'agent'} }++;
520
    $agents{ $data{$poster}{'agent'} }++;
522
  }
521
  }
523
}    # count_agents
522
}    # count_agents
524
523
525
#############################################
524
#############################################
526
## Set orig/total percentages for all posters
525
## Set orig/total percentages for all posters
527
#############################################
526
#############################################
528
sub fix_percent
527
sub fix_percent
529
{
528
{
530
  foreach my $poster ( keys %data )
529
  foreach my $poster ( keys %data )
531
  {
530
  {
532
    my $percent = 100;
531
    my $percent = 100;
533
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
532
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
534
    {
533
    {
535
      $percent =
534
      $percent =
536
        $data{$poster}{'orig'} * 100 /
535
        $data{$poster}{'orig'} * 100 /
537
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
536
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
538
    }
537
    }
539
    elsif ( $data{$poster}{'orig'} == 0 )
538
    elsif ( $data{$poster}{'orig'} == 0 )
540
    {
539
    {
541
      $percent = 0;
540
      $percent = 0;
542
    }
541
    }
543
    $data{$poster}{'percent'} = $percent;
542
    $data{$poster}{'percent'} = $percent;
544
  }
543
  }
545
}
544
}
546
## fix_percent
545
## fix_percent
547
546
548
##################################
547
##################################
549
## Write data structures to a file
548
## Write data structures to a file
550
##################################
549
##################################
551
sub write_data
550
sub write_data
552
{
551
{
553
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
552
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
554
    or die __x( "Can't create XDATA: {error}\n", error => $! );
553
    or die __x( "Can't create XDATA: {error}\n", error => $! );
555
  print $OUTF "Data collected from $newsgroup_name\n\n";
554
  print $OUTF "Data collected from $newsgroup_name\n\n";
556
  print $OUTF
555
  print $OUTF
557
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
556
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
558
  foreach my $name ( keys %data )
557
  foreach my $name ( keys %data )
559
  {
558
  {
560
    print $OUTF
559
    print $OUTF
561
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
560
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
562
  }
561
  }
563
  print $OUTF
562
  print $OUTF
564
"============================================================================\n";
563
"============================================================================\n";
565
  print $OUTF "Thread subjects\n";
564
  print $OUTF "Thread subjects\n";
566
  print $OUTF
565
  print $OUTF
567
"----------------------------------------------------------------------------\n";
566
"----------------------------------------------------------------------------\n";
568
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
567
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
569
  {
568
  {
570
    print $OUTF
569
    print $OUTF
571
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
570
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
572
  }
571
  }
573
  print $OUTF
572
  print $OUTF
574
"============================================================================\n";
573
"============================================================================\n";
575
  print $OUTF "Cross-posts\n";
574
  print $OUTF "Cross-posts\n";
576
  print $OUTF
575
  print $OUTF
577
"----------------------------------------------------------------------------\n";
576
"----------------------------------------------------------------------------\n";
578
  foreach my $name ( sort keys %crossposts )
577
  foreach my $name ( sort keys %crossposts )
579
  {
578
  {
580
    print $OUTF "$name : $crossposts{$name}\n";
579
    print $OUTF "$name : $crossposts{$name}\n";
581
  }
580
  }
582
  print $OUTF
581
  print $OUTF
583
"============================================================================\n";
582
"============================================================================\n";
584
  print $OUTF "User agents\n";
583
  print $OUTF "User agents\n";
585
  print $OUTF
584
  print $OUTF
586
"----------------------------------------------------------------------------\n";
585
"----------------------------------------------------------------------------\n";
587
  foreach my $name ( sort keys %agents )
586
  foreach my $name ( sort keys %agents )
588
  {
587
  {
589
    print $OUTF "$name : $agents{$name}\n";
588
    print $OUTF "$name : $agents{$name}\n";
590
  }
589
  }
591
  print $OUTF
590
  print $OUTF
592
"============================================================================\n";
591
"============================================================================\n";
593
  print $OUTF "Time zones\n";
592
  print $OUTF "Time zones\n";
594
  print $OUTF
593
  print $OUTF
595
"----------------------------------------------------------------------------\n";
594
"----------------------------------------------------------------------------\n";
596
  foreach my $name ( sort keys %tz )
595
  foreach my $name ( sort keys %tz )
597
  {
596
  {
598
    print $OUTF "$name : $tz{$name}\n";
597
    print $OUTF "$name : $tz{$name}\n";
599
  }
598
  }
600
  close $OUTF;
599
  close $OUTF;
601
}    # write_data
600
}    # write_data
602
601
603
sub display_results
602
sub display_results
604
{
603
{
605
  #################### DISPLAY RESULTS #####################
604
  #################### DISPLAY RESULTS #####################
606
  println( "=" x 76 );
605
  println( "=" x 76 );
607
  printf "%s\n",
606
  printf "%s\n",
608
    centred(
607
    centred(
609
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
608
    __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ),
610
    76 );
609
    76 );
611
  println( "=" x 76 );
610
  println( "=" x 76 );
612
  printf "%s\n",
611
  printf "%s\n",
613
    centred(
612
    centred(
614
    __(
613
    __(
615
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
614
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
616
    ),
615
    ),
617
    76
616
    76
618
    );
617
    );
619
  print "\n\n";
618
  print "\n\n";
620
  printf __"Total posts considered: %s over %d days\n",
619
  printf __"Total posts considered: %s over %d days\n",
621
    $formatter->format_number($totalposts),
620
    $formatter->format_number($totalposts),
622
    $formatter->format_number($numdays);
621
    $formatter->format_number($numdays);
623
  my $time_locale       = setlocale(LC_TIME);
622
  my $time_locale       = setlocale(LC_TIME);
624
  my $earliest_datetime = DateTime->from_epoch(
623
  my $earliest_datetime = DateTime->from_epoch(
625
    epoch     => $earliest,
624
    epoch     => $earliest,
626
    locale    => $time_locale,
625
    locale    => $time_locale,
627
    time_zone => 'UTC',
626
    time_zone => 'UTC',
628
  );
627
  );
629
  my $latest_datetime = DateTime->from_epoch(
628
  my $latest_datetime = DateTime->from_epoch(
630
    epoch     => $latest,
629
    epoch     => $latest,
631
    locale    => $time_locale,
630
    locale    => $time_locale,
632
    time_zone => 'UTC',
631
    time_zone => 'UTC',
633
  );
632
  );
634
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
633
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
635
  printf __"Earliest article: %s\n",
634
  printf __"Earliest article: %s\n",
636
    $earliest_datetime->strftime($datetime_format);
635
    $earliest_datetime->strftime($datetime_format);
637
  printf __"Latest article:   %s\n",
636
  printf __"Latest article:   %s\n",
638
    $latest_datetime->strftime($datetime_format);
637
    $latest_datetime->strftime($datetime_format);
639
  printf __"Original articles: %s; replies: %s\n",
638
  printf __"Original articles: %s; replies: %s\n",
640
    $formatter->format_number($origposts),
639
    $formatter->format_number($origposts),
641
    $formatter->format_number($replies);
640
    $formatter->format_number($replies);
642
  printf __"Total size of posts: %s bytes (%s)" . "\n",
641
  printf __"Total size of posts: %s bytes (%s)" . "\n",
643
    $formatter->format_number($totsize),
642
    $formatter->format_number($totsize),
644
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
643
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
645
  printf __"Average %s articles per day, %s per day, %s bytes per article\n",
644
  printf __"Average %s articles per day, %s per day, %s bytes per article\n",
646
    $formatter->format_number( int( $totalposts / $numdays ) ),
645
    $formatter->format_number( int( $totalposts / $numdays ) ),
647
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
646
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
648
    $formatter->format_number( int( $totsize / $totalposts ) );
647
    $formatter->format_number( int( $totsize / $totalposts ) );
649
648
650
  my $count = keys %data;
649
  my $count = keys %data;
651
  printf __"Total headers: %s; bodies: %s\n",
650
  printf __"Total headers: %s; bodies: %s\n",
652
    $formatter->format_bytes(
651
    $formatter->format_bytes(
653
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
652
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
654
    ),
653
    ),
655
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
654
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
656
  printf __
655
  printf __
657
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
656
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
658
    $formatter->format_bytes(
657
    $formatter->format_bytes(
659
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
658
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
660
    ),
659
    ),
661
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
660
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
662
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) ),
661
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) ),
663
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
662
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
664
  printf __"Total number of posters: %s, average %s per poster\n",
663
  printf __"Total number of posters: %s, average %s per poster\n",
665
    $formatter->format_number($count),
664
    $formatter->format_number($count),
666
    $formatter->format_bytes( $totsize / $count,
665
    $formatter->format_bytes( $totsize / $count,
667
    ( 'precision' => 1, 'mode' => 'iec' ) );
666
    ( 'precision' => 1, 'mode' => 'iec' ) );
668
  $count = keys %threads;
667
  $count = keys %threads;
669
  printf __"Total number of threads: %s, average %s per thread\n",
668
  printf __"Total number of threads: %s, average %s per thread\n",
670
    $formatter->format_number($count),
669
    $formatter->format_number($count),
671
    $formatter->format_bytes( $totsize / $count,
670
    $formatter->format_bytes( $totsize / $count,
672
    ( 'precision' => 1, 'mode' => 'iec' ) );
671
    ( 'precision' => 1, 'mode' => 'iec' ) );
673
  printf __"Total number of user agents: %d\n",
672
  printf __"Total number of user agents: %d\n",
674
    $formatter->format_number( scalar keys %agents );
673
    $formatter->format_number( scalar keys %agents );
675
  print "\n", "=" x 76, "\n"  ;
674
  print "\n", "=" x 76, "\n";
676
  ########################################
675
  ########################################
677
  ## Show posters by article count  Sec 1;
676
  ## Show posters by article count  Sec 1;
678
  ########################################
677
  ########################################
679
  unless ( $skipSec{1} )
678
  unless ( $skipSec{1} )
680
  {
679
  {
681
    if ( keys %data < $topposters )
680
    if ( keys %data < $topposters )
682
    {
681
    {
683
      $count = keys %data;
682
      $count = keys %data;
684
    }
683
    }
685
    else
684
    else
686
    {
685
    {
687
      $count = $topposters;
686
      $count = $topposters;
688
    }
687
    }
689
    printf "%s\n",
688
    printf "%s\n",
690
      centred(
689
      centred(
691
      __x( "Top {count} posters by number of articles", count => $topposters ),
690
      __x( "Top {count} posters by number of articles", count => $topposters ),
692
      76
691
      76
693
      );
692
      );
694
    print "=" x 76, "\n";
693
    print "=" x 76, "\n";
695
    my $i = 0;
694
    my $i = 0;
696
    foreach
695
    foreach
697
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
696
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
698
    {
697
    {
699
      my $name = substr( $poster, 0, 65 );
698
      my $name = substr( $poster, 0, 65 );
700
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
699
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
701
        $data{$poster}{count};
700
        $data{$poster}{count};
702
      last if ( ++$i == $count );
701
      last if ( ++$i == $count );
703
    }
702
    }
704
    print "\n", "=" x 76, "\n";
703
    print "\n", "=" x 76, "\n";
705
  }
704
  }
706
705
707
  ######################################
706
  ######################################
708
  ## Show posters by size in KiB  Sec 2;
707
  ## Show posters by size in KiB  Sec 2;
709
  ######################################
708
  ######################################
710
  unless ( $skipSec{2} )
709
  unless ( $skipSec{2} )
711
  {
710
  {
712
    if ( keys %data < $topposters )
711
    if ( keys %data < $topposters )
713
    {
712
    {
714
      $count = keys %data;
713
      $count = keys %data;
715
    }
714
    }
716
    else
715
    else
717
    {
716
    {
718
      $count = $topposters;
717
      $count = $topposters;
719
    }
718
    }
720
    printf "%s\n",
719
    printf "%s\n",
721
      centred(
720
      centred(
722
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
721
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
723
      76
722
      76
724
      );
723
      );
725
    print "=" x 76, "\n";
724
    print "=" x 76, "\n";
726
    my $i = 0;
725
    my $i = 0;
727
    foreach
726
    foreach
728
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
727
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
729
    {
728
    {
730
      my $name = substr( $poster, 0, 62 );
729
      my $name = substr( $poster, 0, 62 );
731
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
730
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
732
        $data{$poster}{size} / 1024;    #/
731
        $data{$poster}{size} / 1024;    #/
733
      last if ( ++$i == $count );
732
      last if ( ++$i == $count );
734
    }
733
    }
735
    print "\n", "=" x 76, "\n";
734
    print "\n", "=" x 76, "\n";
736
  }
735
  }
737
736
738
  #####################################
737
  #####################################
739
  ## Show top posters for original text
738
  ## Show top posters for original text
740
  #####################################
739
  #####################################
-
 
740
  my $topposters_real = 0;
-
 
741
741
  unless ( $skipSec{3} )
742
  unless ( $skipSec{3} )
742
  {
743
  {
743
    if ( keys %data < $topposters )
744
    if ( keys %data < $topposters )
744
    {
745
    {
745
      $count = keys %data;
746
      $count = keys %data;
746
    }
747
    }
747
    else
748
    else
748
    {
749
    {
749
      $count = $topposters;
750
      $count = $topposters;
750
    }
751
    }
-
 
752
751
    printf "%s\n",
753
    printf "%s\n",
752
      centred(
754
      centred(
753
      __x(
755
      __x(
754
        "Top {count} responders by original text (> 5 posts)",
756
        "Top {count} responders by original text (> 5 posts)",
755
        count => $topposters
757
        count => $topposters
756
      ),
758
      ),
757
      76
759
      76
758
      );
760
      );
759
    print "=" x 76, "\n";
761
    print "=" x 76, "\n";
760
    my $i = 0;
-
 
761
    foreach my $poster (
762
    foreach my $poster (
762
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
763
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
763
      keys %data
764
      keys %data
764
      )
765
      )
765
    {
766
    {
766
      next if $data{$poster}{quoted} == 0;
767
      next if $data{$poster}{quoted} == 0;
767
      next if $data{$poster}{count} < 5;
768
      next if $data{$poster}{count} < 5;
768
      my $name = substr( $poster, 0, 63 );
769
      my $name = substr( $poster, 0, 63 );
769
      printf "%2d. %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
770
      printf "%2d. %-63s : %02.2f%%\n", $topposters_real + 1,
-
 
771
        rpad( $poster, 63, "." ),
770
        $data{$poster}{percent};
772
        $data{$poster}{percent};
771
      last if ( ++$i == $count );
773
      last if ( ++$topposters_real == $count );
772
    }
774
    }
773
    print "\n", "=" x 76, "\n";
775
    print "\n", "=" x 76, "\n";
774
  }
776
  }
775
777
776
  ########################################
778
  ########################################
777
  ## Show bottom posters for original text
779
  ## Show bottom posters for original text
778
  ########################################
780
  ########################################
-
 
781
-
 
782
  $skipSec{4} = ( $topposters_real <= $topposters ) unless defined $skipSec{4};
-
 
783
779
  unless ( $skipSec{4} )
784
  unless ( $skipSec{4} )
780
  {
785
  {
781
    if ( keys %data < $topposters )
786
    if ( keys %data < $topposters )
782
    {
787
    {
783
      $count = keys %data;
788
      $count = keys %data;
784
    }
789
    }
785
    else
790
    else
786
    {
791
    {
787
      $count = $topposters;
792
      $count = $topposters;
788
    }
793
    }
-
 
794
789
    printf "%s\n",
795
    printf "%s\n",
790
      centred(
796
      centred(
791
      __x(
797
      __x(
792
        "Bottom {count} responders by original text  (> 5 posts)",
798
        "Bottom {count} responders by original text  (> 5 posts)",
793
        count => $topposters
799
        count => $topposters
794
      ),
800
      ),
795
      76
801
      76
796
      );
802
      );
797
    print "=" x 76, "\n";
803
    print "=" x 76, "\n";
798
    my $i = 0;
804
    my $i = 0;
799
    foreach my $poster (
805
    foreach my $poster (
800
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
806
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
801
      keys %data
807
      keys %data
802
      )
808
      )
803
    {
809
    {
804
      next if $data{$poster}{quoted} == 0;
810
      next if $data{$poster}{quoted} == 0;
805
      next if $data{$poster}{count} < 5;
811
      next if $data{$poster}{count} < 5;
806
      my $name = substr( $poster, 0, 63 );
812
      my $name = substr( $poster, 0, 63 );
807
      printf "%2d. %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
813
      printf "%2d. %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
808
        $data{$poster}{percent};
814
        $data{$poster}{percent};
809
      last if ( ++$i == $count );
815
      last if ( ++$i == $count );
810
    }
816
    }
811
    print "\n", "=" x 76, "\n";
817
    print "\n", "=" x 76, "\n";
812
  }
818
  }
813
819
814
  #####################################
820
  #####################################
815
  ## Show threads by number of articles
821
  ## Show threads by number of articles
816
  #####################################
822
  #####################################
817
  unless ( $skipSec{5} )
823
  unless ( $skipSec{5} )
818
  {
824
  {
819
    if ( keys %threads < $topthreads )
825
    if ( keys %threads < $topthreads )
820
    {
826
    {
821
      $count = keys %threads;
827
      $count = keys %threads;
822
    }
828
    }
823
    else
829
    else
824
    {
830
    {
825
      $count = $topthreads;
831
      $count = $topthreads;
826
    }
832
    }
827
    printf "%s\n",
833
    printf "%s\n",
828
      centred(
834
      centred(
829
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
835
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
830
      76 );
836
      76 );
831
    print "=" x 76, "\n";
837
    print "=" x 76, "\n";
832
    my $i = 0;
838
    my $i = 0;
833
    foreach my $thread (
839
    foreach my $thread (
834
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
840
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
835
      keys %threads
841
      keys %threads
836
      )
842
      )
837
    {
843
    {
838
      my $name = substr( $thread, 0, 65 );
844
      my $name = substr( $thread, 0, 65 );
839
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
845
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
840
        $threads{$thread}{'count'};
846
        $threads{$thread}{'count'};
841
      last if ( ++$i == $count );
847
      last if ( ++$i == $count );
842
    }
848
    }
843
    print "\n", "=" x 76, "\n";
849
    print "\n", "=" x 76, "\n";
844
  }
850
  }
845
851
846
  ##############################
852
  ##############################
847
  ## Show threads by size in KiB
853
  ## Show threads by size in KiB
848
  ##############################
854
  ##############################
849
  unless ( $skipSec{6} )
855
  unless ( $skipSec{6} )
850
  {
856
  {
851
    if ( keys %threads < $topthreads )
857
    if ( keys %threads < $topthreads )
852
    {
858
    {
853
      $count = keys %threads;
859
      $count = keys %threads;
854
    }
860
    }
855
    else
861
    else
856
    {
862
    {
857
      $count = $topthreads;
863
      $count = $topthreads;
858
    }
864
    }
859
    printf "%s\n",
865
    printf "%s\n",
860
      centred(
866
      centred(
861
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
867
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
862
    print "=" x 76, "\n";
868
    print "=" x 76, "\n";
863
    my $i = 0;
869
    my $i = 0;
864
    foreach my $thread (
870
    foreach my $thread (
865
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
871
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
866
      keys %threads
872
      keys %threads
867
      )
873
      )
868
    {
874
    {
869
      my $name = substr( $thread, 0, 65 );
875
      my $name = substr( $thread, 0, 65 );
870
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
876
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
871
        $threads{$thread}{'size'} / 1024;    #/
877
        $threads{$thread}{'size'} / 1024;    #/
872
      last if ( ++$i == $count );
878
      last if ( ++$i == $count );
873
    }
879
    }
874
    print "\n", "=" x 76, "\n";
880
    print "\n", "=" x 76, "\n";
875
  }
881
  }
876
882
877
  ##################################
883
  ##################################
878
  ## Show top 10 cross-posted groups
884
  ## Show top 10 cross-posted groups
879
  ##################################
885
  ##################################
880
  unless ( $skipSec{7} )
886
  unless ( $skipSec{7} )
881
  {
887
  {
882
    delete $crossposts{"$newsgroup_name"};    # don't include ours
888
    delete $crossposts{"$newsgroup_name"};    # don't include ours
883
    if ( keys %crossposts < $topcrossposts )
889
    if ( keys %crossposts < $topcrossposts )
884
    {
890
    {
885
      $count = keys %crossposts;
891
      $count = keys %crossposts;
886
    }
892
    }
887
    else
893
    else
888
    {
894
    {
889
      $count = $topcrossposts;
895
      $count = $topcrossposts;
890
    }
896
    }
891
    printf "%s\n",
897
    printf "%s\n",
892
      centred(
898
      centred(
893
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
899
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
894
    print "=" x 76, "\n";
900
    print "=" x 76, "\n";
895
    my $i = 0;
901
    my $i = 0;
896
    foreach
902
    foreach
897
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
903
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
898
    {
904
    {
899
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
905
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
900
        $crossposts{$name};
906
        $crossposts{$name};
901
      last if ( ++$i == $count );
907
      last if ( ++$i == $count );
902
    }
908
    }
903
    print "\n", "=" x 76, "\n";
909
    print "\n", "=" x 76, "\n";
904
  }
910
  }
905
911
906
  #########################
912
  #########################
907
  ## Show agents and counts
913
  ## Show agents and counts
908
  #########################
914
  #########################
909
  unless ( $skipSec{8} )
915
  unless ( $skipSec{8} )
910
  {
916
  {
911
    if ( keys %agents < $topagents )
917
    if ( keys %agents < $topagents )
912
    {
918
    {
913
      $count = keys %agents;
919
      $count = keys %agents;
914
    }
920
    }
915
    else
921
    else
916
    {
922
    {
917
      $count = $topagents;
923
      $count = $topagents;
918
    }
924
    }
919
    printf "%s\n",
925
    printf "%s\n",
920
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
926
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
921
      76 );
927
      76 );
922
    print "=" x 76, "\n";
928
    print "=" x 76, "\n";
923
    my $i = 0;
929
    my $i = 0;
924
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
930
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
925
    {
931
    {
926
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
932
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
927
        $agents{$agent};
933
        $agents{$agent};
928
      last if ( ++$i == $count );
934
      last if ( ++$i == $count );
929
    }
935
    }
930
    print "\n", "=" x 76, "\n";
936
    print "\n", "=" x 76, "\n";
931
  }
937
  }
932
938
933
  #######################
939
  #######################
934
  ## Show distinct agents
940
  ## Show distinct agents
935
  #######################
941
  #######################
936
  unless ( $skipSec{9} )
942
  unless ( $skipSec{9} )
937
  {
943
  {
938
    if ( keys %distinct_agent < $topagents )
944
    if ( keys %distinct_agent < $topagents )
939
    {
945
    {
940
      $count = keys %distinct_agent;
946
      $count = keys %distinct_agent;
941
    }
947
    }
942
    else
948
    else
943
    {
949
    {
944
      $count = $topagents;
950
      $count = $topagents;
945
    }
951
    }
946
    printf "%s\n",
952
    printf "%s\n",
947
      centred(
953
      centred(
948
      __x( "Top {count} user agents by number of posts", count => $topagents ),
954
      __x( "Top {count} user agents by number of posts", count => $topagents ),
949
      76
955
      76
950
      );
956
      );
951
    print "=" x 76, "\n";
957
    print "=" x 76, "\n";
952
    my $i = 0;
958
    my $i = 0;
953
    foreach my $agent (
959
    foreach my $agent (
954
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
960
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
955
      keys %distinct_agent
961
      keys %distinct_agent
956
      )
962
      )
957
    {
963
    {
958
      printf "%2d. %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
964
      printf "%2d. %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
959
        $distinct_agent{$agent},
965
        $distinct_agent{$agent},
960
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
966
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
961
      last if ( ++$i == $count );
967
      last if ( ++$i == $count );
962
    }
968
    }
963
    print "\n", "=" x 76, "\n";
969
    print "\n", "=" x 76, "\n";
964
  }
970
  }
965
971
966
  ############################
972
  ############################
967
  ## Show timezones and counts
973
  ## Show timezones and counts
968
  ############################
974
  ############################
969
  unless ( $skipSec{10} )
975
  unless ( $skipSec{10} )
970
  {
976
  {
971
    if ( keys %tz < $toptz )
977
    if ( keys %tz < $toptz )
972
    {
978
    {
973
      $count = keys %tz;
979
      $count = keys %tz;
974
    }
980
    }
975
    else
981
    else
976
    {
982
    {
977
      $count = $toptz;
983
      $count = $toptz;
978
    }
984
    }
979
    printf "%s\n",
985
    printf "%s\n",
980
      centred( __x( "Top {count} time zones", count => $toptz ), 76 );
986
      centred( __x( "Top {count} time zones", count => $toptz ), 76 );
981
    print "=" x 76, "\n";
987
    print "=" x 76, "\n";
982
    my $i = 0;
988
    my $i = 0;
983
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
989
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
984
    {
990
    {
985
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
991
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
986
      last if ( ++$i == $count );
992
      last if ( ++$i == $count );
987
    }
993
    }
988
    print "\n", "=" x 76, "\n";
994
    print "\n", "=" x 76, "\n";
989
  }
995
  }
990
}
996
}
991
997
992
## helper subs
998
## helper subs
993
999
994
###############################
1000
###############################
995
## Right pad a string with '.'s
1001
## Right pad a string with '.'s
996
###############################
1002
###############################
997
sub rpad
1003
sub rpad
998
{
1004
{
999
  ## Get text to pad, length to pad, pad chr
1005
  ## Get text to pad, length to pad, pad chr
1000
  my ( $text, $pad_len, $pad_chr ) = @_;
1006
  my ( $text, $pad_len, $pad_chr ) = @_;
1001
1007
1002
  ## DEBUG
1008
  ## DEBUG
1003
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
1009
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
1004
1010
1005
  if ( length($text) > $pad_len )
1011
  if ( length($text) > $pad_len )
1006
  {
1012
  {
1007
    $text = substr( $text, 0, $pad_len );
1013
    $text = substr( $text, 0, $pad_len );
1008
  }
1014
  }
1009
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
1015
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
1010
  return $padded;
1016
  return $padded;
1011
}
1017
}
1012
1018
1013
##################
1019
##################
1014
## Centre a string
1020
## Centre a string
1015
##################
1021
##################
1016
sub centred
1022
sub centred
1017
{
1023
{
1018
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
1024
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
1019
  my $pad_len = ( $width - length($text) ) / 2;    #/
1025
  my $pad_len = ( $width - length($text) ) / 2;    #/
1020
  my $centred = " " x $pad_len . $text;
1026
  my $centred = " " x $pad_len . $text;
1021
  return $centred;
1027
  return $centred;
1022
}
1028
}
1023
1029
1024
###########################
1030
###########################
1025
## Put commas into a number
1031
## Put commas into a number
1026
###########################
1032
###########################
1027
sub commify
1033
sub commify
1028
{
1034
{
1029
  local $_ = shift;
1035
  local $_ = shift;
1030
  my $number = $_;
1036
  my $number = $_;
1031
  $_ = int;                                        # Chop non-integer part
1037
  $_ = int;                                        # Chop non-integer part
1032
  1 while
1038
  1 while
1033
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1039
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1034
  my $int_part  = $_;
1040
  my $int_part  = $_;
1035
  my $real_part = '';
1041
  my $real_part = '';
1036
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1042
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1037
  {
1043
  {
1038
    $real_part = $1;
1044
    $real_part = $1;
1039
  }
1045
  }
1040
  return $int_part . $real_part;
1046
  return $int_part . $real_part;
1041
}
1047
}
1042
1048
1043
################################################################
1049
################################################################
1044
## Returns a string with leading and trailing whitespace removed
1050
## Returns a string with leading and trailing whitespace removed
1045
################################################################
1051
################################################################
1046
sub clean
1052
sub clean
1047
{
1053
{
1048
  my $dirty = shift;
1054
  my $dirty = shift;
1049
  my $clean = $dirty;
1055
  my $clean = $dirty;
1050
  $clean =~ s/^\s+|\s+$//g;
1056
  $clean =~ s/^\s+|\s+$//g;
1051
1057
1052
  return $clean;
1058
  return $clean;
1053
}
1059
}
1054
1060
1055
sub usage
1061
sub usage
1056
{
1062
{
1057
  println( __ "usage: newsstat.pl NEWS.GROUP" );
1063
  println( __ "usage: newsstat.pl NEWS.GROUP" );
1058
  exit 1;
1064
  exit 1;
1059
}
1065
}
1060
1066
1061
sub dmsg
1067
sub dmsg
1062
{
1068
{
1063
  print STDERR @_, "\n";
1069
  print STDERR @_, "\n";
1064
}
1070
}
1065
1071
1066
sub dmsg2
1072
sub dmsg2
1067
{
1073
{
1068
  my ( $level, @msg ) = @_;
1074
  my ( $level, @msg ) = @_;
1069
  print STDERR @msg, "\n" if $level >= DEBUG;
1075
  print STDERR @msg, "\n" if $level >= DEBUG;
1070
}
1076
}
1071
1077
1072
sub println
1078
sub println
1073
{
1079
{
1074
  print @_, "\n";
1080
  print @_, "\n";
1075
}
1081
}
1076
 
1082