Subversion Repositories LCARS

Rev

Rev 48 | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 48 Rev 49
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
4
require 5.004;
4
require 5.004;
5
5
6
#use diagnostics;
6
#use diagnostics;
7
use utf8;
7
use utf8;
8
8
9
## NOTE:
9
## NOTE:
10
## Enable and remove binmode when utf8::all has actually become lexically scoped
10
## Enable and remove binmode when utf8::all has actually become lexically scoped
11
# use utf8:all;
11
# use utf8:all;
12
12
13
use constant DEBUG => 0;
13
use constant DEBUG => 0;
14
14
15
## newsstat.pl
15
## newsstat.pl
16
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
16
## Copyright (C) 2011, 2012  Thomas Lahn <startrek@PointedEars.de>
17
## Based on work by Garry Knight et al.
17
## Based on work by Garry Knight et al.
18
##
18
##
19
## This program is free software: you can redistribute it and/or modify
19
## This program is free software: you can redistribute it and/or modify
20
## it under the terms of the GNU General Public License as published by
20
## it under the terms of the GNU General Public License as published by
21
## the Free Software Foundation, either version 3 of the License, or
21
## the Free Software Foundation, either version 3 of the License, or
22
## (at your option) any later version.
22
## (at your option) any later version.
23
##
23
##
24
## This program is distributed in the hope that it will be useful,
24
## This program is distributed in the hope that it will be useful,
25
## but WITHOUT ANY WARRANTY; without even the implied warranty of
25
## but WITHOUT ANY WARRANTY; without even the implied warranty of
26
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27
## GNU General Public License for more details.
27
## GNU General Public License for more details.
28
##
28
##
29
## You should have received a copy of the GNU General Public License
29
## You should have received a copy of the GNU General Public License
30
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
30
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
31
31
32
## Print out all text to STDOUT UTF-8 encoded
32
## Print out all text to STDOUT UTF-8 encoded
33
binmode STDOUT, ':encoding(UTF-8)';
33
binmode STDOUT, ':encoding(UTF-8)';
34
binmode STDERR, ':encoding(UTF-8)';
34
binmode STDERR, ':encoding(UTF-8)';
35
35
36
## L10n
36
## L10n
37
use locale ':not_characters';
37
use locale ':not_characters';
38
38
39
# setlocale( LC_MESSAGES, '' );
39
# setlocale( LC_MESSAGES, '' );
40
require Number::Format;
40
require Number::Format;
41
41
42
## i18n
42
## i18n
43
## FIXME: Automatically include resolved '.' in @INC
43
## FIXME: Automatically include resolved '.' in @INC
44
# print join "\n", @INC;
44
# print join "\n", @INC;
45
45
46
use Locale::TextDomain ('de.pointedears.newsstat');
46
use Locale::TextDomain ('de.pointedears.newsstat');
47
use POSIX              ('locale_h');
47
use POSIX              ('locale_h');
48
use Locale::Messages qw (bind_textdomain_filter
48
use Locale::Messages qw (bind_textdomain_filter
49
  bind_textdomain_codeset
49
  bind_textdomain_codeset
50
  turn_utf_8_on);
50
  turn_utf_8_on);
51
51
52
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
52
bind_textdomain_filter 'de.pointedears.newsstat',  \&turn_utf_8_on;
53
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
53
bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8';
54
54
55
require Mail::Message;
55
require Mail::Message;
56
require DateTime;
56
require DateTime;
57
require DateTime::Format::Mail;
57
require DateTime::Format::Mail;
58
58
59
# See comments in previous example
59
# See comments in previous example
60
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
60
my ( $thousands_sep, $mon_thousands_sep, $grouping, $decimal_point ) =
61
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
61
  @{ localeconv() }{ 'thousands_sep', 'mon_thousands_sep', 'grouping',
62
  'decimal_point' };
62
  'decimal_point' };
63
63
64
# Apply defaults if values are missing
64
# Apply defaults if values are missing
65
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
65
$thousands_sep = $mon_thousands_sep unless $thousands_sep;
66
$thousands_sep = ' ' unless $thousands_sep;
66
$thousands_sep = ' ' unless $thousands_sep;
67
67
68
# grouping and mon_grouping are packed lists
68
# grouping and mon_grouping are packed lists
69
# of small integers (characters) telling the
69
# of small integers (characters) telling the
70
# grouping (thousand_seps and mon_thousand_seps
70
# grouping (thousand_seps and mon_thousand_seps
71
# being the group dividers) of numbers and
71
# being the group dividers) of numbers and
72
# monetary quantities.  The integers' meanings:
72
# monetary quantities.  The integers' meanings:
73
# 255 means no more grouping, 0 means repeat
73
# 255 means no more grouping, 0 means repeat
74
# the previous grouping, 1-254 means use that
74
# the previous grouping, 1-254 means use that
75
# as the current grouping.  Grouping goes from
75
# as the current grouping.  Grouping goes from
76
# right to left (low to high digits).  In the
76
# right to left (low to high digits).  In the
77
# below we cheat slightly by never using anything
77
# below we cheat slightly by never using anything
78
# else than the first grouping (whatever that is).
78
# else than the first grouping (whatever that is).
79
my @grouping;
79
my @grouping;
80
if ($grouping)
80
if ($grouping)
81
{
81
{
82
  @grouping = unpack( "C*", $grouping );
82
  @grouping = unpack( "C*", $grouping );
83
}
83
}
84
else
84
else
85
{
85
{
86
  @grouping = (3);
86
  @grouping = (3);
87
}
87
}
88
88
89
## FIXME: Why don't the defaults work already?
89
## FIXME: Why don't the defaults work already?
90
my $formatter = new Number::Format(
90
my $formatter = new Number::Format(
91
  -decimal_point => $decimal_point,
91
  -decimal_point => $decimal_point,
92
  -thousands_sep => $thousands_sep,
92
  -thousands_sep => $thousands_sep,
-
 
93
  -kibi_suffix   => ' KiB',
-
 
94
  -mebi_suffix   => ' MiB',
-
 
95
  -gibi_suffix   => ' GiB',
-
 
96
93
  # -grouping      => $grouping[0]
97
  # -grouping      => $grouping[0]
94
);
98
);
95
99
96
###################### USER CONFIGURATIONS ############################
100
###################### USER CONFIGURATIONS ############################
97
101
98
## The name of the group to do stats for
102
## The name of the group to do stats for
99
my $newsgroup_name = $ARGV[0];
103
my $newsgroup_name = $ARGV[0];
100
$newsgroup_name // usage();
104
$newsgroup_name // usage();
101
105
102
## Check for removal flags
106
## Check for removal flags
103
my $ix;
107
my $ix;
104
my $j;
108
my $j;
105
my %skipSec;
109
my %skipSec;
106
my @skiplist;
110
my @skiplist;
107
my $args = @ARGV;
111
my $args = @ARGV;
108
for ( $ix = 1 ; $ix < $args ; $ix++ )
112
for ( $ix = 1 ; $ix < $args ; $ix++ )
109
{
113
{
110
  $j = $ix + 1;
114
  $j = $ix + 1;
111
  if ( $ARGV[$ix] eq "-x" )
115
  if ( $ARGV[$ix] eq "-x" )
112
  {
116
  {
113
    @skiplist = split( ",", $ARGV[$j] );
117
    @skiplist = split( ",", $ARGV[$j] );
114
  }
118
  }
115
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
119
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
116
  {
120
  {
117
    @skiplist = split( ",", $1 );
121
    @skiplist = split( ",", $1 );
118
  }
122
  }
119
}
123
}
120
foreach (@skiplist)
124
foreach (@skiplist)
121
{
125
{
122
  $skipSec{$_} = 1;
126
  $skipSec{$_} = 1;
123
}
127
}
124
128
125
## Leafnode users will want /var/spool/news for this variable.
129
## Leafnode users will want /var/spool/news for this variable.
126
my $news = "/var/spool/news/";
130
my $news = "/var/spool/news/";
127
131
128
## Number of top or bottom posters to show
132
## Number of top or bottom posters to show
129
my $topposters = 20;
133
my $topposters = 20;
130
134
131
## Number of threads we want to know about
135
## Number of threads we want to know about
132
my $topthreads = 20;
136
my $topthreads = 20;
133
137
134
## Number of cross-posted threads to show
138
## Number of cross-posted threads to show
135
my $topcrossposts = 10;
139
my $topcrossposts = 10;
136
140
137
## Number of agents we list
141
## Number of agents we list
138
my $topagents = 10;
142
my $topagents = 10;
139
143
140
## Number of time zones to show
144
## Number of time zones to show
141
my $toptz = 10;
145
my $toptz = 10;
142
146
143
###################### DATA STRUCTURES ######################
147
###################### DATA STRUCTURES ######################
144
my $group = $newsgroup_name;
148
my $group = $newsgroup_name;
145
$group =~ s!\.!/!g;
149
$group =~ s!\.!/!g;
146
my %data;    # name, count, agent, total, orig, quoted
150
my %data;    # name, count, agent, total, orig, quoted
147
my $totsize = 0;    # holds total sizes of all files
151
my $totsize = 0;    # holds total sizes of all files
148
my %crossposts;     # group, count
152
my %crossposts;     # group, count
149
my %threads;        # subject, count
153
my %threads;        # subject, count
150
my $replies   = 0;  # total no. of replies
154
my $replies   = 0;  # total no. of replies
151
my $origposts = 0;  # total no. of original posts
155
my $origposts = 0;  # total no. of original posts
152
my %tz;             # timezones by count
156
my %tz;             # timezones by count
153
my $earliest;       # earliest article we have found
157
my $earliest;       # earliest article we have found
154
my $latest;         # latest article we have found
158
my $latest;         # latest article we have found
155
my $totheader = 0;  # total size of header material
159
my $totheader = 0;  # total size of header material
156
my $totbody   = 0;  # total size of body material
160
my $totbody   = 0;  # total size of body material
157
my $totsig    = 0;  # total size of sig material
161
my $totsig    = 0;  # total size of sig material
158
my $totquoted = 0;  # total size of quoted material
162
my $totquoted = 0;  # total size of quoted material
159
my $totorig   = 0;  # total size of original material
163
my $totorig   = 0;  # total size of original material
160
my $totalposts;     # total no. of posts considered
164
my $totalposts;     # total no. of posts considered
161
my %distinct_agent;
165
my %distinct_agent;
162
166
163
## Used to hold counts of User Agents used
167
## Used to hold counts of User Agents used
164
my %agents = (
168
my %agents = (
165
  "Compuserver"               => 0,
169
  "Compuserver"               => 0,
166
  "Foorum"                    => 0,
170
  "Foorum"                    => 0,
167
  "Forte Agent"               => 0,
171
  "Forte Agent"               => 0,
168
  "Forte Free Agent"          => 0,
172
  "Forte Free Agent"          => 0,
169
  "Gnus"                      => 0,
173
  "Gnus"                      => 0,
170
  "KNode"                     => 0,
174
  "KNode"                     => 0,
171
  "MacSOUP"                   => 0,
175
  "MacSOUP"                   => 0,
172
  "MT-NewsWatcher"            => 0,
176
  "MT-NewsWatcher"            => 0,
173
  "MicroPlanet Gravity"       => 0,
177
  "MicroPlanet Gravity"       => 0,
174
  "Microsoft Outlook Express" => 0,
178
  "Microsoft Outlook Express" => 0,
175
  "Microsoft Windows Mail"    => 0,
179
  "Microsoft Windows Mail"    => 0,
176
  "Mozilla"                   => 0,
180
  "Mozilla"                   => 0,
177
  "News Rover"                => 0,
181
  "News Rover"                => 0,
178
  "NN"                        => 0,
182
  "NN"                        => 0,
179
  "Pan"                       => 0,
183
  "Pan"                       => 0,
180
  "rn"                        => 0,
184
  "rn"                        => 0,
181
  "slrn"                      => 0,
185
  "slrn"                      => 0,
182
  "Sylpheed"                  => 0,
186
  "Sylpheed"                  => 0,
183
  "tin"                       => 0,
187
  "tin"                       => 0,
184
  "VSoup"                     => 0,
188
  "VSoup"                     => 0,
185
  "WebTV"                     => 0,
189
  "WebTV"                     => 0,
186
  "Xnews"                     => 0,
190
  "Xnews"                     => 0,
187
);
191
);
188
192
189
my $datetime_parser = DateTime::Format::Mail->new();
193
my $datetime_parser = DateTime::Format::Mail->new();
190
$datetime_parser->loose();
194
$datetime_parser->loose();
191
195
192
my $today = DateTime->today( time_zone => 'UTC' );
196
my $today = DateTime->today( time_zone => 'UTC' );
193
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
197
my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1);
194
my $start      = int $prev_month->strftime('%s');
198
my $start      = int $prev_month->strftime('%s');
195
my $numdays    = int DateTime->last_day_of_month(
199
my $numdays    = int DateTime->last_day_of_month(
196
  year      => $prev_month->year(),
200
  year      => $prev_month->year(),
197
  month     => $prev_month->month(),
201
  month     => $prev_month->month(),
198
  time_zone => $prev_month->time_zone(),
202
  time_zone => $prev_month->time_zone(),
199
)->day();
203
)->day();
200
my $end = int $today->clone()->set_day(1)->strftime('%s');
204
my $end = int $today->clone()->set_day(1)->strftime('%s');
201
205
202
dmsg( $start, " to ", $end ) if DEBUG;
206
dmsg( $start, " to ", $end ) if DEBUG;
203
207
204
chdir("$news$group")
208
chdir("$news$group")
205
  or die __x(
209
  or die __x(
206
  "Can't cd to {newsgroup}: {error}\n",
210
  "Can't cd to {newsgroup}: {error}\n",
207
  newsgroup => "$news$group",
211
  newsgroup => "$news$group",
208
  error     => $!
212
  error     => $!
209
  );
213
  );
210
opendir( DIR, "." )
214
opendir( DIR, "." )
211
  or die __x(
215
  or die __x(
212
  "Can't open {newsgroup}: {error}\n",
216
  "Can't open {newsgroup}: {error}\n",
213
  newsgroup => "$news$group",
217
  newsgroup => "$news$group",
214
  error     => $!
218
  error     => $!
215
  );
219
  );
216
220
217
while ( defined( my $filename = readdir(DIR) ) )
221
while ( defined( my $filename = readdir(DIR) ) )
218
{
222
{
219
  next unless -f $filename;    # only want real files
223
  next unless -f $filename;    # only want real files
220
  next if ( $filename eq ".overview" );    # real articles only
224
  next if ( $filename eq ".overview" );    # real articles only
221
225
222
  get_article($filename);                  # read in the article
226
  get_article($filename);                  # read in the article
223
}
227
}
224
closedir(DIR);                             # finished with the directory
228
closedir(DIR);                             # finished with the directory
225
229
226
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
230
dmsg("\nearliest: $earliest\nlatest:   $latest") if DEBUG;
227
231
228
## Post-processing
232
## Post-processing
229
count_agents();                            # count agents, collapsing versions
233
count_agents();                            # count agents, collapsing versions
230
fix_percent();
234
fix_percent();
231
235
232
write_data();
236
write_data();
233
display_results();
237
display_results();
234
238
235
########################################
239
########################################
236
## Get current article's header and body
240
## Get current article's header and body
237
########################################
241
########################################
238
sub get_article
242
sub get_article
239
{
243
{
240
  my $filename = shift;
244
  my $filename = shift;
241
245
242
  open( my $FILE, '<', $filename )
246
  open( my $FILE, '<', $filename )
243
    or
247
    or
244
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
248
    die __x( "Can't open {file}: {error}\n", file => $filename, error => $! );
245
  my $msg       = Mail::Message->read($FILE);
249
  my $msg       = Mail::Message->read($FILE);
246
  my $timestamp = $msg->timestamp();
250
  my $timestamp = $msg->timestamp();
247
  my $date      = $msg->study('Date');
251
  my $date      = $msg->study('Date');
248
252
249
  ## Disregard article if timestamp is not in range
253
  ## Disregard article if timestamp is not in range
250
  dmsg($timestamp) if DEBUG;
254
  dmsg($timestamp) if DEBUG;
251
  if ( $timestamp < $start || $timestamp >= $end )
255
  if ( $timestamp < $start || $timestamp >= $end )
252
  {
256
  {
253
    dmsg("Posting on $date ignored.") if DEBUG;
257
    dmsg("Posting on $date ignored.") if DEBUG;
254
    return;
258
    return;
255
  }
259
  }
256
260
257
  $totalposts++;    # bump count of articles considered
261
  $totalposts++;    # bump count of articles considered
258
262
259
  ## DEBUG
263
  ## DEBUG
260
  dmsg($date) if DEBUG;
264
  dmsg($date) if DEBUG;
261
265
262
  ## get stats about the file itself
266
  ## get stats about the file itself
263
  my $filesize = -s $filename;    # get total size of file
267
  my $filesize = -s $filename;    # get total size of file
264
  $totsize += $filesize;          # bump total sizes of all files
268
  $totsize += $filesize;          # bump total sizes of all files
265
269
266
  if ( ( not defined $earliest ) || $timestamp < $earliest )
270
  if ( ( not defined $earliest ) || $timestamp < $earliest )
267
  {
271
  {
268
    $earliest = $timestamp;
272
    $earliest = $timestamp;
269
  }
273
  }
270
  elsif ( ( not defined $latest ) || $timestamp > $latest )
274
  elsif ( ( not defined $latest ) || $timestamp > $latest )
271
  {
275
  {
272
    $latest = $timestamp;
276
    $latest = $timestamp;
273
  }
277
  }
274
278
275
  #print "timestamp: $timestamp\n";
279
  #print "timestamp: $timestamp\n";
276
280
277
  ## count header size
281
  ## count header size
278
  $totheader += $msg->head()->size();
282
  $totheader += $msg->head()->size();
279
283
280
  ## get the poster's name (MIME-decoded, in UTF-8)
284
  ## get the poster's name (MIME-decoded, in UTF-8)
281
  my $poster = $msg->study('From');
285
  my $poster = $msg->study('From');
282
  if ( defined $poster )
286
  if ( defined $poster )
283
  {
287
  {
284
    ## Convert old to new format
288
    ## Convert old to new format
285
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
289
    $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/;
286
290
287
    ## Collapse whitespace
291
    ## Collapse whitespace
288
    $poster =~ s/\s+/ /g;
292
    $poster =~ s/\s+/ /g;
289
293
290
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
294
    ## Remove outer quotes; TODO: observe RFC 5322 strictly
291
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
295
    $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x;
292
296
293
    ## DEBUG
297
    ## DEBUG
294
    dmsg($poster) if DEBUG;
298
    dmsg($poster) if DEBUG;
295
299
296
    ## seen this one before?
300
    ## seen this one before?
297
    if ( !defined( $data{$poster} ) )
301
    if ( !defined( $data{$poster} ) )
298
    {
302
    {
299
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
303
      $data{$poster}{'agent'}  = __ 'unknown';    # comes after For: field
300
      $data{$poster}{'orig'}   = 0;
304
      $data{$poster}{'orig'}   = 0;
301
      $data{$poster}{'quoted'} = 0;
305
      $data{$poster}{'quoted'} = 0;
302
    }
306
    }
303
    $data{$poster}{'count'}++;                    # bump count for this poster
307
    $data{$poster}{'count'}++;                    # bump count for this poster
304
    $data{$poster}{'size'} += $filesize;          # total size of file
308
    $data{$poster}{'size'} += $filesize;          # total size of file
305
309
306
    ## The User-Agent and/or X-Newsreader fields
310
    ## The User-Agent and/or X-Newsreader fields
307
    ## for User-Agent by poster
311
    ## for User-Agent by poster
308
    my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader');
312
    my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader');
309
    if ( defined $ua )
313
    if ( defined $ua )
310
    {
314
    {
311
      $data{$poster}{'agent'} = $ua;
315
      $data{$poster}{'agent'} = $ua;
312
316
313
      ## DEBUG
317
      ## DEBUG
314
      dmsg($ua) if DEBUG;
318
      dmsg($ua) if DEBUG;
315
    }
319
    }
316
320
317
    ## The User Agent for User-Agent by number of articles
321
    ## The User Agent for User-Agent by number of articles
318
    get_agent($msg);
322
    get_agent($msg);
319
323
320
    ## Get all cross-posted newsgroups
324
    ## Get all cross-posted newsgroups
321
    for ( split( /,/, $msg->study('Newsgroups') ) )
325
    for ( split( /,/, $msg->study('Newsgroups') ) )
322
    {
326
    {
323
      $crossposts{$_}++;    # bump count for each
327
      $crossposts{$_}++;    # bump count for each
324
    }
328
    }
325
329
326
    ## Get threads
330
    ## Get threads
327
    my $thread = $msg->study('Subject');
331
    my $thread = $msg->study('Subject');
328
    $thread =~ s/^\s*re:\s*//i;              # Remove Re: or re: at the start
332
    $thread =~ s/^\s*re:\s*//i;              # Remove Re: or re: at the start
329
    $thread =~ s/\s*\(was:\s*.*\)\s*$//i;    # Remove (was: ...) at the end
333
    $thread =~ s/\s*\(was:\s*.*\)\s*$//i;    # Remove (was: ...) at the end
330
    $thread =~ s/\s+/ /g;                    # collapse whitespace
334
    $thread =~ s/\s+/ /g;                    # collapse whitespace
331
    $threads{$thread}{'count'}++;            # bump count of this subject
335
    $threads{$thread}{'count'}++;            # bump count of this subject
332
    $threads{$thread}{'size'} += $filesize;  # bump bytes for this thread
336
    $threads{$thread}{'size'} += $filesize;  # bump bytes for this thread
333
337
334
    ## Is this an original post or a reply?
338
    ## Is this an original post or a reply?
335
    if ( defined $msg->study('References') )
339
    if ( defined $msg->study('References') )
336
    {
340
    {
337
      $replies++;
341
      $replies++;
338
    }
342
    }
339
    else
343
    else
340
    {
344
    {
341
      $origposts++;
345
      $origposts++;
342
    }
346
    }
343
347
344
    ## Get the time zone
348
    ## Get the time zone
345
    my $datetime = $datetime_parser->parse_datetime($date);
349
    my $datetime = $datetime_parser->parse_datetime($date);
346
    my $tz       = $datetime->strftime('%z');
350
    my $tz       = $datetime->strftime('%z');
347
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
351
    $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o;
348
    $tz{$tz}++;
352
    $tz{$tz}++;
349
353
350
    ## DEBUG
354
    ## DEBUG
351
    dmsg($tz) if DEBUG;
355
    dmsg($tz) if DEBUG;
352
356
353
#### Now analyse the body text ####
357
#### Now analyse the body text ####
354
    my $body = $msg->body();
358
    my $body = $msg->body();
355
359
356
    my $insig = 0;
360
    my $insig = 0;
357
    my @body  = $body->lines;
361
    my @body  = $body->lines;
358
    for (@body)
362
    for (@body)
359
    {
363
    {
360
      $totbody += length($_);    # bump total body size
364
      $totbody += length($_);    # bump total body size
361
      next if (m{^$>}o);         # don't count blank lines in body
365
      next if (m{^$>}o);         # don't count blank lines in body
362
      if ( $insig == 1 )
366
      if ( $insig == 1 )
363
      {
367
      {
364
368
365
        # bump total sig size
369
        # bump total sig size
366
        $totsig += length($_);
370
        $totsig += length($_);
367
      }
371
      }
368
      ## are we in a quote line?
372
      ## are we in a quote line?
369
      ## Bill Unruh uses ] quotes, and another poster uses ::
373
      ## Bill Unruh uses ] quotes, and another poster uses ::
370
      elsif ( m{^\s*[>\]]}o || m{^\s*::}o )
374
      elsif ( m{^\s*[>\]]}o || m{^\s*::}o )
371
      {
375
      {
372
        ## bump count of quoted chrs
376
        ## bump count of quoted chrs
373
        $data{$poster}{'quoted'} += length($_);
377
        $data{$poster}{'quoted'} += length($_);
374
        $totquoted += length($_);
378
        $totquoted += length($_);
375
      }
379
      }
376
      elsif (/^-- $/)
380
      elsif (/^-- $/)
377
      {
381
      {
378
        $insig = 1;
382
        $insig = 1;
379
      }
383
      }
380
      else
384
      else
381
      {
385
      {
382
        ## We must be processing an original line
386
        ## We must be processing an original line
383
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
387
        $data{$poster}{'orig'} += length($_);    # bump count of original chrs
384
        $totorig += length($_);
388
        $totorig += length($_);
385
      }
389
      }
386
    }
390
    }
387
391
388
    # end for (@body)
392
    # end for (@body)
389
  }
393
  }
390
394
391
  close($FILE);
395
  close($FILE);
392
}
396
}
393
397
394
sub get_agent
398
sub get_agent
395
{
399
{
396
  my $msg = shift;
400
  my $msg = shift;
397
401
398
  my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')
402
  my $ua = $msg->study('User-Agent') // $msg->study('X-Newsreader')
399
    // $msg->study('X-Mailer');
403
    // $msg->study('X-Mailer');
400
404
401
  if ( not defined $ua )
405
  if ( not defined $ua )
402
  {
406
  {
403
    my $org = $msg->study('Organization');
407
    my $org = $msg->study('Organization');
404
    if ( defined $org
408
    if ( defined $org
405
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
409
      and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ )
406
    {
410
    {
407
      $ua = $org;
411
      $ua = $org;
408
    }
412
    }
409
    elsif ( $msg->study('Message-ID') =~ /pine/i )
413
    elsif ( $msg->study('Message-ID') =~ /pine/i )
410
    {
414
    {
411
      $ua = "Pine";
415
      $ua = "Pine";
412
    }
416
    }
413
  }
417
  }
414
418
415
  ## Hopefully found UA, else set to unknown
419
  ## Hopefully found UA, else set to unknown
416
  if ( not defined $ua )
420
  if ( not defined $ua )
417
  {
421
  {
418
    $ua = __ "unknown";
422
    $ua = __ "unknown";
419
  }
423
  }
420
424
421
  $ua = clean($ua);
425
  $ua = clean($ua);
422
426
423
  my $raw   = $ua;
427
  my $raw   = $ua;
424
  my $agent = $raw;
428
  my $agent = $raw;
425
429
426
  ## strip http
430
  ## strip http
427
  if ( $raw =~ /.*http.*/ )
431
  if ( $raw =~ /.*http.*/ )
428
  {
432
  {
429
    $raw =~ s!posted via!!i;
433
    $raw =~ s!posted via!!i;
430
    $raw =~ s!http://!!g;
434
    $raw =~ s!http://!!g;
431
    $raw =~ s!/!!g;
435
    $raw =~ s!/!!g;
432
    $raw =~ s! !!g;
436
    $raw =~ s! !!g;
433
  }
437
  }
434
438
435
  ## Fix Outlook from Mac
439
  ## Fix Outlook from Mac
436
  if ( $raw =~ /^microsoft/i )
440
  if ( $raw =~ /^microsoft/i )
437
  {
441
  {
438
    $raw =~ s/-/ /g;
442
    $raw =~ s/-/ /g;
439
  }
443
  }
440
444
441
  ## Pick out the popular agents
445
  ## Pick out the popular agents
442
  if ( $raw =~ /(outlook express)/i
446
  if ( $raw =~ /(outlook express)/i
443
    || $raw =~ /(windows mail)/i
447
    || $raw =~ /(windows mail)/i
444
    || $raw =~ /(microplanet gravity)/i
448
    || $raw =~ /(microplanet gravity)/i
445
    || $raw =~ /(news rover)/i
449
    || $raw =~ /(news rover)/i
446
    || $raw =~ /(forte agent)/i
450
    || $raw =~ /(forte agent)/i
447
    || $raw =~ /(forte free agent)/i )
451
    || $raw =~ /(forte free agent)/i )
448
  {
452
  {
449
    $agent = $1;
453
    $agent = $1;
450
  }
454
  }
451
  elsif (
455
  elsif (
452
    $raw =~ /^(
456
    $raw =~ /^(
453
        pan
457
        pan
454
       |sylpheed
458
       |sylpheed
455
       |slrn
459
       |slrn
456
       |mozilla
460
       |mozilla
457
       |knode
461
       |knode
458
       |tin
462
       |tin
459
       |hamster
463
       |hamster
460
       |xrn
464
       |xrn
461
       |xnews
465
       |xnews
462
       |aol
466
       |aol
463
       |gnus
467
       |gnus
464
       |krn
468
       |krn
465
       |macsoup
469
       |macsoup
466
       |messenger
470
       |messenger
467
       |openxp
471
       |openxp
468
       |pine
472
       |pine
469
       |thoth
473
       |thoth
470
       |turnpike
474
       |turnpike
471
       |winvn
475
       |winvn
472
       |vsoup
476
       |vsoup
473
       |google
477
       |google
474
       |supernews
478
       |supernews
475
       |nn
479
       |nn
476
       |rn
480
       |rn
477
       |007
481
       |007
478
       |webtv
482
       |webtv
479
       |compuserve
483
       |compuserve
480
       )/ix
484
       )/ix
481
    )
485
    )
482
  {
486
  {
483
    $agent = $1;
487
    $agent = $1;
484
  }
488
  }
485
  else
489
  else
486
  {
490
  {
487
    ## Clean up unknown agents
491
    ## Clean up unknown agents
488
    if ( $raw =~ m!^(.*?)/! )
492
    if ( $raw =~ m!^(.*?)/! )
489
    {
493
    {
490
      $agent = $1;
494
      $agent = $1;
491
    }
495
    }
492
    elsif ( $raw =~ /^(\w*)\d.*/ )
496
    elsif ( $raw =~ /^(\w*)\d.*/ )
493
    {
497
    {
494
      $agent = $1;
498
      $agent = $1;
495
    }
499
    }
496
  }
500
  }
497
501
498
  $distinct_agent{$agent}++;
502
  $distinct_agent{$agent}++;
499
  return $agent;
503
  return $agent;
500
}
504
}
501
## get_agent
505
## get_agent
502
506
503
#########################################
507
#########################################
504
## Count the User-Agents used, collapsing
508
## Count the User-Agents used, collapsing
505
## different versions into one per agent.
509
## different versions into one per agent.
506
#########################################
510
#########################################
507
sub count_agents
511
sub count_agents
508
{
512
{
509
POSTER:
513
POSTER:
510
  foreach my $poster ( keys %data )
514
  foreach my $poster ( keys %data )
511
  {
515
  {
512
    foreach my $agent_name ( keys %distinct_agent )
516
    foreach my $agent_name ( keys %distinct_agent )
513
    {    # check against known ones
517
    {    # check against known ones
514
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
518
      if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ )
515
      {
519
      {
516
        $agents{$agent_name}++;
520
        $agents{$agent_name}++;
517
        next POSTER;
521
        next POSTER;
518
      }
522
      }
519
    }
523
    }
520
    $agents{ $data{$poster}{'agent'} }++;
524
    $agents{ $data{$poster}{'agent'} }++;
521
  }
525
  }
522
}    # count_agents
526
}    # count_agents
523
527
524
#############################################
528
#############################################
525
## Set orig/total percentages for all posters
529
## Set orig/total percentages for all posters
526
#############################################
530
#############################################
527
sub fix_percent
531
sub fix_percent
528
{
532
{
529
  foreach my $poster ( keys %data )
533
  foreach my $poster ( keys %data )
530
  {
534
  {
531
    my $percent = 100;
535
    my $percent = 100;
532
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
536
    if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) )
533
    {
537
    {
534
      $percent =
538
      $percent =
535
        $data{$poster}{'orig'} * 100 /
539
        $data{$poster}{'orig'} * 100 /
536
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
540
        ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} );    #/
537
    }
541
    }
538
    elsif ( $data{$poster}{'orig'} == 0 )
542
    elsif ( $data{$poster}{'orig'} == 0 )
539
    {
543
    {
540
      $percent = 0;
544
      $percent = 0;
541
    }
545
    }
542
    $data{$poster}{'percent'} = $percent;
546
    $data{$poster}{'percent'} = $percent;
543
  }
547
  }
544
}
548
}
545
## fix_percent
549
## fix_percent
546
550
547
##################################
551
##################################
548
## Write data structures to a file
552
## Write data structures to a file
549
##################################
553
##################################
550
sub write_data
554
sub write_data
551
{
555
{
552
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
556
  open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" )
553
    or die __x( "Can't create XDATA: {error}\n", error => $! );
557
    or die __x( "Can't create XDATA: {error}\n", error => $! );
554
  print $OUTF "Data collected from $newsgroup_name\n\n";
558
  print $OUTF "Data collected from $newsgroup_name\n\n";
555
  print $OUTF
559
  print $OUTF
556
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
560
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
557
  foreach my $name ( keys %data )
561
  foreach my $name ( keys %data )
558
  {
562
  {
559
    print $OUTF
563
    print $OUTF
560
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
564
"$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n";
561
  }
565
  }
562
  print $OUTF
566
  print $OUTF
563
"============================================================================\n";
567
"============================================================================\n";
564
  print $OUTF "Thread subjects\n";
568
  print $OUTF "Thread subjects\n";
565
  print $OUTF
569
  print $OUTF
566
"----------------------------------------------------------------------------\n";
570
"----------------------------------------------------------------------------\n";
567
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
571
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
568
  {
572
  {
569
    print $OUTF
573
    print $OUTF
570
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
574
      "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n";
571
  }
575
  }
572
  print $OUTF
576
  print $OUTF
573
"============================================================================\n";
577
"============================================================================\n";
574
  print $OUTF "Cross-posts\n";
578
  print $OUTF "Cross-posts\n";
575
  print $OUTF
579
  print $OUTF
576
"----------------------------------------------------------------------------\n";
580
"----------------------------------------------------------------------------\n";
577
  foreach my $name ( sort keys %crossposts )
581
  foreach my $name ( sort keys %crossposts )
578
  {
582
  {
579
    print $OUTF "$name : $crossposts{$name}\n";
583
    print $OUTF "$name : $crossposts{$name}\n";
580
  }
584
  }
581
  print $OUTF
585
  print $OUTF
582
"============================================================================\n";
586
"============================================================================\n";
583
  print $OUTF "User agents\n";
587
  print $OUTF "User agents\n";
584
  print $OUTF
588
  print $OUTF
585
"----------------------------------------------------------------------------\n";
589
"----------------------------------------------------------------------------\n";
586
  foreach my $name ( sort keys %agents )
590
  foreach my $name ( sort keys %agents )
587
  {
591
  {
588
    print $OUTF "$name : $agents{$name}\n";
592
    print $OUTF "$name : $agents{$name}\n";
589
  }
593
  }
590
  print $OUTF
594
  print $OUTF
591
"============================================================================\n";
595
"============================================================================\n";
592
  print $OUTF "Time zones\n";
596
  print $OUTF "Time zones\n";
593
  print $OUTF
597
  print $OUTF
594
"----------------------------------------------------------------------------\n";
598
"----------------------------------------------------------------------------\n";
595
  foreach my $name ( sort keys %tz )
599
  foreach my $name ( sort keys %tz )
596
  {
600
  {
597
    print $OUTF "$name : $tz{$name}\n";
601
    print $OUTF "$name : $tz{$name}\n";
598
  }
602
  }
599
  close $OUTF;
603
  close $OUTF;
600
}    # write_data
604
}    # write_data
601
605
602
sub display_results
606
sub display_results
603
{
607
{
604
  #################### DISPLAY RESULTS #####################
608
  #################### DISPLAY RESULTS #####################
605
  print "=" x 76, "\n";
609
  print "=" x 76, "\n";
606
  printf "%s\n",
610
  printf "%s\n",
607
    centred(
611
    centred(
608
    __x( "Analysis of articles to {newsgroup}", newsgroup => $newsgroup_name ),
612
    __x( "Analysis of articles to {newsgroup}", newsgroup => $newsgroup_name ),
609
    76
613
    76
610
    );
614
    );
611
  print "=" x 76, "\n";
615
  print "=" x 76, "\n";
612
  printf "%s\n",
616
  printf "%s\n",
613
    centred(
617
    centred(
614
    __(
618
    __(
615
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
619
"(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)"
616
    ),
620
    ),
617
    76
621
    76
618
    );
622
    );
619
  print "\n\n";
623
  print "\n";
620
  printf __"Total articles considered: %s over %d days\n",
624
  printf __"Total articles considered:   %s over %d days\n",
621
    $formatter->format_number($totalposts),
625
    $formatter->format_number($totalposts),
622
    $formatter->format_number($numdays);
626
    $formatter->format_number($numdays);
623
  my $time_locale       = setlocale(LC_TIME);
627
  my $time_locale       = setlocale(LC_TIME);
624
  my $earliest_datetime = DateTime->from_epoch(
628
  my $earliest_datetime = DateTime->from_epoch(
625
    epoch     => $earliest,
629
    epoch     => $earliest,
626
    locale    => $time_locale,
630
    locale    => $time_locale,
627
    time_zone => 'UTC',
631
    time_zone => 'UTC',
628
  );
632
  );
629
  my $latest_datetime = DateTime->from_epoch(
633
  my $latest_datetime = DateTime->from_epoch(
630
    epoch     => $latest,
634
    epoch     => $latest,
631
    locale    => $time_locale,
635
    locale    => $time_locale,
632
    time_zone => 'UTC',
636
    time_zone => 'UTC',
633
  );
637
  );
634
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
638
  my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z';
635
  printf __"Earliest article: %s\n",
639
  printf __"Earliest article:            %s" . "\n",
636
    $earliest_datetime->strftime($datetime_format);
640
    $earliest_datetime->strftime($datetime_format);
637
  printf __"Latest article:   %s\n",
641
  printf __"Latest article:              %s" . "\n",
638
    $latest_datetime->strftime($datetime_format);
642
    $latest_datetime->strftime($datetime_format);
639
  printf __"Original articles: %s; replies: %s\n",
643
  printf __"Original articles:           %s; replies: %s" . "\n",
640
    $formatter->format_number($origposts),
644
    $formatter->format_number($origposts),
641
    $formatter->format_number($replies);
645
    $formatter->format_number($replies);
642
  printf __"Total size of articles: %s bytes (%s)" . "\n",
646
  printf __"Total size of articles:      %s bytes (%s)" . "\n",
643
    $formatter->format_number($totsize),
647
    $formatter->format_number($totsize),
644
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
648
    $formatter->format_bytes( $totsize, ( 'precision' => 1, 'mode' => 'iec' ) );
645
  printf __"Average %s articles per day, %s per day, %s bytes per article\n",
649
  printf __"Average %s articles per day, %s per day, %s bytes per article.\n",
646
    $formatter->format_number( int( $totalposts / $numdays ) ),
650
    $formatter->format_number( int( $totalposts / $numdays ) ),
647
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
651
    $formatter->format_bytes( $totsize / $numdays, ( 'mode' => 'iec' ) ),
648
    $formatter->format_number( int( $totsize / $totalposts ) );
652
    $formatter->format_number( int( $totsize / $totalposts ) );
649
653
650
  my $count = keys %data;
654
  my $count = keys %data;
-
 
655
  print "\n";
651
  printf __"Total headers: %s; bodies: %s\n",
656
  printf __"Total headers:      %s; bodies: %s\n",
652
    $formatter->format_bytes(
657
    $formatter->format_bytes(
653
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
658
    $totheader, ( 'precision' => 1, 'mode' => 'iec' )
654
    ),
659
    ),
655
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
660
    $formatter->format_bytes( $totbody, ( 'precision' => 1, 'mode' => 'iec' ) );
656
  printf __
661
  printf __
657
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
662
    "Body text - quoted: %s; original: %s = %s%%; sigs: %s\n",
658
    $formatter->format_bytes(
663
    $formatter->format_bytes(
659
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
664
    $totquoted, ( 'precision' => 1, 'mode' => 'iec' )
660
    ),
665
    ),
661
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
666
    $formatter->format_bytes( $totorig, ( 'precision' => 1, 'mode' => 'iec' ) ),
662
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) ),
667
    $formatter->format_number( ( $totorig * 100 ) / ( $totorig + $totquoted ) )
-
 
668
    ,
663
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
669
    $formatter->format_bytes( $totsig, ( 'precision' => 1, 'mode' => 'iec' ) );
-
 
670
  print "\n";
664
  printf __"Total number of posters: %s, average %s per poster\n",
671
  printf __"Total number of posters:     %s, average %s per poster\n",
665
    $formatter->format_number($count),
672
    $formatter->format_number($count),
666
    $formatter->format_bytes( $totsize / $count,
673
    $formatter->format_bytes( $totsize / $count,
667
    ( 'precision' => 1, 'mode' => 'iec' ) );
674
    ( 'precision' => 1, 'mode' => 'iec' ) );
668
  $count = keys %threads;
675
  $count = keys %threads;
669
  printf __"Total number of threads: %s, average %s per thread\n",
676
  printf __"Total number of threads:     %s, average %s per thread\n",
670
    $formatter->format_number($count),
677
    $formatter->format_number($count),
671
    $formatter->format_bytes( $totsize / $count,
678
    $formatter->format_bytes( $totsize / $count,
672
    ( 'precision' => 1, 'mode' => 'iec' ) );
679
    ( 'precision' => 1, 'mode' => 'iec' ) );
673
  printf __"Total number of user agents: %d\n",
680
  printf __"Total number of user agents: %d\n",
674
    $formatter->format_number( scalar keys %agents );
681
    $formatter->format_number( scalar keys %agents );
675
  print "\n", "=" x 76, "\n";
682
  print "\n", "=" x 76, "\n";
676
  ########################################
683
  ########################################
677
  ## Show posters by article count  Sec 1;
684
  ## Show posters by article count  Sec 1;
678
  ########################################
685
  ########################################
679
  unless ( $skipSec{1} )
686
  unless ( $skipSec{1} )
680
  {
687
  {
681
    if ( keys %data < $topposters )
688
    if ( keys %data < $topposters )
682
    {
689
    {
683
      $count = keys %data;
690
      $count = keys %data;
684
    }
691
    }
685
    else
692
    else
686
    {
693
    {
687
      $count = $topposters;
694
      $count = $topposters;
688
    }
695
    }
689
    printf "%s\n",
696
    printf "%s\n",
690
      centred(
697
      centred(
691
      __x( "Top {count} posters by number of articles", count => $topposters ),
698
      __x( "Top {count} posters by number of articles", count => $topposters ),
692
      76
699
      76
693
      );
700
      );
694
    print "=" x 76, "\n";
701
    print "=" x 76, "\n";
695
    my $i = 0;
702
    my $i = 0;
696
    foreach
703
    foreach
697
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
704
      my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
698
    {
705
    {
699
      my $name = substr( $poster, 0, 65 );
706
      my $name = substr( $poster, 0, 65 );
700
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
707
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
701
        $data{$poster}{count};
708
        $data{$poster}{count};
702
      last if ( ++$i == $count );
709
      last if ( ++$i == $count );
703
    }
710
    }
704
    print "\n", "=" x 76, "\n";
711
    print "\n", "=" x 76, "\n";
705
  }
712
  }
706
713
707
  ######################################
714
  ######################################
708
  ## Show posters by size in KiB  Sec 2;
715
  ## Show posters by size in KiB  Sec 2;
709
  ######################################
716
  ######################################
710
  unless ( $skipSec{2} )
717
  unless ( $skipSec{2} )
711
  {
718
  {
712
    if ( keys %data < $topposters )
719
    if ( keys %data < $topposters )
713
    {
720
    {
714
      $count = keys %data;
721
      $count = keys %data;
715
    }
722
    }
716
    else
723
    else
717
    {
724
    {
718
      $count = $topposters;
725
      $count = $topposters;
719
    }
726
    }
720
    printf "%s\n",
727
    printf "%s\n",
721
      centred(
728
      centred(
722
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
729
      __x( "Top {count} posters by article size in KiB", count => $topposters ),
723
      76
730
      76
724
      );
731
      );
725
    print "=" x 76, "\n";
732
    print "=" x 76, "\n";
726
    my $i = 0;
733
    my $i = 0;
727
    foreach
734
    foreach
728
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
735
      my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
729
    {
736
    {
730
      my $name = substr( $poster, 0, 62 );
737
      my $name = substr( $poster, 0, 62 );
731
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
738
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
732
        $data{$poster}{size} / 1024;    #/
739
        $data{$poster}{size} / 1024;    #/
733
      last if ( ++$i == $count );
740
      last if ( ++$i == $count );
734
    }
741
    }
735
    print "\n", "=" x 76, "\n";
742
    print "\n", "=" x 76, "\n";
736
  }
743
  }
737
744
738
  #####################################
745
  #####################################
739
  ## Show top posters for original text
746
  ## Show top posters for original text
740
  #####################################
747
  #####################################
741
  my $topposters_real = 0;
748
  my $topposters_real = 0;
742
749
743
  unless ( $skipSec{3} )
750
  unless ( $skipSec{3} )
744
  {
751
  {
745
    if ( keys %data < $topposters )
752
    if ( keys %data < $topposters )
746
    {
753
    {
747
      $count = keys %data;
754
      $count = keys %data;
748
    }
755
    }
749
    else
756
    else
750
    {
757
    {
751
      $count = $topposters;
758
      $count = $topposters;
752
    }
759
    }
753
760
754
    printf "%s\n",
761
    printf "%s\n",
755
      centred(
762
      centred(
756
      __x(
763
      __x(
757
        "Top {count} responders by original text (> 5 articles)",
764
        "Top {count} responders by original text (> 5 articles)",
758
        count => $topposters
765
        count => $topposters
759
      ),
766
      ),
760
      76
767
      76
761
      );
768
      );
762
    print "=" x 76, "\n";
769
    print "=" x 76, "\n";
763
    foreach my $poster (
770
    foreach my $poster (
764
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
771
      sort { $data{$b}{percent} <=> $data{$a}{percent} }
765
      keys %data
772
      keys %data
766
      )
773
      )
767
    {
774
    {
768
      next if $data{$poster}{quoted} == 0;
775
      next if $data{$poster}{quoted} == 0;
769
      next if $data{$poster}{count} < 5;
776
      next if $data{$poster}{count} < 5;
770
      my $name = substr( $poster, 0, 63 );
777
      my $name = substr( $poster, 0, 63 );
771
      printf "%2d. %-63s : %02.2f%%\n", $topposters_real + 1,
778
      printf "%2d. %-63s : %02.2f%%\n", $topposters_real + 1,
772
        rpad( $poster, 63, "." ),
779
        rpad( $poster, 63, "." ),
773
        $data{$poster}{percent};
780
        $data{$poster}{percent};
774
      last if ( ++$topposters_real == $count );
781
      last if ( ++$topposters_real == $count );
775
    }
782
    }
776
    print "\n", "=" x 76, "\n";
783
    print "\n", "=" x 76, "\n";
777
  }
784
  }
778
785
779
  ########################################
786
  ########################################
780
  ## Show bottom posters for original text
787
  ## Show bottom posters for original text
781
  ########################################
788
  ########################################
782
789
783
  $skipSec{4} = ( $topposters_real <= $topposters ) unless defined $skipSec{4};
790
  $skipSec{4} = ( $topposters_real <= $topposters ) unless defined $skipSec{4};
784
791
785
  unless ( $skipSec{4} )
792
  unless ( $skipSec{4} )
786
  {
793
  {
787
    if ( keys %data < $topposters )
794
    if ( keys %data < $topposters )
788
    {
795
    {
789
      $count = keys %data;
796
      $count = keys %data;
790
    }
797
    }
791
    else
798
    else
792
    {
799
    {
793
      $count = $topposters;
800
      $count = $topposters;
794
    }
801
    }
795
802
796
    printf "%s\n",
803
    printf "%s\n",
797
      centred(
804
      centred(
798
      __x(
805
      __x(
799
        "Bottom {count} responders by original text  (> 5 articles)",
806
        "Bottom {count} responders by original text  (> 5 articles)",
800
        count => $topposters
807
        count => $topposters
801
      ),
808
      ),
802
      76
809
      76
803
      );
810
      );
804
    print "=" x 76, "\n";
811
    print "=" x 76, "\n";
805
    my $i = 0;
812
    my $i = 0;
806
    foreach my $poster (
813
    foreach my $poster (
807
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
814
      sort { $data{$a}{percent} <=> $data{$b}{percent} }
808
      keys %data
815
      keys %data
809
      )
816
      )
810
    {
817
    {
811
      next if $data{$poster}{quoted} == 0;
818
      next if $data{$poster}{quoted} == 0;
812
      next if $data{$poster}{count} < 5;
819
      next if $data{$poster}{count} < 5;
813
      my $name = substr( $poster, 0, 63 );
820
      my $name = substr( $poster, 0, 63 );
814
      printf "%2d. %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
821
      printf "%2d. %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
815
        $data{$poster}{percent};
822
        $data{$poster}{percent};
816
      last if ( ++$i == $count );
823
      last if ( ++$i == $count );
817
    }
824
    }
818
    print "\n", "=" x 76, "\n";
825
    print "\n", "=" x 76, "\n";
819
  }
826
  }
820
827
821
  #####################################
828
  #####################################
822
  ## Show threads by number of articles
829
  ## Show threads by number of articles
823
  #####################################
830
  #####################################
824
  unless ( $skipSec{5} )
831
  unless ( $skipSec{5} )
825
  {
832
  {
826
    if ( keys %threads < $topthreads )
833
    if ( keys %threads < $topthreads )
827
    {
834
    {
828
      $count = keys %threads;
835
      $count = keys %threads;
829
    }
836
    }
830
    else
837
    else
831
    {
838
    {
832
      $count = $topthreads;
839
      $count = $topthreads;
833
    }
840
    }
834
    printf "%s\n",
841
    printf "%s\n",
835
      centred(
842
      centred(
836
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
843
      __x( "Top {count} threads by no. of articles", count => $topthreads ),
837
      76 );
844
      76 );
838
    print "=" x 76, "\n";
845
    print "=" x 76, "\n";
839
    my $i = 0;
846
    my $i = 0;
840
    foreach my $thread (
847
    foreach my $thread (
841
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
848
      sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} }
842
      keys %threads
849
      keys %threads
843
      )
850
      )
844
    {
851
    {
845
      my $name = substr( $thread, 0, 65 );
852
      my $name = substr( $thread, 0, 65 );
846
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
853
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
847
        $threads{$thread}{'count'};
854
        $threads{$thread}{'count'};
848
      last if ( ++$i == $count );
855
      last if ( ++$i == $count );
849
    }
856
    }
850
    print "\n", "=" x 76, "\n";
857
    print "\n", "=" x 76, "\n";
851
  }
858
  }
852
859
853
  ##############################
860
  ##############################
854
  ## Show threads by size in KiB
861
  ## Show threads by size in KiB
855
  ##############################
862
  ##############################
856
  unless ( $skipSec{6} )
863
  unless ( $skipSec{6} )
857
  {
864
  {
858
    if ( keys %threads < $topthreads )
865
    if ( keys %threads < $topthreads )
859
    {
866
    {
860
      $count = keys %threads;
867
      $count = keys %threads;
861
    }
868
    }
862
    else
869
    else
863
    {
870
    {
864
      $count = $topthreads;
871
      $count = $topthreads;
865
    }
872
    }
866
    printf "%s\n",
873
    printf "%s\n",
867
      centred(
874
      centred(
868
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
875
      __x( "Top {count} threads by size in KiB", count => $topthreads ), 76 );
869
    print "=" x 76, "\n";
876
    print "=" x 76, "\n";
870
    my $i = 0;
877
    my $i = 0;
871
    foreach my $thread (
878
    foreach my $thread (
872
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
879
      sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} }
873
      keys %threads
880
      keys %threads
874
      )
881
      )
875
    {
882
    {
876
      my $name = substr( $thread, 0, 65 );
883
      my $name = substr( $thread, 0, 65 );
877
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
884
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
878
        $threads{$thread}{'size'} / 1024;    #/
885
        $threads{$thread}{'size'} / 1024;    #/
879
      last if ( ++$i == $count );
886
      last if ( ++$i == $count );
880
    }
887
    }
881
    print "\n", "=" x 76, "\n";
888
    print "\n", "=" x 76, "\n";
882
  }
889
  }
883
890
884
  ##################################
891
  ##################################
885
  ## Show top 10 cross-posted groups
892
  ## Show top 10 cross-posted groups
886
  ##################################
893
  ##################################
887
  unless ( $skipSec{7} )
894
  unless ( $skipSec{7} )
888
  {
895
  {
889
    delete $crossposts{"$newsgroup_name"};    # don't include ours
896
    delete $crossposts{"$newsgroup_name"};    # don't include ours
890
    if ( keys %crossposts < $topcrossposts )
897
    if ( keys %crossposts < $topcrossposts )
891
    {
898
    {
892
      $count = keys %crossposts;
899
      $count = keys %crossposts;
893
    }
900
    }
894
    else
901
    else
895
    {
902
    {
896
      $count = $topcrossposts;
903
      $count = $topcrossposts;
897
    }
904
    }
898
    printf "%s\n",
905
    printf "%s\n",
899
      centred(
906
      centred(
900
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
907
      __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 );
901
    print "=" x 76, "\n";
908
    print "=" x 76, "\n";
902
    my $i = 0;
909
    my $i = 0;
903
    foreach
910
    foreach
904
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
911
      my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
905
    {
912
    {
906
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
913
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
907
        $crossposts{$name};
914
        $crossposts{$name};
908
      last if ( ++$i == $count );
915
      last if ( ++$i == $count );
909
    }
916
    }
910
    print "\n", "=" x 76, "\n";
917
    print "\n", "=" x 76, "\n";
911
  }
918
  }
912
919
913
  #########################
920
  #########################
914
  ## Show agents and counts
921
  ## Show agents and counts
915
  #########################
922
  #########################
916
  unless ( $skipSec{8} )
923
  unless ( $skipSec{8} )
917
  {
924
  {
918
    if ( keys %agents < $topagents )
925
    if ( keys %agents < $topagents )
919
    {
926
    {
920
      $count = keys %agents;
927
      $count = keys %agents;
921
    }
928
    }
922
    else
929
    else
923
    {
930
    {
924
      $count = $topagents;
931
      $count = $topagents;
925
    }
932
    }
926
    printf "%s\n",
933
    printf "%s\n",
927
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
934
      centred( __x( "Top {count} user agents by poster", count => $topagents ),
928
      76 );
935
      76 );
929
    print "=" x 76, "\n";
936
    print "=" x 76, "\n";
930
    my $i = 0;
937
    my $i = 0;
931
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
938
    foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
932
    {
939
    {
933
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
940
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
934
        $agents{$agent};
941
        $agents{$agent};
935
      last if ( ++$i == $count );
942
      last if ( ++$i == $count );
936
    }
943
    }
937
    print "\n", "=" x 76, "\n";
944
    print "\n", "=" x 76, "\n";
938
  }
945
  }
939
946
940
  #######################
947
  #######################
941
  ## Show distinct agents
948
  ## Show distinct agents
942
  #######################
949
  #######################
943
  unless ( $skipSec{9} )
950
  unless ( $skipSec{9} )
944
  {
951
  {
945
    if ( keys %distinct_agent < $topagents )
952
    if ( keys %distinct_agent < $topagents )
946
    {
953
    {
947
      $count = keys %distinct_agent;
954
      $count = keys %distinct_agent;
948
    }
955
    }
949
    else
956
    else
950
    {
957
    {
951
      $count = $topagents;
958
      $count = $topagents;
952
    }
959
    }
953
    printf "%s\n",
960
    printf "%s\n",
954
      centred(
961
      centred(
955
      __x(
962
      __x(
956
        "Top {count} user agents by number of articles",
963
        "Top {count} user agents by number of articles",
957
        count => $topagents
964
        count => $topagents
958
      ),
965
      ),
959
      76
966
      76
960
      );
967
      );
961
    print "=" x 76, "\n";
968
    print "=" x 76, "\n";
962
    my $i = 0;
969
    my $i = 0;
963
    foreach my $agent (
970
    foreach my $agent (
964
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
971
      sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
965
      keys %distinct_agent
972
      keys %distinct_agent
966
      )
973
      )
967
    {
974
    {
968
      printf "%2d. %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
975
      printf "%2d. %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
969
        $distinct_agent{$agent},
976
        $distinct_agent{$agent},
970
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
977
        ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
971
      last if ( ++$i == $count );
978
      last if ( ++$i == $count );
972
    }
979
    }
973
    print "\n", "=" x 76, "\n";
980
    print "\n", "=" x 76, "\n";
974
  }
981
  }
975
982
976
  ############################
983
  ############################
977
  ## Show timezones and counts
984
  ## Show timezones and counts
978
  ############################
985
  ############################
979
  unless ( $skipSec{10} )
986
  unless ( $skipSec{10} )
980
  {
987
  {
981
    if ( keys %tz < $toptz )
988
    if ( keys %tz < $toptz )
982
    {
989
    {
983
      $count = keys %tz;
990
      $count = keys %tz;
984
    }
991
    }
985
    else
992
    else
986
    {
993
    {
987
      $count = $toptz;
994
      $count = $toptz;
988
    }
995
    }
989
    printf "%s\n",
996
    printf "%s\n",
990
      centred(
997
      centred(
991
      __x( "Top {count} time zones by number of articles", count => $toptz ),
998
      __x( "Top {count} time zones by number of articles", count => $toptz ),
992
      76 );
999
      76 );
993
    print "=" x 76, "\n";
1000
    print "=" x 76, "\n";
994
    my $i = 0;
1001
    my $i = 0;
995
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
1002
    foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
996
    {
1003
    {
997
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
1004
      printf "%2d. %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
998
      last if ( ++$i == $count );
1005
      last if ( ++$i == $count );
999
    }
1006
    }
1000
    print "\n", "=" x 76, "\n";
1007
    print "\n", "=" x 76, "\n";
1001
  }
1008
  }
1002
}
1009
}
1003
1010
1004
## helper subs
1011
## helper subs
1005
1012
1006
###############################
1013
###############################
1007
## Right pad a string with '.'s
1014
## Right pad a string with '.'s
1008
###############################
1015
###############################
1009
sub rpad
1016
sub rpad
1010
{
1017
{
1011
  ## Get text to pad, length to pad, pad chr
1018
  ## Get text to pad, length to pad, pad chr
1012
  my ( $text, $pad_len, $pad_chr ) = @_;
1019
  my ( $text, $pad_len, $pad_chr ) = @_;
1013
1020
1014
  ## DEBUG
1021
  ## DEBUG
1015
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
1022
  printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1;
1016
1023
1017
  if ( length($text) > $pad_len )
1024
  if ( length($text) > $pad_len )
1018
  {
1025
  {
1019
    $text = substr( $text, 0, $pad_len );
1026
    $text = substr( $text, 0, $pad_len );
1020
  }
1027
  }
1021
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
1028
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
1022
  return $padded;
1029
  return $padded;
1023
}
1030
}
1024
1031
1025
##################
1032
##################
1026
## Centre a string
1033
## Centre a string
1027
##################
1034
##################
1028
sub centred
1035
sub centred
1029
{
1036
{
1030
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
1037
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
1031
  my $pad_len = ( $width - length($text) ) / 2;    #/
1038
  my $pad_len = ( $width - length($text) ) / 2;    #/
1032
  my $centred = " " x $pad_len . $text;
1039
  my $centred = " " x $pad_len . $text;
1033
  return $centred;
1040
  return $centred;
1034
}
1041
}
1035
1042
1036
###########################
1043
###########################
1037
## Put commas into a number
1044
## Put commas into a number
1038
###########################
1045
###########################
1039
sub commify
1046
sub commify
1040
{
1047
{
1041
  local $_ = shift;
1048
  local $_ = shift;
1042
  my $number = $_;
1049
  my $number = $_;
1043
  $_ = int;                                        # Chop non-integer part
1050
  $_ = int;                                        # Chop non-integer part
1044
  1 while
1051
  1 while
1045
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1052
    s/([-+]?\d)(\d{$grouping[0]}($|\Q$thousands_sep\E))/$1$thousands_sep$2/;
1046
  my $int_part  = $_;
1053
  my $int_part  = $_;
1047
  my $real_part = '';
1054
  my $real_part = '';
1048
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1055
  if ( $number =~ /(\Q$decimal_point\E\d+)$/ )
1049
  {
1056
  {
1050
    $real_part = $1;
1057
    $real_part = $1;
1051
  }
1058
  }
1052
  return $int_part . $real_part;
1059
  return $int_part . $real_part;
1053
}
1060
}
1054
1061
1055
################################################################
1062
################################################################
1056
## Returns a string with leading and trailing whitespace removed
1063
## Returns a string with leading and trailing whitespace removed
1057
################################################################
1064
################################################################
1058
sub clean
1065
sub clean
1059
{
1066
{
1060
  my $dirty = shift;
1067
  my $dirty = shift;
1061
  my $clean = $dirty;
1068
  my $clean = $dirty;
1062
  $clean =~ s/^\s+|\s+$//g;
1069
  $clean =~ s/^\s+|\s+$//g;
1063
1070
1064
  return $clean;
1071
  return $clean;
1065
}
1072
}
1066
1073
1067
sub usage
1074
sub usage
1068
{
1075
{
1069
  print __ "usage: newsstat.pl NEWS.GROUP", "\n";
1076
  print __ "usage: newsstat.pl NEWS.GROUP", "\n";
1070
  exit 1;
1077
  exit 1;
1071
}
1078
}
1072
1079
1073
sub dmsg
1080
sub dmsg
1074
{
1081
{
1075
  print STDERR @_, "\n";
1082
  print STDERR @_, "\n";
1076
}
1083
}
1077
1084
1078
sub dmsg2
1085
sub dmsg2
1079
{
1086
{
1080
  my ( $level, @msg ) = @_;
1087
  my ( $level, @msg ) = @_;
1081
  print STDERR @msg, "\n" if $level >= DEBUG;
1088
  print STDERR @msg, "\n" if $level >= DEBUG;
1082
}
1089
}
1083
 
1090