Subversion Repositories LCARS

Rev

Rev 8 | Rev 10 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 8 Rev 9
1
#!/usr/bin/env perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
4
use utf8;
4
use utf8;
-
 
5
use encoding 'utf-8';
5
use Encode;
6
use Encode;
6
7
7
###########################
8
###########################
8
# newsstat.pl version 0.4.1
9
# newsstat.pl version 0.4.1
9
10
10
############################################################################
11
############################################################################
11
# Collect statistics about a newsgroup (specified by first argument) in
12
# Collect statistics about a newsgroup (specified by first argument) in
12
# the local news spool. Check all articles in the last 30-day period.
13
# the local news spool. Check all articles in the last 30-day period.
13
# Rank posters by number of posts and by volume of posts, report on top and
14
# Rank posters by number of posts and by volume of posts, report on top and
14
# bottom 20 posters. Show their name, number of posts, size of posts,
15
# bottom 20 posters. Show their name, number of posts, size of posts,
15
# percentage of quoted lines. Rank user-agents used, by poster rather than
16
# percentage of quoted lines. Rank user-agents used, by poster rather than
16
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
17
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
17
#
18
#
18
# (Numbers and paths can be configured below.  -- PE)
19
# (Numbers and paths can be configured below.  -- PE)
19
############################################################################
20
############################################################################
20
21
21
############################################################################
22
############################################################################
22
#                       RECENT CHANGES                                     #
23
#                       RECENT CHANGES                                     #
-
 
24
# 2011-10-03  PE  - Use more compatible shebang
-
 
25
#                 - Fixed some Perl::Critic-ized code
-
 
26
#                 - Fixed wrong indent for non-ASCII names (TODO: proper dot count)
-
 
27
#                 - Formatted source code
23
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
28
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
24
#                 - Use warnings, utf8 (just in case)
29
#                 - Use warnings, utf8 (just in case)
25
#                 - Documentation update
30
#                 - Documentation update
26
# N/A         NN  - Take newsgroup name as argument
31
# N/A         NN  - Take newsgroup name as argument
27
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
32
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
28
#                 - Allow command line flags for subtracting
33
#                 - Allow command line flags for subtracting
29
#                   output if not pertinent for a group
34
#                   output if not pertinent for a group
30
# 2002-11-09  NN  - Put Garry's writedata() function back in.
35
# 2002-11-09  NN  - Put Garry's writedata() function back in.
31
#                 - added "rn" to my list of UA's
36
#                 - added "rn" to my list of UA's
32
#                 - Started using %distinct_agent for both User agent
37
#                 - Started using %distinct_agent for both User agent
33
#                   sections
38
#                   sections
34
#                 - named it newsstat.pl version 0.3
39
#                 - named it newsstat.pl version 0.3
35
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
40
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
36
#                   mtime rather than ctime, and simplifying the logic
41
#                   mtime rather than ctime, and simplifying the logic
37
# 2002-11-05  NN  - moved user configurations to the top
42
# 2002-11-05  NN  - moved user configurations to the top
38
#                 - fixed the cross-posting section
43
#                 - fixed the cross-posting section
39
#                 - introduced the $newsgroup_name variable which
44
#                 - introduced the $newsgroup_name variable which
40
#                   later becomes $news$group
45
#                   later becomes $news$group
41
#                 - changed $name to $agent_name in countagents()
46
#                 - changed $name to $agent_name in countagents()
42
#
47
#
43
# Contributors
48
# Contributors
44
# -------------
49
# -------------
45
# NN  Nomen nominandum (name to be determined later)
50
# NN  Nomen nominandum (name to be determined later)
46
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
51
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
47
52
48
########### TODO #############
53
########### TODO #############
49
# Commas in bottom section of report
54
# Commas in bottom section of report
50
# Show date the figures were compiled
55
# Show date the figures were compiled
51
# No. of HTML articles (Content-Type: text/html)
56
# No. of HTML articles (Content-Type: text/html)
52
# No. of quoted sigs (/>\s*-- /)
57
# No. of quoted sigs (/>\s*-- /)
53
# Per cent of top-posted articles
58
# Per cent of top-posted articles
54
# Top 10 cross-posters
59
# Top 10 cross-posters
55
# Top 20 news posting hosts (from Path)
60
# Top 20 news posting hosts (from Path)
56
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
61
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
57
# Count *all* User Agents that each poster uses
62
# Count *all* User Agents that each poster uses
58
# What do we do about Bill Unruh's ] quote style?
63
# What do we do about Bill Unruh's ] quote style?
59
# Change the way dates/times are checked
64
# Change the way dates/times are checked
60
# include % share in posters by no. of arts
65
# include % share in posters by no. of arts
61
# include % share in posters by size
66
# include % share in posters by size
62
# Total, orig & quoted lines by user agent with per cent
67
# Total, orig & quoted lines by user agent with per cent
63
# Take more arguments
68
# Take more arguments
64
#######################################################
69
#######################################################
65
70
66
###################### USER CONFIGURATIONS ############################
71
###################### USER CONFIGURATIONS ############################
67
72
68
# The name of the group to do stats for
73
# The name of the group to do stats for
69
my $newsgroup_name = $ARGV[0];
74
my $newsgroup_name = $ARGV[0];
70
$newsgroup_name or &usage;
75
$newsgroup_name or &usage;
71
76
72
# Check for removal flags
77
# Check for removal flags
73
my $ix;
78
my $ix;
74
my $j;
79
my $j;
75
my %skipSec;
80
my %skipSec;
76
my @skiplist;
81
my @skiplist;
77
my $args = @ARGV;
82
my $args = @ARGV;
78
for ( $ix = 1 ; $ix < $args ; $ix++ )
83
for ( $ix = 1 ; $ix < $args ; $ix++ )
79
{
84
{
80
  $j = $ix + 1;
85
  $j = $ix + 1;
81
  if ( $ARGV[$ix] eq "-x" )
86
  if ( $ARGV[$ix] eq "-x" )
82
  {
87
  {
83
    @skiplist = split( ",", $ARGV[$j] );
88
    @skiplist = split( ",", $ARGV[$j] );
84
  }
89
  }
85
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
90
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
86
  {
91
  {
87
    @skiplist = split( ",", $1 );
92
    @skiplist = split( ",", $1 );
88
  }
93
  }
89
}
94
}
90
foreach (@skiplist)
95
foreach (@skiplist)
91
{
96
{
92
  $skipSec{$_} = 1;
97
  $skipSec{$_} = 1;
93
}
98
}
94
99
95
# Leafnode users will want /var/spool/news for this variable.
100
# Leafnode users will want /var/spool/news for this variable.
96
my $news = "/var/spool/news/";
101
my $news = "/var/spool/news/";
97
102
98
# How many days are we doing statistics for?
103
# How many days are we doing statistics for?
99
my $numdays = 30;
104
my $numdays = 30;
100
105
101
# no. of agents we list
106
# no. of agents we list
102
my $topagents = 10;
107
my $topagents = 10;
103
108
104
# no. of threads we want to know about
109
# no. of threads we want to know about
105
my $topthreads = 20;
110
my $topthreads = 20;
106
111
107
# no. of top or bottom posters to show
112
# no. of top or bottom posters to show
108
my $topposters = 20;
113
my $topposters = 20;
109
114
110
# no. of cross-posted threads to show
115
# no. of cross-posted threads to show
111
my $topcrossposts = 10;
116
my $topcrossposts = 10;
112
117
113
# no. of time zones to show
118
# no. of time zones to show
114
my $toptz = 10;
119
my $toptz = 10;
115
120
116
###################### DATA STRUCTURES ######################
121
###################### DATA STRUCTURES ######################
117
my $group = $newsgroup_name;
122
my $group = $newsgroup_name;
118
$group =~ s!\.!/!g;
123
$group =~ s!\.!/!g;
119
my %data;          # name, count, agent, total, orig, quoted
124
my %data;          # name, count, agent, total, orig, quoted
120
my %threads;       # subject, count
125
my %threads;       # subject, count
121
my %crossposts;    # group, count
126
my %crossposts;    # group, count
122
my %tz;            # timezones by count
127
my %tz;            # timezones by count
123
my %headers;       # holds header of current article
128
my %headers;       # holds header of current article
124
my %lcheader;      # holds lowercase headers
129
my %lcheader;      # holds lowercase headers
125
my @body;          # holds body of current article
130
my @body;          # holds body of current article
126
my @sig;           # holds sig text;
131
my @sig;           # holds sig text;
127
my $totalposts;    # total no. of posts considered
132
my $totalposts;    # total no. of posts considered
128
my $filename;      # name of current article file
133
my $filename;      # name of current article file
129
my $filesize;      # size of current article file
134
my $filesize;      # size of current article file
130
my $earliest;      # earliest article we have found
135
my $earliest;      # earliest article we have found
131
my $latest;        # latest article we have found
136
my $latest;        # latest article we have found
132
my $poster;        # poster we are dealing with
137
my $poster;        # poster we are dealing with
133
my $totsize   = 0; # holds total sizes of all files
138
my $totsize   = 0; # holds total sizes of all files
134
my $totheader = 0; # total size of header material
139
my $totheader = 0; # total size of header material
135
my $totbody   = 0; # total size of body material
140
my $totbody   = 0; # total size of body material
136
my $totsig    = 0; # total size of sig material
141
my $totsig    = 0; # total size of sig material
137
my $totorig   = 0; # total size of original material
142
my $totorig   = 0; # total size of original material
138
my $totquoted = 0; # total size of quoted material
143
my $totquoted = 0; # total size of quoted material
139
my $origposts = 0; # total no. of original posts
144
my $origposts = 0; # total no. of original posts
140
my $replies   = 0; # total no. of replies
145
my $replies   = 0; # total no. of replies
141
my $i;             # general purpose
146
my $i;             # general purpose
142
my %distinct_agent;
147
my %distinct_agent;
143
my %agents =       # used to hold counts of User Agents used
148
my %agents =       # used to hold counts of User Agents used
144
  (
149
  (
145
  "KNode"                     => 0,
150
  "KNode"                     => 0,
146
  "Pan"                       => 0,
151
  "Pan"                       => 0,
147
  "Mozilla"                   => 0,
152
  "Mozilla"                   => 0,
148
  "Sylpheed"                  => 0,
153
  "Sylpheed"                  => 0,
149
  "Gnus"                      => 0,
154
  "Gnus"                      => 0,
150
  "Forte Agent"               => 0,
155
  "Forte Agent"               => 0,
151
  "Forte Free Agent"          => 0,
156
  "Forte Free Agent"          => 0,
152
  "MicroPlanet Gravity"       => 0,
157
  "MicroPlanet Gravity"       => 0,
153
  "Microsoft Outlook Express" => 0,
158
  "Microsoft Outlook Express" => 0,
154
  "Xnews"                     => 0,
159
  "Xnews"                     => 0,
155
  "slrn"                      => 0,
160
  "slrn"                      => 0,
156
  "tin"                       => 0,
161
  "tin"                       => 0,
157
  "rn"                        => 0,
162
  "rn"                        => 0,
158
  "NN"                        => 0,
163
  "NN"                        => 0,
159
  "MacSOUP"                   => 0,
164
  "MacSOUP"                   => 0,
160
  "Foorum"                    => 0,
165
  "Foorum"                    => 0,
161
  "MT-NewsWatcher"            => 0,
166
  "MT-NewsWatcher"            => 0,
162
  "News Rover"                => 0,
167
  "News Rover"                => 0,
163
  "WebTV"                     => 0,
168
  "WebTV"                     => 0,
164
  "Compuserver"               => 0,
169
  "Compuserver"               => 0,
165
  "VSoup"                     => 0
170
  "VSoup"                     => 0
166
  );
171
  );
167
172
168
######################## MAIN CODE ########################
173
######################## MAIN CODE ########################
169
$! = 1;
174
$! = 1;
170
175
171
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
176
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
172
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
177
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
173
while ( defined( $filename = readdir(DIR) ) )
178
while ( defined( $filename = readdir(DIR) ) )
174
{
179
{
175
  %lcheader = ();
180
  %lcheader = ();
176
  next unless -f $filename;    # only want real files
181
  next unless -f $filename;    # only want real files
177
  next if ( $filename eq ".overview" );    # real articles only
182
  next if ( $filename eq ".overview" );    # real articles only
178
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
183
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
179
  $earliest = ( stat $filename )[9] unless defined($earliest);
184
  $earliest = ( stat $filename )[9] unless defined($earliest);
180
  $latest   = ( stat $filename )[9] unless defined($latest);
185
  $latest   = ( stat $filename )[9] unless defined($latest);
181
  &getarticle($filename);                  # read in the article
186
  &getarticle($filename);                  # read in the article
182
  &getdata;                                # grab the data from the article
187
  &getdata;                                # grab the data from the article
183
  $totalposts++;                           # bump count of articles considered
188
  $totalposts++;                           # bump count of articles considered
184
}
189
}
185
closedir(DIR);                             # finished with the directory
190
closedir(DIR);                             # finished with the directory
186
191
187
# post-processing
192
# post-processing
188
&countagents;    # count agents, collapsing versions
193
&countagents;    # count agents, collapsing versions
189
&fixpercent;     # check percentages orig/total for posters
194
&fixpercent;     # check percentages orig/total for posters
190
195
191
&writedata;
196
&writedata;
192
197
193
#################### DISPLAY RESULTS #####################
198
#################### DISPLAY RESULTS #####################
194
print "=" x 76, "\n";
199
print "=" x 76, "\n";
195
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
200
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
196
print "=" x 76, "\n";
201
print "=" x 76, "\n";
197
printf "%s\n",
202
printf "%s\n",
198
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
203
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
199
print "\n\n";
204
print "\n\n";
200
printf "Total posts considered: %s over %d days\n", commify($totalposts),
205
printf "Total posts considered: %s over %d days\n", commify($totalposts),
201
  $numdays;
206
  $numdays;
202
printf "Earliest article: %s\n",               scalar localtime($earliest);
207
printf "Earliest article: %s\n",               scalar localtime($earliest);
203
printf "Latest article:   %s\n",               scalar localtime($latest);
208
printf "Latest article:   %s\n",               scalar localtime($latest);
204
printf "Original articles: %s, replies: %s\n", commify($origposts),
209
printf "Original articles: %s, replies: %s\n", commify($origposts),
205
  commify($replies);
210
  commify($replies);
206
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
211
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
207
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
212
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
208
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
213
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
209
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
214
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
210
  commify( int( $totsize / $totalposts ) );
215
  commify( int( $totsize / $totalposts ) );
211
my $count = keys %data;
216
my $count = keys %data;
212
printf "Total headers: %s KB  bodies: %s KB\n",
217
printf "Total headers: %s KB  bodies: %s KB\n",
213
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
218
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
214
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
219
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
215
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
220
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
216
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
221
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
217
  commify( int( $totsig / 1024 ) );
222
  commify( int( $totsig / 1024 ) );
218
printf "Total number of posters: %s, average %s bytes per poster\n",
223
printf "Total number of posters: %s, average %s bytes per poster\n",
219
  commify($count), commify( int( $totsize / $count ) );     #/
224
  commify($count), commify( int( $totsize / $count ) );     #/
220
$count = keys %threads;
225
$count = keys %threads;
221
printf "Total number of threads: %s, average %s bytes per thread\n",
226
printf "Total number of threads: %s, average %s bytes per thread\n",
222
  commify($count), commify( int( $totsize / $count ) );     #/
227
  commify($count), commify( int( $totsize / $count ) );     #/
223
printf "Total number of User-Agents: %d\n", scalar keys %agents;
228
printf "Total number of User-Agents: %d\n", scalar keys %agents;
224
print "\n", "=" x 76, "\n";
229
print "\n", "=" x 76, "\n";
225
230
226
###############################
231
###############################
227
# show posters by article count  Sec 1;
232
# show posters by article count  Sec 1;
228
###############################
233
###############################
229
unless ( $skipSec{1} )
234
unless ( $skipSec{1} )
230
{
235
{
231
  if ( keys %data < $topposters )
236
  if ( keys %data < $topposters )
232
  {
237
  {
233
    $count = keys %data;
238
    $count = keys %data;
234
  }
239
  }
235
  else
240
  else
236
  {
241
  {
237
    $count = $topposters;
242
    $count = $topposters;
238
  }
243
  }
239
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
244
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
240
  print "=" x 76, "\n";
245
  print "=" x 76, "\n";
241
  $i = 0;
246
  $i = 0;
-
 
247
  foreach
242
  foreach my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
248
    my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
243
  {
249
  {
244
    my $name = substr( $poster, 0, 65 );
250
    my $name = substr( $poster, 0, 65 );
245
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
251
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
246
      $data{$poster}{count};
252
      $data{$poster}{count};
247
    last if ( ++$i == $count );
253
    last if ( ++$i == $count );
248
  }
254
  }
249
  print "\n", "=" x 76, "\n";
255
  print "\n", "=" x 76, "\n";
250
}
256
}
251
257
252
################################
258
################################
253
# show posters by size in Kbytes Sec 2;
259
# show posters by size in Kbytes Sec 2;
254
################################
260
################################
255
unless ( $skipSec{2} )
261
unless ( $skipSec{2} )
256
{
262
{
257
  if ( keys %data < $topposters )
263
  if ( keys %data < $topposters )
258
  {
264
  {
259
    $count = keys %data;
265
    $count = keys %data;
260
  }
266
  }
261
  else
267
  else
262
  {
268
  {
263
    $count = $topposters;
269
    $count = $topposters;
264
  }
270
  }
265
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
271
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
266
  print "=" x 76, "\n";
272
  print "=" x 76, "\n";
267
  $i = 0;
273
  $i = 0;
268
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
274
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
269
  {
275
  {
270
    my $name = substr( $poster, 0, 62 );
276
    my $name = substr( $poster, 0, 62 );
271
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
277
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
272
      $data{$poster}{size} / 1024;    #/
278
      $data{$poster}{size} / 1024;    #/
273
    last if ( ++$i == $count );
279
    last if ( ++$i == $count );
274
  }
280
  }
275
  print "\n", "=" x 76, "\n";
281
  print "\n", "=" x 76, "\n";
276
}
282
}
277
283
278
####################################
284
####################################
279
# show top posters for original text
285
# show top posters for original text
280
####################################
286
####################################
281
unless ( $skipSec{3} )
287
unless ( $skipSec{3} )
282
{
288
{
283
  if ( keys %data < $topposters )
289
  if ( keys %data < $topposters )
284
  {
290
  {
285
    $count = keys %data;
291
    $count = keys %data;
286
  }
292
  }
287
  else
293
  else
288
  {
294
  {
289
    $count = $topposters;
295
    $count = $topposters;
290
  }
296
  }
291
  printf "%s\n",
297
  printf "%s\n",
292
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
298
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
293
  print "=" x 76, "\n";
299
  print "=" x 76, "\n";
294
  $i = 0;
300
  $i = 0;
-
 
301
  foreach my $poster (
295
  foreach my $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
302
    sort { $data{$b}{percent} <=> $data{$a}{percent} }
296
    keys %data )
303
    keys %data
-
 
304
    )
297
  {
305
  {
298
    next if $data{$poster}{quoted} == 0;
306
    next if $data{$poster}{quoted} == 0;
299
    next if $data{$poster}{count} < 5;
307
    next if $data{$poster}{count} < 5;
300
    my $name = substr( $poster, 0, 63 );
308
    my $name = substr( $poster, 0, 63 );
301
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
309
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
302
      $data{$poster}{percent};
310
      $data{$poster}{percent};
303
    last if ( ++$i == $count );
311
    last if ( ++$i == $count );
304
  }
312
  }
305
  print "\n", "=" x 76, "\n";
313
  print "\n", "=" x 76, "\n";
306
}
314
}
307
315
308
#######################################
316
#######################################
309
# show bottom posters for original text
317
# show bottom posters for original text
310
#######################################
318
#######################################
311
unless ( $skipSec{4} )
319
unless ( $skipSec{4} )
312
{
320
{
313
  if ( keys %data < $topposters )
321
  if ( keys %data < $topposters )
314
  {
322
  {
315
    $count = keys %data;
323
    $count = keys %data;
316
  }
324
  }
317
  else
325
  else
318
  {
326
  {
319
    $count = $topposters;
327
    $count = $topposters;
320
  }
328
  }
321
  printf "%s\n",
329
  printf "%s\n",
322
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
330
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
323
  print "=" x 76, "\n";
331
  print "=" x 76, "\n";
324
  $i = 0;
332
  $i = 0;
-
 
333
  foreach my $poster (
325
  foreach my $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
334
    sort { $data{$a}{percent} <=> $data{$b}{percent} }
326
    keys %data )
335
    keys %data
-
 
336
    )
327
  {
337
  {
328
    next if $data{$poster}{quoted} == 0;
338
    next if $data{$poster}{quoted} == 0;
329
    next if $data{$poster}{count} < 5;
339
    next if $data{$poster}{count} < 5;
330
    my $name = substr( $poster, 0, 63 );
340
    my $name = substr( $poster, 0, 63 );
331
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
341
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
332
      $data{$poster}{percent};
342
      $data{$poster}{percent};
333
    last if ( ++$i == $count );
343
    last if ( ++$i == $count );
334
  }
344
  }
335
  print "\n", "=" x 76, "\n";
345
  print "\n", "=" x 76, "\n";
336
}
346
}
337
347
338
####################################
348
####################################
339
# show threads by number of articles
349
# show threads by number of articles
340
####################################
350
####################################
341
unless ( $skipSec{5} )
351
unless ( $skipSec{5} )
342
{
352
{
343
  if ( keys %threads < $topthreads )
353
  if ( keys %threads < $topthreads )
344
  {
354
  {
345
    $count = keys %threads;
355
    $count = keys %threads;
346
  }
356
  }
347
  else
357
  else
348
  {
358
  {
349
    $count = $topthreads;
359
    $count = $topthreads;
350
  }
360
  }
351
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
361
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
352
  print "=" x 76, "\n";
362
  print "=" x 76, "\n";
353
  $i = 0;
363
  $i = 0;
-
 
364
  foreach my $thread (
354
  foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
365
    sort { $threads{$b}{count} <=> $threads{$a}{count} }
355
    keys %threads )
366
    keys %threads
-
 
367
    )
356
  {
368
  {
357
    my $name = substr( $thread, 0, 65 );
369
    my $name = substr( $thread, 0, 65 );
358
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
370
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
359
      $threads{$thread}{count};
371
      $threads{$thread}{count};
360
    last if ( ++$i == $count );
372
    last if ( ++$i == $count );
361
  }
373
  }
362
  print "\n", "=" x 76, "\n";
374
  print "\n", "=" x 76, "\n";
363
}
375
}
364
################################
376
################################
365
# show threads by size in Kbytes
377
# show threads by size in Kbytes
366
################################
378
################################
367
unless ( $skipSec{6} )
379
unless ( $skipSec{6} )
368
{
380
{
369
  if ( keys %threads < $topthreads )
381
  if ( keys %threads < $topthreads )
370
  {
382
  {
371
    $count = keys %threads;
383
    $count = keys %threads;
372
  }
384
  }
373
  else
385
  else
374
  {
386
  {
375
    $count = $topthreads;
387
    $count = $topthreads;
376
  }
388
  }
377
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
389
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
378
  print "=" x 76, "\n";
390
  print "=" x 76, "\n";
379
  $i = 0;
391
  $i = 0;
-
 
392
  foreach my $thread (
380
  foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
393
    sort { $threads{$b}{size} <=> $threads{$a}{size} }
381
    keys %threads )
394
    keys %threads
-
 
395
    )
382
  {
396
  {
383
    my $name = substr( $thread, 0, 65 );
397
    my $name = substr( $thread, 0, 65 );
384
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
398
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
385
      $threads{$thread}{size} / 1024;    #/
399
      $threads{$thread}{size} / 1024;    #/
386
    last if ( ++$i == $count );
400
    last if ( ++$i == $count );
387
  }
401
  }
388
  print "\n", "=" x 76, "\n";
402
  print "\n", "=" x 76, "\n";
389
}
403
}
390
404
391
#################################
405
#################################
392
# show top 10 cross-posted groups
406
# show top 10 cross-posted groups
393
#################################
407
#################################
394
unless ( $skipSec{7} )
408
unless ( $skipSec{7} )
395
{
409
{
396
  delete $crossposts{"$newsgroup_name"};    # don't include ours
410
  delete $crossposts{"$newsgroup_name"};    # don't include ours
397
  if ( keys %crossposts < $topcrossposts )
411
  if ( keys %crossposts < $topcrossposts )
398
  {
412
  {
399
    $count = keys %crossposts;
413
    $count = keys %crossposts;
400
  }
414
  }
401
  else
415
  else
402
  {
416
  {
403
    $count = $topcrossposts;
417
    $count = $topcrossposts;
404
  }
418
  }
405
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
419
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
406
  print "=" x 76, "\n";
420
  print "=" x 76, "\n";
407
  $i = 0;
421
  $i = 0;
408
  foreach
422
  foreach
409
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
423
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
410
  {
424
  {
411
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
425
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
412
      $crossposts{$name};
426
      $crossposts{$name};
413
    last if ( ++$i == $count );
427
    last if ( ++$i == $count );
414
  }
428
  }
415
  print "\n", "=" x 76, "\n";
429
  print "\n", "=" x 76, "\n";
416
}
430
}
417
#######################
431
#######################
418
#show agents and counts
432
#show agents and counts
419
#######################
433
#######################
420
unless ( $skipSec{8} )
434
unless ( $skipSec{8} )
421
{
435
{
422
  if ( keys %agents < $topagents )
436
  if ( keys %agents < $topagents )
423
  {
437
  {
424
    $count = keys %agents;
438
    $count = keys %agents;
425
  }
439
  }
426
  else
440
  else
427
  {
441
  {
428
    $count = $topagents;
442
    $count = $topagents;
429
  }
443
  }
430
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
444
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
431
  print "=" x 76, "\n";
445
  print "=" x 76, "\n";
432
  $i = 0;
446
  $i = 0;
433
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
447
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
434
  {
448
  {
435
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
449
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
436
      $agents{$agent};
450
      $agents{$agent};
437
    last if ( ++$i == $count );
451
    last if ( ++$i == $count );
438
  }
452
  }
439
  print "\n", "=" x 76, "\n";
453
  print "\n", "=" x 76, "\n";
440
}
454
}
441
455
442
#######################
456
#######################
443
#show distinct agents
457
#show distinct agents
444
#######################
458
#######################
445
unless ( $skipSec{9} )
459
unless ( $skipSec{9} )
446
{
460
{
447
  if ( keys %distinct_agent < $topagents )
461
  if ( keys %distinct_agent < $topagents )
448
  {
462
  {
449
    $count = keys %distinct_agent;
463
    $count = keys %distinct_agent;
450
  }
464
  }
451
  else
465
  else
452
  {
466
  {
453
    $count = $topagents;
467
    $count = $topagents;
454
  }
468
  }
455
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
469
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
456
  print "=" x 76, "\n";
470
  print "=" x 76, "\n";
457
  $i = 0;
471
  $i = 0;
-
 
472
  foreach my $agent (
458
  foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
473
    sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
459
    keys %distinct_agent )
474
    keys %distinct_agent
-
 
475
    )
460
  {
476
  {
461
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
477
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
462
      $distinct_agent{$agent},
478
      $distinct_agent{$agent},
463
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
479
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
464
    last if ( ++$i == $count );
480
    last if ( ++$i == $count );
465
  }
481
  }
466
  print "\n", "=" x 76, "\n";
482
  print "\n", "=" x 76, "\n";
467
}
483
}
468
484
469
##########################
485
##########################
470
#show timezones and counts
486
#show timezones and counts
471
##########################
487
##########################
472
unless ( $skipSec{10} )
488
unless ( $skipSec{10} )
473
{
489
{
474
  if ( keys %tz < $toptz )
490
  if ( keys %tz < $toptz )
475
  {
491
  {
476
    $count = keys %tz;
492
    $count = keys %tz;
477
  }
493
  }
478
  else
494
  else
479
  {
495
  {
480
    $count = $toptz;
496
    $count = $toptz;
481
  }
497
  }
482
  printf "%s\n", &centred( "Top 10 time zones", 76 );
498
  printf "%s\n", &centred( "Top 10 time zones", 76 );
483
  print "=" x 76, "\n";
499
  print "=" x 76, "\n";
484
  $i = 0;
500
  $i = 0;
485
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
501
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
486
  {
502
  {
487
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
503
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
488
    last if ( ++$i == $count );
504
    last if ( ++$i == $count );
489
  }
505
  }
490
  print "\n", "=" x 76, "\n";
506
  print "\n", "=" x 76, "\n";
491
}
507
}
492
508
493
################################ SUBROUTINES ################################
509
################################ SUBROUTINES ################################
494
510
495
#######################################
511
#######################################
496
# get current article's header and body
512
# get current article's header and body
497
#######################################
513
#######################################
498
sub getarticle
514
sub getarticle
499
{
515
{
500
  %headers = ();    # dump old headers
516
  %headers = ();    # dump old headers
501
  my $filename = shift;    # get the name of the file
517
  my $filename = shift;    # get the name of the file
502
518
503
  # get stats about the file itself
519
  # get stats about the file itself
504
  $filesize = -s $filename;    # get total size of file
520
  $filesize = -s $filename;    # get total size of file
505
  $totsize += $filesize;       # bump total sizes of all files
521
  $totsize += $filesize;       # bump total sizes of all files
506
522
507
  my $mtime = ( stat $filename )[9];
523
  my $mtime = ( stat $filename )[9];
508
  if ( $mtime < $earliest )
524
  if ( $mtime < $earliest )
509
  {
525
  {
510
    $earliest = $mtime;
526
    $earliest = $mtime;
511
  }
527
  }
512
  elsif ( $mtime > $latest )
528
  elsif ( $mtime > $latest )
513
  {
529
  {
514
    $latest = $mtime;
530
    $latest = $mtime;
515
  }
531
  }
516
532
517
  # now read the file
533
  # now read the file
518
  open(my $FILE, $filename) or die "Can't open $filename: $!\n";
534
  open( my $FILE, $filename ) or die "Can't open $filename: $!\n";
519
  while (<$FILE>)
535
  while (<$FILE>)
520
  {
536
  {
521
    $totheader += length($_);    # bump total header size
537
    $totheader += length($_);    # bump total header size
522
    last if (/^\s*$/);           # end of header?
538
    last if (/^\s*$/);           # end of header?
523
    if (/^([^:\s]*):\s+(.*)/)
539
    if (/^([^:\s]*):\s+(.*)/)
524
    {
540
    {
525
      my ( $key, $val ) = ( $1, $2 );
541
      my ( $key, $val ) = ( $1, $2 );
526
      $headers{$key} = decode( 'MIME-Header', $val );
542
      $headers{$key} = decode( 'MIME-Header', $val );
527
      $lcheader{ clean( lc($key) ) } = clean($val);
543
      $lcheader{ clean( lc($key) ) } = clean($val);
528
    }
544
    }
529
  }
545
  }
530
  @body = <$FILE>;                # slurp up body
546
  @body = <$FILE>;               # slurp up body
531
  close($FILE);
547
  close($FILE);
532
}    # getarticle
548
}    # getarticle
533
549
534
###################################
550
###################################
535
# get data from the current article
551
# get data from the current article
536
###################################
552
###################################
537
sub getdata
553
sub getdata
538
{
554
{
539
#### First, analyse header fields ####
555
#### First, analyse header fields ####
540
556
541
  # Set up this poster if not defined, get counts, sizes
557
  # Set up this poster if not defined, get counts, sizes
542
  $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
558
  my $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
543
  if ( !defined( $data{$poster} ) )
559
  if ( !defined( $data{$poster} ) )
544
  {                                               # seen this one before?
560
  {                                                  # seen this one before?
545
    $data{$poster}{agent}  = 'Unknown';           # comes after For: field
561
    $data{$poster}{agent}  = 'Unknown';              # comes after For: field
546
    $data{$poster}{orig}   = 0;
562
    $data{$poster}{orig}   = 0;
547
    $data{$poster}{quoted} = 0;
563
    $data{$poster}{quoted} = 0;
548
  }
564
  }
549
  $data{$poster}{count}++;                        # bump count for this poster
565
  $data{$poster}{count}++;    # bump count for this poster
550
  $data{$poster}{size} += $filesize;              # total size of file
566
  $data{$poster}{size} += $filesize;    # total size of file
551
567
552
  # The User-Agent and/or X-Newsreader fields
568
  # The User-Agent and/or X-Newsreader fields
553
  # for User-Agent by poster
569
  # for User-Agent by poster
554
  if ( defined $lcheader{"user-agent"} )
570
  if ( defined $lcheader{"user-agent"} )
555
  {
571
  {
556
    $data{$poster}{agent} = $lcheader{"user-agent"};
572
    $data{$poster}{agent} = $lcheader{"user-agent"};
557
  }
573
  }
558
  if ( defined $lcheader{"x-newsreader"} )
574
  if ( defined $lcheader{"x-newsreader"} )
559
  {
575
  {
560
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
576
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
561
  }
577
  }
562
578
563
  # The User Agent for User-Agent by number of posts
579
  # The User Agent for User-Agent by number of posts
564
  my $UA = "unknown";
580
  my $UA = "unknown";
565
  foreach my $keys ( keys %lcheader )
581
  foreach my $keys ( keys %lcheader )
566
  {
582
  {
567
    if ( defined $lcheader{'user-agent'} )
583
    if ( defined $lcheader{'user-agent'} )
568
    {
584
    {
569
      $UA = $lcheader{'user-agent'};
585
      $UA = $lcheader{'user-agent'};
570
    }
586
    }
571
    elsif ( defined $lcheader{"x-newsreader"} )
587
    elsif ( defined $lcheader{"x-newsreader"} )
572
    {
588
    {
573
      $UA = $lcheader{"x-newsreader"};
589
      $UA = $lcheader{"x-newsreader"};
574
    }
590
    }
575
    elsif ( defined $lcheader{'x-mailer'} )
591
    elsif ( defined $lcheader{'x-mailer'} )
576
    {
592
    {
577
      $UA = $lcheader{'x-mailer'};
593
      $UA = $lcheader{'x-mailer'};
578
    }
594
    }
579
    elsif (
595
    elsif (
580
      ( defined $lcheader{'organization'} )
596
      ( defined $lcheader{'organization'} )
581
      && ( $lcheader{'organization'} =~
597
      && ( $lcheader{'organization'} =~
582
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
598
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
583
      )
599
      )
584
    {
600
    {
585
      $UA = $lcheader{'organization'};
601
      $UA = $lcheader{'organization'};
586
    }
602
    }
587
    elsif ( $lcheader{'message-id'} =~ /pine/i )
603
    elsif ( $lcheader{'message-id'} =~ /pine/i )
588
    {
604
    {
589
      $UA = "Pine";
605
      $UA = "Pine";
590
    }    ## Hopefully found UA, else set to unknown
606
    }    ## Hopefully found UA, else set to unknown
591
  }
607
  }
592
608
593
  $UA = clean($UA);
609
  $UA = clean($UA);
594
  $UA = get_agent($UA);
610
  $UA = get_agent($UA);
595
611
596
  sub get_agent
612
  sub get_agent
597
  {
613
  {
598
    my $raw   = shift;
614
    my $raw   = shift;
599
    my $agent = $raw;
615
    my $agent = $raw;
600
616
601
    ## strip http
617
    ## strip http
602
    if ( $raw =~ /.*http.*/ )
618
    if ( $raw =~ /.*http.*/ )
603
    {
619
    {
604
      $raw =~ s!posted via!!i;
620
      $raw =~ s!posted via!!i;
605
      $raw =~ s!http://!!g;
621
      $raw =~ s!http://!!g;
606
      $raw =~ s!/!!g;
622
      $raw =~ s!/!!g;
607
      $raw =~ s! !!g;
623
      $raw =~ s! !!g;
608
    }
624
    }
609
625
610
    ## Fix Outlook from Mac
626
    ## Fix Outlook from Mac
611
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
627
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
612
628
613
    ## Pick out the popular agents
629
    ## Pick out the popular agents
614
    if ( $raw =~ /(outlook express)/i
630
    if ( $raw =~ /(outlook express)/i
615
      || $raw =~ /(microplanet gravity)/i
631
      || $raw =~ /(microplanet gravity)/i
616
      || $raw =~ /(news rover)/i
632
      || $raw =~ /(news rover)/i
617
      || $raw =~ /(forte agent)/i
633
      || $raw =~ /(forte agent)/i
618
      || $raw =~ /(forte free agent)/i )
634
      || $raw =~ /(forte free agent)/i )
619
    {
635
    {
620
      $agent = $1;
636
      $agent = $1;
621
    }
637
    }
622
    elsif (
638
    elsif (
623
      $raw =~ /^(
639
      $raw =~ /^(
624
        pan
640
        pan
625
       |sylpheed
641
       |sylpheed
626
       |slrn
642
       |slrn
627
       |mozilla
643
       |mozilla
628
       |knode
644
       |knode
629
       |tin
645
       |tin
630
       |hamster
646
       |hamster
631
       |xrn
647
       |xrn
632
       |xnews
648
       |xnews
633
       |aol
649
       |aol
634
       |gnus
650
       |gnus
635
       |krn
651
       |krn
636
       |macsoup
652
       |macsoup
637
       |messenger
653
       |messenger
638
       |openxp
654
       |openxp
639
       |pine
655
       |pine
640
       |thoth
656
       |thoth
641
       |turnpike
657
       |turnpike
642
       |winvn
658
       |winvn
643
       |vsoup
659
       |vsoup
644
       |google
660
       |google
645
       |supernews
661
       |supernews
646
       |nn
662
       |nn
647
       |rn
663
       |rn
648
       |007
664
       |007
649
       |webtv
665
       |webtv
650
       |compuserve
666
       |compuserve
651
       )/ix
667
       )/ix
652
      )
668
      )
653
    {
669
    {
654
      $agent = $1;
670
      $agent = $1;
655
    }
671
    }
656
    else
672
    else
657
    {
673
    {
658
      ## Clean up unknown agents
674
      ## Clean up unknown agents
659
      if ( $raw =~ m!^(.*?)/! )
675
      if ( $raw =~ m!^(.*?)/! )
660
      {
676
      {
661
        $agent = $1;
677
        $agent = $1;
662
      }
678
      }
663
      elsif ( $raw =~ /^(\w*)\d.*/ )
679
      elsif ( $raw =~ /^(\w*)\d.*/ )
664
      {
680
      {
665
        $agent = $1;
681
        $agent = $1;
666
      }
682
      }
667
    }
683
    }
668
684
669
    $distinct_agent{$agent}++;
685
    $distinct_agent{$agent}++;
670
    return $agent;
686
    return $agent;
671
  }
687
  }
672
688
673
  # Get all cross-posted newsgroups
689
  # Get all cross-posted newsgroups
674
  for ( split /,/, $headers{"Newsgroups"} )
690
  for ( split /,/, $headers{"Newsgroups"} )
675
  {
691
  {
676
    $crossposts{$_}++;    # bump count for each
692
    $crossposts{$_}++;    # bump count for each
677
  }
693
  }
678
694
679
  # Get threads
695
  # Get threads
680
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
696
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
681
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
697
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
682
  $thread =~ s/\s+/ /g;     # collapse whitespace
698
  $thread =~ s/\s+/ /g;     # collapse whitespace
683
  $threads{$thread}{count} += 1;            # bump count of this subject
699
  $threads{$thread}{count} += 1;            # bump count of this subject
684
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
700
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
685
701
686
  # Is this an original post or a reply?
702
  # Is this an original post or a reply?
687
  if ( defined $headers{"References"} )
703
  if ( defined $headers{"References"} )
688
  {
704
  {
689
    $replies++;
705
    $replies++;
690
  }
706
  }
691
  else
707
  else
692
  {
708
  {
693
    $origposts++;
709
    $origposts++;
694
  }
710
  }
695
711
696
  # Get the time zone
712
  # Get the time zone
697
  $_ = $headers{"Date"};
713
  $_ = $headers{"Date"};
698
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
714
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
699
  if ( ($tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
715
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
700
  {
716
  {
701
    $tz = "UTC";
717
    $tz = "UTC";
702
  }
718
  }
703
  $tz{$tz}++;
719
  $tz{$tz}++;
704
720
705
#### Now analyse the body text ####
721
#### Now analyse the body text ####
706
  my $insig = 0;
722
  my $insig = 0;
707
  for (@body)
723
  for (@body)
708
  {
724
  {
709
    $totbody += length($_);    # bump total body size
725
    $totbody += length($_);    # bump total body size
710
    next if (/^$>/);           # don't count blank lines in body
726
    next if (/^$>/);           # don't count blank lines in body
711
    if ( $insig == 1 )
727
    if ( $insig == 1 )
712
    {
728
    {
713
      $totsig += length($_);    # bump total sig size
729
      $totsig += length($_);    # bump total sig size
714
730
715
      # Bill Unruh uses ] quotes, and another poster uses ::
731
      # Bill Unruh uses ] quotes, and another poster uses ::
716
    }
732
    }
717
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
733
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
718
    {                           # are we in a quote line?
734
    {                           # are we in a quote line?
719
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
735
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
720
      $totquoted             += length($_);
736
      $totquoted += length($_);
721
    }
737
    }
722
    elsif (/-- /)
738
    elsif (/-- /)
723
    {
739
    {
724
      $insig = 1;
740
      $insig = 1;
725
    }
741
    }
726
    else
742
    else
727
    {
743
    {
728
744
729
      # we must be processing an original line
745
      # we must be processing an original line
730
      $data{$poster}{orig} += length($_);      # bump count of original chrs
746
      $data{$poster}{orig} += length($_);      # bump count of original chrs
731
      $totorig             += length($_);
747
      $totorig += length($_);
732
    }
748
    }
733
  }    # end for (@body)
749
  }    # end for (@body)
734
750
735
}    # getdata
751
}    # getdata
736
752
737
########################################
753
########################################
738
# Count the User-Agents used, collapsing
754
# Count the User-Agents used, collapsing
739
# different versions into one per agent.
755
# different versions into one per agent.
740
########################################
756
########################################
741
sub countagents
757
sub countagents
742
{
758
{
743
POSTER:
759
POSTER:
744
  foreach my $poster ( keys %data )
760
  foreach my $poster ( keys %data )
745
  {
761
  {
746
    foreach my $agent_name ( keys %distinct_agent )
762
    foreach my $agent_name ( keys %distinct_agent )
747
    {    # check against known ones
763
    {    # check against known ones
748
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
764
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
749
      {
765
      {
750
        $agents{$agent_name}++;
766
        $agents{$agent_name}++;
751
        next POSTER;
767
        next POSTER;
752
      }
768
      }
753
    }
769
    }
754
    $agents{ $data{$poster}{agent} }++;
770
    $agents{ $data{$poster}{agent} }++;
755
  }
771
  }
756
}    # countagents
772
}    # countagents
757
773
758
############################################
774
############################################
759
# set orig/total percentages for all posters
775
# set orig/total percentages for all posters
760
############################################
776
############################################
761
sub fixpercent
777
sub fixpercent
762
{
778
{
763
  foreach my $poster ( keys %data )
779
  foreach my $poster ( keys %data )
764
  {
780
  {
765
    my $percent = 100;
781
    my $percent = 100;
766
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
782
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
767
    {
783
    {
-
 
784
      $percent =
768
      $percent = $data{$poster}{orig} * 100 /
785
        $data{$poster}{orig} * 100 /
769
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
786
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
770
    }
787
    }
771
    elsif ( $data{$poster}{orig} == 0 )
788
    elsif ( $data{$poster}{orig} == 0 )
772
    {
789
    {
773
      $percent = 0;
790
      $percent = 0;
774
    }
791
    }
775
    $data{$poster}{percent} = $percent;
792
    $data{$poster}{percent} = $percent;
776
  }
793
  }
777
}
794
}
778
795
779
##############################
796
##############################
780
# right pad a string with '.'s
797
# right pad a string with '.'s
781
##############################
798
##############################
782
sub rpad
799
sub rpad
783
{
800
{
784
801
785
  # get text to pad, length to pad, pad chr
802
  # get text to pad, length to pad, pad chr
786
  my ( $text, $pad_len, $pad_chr ) = @_;
803
  my ( $text, $pad_len, $pad_chr ) = @_;
787
  if ( length($text) > $pad_len )
804
  if ( length($text) > $pad_len )
788
  {
805
  {
789
    $text = substr( $text, 0, $pad_len );
806
    $text = substr( $text, 0, $pad_len );
790
  }
807
  }
791
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
808
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
792
  return $padded;
809
  return $padded;
793
}
810
}
794
811
795
#################
812
#################
796
# centre a string
813
# centre a string
797
#################
814
#################
798
sub centred
815
sub centred
799
{
816
{
800
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
817
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
801
  my $pad_len = ( $width - length($text) ) / 2;    #/
818
  my $pad_len = ( $width - length($text) ) / 2;    #/
802
  my $centred = " " x $pad_len . $text;
819
  my $centred = " " x $pad_len . $text;
803
  return $centred;
820
  return $centred;
804
}
821
}
805
822
806
##########################
823
##########################
807
# put commas into a number
824
# put commas into a number
808
##########################
825
##########################
809
sub commify
826
sub commify
810
{
827
{
811
  $_ = shift;
828
  $_ = shift;
812
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
829
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
813
  return $_;
830
  return $_;
814
}
831
}
815
832
816
#########################
833
#########################
817
# clean
834
# clean
818
#########################
835
#########################
819
sub clean
836
sub clean
820
{
837
{
821
  my $dirty = shift;
838
  my $dirty = shift;
822
  my $clean = $dirty;
839
  my $clean = $dirty;
823
  $clean =~ s/^\s*//;
840
  $clean =~ s/^\s*//;
824
  $clean =~ s/\s*$//;
841
  $clean =~ s/\s*$//;
825
842
826
  return $clean;
843
  return $clean;
827
}
844
}
828
845
829
sub usage
846
sub usage
830
{
847
{
831
848
832
  print "usage: newstat.pl newsgroupname\n";
849
  print "usage: newstat.pl newsgroupname\n";
833
  exit 1;
850
  exit 1;
834
}
851
}
835
852
836
###################################
853
###################################
837
# Write data structures to a file #
854
# Write data structures to a file #
838
###################################
855
###################################
839
sub writedata
856
sub writedata
840
{
857
{
841
  open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
858
  open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
842
  print $OUTF "Data collected from alt.os.linux.mandrake\n\n";
859
  print $OUTF "Data collected from alt.os.linux.mandrake\n\n";
843
  print $OUTF
860
  print $OUTF
844
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
861
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
845
  foreach my $name ( keys %data )
862
  foreach my $name ( keys %data )
846
  {
863
  {
847
    print $OUTF
864
    print $OUTF
848
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
865
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
849
  }
866
  }
850
  print $OUTF
867
  print $OUTF
851
"============================================================================\n";
868
"============================================================================\n";
852
  print $OUTF "Thread subjects\n";
869
  print $OUTF "Thread subjects\n";
853
  print $OUTF
870
  print $OUTF
854
"----------------------------------------------------------------------------\n";
871
"----------------------------------------------------------------------------\n";
855
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
872
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
856
  {
873
  {
-
 
874
    print $OUTF
857
    print $OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
875
      "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
858
  }
876
  }
859
  print $OUTF
877
  print $OUTF
860
"============================================================================\n";
878
"============================================================================\n";
861
  print $OUTF "Cross-posts\n";
879
  print $OUTF "Cross-posts\n";
862
  print $OUTF
880
  print $OUTF
863
"----------------------------------------------------------------------------\n";
881
"----------------------------------------------------------------------------\n";
864
  foreach my $name ( sort keys %crossposts )
882
  foreach my $name ( sort keys %crossposts )
865
  {
883
  {
866
    print $OUTF "$name : $crossposts{$name}\n";
884
    print $OUTF "$name : $crossposts{$name}\n";
867
  }
885
  }
868
  print $OUTF print $OUTF
886
  print $OUTF print $OUTF
869
"============================================================================\n";
887
"============================================================================\n";
870
  print $OUTF "User agents\n";
888
  print $OUTF "User agents\n";
871
  print $OUTF
889
  print $OUTF
872
"----------------------------------------------------------------------------\n";
890
"----------------------------------------------------------------------------\n";
873
  foreach my $name ( sort keys %agents )
891
  foreach my $name ( sort keys %agents )
874
  {
892
  {
875
    print $OUTF "$name : $agents{$name}\n";
893
    print $OUTF "$name : $agents{$name}\n";
876
  }
894
  }
877
  print $OUTF
895
  print $OUTF
878
"============================================================================\n";
896
"============================================================================\n";
879
  print $OUTF "Time zones\n";
897
  print $OUTF "Time zones\n";
880
  print $OUTF
898
  print $OUTF
881
"----------------------------------------------------------------------------\n";
899
"----------------------------------------------------------------------------\n";
882
  foreach my $name ( sort keys %tz )
900
  foreach my $name ( sort keys %tz )
883
  {
901
  {
884
    print $OUTF "$name : $tz{$name}\n";
902
    print $OUTF "$name : $tz{$name}\n";
885
  }
903
  }
886
  close $OUTF;
904
  close $OUTF;
887
}    # writedata
905
}    # writedata
888
 
906