Subversion Repositories LCARS

Rev

Rev 7 | Rev 9 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 7 Rev 8
1
#!/usr/bin/perl
1
#!/usr/bin/env perl
2
use strict;
2
use strict;
3
use warnings;
3
use warnings;
4
use utf8;
4
use utf8;
5
use Encode;
5
use Encode;
6
6
7
#########################
7
###########################
8
# newsstat.pl version 0.4
8
# newsstat.pl version 0.4.1
9
9
10
############################################################################
10
############################################################################
11
# Collect statistics about a newsgroup (specified by first argument) in
11
# Collect statistics about a newsgroup (specified by first argument) in
12
# the local news spool. Check all articles in the last 30-day period.
12
# the local news spool. Check all articles in the last 30-day period.
13
# Rank posters by number of posts and by volume of posts, report on top and
13
# Rank posters by number of posts and by volume of posts, report on top and
14
# bottom 20 posters. Show their name, number of posts, size of posts,
14
# bottom 20 posters. Show their name, number of posts, size of posts,
15
# percentage of quoted lines. Rank user-agents used, by poster rather than
15
# percentage of quoted lines. Rank user-agents used, by poster rather than
16
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
16
# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
17
#
17
#
18
# (Numbers and paths can be configured below.  -- PE)
18
# (Numbers and paths can be configured below.  -- PE)
19
############################################################################
19
############################################################################
20
20
21
############################################################################
21
############################################################################
22
#                       RECENT CHANGES                                     #
22
#                       RECENT CHANGES                                     #
23
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
23
# 2011-07-03  PE  - Use Encode to decode/encode MIME encodings
24
#                 - Use warnings, utf8 (just in case)
24
#                 - Use warnings, utf8 (just in case)
25
#                 - Documentation update
25
#                 - Documentation update
26
# N/A         NN  - Take newsgroup name as argument
26
# N/A         NN  - Take newsgroup name as argument
27
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
27
# 2004-06-19  NN  - newsgroup name is $ARGV[0]
28
#                 - Allow command line flags for subtracting
28
#                 - Allow command line flags for subtracting
29
#                   output if not pertinent for a group
29
#                   output if not pertinent for a group
30
# 2002-11-09  NN  - Put Garry's writedata() function back in.
30
# 2002-11-09  NN  - Put Garry's writedata() function back in.
31
#                 - added "rn" to my list of UA's
31
#                 - added "rn" to my list of UA's
32
#                 - Started using %distinct_agent for both User agent
32
#                 - Started using %distinct_agent for both User agent
33
#                   sections
33
#                   sections
34
#                 - named it newsstat.pl version 0.3
34
#                 - named it newsstat.pl version 0.3
35
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
35
# 2002-11-06  NN  - Fixed the earliest/latest file problem by using
36
#                   mtime rather than ctime, and simplifying the logic
36
#                   mtime rather than ctime, and simplifying the logic
37
# 2002-11-05  NN  - moved user configurations to the top
37
# 2002-11-05  NN  - moved user configurations to the top
38
#                 - fixed the cross-posting section
38
#                 - fixed the cross-posting section
39
#                 - introduced the $newsgroup_name variable which
39
#                 - introduced the $newsgroup_name variable which
40
#                   later becomes $news$group
40
#                   later becomes $news$group
41
#                 - changed $name to $agent_name in countagents()
41
#                 - changed $name to $agent_name in countagents()
42
#
42
#
43
# Contributors
43
# Contributors
44
# -------------
44
# -------------
45
# NN  Nomen nominandum (name to be determined later)
45
# NN  Nomen nominandum (name to be determined later)
46
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
46
# PE  Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
47
47
48
########### TODO #############
48
########### TODO #############
49
# Commas in bottom section of report
49
# Commas in bottom section of report
50
# Show date the figures were compiled
50
# Show date the figures were compiled
51
# No. of HTML articles (Content-Type: text/html)
51
# No. of HTML articles (Content-Type: text/html)
52
# No. of quoted sigs (/>\s*-- /)
52
# No. of quoted sigs (/>\s*-- /)
53
# Per cent of top-posted articles
53
# Per cent of top-posted articles
54
# Top 10 cross-posters
54
# Top 10 cross-posters
55
# Top 20 news posting hosts (from Path)
55
# Top 20 news posting hosts (from Path)
56
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
56
# Count of certain subject words: newbie, kde, burner, sendmail, etc.
57
# Count *all* User Agents that each poster uses
57
# Count *all* User Agents that each poster uses
58
# What do we do about Bill Unruh's ] quote style?
58
# What do we do about Bill Unruh's ] quote style?
59
# Change the way dates/times are checked
59
# Change the way dates/times are checked
60
# include % share in posters by no. of arts
60
# include % share in posters by no. of arts
61
# include % share in posters by size
61
# include % share in posters by size
62
# Total, orig & quoted lines by user agent with per cent
62
# Total, orig & quoted lines by user agent with per cent
63
# Take more arguments
63
# Take more arguments
64
#######################################################
64
#######################################################
65
65
66
###################### USER CONFIGURATIONS ############################
66
###################### USER CONFIGURATIONS ############################
67
67
68
# The name of the group to do stats for
68
# The name of the group to do stats for
69
my $newsgroup_name = $ARGV[0];
69
my $newsgroup_name = $ARGV[0];
70
$newsgroup_name or &usage;
70
$newsgroup_name or &usage;
71
71
72
# Check for removal flags
72
# Check for removal flags
73
my $ix;
73
my $ix;
74
my $j;
74
my $j;
75
my %skipSec;
75
my %skipSec;
76
my @skiplist;
76
my @skiplist;
77
my $args = @ARGV;
77
my $args = @ARGV;
78
for ( $ix = 1 ; $ix < $args ; $ix++ )
78
for ( $ix = 1 ; $ix < $args ; $ix++ )
79
{
79
{
80
  $j = $ix + 1;
80
  $j = $ix + 1;
81
  if ( $ARGV[$ix] eq "-x" )
81
  if ( $ARGV[$ix] eq "-x" )
82
  {
82
  {
83
    @skiplist = split( ",", $ARGV[$j] );
83
    @skiplist = split( ",", $ARGV[$j] );
84
  }
84
  }
85
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
85
  elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
86
  {
86
  {
87
    @skiplist = split( ",", $1 );
87
    @skiplist = split( ",", $1 );
88
  }
88
  }
89
}
89
}
90
foreach (@skiplist)
90
foreach (@skiplist)
91
{
91
{
92
  $skipSec{$_} = 1;
92
  $skipSec{$_} = 1;
93
}
93
}
94
94
95
# Leafnode users will want /var/spool/news for this variable.
95
# Leafnode users will want /var/spool/news for this variable.
96
my $news = "/var/spool/news/";
96
my $news = "/var/spool/news/";
97
97
98
# How many days are we doing statistics for?
98
# How many days are we doing statistics for?
99
my $numdays = 30;
99
my $numdays = 30;
100
100
101
# no. of agents we list
101
# no. of agents we list
102
my $topagents = 10;
102
my $topagents = 10;
103
103
104
# no. of threads we want to know about
104
# no. of threads we want to know about
105
my $topthreads = 20;
105
my $topthreads = 20;
106
106
107
# no. of top or bottom posters to show
107
# no. of top or bottom posters to show
108
my $topposters = 20;
108
my $topposters = 20;
109
109
110
# no. of cross-posted threads to show
110
# no. of cross-posted threads to show
111
my $topcrossposts = 10;
111
my $topcrossposts = 10;
112
112
113
# no. of time zones to show
113
# no. of time zones to show
114
my $toptz = 10;
114
my $toptz = 10;
115
115
116
###################### DATA STRUCTURES ######################
116
###################### DATA STRUCTURES ######################
117
my $group = $newsgroup_name;
117
my $group = $newsgroup_name;
118
$group =~ s!\.!/!g;
118
$group =~ s!\.!/!g;
119
my %data;          # name, count, agent, total, orig, quoted
119
my %data;          # name, count, agent, total, orig, quoted
120
my %threads;       # subject, count
120
my %threads;       # subject, count
121
my %crossposts;    # group, count
121
my %crossposts;    # group, count
122
my %tz;            # timezones by count
122
my %tz;            # timezones by count
123
my %headers;       # holds header of current article
123
my %headers;       # holds header of current article
124
my %lcheader;      # holds lowercase headers
124
my %lcheader;      # holds lowercase headers
125
my @body;          # holds body of current article
125
my @body;          # holds body of current article
126
my @sig;           # holds sig text;
126
my @sig;           # holds sig text;
127
my $totalposts;    # total no. of posts considered
127
my $totalposts;    # total no. of posts considered
128
my $filename;      # name of current article file
128
my $filename;      # name of current article file
129
my $filesize;      # size of current article file
129
my $filesize;      # size of current article file
130
my $earliest;      # earliest article we have found
130
my $earliest;      # earliest article we have found
131
my $latest;        # latest article we have found
131
my $latest;        # latest article we have found
132
my $poster;        # poster we are dealing with
132
my $poster;        # poster we are dealing with
133
my $totsize   = 0; # holds total sizes of all files
133
my $totsize   = 0; # holds total sizes of all files
134
my $totheader = 0; # total size of header material
134
my $totheader = 0; # total size of header material
135
my $totbody   = 0; # total size of body material
135
my $totbody   = 0; # total size of body material
136
my $totsig    = 0; # total size of sig material
136
my $totsig    = 0; # total size of sig material
137
my $totorig   = 0; # total size of original material
137
my $totorig   = 0; # total size of original material
138
my $totquoted = 0; # total size of quoted material
138
my $totquoted = 0; # total size of quoted material
139
my $origposts = 0; # total no. of original posts
139
my $origposts = 0; # total no. of original posts
140
my $replies   = 0; # total no. of replies
140
my $replies   = 0; # total no. of replies
141
my $i;             # general purpose
141
my $i;             # general purpose
142
my %distinct_agent;
142
my %distinct_agent;
143
my %agents =       # used to hold counts of User Agents used
143
my %agents =       # used to hold counts of User Agents used
144
  (
144
  (
145
  "KNode"                     => 0,
145
  "KNode"                     => 0,
146
  "Pan"                       => 0,
146
  "Pan"                       => 0,
147
  "Mozilla"                   => 0,
147
  "Mozilla"                   => 0,
148
  "Sylpheed"                  => 0,
148
  "Sylpheed"                  => 0,
149
  "Gnus"                      => 0,
149
  "Gnus"                      => 0,
150
  "Forte Agent"               => 0,
150
  "Forte Agent"               => 0,
151
  "Forte Free Agent"          => 0,
151
  "Forte Free Agent"          => 0,
152
  "MicroPlanet Gravity"       => 0,
152
  "MicroPlanet Gravity"       => 0,
153
  "Microsoft Outlook Express" => 0,
153
  "Microsoft Outlook Express" => 0,
154
  "Xnews"                     => 0,
154
  "Xnews"                     => 0,
155
  "slrn"                      => 0,
155
  "slrn"                      => 0,
156
  "tin"                       => 0,
156
  "tin"                       => 0,
157
  "rn"                        => 0,
157
  "rn"                        => 0,
158
  "NN"                        => 0,
158
  "NN"                        => 0,
159
  "MacSOUP"                   => 0,
159
  "MacSOUP"                   => 0,
160
  "Foorum"                    => 0,
160
  "Foorum"                    => 0,
161
  "MT-NewsWatcher"            => 0,
161
  "MT-NewsWatcher"            => 0,
162
  "News Rover"                => 0,
162
  "News Rover"                => 0,
163
  "WebTV"                     => 0,
163
  "WebTV"                     => 0,
164
  "Compuserver"               => 0,
164
  "Compuserver"               => 0,
165
  "VSoup"                     => 0
165
  "VSoup"                     => 0
166
  );
166
  );
167
167
168
######################## MAIN CODE ########################
168
######################## MAIN CODE ########################
169
$! = 1;
169
$! = 1;
170
170
171
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
171
chdir("$news$group") or die "Can't cd to $news$group: $!\n";
172
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
172
opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
173
while ( defined( $filename = readdir(DIR) ) )
173
while ( defined( $filename = readdir(DIR) ) )
174
{
174
{
175
  %lcheader = ();
175
  %lcheader = ();
176
  next unless -f $filename;    # only want real files
176
  next unless -f $filename;    # only want real files
177
  next if ( $filename eq ".overview" );    # real articles only
177
  next if ( $filename eq ".overview" );    # real articles only
178
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
178
  next if ( -M $filename > $numdays );     # only want articles <= a certain age
179
  $earliest = ( stat $filename )[9] unless defined($earliest);
179
  $earliest = ( stat $filename )[9] unless defined($earliest);
180
  $latest   = ( stat $filename )[9] unless defined($latest);
180
  $latest   = ( stat $filename )[9] unless defined($latest);
181
  &getarticle($filename);                  # read in the article
181
  &getarticle($filename);                  # read in the article
182
  &getdata;                                # grab the data from the article
182
  &getdata;                                # grab the data from the article
183
  $totalposts++;                           # bump count of articles considered
183
  $totalposts++;                           # bump count of articles considered
184
}
184
}
185
closedir(DIR);                             # finished with the directory
185
closedir(DIR);                             # finished with the directory
186
186
187
# post-processing
187
# post-processing
188
&countagents;    # count agents, collapsing versions
188
&countagents;    # count agents, collapsing versions
189
&fixpercent;     # check percentages orig/total for posters
189
&fixpercent;     # check percentages orig/total for posters
190
190
191
&writedata;
191
&writedata;
192
192
193
#################### DISPLAY RESULTS #####################
193
#################### DISPLAY RESULTS #####################
194
print "=" x 76, "\n";
194
print "=" x 76, "\n";
195
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
195
printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
196
print "=" x 76, "\n";
196
print "=" x 76, "\n";
197
printf "%s\n",
197
printf "%s\n",
198
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
198
  &centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
199
print "\n\n";
199
print "\n\n";
200
printf "Total posts considered: %s over %d days\n", commify($totalposts),
200
printf "Total posts considered: %s over %d days\n", commify($totalposts),
201
  $numdays;
201
  $numdays;
202
printf "Earliest article: %s\n",               scalar localtime($earliest);
202
printf "Earliest article: %s\n",               scalar localtime($earliest);
203
printf "Latest article:   %s\n",               scalar localtime($latest);
203
printf "Latest article:   %s\n",               scalar localtime($latest);
204
printf "Original articles: %s, replies: %s\n", commify($origposts),
204
printf "Original articles: %s, replies: %s\n", commify($origposts),
205
  commify($replies);
205
  commify($replies);
206
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
206
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
207
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
207
  commify( int( $totsize / 1024 ) ), $totsize / 1048576;    #
208
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
208
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
209
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
209
  commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
210
  commify( int( $totsize / $totalposts ) );
210
  commify( int( $totsize / $totalposts ) );
211
my $count = keys %data;
211
my $count = keys %data;
212
printf "Total headers: %s KB  bodies: %s KB\n",
212
printf "Total headers: %s KB  bodies: %s KB\n",
213
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
213
  commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
214
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
214
printf "Body text - quoted: %s KB,  original: %s KB = %02.2f%%, sigs: %s KB\n",
215
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
215
  commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
216
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
216
  ( $totorig * 100 ) / ( $totorig + $totquoted ),
217
  commify( int( $totsig / 1024 ) );
217
  commify( int( $totsig / 1024 ) );
218
printf "Total number of posters: %s, average %s bytes per poster\n",
218
printf "Total number of posters: %s, average %s bytes per poster\n",
219
  commify($count), commify( int( $totsize / $count ) );     #/
219
  commify($count), commify( int( $totsize / $count ) );     #/
220
$count = keys %threads;
220
$count = keys %threads;
221
printf "Total number of threads: %s, average %s bytes per thread\n",
221
printf "Total number of threads: %s, average %s bytes per thread\n",
222
  commify($count), commify( int( $totsize / $count ) );     #/
222
  commify($count), commify( int( $totsize / $count ) );     #/
223
printf "Total number of User-Agents: %d\n", scalar keys %agents;
223
printf "Total number of User-Agents: %d\n", scalar keys %agents;
224
print "\n", "=" x 76, "\n";
224
print "\n", "=" x 76, "\n";
225
225
226
###############################
226
###############################
227
# show posters by article count  Sec 1;
227
# show posters by article count  Sec 1;
228
###############################
228
###############################
229
unless ( $skipSec{1} )
229
unless ( $skipSec{1} )
230
{
230
{
231
  if ( keys %data < $topposters )
231
  if ( keys %data < $topposters )
232
  {
232
  {
233
    $count = keys %data;
233
    $count = keys %data;
234
  }
234
  }
235
  else
235
  else
236
  {
236
  {
237
    $count = $topposters;
237
    $count = $topposters;
238
  }
238
  }
239
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
239
  printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
240
  print "=" x 76, "\n";
240
  print "=" x 76, "\n";
241
  $i = 0;
241
  $i = 0;
242
  foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
242
  foreach my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
243
  {
243
  {
244
    my $name = substr( $poster, 0, 65 );
244
    my $name = substr( $poster, 0, 65 );
245
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
245
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
246
      $data{$poster}{count};
246
      $data{$poster}{count};
247
    last if ( ++$i == $count );
247
    last if ( ++$i == $count );
248
  }
248
  }
249
  print "\n", "=" x 76, "\n";
249
  print "\n", "=" x 76, "\n";
250
}
250
}
251
251
252
################################
252
################################
253
# show posters by size in Kbytes Sec 2;
253
# show posters by size in Kbytes Sec 2;
254
################################
254
################################
255
unless ( $skipSec{2} )
255
unless ( $skipSec{2} )
256
{
256
{
257
  if ( keys %data < $topposters )
257
  if ( keys %data < $topposters )
258
  {
258
  {
259
    $count = keys %data;
259
    $count = keys %data;
260
  }
260
  }
261
  else
261
  else
262
  {
262
  {
263
    $count = $topposters;
263
    $count = $topposters;
264
  }
264
  }
265
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
265
  printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
266
  print "=" x 76, "\n";
266
  print "=" x 76, "\n";
267
  $i = 0;
267
  $i = 0;
268
  foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
268
  foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
269
  {
269
  {
270
    my $name = substr( $poster, 0, 62 );
270
    my $name = substr( $poster, 0, 62 );
271
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
271
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
272
      $data{$poster}{size} / 1024;    #/
272
      $data{$poster}{size} / 1024;    #/
273
    last if ( ++$i == $count );
273
    last if ( ++$i == $count );
274
  }
274
  }
275
  print "\n", "=" x 76, "\n";
275
  print "\n", "=" x 76, "\n";
276
}
276
}
277
277
278
####################################
278
####################################
279
# show top posters for original text
279
# show top posters for original text
280
####################################
280
####################################
281
unless ( $skipSec{3} )
281
unless ( $skipSec{3} )
282
{
282
{
283
  if ( keys %data < $topposters )
283
  if ( keys %data < $topposters )
284
  {
284
  {
285
    $count = keys %data;
285
    $count = keys %data;
286
  }
286
  }
287
  else
287
  else
288
  {
288
  {
289
    $count = $topposters;
289
    $count = $topposters;
290
  }
290
  }
291
  printf "%s\n",
291
  printf "%s\n",
292
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
292
    &centred( "Top $count responders by original text (> 5 posts)", 76 );
293
  print "=" x 76, "\n";
293
  print "=" x 76, "\n";
294
  $i = 0;
294
  $i = 0;
295
  foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
295
  foreach my $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
296
    keys %data )
296
    keys %data )
297
  {
297
  {
298
    next if $data{$poster}{quoted} == 0;
298
    next if $data{$poster}{quoted} == 0;
299
    next if $data{$poster}{count} < 5;
299
    next if $data{$poster}{count} < 5;
300
    my $name = substr( $poster, 0, 63 );
300
    my $name = substr( $poster, 0, 63 );
301
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
301
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
302
      $data{$poster}{percent};
302
      $data{$poster}{percent};
303
    last if ( ++$i == $count );
303
    last if ( ++$i == $count );
304
  }
304
  }
305
  print "\n", "=" x 76, "\n";
305
  print "\n", "=" x 76, "\n";
306
}
306
}
307
307
308
#######################################
308
#######################################
309
# show bottom posters for original text
309
# show bottom posters for original text
310
#######################################
310
#######################################
311
unless ( $skipSec{4} )
311
unless ( $skipSec{4} )
312
{
312
{
313
  if ( keys %data < $topposters )
313
  if ( keys %data < $topposters )
314
  {
314
  {
315
    $count = keys %data;
315
    $count = keys %data;
316
  }
316
  }
317
  else
317
  else
318
  {
318
  {
319
    $count = $topposters;
319
    $count = $topposters;
320
  }
320
  }
321
  printf "%s\n",
321
  printf "%s\n",
322
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
322
    &centred( "Bottom $count responders by original text  (> 5 posts)", 76 );
323
  print "=" x 76, "\n";
323
  print "=" x 76, "\n";
324
  $i = 0;
324
  $i = 0;
325
  foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
325
  foreach my $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
326
    keys %data )
326
    keys %data )
327
  {
327
  {
328
    next if $data{$poster}{quoted} == 0;
328
    next if $data{$poster}{quoted} == 0;
329
    next if $data{$poster}{count} < 5;
329
    next if $data{$poster}{count} < 5;
330
    my $name = substr( $poster, 0, 63 );
330
    my $name = substr( $poster, 0, 63 );
331
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
331
    printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
332
      $data{$poster}{percent};
332
      $data{$poster}{percent};
333
    last if ( ++$i == $count );
333
    last if ( ++$i == $count );
334
  }
334
  }
335
  print "\n", "=" x 76, "\n";
335
  print "\n", "=" x 76, "\n";
336
}
336
}
337
337
338
####################################
338
####################################
339
# show threads by number of articles
339
# show threads by number of articles
340
####################################
340
####################################
341
unless ( $skipSec{5} )
341
unless ( $skipSec{5} )
342
{
342
{
343
  if ( keys %threads < $topthreads )
343
  if ( keys %threads < $topthreads )
344
  {
344
  {
345
    $count = keys %threads;
345
    $count = keys %threads;
346
  }
346
  }
347
  else
347
  else
348
  {
348
  {
349
    $count = $topthreads;
349
    $count = $topthreads;
350
  }
350
  }
351
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
351
  printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
352
  print "=" x 76, "\n";
352
  print "=" x 76, "\n";
353
  $i = 0;
353
  $i = 0;
354
  foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
354
  foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
355
    keys %threads )
355
    keys %threads )
356
  {
356
  {
357
    my $name = substr( $thread, 0, 65 );
357
    my $name = substr( $thread, 0, 65 );
358
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
358
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
359
      $threads{$thread}{count};
359
      $threads{$thread}{count};
360
    last if ( ++$i == $count );
360
    last if ( ++$i == $count );
361
  }
361
  }
362
  print "\n", "=" x 76, "\n";
362
  print "\n", "=" x 76, "\n";
363
}
363
}
364
################################
364
################################
365
# show threads by size in Kbytes
365
# show threads by size in Kbytes
366
################################
366
################################
367
unless ( $skipSec{6} )
367
unless ( $skipSec{6} )
368
{
368
{
369
  if ( keys %threads < $topthreads )
369
  if ( keys %threads < $topthreads )
370
  {
370
  {
371
    $count = keys %threads;
371
    $count = keys %threads;
372
  }
372
  }
373
  else
373
  else
374
  {
374
  {
375
    $count = $topthreads;
375
    $count = $topthreads;
376
  }
376
  }
377
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
377
  printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
378
  print "=" x 76, "\n";
378
  print "=" x 76, "\n";
379
  $i = 0;
379
  $i = 0;
380
  foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
380
  foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
381
    keys %threads )
381
    keys %threads )
382
  {
382
  {
383
    my $name = substr( $thread, 0, 65 );
383
    my $name = substr( $thread, 0, 65 );
384
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
384
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
385
      $threads{$thread}{size} / 1024;    #/
385
      $threads{$thread}{size} / 1024;    #/
386
    last if ( ++$i == $count );
386
    last if ( ++$i == $count );
387
  }
387
  }
388
  print "\n", "=" x 76, "\n";
388
  print "\n", "=" x 76, "\n";
389
}
389
}
390
390
391
#################################
391
#################################
392
# show top 10 cross-posted groups
392
# show top 10 cross-posted groups
393
#################################
393
#################################
394
unless ( $skipSec{7} )
394
unless ( $skipSec{7} )
395
{
395
{
396
  delete $crossposts{"$newsgroup_name"};    # don't include ours
396
  delete $crossposts{"$newsgroup_name"};    # don't include ours
397
  if ( keys %crossposts < $topcrossposts )
397
  if ( keys %crossposts < $topcrossposts )
398
  {
398
  {
399
    $count = keys %crossposts;
399
    $count = keys %crossposts;
400
  }
400
  }
401
  else
401
  else
402
  {
402
  {
403
    $count = $topcrossposts;
403
    $count = $topcrossposts;
404
  }
404
  }
405
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
405
  printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
406
  print "=" x 76, "\n";
406
  print "=" x 76, "\n";
407
  $i = 0;
407
  $i = 0;
408
  foreach
408
  foreach
409
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
409
    my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
410
  {
410
  {
411
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
411
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
412
      $crossposts{$name};
412
      $crossposts{$name};
413
    last if ( ++$i == $count );
413
    last if ( ++$i == $count );
414
  }
414
  }
415
  print "\n", "=" x 76, "\n";
415
  print "\n", "=" x 76, "\n";
416
}
416
}
417
#######################
417
#######################
418
#show agents and counts
418
#show agents and counts
419
#######################
419
#######################
420
unless ( $skipSec{8} )
420
unless ( $skipSec{8} )
421
{
421
{
422
  if ( keys %agents < $topagents )
422
  if ( keys %agents < $topagents )
423
  {
423
  {
424
    $count = keys %agents;
424
    $count = keys %agents;
425
  }
425
  }
426
  else
426
  else
427
  {
427
  {
428
    $count = $topagents;
428
    $count = $topagents;
429
  }
429
  }
430
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
430
  printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
431
  print "=" x 76, "\n";
431
  print "=" x 76, "\n";
432
  $i = 0;
432
  $i = 0;
433
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
433
  foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
434
  {
434
  {
435
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
435
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
436
      $agents{$agent};
436
      $agents{$agent};
437
    last if ( ++$i == $count );
437
    last if ( ++$i == $count );
438
  }
438
  }
439
  print "\n", "=" x 76, "\n";
439
  print "\n", "=" x 76, "\n";
440
}
440
}
441
441
442
#######################
442
#######################
443
#show distinct agents
443
#show distinct agents
444
#######################
444
#######################
445
unless ( $skipSec{9} )
445
unless ( $skipSec{9} )
446
{
446
{
447
  if ( keys %distinct_agent < $topagents )
447
  if ( keys %distinct_agent < $topagents )
448
  {
448
  {
449
    $count = keys %distinct_agent;
449
    $count = keys %distinct_agent;
450
  }
450
  }
451
  else
451
  else
452
  {
452
  {
453
    $count = $topagents;
453
    $count = $topagents;
454
  }
454
  }
455
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
455
  printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
456
  print "=" x 76, "\n";
456
  print "=" x 76, "\n";
457
  $i = 0;
457
  $i = 0;
458
  foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
458
  foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
459
    keys %distinct_agent )
459
    keys %distinct_agent )
460
  {
460
  {
461
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
461
    printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
462
      $distinct_agent{$agent},
462
      $distinct_agent{$agent},
463
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
463
      ( ( $distinct_agent{$agent} / $totalposts ) * 100 );
464
    last if ( ++$i == $count );
464
    last if ( ++$i == $count );
465
  }
465
  }
466
  print "\n", "=" x 76, "\n";
466
  print "\n", "=" x 76, "\n";
467
}
467
}
468
468
469
##########################
469
##########################
470
#show timezones and counts
470
#show timezones and counts
471
##########################
471
##########################
472
unless ( $skipSec{10} )
472
unless ( $skipSec{10} )
473
{
473
{
474
  if ( keys %tz < $toptz )
474
  if ( keys %tz < $toptz )
475
  {
475
  {
476
    $count = keys %tz;
476
    $count = keys %tz;
477
  }
477
  }
478
  else
478
  else
479
  {
479
  {
480
    $count = $toptz;
480
    $count = $toptz;
481
  }
481
  }
482
  printf "%s\n", &centred( "Top 10 time zones", 76 );
482
  printf "%s\n", &centred( "Top 10 time zones", 76 );
483
  print "=" x 76, "\n";
483
  print "=" x 76, "\n";
484
  $i = 0;
484
  $i = 0;
485
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
485
  foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
486
  {
486
  {
487
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
487
    printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
488
    last if ( ++$i == $count );
488
    last if ( ++$i == $count );
489
  }
489
  }
490
  print "\n", "=" x 76, "\n";
490
  print "\n", "=" x 76, "\n";
491
}
491
}
492
492
493
################################ SUBROUTINES ################################
493
################################ SUBROUTINES ################################
494
494
495
#######################################
495
#######################################
496
# get current article's header and body
496
# get current article's header and body
497
#######################################
497
#######################################
498
sub getarticle
498
sub getarticle
499
{
499
{
500
  %headers = ();    # dump old headers
500
  %headers = ();    # dump old headers
501
  my $filename = shift;    # get the name of the file
501
  my $filename = shift;    # get the name of the file
502
502
503
  # get stats about the file itself
503
  # get stats about the file itself
504
  $filesize = -s $filename;    # get total size of file
504
  $filesize = -s $filename;    # get total size of file
505
  $totsize += $filesize;       # bump total sizes of all files
505
  $totsize += $filesize;       # bump total sizes of all files
506
506
507
  my $mtime = ( stat $filename )[9];
507
  my $mtime = ( stat $filename )[9];
508
  if ( $mtime < $earliest )
508
  if ( $mtime < $earliest )
509
  {
509
  {
510
    $earliest = $mtime;
510
    $earliest = $mtime;
511
  }
511
  }
512
  elsif ( $mtime > $latest )
512
  elsif ( $mtime > $latest )
513
  {
513
  {
514
    $latest = $mtime;
514
    $latest = $mtime;
515
  }
515
  }
516
516
517
  # now read the file
517
  # now read the file
518
  open( FILE, $filename ) or die "Can't open $filename: $!\n";
518
  open(my $FILE, $filename) or die "Can't open $filename: $!\n";
519
  while (<FILE>)
519
  while (<$FILE>)
520
  {
520
  {
521
    $totheader += length($_);    # bump total header size
521
    $totheader += length($_);    # bump total header size
522
    last if (/^\s*$/);           # end of header?
522
    last if (/^\s*$/);           # end of header?
523
    if (/^([^:\s]*):\s+(.*)/)
523
    if (/^([^:\s]*):\s+(.*)/)
524
    {
524
    {
525
      my ( $key, $val ) = ( $1, $2 );
525
      my ( $key, $val ) = ( $1, $2 );
526
      $headers{$key} = decode( 'MIME-Header', $val );
526
      $headers{$key} = decode( 'MIME-Header', $val );
527
      $lcheader{ clean( lc($key) ) } = clean($val);
527
      $lcheader{ clean( lc($key) ) } = clean($val);
528
    }
528
    }
529
  }
529
  }
530
  @body = <FILE>;                # slurp up body
530
  @body = <$FILE>;                # slurp up body
531
  close(FILE);
531
  close($FILE);
532
}    # getarticle
532
}    # getarticle
533
533
534
###################################
534
###################################
535
# get data from the current article
535
# get data from the current article
536
###################################
536
###################################
537
sub getdata
537
sub getdata
538
{
538
{
539
#### First, analyse header fields ####
539
#### First, analyse header fields ####
540
540
541
  # Set up this poster if not defined, get counts, sizes
541
  # Set up this poster if not defined, get counts, sizes
542
  $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
542
  $poster = encode( 'UTF-8', $headers{From} );    # get the poster's name
543
  if ( !defined( $data{$poster} ) )
543
  if ( !defined( $data{$poster} ) )
544
  {                                               # seen this one before?
544
  {                                               # seen this one before?
545
    $data{$poster}{agent}  = 'Unknown';           # comes after For: field
545
    $data{$poster}{agent}  = 'Unknown';           # comes after For: field
546
    $data{$poster}{orig}   = 0;
546
    $data{$poster}{orig}   = 0;
547
    $data{$poster}{quoted} = 0;
547
    $data{$poster}{quoted} = 0;
548
  }
548
  }
549
  $data{$poster}{count}++;                        # bump count for this poster
549
  $data{$poster}{count}++;                        # bump count for this poster
550
  $data{$poster}{size} += $filesize;              # total size of file
550
  $data{$poster}{size} += $filesize;              # total size of file
551
551
552
  # The User-Agent and/or X-Newsreader fields
552
  # The User-Agent and/or X-Newsreader fields
553
  # for User-Agent by poster
553
  # for User-Agent by poster
554
  if ( defined $lcheader{"user-agent"} )
554
  if ( defined $lcheader{"user-agent"} )
555
  {
555
  {
556
    $data{$poster}{agent} = $lcheader{"user-agent"};
556
    $data{$poster}{agent} = $lcheader{"user-agent"};
557
  }
557
  }
558
  if ( defined $lcheader{"x-newsreader"} )
558
  if ( defined $lcheader{"x-newsreader"} )
559
  {
559
  {
560
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
560
    $data{$poster}{agent} = $lcheader{"x-newsreader"};
561
  }
561
  }
562
562
563
  # The User Agent for User-Agent by number of posts
563
  # The User Agent for User-Agent by number of posts
564
  my $UA = "unknown";
564
  my $UA = "unknown";
565
  foreach my $keys ( keys %lcheader )
565
  foreach my $keys ( keys %lcheader )
566
  {
566
  {
567
    if ( defined $lcheader{'user-agent'} )
567
    if ( defined $lcheader{'user-agent'} )
568
    {
568
    {
569
      $UA = $lcheader{'user-agent'};
569
      $UA = $lcheader{'user-agent'};
570
    }
570
    }
571
    elsif ( defined $lcheader{"x-newsreader"} )
571
    elsif ( defined $lcheader{"x-newsreader"} )
572
    {
572
    {
573
      $UA = $lcheader{"x-newsreader"};
573
      $UA = $lcheader{"x-newsreader"};
574
    }
574
    }
575
    elsif ( defined $lcheader{'x-mailer'} )
575
    elsif ( defined $lcheader{'x-mailer'} )
576
    {
576
    {
577
      $UA = $lcheader{'x-mailer'};
577
      $UA = $lcheader{'x-mailer'};
578
    }
578
    }
579
    elsif (
579
    elsif (
580
      ( defined $lcheader{'organization'} )
580
      ( defined $lcheader{'organization'} )
581
      && ( $lcheader{'organization'} =~
581
      && ( $lcheader{'organization'} =~
582
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
582
        /groups\.google|AOL|Supernews|WebTV|compuserve/ )
583
      )
583
      )
584
    {
584
    {
585
      $UA = $lcheader{'organization'};
585
      $UA = $lcheader{'organization'};
586
    }
586
    }
587
    elsif ( $lcheader{'message-id'} =~ /pine/i )
587
    elsif ( $lcheader{'message-id'} =~ /pine/i )
588
    {
588
    {
589
      $UA = "Pine";
589
      $UA = "Pine";
590
    }    ## Hopefully found UA, else set to unknown
590
    }    ## Hopefully found UA, else set to unknown
591
  }
591
  }
592
592
593
  $UA = clean($UA);
593
  $UA = clean($UA);
594
  $UA = get_agent($UA);
594
  $UA = get_agent($UA);
595
595
596
  sub get_agent
596
  sub get_agent
597
  {
597
  {
598
    my $raw   = shift;
598
    my $raw   = shift;
599
    my $agent = $raw;
599
    my $agent = $raw;
600
600
601
    ## strip http
601
    ## strip http
602
    if ( $raw =~ /.*http.*/ )
602
    if ( $raw =~ /.*http.*/ )
603
    {
603
    {
604
      $raw =~ s!posted via!!i;
604
      $raw =~ s!posted via!!i;
605
      $raw =~ s!http://!!g;
605
      $raw =~ s!http://!!g;
606
      $raw =~ s!/!!g;
606
      $raw =~ s!/!!g;
607
      $raw =~ s! !!g;
607
      $raw =~ s! !!g;
608
    }
608
    }
609
609
610
    ## Fix Outlook from Mac
610
    ## Fix Outlook from Mac
611
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
611
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
612
612
613
    ## Pick out the popular agents
613
    ## Pick out the popular agents
614
    if ( $raw =~ /(outlook express)/i
614
    if ( $raw =~ /(outlook express)/i
615
      || $raw =~ /(microplanet gravity)/i
615
      || $raw =~ /(microplanet gravity)/i
616
      || $raw =~ /(news rover)/i
616
      || $raw =~ /(news rover)/i
617
      || $raw =~ /(forte agent)/i
617
      || $raw =~ /(forte agent)/i
618
      || $raw =~ /(forte free agent)/i )
618
      || $raw =~ /(forte free agent)/i )
619
    {
619
    {
620
      $agent = $1;
620
      $agent = $1;
621
    }
621
    }
622
    elsif (
622
    elsif (
623
      $raw =~ /^(
623
      $raw =~ /^(
624
        pan
624
        pan
625
       |sylpheed
625
       |sylpheed
626
       |slrn
626
       |slrn
627
       |mozilla
627
       |mozilla
628
       |knode
628
       |knode
629
       |tin
629
       |tin
630
       |hamster
630
       |hamster
631
       |xrn
631
       |xrn
632
       |xnews
632
       |xnews
633
       |aol
633
       |aol
634
       |gnus
634
       |gnus
635
       |krn
635
       |krn
636
       |macsoup
636
       |macsoup
637
       |messenger
637
       |messenger
638
       |openxp
638
       |openxp
639
       |pine
639
       |pine
640
       |thoth
640
       |thoth
641
       |turnpike
641
       |turnpike
642
       |winvn
642
       |winvn
643
       |vsoup
643
       |vsoup
644
       |google
644
       |google
645
       |supernews
645
       |supernews
646
       |nn
646
       |nn
647
       |rn
647
       |rn
648
       |007
648
       |007
649
       |webtv
649
       |webtv
650
       |compuserve
650
       |compuserve
651
       )/ix
651
       )/ix
652
      )
652
      )
653
    {
653
    {
654
      $agent = $1;
654
      $agent = $1;
655
    }
655
    }
656
    else
656
    else
657
    {
657
    {
658
      ## Clean up unknown agents
658
      ## Clean up unknown agents
659
      if ( $raw =~ m!^(.*?)/! )
659
      if ( $raw =~ m!^(.*?)/! )
660
      {
660
      {
661
        $agent = $1;
661
        $agent = $1;
662
      }
662
      }
663
      elsif ( $raw =~ /^(\w*)\d.*/ )
663
      elsif ( $raw =~ /^(\w*)\d.*/ )
664
      {
664
      {
665
        $agent = $1;
665
        $agent = $1;
666
      }
666
      }
667
    }
667
    }
668
668
669
    $distinct_agent{$agent}++;
669
    $distinct_agent{$agent}++;
670
    return $agent;
670
    return $agent;
671
  }
671
  }
672
672
673
  # Get all cross-posted newsgroups
673
  # Get all cross-posted newsgroups
674
  for ( split /,/, $headers{"Newsgroups"} )
674
  for ( split /,/, $headers{"Newsgroups"} )
675
  {
675
  {
676
    $crossposts{$_}++;    # bump count for each
676
    $crossposts{$_}++;    # bump count for each
677
  }
677
  }
678
678
679
  # Get threads
679
  # Get threads
680
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
680
  my $thread = encode( 'UTF-8', $headers{"Subject"} );
681
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
681
  $thread =~ s/^re: //i;    # Remove Re: or re: at start
682
  $thread =~ s/\s+/ /g;     # collapse whitespace
682
  $thread =~ s/\s+/ /g;     # collapse whitespace
683
  $threads{$thread}{count} += 1;            # bump count of this subject
683
  $threads{$thread}{count} += 1;            # bump count of this subject
684
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
684
  $threads{$thread}{size}  += $filesize;    # bump bytes for this thread
685
685
686
  # Is this an original post or a reply?
686
  # Is this an original post or a reply?
687
  if ( defined $headers{"References"} )
687
  if ( defined $headers{"References"} )
688
  {
688
  {
689
    $replies++;
689
    $replies++;
690
  }
690
  }
691
  else
691
  else
692
  {
692
  {
693
    $origposts++;
693
    $origposts++;
694
  }
694
  }
695
695
696
  # Get the time zone
696
  # Get the time zone
697
  $_ = $headers{"Date"};
697
  $_ = $headers{"Date"};
698
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
698
  my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
699
  if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
699
  if ( ($tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
700
  {
700
  {
701
    $tz = "UTC";
701
    $tz = "UTC";
702
  }
702
  }
703
  $tz{$tz}++;
703
  $tz{$tz}++;
704
704
705
#### Now analyse the body text ####
705
#### Now analyse the body text ####
706
  my $insig = 0;
706
  my $insig = 0;
707
  for (@body)
707
  for (@body)
708
  {
708
  {
709
    $totbody += length($_);    # bump total body size
709
    $totbody += length($_);    # bump total body size
710
    next if (/^$>/);           # don't count blank lines in body
710
    next if (/^$>/);           # don't count blank lines in body
711
    if ( $insig == 1 )
711
    if ( $insig == 1 )
712
    {
712
    {
713
      $totsig += length($_);    # bump total sig size
713
      $totsig += length($_);    # bump total sig size
714
714
715
      # Bill Unruh uses ] quotes, and another poster uses ::
715
      # Bill Unruh uses ] quotes, and another poster uses ::
716
    }
716
    }
717
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
717
    elsif ( /^\s*[>\]]/ or /^\s*::/ )
718
    {                           # are we in a quote line?
718
    {                           # are we in a quote line?
719
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
719
      $data{$poster}{quoted} += length($_);    # bump count of quoted chrs
720
      $totquoted             += length($_);
720
      $totquoted             += length($_);
721
    }
721
    }
722
    elsif (/-- /)
722
    elsif (/-- /)
723
    {
723
    {
724
      $insig = 1;
724
      $insig = 1;
725
    }
725
    }
726
    else
726
    else
727
    {
727
    {
728
728
729
      # we must be processing an original line
729
      # we must be processing an original line
730
      $data{$poster}{orig} += length($_);      # bump count of original chrs
730
      $data{$poster}{orig} += length($_);      # bump count of original chrs
731
      $totorig             += length($_);
731
      $totorig             += length($_);
732
    }
732
    }
733
  }    # end for (@body)
733
  }    # end for (@body)
734
734
735
}    # getdata
735
}    # getdata
736
736
737
########################################
737
########################################
738
# Count the User-Agents used, collapsing
738
# Count the User-Agents used, collapsing
739
# different versions into one per agent.
739
# different versions into one per agent.
740
########################################
740
########################################
741
sub countagents
741
sub countagents
742
{
742
{
743
POSTER:
743
POSTER:
744
  foreach $poster ( keys %data )
744
  foreach my $poster ( keys %data )
745
  {
745
  {
746
    foreach my $agent_name ( keys %distinct_agent )
746
    foreach my $agent_name ( keys %distinct_agent )
747
    {    # check against known ones
747
    {    # check against known ones
748
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
748
      if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
749
      {
749
      {
750
        $agents{$agent_name}++;
750
        $agents{$agent_name}++;
751
        next POSTER;
751
        next POSTER;
752
      }
752
      }
753
    }
753
    }
754
    $agents{ $data{$poster}{agent} }++;
754
    $agents{ $data{$poster}{agent} }++;
755
  }
755
  }
756
}    # countagents
756
}    # countagents
757
757
758
############################################
758
############################################
759
# set orig/total percentages for all posters
759
# set orig/total percentages for all posters
760
############################################
760
############################################
761
sub fixpercent
761
sub fixpercent
762
{
762
{
763
  foreach $poster ( keys %data )
763
  foreach my $poster ( keys %data )
764
  {
764
  {
765
    my $percent = 100;
765
    my $percent = 100;
766
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
766
    if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
767
    {
767
    {
768
      $percent = $data{$poster}{orig} * 100 /
768
      $percent = $data{$poster}{orig} * 100 /
769
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
769
        ( $data{$poster}{quoted} + $data{$poster}{orig} );    #/
770
    }
770
    }
771
    elsif ( $data{$poster}{orig} == 0 )
771
    elsif ( $data{$poster}{orig} == 0 )
772
    {
772
    {
773
      $percent = 0;
773
      $percent = 0;
774
    }
774
    }
775
    $data{$poster}{percent} = $percent;
775
    $data{$poster}{percent} = $percent;
776
  }
776
  }
777
}
777
}
778
778
779
##############################
779
##############################
780
# right pad a string with '.'s
780
# right pad a string with '.'s
781
##############################
781
##############################
782
sub rpad
782
sub rpad
783
{
783
{
784
784
785
  # get text to pad, length to pad, pad chr
785
  # get text to pad, length to pad, pad chr
786
  my ( $text, $pad_len, $pad_chr ) = @_;
786
  my ( $text, $pad_len, $pad_chr ) = @_;
787
  if ( length($text) > $pad_len )
787
  if ( length($text) > $pad_len )
788
  {
788
  {
789
    $text = substr( $text, 0, $pad_len );
789
    $text = substr( $text, 0, $pad_len );
790
  }
790
  }
791
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
791
  my $padded = $text . $pad_chr x ( $pad_len - length($text) );
792
  return $padded;
792
  return $padded;
793
}
793
}
794
794
795
#################
795
#################
796
# centre a string
796
# centre a string
797
#################
797
#################
798
sub centred
798
sub centred
799
{
799
{
800
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
800
  my ( $text, $width ) = @_;    # text to centre, size of field to centre in
801
  my $pad_len = ( $width - length($text) ) / 2;    #/
801
  my $pad_len = ( $width - length($text) ) / 2;    #/
802
  my $centred = " " x $pad_len . $text;
802
  my $centred = " " x $pad_len . $text;
803
  return $centred;
803
  return $centred;
804
}
804
}
805
805
806
##########################
806
##########################
807
# put commas into a number
807
# put commas into a number
808
##########################
808
##########################
809
sub commify
809
sub commify
810
{
810
{
811
  $_ = shift;
811
  $_ = shift;
812
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
812
  1 while s/^(-?\d+)(\d{3})/$1,$2/;
813
  return $_;
813
  return $_;
814
}
814
}
815
815
816
#########################
816
#########################
817
# clean
817
# clean
818
#########################
818
#########################
819
sub clean
819
sub clean
820
{
820
{
821
  my $dirty = shift;
821
  my $dirty = shift;
822
  my $clean = $dirty;
822
  my $clean = $dirty;
823
  $clean =~ s/^\s*//;
823
  $clean =~ s/^\s*//;
824
  $clean =~ s/\s*$//;
824
  $clean =~ s/\s*$//;
825
825
826
  return $clean;
826
  return $clean;
827
}
827
}
828
828
829
sub usage
829
sub usage
830
{
830
{
831
831
832
  print "usage: newstat.pl newsgroupname\n";
832
  print "usage: newstat.pl newsgroupname\n";
833
  exit 1;
833
  exit 1;
834
}
834
}
835
835
836
###################################
836
###################################
837
# Write data structures to a file #
837
# Write data structures to a file #
838
###################################
838
###################################
839
sub writedata
839
sub writedata
840
{
840
{
841
  open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
841
  open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
842
  print OUTF "Data collected from alt.os.linux.mandrake\n\n";
842
  print $OUTF "Data collected from alt.os.linux.mandrake\n\n";
843
  print OUTF
843
  print $OUTF
844
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
844
    "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
845
  foreach my $name ( keys %data )
845
  foreach my $name ( keys %data )
846
  {
846
  {
847
    print OUTF
847
    print $OUTF
848
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
848
"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
849
  }
849
  }
850
  print OUTF
850
  print $OUTF
851
"============================================================================\n";
851
"============================================================================\n";
852
  print OUTF "Thread subjects\n";
852
  print $OUTF "Thread subjects\n";
853
  print OUTF
853
  print $OUTF
854
"----------------------------------------------------------------------------\n";
854
"----------------------------------------------------------------------------\n";
855
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
855
  foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
856
  {
856
  {
857
    print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
857
    print $OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
858
  }
858
  }
859
  print OUTF
859
  print $OUTF
860
"============================================================================\n";
860
"============================================================================\n";
861
  print OUTF "Cross-posts\n";
861
  print $OUTF "Cross-posts\n";
862
  print OUTF
862
  print $OUTF
863
"----------------------------------------------------------------------------\n";
863
"----------------------------------------------------------------------------\n";
864
  foreach my $name ( sort keys %crossposts )
864
  foreach my $name ( sort keys %crossposts )
865
  {
865
  {
866
    print OUTF "$name : $crossposts{$name}\n";
866
    print $OUTF "$name : $crossposts{$name}\n";
867
  }
867
  }
868
  print OUTF print OUTF
868
  print $OUTF print $OUTF
869
"============================================================================\n";
869
"============================================================================\n";
870
  print OUTF "User agents\n";
870
  print $OUTF "User agents\n";
871
  print OUTF
871
  print $OUTF
872
"----------------------------------------------------------------------------\n";
872
"----------------------------------------------------------------------------\n";
873
  foreach my $name ( sort keys %agents )
873
  foreach my $name ( sort keys %agents )
874
  {
874
  {
875
    print OUTF "$name : $agents{$name}\n";
875
    print $OUTF "$name : $agents{$name}\n";
876
  }
876
  }
877
  print OUTF
877
  print $OUTF
878
"============================================================================\n";
878
"============================================================================\n";
879
  print OUTF "Time zones\n";
879
  print $OUTF "Time zones\n";
880
  print OUTF
880
  print $OUTF
881
"----------------------------------------------------------------------------\n";
881
"----------------------------------------------------------------------------\n";
882
  foreach my $name ( sort keys %tz )
882
  foreach my $name ( sort keys %tz )
883
  {
883
  {
884
    print OUTF "$name : $tz{$name}\n";
884
    print $OUTF "$name : $tz{$name}\n";
885
  }
885
  }
886
  close OUTF;
886
  close $OUTF;
887
}    # writedata
887
}    # writedata
888
 
888