Rev 11 | Rev 14 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
8 | PointedEar | 1 | #!/usr/bin/env perl |
5 | PointedEar | 2 | use strict; |
6 | PointedEar | 3 | use warnings; |
13 | PointedEar | 4 | use diagnostics; |
6 | PointedEar | 5 | use utf8; |
6 | use Encode; |
||
7 | |||
13 | PointedEar | 8 | ## Print out all text to STDOUT UTF-8 encoded |
9 | binmode STDOUT, ':encoding(UTF-8)'; |
||
5 | PointedEar | 10 | |
13 | PointedEar | 11 | ############################ |
12 | ## newsstat.pl version 0.4.3 |
||
5 | PointedEar | 13 | |
13 | PointedEar | 14 | ########################################################################### |
15 | ## Collect statistics about a newsgroup (specified by first argument) |
||
16 | ## in the local news spool. Check all articles in the last 30-day period. |
||
17 | ## Rank posters by number of posts and by volume of posts, report on top |
||
18 | ## and bottom 20 posters. Show their name, number of posts, size of posts, |
||
19 | ## percentage of quoted lines. Rank user-agents used, by poster rather |
||
20 | ## than by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
||
21 | ## |
||
22 | ## Numbers and paths can be configured below. See ChangeLog and TODO |
||
23 | ## for more. -- PE |
||
24 | ########################################################################### |
||
5 | PointedEar | 25 | |
26 | ###################### USER CONFIGURATIONS ############################ |
||
27 | |||
13 | PointedEar | 28 | ## The name of the group to do stats for |
5 | PointedEar | 29 | my $newsgroup_name = $ARGV[0]; |
6 | PointedEar | 30 | $newsgroup_name or &usage; |
5 | PointedEar | 31 | |
13 | PointedEar | 32 | ## Check for removal flags |
5 | PointedEar | 33 | my $ix; |
34 | my $j; |
||
35 | my %skipSec; |
||
36 | my @skiplist; |
||
37 | my $args = @ARGV; |
||
7 | PointedEar | 38 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
39 | { |
||
40 | $j = $ix + 1; |
||
41 | if ( $ARGV[$ix] eq "-x" ) |
||
42 | { |
||
43 | @skiplist = split( ",", $ARGV[$j] ); |
||
44 | } |
||
45 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
||
46 | { |
||
47 | @skiplist = split( ",", $1 ); |
||
48 | } |
||
5 | PointedEar | 49 | } |
7 | PointedEar | 50 | foreach (@skiplist) |
51 | { |
||
5 | PointedEar | 52 | $skipSec{$_} = 1; |
53 | } |
||
54 | |||
13 | PointedEar | 55 | ## Leafnode users will want /var/spool/news for this variable. |
5 | PointedEar | 56 | my $news = "/var/spool/news/"; |
57 | |||
13 | PointedEar | 58 | ## How many days are we doing statistics for? |
5 | PointedEar | 59 | my $numdays = 30; |
60 | |||
13 | PointedEar | 61 | ## Number of agents we list |
5 | PointedEar | 62 | my $topagents = 10; |
63 | |||
13 | PointedEar | 64 | ## Number of threads we want to know about |
5 | PointedEar | 65 | my $topthreads = 20; |
66 | |||
13 | PointedEar | 67 | ## Number of top or bottom posters to show |
5 | PointedEar | 68 | my $topposters = 20; |
69 | |||
13 | PointedEar | 70 | ## Number of cross-posted threads to show |
5 | PointedEar | 71 | my $topcrossposts = 10; |
72 | |||
13 | PointedEar | 73 | ## Number of time zones to show |
5 | PointedEar | 74 | my $toptz = 10; |
75 | |||
76 | ###################### DATA STRUCTURES ###################### |
||
77 | my $group = $newsgroup_name; |
||
78 | $group =~ s!\.!/!g; |
||
7 | PointedEar | 79 | my %data; # name, count, agent, total, orig, quoted |
80 | my %threads; # subject, count |
||
81 | my %crossposts; # group, count |
||
82 | my %tz; # timezones by count |
||
83 | my %headers; # holds header of current article |
||
84 | my %lcheader; # holds lowercase headers |
||
85 | my @body; # holds body of current article |
||
86 | my @sig; # holds sig text; |
||
87 | my $totalposts; # total no. of posts considered |
||
88 | my $filename; # name of current article file |
||
89 | my $filesize; # size of current article file |
||
90 | my $earliest; # earliest article we have found |
||
91 | my $latest; # latest article we have found |
||
92 | my $poster; # poster we are dealing with |
||
93 | my $totsize = 0; # holds total sizes of all files |
||
94 | my $totheader = 0; # total size of header material |
||
95 | my $totbody = 0; # total size of body material |
||
96 | my $totsig = 0; # total size of sig material |
||
97 | my $totorig = 0; # total size of original material |
||
98 | my $totquoted = 0; # total size of quoted material |
||
99 | my $origposts = 0; # total no. of original posts |
||
100 | my $replies = 0; # total no. of replies |
||
101 | my $i; # general purpose |
||
5 | PointedEar | 102 | my %distinct_agent; |
13 | PointedEar | 103 | |
104 | ## Used to hold counts of User Agents used |
||
105 | my %agents = ( |
||
106 | "Compuserver" => 0, |
||
107 | "Foorum" => 0, |
||
7 | PointedEar | 108 | "Forte Agent" => 0, |
109 | "Forte Free Agent" => 0, |
||
13 | PointedEar | 110 | "Gnus" => 0, |
111 | "KNode" => 0, |
||
112 | "MacSOUP" => 0, |
||
113 | "MT-NewsWatcher" => 0, |
||
7 | PointedEar | 114 | "MicroPlanet Gravity" => 0, |
115 | "Microsoft Outlook Express" => 0, |
||
13 | PointedEar | 116 | "Microsoft Windows Mail" => 0, |
117 | "Mozilla" => 0, |
||
118 | "News Rover" => 0, |
||
119 | "NN" => 0, |
||
120 | "Pan" => 0, |
||
121 | "rn" => 0, |
||
7 | PointedEar | 122 | "slrn" => 0, |
13 | PointedEar | 123 | "Sylpheed" => 0, |
7 | PointedEar | 124 | "tin" => 0, |
13 | PointedEar | 125 | "VSoup" => 0, |
7 | PointedEar | 126 | "WebTV" => 0, |
13 | PointedEar | 127 | "Xnews" => 0 |
128 | ); |
||
5 | PointedEar | 129 | |
130 | ######################## MAIN CODE ######################## |
||
131 | $! = 1; |
||
132 | |||
133 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
||
7 | PointedEar | 134 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
135 | while ( defined( $filename = readdir(DIR) ) ) |
||
136 | { |
||
137 | %lcheader = (); |
||
13 | PointedEar | 138 | next unless -f $filename; # only want real files |
139 | next if ( $filename eq ".overview" ); # real articles only |
||
140 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
||
7 | PointedEar | 141 | $earliest = ( stat $filename )[9] unless defined($earliest); |
142 | $latest = ( stat $filename )[9] unless defined($latest); |
||
13 | PointedEar | 143 | &get_article($filename); # read in the article |
144 | &get_data; # grab the data from the article |
||
145 | $totalposts++; # bump count of articles considered |
||
5 | PointedEar | 146 | } |
13 | PointedEar | 147 | closedir(DIR); # finished with the directory |
7 | PointedEar | 148 | |
13 | PointedEar | 149 | ## Post-processing |
150 | &count_agents; # count agents, collapsing versions |
||
151 | &fix_percent; # check percentages orig/total for posters |
||
5 | PointedEar | 152 | |
13 | PointedEar | 153 | &write_data; |
5 | PointedEar | 154 | |
155 | #################### DISPLAY RESULTS ##################### |
||
156 | print "=" x 76, "\n"; |
||
7 | PointedEar | 157 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
5 | PointedEar | 158 | print "=" x 76, "\n"; |
7 | PointedEar | 159 | printf "%s\n", |
160 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
||
5 | PointedEar | 161 | print "\n\n"; |
7 | PointedEar | 162 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
163 | $numdays; |
||
164 | printf "Earliest article: %s\n", scalar localtime($earliest); |
||
165 | printf "Latest article: %s\n", scalar localtime($latest); |
||
166 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
||
167 | commify($replies); |
||
13 | PointedEar | 168 | printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
7 | PointedEar | 169 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
13 | PointedEar | 170 | printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
7 | PointedEar | 171 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
172 | commify( int( $totsize / $totalposts ) ); |
||
5 | PointedEar | 173 | my $count = keys %data; |
13 | PointedEar | 174 | printf "Total headers: %s KiB bodies: %s KiB\n", |
7 | PointedEar | 175 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
13 | PointedEar | 176 | printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
7 | PointedEar | 177 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
178 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
||
179 | commify( int( $totsig / 1024 ) ); |
||
180 | printf "Total number of posters: %s, average %s bytes per poster\n", |
||
181 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
5 | PointedEar | 182 | $count = keys %threads; |
7 | PointedEar | 183 | printf "Total number of threads: %s, average %s bytes per thread\n", |
184 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
13 | PointedEar | 185 | printf "Total number of user agents: %d\n", scalar keys %agents; |
5 | PointedEar | 186 | print "\n", "=" x 76, "\n"; |
187 | |||
13 | PointedEar | 188 | ######################################## |
189 | ## Show posters by article count Sec 1; |
||
190 | ######################################## |
||
7 | PointedEar | 191 | unless ( $skipSec{1} ) |
192 | { |
||
193 | if ( keys %data < $topposters ) |
||
194 | { |
||
195 | $count = keys %data; |
||
196 | } |
||
197 | else |
||
198 | { |
||
199 | $count = $topposters; |
||
200 | } |
||
201 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
||
202 | print "=" x 76, "\n"; |
||
203 | $i = 0; |
||
9 | PointedEar | 204 | foreach |
205 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
||
7 | PointedEar | 206 | { |
207 | my $name = substr( $poster, 0, 65 ); |
||
208 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
209 | $data{$poster}{count}; |
||
210 | last if ( ++$i == $count ); |
||
211 | } |
||
212 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 213 | } |
214 | |||
13 | PointedEar | 215 | ###################################### |
216 | ## Show posters by size in KiB Sec 2; |
||
217 | ###################################### |
||
7 | PointedEar | 218 | unless ( $skipSec{2} ) |
219 | { |
||
220 | if ( keys %data < $topposters ) |
||
221 | { |
||
5 | PointedEar | 222 | $count = keys %data; |
7 | PointedEar | 223 | } |
224 | else |
||
225 | { |
||
5 | PointedEar | 226 | $count = $topposters; |
227 | } |
||
13 | PointedEar | 228 | printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
5 | PointedEar | 229 | print "=" x 76, "\n"; |
230 | $i = 0; |
||
8 | PointedEar | 231 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
7 | PointedEar | 232 | { |
233 | my $name = substr( $poster, 0, 62 ); |
||
234 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
235 | $data{$poster}{size} / 1024; #/ |
||
236 | last if ( ++$i == $count ); |
||
5 | PointedEar | 237 | } |
238 | print "\n", "=" x 76, "\n"; |
||
239 | } |
||
240 | |||
13 | PointedEar | 241 | ##################################### |
242 | ## Show top posters for original text |
||
243 | ##################################### |
||
7 | PointedEar | 244 | unless ( $skipSec{3} ) |
245 | { |
||
246 | if ( keys %data < $topposters ) |
||
247 | { |
||
248 | $count = keys %data; |
||
249 | } |
||
250 | else |
||
251 | { |
||
252 | $count = $topposters; |
||
253 | } |
||
254 | printf "%s\n", |
||
255 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
||
256 | print "=" x 76, "\n"; |
||
257 | $i = 0; |
||
9 | PointedEar | 258 | foreach my $poster ( |
259 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
||
260 | keys %data |
||
261 | ) |
||
7 | PointedEar | 262 | { |
263 | next if $data{$poster}{quoted} == 0; |
||
264 | next if $data{$poster}{count} < 5; |
||
265 | my $name = substr( $poster, 0, 63 ); |
||
266 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
267 | $data{$poster}{percent}; |
||
268 | last if ( ++$i == $count ); |
||
269 | } |
||
270 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 271 | } |
272 | |||
13 | PointedEar | 273 | ######################################## |
274 | ## Show bottom posters for original text |
||
275 | ######################################## |
||
7 | PointedEar | 276 | unless ( $skipSec{4} ) |
277 | { |
||
278 | if ( keys %data < $topposters ) |
||
279 | { |
||
5 | PointedEar | 280 | $count = keys %data; |
7 | PointedEar | 281 | } |
282 | else |
||
283 | { |
||
5 | PointedEar | 284 | $count = $topposters; |
285 | } |
||
7 | PointedEar | 286 | printf "%s\n", |
287 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
||
5 | PointedEar | 288 | print "=" x 76, "\n"; |
289 | $i = 0; |
||
9 | PointedEar | 290 | foreach my $poster ( |
291 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
||
292 | keys %data |
||
293 | ) |
||
7 | PointedEar | 294 | { |
5 | PointedEar | 295 | next if $data{$poster}{quoted} == 0; |
296 | next if $data{$poster}{count} < 5; |
||
7 | PointedEar | 297 | my $name = substr( $poster, 0, 63 ); |
298 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
299 | $data{$poster}{percent}; |
||
300 | last if ( ++$i == $count ); |
||
5 | PointedEar | 301 | } |
302 | print "\n", "=" x 76, "\n"; |
||
303 | } |
||
304 | |||
13 | PointedEar | 305 | ##################################### |
306 | ## Show threads by number of articles |
||
307 | ##################################### |
||
7 | PointedEar | 308 | unless ( $skipSec{5} ) |
309 | { |
||
310 | if ( keys %threads < $topthreads ) |
||
311 | { |
||
5 | PointedEar | 312 | $count = keys %threads; |
7 | PointedEar | 313 | } |
314 | else |
||
315 | { |
||
5 | PointedEar | 316 | $count = $topthreads; |
317 | } |
||
7 | PointedEar | 318 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
5 | PointedEar | 319 | print "=" x 76, "\n"; |
320 | $i = 0; |
||
9 | PointedEar | 321 | foreach my $thread ( |
322 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
||
323 | keys %threads |
||
324 | ) |
||
7 | PointedEar | 325 | { |
326 | my $name = substr( $thread, 0, 65 ); |
||
327 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
328 | $threads{$thread}{count}; |
||
329 | last if ( ++$i == $count ); |
||
5 | PointedEar | 330 | } |
331 | print "\n", "=" x 76, "\n"; |
||
332 | } |
||
13 | PointedEar | 333 | |
334 | ############################## |
||
335 | ## Show threads by size in KiB |
||
336 | ############################## |
||
7 | PointedEar | 337 | unless ( $skipSec{6} ) |
338 | { |
||
339 | if ( keys %threads < $topthreads ) |
||
340 | { |
||
5 | PointedEar | 341 | $count = keys %threads; |
7 | PointedEar | 342 | } |
343 | else |
||
344 | { |
||
5 | PointedEar | 345 | $count = $topthreads; |
346 | } |
||
13 | PointedEar | 347 | printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
5 | PointedEar | 348 | print "=" x 76, "\n"; |
349 | $i = 0; |
||
9 | PointedEar | 350 | foreach my $thread ( |
351 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
||
352 | keys %threads |
||
353 | ) |
||
7 | PointedEar | 354 | { |
355 | my $name = substr( $thread, 0, 65 ); |
||
356 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
357 | $threads{$thread}{size} / 1024; #/ |
||
358 | last if ( ++$i == $count ); |
||
5 | PointedEar | 359 | } |
360 | print "\n", "=" x 76, "\n"; |
||
361 | } |
||
362 | |||
13 | PointedEar | 363 | ################################## |
364 | ## Show top 10 cross-posted groups |
||
365 | ################################## |
||
7 | PointedEar | 366 | unless ( $skipSec{7} ) |
367 | { |
||
368 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
||
369 | if ( keys %crossposts < $topcrossposts ) |
||
370 | { |
||
371 | $count = keys %crossposts; |
||
372 | } |
||
373 | else |
||
374 | { |
||
375 | $count = $topcrossposts; |
||
376 | } |
||
377 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
||
378 | print "=" x 76, "\n"; |
||
379 | $i = 0; |
||
380 | foreach |
||
381 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
||
382 | { |
||
383 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
384 | $crossposts{$name}; |
||
385 | last if ( ++$i == $count ); |
||
386 | } |
||
387 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 388 | } |
13 | PointedEar | 389 | |
390 | ######################### |
||
391 | ## Show agents and counts |
||
392 | ######################### |
||
7 | PointedEar | 393 | unless ( $skipSec{8} ) |
394 | { |
||
395 | if ( keys %agents < $topagents ) |
||
396 | { |
||
397 | $count = keys %agents; |
||
398 | } |
||
399 | else |
||
400 | { |
||
401 | $count = $topagents; |
||
402 | } |
||
403 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
||
404 | print "=" x 76, "\n"; |
||
405 | $i = 0; |
||
406 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
||
407 | { |
||
408 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
||
409 | $agents{$agent}; |
||
410 | last if ( ++$i == $count ); |
||
411 | } |
||
412 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 413 | } |
414 | |||
415 | ####################### |
||
13 | PointedEar | 416 | ## Show distinct agents |
5 | PointedEar | 417 | ####################### |
7 | PointedEar | 418 | unless ( $skipSec{9} ) |
419 | { |
||
420 | if ( keys %distinct_agent < $topagents ) |
||
421 | { |
||
422 | $count = keys %distinct_agent; |
||
423 | } |
||
424 | else |
||
425 | { |
||
426 | $count = $topagents; |
||
427 | } |
||
428 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
||
429 | print "=" x 76, "\n"; |
||
430 | $i = 0; |
||
9 | PointedEar | 431 | foreach my $agent ( |
432 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
||
433 | keys %distinct_agent |
||
434 | ) |
||
7 | PointedEar | 435 | { |
436 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
||
437 | $distinct_agent{$agent}, |
||
438 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
||
439 | last if ( ++$i == $count ); |
||
440 | } |
||
441 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 442 | } |
443 | |||
13 | PointedEar | 444 | ############################ |
445 | ## Show timezones and counts |
||
446 | ############################ |
||
7 | PointedEar | 447 | unless ( $skipSec{10} ) |
448 | { |
||
449 | if ( keys %tz < $toptz ) |
||
450 | { |
||
451 | $count = keys %tz; |
||
452 | } |
||
453 | else |
||
454 | { |
||
455 | $count = $toptz; |
||
456 | } |
||
457 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
||
458 | print "=" x 76, "\n"; |
||
459 | $i = 0; |
||
460 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
||
461 | { |
||
462 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
||
463 | last if ( ++$i == $count ); |
||
464 | } |
||
465 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 466 | } |
467 | |||
468 | ################################ SUBROUTINES ################################ |
||
469 | |||
13 | PointedEar | 470 | ######################################## |
471 | ## Get current article's header and body |
||
472 | ######################################## |
||
473 | sub get_article |
||
7 | PointedEar | 474 | { |
475 | %headers = (); # dump old headers |
||
476 | my $filename = shift; # get the name of the file |
||
5 | PointedEar | 477 | |
13 | PointedEar | 478 | ## get stats about the file itself |
7 | PointedEar | 479 | $filesize = -s $filename; # get total size of file |
480 | $totsize += $filesize; # bump total sizes of all files |
||
481 | |||
482 | my $mtime = ( stat $filename )[9]; |
||
483 | if ( $mtime < $earliest ) |
||
484 | { |
||
5 | PointedEar | 485 | $earliest = $mtime; |
7 | PointedEar | 486 | } |
487 | elsif ( $mtime > $latest ) |
||
488 | { |
||
5 | PointedEar | 489 | $latest = $mtime; |
7 | PointedEar | 490 | } |
5 | PointedEar | 491 | |
13 | PointedEar | 492 | ## now read the file |
493 | open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
||
8 | PointedEar | 494 | while (<$FILE>) |
7 | PointedEar | 495 | { |
496 | $totheader += length($_); # bump total header size |
||
497 | last if (/^\s*$/); # end of header? |
||
13 | PointedEar | 498 | if (/^([^:\s]*):\s*(.*)/) |
7 | PointedEar | 499 | { |
500 | my ( $key, $val ) = ( $1, $2 ); |
||
501 | $headers{$key} = decode( 'MIME-Header', $val ); |
||
502 | $lcheader{ clean( lc($key) ) } = clean($val); |
||
503 | } |
||
5 | PointedEar | 504 | } |
9 | PointedEar | 505 | @body = <$FILE>; # slurp up body |
8 | PointedEar | 506 | close($FILE); |
13 | PointedEar | 507 | } # get_article |
5 | PointedEar | 508 | |
13 | PointedEar | 509 | #################################### |
510 | ## Get data from the current article |
||
511 | #################################### |
||
512 | sub get_data |
||
7 | PointedEar | 513 | { |
5 | PointedEar | 514 | #### First, analyse header fields #### |
515 | |||
13 | PointedEar | 516 | ## Set up this poster if not defined, get counts, sizes |
11 | PointedEar | 517 | my $poster = $headers{From}; # get the poster's name |
13 | PointedEar | 518 | |
519 | # Convert old to new format |
||
520 | $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
||
521 | |||
522 | # Collapse whitespace |
||
523 | $poster =~ s/\s+/ /; |
||
524 | |||
525 | # Remove outer quotes |
||
526 | $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
||
527 | |||
7 | PointedEar | 528 | if ( !defined( $data{$poster} ) ) |
13 | PointedEar | 529 | { # seen this one before? |
11 | PointedEar | 530 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
7 | PointedEar | 531 | $data{$poster}{orig} = 0; |
532 | $data{$poster}{quoted} = 0; |
||
533 | } |
||
11 | PointedEar | 534 | $data{$poster}{count}++; # bump count for this poster |
535 | $data{$poster}{size} += $filesize; # total size of file |
||
5 | PointedEar | 536 | |
13 | PointedEar | 537 | ## The User-Agent and/or X-Newsreader fields |
538 | ## for User-Agent by poster |
||
7 | PointedEar | 539 | if ( defined $lcheader{"user-agent"} ) |
540 | { |
||
541 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
||
542 | } |
||
543 | if ( defined $lcheader{"x-newsreader"} ) |
||
544 | { |
||
545 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
||
546 | } |
||
5 | PointedEar | 547 | |
13 | PointedEar | 548 | ## The User Agent for User-Agent by number of posts |
7 | PointedEar | 549 | my $UA = "unknown"; |
550 | foreach my $keys ( keys %lcheader ) |
||
551 | { |
||
552 | if ( defined $lcheader{'user-agent'} ) |
||
553 | { |
||
554 | $UA = $lcheader{'user-agent'}; |
||
555 | } |
||
556 | elsif ( defined $lcheader{"x-newsreader"} ) |
||
557 | { |
||
558 | $UA = $lcheader{"x-newsreader"}; |
||
559 | } |
||
560 | elsif ( defined $lcheader{'x-mailer'} ) |
||
561 | { |
||
562 | $UA = $lcheader{'x-mailer'}; |
||
563 | } |
||
564 | elsif ( |
||
565 | ( defined $lcheader{'organization'} ) |
||
566 | && ( $lcheader{'organization'} =~ |
||
567 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
||
568 | ) |
||
569 | { |
||
570 | $UA = $lcheader{'organization'}; |
||
571 | } |
||
572 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
||
573 | { |
||
574 | $UA = "Pine"; |
||
575 | } ## Hopefully found UA, else set to unknown |
||
576 | } |
||
5 | PointedEar | 577 | |
7 | PointedEar | 578 | $UA = clean($UA); |
579 | $UA = get_agent($UA); |
||
5 | PointedEar | 580 | |
7 | PointedEar | 581 | sub get_agent |
582 | { |
||
583 | my $raw = shift; |
||
584 | my $agent = $raw; |
||
5 | PointedEar | 585 | |
7 | PointedEar | 586 | ## strip http |
587 | if ( $raw =~ /.*http.*/ ) |
||
588 | { |
||
589 | $raw =~ s!posted via!!i; |
||
590 | $raw =~ s!http://!!g; |
||
591 | $raw =~ s!/!!g; |
||
592 | $raw =~ s! !!g; |
||
593 | } |
||
5 | PointedEar | 594 | |
7 | PointedEar | 595 | ## Fix Outlook from Mac |
596 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
||
5 | PointedEar | 597 | |
7 | PointedEar | 598 | ## Pick out the popular agents |
13 | PointedEar | 599 | if ( |
600 | $raw =~ /(outlook express)/i |
||
601 | || $raw =~ /(windows mail)/i |
||
602 | || $raw =~ /(microplanet gravity)/i |
||
603 | || $raw =~ /(news rover)/i |
||
604 | || $raw =~ /(forte agent)/i |
||
605 | || $raw =~ /(forte free agent)/i |
||
606 | ) |
||
7 | PointedEar | 607 | { |
608 | $agent = $1; |
||
609 | } |
||
610 | elsif ( |
||
611 | $raw =~ /^( |
||
5 | PointedEar | 612 | pan |
613 | |sylpheed |
||
614 | |slrn |
||
615 | |mozilla |
||
616 | |knode |
||
617 | |tin |
||
618 | |hamster |
||
619 | |xrn |
||
620 | |xnews |
||
621 | |aol |
||
622 | |gnus |
||
623 | |krn |
||
624 | |macsoup |
||
625 | |messenger |
||
626 | |openxp |
||
627 | |pine |
||
628 | |thoth |
||
629 | |turnpike |
||
630 | |winvn |
||
631 | |vsoup |
||
632 | |google |
||
633 | |supernews |
||
634 | |nn |
||
635 | |rn |
||
636 | |007 |
||
637 | |webtv |
||
638 | |compuserve |
||
7 | PointedEar | 639 | )/ix |
640 | ) |
||
641 | { |
||
642 | $agent = $1; |
||
643 | } |
||
644 | else |
||
645 | { |
||
646 | ## Clean up unknown agents |
||
647 | if ( $raw =~ m!^(.*?)/! ) |
||
648 | { |
||
649 | $agent = $1; |
||
650 | } |
||
651 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
||
652 | { |
||
653 | $agent = $1; |
||
654 | } |
||
655 | } |
||
656 | |||
657 | $distinct_agent{$agent}++; |
||
658 | return $agent; |
||
5 | PointedEar | 659 | } |
660 | |||
13 | PointedEar | 661 | ## Get all cross-posted newsgroups |
7 | PointedEar | 662 | for ( split /,/, $headers{"Newsgroups"} ) |
663 | { |
||
664 | $crossposts{$_}++; # bump count for each |
||
665 | } |
||
5 | PointedEar | 666 | |
13 | PointedEar | 667 | ## Get threads |
11 | PointedEar | 668 | my $thread = $headers{"Subject"}; |
7 | PointedEar | 669 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
670 | $thread =~ s/\s+/ /g; # collapse whitespace |
||
671 | $threads{$thread}{count} += 1; # bump count of this subject |
||
672 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
||
5 | PointedEar | 673 | |
13 | PointedEar | 674 | ## Is this an original post or a reply? |
7 | PointedEar | 675 | if ( defined $headers{"References"} ) |
676 | { |
||
677 | $replies++; |
||
678 | } |
||
679 | else |
||
680 | { |
||
681 | $origposts++; |
||
682 | } |
||
5 | PointedEar | 683 | |
13 | PointedEar | 684 | ## Get the time zone |
7 | PointedEar | 685 | $_ = $headers{"Date"}; |
13 | PointedEar | 686 | my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
9 | PointedEar | 687 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
7 | PointedEar | 688 | { |
689 | $tz = "UTC"; |
||
690 | } |
||
691 | $tz{$tz}++; |
||
5 | PointedEar | 692 | |
7 | PointedEar | 693 | #### Now analyse the body text #### |
694 | my $insig = 0; |
||
695 | for (@body) |
||
696 | { |
||
697 | $totbody += length($_); # bump total body size |
||
698 | next if (/^$>/); # don't count blank lines in body |
||
699 | if ( $insig == 1 ) |
||
700 | { |
||
701 | $totsig += length($_); # bump total sig size |
||
5 | PointedEar | 702 | |
13 | PointedEar | 703 | ## Bill Unruh uses ] quotes, and another poster uses :: |
7 | PointedEar | 704 | } |
705 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
||
706 | { # are we in a quote line? |
||
707 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
||
9 | PointedEar | 708 | $totquoted += length($_); |
7 | PointedEar | 709 | } |
710 | elsif (/-- /) |
||
711 | { |
||
712 | $insig = 1; |
||
713 | } |
||
714 | else |
||
715 | { |
||
5 | PointedEar | 716 | |
13 | PointedEar | 717 | ## We must be processing an original line |
7 | PointedEar | 718 | $data{$poster}{orig} += length($_); # bump count of original chrs |
9 | PointedEar | 719 | $totorig += length($_); |
7 | PointedEar | 720 | } |
721 | } # end for (@body) |
||
5 | PointedEar | 722 | |
13 | PointedEar | 723 | } # get_data |
5 | PointedEar | 724 | |
13 | PointedEar | 725 | ######################################### |
726 | ## Count the User-Agents used, collapsing |
||
727 | ## different versions into one per agent. |
||
728 | ######################################### |
||
729 | sub count_agents |
||
7 | PointedEar | 730 | { |
5 | PointedEar | 731 | POSTER: |
8 | PointedEar | 732 | foreach my $poster ( keys %data ) |
7 | PointedEar | 733 | { |
734 | foreach my $agent_name ( keys %distinct_agent ) |
||
735 | { # check against known ones |
||
736 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
||
737 | { |
||
738 | $agents{$agent_name}++; |
||
739 | next POSTER; |
||
740 | } |
||
741 | } |
||
742 | $agents{ $data{$poster}{agent} }++; |
||
743 | } |
||
13 | PointedEar | 744 | } # count_agents |
5 | PointedEar | 745 | |
13 | PointedEar | 746 | ############################################# |
747 | ## Set orig/total percentages for all posters |
||
748 | ############################################# |
||
749 | sub fix_percent |
||
7 | PointedEar | 750 | { |
8 | PointedEar | 751 | foreach my $poster ( keys %data ) |
7 | PointedEar | 752 | { |
753 | my $percent = 100; |
||
754 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
||
755 | { |
||
9 | PointedEar | 756 | $percent = |
757 | $data{$poster}{orig} * 100 / |
||
7 | PointedEar | 758 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
759 | } |
||
760 | elsif ( $data{$poster}{orig} == 0 ) |
||
761 | { |
||
762 | $percent = 0; |
||
763 | } |
||
764 | $data{$poster}{percent} = $percent; |
||
765 | } |
||
5 | PointedEar | 766 | } |
767 | |||
13 | PointedEar | 768 | ############################### |
769 | ## Right pad a string with '.'s |
||
770 | ############################### |
||
7 | PointedEar | 771 | sub rpad |
772 | { |
||
13 | PointedEar | 773 | ## Get text to pad, length to pad, pad chr |
7 | PointedEar | 774 | my ( $text, $pad_len, $pad_chr ) = @_; |
11 | PointedEar | 775 | |
13 | PointedEar | 776 | ## DEBUG |
777 | #printf "|%s| = %d\n", $text, length($text); |
||
11 | PointedEar | 778 | |
7 | PointedEar | 779 | if ( length($text) > $pad_len ) |
780 | { |
||
781 | $text = substr( $text, 0, $pad_len ); |
||
782 | } |
||
783 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
||
784 | return $padded; |
||
5 | PointedEar | 785 | } |
786 | |||
13 | PointedEar | 787 | ################## |
788 | ## Centre a string |
||
789 | ################## |
||
7 | PointedEar | 790 | sub centred |
791 | { |
||
792 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
||
793 | my $pad_len = ( $width - length($text) ) / 2; #/ |
||
794 | my $centred = " " x $pad_len . $text; |
||
795 | return $centred; |
||
5 | PointedEar | 796 | } |
797 | |||
13 | PointedEar | 798 | ########################### |
799 | ## Put commas into a number |
||
800 | ########################### |
||
7 | PointedEar | 801 | sub commify |
802 | { |
||
13 | PointedEar | 803 | local $_ = shift; |
804 | 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
||
7 | PointedEar | 805 | return $_; |
5 | PointedEar | 806 | } |
807 | |||
13 | PointedEar | 808 | ################################################################ |
809 | ## Returns a string with leading and trailing whitespace removed |
||
810 | ################################################################ |
||
7 | PointedEar | 811 | sub clean |
812 | { |
||
813 | my $dirty = shift; |
||
814 | my $clean = $dirty; |
||
13 | PointedEar | 815 | $clean =~ s/^\s*|\s*$//g; |
5 | PointedEar | 816 | |
7 | PointedEar | 817 | return $clean; |
5 | PointedEar | 818 | } |
819 | |||
7 | PointedEar | 820 | sub usage |
821 | { |
||
822 | print "usage: newstat.pl newsgroupname\n"; |
||
823 | exit 1; |
||
5 | PointedEar | 824 | } |
825 | |||
13 | PointedEar | 826 | ################################## |
827 | ## Write data structures to a file |
||
828 | ################################## |
||
829 | sub write_data |
||
7 | PointedEar | 830 | { |
13 | PointedEar | 831 | open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
832 | or die "Can't create XDATA: $!\n"; |
||
833 | print $OUTF "Data collected from $newsgroup_name\n\n"; |
||
8 | PointedEar | 834 | print $OUTF |
7 | PointedEar | 835 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
836 | foreach my $name ( keys %data ) |
||
837 | { |
||
8 | PointedEar | 838 | print $OUTF |
7 | PointedEar | 839 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
840 | } |
||
8 | PointedEar | 841 | print $OUTF |
7 | PointedEar | 842 | "============================================================================\n"; |
8 | PointedEar | 843 | print $OUTF "Thread subjects\n"; |
844 | print $OUTF |
||
7 | PointedEar | 845 | "----------------------------------------------------------------------------\n"; |
846 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
||
847 | { |
||
9 | PointedEar | 848 | print $OUTF |
849 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
||
7 | PointedEar | 850 | } |
8 | PointedEar | 851 | print $OUTF |
7 | PointedEar | 852 | "============================================================================\n"; |
8 | PointedEar | 853 | print $OUTF "Cross-posts\n"; |
854 | print $OUTF |
||
7 | PointedEar | 855 | "----------------------------------------------------------------------------\n"; |
856 | foreach my $name ( sort keys %crossposts ) |
||
857 | { |
||
8 | PointedEar | 858 | print $OUTF "$name : $crossposts{$name}\n"; |
7 | PointedEar | 859 | } |
13 | PointedEar | 860 | print $OUTF |
7 | PointedEar | 861 | "============================================================================\n"; |
8 | PointedEar | 862 | print $OUTF "User agents\n"; |
863 | print $OUTF |
||
7 | PointedEar | 864 | "----------------------------------------------------------------------------\n"; |
865 | foreach my $name ( sort keys %agents ) |
||
866 | { |
||
8 | PointedEar | 867 | print $OUTF "$name : $agents{$name}\n"; |
7 | PointedEar | 868 | } |
8 | PointedEar | 869 | print $OUTF |
7 | PointedEar | 870 | "============================================================================\n"; |
8 | PointedEar | 871 | print $OUTF "Time zones\n"; |
872 | print $OUTF |
||
7 | PointedEar | 873 | "----------------------------------------------------------------------------\n"; |
874 | foreach my $name ( sort keys %tz ) |
||
875 | { |
||
8 | PointedEar | 876 | print $OUTF "$name : $tz{$name}\n"; |
7 | PointedEar | 877 | } |
8 | PointedEar | 878 | close $OUTF; |
13 | PointedEar | 879 | } # write_data |