Rev 9 | Rev 11 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
8 | PointedEar | 1 | #!/usr/bin/env perl |
5 | PointedEar | 2 | use strict; |
6 | PointedEar | 3 | use warnings; |
4 | use utf8; |
||
9 | PointedEar | 5 | use encoding 'utf-8'; |
6 | PointedEar | 6 | use Encode; |
7 | |||
8 | PointedEar | 8 | ########################### |
10 | PointedEar | 9 | # newsstat.pl version 0.4.2 |
5 | PointedEar | 10 | |
6 | PointedEar | 11 | ############################################################################ |
12 | # Collect statistics about a newsgroup (specified by first argument) in |
||
7 | PointedEar | 13 | # the local news spool. Check all articles in the last 30-day period. |
6 | PointedEar | 14 | # Rank posters by number of posts and by volume of posts, report on top and |
15 | # bottom 20 posters. Show their name, number of posts, size of posts, |
||
16 | # percentage of quoted lines. Rank user-agents used, by poster rather than |
||
17 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
||
7 | PointedEar | 18 | # |
6 | PointedEar | 19 | # (Numbers and paths can be configured below. -- PE) |
20 | ############################################################################ |
||
5 | PointedEar | 21 | |
6 | PointedEar | 22 | ############################################################################ |
23 | # RECENT CHANGES # |
||
9 | PointedEar | 24 | # 2011-10-03 PE - Use more compatible shebang |
25 | # - Fixed some Perl::Critic-ized code |
||
26 | # - Fixed wrong indent for non-ASCII names (TODO: proper dot count) |
||
27 | # - Formatted source code |
||
6 | PointedEar | 28 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
7 | PointedEar | 29 | # - Use warnings, utf8 (just in case) |
6 | PointedEar | 30 | # - Documentation update |
31 | # N/A NN - Take newsgroup name as argument |
||
32 | # 2004-06-19 NN - newsgroup name is $ARGV[0] |
||
33 | # - Allow command line flags for subtracting |
||
34 | # output if not pertinent for a group |
||
35 | # 2002-11-09 NN - Put Garry's writedata() function back in. |
||
36 | # - added "rn" to my list of UA's |
||
37 | # - Started using %distinct_agent for both User agent |
||
38 | # sections |
||
39 | # - named it newsstat.pl version 0.3 |
||
40 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using |
||
41 | # mtime rather than ctime, and simplifying the logic |
||
42 | # 2002-11-05 NN - moved user configurations to the top |
||
43 | # - fixed the cross-posting section |
||
44 | # - introduced the $newsgroup_name variable which |
||
45 | # later becomes $news$group |
||
46 | # - changed $name to $agent_name in countagents() |
||
7 | PointedEar | 47 | # |
6 | PointedEar | 48 | # Contributors |
49 | # ------------- |
||
50 | # NN Nomen nominandum (name to be determined later) |
||
51 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de> |
||
5 | PointedEar | 52 | |
6 | PointedEar | 53 | ########### TODO ############# |
5 | PointedEar | 54 | # Commas in bottom section of report |
55 | # Show date the figures were compiled |
||
56 | # No. of HTML articles (Content-Type: text/html) |
||
57 | # No. of quoted sigs (/>\s*-- /) |
||
58 | # Per cent of top-posted articles |
||
59 | # Top 10 cross-posters |
||
60 | # Top 20 news posting hosts (from Path) |
||
61 | # Count of certain subject words: newbie, kde, burner, sendmail, etc. |
||
62 | # Count *all* User Agents that each poster uses |
||
63 | # What do we do about Bill Unruh's ] quote style? |
||
64 | # Change the way dates/times are checked |
||
65 | # include % share in posters by no. of arts |
||
66 | # include % share in posters by size |
||
67 | # Total, orig & quoted lines by user agent with per cent |
||
6 | PointedEar | 68 | # Take more arguments |
5 | PointedEar | 69 | ####################################################### |
70 | |||
71 | ###################### USER CONFIGURATIONS ############################ |
||
72 | |||
73 | # The name of the group to do stats for |
||
74 | my $newsgroup_name = $ARGV[0]; |
||
6 | PointedEar | 75 | $newsgroup_name or &usage; |
5 | PointedEar | 76 | |
77 | # Check for removal flags |
||
78 | my $ix; |
||
79 | my $j; |
||
80 | my %skipSec; |
||
81 | my @skiplist; |
||
82 | my $args = @ARGV; |
||
7 | PointedEar | 83 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
84 | { |
||
85 | $j = $ix + 1; |
||
86 | if ( $ARGV[$ix] eq "-x" ) |
||
87 | { |
||
88 | @skiplist = split( ",", $ARGV[$j] ); |
||
89 | } |
||
90 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
||
91 | { |
||
92 | @skiplist = split( ",", $1 ); |
||
93 | } |
||
5 | PointedEar | 94 | } |
7 | PointedEar | 95 | foreach (@skiplist) |
96 | { |
||
5 | PointedEar | 97 | $skipSec{$_} = 1; |
98 | } |
||
99 | |||
100 | # Leafnode users will want /var/spool/news for this variable. |
||
101 | my $news = "/var/spool/news/"; |
||
102 | |||
103 | # How many days are we doing statistics for? |
||
104 | my $numdays = 30; |
||
105 | |||
106 | # no. of agents we list |
||
107 | my $topagents = 10; |
||
108 | |||
109 | # no. of threads we want to know about |
||
110 | my $topthreads = 20; |
||
111 | |||
112 | # no. of top or bottom posters to show |
||
113 | my $topposters = 20; |
||
114 | |||
115 | # no. of cross-posted threads to show |
||
116 | my $topcrossposts = 10; |
||
117 | |||
118 | # no. of time zones to show |
||
119 | my $toptz = 10; |
||
120 | |||
121 | ###################### DATA STRUCTURES ###################### |
||
122 | my $group = $newsgroup_name; |
||
123 | $group =~ s!\.!/!g; |
||
7 | PointedEar | 124 | my %data; # name, count, agent, total, orig, quoted |
125 | my %threads; # subject, count |
||
126 | my %crossposts; # group, count |
||
127 | my %tz; # timezones by count |
||
128 | my %headers; # holds header of current article |
||
129 | my %lcheader; # holds lowercase headers |
||
130 | my @body; # holds body of current article |
||
131 | my @sig; # holds sig text; |
||
132 | my $totalposts; # total no. of posts considered |
||
133 | my $filename; # name of current article file |
||
134 | my $filesize; # size of current article file |
||
135 | my $earliest; # earliest article we have found |
||
136 | my $latest; # latest article we have found |
||
137 | my $poster; # poster we are dealing with |
||
138 | my $totsize = 0; # holds total sizes of all files |
||
139 | my $totheader = 0; # total size of header material |
||
140 | my $totbody = 0; # total size of body material |
||
141 | my $totsig = 0; # total size of sig material |
||
142 | my $totorig = 0; # total size of original material |
||
143 | my $totquoted = 0; # total size of quoted material |
||
144 | my $origposts = 0; # total no. of original posts |
||
145 | my $replies = 0; # total no. of replies |
||
146 | my $i; # general purpose |
||
5 | PointedEar | 147 | my %distinct_agent; |
7 | PointedEar | 148 | my %agents = # used to hold counts of User Agents used |
149 | ( |
||
150 | "KNode" => 0, |
||
151 | "Pan" => 0, |
||
152 | "Mozilla" => 0, |
||
153 | "Sylpheed" => 0, |
||
154 | "Gnus" => 0, |
||
155 | "Forte Agent" => 0, |
||
156 | "Forte Free Agent" => 0, |
||
157 | "MicroPlanet Gravity" => 0, |
||
158 | "Microsoft Outlook Express" => 0, |
||
159 | "Xnews" => 0, |
||
160 | "slrn" => 0, |
||
161 | "tin" => 0, |
||
162 | "rn" => 0, |
||
163 | "NN" => 0, |
||
164 | "MacSOUP" => 0, |
||
165 | "Foorum" => 0, |
||
166 | "MT-NewsWatcher" => 0, |
||
167 | "News Rover" => 0, |
||
168 | "WebTV" => 0, |
||
169 | "Compuserver" => 0, |
||
170 | "VSoup" => 0 |
||
171 | ); |
||
5 | PointedEar | 172 | |
173 | ######################## MAIN CODE ######################## |
||
174 | $! = 1; |
||
175 | |||
176 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
||
7 | PointedEar | 177 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
178 | while ( defined( $filename = readdir(DIR) ) ) |
||
179 | { |
||
180 | %lcheader = (); |
||
181 | next unless -f $filename; # only want real files |
||
182 | next if ( $filename eq ".overview" ); # real articles only |
||
183 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
||
184 | $earliest = ( stat $filename )[9] unless defined($earliest); |
||
185 | $latest = ( stat $filename )[9] unless defined($latest); |
||
186 | &getarticle($filename); # read in the article |
||
187 | &getdata; # grab the data from the article |
||
188 | $totalposts++; # bump count of articles considered |
||
5 | PointedEar | 189 | } |
7 | PointedEar | 190 | closedir(DIR); # finished with the directory |
191 | |||
5 | PointedEar | 192 | # post-processing |
7 | PointedEar | 193 | &countagents; # count agents, collapsing versions |
194 | &fixpercent; # check percentages orig/total for posters |
||
5 | PointedEar | 195 | |
196 | &writedata; |
||
197 | |||
198 | #################### DISPLAY RESULTS ##################### |
||
199 | print "=" x 76, "\n"; |
||
7 | PointedEar | 200 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
5 | PointedEar | 201 | print "=" x 76, "\n"; |
7 | PointedEar | 202 | printf "%s\n", |
203 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
||
5 | PointedEar | 204 | print "\n\n"; |
7 | PointedEar | 205 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
206 | $numdays; |
||
207 | printf "Earliest article: %s\n", scalar localtime($earliest); |
||
208 | printf "Latest article: %s\n", scalar localtime($latest); |
||
209 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
||
210 | commify($replies); |
||
5 | PointedEar | 211 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
7 | PointedEar | 212 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
5 | PointedEar | 213 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
7 | PointedEar | 214 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
215 | commify( int( $totsize / $totalposts ) ); |
||
5 | PointedEar | 216 | my $count = keys %data; |
217 | printf "Total headers: %s KB bodies: %s KB\n", |
||
7 | PointedEar | 218 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
5 | PointedEar | 219 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
7 | PointedEar | 220 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
221 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
||
222 | commify( int( $totsig / 1024 ) ); |
||
223 | printf "Total number of posters: %s, average %s bytes per poster\n", |
||
224 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
5 | PointedEar | 225 | $count = keys %threads; |
7 | PointedEar | 226 | printf "Total number of threads: %s, average %s bytes per thread\n", |
227 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
5 | PointedEar | 228 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
229 | print "\n", "=" x 76, "\n"; |
||
230 | |||
231 | ############################### |
||
232 | # show posters by article count Sec 1; |
||
233 | ############################### |
||
7 | PointedEar | 234 | unless ( $skipSec{1} ) |
235 | { |
||
236 | if ( keys %data < $topposters ) |
||
237 | { |
||
238 | $count = keys %data; |
||
239 | } |
||
240 | else |
||
241 | { |
||
242 | $count = $topposters; |
||
243 | } |
||
244 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
||
245 | print "=" x 76, "\n"; |
||
246 | $i = 0; |
||
9 | PointedEar | 247 | foreach |
248 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
||
7 | PointedEar | 249 | { |
250 | my $name = substr( $poster, 0, 65 ); |
||
251 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
252 | $data{$poster}{count}; |
||
253 | last if ( ++$i == $count ); |
||
254 | } |
||
255 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 256 | } |
257 | |||
258 | ################################ |
||
259 | # show posters by size in Kbytes Sec 2; |
||
260 | ################################ |
||
7 | PointedEar | 261 | unless ( $skipSec{2} ) |
262 | { |
||
263 | if ( keys %data < $topposters ) |
||
264 | { |
||
5 | PointedEar | 265 | $count = keys %data; |
7 | PointedEar | 266 | } |
267 | else |
||
268 | { |
||
5 | PointedEar | 269 | $count = $topposters; |
270 | } |
||
7 | PointedEar | 271 | printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
5 | PointedEar | 272 | print "=" x 76, "\n"; |
273 | $i = 0; |
||
8 | PointedEar | 274 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
7 | PointedEar | 275 | { |
276 | my $name = substr( $poster, 0, 62 ); |
||
277 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
278 | $data{$poster}{size} / 1024; #/ |
||
279 | last if ( ++$i == $count ); |
||
5 | PointedEar | 280 | } |
281 | print "\n", "=" x 76, "\n"; |
||
282 | } |
||
283 | |||
284 | #################################### |
||
285 | # show top posters for original text |
||
286 | #################################### |
||
7 | PointedEar | 287 | unless ( $skipSec{3} ) |
288 | { |
||
289 | if ( keys %data < $topposters ) |
||
290 | { |
||
291 | $count = keys %data; |
||
292 | } |
||
293 | else |
||
294 | { |
||
295 | $count = $topposters; |
||
296 | } |
||
297 | printf "%s\n", |
||
298 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
||
299 | print "=" x 76, "\n"; |
||
300 | $i = 0; |
||
9 | PointedEar | 301 | foreach my $poster ( |
302 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
||
303 | keys %data |
||
304 | ) |
||
7 | PointedEar | 305 | { |
306 | next if $data{$poster}{quoted} == 0; |
||
307 | next if $data{$poster}{count} < 5; |
||
308 | my $name = substr( $poster, 0, 63 ); |
||
309 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
310 | $data{$poster}{percent}; |
||
311 | last if ( ++$i == $count ); |
||
312 | } |
||
313 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 314 | } |
315 | |||
316 | ####################################### |
||
317 | # show bottom posters for original text |
||
318 | ####################################### |
||
7 | PointedEar | 319 | unless ( $skipSec{4} ) |
320 | { |
||
321 | if ( keys %data < $topposters ) |
||
322 | { |
||
5 | PointedEar | 323 | $count = keys %data; |
7 | PointedEar | 324 | } |
325 | else |
||
326 | { |
||
5 | PointedEar | 327 | $count = $topposters; |
328 | } |
||
7 | PointedEar | 329 | printf "%s\n", |
330 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
||
5 | PointedEar | 331 | print "=" x 76, "\n"; |
332 | $i = 0; |
||
9 | PointedEar | 333 | foreach my $poster ( |
334 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
||
335 | keys %data |
||
336 | ) |
||
7 | PointedEar | 337 | { |
5 | PointedEar | 338 | next if $data{$poster}{quoted} == 0; |
339 | next if $data{$poster}{count} < 5; |
||
7 | PointedEar | 340 | my $name = substr( $poster, 0, 63 ); |
341 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
342 | $data{$poster}{percent}; |
||
343 | last if ( ++$i == $count ); |
||
5 | PointedEar | 344 | } |
345 | print "\n", "=" x 76, "\n"; |
||
346 | } |
||
347 | |||
348 | #################################### |
||
349 | # show threads by number of articles |
||
350 | #################################### |
||
7 | PointedEar | 351 | unless ( $skipSec{5} ) |
352 | { |
||
353 | if ( keys %threads < $topthreads ) |
||
354 | { |
||
5 | PointedEar | 355 | $count = keys %threads; |
7 | PointedEar | 356 | } |
357 | else |
||
358 | { |
||
5 | PointedEar | 359 | $count = $topthreads; |
360 | } |
||
7 | PointedEar | 361 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
5 | PointedEar | 362 | print "=" x 76, "\n"; |
363 | $i = 0; |
||
9 | PointedEar | 364 | foreach my $thread ( |
365 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
||
366 | keys %threads |
||
367 | ) |
||
7 | PointedEar | 368 | { |
369 | my $name = substr( $thread, 0, 65 ); |
||
370 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
371 | $threads{$thread}{count}; |
||
372 | last if ( ++$i == $count ); |
||
5 | PointedEar | 373 | } |
374 | print "\n", "=" x 76, "\n"; |
||
375 | } |
||
376 | ################################ |
||
377 | # show threads by size in Kbytes |
||
378 | ################################ |
||
7 | PointedEar | 379 | unless ( $skipSec{6} ) |
380 | { |
||
381 | if ( keys %threads < $topthreads ) |
||
382 | { |
||
5 | PointedEar | 383 | $count = keys %threads; |
7 | PointedEar | 384 | } |
385 | else |
||
386 | { |
||
5 | PointedEar | 387 | $count = $topthreads; |
388 | } |
||
7 | PointedEar | 389 | printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
5 | PointedEar | 390 | print "=" x 76, "\n"; |
391 | $i = 0; |
||
9 | PointedEar | 392 | foreach my $thread ( |
393 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
||
394 | keys %threads |
||
395 | ) |
||
7 | PointedEar | 396 | { |
397 | my $name = substr( $thread, 0, 65 ); |
||
398 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
399 | $threads{$thread}{size} / 1024; #/ |
||
400 | last if ( ++$i == $count ); |
||
5 | PointedEar | 401 | } |
402 | print "\n", "=" x 76, "\n"; |
||
403 | } |
||
404 | |||
405 | ################################# |
||
406 | # show top 10 cross-posted groups |
||
407 | ################################# |
||
7 | PointedEar | 408 | unless ( $skipSec{7} ) |
409 | { |
||
410 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
||
411 | if ( keys %crossposts < $topcrossposts ) |
||
412 | { |
||
413 | $count = keys %crossposts; |
||
414 | } |
||
415 | else |
||
416 | { |
||
417 | $count = $topcrossposts; |
||
418 | } |
||
419 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
||
420 | print "=" x 76, "\n"; |
||
421 | $i = 0; |
||
422 | foreach |
||
423 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
||
424 | { |
||
425 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
426 | $crossposts{$name}; |
||
427 | last if ( ++$i == $count ); |
||
428 | } |
||
429 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 430 | } |
431 | ####################### |
||
432 | #show agents and counts |
||
433 | ####################### |
||
7 | PointedEar | 434 | unless ( $skipSec{8} ) |
435 | { |
||
436 | if ( keys %agents < $topagents ) |
||
437 | { |
||
438 | $count = keys %agents; |
||
439 | } |
||
440 | else |
||
441 | { |
||
442 | $count = $topagents; |
||
443 | } |
||
444 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
||
445 | print "=" x 76, "\n"; |
||
446 | $i = 0; |
||
447 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
||
448 | { |
||
449 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
||
450 | $agents{$agent}; |
||
451 | last if ( ++$i == $count ); |
||
452 | } |
||
453 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 454 | } |
455 | |||
456 | ####################### |
||
457 | #show distinct agents |
||
458 | ####################### |
||
7 | PointedEar | 459 | unless ( $skipSec{9} ) |
460 | { |
||
461 | if ( keys %distinct_agent < $topagents ) |
||
462 | { |
||
463 | $count = keys %distinct_agent; |
||
464 | } |
||
465 | else |
||
466 | { |
||
467 | $count = $topagents; |
||
468 | } |
||
469 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
||
470 | print "=" x 76, "\n"; |
||
471 | $i = 0; |
||
9 | PointedEar | 472 | foreach my $agent ( |
473 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
||
474 | keys %distinct_agent |
||
475 | ) |
||
7 | PointedEar | 476 | { |
477 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
||
478 | $distinct_agent{$agent}, |
||
479 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
||
480 | last if ( ++$i == $count ); |
||
481 | } |
||
482 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 483 | } |
484 | |||
485 | ########################## |
||
486 | #show timezones and counts |
||
487 | ########################## |
||
7 | PointedEar | 488 | unless ( $skipSec{10} ) |
489 | { |
||
490 | if ( keys %tz < $toptz ) |
||
491 | { |
||
492 | $count = keys %tz; |
||
493 | } |
||
494 | else |
||
495 | { |
||
496 | $count = $toptz; |
||
497 | } |
||
498 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
||
499 | print "=" x 76, "\n"; |
||
500 | $i = 0; |
||
501 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
||
502 | { |
||
503 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
||
504 | last if ( ++$i == $count ); |
||
505 | } |
||
506 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 507 | } |
508 | |||
509 | ################################ SUBROUTINES ################################ |
||
510 | |||
511 | ####################################### |
||
512 | # get current article's header and body |
||
513 | ####################################### |
||
7 | PointedEar | 514 | sub getarticle |
515 | { |
||
516 | %headers = (); # dump old headers |
||
517 | my $filename = shift; # get the name of the file |
||
5 | PointedEar | 518 | |
7 | PointedEar | 519 | # get stats about the file itself |
520 | $filesize = -s $filename; # get total size of file |
||
521 | $totsize += $filesize; # bump total sizes of all files |
||
522 | |||
523 | my $mtime = ( stat $filename )[9]; |
||
524 | if ( $mtime < $earliest ) |
||
525 | { |
||
5 | PointedEar | 526 | $earliest = $mtime; |
7 | PointedEar | 527 | } |
528 | elsif ( $mtime > $latest ) |
||
529 | { |
||
5 | PointedEar | 530 | $latest = $mtime; |
7 | PointedEar | 531 | } |
5 | PointedEar | 532 | |
7 | PointedEar | 533 | # now read the file |
9 | PointedEar | 534 | open( my $FILE, $filename ) or die "Can't open $filename: $!\n"; |
8 | PointedEar | 535 | while (<$FILE>) |
7 | PointedEar | 536 | { |
537 | $totheader += length($_); # bump total header size |
||
538 | last if (/^\s*$/); # end of header? |
||
539 | if (/^([^:\s]*):\s+(.*)/) |
||
540 | { |
||
541 | my ( $key, $val ) = ( $1, $2 ); |
||
542 | $headers{$key} = decode( 'MIME-Header', $val ); |
||
543 | $lcheader{ clean( lc($key) ) } = clean($val); |
||
544 | } |
||
5 | PointedEar | 545 | } |
9 | PointedEar | 546 | @body = <$FILE>; # slurp up body |
8 | PointedEar | 547 | close($FILE); |
7 | PointedEar | 548 | } # getarticle |
5 | PointedEar | 549 | |
550 | ################################### |
||
551 | # get data from the current article |
||
552 | ################################### |
||
7 | PointedEar | 553 | sub getdata |
554 | { |
||
5 | PointedEar | 555 | #### First, analyse header fields #### |
556 | |||
7 | PointedEar | 557 | # Set up this poster if not defined, get counts, sizes |
9 | PointedEar | 558 | my $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name |
7 | PointedEar | 559 | if ( !defined( $data{$poster} ) ) |
9 | PointedEar | 560 | { # seen this one before? |
561 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
||
7 | PointedEar | 562 | $data{$poster}{orig} = 0; |
563 | $data{$poster}{quoted} = 0; |
||
564 | } |
||
9 | PointedEar | 565 | $data{$poster}{count}++; # bump count for this poster |
566 | $data{$poster}{size} += $filesize; # total size of file |
||
5 | PointedEar | 567 | |
7 | PointedEar | 568 | # The User-Agent and/or X-Newsreader fields |
569 | # for User-Agent by poster |
||
570 | if ( defined $lcheader{"user-agent"} ) |
||
571 | { |
||
572 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
||
573 | } |
||
574 | if ( defined $lcheader{"x-newsreader"} ) |
||
575 | { |
||
576 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
||
577 | } |
||
5 | PointedEar | 578 | |
7 | PointedEar | 579 | # The User Agent for User-Agent by number of posts |
580 | my $UA = "unknown"; |
||
581 | foreach my $keys ( keys %lcheader ) |
||
582 | { |
||
583 | if ( defined $lcheader{'user-agent'} ) |
||
584 | { |
||
585 | $UA = $lcheader{'user-agent'}; |
||
586 | } |
||
587 | elsif ( defined $lcheader{"x-newsreader"} ) |
||
588 | { |
||
589 | $UA = $lcheader{"x-newsreader"}; |
||
590 | } |
||
591 | elsif ( defined $lcheader{'x-mailer'} ) |
||
592 | { |
||
593 | $UA = $lcheader{'x-mailer'}; |
||
594 | } |
||
595 | elsif ( |
||
596 | ( defined $lcheader{'organization'} ) |
||
597 | && ( $lcheader{'organization'} =~ |
||
598 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
||
599 | ) |
||
600 | { |
||
601 | $UA = $lcheader{'organization'}; |
||
602 | } |
||
603 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
||
604 | { |
||
605 | $UA = "Pine"; |
||
606 | } ## Hopefully found UA, else set to unknown |
||
607 | } |
||
5 | PointedEar | 608 | |
7 | PointedEar | 609 | $UA = clean($UA); |
610 | $UA = get_agent($UA); |
||
5 | PointedEar | 611 | |
7 | PointedEar | 612 | sub get_agent |
613 | { |
||
614 | my $raw = shift; |
||
615 | my $agent = $raw; |
||
5 | PointedEar | 616 | |
7 | PointedEar | 617 | ## strip http |
618 | if ( $raw =~ /.*http.*/ ) |
||
619 | { |
||
620 | $raw =~ s!posted via!!i; |
||
621 | $raw =~ s!http://!!g; |
||
622 | $raw =~ s!/!!g; |
||
623 | $raw =~ s! !!g; |
||
624 | } |
||
5 | PointedEar | 625 | |
7 | PointedEar | 626 | ## Fix Outlook from Mac |
627 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
||
5 | PointedEar | 628 | |
7 | PointedEar | 629 | ## Pick out the popular agents |
630 | if ( $raw =~ /(outlook express)/i |
||
631 | || $raw =~ /(microplanet gravity)/i |
||
632 | || $raw =~ /(news rover)/i |
||
633 | || $raw =~ /(forte agent)/i |
||
634 | || $raw =~ /(forte free agent)/i ) |
||
635 | { |
||
636 | $agent = $1; |
||
637 | } |
||
638 | elsif ( |
||
639 | $raw =~ /^( |
||
5 | PointedEar | 640 | pan |
641 | |sylpheed |
||
642 | |slrn |
||
643 | |mozilla |
||
644 | |knode |
||
645 | |tin |
||
646 | |hamster |
||
647 | |xrn |
||
648 | |xnews |
||
649 | |aol |
||
650 | |gnus |
||
651 | |krn |
||
652 | |macsoup |
||
653 | |messenger |
||
654 | |openxp |
||
655 | |pine |
||
656 | |thoth |
||
657 | |turnpike |
||
658 | |winvn |
||
659 | |vsoup |
||
660 | |google |
||
661 | |supernews |
||
662 | |nn |
||
663 | |rn |
||
664 | |007 |
||
665 | |webtv |
||
666 | |compuserve |
||
7 | PointedEar | 667 | )/ix |
668 | ) |
||
669 | { |
||
670 | $agent = $1; |
||
671 | } |
||
672 | else |
||
673 | { |
||
674 | ## Clean up unknown agents |
||
675 | if ( $raw =~ m!^(.*?)/! ) |
||
676 | { |
||
677 | $agent = $1; |
||
678 | } |
||
679 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
||
680 | { |
||
681 | $agent = $1; |
||
682 | } |
||
683 | } |
||
684 | |||
685 | $distinct_agent{$agent}++; |
||
686 | return $agent; |
||
5 | PointedEar | 687 | } |
688 | |||
7 | PointedEar | 689 | # Get all cross-posted newsgroups |
690 | for ( split /,/, $headers{"Newsgroups"} ) |
||
691 | { |
||
692 | $crossposts{$_}++; # bump count for each |
||
693 | } |
||
5 | PointedEar | 694 | |
7 | PointedEar | 695 | # Get threads |
696 | my $thread = encode( 'UTF-8', $headers{"Subject"} ); |
||
697 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
||
698 | $thread =~ s/\s+/ /g; # collapse whitespace |
||
699 | $threads{$thread}{count} += 1; # bump count of this subject |
||
700 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
||
5 | PointedEar | 701 | |
7 | PointedEar | 702 | # Is this an original post or a reply? |
703 | if ( defined $headers{"References"} ) |
||
704 | { |
||
705 | $replies++; |
||
706 | } |
||
707 | else |
||
708 | { |
||
709 | $origposts++; |
||
710 | } |
||
5 | PointedEar | 711 | |
7 | PointedEar | 712 | # Get the time zone |
713 | $_ = $headers{"Date"}; |
||
714 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
||
9 | PointedEar | 715 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
7 | PointedEar | 716 | { |
717 | $tz = "UTC"; |
||
718 | } |
||
719 | $tz{$tz}++; |
||
5 | PointedEar | 720 | |
7 | PointedEar | 721 | #### Now analyse the body text #### |
722 | my $insig = 0; |
||
723 | for (@body) |
||
724 | { |
||
725 | $totbody += length($_); # bump total body size |
||
726 | next if (/^$>/); # don't count blank lines in body |
||
727 | if ( $insig == 1 ) |
||
728 | { |
||
729 | $totsig += length($_); # bump total sig size |
||
5 | PointedEar | 730 | |
7 | PointedEar | 731 | # Bill Unruh uses ] quotes, and another poster uses :: |
732 | } |
||
733 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
||
734 | { # are we in a quote line? |
||
735 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
||
9 | PointedEar | 736 | $totquoted += length($_); |
7 | PointedEar | 737 | } |
738 | elsif (/-- /) |
||
739 | { |
||
740 | $insig = 1; |
||
741 | } |
||
742 | else |
||
743 | { |
||
5 | PointedEar | 744 | |
7 | PointedEar | 745 | # we must be processing an original line |
746 | $data{$poster}{orig} += length($_); # bump count of original chrs |
||
9 | PointedEar | 747 | $totorig += length($_); |
7 | PointedEar | 748 | } |
749 | } # end for (@body) |
||
5 | PointedEar | 750 | |
7 | PointedEar | 751 | } # getdata |
5 | PointedEar | 752 | |
753 | ######################################## |
||
754 | # Count the User-Agents used, collapsing |
||
755 | # different versions into one per agent. |
||
756 | ######################################## |
||
7 | PointedEar | 757 | sub countagents |
758 | { |
||
5 | PointedEar | 759 | POSTER: |
8 | PointedEar | 760 | foreach my $poster ( keys %data ) |
7 | PointedEar | 761 | { |
762 | foreach my $agent_name ( keys %distinct_agent ) |
||
763 | { # check against known ones |
||
764 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
||
765 | { |
||
766 | $agents{$agent_name}++; |
||
767 | next POSTER; |
||
768 | } |
||
769 | } |
||
770 | $agents{ $data{$poster}{agent} }++; |
||
771 | } |
||
772 | } # countagents |
||
5 | PointedEar | 773 | |
774 | ############################################ |
||
775 | # set orig/total percentages for all posters |
||
776 | ############################################ |
||
7 | PointedEar | 777 | sub fixpercent |
778 | { |
||
8 | PointedEar | 779 | foreach my $poster ( keys %data ) |
7 | PointedEar | 780 | { |
781 | my $percent = 100; |
||
782 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
||
783 | { |
||
9 | PointedEar | 784 | $percent = |
785 | $data{$poster}{orig} * 100 / |
||
7 | PointedEar | 786 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
787 | } |
||
788 | elsif ( $data{$poster}{orig} == 0 ) |
||
789 | { |
||
790 | $percent = 0; |
||
791 | } |
||
792 | $data{$poster}{percent} = $percent; |
||
793 | } |
||
5 | PointedEar | 794 | } |
795 | |||
796 | ############################## |
||
797 | # right pad a string with '.'s |
||
798 | ############################## |
||
7 | PointedEar | 799 | sub rpad |
800 | { |
||
801 | |||
802 | # get text to pad, length to pad, pad chr |
||
803 | my ( $text, $pad_len, $pad_chr ) = @_; |
||
804 | if ( length($text) > $pad_len ) |
||
805 | { |
||
806 | $text = substr( $text, 0, $pad_len ); |
||
807 | } |
||
808 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
||
809 | return $padded; |
||
5 | PointedEar | 810 | } |
811 | |||
812 | ################# |
||
813 | # centre a string |
||
814 | ################# |
||
7 | PointedEar | 815 | sub centred |
816 | { |
||
817 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
||
818 | my $pad_len = ( $width - length($text) ) / 2; #/ |
||
819 | my $centred = " " x $pad_len . $text; |
||
820 | return $centred; |
||
5 | PointedEar | 821 | } |
822 | |||
823 | ########################## |
||
824 | # put commas into a number |
||
825 | ########################## |
||
7 | PointedEar | 826 | sub commify |
827 | { |
||
828 | $_ = shift; |
||
829 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
||
830 | return $_; |
||
5 | PointedEar | 831 | } |
832 | |||
833 | ######################### |
||
834 | # clean |
||
835 | ######################### |
||
7 | PointedEar | 836 | sub clean |
837 | { |
||
838 | my $dirty = shift; |
||
839 | my $clean = $dirty; |
||
840 | $clean =~ s/^\s*//; |
||
841 | $clean =~ s/\s*$//; |
||
5 | PointedEar | 842 | |
7 | PointedEar | 843 | return $clean; |
5 | PointedEar | 844 | } |
845 | |||
7 | PointedEar | 846 | sub usage |
847 | { |
||
5 | PointedEar | 848 | |
7 | PointedEar | 849 | print "usage: newstat.pl newsgroupname\n"; |
850 | exit 1; |
||
5 | PointedEar | 851 | } |
852 | |||
853 | ################################### |
||
854 | # Write data structures to a file # |
||
855 | ################################### |
||
7 | PointedEar | 856 | sub writedata |
857 | { |
||
8 | PointedEar | 858 | open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
859 | print $OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
||
860 | print $OUTF |
||
7 | PointedEar | 861 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
862 | foreach my $name ( keys %data ) |
||
863 | { |
||
8 | PointedEar | 864 | print $OUTF |
7 | PointedEar | 865 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
866 | } |
||
8 | PointedEar | 867 | print $OUTF |
7 | PointedEar | 868 | "============================================================================\n"; |
8 | PointedEar | 869 | print $OUTF "Thread subjects\n"; |
870 | print $OUTF |
||
7 | PointedEar | 871 | "----------------------------------------------------------------------------\n"; |
872 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
||
873 | { |
||
9 | PointedEar | 874 | print $OUTF |
875 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
||
7 | PointedEar | 876 | } |
8 | PointedEar | 877 | print $OUTF |
7 | PointedEar | 878 | "============================================================================\n"; |
8 | PointedEar | 879 | print $OUTF "Cross-posts\n"; |
880 | print $OUTF |
||
7 | PointedEar | 881 | "----------------------------------------------------------------------------\n"; |
882 | foreach my $name ( sort keys %crossposts ) |
||
883 | { |
||
8 | PointedEar | 884 | print $OUTF "$name : $crossposts{$name}\n"; |
7 | PointedEar | 885 | } |
8 | PointedEar | 886 | print $OUTF print $OUTF |
7 | PointedEar | 887 | "============================================================================\n"; |
8 | PointedEar | 888 | print $OUTF "User agents\n"; |
889 | print $OUTF |
||
7 | PointedEar | 890 | "----------------------------------------------------------------------------\n"; |
891 | foreach my $name ( sort keys %agents ) |
||
892 | { |
||
8 | PointedEar | 893 | print $OUTF "$name : $agents{$name}\n"; |
7 | PointedEar | 894 | } |
8 | PointedEar | 895 | print $OUTF |
7 | PointedEar | 896 | "============================================================================\n"; |
8 | PointedEar | 897 | print $OUTF "Time zones\n"; |
898 | print $OUTF |
||
7 | PointedEar | 899 | "----------------------------------------------------------------------------\n"; |
900 | foreach my $name ( sort keys %tz ) |
||
901 | { |
||
8 | PointedEar | 902 | print $OUTF "$name : $tz{$name}\n"; |
7 | PointedEar | 903 | } |
8 | PointedEar | 904 | close $OUTF; |
7 | PointedEar | 905 | } # writedata |