Rev 5 | Rev 7 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5 | PointedEar | 1 | #!/usr/bin/perl -w |
2 | use strict; |
||
6 | PointedEar | 3 | use warnings; |
4 | use utf8; |
||
5 | use Encode; |
||
6 | |||
5 | PointedEar | 7 | ######################### |
6 | PointedEar | 8 | # newsstat.pl version 0.4 |
5 | PointedEar | 9 | |
6 | PointedEar | 10 | ############################################################################ |
11 | # Collect statistics about a newsgroup (specified by first argument) in |
||
12 | # the local news spool. Check all articles in the last 30-day period. |
||
13 | # Rank posters by number of posts and by volume of posts, report on top and |
||
14 | # bottom 20 posters. Show their name, number of posts, size of posts, |
||
15 | # percentage of quoted lines. Rank user-agents used, by poster rather than |
||
16 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
||
17 | # |
||
18 | # (Numbers and paths can be configured below. -- PE) |
||
19 | ############################################################################ |
||
5 | PointedEar | 20 | |
6 | PointedEar | 21 | ############################################################################ |
22 | # RECENT CHANGES # |
||
23 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
||
24 | # - Use warnings, utf8 (just in case) |
||
25 | # - Documentation update |
||
26 | # N/A NN - Take newsgroup name as argument |
||
27 | # 2004-06-19 NN - newsgroup name is $ARGV[0] |
||
28 | # - Allow command line flags for subtracting |
||
29 | # output if not pertinent for a group |
||
30 | # 2002-11-09 NN - Put Garry's writedata() function back in. |
||
31 | # - added "rn" to my list of UA's |
||
32 | # - Started using %distinct_agent for both User agent |
||
33 | # sections |
||
34 | # - named it newsstat.pl version 0.3 |
||
35 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using |
||
36 | # mtime rather than ctime, and simplifying the logic |
||
37 | # 2002-11-05 NN - moved user configurations to the top |
||
38 | # - fixed the cross-posting section |
||
39 | # - introduced the $newsgroup_name variable which |
||
40 | # later becomes $news$group |
||
41 | # - changed $name to $agent_name in countagents() |
||
42 | # |
||
43 | # Contributors |
||
44 | # ------------- |
||
45 | # NN Nomen nominandum (name to be determined later) |
||
46 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de> |
||
5 | PointedEar | 47 | |
6 | PointedEar | 48 | ########### TODO ############# |
5 | PointedEar | 49 | # Commas in bottom section of report |
50 | # Show date the figures were compiled |
||
51 | # No. of HTML articles (Content-Type: text/html) |
||
52 | # No. of quoted sigs (/>\s*-- /) |
||
53 | # Per cent of top-posted articles |
||
54 | # Top 10 cross-posters |
||
55 | # Top 20 news posting hosts (from Path) |
||
56 | # Count of certain subject words: newbie, kde, burner, sendmail, etc. |
||
57 | # Count *all* User Agents that each poster uses |
||
58 | # What do we do about Bill Unruh's ] quote style? |
||
59 | # Change the way dates/times are checked |
||
60 | # include % share in posters by no. of arts |
||
61 | # include % share in posters by size |
||
62 | # Total, orig & quoted lines by user agent with per cent |
||
6 | PointedEar | 63 | # Take more arguments |
5 | PointedEar | 64 | ####################################################### |
65 | |||
66 | ###################### USER CONFIGURATIONS ############################ |
||
67 | |||
68 | # The name of the group to do stats for |
||
69 | my $newsgroup_name = $ARGV[0]; |
||
6 | PointedEar | 70 | $newsgroup_name or &usage; |
5 | PointedEar | 71 | |
72 | # Check for removal flags |
||
73 | my $ix; |
||
74 | my $j; |
||
75 | my %skipSec; |
||
76 | my @skiplist; |
||
77 | my $args = @ARGV; |
||
78 | for ( $ix = 1 ; $ix < $args ; $ix++ ) { |
||
79 | $j = $ix + 1; |
||
80 | if ( $ARGV[$ix] eq "-x" ) { |
||
81 | @skiplist = split(",",$ARGV[$j]); |
||
82 | } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { |
||
83 | @skiplist = split(",",$1); |
||
84 | } |
||
85 | } |
||
86 | foreach(@skiplist) { |
||
87 | $skipSec{$_} = 1; |
||
88 | } |
||
89 | |||
90 | # Leafnode users will want /var/spool/news for this variable. |
||
91 | my $news = "/var/spool/news/"; |
||
92 | |||
93 | # How many days are we doing statistics for? |
||
94 | my $numdays = 30; |
||
95 | |||
96 | # no. of agents we list |
||
97 | my $topagents = 10; |
||
98 | |||
99 | # no. of threads we want to know about |
||
100 | my $topthreads = 20; |
||
101 | |||
102 | # no. of top or bottom posters to show |
||
103 | my $topposters = 20; |
||
104 | |||
105 | # no. of cross-posted threads to show |
||
106 | my $topcrossposts = 10; |
||
107 | |||
108 | # no. of time zones to show |
||
109 | my $toptz = 10; |
||
110 | |||
111 | |||
112 | |||
113 | ###################### DATA STRUCTURES ###################### |
||
114 | my $group = $newsgroup_name; |
||
115 | $group =~ s!\.!/!g; |
||
116 | my %data; # name, count, agent, total, orig, quoted |
||
117 | my %threads; # subject, count |
||
118 | my %crossposts; # group, count |
||
119 | my %tz; # timezones by count |
||
120 | my %headers; # holds header of current article |
||
121 | my %lcheader; # holds lowercase headers |
||
122 | my @body; # holds body of current article |
||
123 | my @sig; # holds sig text; |
||
124 | my $totalposts; # total no. of posts considered |
||
125 | my $filename; # name of current article file |
||
126 | my $filesize; # size of current article file |
||
127 | my $earliest; # earliest article we have found |
||
128 | my $latest; # latest article we have found |
||
129 | my $poster; # poster we are dealing with |
||
130 | my $totsize = 0; # holds total sizes of all files |
||
131 | my $totheader = 0; # total size of header material |
||
132 | my $totbody = 0; # total size of body material |
||
133 | my $totsig = 0; # total size of sig material |
||
134 | my $totorig = 0; # total size of original material |
||
135 | my $totquoted = 0; # total size of quoted material |
||
136 | my $origposts = 0; # total no. of original posts |
||
137 | my $replies = 0; # total no. of replies |
||
138 | my $i; # general purpose |
||
139 | my %distinct_agent; |
||
140 | my %agents = # used to hold counts of User Agents used |
||
141 | ( "KNode" => 0, |
||
142 | "Pan" => 0, |
||
143 | "Mozilla" => 0, |
||
144 | "Sylpheed" => 0, |
||
145 | "Gnus" => 0, |
||
146 | "Forte Agent" => 0, |
||
147 | "Forte Free Agent" => 0, |
||
148 | "MicroPlanet Gravity" => 0, |
||
149 | "Microsoft Outlook Express" => 0, |
||
150 | "Xnews" => 0, |
||
151 | "slrn" => 0, |
||
152 | "tin" => 0, |
||
153 | "rn" => 0, |
||
154 | "NN" => 0, |
||
155 | "MacSOUP" => 0, |
||
156 | "Foorum" => 0, |
||
157 | "MT-NewsWatcher" => 0, |
||
158 | "News Rover" => 0, |
||
159 | "WebTV" => 0, |
||
160 | "Compuserver" => 0, |
||
161 | "VSoup" => 0); |
||
162 | |||
163 | ######################## MAIN CODE ######################## |
||
164 | $! = 1; |
||
165 | |||
166 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
||
167 | opendir(DIR, ".") or die "Can't open $news$group directory: $!\n"; |
||
168 | while (defined($filename = readdir(DIR))) { |
||
169 | %lcheader = (); |
||
170 | next unless -f $filename; # only want real files |
||
171 | next if ($filename eq ".overview"); # real articles only |
||
172 | next if (-M $filename > $numdays); # only want articles <= a certain age |
||
173 | $earliest = (stat $filename)[9] unless defined ($earliest); |
||
174 | $latest = (stat $filename)[9] unless defined ($latest); |
||
175 | &getarticle($filename); # read in the article |
||
176 | &getdata; # grab the data from the article |
||
177 | $totalposts++; # bump count of articles considered |
||
178 | } |
||
179 | closedir(DIR); # finished with the directory |
||
180 | # post-processing |
||
181 | &countagents; # count agents, collapsing versions |
||
182 | &fixpercent; # check percentages orig/total for posters |
||
183 | |||
184 | &writedata; |
||
185 | |||
186 | #################### DISPLAY RESULTS ##################### |
||
187 | print "=" x 76, "\n"; |
||
188 | printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
||
189 | print "=" x 76, "\n"; |
||
6 | PointedEar | 190 | printf "%s\n", ¢red("(stats compiled with a script by Garry Knight et al.)", 76); |
5 | PointedEar | 191 | print "\n\n"; |
192 | printf "Total posts considered: %s over %d days\n", |
||
193 | commify($totalposts), $numdays; |
||
194 | printf "Earliest article: %s\n", scalar localtime($earliest); |
||
195 | printf "Latest article: %s\n", scalar localtime($latest); |
||
196 | printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies); |
||
197 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
||
198 | commify(int($totsize / 1024)), $totsize / 1048576; # |
||
199 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
||
200 | commify(int($totalposts / $numdays)), |
||
201 | $totsize / $numdays / 1048576, commify(int($totsize / $totalposts)); |
||
202 | my $count = keys %data; |
||
203 | printf "Total headers: %s KB bodies: %s KB\n", |
||
204 | commify(int($totheader / 1024)), commify(int($totbody / 1024)); |
||
205 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
||
206 | commify(int($totquoted / 1024)), commify(int($totorig / 1024)), |
||
207 | ($totorig * 100) / ($totorig + $totquoted), commify(int($totsig / 1024)); |
||
208 | printf "Total number of posters: %s, average %s bytes per poster\n", commify($count), |
||
209 | commify(int($totsize / $count)); #/ |
||
210 | $count = keys %threads; |
||
211 | printf "Total number of threads: %s, average %s bytes per thread\n", commify($count), |
||
212 | commify(int($totsize / $count)); #/ |
||
213 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
||
214 | print "\n", "=" x 76, "\n"; |
||
215 | |||
216 | ############################### |
||
217 | # show posters by article count Sec 1; |
||
218 | ############################### |
||
219 | unless ( $skipSec{1} ) { |
||
220 | if (keys %data < $topposters) { |
||
221 | $count = keys %data; |
||
222 | } else { |
||
223 | $count = $topposters; |
||
224 | } |
||
225 | printf "%s\n", ¢red("Top $count posters by number of articles", 76); |
||
226 | print "=" x 76, "\n"; |
||
227 | $i = 0; |
||
228 | foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) { |
||
229 | my $name = substr($poster, 0, 65); |
||
230 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count}; |
||
231 | last if (++$i == $count); |
||
232 | } |
||
233 | print "\n", "=" x 76, "\n"; |
||
234 | } |
||
235 | |||
236 | ################################ |
||
237 | # show posters by size in Kbytes Sec 2; |
||
238 | ################################ |
||
239 | unless ( $skipSec{2} ) { |
||
240 | if (keys %data < $topposters) { |
||
241 | $count = keys %data; |
||
242 | } else { |
||
243 | $count = $topposters; |
||
244 | } |
||
245 | printf "%s\n", ¢red("Top $count posters by article size in Kbytes", 76); |
||
246 | print "=" x 76, "\n"; |
||
247 | $i = 0; |
||
248 | foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) { |
||
249 | my $name = substr($poster, 0, 62); |
||
250 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024; #/ |
||
251 | last if (++$i == $count); |
||
252 | } |
||
253 | print "\n", "=" x 76, "\n"; |
||
254 | } |
||
255 | |||
256 | #################################### |
||
257 | # show top posters for original text |
||
258 | #################################### |
||
259 | unless ( $skipSec{3} ) { |
||
260 | if (keys %data < $topposters) { |
||
261 | $count = keys %data; |
||
262 | } else { |
||
263 | $count = $topposters; |
||
264 | } |
||
265 | printf "%s\n", ¢red("Top $count responders by original text (> 5 posts)", 76); |
||
266 | print "=" x 76, "\n"; |
||
267 | $i = 0; |
||
268 | foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) { |
||
269 | next if $data{$poster}{quoted} == 0; |
||
270 | next if $data{$poster}{count} < 5; |
||
271 | my $name = substr($poster, 0, 63); |
||
272 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
||
273 | last if (++$i == $count); |
||
274 | } |
||
275 | print "\n", "=" x 76, "\n"; |
||
276 | } |
||
277 | |||
278 | ####################################### |
||
279 | # show bottom posters for original text |
||
280 | ####################################### |
||
281 | unless ( $skipSec{4} ) { |
||
282 | if (keys %data < $topposters) { |
||
283 | $count = keys %data; |
||
284 | } else { |
||
285 | $count = $topposters; |
||
286 | } |
||
287 | printf "%s\n", ¢red("Bottom $count responders by original text (> 5 posts)", 76); |
||
288 | print "=" x 76, "\n"; |
||
289 | $i = 0; |
||
290 | foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) { |
||
291 | next if $data{$poster}{quoted} == 0; |
||
292 | next if $data{$poster}{count} < 5; |
||
293 | my $name = substr($poster, 0, 63); |
||
294 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
||
295 | last if (++$i == $count); |
||
296 | } |
||
297 | print "\n", "=" x 76, "\n"; |
||
298 | } |
||
299 | |||
300 | #################################### |
||
301 | # show threads by number of articles |
||
302 | #################################### |
||
303 | unless ( $skipSec{5} ) { |
||
304 | if (keys %threads < $topthreads) { |
||
305 | $count = keys %threads; |
||
306 | } else { |
||
307 | $count = $topthreads; |
||
308 | } |
||
309 | printf "%s\n", ¢red("Top $count threads by no. of articles", 76); |
||
310 | print "=" x 76, "\n"; |
||
311 | $i = 0; |
||
312 | foreach my $thread (sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads) { |
||
313 | my $name = substr($thread, 0, 65); |
||
314 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{count}; |
||
315 | last if (++$i == $count); |
||
316 | } |
||
317 | print "\n", "=" x 76, "\n"; |
||
318 | } |
||
319 | ################################ |
||
320 | # show threads by size in Kbytes |
||
321 | ################################ |
||
322 | unless ( $skipSec{6} ) { |
||
323 | if (keys %threads < $topthreads) { |
||
324 | $count = keys %threads; |
||
325 | } else { |
||
326 | $count = $topthreads; |
||
327 | } |
||
328 | printf "%s\n", ¢red("Top $count threads by size in KB", 76); |
||
329 | print "=" x 76, "\n"; |
||
330 | $i = 0; |
||
331 | foreach my $thread (sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads) { |
||
332 | my $name = substr($thread, 0, 65); |
||
333 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{size} / 1024; #/ |
||
334 | last if (++$i == $count); |
||
335 | } |
||
336 | print "\n", "=" x 76, "\n"; |
||
337 | } |
||
338 | |||
339 | ################################# |
||
340 | # show top 10 cross-posted groups |
||
341 | ################################# |
||
342 | unless ( $skipSec{7} ) { |
||
343 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
||
344 | if (keys %crossposts < $topcrossposts) { |
||
345 | $count = keys %crossposts; |
||
346 | } else { |
||
347 | $count = $topcrossposts; |
||
348 | } |
||
349 | printf "%s\n", ¢red("Top $count cross-posted groups", 76); |
||
350 | print "=" x 76, "\n"; |
||
351 | $i = 0; |
||
352 | foreach my $name (sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts) { |
||
353 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $crossposts{$name}; |
||
354 | last if (++$i == $count); |
||
355 | } |
||
356 | print "\n", "=" x 76, "\n"; |
||
357 | } |
||
358 | ####################### |
||
359 | #show agents and counts |
||
360 | ####################### |
||
361 | unless ( $skipSec{8} ) { |
||
362 | if (keys %agents < $topagents) { |
||
363 | $count = keys %agents; |
||
364 | } else { |
||
365 | $count = $topagents; |
||
366 | } |
||
367 | printf "%s\n", ¢red("Top $count User Agents by poster", 76); |
||
368 | print "=" x 76, "\n"; |
||
369 | $i = 0; |
||
370 | foreach my $agent (sort {$agents{$b} <=> $agents{$a}} keys %agents) { |
||
371 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($agent, 63, "."), $agents{$agent}; |
||
372 | last if (++$i == $count); |
||
373 | } |
||
374 | print "\n", "=" x 76, "\n"; |
||
375 | } |
||
376 | |||
377 | ####################### |
||
378 | #show distinct agents |
||
379 | ####################### |
||
380 | unless ( $skipSec{9} ) { |
||
381 | if (keys %distinct_agent < $topagents) { |
||
382 | $count = keys %distinct_agent; |
||
383 | } else { |
||
384 | $count = $topagents; |
||
385 | } |
||
386 | printf "%s\n", ¢red("Top $count User Agents by number of posts", 76); |
||
387 | print "=" x 76, "\n"; |
||
388 | $i = 0; |
||
389 | foreach my $agent (sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent) { |
||
390 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($agent, 58, "."), $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100); |
||
391 | last if (++$i == $count); |
||
392 | } |
||
393 | print "\n", "=" x 76, "\n"; |
||
394 | } |
||
395 | |||
396 | ########################## |
||
397 | #show timezones and counts |
||
398 | ########################## |
||
399 | unless ( $skipSec{10} ) { |
||
400 | if (keys %tz < $toptz) { |
||
401 | $count = keys %tz; |
||
402 | } else { |
||
403 | $count = $toptz; |
||
404 | } |
||
405 | printf "%s\n", ¢red("Top 10 time zones", 76); |
||
406 | print "=" x 76, "\n"; |
||
407 | $i = 0; |
||
408 | foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) { |
||
409 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone}; |
||
410 | last if (++$i == $count); |
||
411 | } |
||
412 | print "\n", "=" x 76, "\n"; |
||
413 | } |
||
414 | |||
415 | |||
416 | ################################ SUBROUTINES ################################ |
||
417 | |||
418 | |||
419 | ####################################### |
||
420 | # get current article's header and body |
||
421 | ####################################### |
||
422 | sub getarticle { |
||
423 | %headers = (); # dump old headers |
||
424 | my $filename = shift; # get the name of the file |
||
425 | # get stats about the file itself |
||
426 | $filesize = -s $filename; # get total size of file |
||
427 | $totsize += $filesize; # bump total sizes of all files |
||
428 | |||
429 | my $mtime = (stat $filename)[9]; |
||
430 | if ( $mtime < $earliest ) { |
||
431 | $earliest = $mtime; |
||
432 | } elsif ( $mtime > $latest ) { |
||
433 | $latest = $mtime; |
||
434 | } |
||
435 | |||
436 | # now read the file |
||
437 | open(FILE, $filename) or die "Can't open $filename: $!\n"; |
||
438 | while (<FILE>) { |
||
439 | $totheader += length($_); # bump total header size |
||
440 | last if (/^\s*$/); # end of header? |
||
441 | if (/^([^:\s]*):\s+(.*)/) { |
||
442 | my($key,$val) = ($1,$2); |
||
6 | PointedEar | 443 | $headers{$key} = decode('MIME-Header', $val); |
5 | PointedEar | 444 | $lcheader{clean(lc($key))} = clean($val); |
445 | } |
||
446 | } |
||
447 | @body = <FILE>; # slurp up body |
||
448 | close(FILE); |
||
449 | } # getarticle |
||
450 | |||
451 | ################################### |
||
452 | # get data from the current article |
||
453 | ################################### |
||
454 | sub getdata { |
||
455 | #### First, analyse header fields #### |
||
456 | |||
457 | # Set up this poster if not defined, get counts, sizes |
||
6 | PointedEar | 458 | $poster = encode('UTF-8', $headers{From}); # get the poster's name |
5 | PointedEar | 459 | if (!defined($data{$poster})) { # seen this one before? |
460 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
||
461 | $data{$poster}{orig} = 0; |
||
462 | $data{$poster}{quoted} = 0; |
||
463 | } |
||
464 | $data{$poster}{count}++; # bump count for this poster |
||
465 | $data{$poster}{size} += $filesize; # total size of file |
||
466 | |||
467 | # The User-Agent and/or X-Newsreader fields |
||
468 | # for User-Agent by poster |
||
469 | if (defined $lcheader{"user-agent"}) { |
||
470 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
||
471 | } |
||
472 | if (defined $lcheader{"x-newsreader"}) { |
||
473 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
||
474 | } |
||
475 | |||
476 | # The User Agent for User-Agent by number of posts |
||
477 | my $UA = "unknown"; |
||
478 | foreach my $keys ( keys %lcheader ) |
||
479 | { |
||
480 | if (defined $lcheader{'user-agent'}) |
||
481 | { |
||
482 | $UA = $lcheader{'user-agent'}; |
||
483 | } |
||
484 | elsif (defined $lcheader{"x-newsreader"}) |
||
485 | { |
||
486 | $UA = $lcheader{"x-newsreader"}; |
||
487 | } |
||
488 | elsif (defined $lcheader{'x-mailer'}) |
||
489 | { |
||
490 | $UA = $lcheader{'x-mailer'}; |
||
491 | } |
||
492 | elsif ((defined $lcheader{'organization'}) && |
||
493 | ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) |
||
494 | { |
||
495 | $UA = $lcheader{'organization'}; |
||
496 | } |
||
497 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
||
498 | { |
||
499 | $UA = "Pine"; |
||
500 | } ## Hopefully found UA, else set to unknown |
||
501 | } |
||
502 | |||
503 | |||
504 | $UA = clean($UA); |
||
505 | $UA = get_agent($UA); |
||
506 | |||
507 | |||
508 | sub get_agent { |
||
509 | my $raw = shift; |
||
510 | my $agent = $raw; |
||
511 | |||
512 | ## strip http |
||
513 | if ( $raw =~ /.*http.*/ ) { |
||
514 | $raw =~ s!posted via!!i; |
||
515 | $raw =~ s!http://!!g; |
||
516 | $raw =~ s!/!!g; |
||
517 | $raw =~ s! !!g; |
||
518 | } |
||
519 | |||
520 | ## Fix Outlook from Mac |
||
521 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;} |
||
522 | |||
523 | ## Pick out the popular agents |
||
524 | if ( $raw =~ /(outlook express)/i || |
||
525 | $raw =~ /(microplanet gravity)/i || |
||
526 | $raw =~ /(news rover)/i || |
||
527 | $raw =~ /(forte agent)/i || |
||
528 | $raw =~ /(forte free agent)/i |
||
529 | ) |
||
530 | { |
||
531 | $agent = $1; |
||
532 | } |
||
533 | elsif ( $raw =~ /^( |
||
534 | pan |
||
535 | |sylpheed |
||
536 | |slrn |
||
537 | |mozilla |
||
538 | |knode |
||
539 | |tin |
||
540 | |hamster |
||
541 | |xrn |
||
542 | |xnews |
||
543 | |aol |
||
544 | |gnus |
||
545 | |krn |
||
546 | |macsoup |
||
547 | |messenger |
||
548 | |openxp |
||
549 | |pine |
||
550 | |thoth |
||
551 | |turnpike |
||
552 | |winvn |
||
553 | |vsoup |
||
554 | |google |
||
555 | |supernews |
||
556 | |nn |
||
557 | |rn |
||
558 | |007 |
||
559 | |webtv |
||
560 | |compuserve |
||
561 | )/ix ) |
||
562 | { |
||
563 | $agent = $1; |
||
564 | } |
||
565 | else |
||
566 | { |
||
567 | ## Clean up unknown agents |
||
568 | if ( $raw =~ m!^(.*?)/! ) { |
||
569 | $agent = $1; |
||
570 | } |
||
571 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
||
572 | { |
||
573 | $agent = $1; |
||
574 | } |
||
575 | } |
||
576 | |||
577 | $distinct_agent{$agent}++; |
||
578 | return $agent; |
||
579 | } |
||
580 | |||
581 | |||
582 | # Get all cross-posted newsgroups |
||
583 | for (split /,/, $headers{"Newsgroups"}) { |
||
584 | $crossposts{$_}++; # bump count for each |
||
585 | } |
||
586 | |||
587 | # Get threads |
||
6 | PointedEar | 588 | my $thread = encode('UTF-8', $headers{"Subject"}); |
5 | PointedEar | 589 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
590 | $thread =~ s/\s+/ /g; # collapse whitespace |
||
591 | $threads{$thread}{count} += 1; # bump count of this subject |
||
592 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
||
593 | |||
594 | # Is this an original post or a reply? |
||
595 | if (defined $headers{"References"}) { |
||
596 | $replies++; |
||
597 | } else { |
||
598 | $origposts++; |
||
599 | } |
||
600 | |||
601 | # Get the time zone |
||
602 | $_ = $headers{"Date"}; |
||
603 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
||
604 | if (($tz =~ /UTC/) or ($tz =~ /GMT/) or ($tz =~ /0000/)) { |
||
605 | $tz = "UTC"; |
||
606 | } |
||
607 | $tz{$tz}++; |
||
608 | |||
609 | #### Now analyse the body text #### |
||
610 | my $insig = 0; |
||
611 | for (@body) { |
||
612 | $totbody += length($_); # bump total body size |
||
613 | next if (/^$>/); # don't count blank lines in body |
||
614 | if ($insig == 1) { |
||
615 | $totsig += length($_); # bump total sig size |
||
616 | # Bill Unruh uses ] quotes, and another poster uses :: |
||
617 | } elsif (/^\s*[>\]]/ or /^\s*::/) { # are we in a quote line? |
||
618 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
||
619 | $totquoted += length($_); |
||
620 | } elsif (/-- /) { |
||
621 | $insig = 1; |
||
622 | } else { |
||
623 | # we must be processing an original line |
||
624 | $data{$poster}{orig} += length($_); # bump count of original chrs |
||
625 | $totorig += length($_); |
||
626 | } |
||
627 | } # end for (@body) |
||
628 | |||
629 | } # getdata |
||
630 | |||
631 | ######################################## |
||
632 | # Count the User-Agents used, collapsing |
||
633 | # different versions into one per agent. |
||
634 | ######################################## |
||
635 | sub countagents { |
||
636 | POSTER: |
||
637 | foreach $poster (keys %data) { |
||
638 | foreach my $agent_name (keys %distinct_agent) { # check against known ones |
||
639 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) { |
||
640 | $agents{$agent_name}++; |
||
641 | next POSTER; |
||
642 | } |
||
643 | } |
||
644 | $agents{$data{$poster}{agent}}++; |
||
645 | } |
||
646 | } # countagents |
||
647 | |||
648 | ############################################ |
||
649 | # set orig/total percentages for all posters |
||
650 | ############################################ |
||
651 | sub fixpercent { |
||
652 | foreach $poster (keys %data) { |
||
653 | my $percent = 100; |
||
654 | if (($data{$poster}{orig} != 0) and ($data{$poster}{quoted} != 0)) { |
||
655 | $percent = $data{$poster}{orig} * 100 / ($data{$poster}{quoted} + $data{$poster}{orig}); #/ |
||
656 | } elsif ($data{$poster}{orig} == 0) { |
||
657 | $percent = 0; |
||
658 | } |
||
659 | $data{$poster}{percent} = $percent; |
||
660 | } |
||
661 | } |
||
662 | |||
663 | ############################## |
||
664 | # right pad a string with '.'s |
||
665 | ############################## |
||
666 | sub rpad { |
||
667 | # get text to pad, length to pad, pad chr |
||
668 | my ($text, $pad_len, $pad_chr) = @_; |
||
669 | if (length($text) > $pad_len) { |
||
670 | $text = substr($text, 0, $pad_len); |
||
671 | } |
||
672 | my $padded = $text . $pad_chr x ( $pad_len - length( $text ) ); |
||
673 | return $padded; |
||
674 | } |
||
675 | |||
676 | ################# |
||
677 | # centre a string |
||
678 | ################# |
||
679 | sub centred { |
||
680 | my ($text, $width) = @_; # text to centre, size of field to centre in |
||
681 | my $pad_len = ($width - length($text)) / 2; #/ |
||
682 | my $centred = " " x $pad_len . $text; |
||
683 | return $centred; |
||
684 | } |
||
685 | |||
686 | ########################## |
||
687 | # put commas into a number |
||
688 | ########################## |
||
689 | sub commify { |
||
690 | $_ = shift; |
||
691 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
||
692 | return $_; |
||
693 | } |
||
694 | |||
695 | ######################### |
||
696 | # clean |
||
697 | ######################### |
||
698 | sub clean { |
||
699 | my $dirty = shift; |
||
700 | my $clean = $dirty; |
||
701 | $clean =~ s/^\s*//; |
||
702 | $clean =~ s/\s*$//; |
||
703 | |||
704 | return $clean; |
||
705 | } |
||
706 | |||
707 | |||
708 | sub usage { |
||
709 | |||
710 | print "usage: newstat.pl newsgroupname\n"; |
||
711 | exit 1; |
||
712 | } |
||
713 | |||
714 | ################################### |
||
715 | # Write data structures to a file # |
||
716 | ################################### |
||
717 | sub writedata { |
||
718 | open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
||
719 | print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
||
720 | print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
||
721 | foreach my $name (keys %data) { |
||
722 | print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
||
723 | } |
||
724 | print OUTF "============================================================================\n"; |
||
725 | print OUTF "Thread subjects\n"; |
||
726 | print OUTF "----------------------------------------------------------------------------\n"; |
||
727 | foreach my $thread (sort {"\L$a" cmp "\L$b"} keys %threads) { |
||
728 | print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
||
729 | } |
||
730 | print OUTF "============================================================================\n"; |
||
731 | print OUTF "Cross-posts\n"; |
||
732 | print OUTF "----------------------------------------------------------------------------\n"; |
||
733 | foreach my $name (sort keys %crossposts) { |
||
734 | print OUTF "$name : $crossposts{$name}\n"; |
||
735 | } |
||
736 | print OUTF |
||
737 | print OUTF "============================================================================\n"; |
||
738 | print OUTF "User agents\n"; |
||
739 | print OUTF "----------------------------------------------------------------------------\n"; |
||
740 | foreach my $name (sort keys %agents) { |
||
741 | print OUTF "$name : $agents{$name}\n"; |
||
742 | } |
||
743 | print OUTF "============================================================================\n"; |
||
744 | print OUTF "Time zones\n"; |
||
745 | print OUTF "----------------------------------------------------------------------------\n"; |
||
746 | foreach my $name (sort keys %tz) { |
||
747 | print OUTF "$name : $tz{$name}\n"; |
||
748 | } |
||
749 | close OUTF; |
||
750 | } # writedata |