Rev 14 | Rev 23 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
8 | PointedEar | 1 | #!/usr/bin/env perl |
5 | PointedEar | 2 | use strict; |
6 | PointedEar | 3 | use warnings; |
22 | PointedEar | 4 | |
5 | #use diagnostics; |
||
6 | PointedEar | 6 | use utf8; |
7 | use Encode; |
||
8 | |||
22 | PointedEar | 9 | use constant DEBUG => 0; |
10 | |||
13 | PointedEar | 11 | ## Print out all text to STDOUT UTF-8 encoded |
12 | binmode STDOUT, ':encoding(UTF-8)'; |
||
22 | PointedEar | 13 | binmode STDERR, ':encoding(UTF-8)'; |
5 | PointedEar | 14 | |
22 | PointedEar | 15 | # FIXME: Automatically include resolved '.' in @INC |
16 | # print join "\n", @INC; |
||
5 | PointedEar | 17 | |
22 | PointedEar | 18 | use locale ':not_characters'; |
19 | use Locale::TextDomain ('de.pointedears.newsstat'); |
||
20 | use POSIX ('locale_h'); |
||
21 | use Locale::Messages qw (bind_textdomain_filter |
||
22 | bind_textdomain_codeset |
||
23 | turn_utf_8_on); |
||
24 | #setlocale( LC_MESSAGES, '' ); |
||
25 | bind_textdomain_filter 'de.pointedears.newsstat', \&turn_utf_8_on; |
||
26 | bind_textdomain_codeset 'de.pointedears.newsstat', 'utf-8'; |
||
27 | |||
28 | require Mail::Message; |
||
29 | require DateTime; |
||
30 | require DateTime::Format::Mail; |
||
31 | |||
5 | PointedEar | 32 | ###################### USER CONFIGURATIONS ############################ |
33 | |||
13 | PointedEar | 34 | ## The name of the group to do stats for |
5 | PointedEar | 35 | my $newsgroup_name = $ARGV[0]; |
22 | PointedEar | 36 | $newsgroup_name or usage(); |
5 | PointedEar | 37 | |
13 | PointedEar | 38 | ## Check for removal flags |
5 | PointedEar | 39 | my $ix; |
40 | my $j; |
||
41 | my %skipSec; |
||
42 | my @skiplist; |
||
43 | my $args = @ARGV; |
||
7 | PointedEar | 44 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
45 | { |
||
46 | $j = $ix + 1; |
||
47 | if ( $ARGV[$ix] eq "-x" ) |
||
48 | { |
||
49 | @skiplist = split( ",", $ARGV[$j] ); |
||
50 | } |
||
51 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
||
52 | { |
||
53 | @skiplist = split( ",", $1 ); |
||
54 | } |
||
5 | PointedEar | 55 | } |
7 | PointedEar | 56 | foreach (@skiplist) |
57 | { |
||
5 | PointedEar | 58 | $skipSec{$_} = 1; |
59 | } |
||
60 | |||
13 | PointedEar | 61 | ## Leafnode users will want /var/spool/news for this variable. |
5 | PointedEar | 62 | my $news = "/var/spool/news/"; |
63 | |||
22 | PointedEar | 64 | ## Number of top or bottom posters to show |
65 | my $topposters = 20; |
||
5 | PointedEar | 66 | |
13 | PointedEar | 67 | ## Number of threads we want to know about |
5 | PointedEar | 68 | my $topthreads = 20; |
69 | |||
13 | PointedEar | 70 | ## Number of cross-posted threads to show |
5 | PointedEar | 71 | my $topcrossposts = 10; |
72 | |||
22 | PointedEar | 73 | ## Number of agents we list |
74 | my $topagents = 10; |
||
75 | |||
13 | PointedEar | 76 | ## Number of time zones to show |
5 | PointedEar | 77 | my $toptz = 10; |
78 | |||
79 | ###################### DATA STRUCTURES ###################### |
||
80 | my $group = $newsgroup_name; |
||
81 | $group =~ s!\.!/!g; |
||
22 | PointedEar | 82 | my %data; # name, count, agent, total, orig, quoted |
83 | my $totsize = 0; # holds total sizes of all files |
||
84 | my %crossposts; # group, count |
||
85 | my %threads; # subject, count |
||
86 | my $replies = 0; # total no. of replies |
||
87 | my $origposts = 0; # total no. of original posts |
||
88 | my %tz; # timezones by count |
||
89 | my $earliest; # earliest article we have found |
||
90 | my $latest; # latest article we have found |
||
91 | my $totheader = 0; # total size of header material |
||
92 | my $totbody = 0; # total size of body material |
||
93 | my $totsig = 0; # total size of sig material |
||
94 | my $totquoted = 0; # total size of quoted material |
||
95 | my $totorig = 0; # total size of original material |
||
96 | my $totalposts; # total no. of posts considered |
||
5 | PointedEar | 97 | my %distinct_agent; |
13 | PointedEar | 98 | |
99 | ## Used to hold counts of User Agents used |
||
100 | my %agents = ( |
||
101 | "Compuserver" => 0, |
||
102 | "Foorum" => 0, |
||
7 | PointedEar | 103 | "Forte Agent" => 0, |
104 | "Forte Free Agent" => 0, |
||
13 | PointedEar | 105 | "Gnus" => 0, |
106 | "KNode" => 0, |
||
107 | "MacSOUP" => 0, |
||
108 | "MT-NewsWatcher" => 0, |
||
7 | PointedEar | 109 | "MicroPlanet Gravity" => 0, |
110 | "Microsoft Outlook Express" => 0, |
||
13 | PointedEar | 111 | "Microsoft Windows Mail" => 0, |
112 | "Mozilla" => 0, |
||
113 | "News Rover" => 0, |
||
114 | "NN" => 0, |
||
115 | "Pan" => 0, |
||
116 | "rn" => 0, |
||
7 | PointedEar | 117 | "slrn" => 0, |
13 | PointedEar | 118 | "Sylpheed" => 0, |
7 | PointedEar | 119 | "tin" => 0, |
13 | PointedEar | 120 | "VSoup" => 0, |
7 | PointedEar | 121 | "WebTV" => 0, |
22 | PointedEar | 122 | "Xnews" => 0, |
13 | PointedEar | 123 | ); |
5 | PointedEar | 124 | |
22 | PointedEar | 125 | my $datetime_parser = DateTime::Format::Mail->new(); |
126 | $datetime_parser->loose(); |
||
5 | PointedEar | 127 | |
22 | PointedEar | 128 | my $today = DateTime->today( time_zone => 'UTC' ); |
129 | my $prev_month = $today->clone()->subtract( months => 1 )->set_day(1); |
||
130 | my $start = int $prev_month->strftime('%s'); |
||
131 | my $numdays = int DateTime->last_day_of_month( |
||
132 | year => $prev_month->year(), |
||
133 | month => $prev_month->month(), |
||
134 | time_zone => $prev_month->time_zone(), |
||
135 | )->day(); |
||
136 | my $end = int $today->clone()->set_day(1)->strftime('%s'); |
||
137 | |||
138 | dmsg( $start, " to ", $end ) if DEBUG; |
||
139 | |||
140 | chdir("$news$group") |
||
141 | or die __x( |
||
142 | "Can't cd to {newsgroup}: {error}\n", |
||
143 | newsgroup => "$news$group", |
||
144 | error => $! |
||
145 | ); |
||
146 | opendir( DIR, "." ) |
||
147 | or die __x( |
||
148 | "Can't open {newsgroup}: {error}\n", |
||
149 | newsgroup => "$news$group", |
||
150 | error => $! |
||
151 | ); |
||
152 | |||
153 | while ( defined( my $filename = readdir(DIR) ) ) |
||
7 | PointedEar | 154 | { |
22 | PointedEar | 155 | next unless -f $filename; # only want real files |
156 | next if ( $filename eq ".overview" ); # real articles only |
||
157 | |||
158 | get_article($filename); # read in the article |
||
5 | PointedEar | 159 | } |
22 | PointedEar | 160 | closedir(DIR); # finished with the directory |
7 | PointedEar | 161 | |
22 | PointedEar | 162 | dmsg("\nearliest: $earliest\nlatest: $latest") if DEBUG; |
163 | |||
13 | PointedEar | 164 | ## Post-processing |
22 | PointedEar | 165 | count_agents(); # count agents, collapsing versions |
166 | fix_percent(); |
||
5 | PointedEar | 167 | |
22 | PointedEar | 168 | write_data(); |
169 | display_results(); |
||
5 | PointedEar | 170 | |
13 | PointedEar | 171 | ######################################## |
22 | PointedEar | 172 | ## Get current article's header and body |
13 | PointedEar | 173 | ######################################## |
22 | PointedEar | 174 | sub get_article |
7 | PointedEar | 175 | { |
22 | PointedEar | 176 | my $filename = shift; |
5 | PointedEar | 177 | |
22 | PointedEar | 178 | open( my $FILE, '<', $filename ) |
179 | or |
||
180 | die __x( "Can't open {file}: {error}\n", file => $filename, error => $! ); |
||
181 | my $msg = Mail::Message->read($FILE); |
||
182 | my $timestamp = $msg->timestamp(); |
||
183 | my $date = $msg->study('Date'); |
||
5 | PointedEar | 184 | |
22 | PointedEar | 185 | ## Disregard article if timestamp is not in range |
186 | dmsg($timestamp) if DEBUG; |
||
187 | if ( $timestamp < $start or $timestamp >= $end ) |
||
7 | PointedEar | 188 | { |
22 | PointedEar | 189 | dmsg("Posting on $date ignored.") if DEBUG; |
190 | return; |
||
7 | PointedEar | 191 | } |
5 | PointedEar | 192 | |
22 | PointedEar | 193 | $totalposts++; # bump count of articles considered |
5 | PointedEar | 194 | |
22 | PointedEar | 195 | ## DEBUG |
196 | dmsg($date) if DEBUG; |
||
13 | PointedEar | 197 | |
22 | PointedEar | 198 | ## get stats about the file itself |
199 | my $filesize = -s $filename; # get total size of file |
||
200 | $totsize += $filesize; # bump total sizes of all files |
||
5 | PointedEar | 201 | |
22 | PointedEar | 202 | if ( ( not defined $earliest ) or $timestamp < $earliest ) |
7 | PointedEar | 203 | { |
22 | PointedEar | 204 | $earliest = $timestamp; |
7 | PointedEar | 205 | } |
22 | PointedEar | 206 | elsif ( ( not defined $latest ) or $timestamp > $latest ) |
7 | PointedEar | 207 | { |
22 | PointedEar | 208 | $latest = $timestamp; |
7 | PointedEar | 209 | } |
13 | PointedEar | 210 | |
22 | PointedEar | 211 | #print "timestamp: $timestamp\n"; |
5 | PointedEar | 212 | |
22 | PointedEar | 213 | ## count header size |
214 | $totheader += $msg->head()->size(); |
||
5 | PointedEar | 215 | |
22 | PointedEar | 216 | ## get the poster's name (MIME-decoded, in UTF-8) |
217 | my $poster = $msg->study('From'); |
||
218 | if ( defined $poster ) |
||
7 | PointedEar | 219 | { |
22 | PointedEar | 220 | ## Convert old to new format |
221 | $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
||
5 | PointedEar | 222 | |
22 | PointedEar | 223 | ## Collapse whitespace |
224 | $poster =~ s/\s+/ /g; |
||
5 | PointedEar | 225 | |
22 | PointedEar | 226 | ## Remove outer quotes; TODO: observe RFC 5322 strictly |
227 | $poster =~ s/^ " (.+ ) " \s+ (.*)/$1 $2/x; |
||
5 | PointedEar | 228 | |
22 | PointedEar | 229 | ## DEBUG |
230 | dmsg($poster) if DEBUG; |
||
7 | PointedEar | 231 | |
22 | PointedEar | 232 | ## seen this one before? |
233 | if ( !defined( $data{$poster} ) ) |
||
234 | { |
||
235 | $data{$poster}{'agent'} = __ 'unknown'; # comes after For: field |
||
236 | $data{$poster}{'orig'} = 0; |
||
237 | $data{$poster}{'quoted'} = 0; |
||
238 | } |
||
239 | $data{$poster}{'count'}++; # bump count for this poster |
||
240 | $data{$poster}{'size'} += $filesize; # total size of file |
||
5 | PointedEar | 241 | |
22 | PointedEar | 242 | ## The User-Agent and/or X-Newsreader fields |
243 | ## for User-Agent by poster |
||
244 | my $ua = $msg->study('User-Agent') or $msg->study('X-Newsreader'); |
||
245 | if ( defined $ua ) |
||
7 | PointedEar | 246 | { |
22 | PointedEar | 247 | $data{$poster}{'agent'} = $ua; |
248 | |||
249 | ## DEBUG |
||
250 | dmsg($ua) if DEBUG; |
||
7 | PointedEar | 251 | } |
5 | PointedEar | 252 | |
22 | PointedEar | 253 | ## The User Agent for User-Agent by number of posts |
254 | get_agent($msg); |
||
5 | PointedEar | 255 | |
22 | PointedEar | 256 | ## Get all cross-posted newsgroups |
257 | for ( split( /,/, $msg->study('Newsgroups') ) ) |
||
258 | { |
||
259 | $crossposts{$_}++; # bump count for each |
||
260 | } |
||
5 | PointedEar | 261 | |
22 | PointedEar | 262 | ## Get threads |
263 | my $thread = $msg->study('Subject'); |
||
264 | $thread =~ s/^re:\s+//i; # Remove Re: or re: at start |
||
265 | $thread =~ s/\s+/ /g; # collapse whitespace |
||
266 | $threads{$thread}{'count'}++; # bump count of this subject |
||
267 | $threads{$thread}{'size'} += $filesize; # bump bytes for this thread |
||
5 | PointedEar | 268 | |
22 | PointedEar | 269 | ## Is this an original post or a reply? |
270 | if ( defined $msg->study('References') ) |
||
7 | PointedEar | 271 | { |
22 | PointedEar | 272 | $replies++; |
7 | PointedEar | 273 | } |
22 | PointedEar | 274 | else |
7 | PointedEar | 275 | { |
22 | PointedEar | 276 | $origposts++; |
7 | PointedEar | 277 | } |
22 | PointedEar | 278 | |
279 | ## Get the time zone |
||
280 | my $datetime = $datetime_parser->parse_datetime($date); |
||
281 | my $tz = $datetime->strftime('%z'); |
||
282 | $tz = "UTC" if $tz =~ m{^(?:GMT|0000)$}o; |
||
283 | $tz{$tz}++; |
||
284 | |||
285 | ## DEBUG |
||
286 | dmsg($tz) if DEBUG; |
||
287 | |||
288 | #### Now analyse the body text #### |
||
289 | my $body = $msg->body(); |
||
290 | |||
291 | my $insig = 0; |
||
292 | my @body = $body->lines; |
||
293 | for (@body) |
||
7 | PointedEar | 294 | { |
22 | PointedEar | 295 | $totbody += length($_); # bump total body size |
296 | next if (m{^$>}o); # don't count blank lines in body |
||
297 | if ( $insig == 1 ) |
||
298 | { |
||
299 | |||
300 | # bump total sig size |
||
301 | $totsig += length($_); |
||
302 | } |
||
303 | ## are we in a quote line? |
||
304 | ## Bill Unruh uses ] quotes, and another poster uses :: |
||
305 | elsif ( m{^\s*[>\]]}o or m{^\s*::}o ) |
||
306 | { |
||
307 | ## bump count of quoted chrs |
||
308 | $data{$poster}{'quoted'} += length($_); |
||
309 | $totquoted += length($_); |
||
310 | } |
||
311 | elsif (/^-- $/) |
||
312 | { |
||
313 | $insig = 1; |
||
314 | } |
||
315 | else |
||
316 | { |
||
317 | ## We must be processing an original line |
||
318 | $data{$poster}{'orig'} += length($_); # bump count of original chrs |
||
319 | $totorig += length($_); |
||
320 | } |
||
7 | PointedEar | 321 | } |
22 | PointedEar | 322 | |
323 | # end for (@body) |
||
324 | } |
||
325 | |||
326 | close($FILE); |
||
327 | } |
||
328 | |||
329 | sub get_agent |
||
330 | { |
||
331 | my $msg = shift; |
||
332 | |||
333 | my $ua = |
||
334 | $msg->study('User-Agent') |
||
335 | or $msg->study('X-Newsreader') |
||
336 | or $msg->study('X-Mailer'); |
||
337 | if ( not defined $ua ) |
||
338 | { |
||
339 | my $org = $msg->study('Organization'); |
||
340 | if ( defined $org |
||
341 | and $org =~ /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
||
7 | PointedEar | 342 | { |
22 | PointedEar | 343 | $ua = $org; |
7 | PointedEar | 344 | } |
22 | PointedEar | 345 | elsif ( $msg->study('Message-ID') =~ /pine/i ) |
7 | PointedEar | 346 | { |
22 | PointedEar | 347 | $ua = "Pine"; |
348 | } |
||
7 | PointedEar | 349 | } |
5 | PointedEar | 350 | |
22 | PointedEar | 351 | ## Hopefully found UA, else set to unknown |
352 | if ( not defined $ua ) |
||
353 | { |
||
354 | $ua = __ "unknown"; |
||
355 | } |
||
5 | PointedEar | 356 | |
22 | PointedEar | 357 | $ua = clean($ua); |
358 | |||
359 | my $raw = $ua; |
||
360 | my $agent = $raw; |
||
361 | |||
362 | ## strip http |
||
363 | if ( $raw =~ /.*http.*/ ) |
||
7 | PointedEar | 364 | { |
22 | PointedEar | 365 | $raw =~ s!posted via!!i; |
366 | $raw =~ s!http://!!g; |
||
367 | $raw =~ s!/!!g; |
||
368 | $raw =~ s! !!g; |
||
369 | } |
||
5 | PointedEar | 370 | |
22 | PointedEar | 371 | ## Fix Outlook from Mac |
372 | if ( $raw =~ /^microsoft/i ) |
||
373 | { |
||
374 | $raw =~ s/-/ /g; |
||
375 | } |
||
5 | PointedEar | 376 | |
22 | PointedEar | 377 | ## Pick out the popular agents |
378 | if ( $raw =~ /(outlook express)/i |
||
379 | || $raw =~ /(windows mail)/i |
||
380 | || $raw =~ /(microplanet gravity)/i |
||
381 | || $raw =~ /(news rover)/i |
||
382 | || $raw =~ /(forte agent)/i |
||
383 | || $raw =~ /(forte free agent)/i ) |
||
384 | { |
||
385 | $agent = $1; |
||
386 | } |
||
387 | elsif ( |
||
388 | $raw =~ /^( |
||
5 | PointedEar | 389 | pan |
390 | |sylpheed |
||
391 | |slrn |
||
392 | |mozilla |
||
393 | |knode |
||
394 | |tin |
||
395 | |hamster |
||
396 | |xrn |
||
397 | |xnews |
||
398 | |aol |
||
399 | |gnus |
||
400 | |krn |
||
401 | |macsoup |
||
402 | |messenger |
||
403 | |openxp |
||
404 | |pine |
||
405 | |thoth |
||
406 | |turnpike |
||
407 | |winvn |
||
408 | |vsoup |
||
409 | |google |
||
410 | |supernews |
||
411 | |nn |
||
412 | |rn |
||
413 | |007 |
||
414 | |webtv |
||
415 | |compuserve |
||
7 | PointedEar | 416 | )/ix |
22 | PointedEar | 417 | ) |
418 | { |
||
419 | $agent = $1; |
||
420 | } |
||
421 | else |
||
422 | { |
||
423 | ## Clean up unknown agents |
||
424 | if ( $raw =~ m!^(.*?)/! ) |
||
7 | PointedEar | 425 | { |
426 | $agent = $1; |
||
427 | } |
||
22 | PointedEar | 428 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
7 | PointedEar | 429 | { |
22 | PointedEar | 430 | $agent = $1; |
431 | } |
||
432 | } |
||
433 | |||
434 | $distinct_agent{$agent}++; |
||
435 | return $agent; |
||
436 | } |
||
437 | ## get_agent |
||
438 | |||
439 | ######################################### |
||
440 | ## Count the User-Agents used, collapsing |
||
441 | ## different versions into one per agent. |
||
442 | ######################################### |
||
443 | sub count_agents |
||
444 | { |
||
445 | POSTER: |
||
446 | foreach my $poster ( keys %data ) |
||
447 | { |
||
448 | foreach my $agent_name ( keys %distinct_agent ) |
||
449 | { # check against known ones |
||
450 | if ( $data{$poster}{'agent'} =~ /\Q$agent_name\E/ ) |
||
7 | PointedEar | 451 | { |
22 | PointedEar | 452 | $agents{$agent_name}++; |
453 | next POSTER; |
||
7 | PointedEar | 454 | } |
455 | } |
||
22 | PointedEar | 456 | $agents{ $data{$poster}{'agent'} }++; |
457 | } |
||
458 | } # count_agents |
||
7 | PointedEar | 459 | |
22 | PointedEar | 460 | ############################################# |
461 | ## Set orig/total percentages for all posters |
||
462 | ############################################# |
||
463 | sub fix_percent |
||
464 | { |
||
465 | foreach my $poster ( keys %data ) |
||
466 | { |
||
467 | my $percent = 100; |
||
468 | if ( ( $data{$poster}{'orig'} != 0 ) and ( $data{$poster}{'quoted'} != 0 ) ) |
||
469 | { |
||
470 | $percent = |
||
471 | $data{$poster}{'orig'} * 100 / |
||
472 | ( $data{$poster}{'quoted'} + $data{$poster}{'orig'} ); #/ |
||
473 | } |
||
474 | elsif ( $data{$poster}{'orig'} == 0 ) |
||
475 | { |
||
476 | $percent = 0; |
||
477 | } |
||
478 | $data{$poster}{'percent'} = $percent; |
||
5 | PointedEar | 479 | } |
22 | PointedEar | 480 | } |
481 | ## fix_percent |
||
5 | PointedEar | 482 | |
22 | PointedEar | 483 | ################################## |
484 | ## Write data structures to a file |
||
485 | ################################## |
||
486 | sub write_data |
||
487 | { |
||
488 | open( my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" ) |
||
489 | or die __x( "Can't create XDATA: {error}\n", error => $! ); |
||
490 | print $OUTF "Data collected from $newsgroup_name\n\n"; |
||
491 | print $OUTF |
||
492 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
||
493 | foreach my $name ( keys %data ) |
||
7 | PointedEar | 494 | { |
22 | PointedEar | 495 | print $OUTF |
496 | "$name : $data{$name}{'agent'} : $data{$name}{'count'} : $data{$name}{'size'} : $data{$name}{'orig'} : $data{$name}{'quoted'} : $data{$name}{'percent'}\n"; |
||
7 | PointedEar | 497 | } |
22 | PointedEar | 498 | print $OUTF |
499 | "============================================================================\n"; |
||
500 | print $OUTF "Thread subjects\n"; |
||
501 | print $OUTF |
||
502 | "----------------------------------------------------------------------------\n"; |
||
503 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
||
504 | { |
||
505 | print $OUTF |
||
506 | "$thread : $threads{$thread}{'count'} : $threads{$thread}{'size'}\n"; |
||
507 | } |
||
508 | print $OUTF |
||
509 | "============================================================================\n"; |
||
510 | print $OUTF "Cross-posts\n"; |
||
511 | print $OUTF |
||
512 | "----------------------------------------------------------------------------\n"; |
||
513 | foreach my $name ( sort keys %crossposts ) |
||
514 | { |
||
515 | print $OUTF "$name : $crossposts{$name}\n"; |
||
516 | } |
||
517 | print $OUTF |
||
518 | "============================================================================\n"; |
||
519 | print $OUTF "User agents\n"; |
||
520 | print $OUTF |
||
521 | "----------------------------------------------------------------------------\n"; |
||
522 | foreach my $name ( sort keys %agents ) |
||
523 | { |
||
524 | print $OUTF "$name : $agents{$name}\n"; |
||
525 | } |
||
526 | print $OUTF |
||
527 | "============================================================================\n"; |
||
528 | print $OUTF "Time zones\n"; |
||
529 | print $OUTF |
||
530 | "----------------------------------------------------------------------------\n"; |
||
531 | foreach my $name ( sort keys %tz ) |
||
532 | { |
||
533 | print $OUTF "$name : $tz{$name}\n"; |
||
534 | } |
||
535 | close $OUTF; |
||
536 | } # write_data |
||
5 | PointedEar | 537 | |
22 | PointedEar | 538 | sub display_results |
539 | { |
||
540 | #################### DISPLAY RESULTS ##################### |
||
541 | print "=" x 76, "\n"; |
||
542 | printf "%s\n", |
||
543 | centred( |
||
544 | __x( "Analysis of posts to {newsgroup}", newsgroup => $newsgroup_name ), |
||
545 | 76 ); |
||
546 | print "=" x 76, "\n"; |
||
547 | printf "%s\n", |
||
548 | centred( |
||
549 | __ |
||
550 | "(compiled with a script by Thomas 'PointedEars' Lahn, based on work by\nGarry Knight et al.)", |
||
551 | 76 |
||
552 | ); |
||
553 | print "\n\n"; |
||
554 | printf __"Total posts considered: %s over %d days" . "\n", |
||
555 | commify($totalposts), |
||
556 | $numdays; |
||
557 | my $time_locale = setlocale(LC_TIME); |
||
558 | my $earliest_datetime = DateTime->from_epoch( |
||
559 | epoch => $earliest, |
||
560 | locale => $time_locale, |
||
561 | time_zone => 'UTC', |
||
562 | ); |
||
563 | my $latest_datetime = DateTime->from_epoch( |
||
564 | epoch => $latest, |
||
565 | locale => $time_locale, |
||
566 | time_zone => 'UTC', |
||
567 | ); |
||
568 | my $datetime_format = '%a, %Y-%m-%dT%H:%M:%S %Z'; |
||
569 | printf __"Earliest article" . ": %s\n", $earliest_datetime->strftime($datetime_format); |
||
570 | printf __"Latest article" . ": %s\n", $latest_datetime->strftime($datetime_format); |
||
571 | printf __"Original articles: %s; replies" . ": %s\n", |
||
572 | commify($origposts), |
||
573 | commify($replies); |
||
574 | printf __"Total size of posts: %s bytes (%s KiB) (%.2f MiB)" . "\n", |
||
575 | commify($totsize), commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
||
576 | printf __ |
||
577 | "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
||
578 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
||
579 | commify( int( $totsize / $totalposts ) ); |
||
580 | my $count = keys %data; |
||
581 | printf __"Total headers: %s KiB; bodies: %s KiB\n", |
||
582 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
||
583 | printf __ |
||
584 | "Body text - quoted: %s KiB; original: %s KiB = %02.2f%%; sigs: %s KiB\n", |
||
585 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
||
586 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
||
587 | commify( int( $totsig / 1024 ) ); |
||
588 | printf __"Total number of posters: %s, average %s bytes per poster\n", |
||
589 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
590 | $count = keys %threads; |
||
591 | printf __"Total number of threads: %s, average %s bytes per thread\n", |
||
592 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
593 | printf __"Total number of user agents: %d\n", scalar keys %agents; |
||
594 | print "\n", "=" x 76, "\n"; |
||
5 | PointedEar | 595 | |
22 | PointedEar | 596 | ######################################## |
597 | ## Show posters by article count Sec 1; |
||
598 | ######################################## |
||
599 | unless ( $skipSec{1} ) |
||
7 | PointedEar | 600 | { |
22 | PointedEar | 601 | if ( keys %data < $topposters ) |
602 | { |
||
603 | $count = keys %data; |
||
604 | } |
||
605 | else |
||
606 | { |
||
607 | $count = $topposters; |
||
608 | } |
||
609 | printf "%s\n", |
||
610 | centred( |
||
611 | __x( "Top {count} posters by number of articles", count => $topposters ), 76 ); |
||
612 | print "=" x 76, "\n"; |
||
613 | my $i = 0; |
||
614 | foreach |
||
615 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
||
616 | { |
||
617 | my $name = substr( $poster, 0, 65 ); |
||
618 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
619 | $data{$poster}{count}; |
||
620 | last if ( ++$i == $count ); |
||
621 | } |
||
622 | print "\n", "=" x 76, "\n"; |
||
7 | PointedEar | 623 | } |
22 | PointedEar | 624 | |
625 | ###################################### |
||
626 | ## Show posters by size in KiB Sec 2; |
||
627 | ###################################### |
||
628 | unless ( $skipSec{2} ) |
||
7 | PointedEar | 629 | { |
22 | PointedEar | 630 | if ( keys %data < $topposters ) |
631 | { |
||
632 | $count = keys %data; |
||
633 | } |
||
634 | else |
||
635 | { |
||
636 | $count = $topposters; |
||
637 | } |
||
638 | printf "%s\n", |
||
639 | centred( |
||
640 | __x( "Top {count} posters by article size in KiB", count => $topposters ), |
||
641 | 76 ); |
||
642 | print "=" x 76, "\n"; |
||
643 | my $i = 0; |
||
644 | foreach |
||
645 | my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
||
646 | { |
||
647 | my $name = substr( $poster, 0, 62 ); |
||
648 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
649 | $data{$poster}{size} / 1024; #/ |
||
650 | last if ( ++$i == $count ); |
||
651 | } |
||
652 | print "\n", "=" x 76, "\n"; |
||
7 | PointedEar | 653 | } |
5 | PointedEar | 654 | |
22 | PointedEar | 655 | ##################################### |
656 | ## Show top posters for original text |
||
657 | ##################################### |
||
658 | unless ( $skipSec{3} ) |
||
7 | PointedEar | 659 | { |
22 | PointedEar | 660 | if ( keys %data < $topposters ) |
661 | { |
||
662 | $count = keys %data; |
||
663 | } |
||
664 | else |
||
665 | { |
||
666 | $count = $topposters; |
||
667 | } |
||
668 | printf "%s\n", |
||
669 | centred( |
||
670 | __x( |
||
671 | "Top {count} responders by original text (> 5 posts)", |
||
672 | count => $topposters |
||
673 | ), |
||
674 | 76 |
||
675 | ); |
||
676 | print "=" x 76, "\n"; |
||
677 | my $i = 0; |
||
678 | foreach my $poster ( |
||
679 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
||
680 | keys %data |
||
681 | ) |
||
682 | { |
||
683 | next if $data{$poster}{quoted} == 0; |
||
684 | next if $data{$poster}{count} < 5; |
||
685 | my $name = substr( $poster, 0, 63 ); |
||
686 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
687 | $data{$poster}{percent}; |
||
688 | last if ( ++$i == $count ); |
||
689 | } |
||
690 | print "\n", "=" x 76, "\n"; |
||
7 | PointedEar | 691 | } |
5 | PointedEar | 692 | |
22 | PointedEar | 693 | ######################################## |
694 | ## Show bottom posters for original text |
||
695 | ######################################## |
||
696 | unless ( $skipSec{4} ) |
||
7 | PointedEar | 697 | { |
22 | PointedEar | 698 | if ( keys %data < $topposters ) |
7 | PointedEar | 699 | { |
22 | PointedEar | 700 | $count = keys %data; |
701 | } |
||
702 | else |
||
703 | { |
||
704 | $count = $topposters; |
||
705 | } |
||
706 | printf "%s\n", |
||
707 | centred( |
||
708 | __x( |
||
709 | "Bottom {count} responders by original text (> 5 posts)", |
||
710 | count => $topposters |
||
711 | ), |
||
712 | 76 |
||
713 | ); |
||
714 | print "=" x 76, "\n"; |
||
715 | my $i = 0; |
||
716 | foreach my $poster ( |
||
717 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
||
718 | keys %data |
||
719 | ) |
||
720 | { |
||
721 | next if $data{$poster}{quoted} == 0; |
||
722 | next if $data{$poster}{count} < 5; |
||
723 | my $name = substr( $poster, 0, 63 ); |
||
724 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
725 | $data{$poster}{percent}; |
||
726 | last if ( ++$i == $count ); |
||
727 | } |
||
728 | print "\n", "=" x 76, "\n"; |
||
729 | } |
||
5 | PointedEar | 730 | |
22 | PointedEar | 731 | ##################################### |
732 | ## Show threads by number of articles |
||
733 | ##################################### |
||
734 | unless ( $skipSec{5} ) |
||
735 | { |
||
736 | if ( keys %threads < $topthreads ) |
||
737 | { |
||
738 | $count = keys %threads; |
||
7 | PointedEar | 739 | } |
22 | PointedEar | 740 | else |
741 | { |
||
742 | $count = $topthreads; |
||
7 | PointedEar | 743 | } |
22 | PointedEar | 744 | printf "%s\n", |
745 | centred( __x( "Top {count} threads by no. of articles", count => $topthreads ), |
||
746 | 76 ); |
||
747 | print "=" x 76, "\n"; |
||
748 | my $i = 0; |
||
749 | foreach my $thread ( |
||
750 | sort { $threads{$b}{'count'} <=> $threads{$a}{'count'} } |
||
751 | keys %threads |
||
752 | ) |
||
7 | PointedEar | 753 | { |
22 | PointedEar | 754 | my $name = substr( $thread, 0, 65 ); |
755 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
756 | $threads{$thread}{'count'}; |
||
757 | last if ( ++$i == $count ); |
||
7 | PointedEar | 758 | } |
22 | PointedEar | 759 | print "\n", "=" x 76, "\n"; |
760 | } |
||
761 | |||
762 | ############################## |
||
763 | ## Show threads by size in KiB |
||
764 | ############################## |
||
765 | unless ( $skipSec{6} ) |
||
766 | { |
||
767 | if ( keys %threads < $topthreads ) |
||
768 | { |
||
769 | $count = keys %threads; |
||
770 | } |
||
7 | PointedEar | 771 | else |
772 | { |
||
22 | PointedEar | 773 | $count = $topthreads; |
774 | } |
||
775 | printf "%s\n", |
||
776 | centred( __x( "Top {count} threads by size in KiB", count => $topthreads ), |
||
777 | 76 ); |
||
778 | print "=" x 76, "\n"; |
||
779 | my $i = 0; |
||
780 | foreach my $thread ( |
||
781 | sort { $threads{$b}{'size'} <=> $threads{$a}{'size'} } |
||
782 | keys %threads |
||
783 | ) |
||
784 | { |
||
785 | my $name = substr( $thread, 0, 65 ); |
||
786 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
787 | $threads{$thread}{'size'} / 1024; #/ |
||
788 | last if ( ++$i == $count ); |
||
789 | } |
||
790 | print "\n", "=" x 76, "\n"; |
||
791 | } |
||
5 | PointedEar | 792 | |
22 | PointedEar | 793 | ################################## |
794 | ## Show top 10 cross-posted groups |
||
795 | ################################## |
||
796 | unless ( $skipSec{7} ) |
||
797 | { |
||
798 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
||
799 | if ( keys %crossposts < $topcrossposts ) |
||
800 | { |
||
801 | $count = keys %crossposts; |
||
7 | PointedEar | 802 | } |
22 | PointedEar | 803 | else |
804 | { |
||
805 | $count = $topcrossposts; |
||
806 | } |
||
807 | printf "%s\n", |
||
808 | centred( __x( "Top {count} cross-posted groups", count => $topcrossposts ), 76 ); |
||
809 | print "=" x 76, "\n"; |
||
810 | my $i = 0; |
||
811 | foreach |
||
812 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
||
813 | { |
||
814 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
815 | $crossposts{$name}; |
||
816 | last if ( ++$i == $count ); |
||
817 | } |
||
818 | print "\n", "=" x 76, "\n"; |
||
819 | } |
||
5 | PointedEar | 820 | |
22 | PointedEar | 821 | ######################### |
822 | ## Show agents and counts |
||
823 | ######################### |
||
824 | unless ( $skipSec{8} ) |
||
825 | { |
||
826 | if ( keys %agents < $topagents ) |
||
827 | { |
||
828 | $count = keys %agents; |
||
829 | } |
||
830 | else |
||
831 | { |
||
832 | $count = $topagents; |
||
833 | } |
||
834 | printf "%s\n", |
||
835 | centred( __x( "Top {count} user agents by poster", count => $topagents ), |
||
836 | 76 ); |
||
837 | print "=" x 76, "\n"; |
||
838 | my $i = 0; |
||
839 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
||
840 | { |
||
841 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
||
842 | $agents{$agent}; |
||
843 | last if ( ++$i == $count ); |
||
844 | } |
||
845 | print "\n", "=" x 76, "\n"; |
||
846 | } |
||
5 | PointedEar | 847 | |
22 | PointedEar | 848 | ####################### |
849 | ## Show distinct agents |
||
850 | ####################### |
||
851 | unless ( $skipSec{9} ) |
||
7 | PointedEar | 852 | { |
22 | PointedEar | 853 | if ( keys %distinct_agent < $topagents ) |
854 | { |
||
855 | $count = keys %distinct_agent; |
||
7 | PointedEar | 856 | } |
22 | PointedEar | 857 | else |
858 | { |
||
859 | $count = $topagents; |
||
860 | } |
||
861 | printf "%s\n", |
||
862 | centred( |
||
863 | __x( "Top {count} user agents by number of posts", count => $topagents ), |
||
864 | 76 ); |
||
865 | print "=" x 76, "\n"; |
||
866 | my $i = 0; |
||
867 | foreach my $agent ( |
||
868 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
||
869 | keys %distinct_agent |
||
870 | ) |
||
871 | { |
||
872 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
||
873 | $distinct_agent{$agent}, |
||
874 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
||
875 | last if ( ++$i == $count ); |
||
876 | } |
||
877 | print "\n", "=" x 76, "\n"; |
||
7 | PointedEar | 878 | } |
5 | PointedEar | 879 | |
22 | PointedEar | 880 | ############################ |
881 | ## Show timezones and counts |
||
882 | ############################ |
||
883 | unless ( $skipSec{10} ) |
||
7 | PointedEar | 884 | { |
22 | PointedEar | 885 | if ( keys %tz < $toptz ) |
7 | PointedEar | 886 | { |
22 | PointedEar | 887 | $count = keys %tz; |
7 | PointedEar | 888 | } |
22 | PointedEar | 889 | else |
7 | PointedEar | 890 | { |
22 | PointedEar | 891 | $count = $toptz; |
7 | PointedEar | 892 | } |
22 | PointedEar | 893 | printf "%s\n", centred( __x("Top {count} time zones", count => $toptz), 76 ); |
894 | print "=" x 76, "\n"; |
||
895 | my $i = 0; |
||
896 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
||
897 | { |
||
898 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
||
899 | last if ( ++$i == $count ); |
||
900 | } |
||
901 | print "\n", "=" x 76, "\n"; |
||
7 | PointedEar | 902 | } |
5 | PointedEar | 903 | } |
904 | |||
22 | PointedEar | 905 | ## helper subs |
906 | |||
13 | PointedEar | 907 | ############################### |
908 | ## Right pad a string with '.'s |
||
909 | ############################### |
||
7 | PointedEar | 910 | sub rpad |
911 | { |
||
13 | PointedEar | 912 | ## Get text to pad, length to pad, pad chr |
7 | PointedEar | 913 | my ( $text, $pad_len, $pad_chr ) = @_; |
11 | PointedEar | 914 | |
13 | PointedEar | 915 | ## DEBUG |
22 | PointedEar | 916 | printf( "|%s| = %d\n", $text, length($text) ) if DEBUG > 1; |
11 | PointedEar | 917 | |
7 | PointedEar | 918 | if ( length($text) > $pad_len ) |
919 | { |
||
920 | $text = substr( $text, 0, $pad_len ); |
||
921 | } |
||
922 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
||
923 | return $padded; |
||
5 | PointedEar | 924 | } |
925 | |||
13 | PointedEar | 926 | ################## |
927 | ## Centre a string |
||
928 | ################## |
||
7 | PointedEar | 929 | sub centred |
930 | { |
||
931 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
||
932 | my $pad_len = ( $width - length($text) ) / 2; #/ |
||
933 | my $centred = " " x $pad_len . $text; |
||
934 | return $centred; |
||
5 | PointedEar | 935 | } |
936 | |||
13 | PointedEar | 937 | ########################### |
938 | ## Put commas into a number |
||
939 | ########################### |
||
22 | PointedEar | 940 | # Get some of locale's numeric formatting parameters |
941 | my ($thousands_sep, $grouping) = |
||
942 | @{localeconv()}{'thousands_sep', 'grouping'}; |
||
943 | # Apply defaults if values are missing |
||
944 | $thousands_sep = ',' unless $thousands_sep; |
||
945 | # grouping and mon_grouping are packed lists |
||
946 | # of small integers (characters) telling the |
||
947 | # grouping (thousand_seps and mon_thousand_seps |
||
948 | # being the group dividers) of numbers and |
||
949 | # monetary quantities. The integers' meanings: |
||
950 | # 255 means no more grouping, 0 means repeat |
||
951 | # the previous grouping, 1-254 means use that |
||
952 | # as the current grouping. Grouping goes from |
||
953 | # right to left (low to high digits). In the |
||
954 | # below we cheat slightly by never using anything |
||
955 | # else than the first grouping (whatever that is). |
||
956 | my @grouping; |
||
957 | if ($grouping) { |
||
958 | @grouping = unpack("C*", $grouping); |
||
959 | } else { |
||
960 | @grouping = (3); |
||
961 | } |
||
962 | |||
7 | PointedEar | 963 | sub commify |
964 | { |
||
13 | PointedEar | 965 | local $_ = shift; |
22 | PointedEar | 966 | #1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
967 | $_ = int; # Chop non-integer part |
||
968 | 1 while |
||
969 | s/(\d)(\d{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/; |
||
7 | PointedEar | 970 | return $_; |
5 | PointedEar | 971 | } |
972 | |||
13 | PointedEar | 973 | ################################################################ |
974 | ## Returns a string with leading and trailing whitespace removed |
||
975 | ################################################################ |
||
7 | PointedEar | 976 | sub clean |
977 | { |
||
978 | my $dirty = shift; |
||
979 | my $clean = $dirty; |
||
14 | PointedEar | 980 | $clean =~ s/^\s+|\s+$//g; |
5 | PointedEar | 981 | |
7 | PointedEar | 982 | return $clean; |
5 | PointedEar | 983 | } |
984 | |||
7 | PointedEar | 985 | sub usage |
986 | { |
||
22 | PointedEar | 987 | print __"usage: newsstat.pl NEWS.GROUP\n"; |
7 | PointedEar | 988 | exit 1; |
5 | PointedEar | 989 | } |
990 | |||
22 | PointedEar | 991 | sub dmsg |
7 | PointedEar | 992 | { |
22 | PointedEar | 993 | print STDERR @_, "\n"; |
994 | } |
||
995 | |||
996 | sub dmsg2 |
||
997 | { |
||
998 | my ( $level, @msg ) = @_; |
||
999 | print STDERR @msg, "\n" if $level >= DEBUG; |
||
1000 | } |