Rev 6 | Rev 8 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 7 | PointedEar | 1 | #!/usr/bin/perl |
| 5 | PointedEar | 2 | use strict; |
| 6 | PointedEar | 3 | use warnings; |
| 4 | use utf8; |
||
| 5 | use Encode; |
||
| 6 | |||
| 5 | PointedEar | 7 | ######################### |
| 6 | PointedEar | 8 | # newsstat.pl version 0.4 |
| 5 | PointedEar | 9 | |
| 6 | PointedEar | 10 | ############################################################################ |
| 11 | # Collect statistics about a newsgroup (specified by first argument) in |
||
| 7 | PointedEar | 12 | # the local news spool. Check all articles in the last 30-day period. |
| 6 | PointedEar | 13 | # Rank posters by number of posts and by volume of posts, report on top and |
| 14 | # bottom 20 posters. Show their name, number of posts, size of posts, |
||
| 15 | # percentage of quoted lines. Rank user-agents used, by poster rather than |
||
| 16 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups. |
||
| 7 | PointedEar | 17 | # |
| 6 | PointedEar | 18 | # (Numbers and paths can be configured below. -- PE) |
| 19 | ############################################################################ |
||
| 5 | PointedEar | 20 | |
| 6 | PointedEar | 21 | ############################################################################ |
| 22 | # RECENT CHANGES # |
||
| 23 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings |
||
| 7 | PointedEar | 24 | # - Use warnings, utf8 (just in case) |
| 6 | PointedEar | 25 | # - Documentation update |
| 26 | # N/A NN - Take newsgroup name as argument |
||
| 27 | # 2004-06-19 NN - newsgroup name is $ARGV[0] |
||
| 28 | # - Allow command line flags for subtracting |
||
| 29 | # output if not pertinent for a group |
||
| 30 | # 2002-11-09 NN - Put Garry's writedata() function back in. |
||
| 31 | # - added "rn" to my list of UA's |
||
| 32 | # - Started using %distinct_agent for both User agent |
||
| 33 | # sections |
||
| 34 | # - named it newsstat.pl version 0.3 |
||
| 35 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using |
||
| 36 | # mtime rather than ctime, and simplifying the logic |
||
| 37 | # 2002-11-05 NN - moved user configurations to the top |
||
| 38 | # - fixed the cross-posting section |
||
| 39 | # - introduced the $newsgroup_name variable which |
||
| 40 | # later becomes $news$group |
||
| 41 | # - changed $name to $agent_name in countagents() |
||
| 7 | PointedEar | 42 | # |
| 6 | PointedEar | 43 | # Contributors |
| 44 | # ------------- |
||
| 45 | # NN Nomen nominandum (name to be determined later) |
||
| 46 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de> |
||
| 5 | PointedEar | 47 | |
| 6 | PointedEar | 48 | ########### TODO ############# |
| 5 | PointedEar | 49 | # Commas in bottom section of report |
| 50 | # Show date the figures were compiled |
||
| 51 | # No. of HTML articles (Content-Type: text/html) |
||
| 52 | # No. of quoted sigs (/>\s*-- /) |
||
| 53 | # Per cent of top-posted articles |
||
| 54 | # Top 10 cross-posters |
||
| 55 | # Top 20 news posting hosts (from Path) |
||
| 56 | # Count of certain subject words: newbie, kde, burner, sendmail, etc. |
||
| 57 | # Count *all* User Agents that each poster uses |
||
| 58 | # What do we do about Bill Unruh's ] quote style? |
||
| 59 | # Change the way dates/times are checked |
||
| 60 | # include % share in posters by no. of arts |
||
| 61 | # include % share in posters by size |
||
| 62 | # Total, orig & quoted lines by user agent with per cent |
||
| 6 | PointedEar | 63 | # Take more arguments |
| 5 | PointedEar | 64 | ####################################################### |
| 65 | |||
| 66 | ###################### USER CONFIGURATIONS ############################ |
||
| 67 | |||
| 68 | # The name of the group to do stats for |
||
| 69 | my $newsgroup_name = $ARGV[0]; |
||
| 6 | PointedEar | 70 | $newsgroup_name or &usage; |
| 5 | PointedEar | 71 | |
| 72 | # Check for removal flags |
||
| 73 | my $ix; |
||
| 74 | my $j; |
||
| 75 | my %skipSec; |
||
| 76 | my @skiplist; |
||
| 77 | my $args = @ARGV; |
||
| 7 | PointedEar | 78 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
| 79 | { |
||
| 80 | $j = $ix + 1; |
||
| 81 | if ( $ARGV[$ix] eq "-x" ) |
||
| 82 | { |
||
| 83 | @skiplist = split( ",", $ARGV[$j] ); |
||
| 84 | } |
||
| 85 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
||
| 86 | { |
||
| 87 | @skiplist = split( ",", $1 ); |
||
| 88 | } |
||
| 5 | PointedEar | 89 | } |
| 7 | PointedEar | 90 | foreach (@skiplist) |
| 91 | { |
||
| 5 | PointedEar | 92 | $skipSec{$_} = 1; |
| 93 | } |
||
| 94 | |||
| 95 | # Leafnode users will want /var/spool/news for this variable. |
||
| 96 | my $news = "/var/spool/news/"; |
||
| 97 | |||
| 98 | # How many days are we doing statistics for? |
||
| 99 | my $numdays = 30; |
||
| 100 | |||
| 101 | # no. of agents we list |
||
| 102 | my $topagents = 10; |
||
| 103 | |||
| 104 | # no. of threads we want to know about |
||
| 105 | my $topthreads = 20; |
||
| 106 | |||
| 107 | # no. of top or bottom posters to show |
||
| 108 | my $topposters = 20; |
||
| 109 | |||
| 110 | # no. of cross-posted threads to show |
||
| 111 | my $topcrossposts = 10; |
||
| 112 | |||
| 113 | # no. of time zones to show |
||
| 114 | my $toptz = 10; |
||
| 115 | |||
| 116 | ###################### DATA STRUCTURES ###################### |
||
| 117 | my $group = $newsgroup_name; |
||
| 118 | $group =~ s!\.!/!g; |
||
| 7 | PointedEar | 119 | my %data; # name, count, agent, total, orig, quoted |
| 120 | my %threads; # subject, count |
||
| 121 | my %crossposts; # group, count |
||
| 122 | my %tz; # timezones by count |
||
| 123 | my %headers; # holds header of current article |
||
| 124 | my %lcheader; # holds lowercase headers |
||
| 125 | my @body; # holds body of current article |
||
| 126 | my @sig; # holds sig text; |
||
| 127 | my $totalposts; # total no. of posts considered |
||
| 128 | my $filename; # name of current article file |
||
| 129 | my $filesize; # size of current article file |
||
| 130 | my $earliest; # earliest article we have found |
||
| 131 | my $latest; # latest article we have found |
||
| 132 | my $poster; # poster we are dealing with |
||
| 133 | my $totsize = 0; # holds total sizes of all files |
||
| 134 | my $totheader = 0; # total size of header material |
||
| 135 | my $totbody = 0; # total size of body material |
||
| 136 | my $totsig = 0; # total size of sig material |
||
| 137 | my $totorig = 0; # total size of original material |
||
| 138 | my $totquoted = 0; # total size of quoted material |
||
| 139 | my $origposts = 0; # total no. of original posts |
||
| 140 | my $replies = 0; # total no. of replies |
||
| 141 | my $i; # general purpose |
||
| 5 | PointedEar | 142 | my %distinct_agent; |
| 7 | PointedEar | 143 | my %agents = # used to hold counts of User Agents used |
| 144 | ( |
||
| 145 | "KNode" => 0, |
||
| 146 | "Pan" => 0, |
||
| 147 | "Mozilla" => 0, |
||
| 148 | "Sylpheed" => 0, |
||
| 149 | "Gnus" => 0, |
||
| 150 | "Forte Agent" => 0, |
||
| 151 | "Forte Free Agent" => 0, |
||
| 152 | "MicroPlanet Gravity" => 0, |
||
| 153 | "Microsoft Outlook Express" => 0, |
||
| 154 | "Xnews" => 0, |
||
| 155 | "slrn" => 0, |
||
| 156 | "tin" => 0, |
||
| 157 | "rn" => 0, |
||
| 158 | "NN" => 0, |
||
| 159 | "MacSOUP" => 0, |
||
| 160 | "Foorum" => 0, |
||
| 161 | "MT-NewsWatcher" => 0, |
||
| 162 | "News Rover" => 0, |
||
| 163 | "WebTV" => 0, |
||
| 164 | "Compuserver" => 0, |
||
| 165 | "VSoup" => 0 |
||
| 166 | ); |
||
| 5 | PointedEar | 167 | |
| 168 | ######################## MAIN CODE ######################## |
||
| 169 | $! = 1; |
||
| 170 | |||
| 171 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
||
| 7 | PointedEar | 172 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
| 173 | while ( defined( $filename = readdir(DIR) ) ) |
||
| 174 | { |
||
| 175 | %lcheader = (); |
||
| 176 | next unless -f $filename; # only want real files |
||
| 177 | next if ( $filename eq ".overview" ); # real articles only |
||
| 178 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
||
| 179 | $earliest = ( stat $filename )[9] unless defined($earliest); |
||
| 180 | $latest = ( stat $filename )[9] unless defined($latest); |
||
| 181 | &getarticle($filename); # read in the article |
||
| 182 | &getdata; # grab the data from the article |
||
| 183 | $totalposts++; # bump count of articles considered |
||
| 5 | PointedEar | 184 | } |
| 7 | PointedEar | 185 | closedir(DIR); # finished with the directory |
| 186 | |||
| 5 | PointedEar | 187 | # post-processing |
| 7 | PointedEar | 188 | &countagents; # count agents, collapsing versions |
| 189 | &fixpercent; # check percentages orig/total for posters |
||
| 5 | PointedEar | 190 | |
| 191 | &writedata; |
||
| 192 | |||
| 193 | #################### DISPLAY RESULTS ##################### |
||
| 194 | print "=" x 76, "\n"; |
||
| 7 | PointedEar | 195 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
| 5 | PointedEar | 196 | print "=" x 76, "\n"; |
| 7 | PointedEar | 197 | printf "%s\n", |
| 198 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
||
| 5 | PointedEar | 199 | print "\n\n"; |
| 7 | PointedEar | 200 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
| 201 | $numdays; |
||
| 202 | printf "Earliest article: %s\n", scalar localtime($earliest); |
||
| 203 | printf "Latest article: %s\n", scalar localtime($latest); |
||
| 204 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
||
| 205 | commify($replies); |
||
| 5 | PointedEar | 206 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
| 7 | PointedEar | 207 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
| 5 | PointedEar | 208 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
| 7 | PointedEar | 209 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
| 210 | commify( int( $totsize / $totalposts ) ); |
||
| 5 | PointedEar | 211 | my $count = keys %data; |
| 212 | printf "Total headers: %s KB bodies: %s KB\n", |
||
| 7 | PointedEar | 213 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
| 5 | PointedEar | 214 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
| 7 | PointedEar | 215 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
| 216 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
||
| 217 | commify( int( $totsig / 1024 ) ); |
||
| 218 | printf "Total number of posters: %s, average %s bytes per poster\n", |
||
| 219 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
| 5 | PointedEar | 220 | $count = keys %threads; |
| 7 | PointedEar | 221 | printf "Total number of threads: %s, average %s bytes per thread\n", |
| 222 | commify($count), commify( int( $totsize / $count ) ); #/ |
||
| 5 | PointedEar | 223 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
| 224 | print "\n", "=" x 76, "\n"; |
||
| 225 | |||
| 226 | ############################### |
||
| 227 | # show posters by article count Sec 1; |
||
| 228 | ############################### |
||
| 7 | PointedEar | 229 | unless ( $skipSec{1} ) |
| 230 | { |
||
| 231 | if ( keys %data < $topposters ) |
||
| 232 | { |
||
| 233 | $count = keys %data; |
||
| 234 | } |
||
| 235 | else |
||
| 236 | { |
||
| 237 | $count = $topposters; |
||
| 238 | } |
||
| 239 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
||
| 240 | print "=" x 76, "\n"; |
||
| 241 | $i = 0; |
||
| 242 | foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
||
| 243 | { |
||
| 244 | my $name = substr( $poster, 0, 65 ); |
||
| 245 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
| 246 | $data{$poster}{count}; |
||
| 247 | last if ( ++$i == $count ); |
||
| 248 | } |
||
| 249 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 250 | } |
| 251 | |||
| 252 | ################################ |
||
| 253 | # show posters by size in Kbytes Sec 2; |
||
| 254 | ################################ |
||
| 7 | PointedEar | 255 | unless ( $skipSec{2} ) |
| 256 | { |
||
| 257 | if ( keys %data < $topposters ) |
||
| 258 | { |
||
| 5 | PointedEar | 259 | $count = keys %data; |
| 7 | PointedEar | 260 | } |
| 261 | else |
||
| 262 | { |
||
| 5 | PointedEar | 263 | $count = $topposters; |
| 264 | } |
||
| 7 | PointedEar | 265 | printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
| 5 | PointedEar | 266 | print "=" x 76, "\n"; |
| 267 | $i = 0; |
||
| 7 | PointedEar | 268 | foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
| 269 | { |
||
| 270 | my $name = substr( $poster, 0, 62 ); |
||
| 271 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
||
| 272 | $data{$poster}{size} / 1024; #/ |
||
| 273 | last if ( ++$i == $count ); |
||
| 5 | PointedEar | 274 | } |
| 275 | print "\n", "=" x 76, "\n"; |
||
| 276 | } |
||
| 277 | |||
| 278 | #################################### |
||
| 279 | # show top posters for original text |
||
| 280 | #################################### |
||
| 7 | PointedEar | 281 | unless ( $skipSec{3} ) |
| 282 | { |
||
| 283 | if ( keys %data < $topposters ) |
||
| 284 | { |
||
| 285 | $count = keys %data; |
||
| 286 | } |
||
| 287 | else |
||
| 288 | { |
||
| 289 | $count = $topposters; |
||
| 290 | } |
||
| 291 | printf "%s\n", |
||
| 292 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
||
| 293 | print "=" x 76, "\n"; |
||
| 294 | $i = 0; |
||
| 295 | foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} } |
||
| 296 | keys %data ) |
||
| 297 | { |
||
| 298 | next if $data{$poster}{quoted} == 0; |
||
| 299 | next if $data{$poster}{count} < 5; |
||
| 300 | my $name = substr( $poster, 0, 63 ); |
||
| 301 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
| 302 | $data{$poster}{percent}; |
||
| 303 | last if ( ++$i == $count ); |
||
| 304 | } |
||
| 305 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 306 | } |
| 307 | |||
| 308 | ####################################### |
||
| 309 | # show bottom posters for original text |
||
| 310 | ####################################### |
||
| 7 | PointedEar | 311 | unless ( $skipSec{4} ) |
| 312 | { |
||
| 313 | if ( keys %data < $topposters ) |
||
| 314 | { |
||
| 5 | PointedEar | 315 | $count = keys %data; |
| 7 | PointedEar | 316 | } |
| 317 | else |
||
| 318 | { |
||
| 5 | PointedEar | 319 | $count = $topposters; |
| 320 | } |
||
| 7 | PointedEar | 321 | printf "%s\n", |
| 322 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
||
| 5 | PointedEar | 323 | print "=" x 76, "\n"; |
| 324 | $i = 0; |
||
| 7 | PointedEar | 325 | foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} } |
| 326 | keys %data ) |
||
| 327 | { |
||
| 5 | PointedEar | 328 | next if $data{$poster}{quoted} == 0; |
| 329 | next if $data{$poster}{count} < 5; |
||
| 7 | PointedEar | 330 | my $name = substr( $poster, 0, 63 ); |
| 331 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
||
| 332 | $data{$poster}{percent}; |
||
| 333 | last if ( ++$i == $count ); |
||
| 5 | PointedEar | 334 | } |
| 335 | print "\n", "=" x 76, "\n"; |
||
| 336 | } |
||
| 337 | |||
| 338 | #################################### |
||
| 339 | # show threads by number of articles |
||
| 340 | #################################### |
||
| 7 | PointedEar | 341 | unless ( $skipSec{5} ) |
| 342 | { |
||
| 343 | if ( keys %threads < $topthreads ) |
||
| 344 | { |
||
| 5 | PointedEar | 345 | $count = keys %threads; |
| 7 | PointedEar | 346 | } |
| 347 | else |
||
| 348 | { |
||
| 5 | PointedEar | 349 | $count = $topthreads; |
| 350 | } |
||
| 7 | PointedEar | 351 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
| 5 | PointedEar | 352 | print "=" x 76, "\n"; |
| 353 | $i = 0; |
||
| 7 | PointedEar | 354 | foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} } |
| 355 | keys %threads ) |
||
| 356 | { |
||
| 357 | my $name = substr( $thread, 0, 65 ); |
||
| 358 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
| 359 | $threads{$thread}{count}; |
||
| 360 | last if ( ++$i == $count ); |
||
| 5 | PointedEar | 361 | } |
| 362 | print "\n", "=" x 76, "\n"; |
||
| 363 | } |
||
| 364 | ################################ |
||
| 365 | # show threads by size in Kbytes |
||
| 366 | ################################ |
||
| 7 | PointedEar | 367 | unless ( $skipSec{6} ) |
| 368 | { |
||
| 369 | if ( keys %threads < $topthreads ) |
||
| 370 | { |
||
| 5 | PointedEar | 371 | $count = keys %threads; |
| 7 | PointedEar | 372 | } |
| 373 | else |
||
| 374 | { |
||
| 5 | PointedEar | 375 | $count = $topthreads; |
| 376 | } |
||
| 7 | PointedEar | 377 | printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
| 5 | PointedEar | 378 | print "=" x 76, "\n"; |
| 379 | $i = 0; |
||
| 7 | PointedEar | 380 | foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} } |
| 381 | keys %threads ) |
||
| 382 | { |
||
| 383 | my $name = substr( $thread, 0, 65 ); |
||
| 384 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
| 385 | $threads{$thread}{size} / 1024; #/ |
||
| 386 | last if ( ++$i == $count ); |
||
| 5 | PointedEar | 387 | } |
| 388 | print "\n", "=" x 76, "\n"; |
||
| 389 | } |
||
| 390 | |||
| 391 | ################################# |
||
| 392 | # show top 10 cross-posted groups |
||
| 393 | ################################# |
||
| 7 | PointedEar | 394 | unless ( $skipSec{7} ) |
| 395 | { |
||
| 396 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
||
| 397 | if ( keys %crossposts < $topcrossposts ) |
||
| 398 | { |
||
| 399 | $count = keys %crossposts; |
||
| 400 | } |
||
| 401 | else |
||
| 402 | { |
||
| 403 | $count = $topcrossposts; |
||
| 404 | } |
||
| 405 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
||
| 406 | print "=" x 76, "\n"; |
||
| 407 | $i = 0; |
||
| 408 | foreach |
||
| 409 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
||
| 410 | { |
||
| 411 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
||
| 412 | $crossposts{$name}; |
||
| 413 | last if ( ++$i == $count ); |
||
| 414 | } |
||
| 415 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 416 | } |
| 417 | ####################### |
||
| 418 | #show agents and counts |
||
| 419 | ####################### |
||
| 7 | PointedEar | 420 | unless ( $skipSec{8} ) |
| 421 | { |
||
| 422 | if ( keys %agents < $topagents ) |
||
| 423 | { |
||
| 424 | $count = keys %agents; |
||
| 425 | } |
||
| 426 | else |
||
| 427 | { |
||
| 428 | $count = $topagents; |
||
| 429 | } |
||
| 430 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
||
| 431 | print "=" x 76, "\n"; |
||
| 432 | $i = 0; |
||
| 433 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
||
| 434 | { |
||
| 435 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
||
| 436 | $agents{$agent}; |
||
| 437 | last if ( ++$i == $count ); |
||
| 438 | } |
||
| 439 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 440 | } |
| 441 | |||
| 442 | ####################### |
||
| 443 | #show distinct agents |
||
| 444 | ####################### |
||
| 7 | PointedEar | 445 | unless ( $skipSec{9} ) |
| 446 | { |
||
| 447 | if ( keys %distinct_agent < $topagents ) |
||
| 448 | { |
||
| 449 | $count = keys %distinct_agent; |
||
| 450 | } |
||
| 451 | else |
||
| 452 | { |
||
| 453 | $count = $topagents; |
||
| 454 | } |
||
| 455 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
||
| 456 | print "=" x 76, "\n"; |
||
| 457 | $i = 0; |
||
| 458 | foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
||
| 459 | keys %distinct_agent ) |
||
| 460 | { |
||
| 461 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
||
| 462 | $distinct_agent{$agent}, |
||
| 463 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
||
| 464 | last if ( ++$i == $count ); |
||
| 465 | } |
||
| 466 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 467 | } |
| 468 | |||
| 469 | ########################## |
||
| 470 | #show timezones and counts |
||
| 471 | ########################## |
||
| 7 | PointedEar | 472 | unless ( $skipSec{10} ) |
| 473 | { |
||
| 474 | if ( keys %tz < $toptz ) |
||
| 475 | { |
||
| 476 | $count = keys %tz; |
||
| 477 | } |
||
| 478 | else |
||
| 479 | { |
||
| 480 | $count = $toptz; |
||
| 481 | } |
||
| 482 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
||
| 483 | print "=" x 76, "\n"; |
||
| 484 | $i = 0; |
||
| 485 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
||
| 486 | { |
||
| 487 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
||
| 488 | last if ( ++$i == $count ); |
||
| 489 | } |
||
| 490 | print "\n", "=" x 76, "\n"; |
||
| 5 | PointedEar | 491 | } |
| 492 | |||
| 493 | ################################ SUBROUTINES ################################ |
||
| 494 | |||
| 495 | ####################################### |
||
| 496 | # get current article's header and body |
||
| 497 | ####################################### |
||
| 7 | PointedEar | 498 | sub getarticle |
| 499 | { |
||
| 500 | %headers = (); # dump old headers |
||
| 501 | my $filename = shift; # get the name of the file |
||
| 5 | PointedEar | 502 | |
| 7 | PointedEar | 503 | # get stats about the file itself |
| 504 | $filesize = -s $filename; # get total size of file |
||
| 505 | $totsize += $filesize; # bump total sizes of all files |
||
| 506 | |||
| 507 | my $mtime = ( stat $filename )[9]; |
||
| 508 | if ( $mtime < $earliest ) |
||
| 509 | { |
||
| 5 | PointedEar | 510 | $earliest = $mtime; |
| 7 | PointedEar | 511 | } |
| 512 | elsif ( $mtime > $latest ) |
||
| 513 | { |
||
| 5 | PointedEar | 514 | $latest = $mtime; |
| 7 | PointedEar | 515 | } |
| 5 | PointedEar | 516 | |
| 7 | PointedEar | 517 | # now read the file |
| 518 | open( FILE, $filename ) or die "Can't open $filename: $!\n"; |
||
| 519 | while (<FILE>) |
||
| 520 | { |
||
| 521 | $totheader += length($_); # bump total header size |
||
| 522 | last if (/^\s*$/); # end of header? |
||
| 523 | if (/^([^:\s]*):\s+(.*)/) |
||
| 524 | { |
||
| 525 | my ( $key, $val ) = ( $1, $2 ); |
||
| 526 | $headers{$key} = decode( 'MIME-Header', $val ); |
||
| 527 | $lcheader{ clean( lc($key) ) } = clean($val); |
||
| 528 | } |
||
| 5 | PointedEar | 529 | } |
| 7 | PointedEar | 530 | @body = <FILE>; # slurp up body |
| 531 | close(FILE); |
||
| 532 | } # getarticle |
||
| 5 | PointedEar | 533 | |
| 534 | ################################### |
||
| 535 | # get data from the current article |
||
| 536 | ################################### |
||
| 7 | PointedEar | 537 | sub getdata |
| 538 | { |
||
| 5 | PointedEar | 539 | #### First, analyse header fields #### |
| 540 | |||
| 7 | PointedEar | 541 | # Set up this poster if not defined, get counts, sizes |
| 542 | $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name |
||
| 543 | if ( !defined( $data{$poster} ) ) |
||
| 544 | { # seen this one before? |
||
| 545 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
||
| 546 | $data{$poster}{orig} = 0; |
||
| 547 | $data{$poster}{quoted} = 0; |
||
| 548 | } |
||
| 549 | $data{$poster}{count}++; # bump count for this poster |
||
| 550 | $data{$poster}{size} += $filesize; # total size of file |
||
| 5 | PointedEar | 551 | |
| 7 | PointedEar | 552 | # The User-Agent and/or X-Newsreader fields |
| 553 | # for User-Agent by poster |
||
| 554 | if ( defined $lcheader{"user-agent"} ) |
||
| 555 | { |
||
| 556 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
||
| 557 | } |
||
| 558 | if ( defined $lcheader{"x-newsreader"} ) |
||
| 559 | { |
||
| 560 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
||
| 561 | } |
||
| 5 | PointedEar | 562 | |
| 7 | PointedEar | 563 | # The User Agent for User-Agent by number of posts |
| 564 | my $UA = "unknown"; |
||
| 565 | foreach my $keys ( keys %lcheader ) |
||
| 566 | { |
||
| 567 | if ( defined $lcheader{'user-agent'} ) |
||
| 568 | { |
||
| 569 | $UA = $lcheader{'user-agent'}; |
||
| 570 | } |
||
| 571 | elsif ( defined $lcheader{"x-newsreader"} ) |
||
| 572 | { |
||
| 573 | $UA = $lcheader{"x-newsreader"}; |
||
| 574 | } |
||
| 575 | elsif ( defined $lcheader{'x-mailer'} ) |
||
| 576 | { |
||
| 577 | $UA = $lcheader{'x-mailer'}; |
||
| 578 | } |
||
| 579 | elsif ( |
||
| 580 | ( defined $lcheader{'organization'} ) |
||
| 581 | && ( $lcheader{'organization'} =~ |
||
| 582 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
||
| 583 | ) |
||
| 584 | { |
||
| 585 | $UA = $lcheader{'organization'}; |
||
| 586 | } |
||
| 587 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
||
| 588 | { |
||
| 589 | $UA = "Pine"; |
||
| 590 | } ## Hopefully found UA, else set to unknown |
||
| 591 | } |
||
| 5 | PointedEar | 592 | |
| 7 | PointedEar | 593 | $UA = clean($UA); |
| 594 | $UA = get_agent($UA); |
||
| 5 | PointedEar | 595 | |
| 7 | PointedEar | 596 | sub get_agent |
| 597 | { |
||
| 598 | my $raw = shift; |
||
| 599 | my $agent = $raw; |
||
| 5 | PointedEar | 600 | |
| 7 | PointedEar | 601 | ## strip http |
| 602 | if ( $raw =~ /.*http.*/ ) |
||
| 603 | { |
||
| 604 | $raw =~ s!posted via!!i; |
||
| 605 | $raw =~ s!http://!!g; |
||
| 606 | $raw =~ s!/!!g; |
||
| 607 | $raw =~ s! !!g; |
||
| 608 | } |
||
| 5 | PointedEar | 609 | |
| 7 | PointedEar | 610 | ## Fix Outlook from Mac |
| 611 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
||
| 5 | PointedEar | 612 | |
| 7 | PointedEar | 613 | ## Pick out the popular agents |
| 614 | if ( $raw =~ /(outlook express)/i |
||
| 615 | || $raw =~ /(microplanet gravity)/i |
||
| 616 | || $raw =~ /(news rover)/i |
||
| 617 | || $raw =~ /(forte agent)/i |
||
| 618 | || $raw =~ /(forte free agent)/i ) |
||
| 619 | { |
||
| 620 | $agent = $1; |
||
| 621 | } |
||
| 622 | elsif ( |
||
| 623 | $raw =~ /^( |
||
| 5 | PointedEar | 624 | pan |
| 625 | |sylpheed |
||
| 626 | |slrn |
||
| 627 | |mozilla |
||
| 628 | |knode |
||
| 629 | |tin |
||
| 630 | |hamster |
||
| 631 | |xrn |
||
| 632 | |xnews |
||
| 633 | |aol |
||
| 634 | |gnus |
||
| 635 | |krn |
||
| 636 | |macsoup |
||
| 637 | |messenger |
||
| 638 | |openxp |
||
| 639 | |pine |
||
| 640 | |thoth |
||
| 641 | |turnpike |
||
| 642 | |winvn |
||
| 643 | |vsoup |
||
| 644 | |google |
||
| 645 | |supernews |
||
| 646 | |nn |
||
| 647 | |rn |
||
| 648 | |007 |
||
| 649 | |webtv |
||
| 650 | |compuserve |
||
| 7 | PointedEar | 651 | )/ix |
| 652 | ) |
||
| 653 | { |
||
| 654 | $agent = $1; |
||
| 655 | } |
||
| 656 | else |
||
| 657 | { |
||
| 658 | ## Clean up unknown agents |
||
| 659 | if ( $raw =~ m!^(.*?)/! ) |
||
| 660 | { |
||
| 661 | $agent = $1; |
||
| 662 | } |
||
| 663 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
||
| 664 | { |
||
| 665 | $agent = $1; |
||
| 666 | } |
||
| 667 | } |
||
| 668 | |||
| 669 | $distinct_agent{$agent}++; |
||
| 670 | return $agent; |
||
| 5 | PointedEar | 671 | } |
| 672 | |||
| 7 | PointedEar | 673 | # Get all cross-posted newsgroups |
| 674 | for ( split /,/, $headers{"Newsgroups"} ) |
||
| 675 | { |
||
| 676 | $crossposts{$_}++; # bump count for each |
||
| 677 | } |
||
| 5 | PointedEar | 678 | |
| 7 | PointedEar | 679 | # Get threads |
| 680 | my $thread = encode( 'UTF-8', $headers{"Subject"} ); |
||
| 681 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
||
| 682 | $thread =~ s/\s+/ /g; # collapse whitespace |
||
| 683 | $threads{$thread}{count} += 1; # bump count of this subject |
||
| 684 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
||
| 5 | PointedEar | 685 | |
| 7 | PointedEar | 686 | # Is this an original post or a reply? |
| 687 | if ( defined $headers{"References"} ) |
||
| 688 | { |
||
| 689 | $replies++; |
||
| 690 | } |
||
| 691 | else |
||
| 692 | { |
||
| 693 | $origposts++; |
||
| 694 | } |
||
| 5 | PointedEar | 695 | |
| 7 | PointedEar | 696 | # Get the time zone |
| 697 | $_ = $headers{"Date"}; |
||
| 698 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
||
| 699 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
||
| 700 | { |
||
| 701 | $tz = "UTC"; |
||
| 702 | } |
||
| 703 | $tz{$tz}++; |
||
| 5 | PointedEar | 704 | |
| 7 | PointedEar | 705 | #### Now analyse the body text #### |
| 706 | my $insig = 0; |
||
| 707 | for (@body) |
||
| 708 | { |
||
| 709 | $totbody += length($_); # bump total body size |
||
| 710 | next if (/^$>/); # don't count blank lines in body |
||
| 711 | if ( $insig == 1 ) |
||
| 712 | { |
||
| 713 | $totsig += length($_); # bump total sig size |
||
| 5 | PointedEar | 714 | |
| 7 | PointedEar | 715 | # Bill Unruh uses ] quotes, and another poster uses :: |
| 716 | } |
||
| 717 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
||
| 718 | { # are we in a quote line? |
||
| 719 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
||
| 720 | $totquoted += length($_); |
||
| 721 | } |
||
| 722 | elsif (/-- /) |
||
| 723 | { |
||
| 724 | $insig = 1; |
||
| 725 | } |
||
| 726 | else |
||
| 727 | { |
||
| 5 | PointedEar | 728 | |
| 7 | PointedEar | 729 | # we must be processing an original line |
| 730 | $data{$poster}{orig} += length($_); # bump count of original chrs |
||
| 731 | $totorig += length($_); |
||
| 732 | } |
||
| 733 | } # end for (@body) |
||
| 5 | PointedEar | 734 | |
| 7 | PointedEar | 735 | } # getdata |
| 5 | PointedEar | 736 | |
| 737 | ######################################## |
||
| 738 | # Count the User-Agents used, collapsing |
||
| 739 | # different versions into one per agent. |
||
| 740 | ######################################## |
||
| 7 | PointedEar | 741 | sub countagents |
| 742 | { |
||
| 5 | PointedEar | 743 | POSTER: |
| 7 | PointedEar | 744 | foreach $poster ( keys %data ) |
| 745 | { |
||
| 746 | foreach my $agent_name ( keys %distinct_agent ) |
||
| 747 | { # check against known ones |
||
| 748 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
||
| 749 | { |
||
| 750 | $agents{$agent_name}++; |
||
| 751 | next POSTER; |
||
| 752 | } |
||
| 753 | } |
||
| 754 | $agents{ $data{$poster}{agent} }++; |
||
| 755 | } |
||
| 756 | } # countagents |
||
| 5 | PointedEar | 757 | |
| 758 | ############################################ |
||
| 759 | # set orig/total percentages for all posters |
||
| 760 | ############################################ |
||
| 7 | PointedEar | 761 | sub fixpercent |
| 762 | { |
||
| 763 | foreach $poster ( keys %data ) |
||
| 764 | { |
||
| 765 | my $percent = 100; |
||
| 766 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
||
| 767 | { |
||
| 768 | $percent = $data{$poster}{orig} * 100 / |
||
| 769 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
||
| 770 | } |
||
| 771 | elsif ( $data{$poster}{orig} == 0 ) |
||
| 772 | { |
||
| 773 | $percent = 0; |
||
| 774 | } |
||
| 775 | $data{$poster}{percent} = $percent; |
||
| 776 | } |
||
| 5 | PointedEar | 777 | } |
| 778 | |||
| 779 | ############################## |
||
| 780 | # right pad a string with '.'s |
||
| 781 | ############################## |
||
| 7 | PointedEar | 782 | sub rpad |
| 783 | { |
||
| 784 | |||
| 785 | # get text to pad, length to pad, pad chr |
||
| 786 | my ( $text, $pad_len, $pad_chr ) = @_; |
||
| 787 | if ( length($text) > $pad_len ) |
||
| 788 | { |
||
| 789 | $text = substr( $text, 0, $pad_len ); |
||
| 790 | } |
||
| 791 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
||
| 792 | return $padded; |
||
| 5 | PointedEar | 793 | } |
| 794 | |||
| 795 | ################# |
||
| 796 | # centre a string |
||
| 797 | ################# |
||
| 7 | PointedEar | 798 | sub centred |
| 799 | { |
||
| 800 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
||
| 801 | my $pad_len = ( $width - length($text) ) / 2; #/ |
||
| 802 | my $centred = " " x $pad_len . $text; |
||
| 803 | return $centred; |
||
| 5 | PointedEar | 804 | } |
| 805 | |||
| 806 | ########################## |
||
| 807 | # put commas into a number |
||
| 808 | ########################## |
||
| 7 | PointedEar | 809 | sub commify |
| 810 | { |
||
| 811 | $_ = shift; |
||
| 812 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
||
| 813 | return $_; |
||
| 5 | PointedEar | 814 | } |
| 815 | |||
| 816 | ######################### |
||
| 817 | # clean |
||
| 818 | ######################### |
||
| 7 | PointedEar | 819 | sub clean |
| 820 | { |
||
| 821 | my $dirty = shift; |
||
| 822 | my $clean = $dirty; |
||
| 823 | $clean =~ s/^\s*//; |
||
| 824 | $clean =~ s/\s*$//; |
||
| 5 | PointedEar | 825 | |
| 7 | PointedEar | 826 | return $clean; |
| 5 | PointedEar | 827 | } |
| 828 | |||
| 7 | PointedEar | 829 | sub usage |
| 830 | { |
||
| 5 | PointedEar | 831 | |
| 7 | PointedEar | 832 | print "usage: newstat.pl newsgroupname\n"; |
| 833 | exit 1; |
||
| 5 | PointedEar | 834 | } |
| 835 | |||
| 836 | ################################### |
||
| 837 | # Write data structures to a file # |
||
| 838 | ################################### |
||
| 7 | PointedEar | 839 | sub writedata |
| 840 | { |
||
| 841 | open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
||
| 842 | print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
||
| 843 | print OUTF |
||
| 844 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
||
| 845 | foreach my $name ( keys %data ) |
||
| 846 | { |
||
| 847 | print OUTF |
||
| 848 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
||
| 849 | } |
||
| 850 | print OUTF |
||
| 851 | "============================================================================\n"; |
||
| 852 | print OUTF "Thread subjects\n"; |
||
| 853 | print OUTF |
||
| 854 | "----------------------------------------------------------------------------\n"; |
||
| 855 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
||
| 856 | { |
||
| 857 | print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
||
| 858 | } |
||
| 859 | print OUTF |
||
| 860 | "============================================================================\n"; |
||
| 861 | print OUTF "Cross-posts\n"; |
||
| 862 | print OUTF |
||
| 863 | "----------------------------------------------------------------------------\n"; |
||
| 864 | foreach my $name ( sort keys %crossposts ) |
||
| 865 | { |
||
| 866 | print OUTF "$name : $crossposts{$name}\n"; |
||
| 867 | } |
||
| 868 | print OUTF print OUTF |
||
| 869 | "============================================================================\n"; |
||
| 870 | print OUTF "User agents\n"; |
||
| 871 | print OUTF |
||
| 872 | "----------------------------------------------------------------------------\n"; |
||
| 873 | foreach my $name ( sort keys %agents ) |
||
| 874 | { |
||
| 875 | print OUTF "$name : $agents{$name}\n"; |
||
| 876 | } |
||
| 877 | print OUTF |
||
| 878 | "============================================================================\n"; |
||
| 879 | print OUTF "Time zones\n"; |
||
| 880 | print OUTF |
||
| 881 | "----------------------------------------------------------------------------\n"; |
||
| 882 | foreach my $name ( sort keys %tz ) |
||
| 883 | { |
||
| 884 | print OUTF "$name : $tz{$name}\n"; |
||
| 885 | } |
||
| 886 | close OUTF; |
||
| 887 | } # writedata |