Rev 6 | Go to most recent revision | Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 5 | PointedEar | 1 | #!/usr/bin/perl -w |
| 2 | use strict; |
||
| 3 | ######################### |
||
| 4 | # newsstat.pl version 0.3 |
||
| 5 | |||
| 6 | |||
| 7 | |||
| 8 | ################################################################### |
||
| 9 | # Collect statistics about the newsgroup named on the command line. |
||
| 10 | # Check all articles in the last $numdays-day period. Rank posters by |
||
| 11 | # no. of posts and by volume of posts, report on top and bottom |
||
| 12 | # $topposters posters. Show their name, no. posts, size of posts, percentage |
||
| 13 | # quoted lines. Rank user-agents used, both by poster and by |
||
| 14 | # post. Rank top $topthreads threads, $topcrossposts cross-posted groups, $toptz time zones. |
||
| 15 | ################################################################### |
||
| 16 | |||
| 17 | ################################################################## |
||
| 18 | # RECENT CHANGES # |
||
| 19 | # 2004/06/19 - newsgroup name is $ARGV[0] |
||
| 20 | # - Allow command line flags for subtracting |
||
| 21 | # output if not pertinent for a group |
||
| 22 | # 2002/11/09 - Put Garry's writedata() function back in. |
||
| 23 | # - added "rn" to my list of UA's |
||
| 24 | # - Started using %distinct_agent for both User agent |
||
| 25 | # sections |
||
| 26 | # - named it newsstat.pl version 0.3 |
||
| 27 | # 2002/11/06 - Fixed the earliest/latest file problem by using |
||
| 28 | # mtime rather than ctime, and simplifying the logic |
||
| 29 | # 2002/11/05 - moved user configurations to the top |
||
| 30 | # - fixed the cross-posting section |
||
| 31 | # - introduced the $newsgroup_name variable which |
||
| 32 | # later becomes $news$group |
||
| 33 | # - changed $name to $agent_name in countagents() |
||
| 34 | |||
| 35 | ########### NEXT ############# |
||
| 36 | # Commas in bottom section of report |
||
| 37 | # Show date the figures were compiled |
||
| 38 | # No. of HTML articles (Content-Type: text/html) |
||
| 39 | # No. of quoted sigs (/>\s*-- /) |
||
| 40 | # Per cent of top-posted articles |
||
| 41 | # Top 10 cross-posters |
||
| 42 | # Top 20 news posting hosts (from Path) |
||
| 43 | # Count of certain subject words: newbie, kde, burner, sendmail, etc. |
||
| 44 | # Count *all* User Agents that each poster uses |
||
| 45 | # What do we do about Bill Unruh's ] quote style? |
||
| 46 | # Change the way dates/times are checked |
||
| 47 | # include % share in posters by no. of arts |
||
| 48 | # include % share in posters by size |
||
| 49 | # Total, orig & quoted lines by user agent with per cent |
||
| 50 | # Take arguments, i.e. newsgroup name |
||
| 51 | ####################################################### |
||
| 52 | |||
| 53 | ###################### USER CONFIGURATIONS ############################ |
||
| 54 | |||
# The name of the group to do stats for (first command line argument)
my $newsgroup_name = $ARGV[0];
$newsgroup_name or usage();

# Check for removal flags.  The report sections are numbered 1-10; either
# "-x 3,4" or "-x3,4" leaves the listed sections out of the report.
# If the flag is given more than once, the last one wins.
my $ix;
my $j;
my %skipSec;
my @skiplist;
my $args = @ARGV;
for ( $ix = 1 ; $ix < $args ; $ix++ ) {
    if ( $ARGV[$ix] eq "-x" ) {
        $j = $ix + 1;
        usage() if $j >= $args;         # "-x" must be followed by a list
        @skiplist = split(",", $ARGV[$j]);
        $ix = $j;                       # the list is a value, not another flag
    } elsif ( $ARGV[$ix] =~ /^-x(\d.*)/ ) {
        @skiplist = split(",", $1);
    }
}
foreach (@skiplist) {
    $skipSec{$_} = 1;
}

# Leafnode users will want /var/spool/news for this variable.
my $news = "/var/spool/news/";

# How many days are we doing statistics for?
my $numdays = 30;

# no. of agents we list
my $topagents = 10;

# no. of threads we want to know about
my $topthreads = 20;

# no. of top or bottom posters to show
my $topposters = 20;

# no. of cross-posted threads to show
my $topcrossposts = 10;

# no. of time zones to show
my $toptz = 10;
||
| 97 | |||
| 98 | |||
| 99 | |||
###################### DATA STRUCTURES ######################
# Spool sub-directory of the group: every dot in the name becomes a slash
(my $group = $newsgroup_name) =~ s!\.!/!g;

# Collected statistics
my %data;               # per poster: agent, count, size, orig, quoted, percent
my %threads;            # per subject: count, size
my %crossposts;         # group, count
my %tz;                 # timezones by count
my %distinct_agent;     # per-post user agent counts

# State of the article currently being processed
my %headers;            # holds header of current article
my %lcheader;           # holds lowercase headers
my @body;               # holds body of current article
my @sig;                # holds sig text;
my $filename;           # name of current article file
my $filesize;           # size of current article file
my $poster;             # poster we are dealing with
my $i;                  # general purpose

# Running totals
my $totalposts;         # total no. of posts considered
my $earliest;           # earliest article we have found
my $latest;             # latest article we have found
my $totsize   = 0;      # holds total sizes of all files
my $totheader = 0;      # total size of header material
my $totbody   = 0;      # total size of body material
my $totsig    = 0;      # total size of sig material
my $totorig   = 0;      # total size of original material
my $totquoted = 0;      # total size of quoted material
my $origposts = 0;      # total no. of original posts
my $replies   = 0;      # total no. of replies

# used to hold counts of User Agents used; every known agent starts at zero
my %agents = map { ( $_ => 0 ) } (
    "KNode",
    "Pan",
    "Mozilla",
    "Sylpheed",
    "Gnus",
    "Forte Agent",
    "Forte Free Agent",
    "MicroPlanet Gravity",
    "Microsoft Outlook Express",
    "Xnews",
    "slrn",
    "tin",
    "rn",
    "NN",
    "MacSOUP",
    "Foorum",
    "MT-NewsWatcher",
    "News Rover",
    "WebTV",
    "Compuserver",
    "VSoup",
);
||
| 149 | |||
######################## MAIN CODE ########################
# Unbuffered STDOUT so the report appears as it is produced.  (The original
# assigned to $!, which only overwrote errno.)
$| = 1;

chdir("$news$group") or die "Can't cd to $news$group: $!\n";
opendir(my $spool_dh, ".") or die "Can't open $news$group directory: $!\n";
while (defined($filename = readdir($spool_dh))) {
    %lcheader = ();
    next unless -f $filename;                   # only want real files
    next if ($filename eq ".overview");         # real articles only
    next if (-M $filename > $numdays);          # only want articles <= a certain age

    # Seed the earliest/latest timestamps from the first article accepted;
    # getarticle() widens the range from there on.
    my $mtime = (stat $filename)[9];
    $earliest = $mtime unless defined($earliest);
    $latest   = $mtime unless defined($latest);

    getarticle($filename);                      # read in the article
    getdata();                                  # grab the data from the article
    $totalposts++;                              # bump count of articles considered
}
closedir($spool_dh);                            # finished with the directory

# post-processing
countagents();                                  # count agents, collapsing versions
fixpercent();                                   # check percentages orig/total for posters

writedata();
||
| 172 | |||
#################### DISPLAY RESULTS #####################
# Overall totals.  An empty or stale spool directory leaves $totalposts,
# $earliest and $latest undefined and every divisor at zero, so all averages
# go through $safe_div, which yields 0 instead of dying.
my $posts_seen = $totalposts || 0;
my $safe_div   = sub {
    my ($numerator, $denominator) = @_;
    return $denominator ? $numerator / $denominator : 0;
};

print "=" x 76, "\n";
printf "%s\n", centred("Analysis of posts to $newsgroup_name", 76);
print "=" x 76, "\n";
printf "%s\n", centred("(stats compiled with a script by Garry Knight)", 76);
print "\n\n";
printf "Total posts considered: %s over %d days\n",
    commify($posts_seen), $numdays;
printf "Earliest article: %s\n",
    defined($earliest) ? scalar localtime($earliest) : "n/a";
printf "Latest article: %s\n",
    defined($latest) ? scalar localtime($latest) : "n/a";
printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies);
printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
    commify(int($totsize / 1024)), $totsize / 1048576;
printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
    commify(int($safe_div->($posts_seen, $numdays))),
    $safe_div->($totsize, $numdays) / 1048576,
    commify(int($safe_div->($totsize, $posts_seen)));

# $count is reused by the report sections further down, so it stays a
# file-level variable.
my $count = keys %data;
printf "Total headers: %s KB bodies: %s KB\n",
    commify(int($totheader / 1024)), commify(int($totbody / 1024));
printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n",
    commify(int($totquoted / 1024)), commify(int($totorig / 1024)),
    $safe_div->($totorig * 100, $totorig + $totquoted), commify(int($totsig / 1024));
printf "Total number of posters: %s, average %s bytes per poster\n", commify($count),
    commify(int($safe_div->($totsize, $count)));
$count = keys %threads;
printf "Total number of threads: %s, average %s bytes per thread\n", commify($count),
    commify(int($safe_div->($totsize, $count)));
printf "Total number of User-Agents: %d\n", scalar keys %agents;
print "\n", "=" x 76, "\n";
||
| 202 | |||
###############################
# show posters by article count Sec 1;
###############################
unless ( $skipSec{1} ) {
    # Never announce more rows than there are posters.
    my $available = keys %data;
    my $shown     = $available < $topposters ? $available : $topposters;

    printf "%s\n", centred("Top $shown posters by number of articles", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by name so the listing is the same on every run.
    foreach my $who (sort { $data{$b}{count} <=> $data{$a}{count} or $a cmp $b } keys %data) {
        last if $rank == $shown;
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($who, 63, "."), $data{$who}{count};
    }
    print "\n", "=" x 76, "\n";
}
||
| 222 | |||
################################
# show posters by size in Kbytes Sec 2;
################################
unless ( $skipSec{2} ) {
    # Never announce more rows than there are posters.
    my $available = keys %data;
    my $shown     = $available < $topposters ? $available : $topposters;

    printf "%s\n", centred("Top $shown posters by article size in Kbytes", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by name so the listing is the same on every run.
    foreach my $who (sort { $data{$b}{size} <=> $data{$a}{size} or $a cmp $b } keys %data) {
        last if $rank == $shown;
        # %6d truncates the byte count divided by 1024 to whole kilobytes
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($who, 63, "."), $data{$who}{size} / 1024;
    }
    print "\n", "=" x 76, "\n";
}
||
| 242 | |||
####################################
# show top posters for original text (Sec 3)
####################################
unless ( $skipSec{3} ) {
    # Upper bound on rows; the filters below may leave fewer.
    my $available = keys %data;
    my $shown     = $available < $topposters ? $available : $topposters;

    printf "%s\n", centred("Top $shown responders by original text (> 5 posts)", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Highest share of original text first; ties are broken by name.
    foreach my $who (sort { $data{$b}{percent} <=> $data{$a}{percent} or $a cmp $b } keys %data) {
        last if $rank == $shown;
        next if $data{$who}{quoted} == 0;       # never quoted anything: not a responder
        # NOTE(review): the title says "> 5 posts" but posters with exactly
        # 5 posts pass this test - confirm which was intended.
        next if $data{$who}{count} < 5;
        printf "%2d: %-63s : %02.2f%%\n", ++$rank, rpad($who, 63, "."), $data{$who}{percent};
    }
    print "\n", "=" x 76, "\n";
}
||
| 264 | |||
#######################################
# show bottom posters for original text (Sec 4)
#######################################
unless ( $skipSec{4} ) {
    # Upper bound on rows; the filters below may leave fewer.
    my $available = keys %data;
    my $shown     = $available < $topposters ? $available : $topposters;

    printf "%s\n", centred("Bottom $shown responders by original text (> 5 posts)", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Lowest share of original text first; ties are broken by name.
    foreach my $who (sort { $data{$a}{percent} <=> $data{$b}{percent} or $a cmp $b } keys %data) {
        last if $rank == $shown;
        next if $data{$who}{quoted} == 0;       # never quoted anything: not a responder
        # NOTE(review): the title says "> 5 posts" but posters with exactly
        # 5 posts pass this test - confirm which was intended.
        next if $data{$who}{count} < 5;
        printf "%2d: %-63s : %02.2f%%\n", ++$rank, rpad($who, 63, "."), $data{$who}{percent};
    }
    print "\n", "=" x 76, "\n";
}
||
| 286 | |||
####################################
# show threads by number of articles (Sec 5)
####################################
unless ( $skipSec{5} ) {
    # Never announce more rows than there are threads.
    my $available = keys %threads;
    my $shown     = $available < $topthreads ? $available : $topthreads;

    printf "%s\n", centred("Top $shown threads by no. of articles", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by subject so the listing is the same on every run.
    foreach my $subject (sort { $threads{$b}{count} <=> $threads{$a}{count} or $a cmp $b } keys %threads) {
        last if $rank == $shown;
        # rpad() already cuts long subjects down to the column width
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($subject, 63, "."), $threads{$subject}{count};
    }
    print "\n", "=" x 76, "\n";
}
||
################################
# show threads by size in Kbytes (Sec 6)
################################
unless ( $skipSec{6} ) {
    # Never announce more rows than there are threads.
    my $available = keys %threads;
    my $shown     = $available < $topthreads ? $available : $topthreads;

    printf "%s\n", centred("Top $shown threads by size in KB", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by subject so the listing is the same on every run.
    foreach my $subject (sort { $threads{$b}{size} <=> $threads{$a}{size} or $a cmp $b } keys %threads) {
        last if $rank == $shown;
        # rpad() already cuts long subjects down to the column width
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($subject, 63, "."), $threads{$subject}{size} / 1024;
    }
    print "\n", "=" x 76, "\n";
}
||
| 325 | |||
#################################
# show top cross-posted groups (Sec 7)
#################################
unless ( $skipSec{7} ) {
    delete $crossposts{"$newsgroup_name"};      # don't include ours

    # Never announce more rows than there are other groups.
    my $available = keys %crossposts;
    my $shown     = $available < $topcrossposts ? $available : $topcrossposts;

    printf "%s\n", centred("Top $shown cross-posted groups", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by group name so the listing is the same on every run.
    foreach my $grp (sort { $crossposts{$b} <=> $crossposts{$a} or $a cmp $b } keys %crossposts) {
        last if $rank == $shown;
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($grp, 63, "."), $crossposts{$grp};
    }
    print "\n", "=" x 76, "\n";
}
||
#######################
# show agents and counts (Sec 8): one count per poster
#######################
unless ( $skipSec{8} ) {
    # Never announce more rows than there are agents.
    my $available = keys %agents;
    my $shown     = $available < $topagents ? $available : $topagents;

    printf "%s\n", centred("Top $shown User Agents by poster", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by agent name so the listing is the same on every run.
    foreach my $agent (sort { $agents{$b} <=> $agents{$a} or $a cmp $b } keys %agents) {
        last if $rank == $shown;
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($agent, 63, "."), $agents{$agent};
    }
    print "\n", "=" x 76, "\n";
}
||
| 363 | |||
#######################
# show distinct agents (Sec 9): one count per article, with share of all posts
#######################
unless ( $skipSec{9} ) {
    # Never announce more rows than there are agents.
    my $available = keys %distinct_agent;
    my $shown     = $available < $topagents ? $available : $topagents;

    printf "%s\n", centred("Top $shown User Agents by number of posts", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by agent name so the listing is the same on every run.
    foreach my $agent (sort { $distinct_agent{$b} <=> $distinct_agent{$a} or $a cmp $b } keys %distinct_agent) {
        last if $rank == $shown;
        # No zero check on $totalposts: this loop only runs when
        # %distinct_agent has entries, i.e. at least one article was read.
        printf "%2d: %-58s : %5d (%2.f%%)\n", ++$rank, rpad($agent, 58, "."),
            $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100);
    }
    print "\n", "=" x 76, "\n";
}
||
| 382 | |||
##########################
# show timezones and counts (Sec 10)
##########################
unless ( $skipSec{10} ) {
    # Never announce more rows than there are time zones.
    my $available = keys %tz;
    my $shown     = $available < $toptz ? $available : $toptz;

    # The title now reports the real row count, like every other section,
    # instead of a hard-coded "10".
    printf "%s\n", centred("Top $shown time zones", 76);
    print "=" x 76, "\n";

    my $rank = 0;
    # Ties are broken by zone name so the listing is the same on every run.
    foreach my $zone (sort { $tz{$b} <=> $tz{$a} or $a cmp $b } keys %tz) {
        last if $rank == $shown;
        printf "%2d: %-63s : %6d\n", ++$rank, rpad($zone, 63, "."), $tz{$zone};
    }
    print "\n", "=" x 76, "\n";
}
||
| 401 | |||
| 402 | |||
| 403 | ################################ SUBROUTINES ################################ |
||
| 404 | |||
| 405 | |||
| 406 | ####################################### |
||
| 407 | # get current article's header and body |
||
| 408 | ####################################### |
||
# Read one article file into the file-level %headers, %lcheader and @body,
# and update the running size and date totals.
#
#   $filename - article file, relative to the current directory
#
# Dies if the file cannot be opened.  Expects $earliest and $latest to have
# been seeded by the caller.  Header lines that do not look like
# "Name: value" (e.g. folded continuation lines) are counted in the header
# size but not stored.
sub getarticle {
    my $filename = shift;               # get the name of the file
    %headers = ();                      # dump old headers

    # get stats about the file itself
    $filesize = -s $filename;           # get total size of file
    $totsize += $filesize;              # bump total sizes of all files

    my $mtime = (stat $filename)[9];
    if ( $mtime < $earliest ) {
        $earliest = $mtime;
    } elsif ( $mtime > $latest ) {
        $latest = $mtime;
    }

    # now read the file; three-argument open keeps odd file names from being
    # interpreted as an open mode
    open(my $fh, '<', $filename) or die "Can't open $filename: $!\n";
    while (my $line = <$fh>) {
        $totheader += length($line);    # bump total header size (separator line included)
        last if $line =~ /^\s*$/;       # end of header?
        if ($line =~ /^([^:\s]*):\s+(.*)/) {
            my ($key, $val) = ($1, $2);
            $headers{$key} = $val;
            $lcheader{clean(lc($key))} = clean($val);
        }
    }
    @body = <$fh>;                      # slurp up body
    close($fh);
} # getarticle
||
| 437 | |||
| 438 | ################################### |
||
| 439 | # get data from the current article |
||
| 440 | ################################### |
||
# Fold the article currently held in %headers, %lcheader and @body into the
# statistics: per-poster counts and sizes, user agents, cross-posts, threads,
# original/reply counts, time zones and quoted/original/sig byte totals.
# Takes no arguments and returns nothing useful; works on file-level state.
sub getdata {
    #### First, analyse header fields ####

    # Set up this poster if not defined, get counts, sizes
    $poster = defined $headers{From} ? $headers{From} : '(unknown poster)';
    if (!defined($data{$poster})) {             # seen this one before?
        $data{$poster}{agent}  = 'Unknown';     # overridden below when a header names one
        $data{$poster}{orig}   = 0;
        $data{$poster}{quoted} = 0;
    }
    $data{$poster}{count}++;                    # bump count for this poster
    $data{$poster}{size} += $filesize;          # total size of file

    # User-Agent by poster: X-Newsreader wins over User-Agent when both exist
    if (defined $lcheader{"user-agent"}) {
        $data{$poster}{agent} = $lcheader{"user-agent"};
    }
    if (defined $lcheader{"x-newsreader"}) {
        $data{$poster}{agent} = $lcheader{"x-newsreader"};
    }

    # User-Agent by number of posts: first header that identifies the client.
    # (The original ran this same chain once per header key; the outcome did
    # not depend on the key, so it is evaluated once.)
    my $UA = "unknown";
    if (defined $lcheader{'user-agent'}) {
        $UA = $lcheader{'user-agent'};
    }
    elsif (defined $lcheader{"x-newsreader"}) {
        $UA = $lcheader{"x-newsreader"};
    }
    elsif (defined $lcheader{'x-mailer'}) {
        $UA = $lcheader{'x-mailer'};
    }
    elsif ((defined $lcheader{'organization'}) &&
           ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) {
        $UA = $lcheader{'organization'};
    }
    elsif ((defined $lcheader{'message-id'}) && ($lcheader{'message-id'} =~ /pine/i)) {
        $UA = "Pine";
    }   ## Hopefully found UA, else set to unknown

    $UA = clean($UA);
    $UA = get_agent($UA);                       # also bumps %distinct_agent

    # Get all cross-posted newsgroups
    my $groups = defined $headers{"Newsgroups"} ? $headers{"Newsgroups"} : '';
    for my $grp (split /,/, $groups) {
        $crossposts{$grp}++;                    # bump count for each
    }

    # Get threads
    my $thread = defined $headers{"Subject"} ? $headers{"Subject"} : '';
    $thread =~ s/^re: //i;                      # Remove Re: or re: at start
    $thread =~ s/\s+/ /g;                       # collapse whitespace
    $threads{$thread}{count} += 1;              # bump count of this subject
    $threads{$thread}{size}  += $filesize;      # bump bytes for this thread

    # Is this an original post or a reply?
    if (defined $headers{"References"}) {
        $replies++;
    } else {
        $origposts++;
    }

    # Get the time zone: whatever follows hh:mm:ss in the Date header
    my $date   = defined $headers{"Date"} ? $headers{"Date"} : '';
    my ($zone) = $date =~ /\d\d:\d\d:\d\d\s+(.*)/;
    $zone = 'unknown' unless defined $zone;     # missing or unparsable Date header
    $zone = "UTC" if $zone =~ /UTC|GMT|0000/;
    $tz{$zone}++;

    #### Now analyse the body text ####
    my $insig = 0;
    for my $line (@body) {
        my $len = length($line);
        $totbody += $len;                       # bump total body size

        # don't count blank lines in body  (the original pattern /^$>/
        # interpolated the effective user id instead of matching them)
        next if $line =~ /^\s*$/;

        if ($insig == 1) {
            $totsig += $len;                    # bump total sig size
        # Bill Unruh uses ] quotes, and another poster uses ::
        } elsif ($line =~ /^\s*[>\]]/ or $line =~ /^\s*::/) {   # are we in a quote line?
            $data{$poster}{quoted} += $len;     # bump count of quoted chrs
            $totquoted += $len;
        } elsif ($line =~ /^-- \s*$/) {
            # sig separator: "-- " alone on a line, not "-- " anywhere in a sentence
            $insig = 1;
        } else {
            # we must be processing an original line
            $data{$poster}{orig} += $len;       # bump count of original chrs
            $totorig += $len;
        }
    } # end for (@body)

} # getdata

# Reduce a raw user-agent string to a short agent name, dropping version
# details, and count it in %distinct_agent.
#
#   $raw - the cleaned header value
#
# Returns the agent name: the matched well-known name as spelled in the
# header, else the text before the first "/", else the leading word before
# the first digit, else the value unchanged.
sub get_agent {
    my $raw   = shift;
    my $agent = $raw;                           # fallback when nothing below matches

    ## strip http
    if ( $raw =~ /http/ ) {
        $raw =~ s!posted via!!i;
        $raw =~ s!http://!!g;
        $raw =~ s!/!!g;
        $raw =~ s! !!g;
    }

    ## Fix Outlook from Mac
    if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }

    ## Pick out the popular agents
    if (   $raw =~ /(outlook express)/i
        || $raw =~ /(microplanet gravity)/i
        || $raw =~ /(news rover)/i
        || $raw =~ /(forte agent)/i
        || $raw =~ /(forte free agent)/i )
    {
        $agent = $1;
    }
    elsif ( $raw =~ /^(
                  pan
                 |sylpheed
                 |slrn
                 |mozilla
                 |knode
                 |tin
                 |hamster
                 |xrn
                 |xnews
                 |aol
                 |gnus
                 |krn
                 |macsoup
                 |messenger
                 |openxp
                 |pine
                 |thoth
                 |turnpike
                 |winvn
                 |vsoup
                 |google
                 |supernews
                 |nn
                 |rn
                 |007
                 |webtv
                 |compuserve
              )/ix )
    {
        $agent = $1;
    }
    else {
        ## Clean up unknown agents
        if ( $raw =~ m!^(.*?)/! ) {
            $agent = $1;
        }
        elsif ( $raw =~ /^(\w*)\d.*/ ) {
            $agent = $1;
        }
    }

    $distinct_agent{$agent}++;
    return $agent;
} # get_agent
||
| 617 | |||
| 618 | ######################################## |
||
| 619 | # Count the User-Agents used, collapsing |
||
| 620 | # different versions into one per agent. |
||
| 621 | ######################################## |
||
# Count the user agents by poster: each poster's raw agent string is filed
# under the first known agent name (a key of %distinct_agent) it contains,
# or under the raw string itself when none matches.  Results go to %agents.
sub countagents {
    # Try longer names first so that "slrn" is not swallowed by "rn"; the
    # name order used to follow hash order and changed from run to run.
    # An empty name would match every string, so it is left out.
    my @known = sort { length($b) <=> length($a) or $a cmp $b }
                grep { length } keys %distinct_agent;

  POSTER:
    foreach my $who (keys %data) {
        my $raw = $data{$who}{agent};
        foreach my $agent_name (@known) {       # check against known ones
            if ( index($raw, $agent_name) >= 0 ) {      # literal, case-sensitive
                $agents{$agent_name}++;
                next POSTER;
            }
        }
        $agents{$raw}++;
    }
} # countagents
||
| 634 | |||
| 635 | ############################################ |
||
| 636 | # set orig/total percentages for all posters |
||
| 637 | ############################################ |
||
# Store, for every poster in %data, the share of original text in
# {percent}: 0 when nothing original was written, 100 when nothing was
# quoted, otherwise orig as a percentage of orig + quoted.
sub fixpercent {
    for my $who (keys %data) {
        my $stats  = $data{$who};
        my $orig   = $stats->{orig};
        my $quoted = $stats->{quoted};

        $stats->{percent} =
              $orig   == 0 ? 0
            : $quoted == 0 ? 100
            :                $orig * 100 / ($quoted + $orig);
    }
}
||
| 649 | |||
| 650 | ############################## |
||
| 651 | # right pad a string with '.'s |
||
| 652 | ############################## |
||
# Right-pad (or truncate) a string to an exact width.
#
#   $text    - text to pad
#   $pad_len - width of the result
#   $pad_chr - padding to append; defaults to "." when omitted
#
# Returns $text cut to $pad_len characters, or extended with $pad_chr up to
# that width.  A multi-character $pad_chr is repeated and cut off at the
# width rather than overshooting it.
sub rpad {
    my ($text, $pad_len, $pad_chr) = @_;
    $pad_chr = "." unless defined $pad_chr;

    my $missing = $pad_len - length($text);
    $text .= $pad_chr x $missing if $missing > 0;
    return substr($text, 0, $pad_len);
}
||
| 662 | |||
| 663 | ################# |
||
| 664 | # centre a string |
||
| 665 | ################# |
||
# Centre a string in a field by prefixing spaces.
#
#   $text  - text to centre
#   $width - size of the field to centre in
#
# Returns $text with leading spaces only (no trailing padding).  Text wider
# than the field is returned unchanged.
sub centred {
    my ($text, $width) = @_;
    my $pad_len = int( ($width - length($text)) / 2 );
    $pad_len = 0 if $pad_len < 0;       # a negative repeat count warns under -w
    return " " x $pad_len . $text;
}
||
| 672 | |||
| 673 | ########################## |
||
| 674 | # put commas into a number |
||
| 675 | ########################## |
||
# Put thousands separators into a number.
#
#   $number - integer or decimal, optionally negative
#
# Returns the number as a string with commas grouping the integer part.
# Works on a private copy, so the caller's $_ is left alone.
sub commify {
    my $number = shift;
    # Repeatedly split the last three digits off the leading digit run
    1 while $number =~ s/^(-?\d+)(\d{3})/$1,$2/;
    return $number;
}
||
| 681 | |||
| 682 | ######################### |
||
| 683 | # clean |
||
| 684 | ######################### |
||
# Return a copy of the argument with leading and trailing whitespace removed.
sub clean {
    my ($dirty) = @_;
    my $trimmed = $dirty;
    for ($trimmed) {            # alias $_ to the copy for the two edits
        s/^\s+//;
        s/\s+$//;
    }
    return $trimmed;
}
||
| 693 | |||
| 694 | |||
# Print a short usage message and exit with status 1.
sub usage {
    print "usage: newsstat.pl newsgroupname [-x section[,section...]]\n";
    print "       -x leaves the listed report sections (1-10) out\n";
    exit 1;
}
||
| 700 | |||
| 701 | ################################### |
||
| 702 | # Write data structures to a file # |
||
| 703 | ################################### |
||
# Dump the collected data structures (%data, %threads, %crossposts, %agents
# and %tz) to /tmp/XDATA as plain text.  Dies if the file cannot be created
# or written.
#
# NOTE(review): the output path is fixed and lives in a world-writable
# directory; consider a per-user or configurable location.
sub writedata {
    my $outfile = "/tmp/XDATA";
    my $rule    = "============================================================================\n";
    my $subrule = "----------------------------------------------------------------------------\n";

    open(my $out, '>', $outfile) or die "Can't create XDATA: $!\n";

    # Name the group that was analysed, not a hard-coded one
    print $out "Data collected from $newsgroup_name\n\n";
    print $out "Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
    foreach my $name (keys %data) {
        print $out "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
    }

    print $out $rule;
    print $out "Thread subjects\n";
    print $out $subrule;
    foreach my $thread (sort { lc($a) cmp lc($b) } keys %threads) {
        print $out "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
    }

    print $out $rule;
    print $out "Cross-posts\n";
    print $out $subrule;
    foreach my $name (sort keys %crossposts) {
        print $out "$name : $crossposts{$name}\n";
    }

    # (a stray "print OUTF" here used to write a spurious "1" into the file)
    print $out $rule;
    print $out "User agents\n";
    print $out $subrule;
    foreach my $name (sort keys %agents) {
        print $out "$name : $agents{$name}\n";
    }

    print $out $rule;
    print $out "Time zones\n";
    print $out $subrule;
    foreach my $name (sort keys %tz) {
        print $out "$name : $tz{$name}\n";
    }

    # Buffered write errors only surface at close
    close($out) or die "Can't write XDATA: $!\n";
} # writedata