Rev 5 | Rev 7 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 5 | Rev 6 | ||
---|---|---|---|
Line 1... | Line 1... | ||
1 | #!/usr/bin/perl -w
|
1 | #!/usr/bin/perl -w
|
2 | use strict; |
2 | use strict; |
3 | #########################
|
3 | use warnings; |
4 | # newsstat.pl version 0.3
|
4 | use utf8; |
5 | 5 | use Encode; |
|
6 | 6 | ||
- | 7 | #########################
|
|
- | 8 | # newsstat.pl version 0.4
|
|
7 | 9 | ||
8 | ###################################################################
|
10 | ############################################################################
|
9 | # Collect statistics about the alt.os.linux.mandrake newsgroup.
|
11 | # Collect statistics about a newsgroup (specified by first argument) in
|
10 | # Check all articles in the last 7-day period. Rank posters by
|
12 | # the local news spool. Check all articles in the last 30-day period.
|
11 | # no. of posts and by volume of posts, report on top and bottom
|
13 | # Rank posters by number of posts and by volume of posts, report on top and
|
12 | # 20 posters. Show their name, no. posts, size of posts, percentage
|
14 | # bottom 20 posters. Show their name, number of posts, size of posts,
|
13 | # quoted lines. Rank user-agents used, by poster rather than by
|
15 | # percentage of quoted lines. Rank user-agents used, by poster rather than
|
14 | # post. Rank top 10 threads. Rank top 10 cross-posted groups.
|
16 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
- | 17 | #
|
|
- | 18 | # (Numbers and paths can be configured below. -- PE)
|
|
15 | ###################################################################
|
19 | ############################################################################
|
16 | 20 | ||
17 | ##################################################################
|
21 | ############################################################################
|
18 | # RECENT CHANGES #
|
22 | # RECENT CHANGES #
|
- | 23 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings
|
|
- | 24 | # - Use warnings, utf8 (just in case)
|
|
- | 25 | # - Documentation update
|
|
- | 26 | # N/A NN - Take newsgroup name as argument
|
|
19 | # 2004/06/19 - newsgroup name is $ARGV[0]
|
27 | # 2004-06-19 NN - newsgroup name is $ARGV[0]
|
20 | # - Allow command line flags for subtracting
|
28 | # - Allow command line flags for subtracting
|
21 | # output if not pertinent for a group
|
29 | # output if not pertinent for a group
|
22 | # 2002/11/09 - Put Garry's writedata() function back in.
|
30 | # 2002-11-09 NN - Put Garry's writedata() function back in.
|
23 | # - added "rn" to my list of UA's
|
31 | # - added "rn" to my list of UA's
|
24 | # - Started using %distinct_agent for both User agent
|
32 | # - Started using %distinct_agent for both User agent
|
25 | # sections
|
33 | # sections
|
26 | # - named it newsstat.pl version 0.3
|
34 | # - named it newsstat.pl version 0.3
|
27 | # 2002/11/06 - Fixed the earliest/latest file problem by using
|
35 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using
|
28 | # mtime rather than ctime, and simplifying the logic
|
36 | # mtime rather than ctime, and simplifying the logic
|
29 | # 2002/11/05 - moved user configurations to the top
|
37 | # 2002-11-05 NN - moved user configurations to the top
|
30 | # - fixed the cross-posting section
|
38 | # - fixed the cross-posting section
|
31 | # - introduced the $newsgroup_name variable which
|
39 | # - introduced the $newsgroup_name variable which
|
32 | # later becomes $news$group
|
40 | # later becomes $news$group
|
33 | # - changed $name to $agent_name in countagents()
|
41 | # - changed $name to $agent_name in countagents()
|
- | 42 | #
|
|
- | 43 | # Contributors
|
|
- | 44 | # -------------
|
|
- | 45 | # NN Nomen nominandum (name to be determined later)
|
|
- | 46 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
|
|
34 | 47 | ||
35 | ########### NEXT #############
|
48 | ########### TODO #############
|
36 | # Commas in bottom section of report
|
49 | # Commas in bottom section of report
|
37 | # Show date the figures were compiled
|
50 | # Show date the figures were compiled
|
38 | # No. of HTML articles (Content-Type: text/html)
|
51 | # No. of HTML articles (Content-Type: text/html)
|
39 | # No. of quoted sigs (/>\s*-- /)
|
52 | # No. of quoted sigs (/>\s*-- /)
|
40 | # Per cent of top-posted articles
|
53 | # Per cent of top-posted articles
|
Line 45... | Line 58... | ||
45 | # What do we do about Bill Unruh's ] quote style?
|
58 | # What do we do about Bill Unruh's ] quote style?
|
46 | # Change the way dates/times are checked
|
59 | # Change the way dates/times are checked
|
47 | # include % share in posters by no. of arts
|
60 | # include % share in posters by no. of arts
|
48 | # include % share in posters by size
|
61 | # include % share in posters by size
|
49 | # Total, orig & quoted lines by user agent with per cent
|
62 | # Total, orig & quoted lines by user agent with per cent
|
50 | # Take arguments, i.e. newsgroup name
|
63 | # Take more arguments
|
51 | #######################################################
|
64 | #######################################################
|
52 | 65 | ||
53 | ###################### USER CONFIGURATIONS ############################
|
66 | ###################### USER CONFIGURATIONS ############################
|
54 | 67 | ||
55 | # The name of the group to do stats for
|
68 | # The name of the group to do stats for
|
56 | my $newsgroup_name = $ARGV[0]; |
69 | my $newsgroup_name = $ARGV[0]; |
57 | $newsgroup_name or &usage; |
70 | $newsgroup_name or &usage; |
58 | 71 | ||
59 | # Check for removal flags
|
72 | # Check for removal flags
|
60 | my $ix; |
73 | my $ix; |
61 | my $j; |
74 | my $j; |
62 | my %skipSec; |
75 | my %skipSec; |
Line 172... | Line 185... | ||
172 | 185 | ||
173 | #################### DISPLAY RESULTS #####################
|
186 | #################### DISPLAY RESULTS #####################
|
174 | print "=" x 76, "\n"; |
187 | print "=" x 76, "\n"; |
175 | printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
188 | printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
176 | print "=" x 76, "\n"; |
189 | print "=" x 76, "\n"; |
177 | printf "%s\n", ¢red("(stats compiled with a script by Garry Knight)", 76); |
190 | printf "%s\n", ¢red("(stats compiled with a script by Garry Knight et al.)", 76); |
178 | print "\n\n"; |
191 | print "\n\n"; |
179 | printf "Total posts considered: %s over %d days\n", |
192 | printf "Total posts considered: %s over %d days\n", |
180 | commify($totalposts), $numdays; |
193 | commify($totalposts), $numdays; |
181 | printf "Earliest article: %s\n", scalar localtime($earliest); |
194 | printf "Earliest article: %s\n", scalar localtime($earliest); |
182 | printf "Latest article: %s\n", scalar localtime($latest); |
195 | printf "Latest article: %s\n", scalar localtime($latest); |
Line 425... | Line 438... | ||
425 | while (<FILE>) { |
438 | while (<FILE>) { |
426 | $totheader += length($_); # bump total header size |
439 | $totheader += length($_); # bump total header size |
427 | last if (/^\s*$/); # end of header? |
440 | last if (/^\s*$/); # end of header? |
428 | if (/^([^:\s]*):\s+(.*)/) { |
441 | if (/^([^:\s]*):\s+(.*)/) { |
429 | my($key,$val) = ($1,$2); |
442 | my($key,$val) = ($1,$2); |
430 | $headers{$key} = $val; |
443 | $headers{$key} = decode('MIME-Header', $val); |
431 | $lcheader{clean(lc($key))} = clean($val); |
444 | $lcheader{clean(lc($key))} = clean($val); |
432 | }
|
445 | }
|
433 | }
|
446 | }
|
434 | @body = <FILE>; # slurp up body |
447 | @body = <FILE>; # slurp up body |
435 | close(FILE); |
448 | close(FILE); |
Line 440... | Line 453... | ||
440 | ###################################
|
453 | ###################################
|
441 | sub getdata { |
454 | sub getdata { |
442 | #### First, analyse header fields ####
|
455 | #### First, analyse header fields ####
|
443 | 456 | ||
444 | # Set up this poster if not defined, get counts, sizes
|
457 | # Set up this poster if not defined, get counts, sizes
|
445 | $poster = $headers{From}; # get the poster's name |
458 | $poster = encode('UTF-8', $headers{From}); # get the poster's name |
446 | if (!defined($data{$poster})) { # seen this one before? |
459 | if (!defined($data{$poster})) { # seen this one before? |
447 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
460 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
448 | $data{$poster}{orig} = 0; |
461 | $data{$poster}{orig} = 0; |
449 | $data{$poster}{quoted} = 0; |
462 | $data{$poster}{quoted} = 0; |
450 | }
|
463 | }
|
Line 570... | Line 583... | ||
570 | for (split /,/, $headers{"Newsgroups"}) { |
583 | for (split /,/, $headers{"Newsgroups"}) { |
571 | $crossposts{$_}++; # bump count for each |
584 | $crossposts{$_}++; # bump count for each |
572 | }
|
585 | }
|
573 | 586 | ||
574 | # Get threads
|
587 | # Get threads
|
575 | my $thread = $headers{"Subject"}; |
588 | my $thread = encode('UTF-8', $headers{"Subject"}); |
576 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
589 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
577 | $thread =~ s/\s+/ /g; # collapse whitespace |
590 | $thread =~ s/\s+/ /g; # collapse whitespace |
578 | $threads{$thread}{count} += 1; # bump count of this subject |
591 | $threads{$thread}{count} += 1; # bump count of this subject |
579 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
592 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
580 | 593 |