Rev 9 | Rev 11 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 9 | Rev 10 | ||
---|---|---|---|
1 | #!/usr/bin/env perl
|
1 | #!/usr/bin/env perl
|
2 | use strict; |
2 | use strict; |
3 | use warnings; |
3 | use warnings; |
4 | use utf8; |
4 | use utf8; |
5 | use encoding 'utf-8'; |
5 | use encoding 'utf-8'; |
6 | use Encode; |
6 | use Encode; |
7 | 7 | ||
8 | ###########################
|
8 | ###########################
|
9 | # newsstat.pl version 0.4.1
|
9 | # newsstat.pl version 0.4.2
|
10 | 10 | ||
11 | ############################################################################
|
11 | ############################################################################
|
12 | # Collect statistics about a newsgroup (specified by first argument) in
|
12 | # Collect statistics about a newsgroup (specified by first argument) in
|
13 | # the local news spool. Check all articles in the last 30-day period.
|
13 | # the local news spool. Check all articles in the last 30-day period.
|
14 | # Rank posters by number of posts and by volume of posts, report on top and
|
14 | # Rank posters by number of posts and by volume of posts, report on top and
|
15 | # bottom 20 posters. Show their name, number of posts, size of posts,
|
15 | # bottom 20 posters. Show their name, number of posts, size of posts,
|
16 | # percentage of quoted lines. Rank user-agents used, by poster rather than
|
16 | # percentage of quoted lines. Rank user-agents used, by poster rather than
|
17 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
17 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
18 | #
|
18 | #
|
19 | # (Numbers and paths can be configured below. -- PE)
|
19 | # (Numbers and paths can be configured below. -- PE)
|
20 | ############################################################################
|
20 | ############################################################################
|
21 | 21 | ||
22 | ############################################################################
|
22 | ############################################################################
|
23 | # RECENT CHANGES #
|
23 | # RECENT CHANGES #
|
24 | # 2011-10-03 PE - Use more compatible shebang
|
24 | # 2011-10-03 PE - Use more compatible shebang
|
25 | # - Fixed some Perl::Critic-ized code
|
25 | # - Fixed some Perl::Critic-ized code
|
26 | # - Fixed wrong indent for non-ASCII names (TODO: proper dot count)
|
26 | # - Fixed wrong indent for non-ASCII names (TODO: proper dot count)
|
27 | # - Formatted source code
|
27 | # - Formatted source code
|
28 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings
|
28 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings
|
29 | # - Use warnings, utf8 (just in case)
|
29 | # - Use warnings, utf8 (just in case)
|
30 | # - Documentation update
|
30 | # - Documentation update
|
31 | # N/A NN - Take newsgroup name as argument
|
31 | # N/A NN - Take newsgroup name as argument
|
32 | # 2004-06-19 NN - newsgroup name is $ARGV[0]
|
32 | # 2004-06-19 NN - newsgroup name is $ARGV[0]
|
33 | # - Allow command line flags for subtracting
|
33 | # - Allow command line flags for subtracting
|
34 | # output if not pertinent for a group
|
34 | # output if not pertinent for a group
|
35 | # 2002-11-09 NN - Put Garry's writedata() function back in.
|
35 | # 2002-11-09 NN - Put Garry's writedata() function back in.
|
36 | # - added "rn" to my list of UA's
|
36 | # - added "rn" to my list of UA's
|
37 | # - Started using %distinct_agent for both User agent
|
37 | # - Started using %distinct_agent for both User agent
|
38 | # sections
|
38 | # sections
|
39 | # - named it newsstat.pl version 0.3
|
39 | # - named it newsstat.pl version 0.3
|
40 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using
|
40 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using
|
41 | # mtime rather than ctime, and simplifying the logic
|
41 | # mtime rather than ctime, and simplifying the logic
|
42 | # 2002-11-05 NN - moved user configurations to the top
|
42 | # 2002-11-05 NN - moved user configurations to the top
|
43 | # - fixed the cross-posting section
|
43 | # - fixed the cross-posting section
|
44 | # - introduced the $newsgroup_name variable which
|
44 | # - introduced the $newsgroup_name variable which
|
45 | # later becomes $news$group
|
45 | # later becomes $news$group
|
46 | # - changed $name to $agent_name in countagents()
|
46 | # - changed $name to $agent_name in countagents()
|
47 | #
|
47 | #
|
48 | # Contributors
|
48 | # Contributors
|
49 | # -------------
|
49 | # -------------
|
50 | # NN Nomen nominandum (name to be determined later)
|
50 | # NN Nomen nominandum (name to be determined later)
|
51 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
|
51 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
|
52 | 52 | ||
53 | ########### TODO #############
|
53 | ########### TODO #############
|
54 | # Commas in bottom section of report
|
54 | # Commas in bottom section of report
|
55 | # Show date the figures were compiled
|
55 | # Show date the figures were compiled
|
56 | # No. of HTML articles (Content-Type: text/html)
|
56 | # No. of HTML articles (Content-Type: text/html)
|
57 | # No. of quoted sigs (/>\s*-- /)
|
57 | # No. of quoted sigs (/>\s*-- /)
|
58 | # Per cent of top-posted articles
|
58 | # Per cent of top-posted articles
|
59 | # Top 10 cross-posters
|
59 | # Top 10 cross-posters
|
60 | # Top 20 news posting hosts (from Path)
|
60 | # Top 20 news posting hosts (from Path)
|
61 | # Count of certain subject words: newbie, kde, burner, sendmail, etc.
|
61 | # Count of certain subject words: newbie, kde, burner, sendmail, etc.
|
62 | # Count *all* User Agents that each poster uses
|
62 | # Count *all* User Agents that each poster uses
|
63 | # What do we do about Bill Unruh's ] quote style?
|
63 | # What do we do about Bill Unruh's ] quote style?
|
64 | # Change the way dates/times are checked
|
64 | # Change the way dates/times are checked
|
65 | # include % share in posters by no. of arts
|
65 | # include % share in posters by no. of arts
|
66 | # include % share in posters by size
|
66 | # include % share in posters by size
|
67 | # Total, orig & quoted lines by user agent with per cent
|
67 | # Total, orig & quoted lines by user agent with per cent
|
68 | # Take more arguments
|
68 | # Take more arguments
|
69 | #######################################################
|
69 | #######################################################
|
70 | 70 | ||
71 | ###################### USER CONFIGURATIONS ############################
|
71 | ###################### USER CONFIGURATIONS ############################
|
72 | 72 | ||
73 | # The name of the group to do stats for
|
73 | # The name of the group to do stats for
|
74 | my $newsgroup_name = $ARGV[0]; |
74 | my $newsgroup_name = $ARGV[0]; |
75 | $newsgroup_name or &usage; |
75 | $newsgroup_name or &usage; |
76 | 76 | ||
77 | # Check for removal flags
|
77 | # Check for removal flags
|
78 | my $ix; |
78 | my $ix; |
79 | my $j; |
79 | my $j; |
80 | my %skipSec; |
80 | my %skipSec; |
81 | my @skiplist; |
81 | my @skiplist; |
82 | my $args = @ARGV; |
82 | my $args = @ARGV; |
83 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
83 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
84 | {
|
84 | {
|
85 | $j = $ix + 1; |
85 | $j = $ix + 1; |
86 | if ( $ARGV[$ix] eq "-x" ) |
86 | if ( $ARGV[$ix] eq "-x" ) |
87 | {
|
87 | {
|
88 | @skiplist = split( ",", $ARGV[$j] ); |
88 | @skiplist = split( ",", $ARGV[$j] ); |
89 | }
|
89 | }
|
90 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
90 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
91 | {
|
91 | {
|
92 | @skiplist = split( ",", $1 ); |
92 | @skiplist = split( ",", $1 ); |
93 | }
|
93 | }
|
94 | }
|
94 | }
|
95 | foreach (@skiplist) |
95 | foreach (@skiplist) |
96 | {
|
96 | {
|
97 | $skipSec{$_} = 1; |
97 | $skipSec{$_} = 1; |
98 | }
|
98 | }
|
99 | 99 | ||
100 | # Leafnode users will want /var/spool/news for this variable.
|
100 | # Leafnode users will want /var/spool/news for this variable.
|
101 | my $news = "/var/spool/news/"; |
101 | my $news = "/var/spool/news/"; |
102 | 102 | ||
103 | # How many days are we doing statistics for?
|
103 | # How many days are we doing statistics for?
|
104 | my $numdays = 30; |
104 | my $numdays = 30; |
105 | 105 | ||
106 | # no. of agents we list
|
106 | # no. of agents we list
|
107 | my $topagents = 10; |
107 | my $topagents = 10; |
108 | 108 | ||
109 | # no. of threads we want to know about
|
109 | # no. of threads we want to know about
|
110 | my $topthreads = 20; |
110 | my $topthreads = 20; |
111 | 111 | ||
112 | # no. of top or bottom posters to show
|
112 | # no. of top or bottom posters to show
|
113 | my $topposters = 20; |
113 | my $topposters = 20; |
114 | 114 | ||
115 | # no. of cross-posted threads to show
|
115 | # no. of cross-posted threads to show
|
116 | my $topcrossposts = 10; |
116 | my $topcrossposts = 10; |
117 | 117 | ||
118 | # no. of time zones to show
|
118 | # no. of time zones to show
|
119 | my $toptz = 10; |
119 | my $toptz = 10; |
120 | 120 | ||
121 | ###################### DATA STRUCTURES ######################
|
121 | ###################### DATA STRUCTURES ######################
|
122 | my $group = $newsgroup_name; |
122 | my $group = $newsgroup_name; |
123 | $group =~ s!\.!/!g; |
123 | $group =~ s!\.!/!g; |
124 | my %data; # name, count, agent, total, orig, quoted |
124 | my %data; # name, count, agent, total, orig, quoted |
125 | my %threads; # subject, count |
125 | my %threads; # subject, count |
126 | my %crossposts; # group, count |
126 | my %crossposts; # group, count |
127 | my %tz; # timezones by count |
127 | my %tz; # timezones by count |
128 | my %headers; # holds header of current article |
128 | my %headers; # holds header of current article |
129 | my %lcheader; # holds lowercase headers |
129 | my %lcheader; # holds lowercase headers |
130 | my @body; # holds body of current article |
130 | my @body; # holds body of current article |
131 | my @sig; # holds sig text; |
131 | my @sig; # holds sig text; |
132 | my $totalposts; # total no. of posts considered |
132 | my $totalposts; # total no. of posts considered |
133 | my $filename; # name of current article file |
133 | my $filename; # name of current article file |
134 | my $filesize; # size of current article file |
134 | my $filesize; # size of current article file |
135 | my $earliest; # earliest article we have found |
135 | my $earliest; # earliest article we have found |
136 | my $latest; # latest article we have found |
136 | my $latest; # latest article we have found |
137 | my $poster; # poster we are dealing with |
137 | my $poster; # poster we are dealing with |
138 | my $totsize = 0; # holds total sizes of all files |
138 | my $totsize = 0; # holds total sizes of all files |
139 | my $totheader = 0; # total size of header material |
139 | my $totheader = 0; # total size of header material |
140 | my $totbody = 0; # total size of body material |
140 | my $totbody = 0; # total size of body material |
141 | my $totsig = 0; # total size of sig material |
141 | my $totsig = 0; # total size of sig material |
142 | my $totorig = 0; # total size of original material |
142 | my $totorig = 0; # total size of original material |
143 | my $totquoted = 0; # total size of quoted material |
143 | my $totquoted = 0; # total size of quoted material |
144 | my $origposts = 0; # total no. of original posts |
144 | my $origposts = 0; # total no. of original posts |
145 | my $replies = 0; # total no. of replies |
145 | my $replies = 0; # total no. of replies |
146 | my $i; # general purpose |
146 | my $i; # general purpose |
147 | my %distinct_agent; |
147 | my %distinct_agent; |
148 | my %agents = # used to hold counts of User Agents used |
148 | my %agents = # used to hold counts of User Agents used |
149 | (
|
149 | (
|
150 | "KNode" => 0, |
150 | "KNode" => 0, |
151 | "Pan" => 0, |
151 | "Pan" => 0, |
152 | "Mozilla" => 0, |
152 | "Mozilla" => 0, |
153 | "Sylpheed" => 0, |
153 | "Sylpheed" => 0, |
154 | "Gnus" => 0, |
154 | "Gnus" => 0, |
155 | "Forte Agent" => 0, |
155 | "Forte Agent" => 0, |
156 | "Forte Free Agent" => 0, |
156 | "Forte Free Agent" => 0, |
157 | "MicroPlanet Gravity" => 0, |
157 | "MicroPlanet Gravity" => 0, |
158 | "Microsoft Outlook Express" => 0, |
158 | "Microsoft Outlook Express" => 0, |
159 | "Xnews" => 0, |
159 | "Xnews" => 0, |
160 | "slrn" => 0, |
160 | "slrn" => 0, |
161 | "tin" => 0, |
161 | "tin" => 0, |
162 | "rn" => 0, |
162 | "rn" => 0, |
163 | "NN" => 0, |
163 | "NN" => 0, |
164 | "MacSOUP" => 0, |
164 | "MacSOUP" => 0, |
165 | "Foorum" => 0, |
165 | "Foorum" => 0, |
166 | "MT-NewsWatcher" => 0, |
166 | "MT-NewsWatcher" => 0, |
167 | "News Rover" => 0, |
167 | "News Rover" => 0, |
168 | "WebTV" => 0, |
168 | "WebTV" => 0, |
169 | "Compuserver" => 0, |
169 | "Compuserver" => 0, |
170 | "VSoup" => 0 |
170 | "VSoup" => 0 |
171 | ); |
171 | ); |
172 | 172 | ||
173 | ######################## MAIN CODE ########################
|
173 | ######################## MAIN CODE ########################
|
174 | $! = 1; |
174 | $! = 1; |
175 | 175 | ||
176 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
176 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
177 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
177 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
178 | while ( defined( $filename = readdir(DIR) ) ) |
178 | while ( defined( $filename = readdir(DIR) ) ) |
179 | {
|
179 | {
|
180 | %lcheader = (); |
180 | %lcheader = (); |
181 | next unless -f $filename; # only want real files |
181 | next unless -f $filename; # only want real files |
182 | next if ( $filename eq ".overview" ); # real articles only |
182 | next if ( $filename eq ".overview" ); # real articles only |
183 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
183 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
184 | $earliest = ( stat $filename )[9] unless defined($earliest); |
184 | $earliest = ( stat $filename )[9] unless defined($earliest); |
185 | $latest = ( stat $filename )[9] unless defined($latest); |
185 | $latest = ( stat $filename )[9] unless defined($latest); |
186 | &getarticle($filename); # read in the article |
186 | &getarticle($filename); # read in the article |
187 | &getdata; # grab the data from the article |
187 | &getdata; # grab the data from the article |
188 | $totalposts++; # bump count of articles considered |
188 | $totalposts++; # bump count of articles considered |
189 | }
|
189 | }
|
190 | closedir(DIR); # finished with the directory |
190 | closedir(DIR); # finished with the directory |
191 | 191 | ||
192 | # post-processing
|
192 | # post-processing
|
193 | &countagents; # count agents, collapsing versions |
193 | &countagents; # count agents, collapsing versions |
194 | &fixpercent; # check percentages orig/total for posters |
194 | &fixpercent; # check percentages orig/total for posters |
195 | 195 | ||
196 | &writedata; |
196 | &writedata; |
197 | 197 | ||
198 | #################### DISPLAY RESULTS #####################
|
198 | #################### DISPLAY RESULTS #####################
|
199 | print "=" x 76, "\n"; |
199 | print "=" x 76, "\n"; |
200 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
200 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
201 | print "=" x 76, "\n"; |
201 | print "=" x 76, "\n"; |
202 | printf "%s\n", |
202 | printf "%s\n", |
203 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
203 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
204 | print "\n\n"; |
204 | print "\n\n"; |
205 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
205 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
206 | $numdays; |
206 | $numdays; |
207 | printf "Earliest article: %s\n", scalar localtime($earliest); |
207 | printf "Earliest article: %s\n", scalar localtime($earliest); |
208 | printf "Latest article: %s\n", scalar localtime($latest); |
208 | printf "Latest article: %s\n", scalar localtime($latest); |
209 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
209 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
210 | commify($replies); |
210 | commify($replies); |
211 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
211 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
212 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
212 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
213 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
213 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
214 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
214 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
215 | commify( int( $totsize / $totalposts ) ); |
215 | commify( int( $totsize / $totalposts ) ); |
216 | my $count = keys %data; |
216 | my $count = keys %data; |
217 | printf "Total headers: %s KB bodies: %s KB\n", |
217 | printf "Total headers: %s KB bodies: %s KB\n", |
218 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
218 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
219 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
219 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
220 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
220 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
221 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
221 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
222 | commify( int( $totsig / 1024 ) ); |
222 | commify( int( $totsig / 1024 ) ); |
223 | printf "Total number of posters: %s, average %s bytes per poster\n", |
223 | printf "Total number of posters: %s, average %s bytes per poster\n", |
224 | commify($count), commify( int( $totsize / $count ) ); #/ |
224 | commify($count), commify( int( $totsize / $count ) ); #/ |
225 | $count = keys %threads; |
225 | $count = keys %threads; |
226 | printf "Total number of threads: %s, average %s bytes per thread\n", |
226 | printf "Total number of threads: %s, average %s bytes per thread\n", |
227 | commify($count), commify( int( $totsize / $count ) ); #/ |
227 | commify($count), commify( int( $totsize / $count ) ); #/ |
228 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
228 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
229 | print "\n", "=" x 76, "\n"; |
229 | print "\n", "=" x 76, "\n"; |
230 | 230 | ||
231 | ###############################
|
231 | ###############################
|
232 | # show posters by article count Sec 1;
|
232 | # show posters by article count Sec 1;
|
233 | ###############################
|
233 | ###############################
|
234 | unless ( $skipSec{1} ) |
234 | unless ( $skipSec{1} ) |
235 | {
|
235 | {
|
236 | if ( keys %data < $topposters ) |
236 | if ( keys %data < $topposters ) |
237 | {
|
237 | {
|
238 | $count = keys %data; |
238 | $count = keys %data; |
239 | }
|
239 | }
|
240 | else
|
240 | else
|
241 | {
|
241 | {
|
242 | $count = $topposters; |
242 | $count = $topposters; |
243 | }
|
243 | }
|
244 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
244 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
245 | print "=" x 76, "\n"; |
245 | print "=" x 76, "\n"; |
246 | $i = 0; |
246 | $i = 0; |
247 | foreach
|
247 | foreach
|
248 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
248 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
249 | {
|
249 | {
|
250 | my $name = substr( $poster, 0, 65 ); |
250 | my $name = substr( $poster, 0, 65 ); |
251 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
251 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
252 | $data{$poster}{count}; |
252 | $data{$poster}{count}; |
253 | last if ( ++$i == $count ); |
253 | last if ( ++$i == $count ); |
254 | }
|
254 | }
|
255 | print "\n", "=" x 76, "\n"; |
255 | print "\n", "=" x 76, "\n"; |
256 | }
|
256 | }
|
257 | 257 | ||
258 | ################################
|
258 | ################################
|
259 | # show posters by size in Kbytes Sec 2;
|
259 | # show posters by size in Kbytes Sec 2;
|
260 | ################################
|
260 | ################################
|
261 | unless ( $skipSec{2} ) |
261 | unless ( $skipSec{2} ) |
262 | {
|
262 | {
|
263 | if ( keys %data < $topposters ) |
263 | if ( keys %data < $topposters ) |
264 | {
|
264 | {
|
265 | $count = keys %data; |
265 | $count = keys %data; |
266 | }
|
266 | }
|
267 | else
|
267 | else
|
268 | {
|
268 | {
|
269 | $count = $topposters; |
269 | $count = $topposters; |
270 | }
|
270 | }
|
271 | printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
271 | printf "%s\n", ¢red( "Top $count posters by article size in Kbytes", 76 ); |
272 | print "=" x 76, "\n"; |
272 | print "=" x 76, "\n"; |
273 | $i = 0; |
273 | $i = 0; |
274 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
274 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
275 | {
|
275 | {
|
276 | my $name = substr( $poster, 0, 62 ); |
276 | my $name = substr( $poster, 0, 62 ); |
277 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
277 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
278 | $data{$poster}{size} / 1024; #/ |
278 | $data{$poster}{size} / 1024; #/ |
279 | last if ( ++$i == $count ); |
279 | last if ( ++$i == $count ); |
280 | }
|
280 | }
|
281 | print "\n", "=" x 76, "\n"; |
281 | print "\n", "=" x 76, "\n"; |
282 | }
|
282 | }
|
283 | 283 | ||
284 | ####################################
|
284 | ####################################
|
285 | # show top posters for original text
|
285 | # show top posters for original text
|
286 | ####################################
|
286 | ####################################
|
287 | unless ( $skipSec{3} ) |
287 | unless ( $skipSec{3} ) |
288 | {
|
288 | {
|
289 | if ( keys %data < $topposters ) |
289 | if ( keys %data < $topposters ) |
290 | {
|
290 | {
|
291 | $count = keys %data; |
291 | $count = keys %data; |
292 | }
|
292 | }
|
293 | else
|
293 | else
|
294 | {
|
294 | {
|
295 | $count = $topposters; |
295 | $count = $topposters; |
296 | }
|
296 | }
|
297 | printf "%s\n", |
297 | printf "%s\n", |
298 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
298 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
299 | print "=" x 76, "\n"; |
299 | print "=" x 76, "\n"; |
300 | $i = 0; |
300 | $i = 0; |
301 | foreach my $poster ( |
301 | foreach my $poster ( |
302 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
302 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
303 | keys %data |
303 | keys %data |
304 | )
|
304 | )
|
305 | {
|
305 | {
|
306 | next if $data{$poster}{quoted} == 0; |
306 | next if $data{$poster}{quoted} == 0; |
307 | next if $data{$poster}{count} < 5; |
307 | next if $data{$poster}{count} < 5; |
308 | my $name = substr( $poster, 0, 63 ); |
308 | my $name = substr( $poster, 0, 63 ); |
309 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
309 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
310 | $data{$poster}{percent}; |
310 | $data{$poster}{percent}; |
311 | last if ( ++$i == $count ); |
311 | last if ( ++$i == $count ); |
312 | }
|
312 | }
|
313 | print "\n", "=" x 76, "\n"; |
313 | print "\n", "=" x 76, "\n"; |
314 | }
|
314 | }
|
315 | 315 | ||
316 | #######################################
|
316 | #######################################
|
317 | # show bottom posters for original text
|
317 | # show bottom posters for original text
|
318 | #######################################
|
318 | #######################################
|
319 | unless ( $skipSec{4} ) |
319 | unless ( $skipSec{4} ) |
320 | {
|
320 | {
|
321 | if ( keys %data < $topposters ) |
321 | if ( keys %data < $topposters ) |
322 | {
|
322 | {
|
323 | $count = keys %data; |
323 | $count = keys %data; |
324 | }
|
324 | }
|
325 | else
|
325 | else
|
326 | {
|
326 | {
|
327 | $count = $topposters; |
327 | $count = $topposters; |
328 | }
|
328 | }
|
329 | printf "%s\n", |
329 | printf "%s\n", |
330 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
330 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
331 | print "=" x 76, "\n"; |
331 | print "=" x 76, "\n"; |
332 | $i = 0; |
332 | $i = 0; |
333 | foreach my $poster ( |
333 | foreach my $poster ( |
334 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
334 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
335 | keys %data |
335 | keys %data |
336 | )
|
336 | )
|
337 | {
|
337 | {
|
338 | next if $data{$poster}{quoted} == 0; |
338 | next if $data{$poster}{quoted} == 0; |
339 | next if $data{$poster}{count} < 5; |
339 | next if $data{$poster}{count} < 5; |
340 | my $name = substr( $poster, 0, 63 ); |
340 | my $name = substr( $poster, 0, 63 ); |
341 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
341 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
342 | $data{$poster}{percent}; |
342 | $data{$poster}{percent}; |
343 | last if ( ++$i == $count ); |
343 | last if ( ++$i == $count ); |
344 | }
|
344 | }
|
345 | print "\n", "=" x 76, "\n"; |
345 | print "\n", "=" x 76, "\n"; |
346 | }
|
346 | }
|
347 | 347 | ||
348 | ####################################
|
348 | ####################################
|
349 | # show threads by number of articles
|
349 | # show threads by number of articles
|
350 | ####################################
|
350 | ####################################
|
351 | unless ( $skipSec{5} ) |
351 | unless ( $skipSec{5} ) |
352 | {
|
352 | {
|
353 | if ( keys %threads < $topthreads ) |
353 | if ( keys %threads < $topthreads ) |
354 | {
|
354 | {
|
355 | $count = keys %threads; |
355 | $count = keys %threads; |
356 | }
|
356 | }
|
357 | else
|
357 | else
|
358 | {
|
358 | {
|
359 | $count = $topthreads; |
359 | $count = $topthreads; |
360 | }
|
360 | }
|
361 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
361 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
362 | print "=" x 76, "\n"; |
362 | print "=" x 76, "\n"; |
363 | $i = 0; |
363 | $i = 0; |
364 | foreach my $thread ( |
364 | foreach my $thread ( |
365 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
365 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
366 | keys %threads |
366 | keys %threads |
367 | )
|
367 | )
|
368 | {
|
368 | {
|
369 | my $name = substr( $thread, 0, 65 ); |
369 | my $name = substr( $thread, 0, 65 ); |
370 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
370 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
371 | $threads{$thread}{count}; |
371 | $threads{$thread}{count}; |
372 | last if ( ++$i == $count ); |
372 | last if ( ++$i == $count ); |
373 | }
|
373 | }
|
374 | print "\n", "=" x 76, "\n"; |
374 | print "\n", "=" x 76, "\n"; |
375 | }
|
375 | }
|
376 | ################################
|
376 | ################################
|
377 | # show threads by size in Kbytes
|
377 | # show threads by size in Kbytes
|
378 | ################################
|
378 | ################################
|
379 | unless ( $skipSec{6} ) |
379 | unless ( $skipSec{6} ) |
380 | {
|
380 | {
|
381 | if ( keys %threads < $topthreads ) |
381 | if ( keys %threads < $topthreads ) |
382 | {
|
382 | {
|
383 | $count = keys %threads; |
383 | $count = keys %threads; |
384 | }
|
384 | }
|
385 | else
|
385 | else
|
386 | {
|
386 | {
|
387 | $count = $topthreads; |
387 | $count = $topthreads; |
388 | }
|
388 | }
|
389 | printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
389 | printf "%s\n", ¢red( "Top $count threads by size in KB", 76 ); |
390 | print "=" x 76, "\n"; |
390 | print "=" x 76, "\n"; |
391 | $i = 0; |
391 | $i = 0; |
392 | foreach my $thread ( |
392 | foreach my $thread ( |
393 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
393 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
394 | keys %threads |
394 | keys %threads |
395 | )
|
395 | )
|
396 | {
|
396 | {
|
397 | my $name = substr( $thread, 0, 65 ); |
397 | my $name = substr( $thread, 0, 65 ); |
398 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
398 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
399 | $threads{$thread}{size} / 1024; #/ |
399 | $threads{$thread}{size} / 1024; #/ |
400 | last if ( ++$i == $count ); |
400 | last if ( ++$i == $count ); |
401 | }
|
401 | }
|
402 | print "\n", "=" x 76, "\n"; |
402 | print "\n", "=" x 76, "\n"; |
403 | }
|
403 | }
|
404 | 404 | ||
405 | #################################
|
405 | #################################
|
406 | # show top 10 cross-posted groups
|
406 | # show top 10 cross-posted groups
|
407 | #################################
|
407 | #################################
|
408 | unless ( $skipSec{7} ) |
408 | unless ( $skipSec{7} ) |
409 | {
|
409 | {
|
410 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
410 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
411 | if ( keys %crossposts < $topcrossposts ) |
411 | if ( keys %crossposts < $topcrossposts ) |
412 | {
|
412 | {
|
413 | $count = keys %crossposts; |
413 | $count = keys %crossposts; |
414 | }
|
414 | }
|
415 | else
|
415 | else
|
416 | {
|
416 | {
|
417 | $count = $topcrossposts; |
417 | $count = $topcrossposts; |
418 | }
|
418 | }
|
419 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
419 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
420 | print "=" x 76, "\n"; |
420 | print "=" x 76, "\n"; |
421 | $i = 0; |
421 | $i = 0; |
422 | foreach
|
422 | foreach
|
423 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
423 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
424 | {
|
424 | {
|
425 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
425 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
426 | $crossposts{$name}; |
426 | $crossposts{$name}; |
427 | last if ( ++$i == $count ); |
427 | last if ( ++$i == $count ); |
428 | }
|
428 | }
|
429 | print "\n", "=" x 76, "\n"; |
429 | print "\n", "=" x 76, "\n"; |
430 | }
|
430 | }
|
431 | #######################
|
431 | #######################
|
432 | #show agents and counts
|
432 | #show agents and counts
|
433 | #######################
|
433 | #######################
|
434 | unless ( $skipSec{8} ) |
434 | unless ( $skipSec{8} ) |
435 | {
|
435 | {
|
436 | if ( keys %agents < $topagents ) |
436 | if ( keys %agents < $topagents ) |
437 | {
|
437 | {
|
438 | $count = keys %agents; |
438 | $count = keys %agents; |
439 | }
|
439 | }
|
440 | else
|
440 | else
|
441 | {
|
441 | {
|
442 | $count = $topagents; |
442 | $count = $topagents; |
443 | }
|
443 | }
|
444 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
444 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
445 | print "=" x 76, "\n"; |
445 | print "=" x 76, "\n"; |
446 | $i = 0; |
446 | $i = 0; |
447 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
447 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
448 | {
|
448 | {
|
449 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
449 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
450 | $agents{$agent}; |
450 | $agents{$agent}; |
451 | last if ( ++$i == $count ); |
451 | last if ( ++$i == $count ); |
452 | }
|
452 | }
|
453 | print "\n", "=" x 76, "\n"; |
453 | print "\n", "=" x 76, "\n"; |
454 | }
|
454 | }
|
455 | 455 | ||
456 | #######################
|
456 | #######################
|
457 | #show distinct agents
|
457 | #show distinct agents
|
458 | #######################
|
458 | #######################
|
459 | unless ( $skipSec{9} ) |
459 | unless ( $skipSec{9} ) |
460 | {
|
460 | {
|
461 | if ( keys %distinct_agent < $topagents ) |
461 | if ( keys %distinct_agent < $topagents ) |
462 | {
|
462 | {
|
463 | $count = keys %distinct_agent; |
463 | $count = keys %distinct_agent; |
464 | }
|
464 | }
|
465 | else
|
465 | else
|
466 | {
|
466 | {
|
467 | $count = $topagents; |
467 | $count = $topagents; |
468 | }
|
468 | }
|
469 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
469 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
470 | print "=" x 76, "\n"; |
470 | print "=" x 76, "\n"; |
471 | $i = 0; |
471 | $i = 0; |
472 | foreach my $agent ( |
472 | foreach my $agent ( |
473 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
473 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
474 | keys %distinct_agent |
474 | keys %distinct_agent |
475 | )
|
475 | )
|
476 | {
|
476 | {
|
477 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
477 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
478 | $distinct_agent{$agent}, |
478 | $distinct_agent{$agent}, |
479 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
479 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
480 | last if ( ++$i == $count ); |
480 | last if ( ++$i == $count ); |
481 | }
|
481 | }
|
482 | print "\n", "=" x 76, "\n"; |
482 | print "\n", "=" x 76, "\n"; |
483 | }
|
483 | }
|
484 | 484 | ||
485 | ##########################
|
485 | ##########################
|
486 | #show timezones and counts
|
486 | #show timezones and counts
|
487 | ##########################
|
487 | ##########################
|
488 | unless ( $skipSec{10} ) |
488 | unless ( $skipSec{10} ) |
489 | {
|
489 | {
|
490 | if ( keys %tz < $toptz ) |
490 | if ( keys %tz < $toptz ) |
491 | {
|
491 | {
|
492 | $count = keys %tz; |
492 | $count = keys %tz; |
493 | }
|
493 | }
|
494 | else
|
494 | else
|
495 | {
|
495 | {
|
496 | $count = $toptz; |
496 | $count = $toptz; |
497 | }
|
497 | }
|
498 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
498 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
499 | print "=" x 76, "\n"; |
499 | print "=" x 76, "\n"; |
500 | $i = 0; |
500 | $i = 0; |
501 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
501 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
502 | {
|
502 | {
|
503 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
503 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
504 | last if ( ++$i == $count ); |
504 | last if ( ++$i == $count ); |
505 | }
|
505 | }
|
506 | print "\n", "=" x 76, "\n"; |
506 | print "\n", "=" x 76, "\n"; |
507 | }
|
507 | }
|
508 | 508 | ||
509 | ################################ SUBROUTINES ################################
|
509 | ################################ SUBROUTINES ################################
|
510 | 510 | ||
511 | #######################################
|
511 | #######################################
|
512 | # get current article's header and body
|
512 | # get current article's header and body
|
513 | #######################################
|
513 | #######################################
|
514 | sub getarticle
|
514 | sub getarticle
|
515 | {
|
515 | {
|
516 | %headers = (); # dump old headers |
516 | %headers = (); # dump old headers |
517 | my $filename = shift; # get the name of the file |
517 | my $filename = shift; # get the name of the file |
518 | 518 | ||
519 | # get stats about the file itself
|
519 | # get stats about the file itself
|
520 | $filesize = -s $filename; # get total size of file |
520 | $filesize = -s $filename; # get total size of file |
521 | $totsize += $filesize; # bump total sizes of all files |
521 | $totsize += $filesize; # bump total sizes of all files |
522 | 522 | ||
523 | my $mtime = ( stat $filename )[9]; |
523 | my $mtime = ( stat $filename )[9]; |
524 | if ( $mtime < $earliest ) |
524 | if ( $mtime < $earliest ) |
525 | {
|
525 | {
|
526 | $earliest = $mtime; |
526 | $earliest = $mtime; |
527 | }
|
527 | }
|
528 | elsif ( $mtime > $latest ) |
528 | elsif ( $mtime > $latest ) |
529 | {
|
529 | {
|
530 | $latest = $mtime; |
530 | $latest = $mtime; |
531 | }
|
531 | }
|
532 | 532 | ||
533 | # now read the file
|
533 | # now read the file
|
534 | open( my $FILE, $filename ) or die "Can't open $filename: $!\n"; |
534 | open( my $FILE, $filename ) or die "Can't open $filename: $!\n"; |
535 | while (<$FILE>) |
535 | while (<$FILE>) |
536 | {
|
536 | {
|
537 | $totheader += length($_); # bump total header size |
537 | $totheader += length($_); # bump total header size |
538 | last if (/^\s*$/); # end of header? |
538 | last if (/^\s*$/); # end of header? |
539 | if (/^([^:\s]*):\s+(.*)/) |
539 | if (/^([^:\s]*):\s+(.*)/) |
540 | {
|
540 | {
|
541 | my ( $key, $val ) = ( $1, $2 ); |
541 | my ( $key, $val ) = ( $1, $2 ); |
542 | $headers{$key} = decode( 'MIME-Header', $val ); |
542 | $headers{$key} = decode( 'MIME-Header', $val ); |
543 | $lcheader{ clean( lc($key) ) } = clean($val); |
543 | $lcheader{ clean( lc($key) ) } = clean($val); |
544 | }
|
544 | }
|
545 | }
|
545 | }
|
546 | @body = <$FILE>; # slurp up body |
546 | @body = <$FILE>; # slurp up body |
547 | close($FILE); |
547 | close($FILE); |
548 | } # getarticle |
548 | } # getarticle |
549 | 549 | ||
550 | ###################################
|
550 | ###################################
|
551 | # get data from the current article
|
551 | # get data from the current article
|
552 | ###################################
|
552 | ###################################
|
553 | sub getdata
|
553 | sub getdata
|
554 | {
|
554 | {
|
555 | #### First, analyse header fields ####
|
555 | #### First, analyse header fields ####
|
556 | 556 | ||
557 | # Set up this poster if not defined, get counts, sizes
|
557 | # Set up this poster if not defined, get counts, sizes
|
558 | my $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name |
558 | my $poster = encode( 'UTF-8', $headers{From} ); # get the poster's name |
559 | if ( !defined( $data{$poster} ) ) |
559 | if ( !defined( $data{$poster} ) ) |
560 | { # seen this one before? |
560 | { # seen this one before? |
561 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
561 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
562 | $data{$poster}{orig} = 0; |
562 | $data{$poster}{orig} = 0; |
563 | $data{$poster}{quoted} = 0; |
563 | $data{$poster}{quoted} = 0; |
564 | }
|
564 | }
|
565 | $data{$poster}{count}++; # bump count for this poster |
565 | $data{$poster}{count}++; # bump count for this poster |
566 | $data{$poster}{size} += $filesize; # total size of file |
566 | $data{$poster}{size} += $filesize; # total size of file |
567 | 567 | ||
568 | # The User-Agent and/or X-Newsreader fields
|
568 | # The User-Agent and/or X-Newsreader fields
|
569 | # for User-Agent by poster
|
569 | # for User-Agent by poster
|
570 | if ( defined $lcheader{"user-agent"} ) |
570 | if ( defined $lcheader{"user-agent"} ) |
571 | {
|
571 | {
|
572 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
572 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
573 | }
|
573 | }
|
574 | if ( defined $lcheader{"x-newsreader"} ) |
574 | if ( defined $lcheader{"x-newsreader"} ) |
575 | {
|
575 | {
|
576 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
576 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
577 | }
|
577 | }
|
578 | 578 | ||
579 | # The User Agent for User-Agent by number of posts
|
579 | # The User Agent for User-Agent by number of posts
|
580 | my $UA = "unknown"; |
580 | my $UA = "unknown"; |
581 | foreach my $keys ( keys %lcheader ) |
581 | foreach my $keys ( keys %lcheader ) |
582 | {
|
582 | {
|
583 | if ( defined $lcheader{'user-agent'} ) |
583 | if ( defined $lcheader{'user-agent'} ) |
584 | {
|
584 | {
|
585 | $UA = $lcheader{'user-agent'}; |
585 | $UA = $lcheader{'user-agent'}; |
586 | }
|
586 | }
|
587 | elsif ( defined $lcheader{"x-newsreader"} ) |
587 | elsif ( defined $lcheader{"x-newsreader"} ) |
588 | {
|
588 | {
|
589 | $UA = $lcheader{"x-newsreader"}; |
589 | $UA = $lcheader{"x-newsreader"}; |
590 | }
|
590 | }
|
591 | elsif ( defined $lcheader{'x-mailer'} ) |
591 | elsif ( defined $lcheader{'x-mailer'} ) |
592 | {
|
592 | {
|
593 | $UA = $lcheader{'x-mailer'}; |
593 | $UA = $lcheader{'x-mailer'}; |
594 | }
|
594 | }
|
595 | elsif ( |
595 | elsif ( |
596 | ( defined $lcheader{'organization'} ) |
596 | ( defined $lcheader{'organization'} ) |
597 | && ( $lcheader{'organization'} =~ |
597 | && ( $lcheader{'organization'} =~ |
598 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
598 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
599 | )
|
599 | )
|
600 | {
|
600 | {
|
601 | $UA = $lcheader{'organization'}; |
601 | $UA = $lcheader{'organization'}; |
602 | }
|
602 | }
|
603 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
603 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
604 | {
|
604 | {
|
605 | $UA = "Pine"; |
605 | $UA = "Pine"; |
606 | } ## Hopefully found UA, else set to unknown |
606 | } ## Hopefully found UA, else set to unknown |
607 | }
|
607 | }
|
608 | 608 | ||
609 | $UA = clean($UA); |
609 | $UA = clean($UA); |
610 | $UA = get_agent($UA); |
610 | $UA = get_agent($UA); |
611 | 611 | ||
612 | sub get_agent
|
612 | sub get_agent
|
613 | {
|
613 | {
|
614 | my $raw = shift; |
614 | my $raw = shift; |
615 | my $agent = $raw; |
615 | my $agent = $raw; |
616 | 616 | ||
617 | ## strip http
|
617 | ## strip http
|
618 | if ( $raw =~ /.*http.*/ ) |
618 | if ( $raw =~ /.*http.*/ ) |
619 | {
|
619 | {
|
620 | $raw =~ s!posted via!!i; |
620 | $raw =~ s!posted via!!i; |
621 | $raw =~ s!http://!!g; |
621 | $raw =~ s!http://!!g; |
622 | $raw =~ s!/!!g; |
622 | $raw =~ s!/!!g; |
623 | $raw =~ s! !!g; |
623 | $raw =~ s! !!g; |
624 | }
|
624 | }
|
625 | 625 | ||
626 | ## Fix Outlook from Mac
|
626 | ## Fix Outlook from Mac
|
627 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
627 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
628 | 628 | ||
629 | ## Pick out the popular agents
|
629 | ## Pick out the popular agents
|
630 | if ( $raw =~ /(outlook express)/i |
630 | if ( $raw =~ /(outlook express)/i |
631 | || $raw =~ /(microplanet gravity)/i |
631 | || $raw =~ /(microplanet gravity)/i |
632 | || $raw =~ /(news rover)/i |
632 | || $raw =~ /(news rover)/i |
633 | || $raw =~ /(forte agent)/i |
633 | || $raw =~ /(forte agent)/i |
634 | || $raw =~ /(forte free agent)/i ) |
634 | || $raw =~ /(forte free agent)/i ) |
635 | {
|
635 | {
|
636 | $agent = $1; |
636 | $agent = $1; |
637 | }
|
637 | }
|
638 | elsif ( |
638 | elsif ( |
639 | $raw =~ /^( |
639 | $raw =~ /^( |
640 | pan |
640 | pan |
641 | |sylpheed
|
641 | |sylpheed
|
642 | |slrn
|
642 | |slrn
|
643 | |mozilla
|
643 | |mozilla
|
644 | |knode
|
644 | |knode
|
645 | |tin
|
645 | |tin
|
646 | |hamster
|
646 | |hamster
|
647 | |xrn
|
647 | |xrn
|
648 | |xnews
|
648 | |xnews
|
649 | |aol
|
649 | |aol
|
650 | |gnus
|
650 | |gnus
|
651 | |krn
|
651 | |krn
|
652 | |macsoup
|
652 | |macsoup
|
653 | |messenger
|
653 | |messenger
|
654 | |openxp
|
654 | |openxp
|
655 | |pine
|
655 | |pine
|
656 | |thoth
|
656 | |thoth
|
657 | |turnpike
|
657 | |turnpike
|
658 | |winvn
|
658 | |winvn
|
659 | |vsoup
|
659 | |vsoup
|
660 | |google
|
660 | |google
|
661 | |supernews
|
661 | |supernews
|
662 | |nn
|
662 | |nn
|
663 | |rn
|
663 | |rn
|
664 | |007
|
664 | |007
|
665 | |webtv
|
665 | |webtv
|
666 | |compuserve
|
666 | |compuserve
|
667 | )/ix |
667 | )/ix |
668 | )
|
668 | )
|
669 | {
|
669 | {
|
670 | $agent = $1; |
670 | $agent = $1; |
671 | }
|
671 | }
|
672 | else
|
672 | else
|
673 | {
|
673 | {
|
674 | ## Clean up unknown agents
|
674 | ## Clean up unknown agents
|
675 | if ( $raw =~ m!^(.*?)/! ) |
675 | if ( $raw =~ m!^(.*?)/! ) |
676 | {
|
676 | {
|
677 | $agent = $1; |
677 | $agent = $1; |
678 | }
|
678 | }
|
679 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
679 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
680 | {
|
680 | {
|
681 | $agent = $1; |
681 | $agent = $1; |
682 | }
|
682 | }
|
683 | }
|
683 | }
|
684 | 684 | ||
685 | $distinct_agent{$agent}++; |
685 | $distinct_agent{$agent}++; |
686 | return $agent; |
686 | return $agent; |
687 | }
|
687 | }
|
688 | 688 | ||
689 | # Get all cross-posted newsgroups
|
689 | # Get all cross-posted newsgroups
|
690 | for ( split /,/, $headers{"Newsgroups"} ) |
690 | for ( split /,/, $headers{"Newsgroups"} ) |
691 | {
|
691 | {
|
692 | $crossposts{$_}++; # bump count for each |
692 | $crossposts{$_}++; # bump count for each |
693 | }
|
693 | }
|
694 | 694 | ||
695 | # Get threads
|
695 | # Get threads
|
696 | my $thread = encode( 'UTF-8', $headers{"Subject"} ); |
696 | my $thread = encode( 'UTF-8', $headers{"Subject"} ); |
697 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
697 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
698 | $thread =~ s/\s+/ /g; # collapse whitespace |
698 | $thread =~ s/\s+/ /g; # collapse whitespace |
699 | $threads{$thread}{count} += 1; # bump count of this subject |
699 | $threads{$thread}{count} += 1; # bump count of this subject |
700 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
700 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
701 | 701 | ||
702 | # Is this an original post or a reply?
|
702 | # Is this an original post or a reply?
|
703 | if ( defined $headers{"References"} ) |
703 | if ( defined $headers{"References"} ) |
704 | {
|
704 | {
|
705 | $replies++; |
705 | $replies++; |
706 | }
|
706 | }
|
707 | else
|
707 | else
|
708 | {
|
708 | {
|
709 | $origposts++; |
709 | $origposts++; |
710 | }
|
710 | }
|
711 | 711 | ||
712 | # Get the time zone
|
712 | # Get the time zone
|
713 | $_ = $headers{"Date"}; |
713 | $_ = $headers{"Date"}; |
714 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
714 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
715 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
715 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
716 | {
|
716 | {
|
717 | $tz = "UTC"; |
717 | $tz = "UTC"; |
718 | }
|
718 | }
|
719 | $tz{$tz}++; |
719 | $tz{$tz}++; |
720 | 720 | ||
721 | #### Now analyse the body text ####
|
721 | #### Now analyse the body text ####
|
722 | my $insig = 0; |
722 | my $insig = 0; |
723 | for (@body) |
723 | for (@body) |
724 | {
|
724 | {
|
725 | $totbody += length($_); # bump total body size |
725 | $totbody += length($_); # bump total body size |
726 | next if (/^$>/); # don't count blank lines in body |
726 | next if (/^$>/); # don't count blank lines in body |
727 | if ( $insig == 1 ) |
727 | if ( $insig == 1 ) |
728 | {
|
728 | {
|
729 | $totsig += length($_); # bump total sig size |
729 | $totsig += length($_); # bump total sig size |
730 | 730 | ||
731 | # Bill Unruh uses ] quotes, and another poster uses ::
|
731 | # Bill Unruh uses ] quotes, and another poster uses ::
|
732 | }
|
732 | }
|
733 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
733 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
734 | { # are we in a quote line? |
734 | { # are we in a quote line? |
735 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
735 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
736 | $totquoted += length($_); |
736 | $totquoted += length($_); |
737 | }
|
737 | }
|
738 | elsif (/-- /) |
738 | elsif (/-- /) |
739 | {
|
739 | {
|
740 | $insig = 1; |
740 | $insig = 1; |
741 | }
|
741 | }
|
742 | else
|
742 | else
|
743 | {
|
743 | {
|
744 | 744 | ||
745 | # we must be processing an original line
|
745 | # we must be processing an original line
|
746 | $data{$poster}{orig} += length($_); # bump count of original chrs |
746 | $data{$poster}{orig} += length($_); # bump count of original chrs |
747 | $totorig += length($_); |
747 | $totorig += length($_); |
748 | }
|
748 | }
|
749 | } # end for (@body) |
749 | } # end for (@body) |
750 | 750 | ||
751 | } # getdata |
751 | } # getdata |
752 | 752 | ||
753 | ########################################
|
753 | ########################################
|
754 | # Count the User-Agents used, collapsing
|
754 | # Count the User-Agents used, collapsing
|
755 | # different versions into one per agent.
|
755 | # different versions into one per agent.
|
756 | ########################################
|
756 | ########################################
|
757 | sub countagents
|
757 | sub countagents
|
758 | {
|
758 | {
|
759 | POSTER:
|
759 | POSTER:
|
760 | foreach my $poster ( keys %data ) |
760 | foreach my $poster ( keys %data ) |
761 | {
|
761 | {
|
762 | foreach my $agent_name ( keys %distinct_agent ) |
762 | foreach my $agent_name ( keys %distinct_agent ) |
763 | { # check against known ones |
763 | { # check against known ones |
764 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
764 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
765 | {
|
765 | {
|
766 | $agents{$agent_name}++; |
766 | $agents{$agent_name}++; |
767 | next POSTER; |
767 | next POSTER; |
768 | }
|
768 | }
|
769 | }
|
769 | }
|
770 | $agents{ $data{$poster}{agent} }++; |
770 | $agents{ $data{$poster}{agent} }++; |
771 | }
|
771 | }
|
772 | } # countagents |
772 | } # countagents |
773 | 773 | ||
774 | ############################################
|
774 | ############################################
|
775 | # set orig/total percentages for all posters
|
775 | # set orig/total percentages for all posters
|
776 | ############################################
|
776 | ############################################
|
777 | sub fixpercent
|
777 | sub fixpercent
|
778 | {
|
778 | {
|
779 | foreach my $poster ( keys %data ) |
779 | foreach my $poster ( keys %data ) |
780 | {
|
780 | {
|
781 | my $percent = 100; |
781 | my $percent = 100; |
782 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
782 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
783 | {
|
783 | {
|
784 | $percent = |
784 | $percent = |
785 | $data{$poster}{orig} * 100 / |
785 | $data{$poster}{orig} * 100 / |
786 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
786 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
787 | }
|
787 | }
|
788 | elsif ( $data{$poster}{orig} == 0 ) |
788 | elsif ( $data{$poster}{orig} == 0 ) |
789 | {
|
789 | {
|
790 | $percent = 0; |
790 | $percent = 0; |
791 | }
|
791 | }
|
792 | $data{$poster}{percent} = $percent; |
792 | $data{$poster}{percent} = $percent; |
793 | }
|
793 | }
|
794 | }
|
794 | }
|
795 | 795 | ||
796 | ##############################
|
796 | ##############################
|
797 | # right pad a string with '.'s
|
797 | # right pad a string with '.'s
|
798 | ##############################
|
798 | ##############################
|
799 | sub rpad
|
799 | sub rpad
|
800 | {
|
800 | {
|
801 | 801 | ||
802 | # get text to pad, length to pad, pad chr
|
802 | # get text to pad, length to pad, pad chr
|
803 | my ( $text, $pad_len, $pad_chr ) = @_; |
803 | my ( $text, $pad_len, $pad_chr ) = @_; |
804 | if ( length($text) > $pad_len ) |
804 | if ( length($text) > $pad_len ) |
805 | {
|
805 | {
|
806 | $text = substr( $text, 0, $pad_len ); |
806 | $text = substr( $text, 0, $pad_len ); |
807 | }
|
807 | }
|
808 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
808 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
809 | return $padded; |
809 | return $padded; |
810 | }
|
810 | }
|
811 | 811 | ||
812 | #################
|
812 | #################
|
813 | # centre a string
|
813 | # centre a string
|
814 | #################
|
814 | #################
|
815 | sub centred
|
815 | sub centred
|
816 | {
|
816 | {
|
817 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
817 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
818 | my $pad_len = ( $width - length($text) ) / 2; #/ |
818 | my $pad_len = ( $width - length($text) ) / 2; #/ |
819 | my $centred = " " x $pad_len . $text; |
819 | my $centred = " " x $pad_len . $text; |
820 | return $centred; |
820 | return $centred; |
821 | }
|
821 | }
|
822 | 822 | ||
823 | ##########################
|
823 | ##########################
|
824 | # put commas into a number
|
824 | # put commas into a number
|
825 | ##########################
|
825 | ##########################
|
826 | sub commify
|
826 | sub commify
|
827 | {
|
827 | {
|
828 | $_ = shift; |
828 | $_ = shift; |
829 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
829 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
830 | return $_; |
830 | return $_; |
831 | }
|
831 | }
|
832 | 832 | ||
833 | #########################
|
833 | #########################
|
834 | # clean
|
834 | # clean
|
835 | #########################
|
835 | #########################
|
836 | sub clean
|
836 | sub clean
|
837 | {
|
837 | {
|
838 | my $dirty = shift; |
838 | my $dirty = shift; |
839 | my $clean = $dirty; |
839 | my $clean = $dirty; |
840 | $clean =~ s/^\s*//; |
840 | $clean =~ s/^\s*//; |
841 | $clean =~ s/\s*$//; |
841 | $clean =~ s/\s*$//; |
842 | 842 | ||
843 | return $clean; |
843 | return $clean; |
844 | }
|
844 | }
|
845 | 845 | ||
846 | sub usage
|
846 | sub usage
|
847 | {
|
847 | {
|
848 | 848 | ||
849 | print "usage: newstat.pl newsgroupname\n"; |
849 | print "usage: newstat.pl newsgroupname\n"; |
850 | exit 1; |
850 | exit 1; |
851 | }
|
851 | }
|
852 | 852 | ||
853 | ###################################
|
853 | ###################################
|
854 | # Write data structures to a file #
|
854 | # Write data structures to a file #
|
855 | ###################################
|
855 | ###################################
|
856 | sub writedata
|
856 | sub writedata
|
857 | {
|
857 | {
|
858 | open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
858 | open my $OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
859 | print $OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
859 | print $OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
860 | print $OUTF |
860 | print $OUTF |
861 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
861 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
862 | foreach my $name ( keys %data ) |
862 | foreach my $name ( keys %data ) |
863 | {
|
863 | {
|
864 | print $OUTF |
864 | print $OUTF |
865 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
865 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
866 | }
|
866 | }
|
867 | print $OUTF |
867 | print $OUTF |
868 | "============================================================================\n"; |
868 | "============================================================================\n"; |
869 | print $OUTF "Thread subjects\n"; |
869 | print $OUTF "Thread subjects\n"; |
870 | print $OUTF |
870 | print $OUTF |
871 | "----------------------------------------------------------------------------\n"; |
871 | "----------------------------------------------------------------------------\n"; |
872 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
872 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
873 | {
|
873 | {
|
874 | print $OUTF |
874 | print $OUTF |
875 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
875 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
876 | }
|
876 | }
|
877 | print $OUTF |
877 | print $OUTF |
878 | "============================================================================\n"; |
878 | "============================================================================\n"; |
879 | print $OUTF "Cross-posts\n"; |
879 | print $OUTF "Cross-posts\n"; |
880 | print $OUTF |
880 | print $OUTF |
881 | "----------------------------------------------------------------------------\n"; |
881 | "----------------------------------------------------------------------------\n"; |
882 | foreach my $name ( sort keys %crossposts ) |
882 | foreach my $name ( sort keys %crossposts ) |
883 | {
|
883 | {
|
884 | print $OUTF "$name : $crossposts{$name}\n"; |
884 | print $OUTF "$name : $crossposts{$name}\n"; |
885 | }
|
885 | }
|
886 | print $OUTF print $OUTF |
886 | print $OUTF print $OUTF |
887 | "============================================================================\n"; |
887 | "============================================================================\n"; |
888 | print $OUTF "User agents\n"; |
888 | print $OUTF "User agents\n"; |
889 | print $OUTF |
889 | print $OUTF |
890 | "----------------------------------------------------------------------------\n"; |
890 | "----------------------------------------------------------------------------\n"; |
891 | foreach my $name ( sort keys %agents ) |
891 | foreach my $name ( sort keys %agents ) |
892 | {
|
892 | {
|
893 | print $OUTF "$name : $agents{$name}\n"; |
893 | print $OUTF "$name : $agents{$name}\n"; |
894 | }
|
894 | }
|
895 | print $OUTF |
895 | print $OUTF |
896 | "============================================================================\n"; |
896 | "============================================================================\n"; |
897 | print $OUTF "Time zones\n"; |
897 | print $OUTF "Time zones\n"; |
898 | print $OUTF |
898 | print $OUTF |
899 | "----------------------------------------------------------------------------\n"; |
899 | "----------------------------------------------------------------------------\n"; |
900 | foreach my $name ( sort keys %tz ) |
900 | foreach my $name ( sort keys %tz ) |
901 | {
|
901 | {
|
902 | print $OUTF "$name : $tz{$name}\n"; |
902 | print $OUTF "$name : $tz{$name}\n"; |
903 | }
|
903 | }
|
904 | close $OUTF; |
904 | close $OUTF; |
905 | } # writedata |
905 | } # writedata |
906 | 906 |