Rev 13 | Rev 22 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 13 | Rev 14 | ||
---|---|---|---|
1 | #!/usr/bin/env perl
|
1 | #!/usr/bin/env perl
|
2 | use strict; |
2 | use strict; |
3 | use warnings; |
3 | use warnings; |
4 | use diagnostics; |
4 | use diagnostics; |
5 | use utf8; |
5 | use utf8; |
6 | use Encode; |
6 | use Encode; |
7 | 7 | ||
8 | ## Print out all text to STDOUT UTF-8 encoded
|
8 | ## Print out all text to STDOUT UTF-8 encoded
|
9 | binmode STDOUT, ':encoding(UTF-8)'; |
9 | binmode STDOUT, ':encoding(UTF-8)'; |
10 | 10 | ||
11 | ############################
|
11 | ##############################
|
12 | ## newsstat.pl version 0.4.3
|
12 | ## newsstat.pl version 0.4.3.1
|
13 | 13 | ||
14 | ###########################################################################
|
14 | ###########################################################################
|
15 | ## Collect statistics about a newsgroup (specified by first argument)
|
15 | ## Collect statistics about a newsgroup (specified by first argument)
|
16 | ## in the local news spool. Check all articles in the last 30-day period.
|
16 | ## in the local news spool. Check all articles in the last 30-day period.
|
17 | ## Rank posters by number of posts and by volume of posts, report on top
|
17 | ## Rank posters by number of posts and by volume of posts, report on top
|
18 | ## and bottom 20 posters. Show their name, number of posts, size of posts,
|
18 | ## and bottom 20 posters. Show their name, number of posts, size of posts,
|
19 | ## percentage of quoted lines. Rank user-agents used, by poster rather
|
19 | ## percentage of quoted lines. Rank user-agents used, by poster rather
|
20 | ## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
20 | ## than by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
21 | ##
|
21 | ##
|
22 | ## Numbers and paths can be configured below. See ChangeLog and TODO
|
22 | ## Numbers and paths can be configured below. See ChangeLog and TODO
|
23 | ## for more. -- PE
|
23 | ## for more. -- PE
|
24 | ###########################################################################
|
24 | ###########################################################################
|
25 | 25 | ||
26 | ###################### USER CONFIGURATIONS ############################
|
26 | ###################### USER CONFIGURATIONS ############################
|
27 | 27 | ||
28 | ## The name of the group to do stats for
|
28 | ## The name of the group to do stats for
|
29 | my $newsgroup_name = $ARGV[0]; |
29 | my $newsgroup_name = $ARGV[0]; |
30 | $newsgroup_name or &usage; |
30 | $newsgroup_name or &usage; |
31 | 31 | ||
32 | ## Check for removal flags
|
32 | ## Check for removal flags
|
33 | my $ix; |
33 | my $ix; |
34 | my $j; |
34 | my $j; |
35 | my %skipSec; |
35 | my %skipSec; |
36 | my @skiplist; |
36 | my @skiplist; |
37 | my $args = @ARGV; |
37 | my $args = @ARGV; |
38 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
38 | for ( $ix = 1 ; $ix < $args ; $ix++ ) |
39 | {
|
39 | {
|
40 | $j = $ix + 1; |
40 | $j = $ix + 1; |
41 | if ( $ARGV[$ix] eq "-x" ) |
41 | if ( $ARGV[$ix] eq "-x" ) |
42 | {
|
42 | {
|
43 | @skiplist = split( ",", $ARGV[$j] ); |
43 | @skiplist = split( ",", $ARGV[$j] ); |
44 | }
|
44 | }
|
45 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
45 | elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) |
46 | {
|
46 | {
|
47 | @skiplist = split( ",", $1 ); |
47 | @skiplist = split( ",", $1 ); |
48 | }
|
48 | }
|
49 | }
|
49 | }
|
50 | foreach (@skiplist) |
50 | foreach (@skiplist) |
51 | {
|
51 | {
|
52 | $skipSec{$_} = 1; |
52 | $skipSec{$_} = 1; |
53 | }
|
53 | }
|
54 | 54 | ||
55 | ## Leafnode users will want /var/spool/news for this variable.
|
55 | ## Leafnode users will want /var/spool/news for this variable.
|
56 | my $news = "/var/spool/news/"; |
56 | my $news = "/var/spool/news/"; |
57 | 57 | ||
58 | ## How many days are we doing statistics for?
|
58 | ## How many days are we doing statistics for?
|
59 | my $numdays = 30; |
59 | my $numdays = 30; |
60 | 60 | ||
61 | ## Number of agents we list
|
61 | ## Number of agents we list
|
62 | my $topagents = 10; |
62 | my $topagents = 10; |
63 | 63 | ||
64 | ## Number of threads we want to know about
|
64 | ## Number of threads we want to know about
|
65 | my $topthreads = 20; |
65 | my $topthreads = 20; |
66 | 66 | ||
67 | ## Number of top or bottom posters to show
|
67 | ## Number of top or bottom posters to show
|
68 | my $topposters = 20; |
68 | my $topposters = 20; |
69 | 69 | ||
70 | ## Number of cross-posted threads to show
|
70 | ## Number of cross-posted threads to show
|
71 | my $topcrossposts = 10; |
71 | my $topcrossposts = 10; |
72 | 72 | ||
73 | ## Number of time zones to show
|
73 | ## Number of time zones to show
|
74 | my $toptz = 10; |
74 | my $toptz = 10; |
75 | 75 | ||
76 | ###################### DATA STRUCTURES ######################
|
76 | ###################### DATA STRUCTURES ######################
|
77 | my $group = $newsgroup_name; |
77 | my $group = $newsgroup_name; |
78 | $group =~ s!\.!/!g; |
78 | $group =~ s!\.!/!g; |
79 | my %data; # name, count, agent, total, orig, quoted |
79 | my %data; # name, count, agent, total, orig, quoted |
80 | my %threads; # subject, count |
80 | my %threads; # subject, count |
81 | my %crossposts; # group, count |
81 | my %crossposts; # group, count |
82 | my %tz; # timezones by count |
82 | my %tz; # timezones by count |
83 | my %headers; # holds header of current article |
83 | my %headers; # holds header of current article |
84 | my %lcheader; # holds lowercase headers |
84 | my %lcheader; # holds lowercase headers |
85 | my @body; # holds body of current article |
85 | my @body; # holds body of current article |
86 | my @sig; # holds sig text; |
86 | my @sig; # holds sig text; |
87 | my $totalposts; # total no. of posts considered |
87 | my $totalposts; # total no. of posts considered |
88 | my $filename; # name of current article file |
88 | my $filename; # name of current article file |
89 | my $filesize; # size of current article file |
89 | my $filesize; # size of current article file |
90 | my $earliest; # earliest article we have found |
90 | my $earliest; # earliest article we have found |
91 | my $latest; # latest article we have found |
91 | my $latest; # latest article we have found |
92 | my $poster; # poster we are dealing with |
92 | my $poster; # poster we are dealing with |
93 | my $totsize = 0; # holds total sizes of all files |
93 | my $totsize = 0; # holds total sizes of all files |
94 | my $totheader = 0; # total size of header material |
94 | my $totheader = 0; # total size of header material |
95 | my $totbody = 0; # total size of body material |
95 | my $totbody = 0; # total size of body material |
96 | my $totsig = 0; # total size of sig material |
96 | my $totsig = 0; # total size of sig material |
97 | my $totorig = 0; # total size of original material |
97 | my $totorig = 0; # total size of original material |
98 | my $totquoted = 0; # total size of quoted material |
98 | my $totquoted = 0; # total size of quoted material |
99 | my $origposts = 0; # total no. of original posts |
99 | my $origposts = 0; # total no. of original posts |
100 | my $replies = 0; # total no. of replies |
100 | my $replies = 0; # total no. of replies |
101 | my $i; # general purpose |
101 | my $i; # general purpose |
102 | my %distinct_agent; |
102 | my %distinct_agent; |
103 | 103 | ||
104 | ## Used to hold counts of User Agents used
|
104 | ## Used to hold counts of User Agents used
|
105 | my %agents = ( |
105 | my %agents = ( |
106 | "Compuserver" => 0, |
106 | "Compuserver" => 0, |
107 | "Foorum" => 0, |
107 | "Foorum" => 0, |
108 | "Forte Agent" => 0, |
108 | "Forte Agent" => 0, |
109 | "Forte Free Agent" => 0, |
109 | "Forte Free Agent" => 0, |
110 | "Gnus" => 0, |
110 | "Gnus" => 0, |
111 | "KNode" => 0, |
111 | "KNode" => 0, |
112 | "MacSOUP" => 0, |
112 | "MacSOUP" => 0, |
113 | "MT-NewsWatcher" => 0, |
113 | "MT-NewsWatcher" => 0, |
114 | "MicroPlanet Gravity" => 0, |
114 | "MicroPlanet Gravity" => 0, |
115 | "Microsoft Outlook Express" => 0, |
115 | "Microsoft Outlook Express" => 0, |
116 | "Microsoft Windows Mail" => 0, |
116 | "Microsoft Windows Mail" => 0, |
117 | "Mozilla" => 0, |
117 | "Mozilla" => 0, |
118 | "News Rover" => 0, |
118 | "News Rover" => 0, |
119 | "NN" => 0, |
119 | "NN" => 0, |
120 | "Pan" => 0, |
120 | "Pan" => 0, |
121 | "rn" => 0, |
121 | "rn" => 0, |
122 | "slrn" => 0, |
122 | "slrn" => 0, |
123 | "Sylpheed" => 0, |
123 | "Sylpheed" => 0, |
124 | "tin" => 0, |
124 | "tin" => 0, |
125 | "VSoup" => 0, |
125 | "VSoup" => 0, |
126 | "WebTV" => 0, |
126 | "WebTV" => 0, |
127 | "Xnews" => 0 |
127 | "Xnews" => 0 |
128 | ); |
128 | ); |
129 | 129 | ||
130 | ######################## MAIN CODE ########################
|
130 | ######################## MAIN CODE ########################
|
131 | $! = 1; |
131 | $! = 1; |
132 | 132 | ||
133 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
133 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
134 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
134 | opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n"; |
135 | while ( defined( $filename = readdir(DIR) ) ) |
135 | while ( defined( $filename = readdir(DIR) ) ) |
136 | {
|
136 | {
|
137 | %lcheader = (); |
137 | %lcheader = (); |
138 | next unless -f $filename; # only want real files |
138 | next unless -f $filename; # only want real files |
139 | next if ( $filename eq ".overview" ); # real articles only |
139 | next if ( $filename eq ".overview" ); # real articles only |
140 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
140 | next if ( -M $filename > $numdays ); # only want articles <= a certain age |
141 | $earliest = ( stat $filename )[9] unless defined($earliest); |
141 | $earliest = ( stat $filename )[9] unless defined($earliest); |
142 | $latest = ( stat $filename )[9] unless defined($latest); |
142 | $latest = ( stat $filename )[9] unless defined($latest); |
143 | &get_article($filename); # read in the article |
143 | &get_article($filename); # read in the article |
144 | &get_data; # grab the data from the article |
144 | &get_data; # grab the data from the article |
145 | $totalposts++; # bump count of articles considered |
145 | $totalposts++; # bump count of articles considered |
146 | }
|
146 | }
|
147 | closedir(DIR); # finished with the directory |
147 | closedir(DIR); # finished with the directory |
148 | 148 | ||
149 | ## Post-processing
|
149 | ## Post-processing
|
150 | &count_agents; # count agents, collapsing versions |
150 | &count_agents; # count agents, collapsing versions |
151 | &fix_percent; # check percentages orig/total for posters |
151 | &fix_percent; # check percentages orig/total for posters |
152 | 152 | ||
153 | &write_data; |
153 | &write_data; |
154 | 154 | ||
155 | #################### DISPLAY RESULTS #####################
|
155 | #################### DISPLAY RESULTS #####################
|
156 | print "=" x 76, "\n"; |
156 | print "=" x 76, "\n"; |
157 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
157 | printf "%s\n", ¢red( "Analysis of posts to $newsgroup_name", 76 ); |
158 | print "=" x 76, "\n"; |
158 | print "=" x 76, "\n"; |
159 | printf "%s\n", |
159 | printf "%s\n", |
160 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
160 | ¢red( "(stats compiled with a script by Garry Knight et al.)", 76 ); |
161 | print "\n\n"; |
161 | print "\n\n"; |
162 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
162 | printf "Total posts considered: %s over %d days\n", commify($totalposts), |
163 | $numdays; |
163 | $numdays; |
164 | printf "Earliest article: %s\n", scalar localtime($earliest); |
164 | printf "Earliest article: %s\n", scalar localtime($earliest); |
165 | printf "Latest article: %s\n", scalar localtime($latest); |
165 | printf "Latest article: %s\n", scalar localtime($latest); |
166 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
166 | printf "Original articles: %s, replies: %s\n", commify($origposts), |
167 | commify($replies); |
167 | commify($replies); |
168 | printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
168 | printf "Total size of posts: %s bytes (%s KiB) (%.2f MiB)\n", commify($totsize), |
169 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
169 | commify( int( $totsize / 1024 ) ), $totsize / 1048576; # |
170 | printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
170 | printf "Average %s articles per day, %.2f MiB per day, %s bytes per article\n", |
171 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
171 | commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576, |
172 | commify( int( $totsize / $totalposts ) ); |
172 | commify( int( $totsize / $totalposts ) ); |
173 | my $count = keys %data; |
173 | my $count = keys %data; |
174 | printf "Total headers: %s KiB bodies: %s KiB\n", |
174 | printf "Total headers: %s KiB bodies: %s KiB\n", |
175 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
175 | commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) ); |
176 | printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
176 | printf "Body text - quoted: %s KiB, original: %s KiB = %02.2f%%, sigs: %s KiB\n", |
177 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
177 | commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ), |
178 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
178 | ( $totorig * 100 ) / ( $totorig + $totquoted ), |
179 | commify( int( $totsig / 1024 ) ); |
179 | commify( int( $totsig / 1024 ) ); |
180 | printf "Total number of posters: %s, average %s bytes per poster\n", |
180 | printf "Total number of posters: %s, average %s bytes per poster\n", |
181 | commify($count), commify( int( $totsize / $count ) ); #/ |
181 | commify($count), commify( int( $totsize / $count ) ); #/ |
182 | $count = keys %threads; |
182 | $count = keys %threads; |
183 | printf "Total number of threads: %s, average %s bytes per thread\n", |
183 | printf "Total number of threads: %s, average %s bytes per thread\n", |
184 | commify($count), commify( int( $totsize / $count ) ); #/ |
184 | commify($count), commify( int( $totsize / $count ) ); #/ |
185 | printf "Total number of user agents: %d\n", scalar keys %agents; |
185 | printf "Total number of user agents: %d\n", scalar keys %agents; |
186 | print "\n", "=" x 76, "\n"; |
186 | print "\n", "=" x 76, "\n"; |
187 | 187 | ||
188 | ########################################
|
188 | ########################################
|
189 | ## Show posters by article count Sec 1;
|
189 | ## Show posters by article count Sec 1;
|
190 | ########################################
|
190 | ########################################
|
191 | unless ( $skipSec{1} ) |
191 | unless ( $skipSec{1} ) |
192 | {
|
192 | {
|
193 | if ( keys %data < $topposters ) |
193 | if ( keys %data < $topposters ) |
194 | {
|
194 | {
|
195 | $count = keys %data; |
195 | $count = keys %data; |
196 | }
|
196 | }
|
197 | else
|
197 | else
|
198 | {
|
198 | {
|
199 | $count = $topposters; |
199 | $count = $topposters; |
200 | }
|
200 | }
|
201 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
201 | printf "%s\n", ¢red( "Top $count posters by number of articles", 76 ); |
202 | print "=" x 76, "\n"; |
202 | print "=" x 76, "\n"; |
203 | $i = 0; |
203 | $i = 0; |
204 | foreach
|
204 | foreach
|
205 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
205 | my $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data ) |
206 | {
|
206 | {
|
207 | my $name = substr( $poster, 0, 65 ); |
207 | my $name = substr( $poster, 0, 65 ); |
208 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
208 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
209 | $data{$poster}{count}; |
209 | $data{$poster}{count}; |
210 | last if ( ++$i == $count ); |
210 | last if ( ++$i == $count ); |
211 | }
|
211 | }
|
212 | print "\n", "=" x 76, "\n"; |
212 | print "\n", "=" x 76, "\n"; |
213 | }
|
213 | }
|
214 | 214 | ||
215 | ######################################
|
215 | ######################################
|
216 | ## Show posters by size in KiB Sec 2;
|
216 | ## Show posters by size in KiB Sec 2;
|
217 | ######################################
|
217 | ######################################
|
218 | unless ( $skipSec{2} ) |
218 | unless ( $skipSec{2} ) |
219 | {
|
219 | {
|
220 | if ( keys %data < $topposters ) |
220 | if ( keys %data < $topposters ) |
221 | {
|
221 | {
|
222 | $count = keys %data; |
222 | $count = keys %data; |
223 | }
|
223 | }
|
224 | else
|
224 | else
|
225 | {
|
225 | {
|
226 | $count = $topposters; |
226 | $count = $topposters; |
227 | }
|
227 | }
|
228 | printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
228 | printf "%s\n", ¢red( "Top $count posters by article size in KiB", 76 ); |
229 | print "=" x 76, "\n"; |
229 | print "=" x 76, "\n"; |
230 | $i = 0; |
230 | $i = 0; |
231 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
231 | foreach my $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data ) |
232 | {
|
232 | {
|
233 | my $name = substr( $poster, 0, 62 ); |
233 | my $name = substr( $poster, 0, 62 ); |
234 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
234 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ), |
235 | $data{$poster}{size} / 1024; #/ |
235 | $data{$poster}{size} / 1024; #/ |
236 | last if ( ++$i == $count ); |
236 | last if ( ++$i == $count ); |
237 | }
|
237 | }
|
238 | print "\n", "=" x 76, "\n"; |
238 | print "\n", "=" x 76, "\n"; |
239 | }
|
239 | }
|
240 | 240 | ||
241 | #####################################
|
241 | #####################################
|
242 | ## Show top posters for original text
|
242 | ## Show top posters for original text
|
243 | #####################################
|
243 | #####################################
|
244 | unless ( $skipSec{3} ) |
244 | unless ( $skipSec{3} ) |
245 | {
|
245 | {
|
246 | if ( keys %data < $topposters ) |
246 | if ( keys %data < $topposters ) |
247 | {
|
247 | {
|
248 | $count = keys %data; |
248 | $count = keys %data; |
249 | }
|
249 | }
|
250 | else
|
250 | else
|
251 | {
|
251 | {
|
252 | $count = $topposters; |
252 | $count = $topposters; |
253 | }
|
253 | }
|
254 | printf "%s\n", |
254 | printf "%s\n", |
255 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
255 | ¢red( "Top $count responders by original text (> 5 posts)", 76 ); |
256 | print "=" x 76, "\n"; |
256 | print "=" x 76, "\n"; |
257 | $i = 0; |
257 | $i = 0; |
258 | foreach my $poster ( |
258 | foreach my $poster ( |
259 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
259 | sort { $data{$b}{percent} <=> $data{$a}{percent} } |
260 | keys %data |
260 | keys %data |
261 | )
|
261 | )
|
262 | {
|
262 | {
|
263 | next if $data{$poster}{quoted} == 0; |
263 | next if $data{$poster}{quoted} == 0; |
264 | next if $data{$poster}{count} < 5; |
264 | next if $data{$poster}{count} < 5; |
265 | my $name = substr( $poster, 0, 63 ); |
265 | my $name = substr( $poster, 0, 63 ); |
266 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
266 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
267 | $data{$poster}{percent}; |
267 | $data{$poster}{percent}; |
268 | last if ( ++$i == $count ); |
268 | last if ( ++$i == $count ); |
269 | }
|
269 | }
|
270 | print "\n", "=" x 76, "\n"; |
270 | print "\n", "=" x 76, "\n"; |
271 | }
|
271 | }
|
272 | 272 | ||
273 | ########################################
|
273 | ########################################
|
274 | ## Show bottom posters for original text
|
274 | ## Show bottom posters for original text
|
275 | ########################################
|
275 | ########################################
|
276 | unless ( $skipSec{4} ) |
276 | unless ( $skipSec{4} ) |
277 | {
|
277 | {
|
278 | if ( keys %data < $topposters ) |
278 | if ( keys %data < $topposters ) |
279 | {
|
279 | {
|
280 | $count = keys %data; |
280 | $count = keys %data; |
281 | }
|
281 | }
|
282 | else
|
282 | else
|
283 | {
|
283 | {
|
284 | $count = $topposters; |
284 | $count = $topposters; |
285 | }
|
285 | }
|
286 | printf "%s\n", |
286 | printf "%s\n", |
287 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
287 | ¢red( "Bottom $count responders by original text (> 5 posts)", 76 ); |
288 | print "=" x 76, "\n"; |
288 | print "=" x 76, "\n"; |
289 | $i = 0; |
289 | $i = 0; |
290 | foreach my $poster ( |
290 | foreach my $poster ( |
291 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
291 | sort { $data{$a}{percent} <=> $data{$b}{percent} } |
292 | keys %data |
292 | keys %data |
293 | )
|
293 | )
|
294 | {
|
294 | {
|
295 | next if $data{$poster}{quoted} == 0; |
295 | next if $data{$poster}{quoted} == 0; |
296 | next if $data{$poster}{count} < 5; |
296 | next if $data{$poster}{count} < 5; |
297 | my $name = substr( $poster, 0, 63 ); |
297 | my $name = substr( $poster, 0, 63 ); |
298 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
298 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ), |
299 | $data{$poster}{percent}; |
299 | $data{$poster}{percent}; |
300 | last if ( ++$i == $count ); |
300 | last if ( ++$i == $count ); |
301 | }
|
301 | }
|
302 | print "\n", "=" x 76, "\n"; |
302 | print "\n", "=" x 76, "\n"; |
303 | }
|
303 | }
|
304 | 304 | ||
305 | #####################################
|
305 | #####################################
|
306 | ## Show threads by number of articles
|
306 | ## Show threads by number of articles
|
307 | #####################################
|
307 | #####################################
|
308 | unless ( $skipSec{5} ) |
308 | unless ( $skipSec{5} ) |
309 | {
|
309 | {
|
310 | if ( keys %threads < $topthreads ) |
310 | if ( keys %threads < $topthreads ) |
311 | {
|
311 | {
|
312 | $count = keys %threads; |
312 | $count = keys %threads; |
313 | }
|
313 | }
|
314 | else
|
314 | else
|
315 | {
|
315 | {
|
316 | $count = $topthreads; |
316 | $count = $topthreads; |
317 | }
|
317 | }
|
318 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
318 | printf "%s\n", ¢red( "Top $count threads by no. of articles", 76 ); |
319 | print "=" x 76, "\n"; |
319 | print "=" x 76, "\n"; |
320 | $i = 0; |
320 | $i = 0; |
321 | foreach my $thread ( |
321 | foreach my $thread ( |
322 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
322 | sort { $threads{$b}{count} <=> $threads{$a}{count} } |
323 | keys %threads |
323 | keys %threads |
324 | )
|
324 | )
|
325 | {
|
325 | {
|
326 | my $name = substr( $thread, 0, 65 ); |
326 | my $name = substr( $thread, 0, 65 ); |
327 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
327 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
328 | $threads{$thread}{count}; |
328 | $threads{$thread}{count}; |
329 | last if ( ++$i == $count ); |
329 | last if ( ++$i == $count ); |
330 | }
|
330 | }
|
331 | print "\n", "=" x 76, "\n"; |
331 | print "\n", "=" x 76, "\n"; |
332 | }
|
332 | }
|
333 | 333 | ||
334 | ##############################
|
334 | ##############################
|
335 | ## Show threads by size in KiB
|
335 | ## Show threads by size in KiB
|
336 | ##############################
|
336 | ##############################
|
337 | unless ( $skipSec{6} ) |
337 | unless ( $skipSec{6} ) |
338 | {
|
338 | {
|
339 | if ( keys %threads < $topthreads ) |
339 | if ( keys %threads < $topthreads ) |
340 | {
|
340 | {
|
341 | $count = keys %threads; |
341 | $count = keys %threads; |
342 | }
|
342 | }
|
343 | else
|
343 | else
|
344 | {
|
344 | {
|
345 | $count = $topthreads; |
345 | $count = $topthreads; |
346 | }
|
346 | }
|
347 | printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
347 | printf "%s\n", ¢red( "Top $count threads by size in KiB", 76 ); |
348 | print "=" x 76, "\n"; |
348 | print "=" x 76, "\n"; |
349 | $i = 0; |
349 | $i = 0; |
350 | foreach my $thread ( |
350 | foreach my $thread ( |
351 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
351 | sort { $threads{$b}{size} <=> $threads{$a}{size} } |
352 | keys %threads |
352 | keys %threads |
353 | )
|
353 | )
|
354 | {
|
354 | {
|
355 | my $name = substr( $thread, 0, 65 ); |
355 | my $name = substr( $thread, 0, 65 ); |
356 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
356 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
357 | $threads{$thread}{size} / 1024; #/ |
357 | $threads{$thread}{size} / 1024; #/ |
358 | last if ( ++$i == $count ); |
358 | last if ( ++$i == $count ); |
359 | }
|
359 | }
|
360 | print "\n", "=" x 76, "\n"; |
360 | print "\n", "=" x 76, "\n"; |
361 | }
|
361 | }
|
362 | 362 | ||
363 | ##################################
|
363 | ##################################
|
364 | ## Show top 10 cross-posted groups
|
364 | ## Show top 10 cross-posted groups
|
365 | ##################################
|
365 | ##################################
|
366 | unless ( $skipSec{7} ) |
366 | unless ( $skipSec{7} ) |
367 | {
|
367 | {
|
368 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
368 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
369 | if ( keys %crossposts < $topcrossposts ) |
369 | if ( keys %crossposts < $topcrossposts ) |
370 | {
|
370 | {
|
371 | $count = keys %crossposts; |
371 | $count = keys %crossposts; |
372 | }
|
372 | }
|
373 | else
|
373 | else
|
374 | {
|
374 | {
|
375 | $count = $topcrossposts; |
375 | $count = $topcrossposts; |
376 | }
|
376 | }
|
377 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
377 | printf "%s\n", ¢red( "Top $count cross-posted groups", 76 ); |
378 | print "=" x 76, "\n"; |
378 | print "=" x 76, "\n"; |
379 | $i = 0; |
379 | $i = 0; |
380 | foreach
|
380 | foreach
|
381 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
381 | my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts ) |
382 | {
|
382 | {
|
383 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
383 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ), |
384 | $crossposts{$name}; |
384 | $crossposts{$name}; |
385 | last if ( ++$i == $count ); |
385 | last if ( ++$i == $count ); |
386 | }
|
386 | }
|
387 | print "\n", "=" x 76, "\n"; |
387 | print "\n", "=" x 76, "\n"; |
388 | }
|
388 | }
|
389 | 389 | ||
390 | #########################
|
390 | #########################
|
391 | ## Show agents and counts
|
391 | ## Show agents and counts
|
392 | #########################
|
392 | #########################
|
393 | unless ( $skipSec{8} ) |
393 | unless ( $skipSec{8} ) |
394 | {
|
394 | {
|
395 | if ( keys %agents < $topagents ) |
395 | if ( keys %agents < $topagents ) |
396 | {
|
396 | {
|
397 | $count = keys %agents; |
397 | $count = keys %agents; |
398 | }
|
398 | }
|
399 | else
|
399 | else
|
400 | {
|
400 | {
|
401 | $count = $topagents; |
401 | $count = $topagents; |
402 | }
|
402 | }
|
403 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
403 | printf "%s\n", ¢red( "Top $count User Agents by poster", 76 ); |
404 | print "=" x 76, "\n"; |
404 | print "=" x 76, "\n"; |
405 | $i = 0; |
405 | $i = 0; |
406 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
406 | foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) |
407 | {
|
407 | {
|
408 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
408 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ), |
409 | $agents{$agent}; |
409 | $agents{$agent}; |
410 | last if ( ++$i == $count ); |
410 | last if ( ++$i == $count ); |
411 | }
|
411 | }
|
412 | print "\n", "=" x 76, "\n"; |
412 | print "\n", "=" x 76, "\n"; |
413 | }
|
413 | }
|
414 | 414 | ||
415 | #######################
|
415 | #######################
|
416 | ## Show distinct agents
|
416 | ## Show distinct agents
|
417 | #######################
|
417 | #######################
|
418 | unless ( $skipSec{9} ) |
418 | unless ( $skipSec{9} ) |
419 | {
|
419 | {
|
420 | if ( keys %distinct_agent < $topagents ) |
420 | if ( keys %distinct_agent < $topagents ) |
421 | {
|
421 | {
|
422 | $count = keys %distinct_agent; |
422 | $count = keys %distinct_agent; |
423 | }
|
423 | }
|
424 | else
|
424 | else
|
425 | {
|
425 | {
|
426 | $count = $topagents; |
426 | $count = $topagents; |
427 | }
|
427 | }
|
428 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
428 | printf "%s\n", ¢red( "Top $count User Agents by number of posts", 76 ); |
429 | print "=" x 76, "\n"; |
429 | print "=" x 76, "\n"; |
430 | $i = 0; |
430 | $i = 0; |
431 | foreach my $agent ( |
431 | foreach my $agent ( |
432 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
432 | sort { $distinct_agent{$b} <=> $distinct_agent{$a} } |
433 | keys %distinct_agent |
433 | keys %distinct_agent |
434 | )
|
434 | )
|
435 | {
|
435 | {
|
436 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
436 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ), |
437 | $distinct_agent{$agent}, |
437 | $distinct_agent{$agent}, |
438 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
438 | ( ( $distinct_agent{$agent} / $totalposts ) * 100 ); |
439 | last if ( ++$i == $count ); |
439 | last if ( ++$i == $count ); |
440 | }
|
440 | }
|
441 | print "\n", "=" x 76, "\n"; |
441 | print "\n", "=" x 76, "\n"; |
442 | }
|
442 | }
|
443 | 443 | ||
444 | ############################
|
444 | ############################
|
445 | ## Show timezones and counts
|
445 | ## Show timezones and counts
|
446 | ############################
|
446 | ############################
|
447 | unless ( $skipSec{10} ) |
447 | unless ( $skipSec{10} ) |
448 | {
|
448 | {
|
449 | if ( keys %tz < $toptz ) |
449 | if ( keys %tz < $toptz ) |
450 | {
|
450 | {
|
451 | $count = keys %tz; |
451 | $count = keys %tz; |
452 | }
|
452 | }
|
453 | else
|
453 | else
|
454 | {
|
454 | {
|
455 | $count = $toptz; |
455 | $count = $toptz; |
456 | }
|
456 | }
|
457 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
457 | printf "%s\n", ¢red( "Top 10 time zones", 76 ); |
458 | print "=" x 76, "\n"; |
458 | print "=" x 76, "\n"; |
459 | $i = 0; |
459 | $i = 0; |
460 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
460 | foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz ) |
461 | {
|
461 | {
|
462 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
462 | printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone}; |
463 | last if ( ++$i == $count ); |
463 | last if ( ++$i == $count ); |
464 | }
|
464 | }
|
465 | print "\n", "=" x 76, "\n"; |
465 | print "\n", "=" x 76, "\n"; |
466 | }
|
466 | }
|
467 | 467 | ||
468 | ################################ SUBROUTINES ################################
|
468 | ################################ SUBROUTINES ################################
|
469 | 469 | ||
470 | ########################################
|
470 | ########################################
|
471 | ## Get current article's header and body
|
471 | ## Get current article's header and body
|
472 | ########################################
|
472 | ########################################
|
473 | sub get_article
|
473 | sub get_article
|
474 | {
|
474 | {
|
475 | %headers = (); # dump old headers |
475 | %headers = (); # dump old headers |
476 | my $filename = shift; # get the name of the file |
476 | my $filename = shift; # get the name of the file |
477 | 477 | ||
478 | ## get stats about the file itself
|
478 | ## get stats about the file itself
|
479 | $filesize = -s $filename; # get total size of file |
479 | $filesize = -s $filename; # get total size of file |
480 | $totsize += $filesize; # bump total sizes of all files |
480 | $totsize += $filesize; # bump total sizes of all files |
481 | 481 | ||
482 | my $mtime = ( stat $filename )[9]; |
482 | my $mtime = ( stat $filename )[9]; |
483 | if ( $mtime < $earliest ) |
483 | if ( $mtime < $earliest ) |
484 | {
|
484 | {
|
485 | $earliest = $mtime; |
485 | $earliest = $mtime; |
486 | }
|
486 | }
|
487 | elsif ( $mtime > $latest ) |
487 | elsif ( $mtime > $latest ) |
488 | {
|
488 | {
|
489 | $latest = $mtime; |
489 | $latest = $mtime; |
490 | }
|
490 | }
|
491 | 491 | ||
492 | ## now read the file
|
492 | ## now read the file
|
493 | open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
493 | open( my $FILE, '<', $filename ) or die "Can't open $filename: $!\n"; |
494 | while (<$FILE>) |
494 | while (<$FILE>) |
495 | {
|
495 | {
|
496 | $totheader += length($_); # bump total header size |
496 | $totheader += length($_); # bump total header size |
497 | last if (/^\s*$/); # end of header? |
497 | last if (/^\s*$/); # end of header? |
498 | if (/^([^:\s]*):\s*(.*)/) |
498 | if (/^([^:\s]*):\s*(.*)/) |
499 | {
|
499 | {
|
500 | my ( $key, $val ) = ( $1, $2 ); |
500 | my ( $key, $val ) = ( $1, $2 ); |
501 | $headers{$key} = decode( 'MIME-Header', $val ); |
501 | $headers{$key} = decode( 'MIME-Header', $val ); |
502 | $lcheader{ clean( lc($key) ) } = clean($val); |
502 | $lcheader{ clean( lc($key) ) } = clean($val); |
503 | }
|
503 | }
|
504 | }
|
504 | }
|
505 | @body = <$FILE>; # slurp up body |
505 | @body = <$FILE>; # slurp up body |
506 | close($FILE); |
506 | close($FILE); |
507 | } # get_article |
507 | } # get_article |
508 | 508 | ||
509 | ####################################
|
509 | ####################################
|
510 | ## Get data from the current article
|
510 | ## Get data from the current article
|
511 | ####################################
|
511 | ####################################
|
512 | sub get_data
|
512 | sub get_data
|
513 | {
|
513 | {
|
514 | #### First, analyse header fields ####
|
514 | #### First, analyse header fields ####
|
515 | 515 | ||
516 | ## Set up this poster if not defined, get counts, sizes
|
516 | ## Set up this poster if not defined, get counts, sizes
|
517 | my $poster = $headers{From}; # get the poster's name |
517 | my $poster = $headers{From}; # get the poster's name |
518 | 518 | ||
519 | # Convert old to new format
|
519 | # Convert old to new format
|
520 | $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
520 | $poster =~ s/^\s*(.+?\@.+?)\s*\((.+?)\)\s*$/$2 <$1>/; |
521 | 521 | ||
522 | # Collapse whitespace
|
522 | # Collapse whitespace
|
523 | $poster =~ s/\s+/ /; |
523 | $poster =~ s/\s+/ /; |
524 | 524 | ||
525 | # Remove outer quotes
|
525 | # Remove outer quotes
|
526 | $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
526 | $poster =~ s/^["'](.+?)["']\s+(.*)/$1 $2/; |
527 | 527 | ||
528 | if ( !defined( $data{$poster} ) ) |
528 | if ( !defined( $data{$poster} ) ) |
529 | { # seen this one before? |
529 | { # seen this one before? |
530 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
530 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
531 | $data{$poster}{orig} = 0; |
531 | $data{$poster}{orig} = 0; |
532 | $data{$poster}{quoted} = 0; |
532 | $data{$poster}{quoted} = 0; |
533 | }
|
533 | }
|
534 | $data{$poster}{count}++; # bump count for this poster |
534 | $data{$poster}{count}++; # bump count for this poster |
535 | $data{$poster}{size} += $filesize; # total size of file |
535 | $data{$poster}{size} += $filesize; # total size of file |
536 | 536 | ||
537 | ## The User-Agent and/or X-Newsreader fields
|
537 | ## The User-Agent and/or X-Newsreader fields
|
538 | ## for User-Agent by poster
|
538 | ## for User-Agent by poster
|
539 | if ( defined $lcheader{"user-agent"} ) |
539 | if ( defined $lcheader{"user-agent"} ) |
540 | {
|
540 | {
|
541 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
541 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
542 | }
|
542 | }
|
543 | if ( defined $lcheader{"x-newsreader"} ) |
543 | if ( defined $lcheader{"x-newsreader"} ) |
544 | {
|
544 | {
|
545 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
545 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
546 | }
|
546 | }
|
547 | 547 | ||
548 | ## The User Agent for User-Agent by number of posts
|
548 | ## The User Agent for User-Agent by number of posts
|
549 | my $UA = "unknown"; |
549 | my $UA = "unknown"; |
550 | foreach my $keys ( keys %lcheader ) |
550 | foreach my $keys ( keys %lcheader ) |
551 | {
|
551 | {
|
552 | if ( defined $lcheader{'user-agent'} ) |
552 | if ( defined $lcheader{'user-agent'} ) |
553 | {
|
553 | {
|
554 | $UA = $lcheader{'user-agent'}; |
554 | $UA = $lcheader{'user-agent'}; |
555 | }
|
555 | }
|
556 | elsif ( defined $lcheader{"x-newsreader"} ) |
556 | elsif ( defined $lcheader{"x-newsreader"} ) |
557 | {
|
557 | {
|
558 | $UA = $lcheader{"x-newsreader"}; |
558 | $UA = $lcheader{"x-newsreader"}; |
559 | }
|
559 | }
|
560 | elsif ( defined $lcheader{'x-mailer'} ) |
560 | elsif ( defined $lcheader{'x-mailer'} ) |
561 | {
|
561 | {
|
562 | $UA = $lcheader{'x-mailer'}; |
562 | $UA = $lcheader{'x-mailer'}; |
563 | }
|
563 | }
|
564 | elsif ( |
564 | elsif ( |
565 | ( defined $lcheader{'organization'} ) |
565 | ( defined $lcheader{'organization'} ) |
566 | && ( $lcheader{'organization'} =~ |
566 | && ( $lcheader{'organization'} =~ |
567 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
567 | /groups\.google|AOL|Supernews|WebTV|compuserve/ ) |
568 | )
|
568 | )
|
569 | {
|
569 | {
|
570 | $UA = $lcheader{'organization'}; |
570 | $UA = $lcheader{'organization'}; |
571 | }
|
571 | }
|
572 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
572 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
573 | {
|
573 | {
|
574 | $UA = "Pine"; |
574 | $UA = "Pine"; |
575 | } ## Hopefully found UA, else set to unknown |
575 | } ## Hopefully found UA, else set to unknown |
576 | }
|
576 | }
|
577 | 577 | ||
578 | $UA = clean($UA); |
578 | $UA = clean($UA); |
579 | $UA = get_agent($UA); |
579 | $UA = get_agent($UA); |
580 | 580 | ||
581 | sub get_agent
|
581 | sub get_agent
|
582 | {
|
582 | {
|
583 | my $raw = shift; |
583 | my $raw = shift; |
584 | my $agent = $raw; |
584 | my $agent = $raw; |
585 | 585 | ||
586 | ## strip http
|
586 | ## strip http
|
587 | if ( $raw =~ /.*http.*/ ) |
587 | if ( $raw =~ /.*http.*/ ) |
588 | {
|
588 | {
|
589 | $raw =~ s!posted via!!i; |
589 | $raw =~ s!posted via!!i; |
590 | $raw =~ s!http://!!g; |
590 | $raw =~ s!http://!!g; |
591 | $raw =~ s!/!!g; |
591 | $raw =~ s!/!!g; |
592 | $raw =~ s! !!g; |
592 | $raw =~ s! !!g; |
593 | }
|
593 | }
|
594 | 594 | ||
595 | ## Fix Outlook from Mac
|
595 | ## Fix Outlook from Mac
|
596 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
596 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; } |
597 | 597 | ||
598 | ## Pick out the popular agents
|
598 | ## Pick out the popular agents
|
599 | if ( |
599 | if ( |
600 | $raw =~ /(outlook express)/i |
600 | $raw =~ /(outlook express)/i |
601 | || $raw =~ /(windows mail)/i |
601 | || $raw =~ /(windows mail)/i |
602 | || $raw =~ /(microplanet gravity)/i |
602 | || $raw =~ /(microplanet gravity)/i |
603 | || $raw =~ /(news rover)/i |
603 | || $raw =~ /(news rover)/i |
604 | || $raw =~ /(forte agent)/i |
604 | || $raw =~ /(forte agent)/i |
605 | || $raw =~ /(forte free agent)/i |
605 | || $raw =~ /(forte free agent)/i |
606 | )
|
606 | )
|
607 | {
|
607 | {
|
608 | $agent = $1; |
608 | $agent = $1; |
609 | }
|
609 | }
|
610 | elsif ( |
610 | elsif ( |
611 | $raw =~ /^( |
611 | $raw =~ /^( |
612 | pan |
612 | pan |
613 | |sylpheed
|
613 | |sylpheed
|
614 | |slrn
|
614 | |slrn
|
615 | |mozilla
|
615 | |mozilla
|
616 | |knode
|
616 | |knode
|
617 | |tin
|
617 | |tin
|
618 | |hamster
|
618 | |hamster
|
619 | |xrn
|
619 | |xrn
|
620 | |xnews
|
620 | |xnews
|
621 | |aol
|
621 | |aol
|
622 | |gnus
|
622 | |gnus
|
623 | |krn
|
623 | |krn
|
624 | |macsoup
|
624 | |macsoup
|
625 | |messenger
|
625 | |messenger
|
626 | |openxp
|
626 | |openxp
|
627 | |pine
|
627 | |pine
|
628 | |thoth
|
628 | |thoth
|
629 | |turnpike
|
629 | |turnpike
|
630 | |winvn
|
630 | |winvn
|
631 | |vsoup
|
631 | |vsoup
|
632 | |google
|
632 | |google
|
633 | |supernews
|
633 | |supernews
|
634 | |nn
|
634 | |nn
|
635 | |rn
|
635 | |rn
|
636 | |007
|
636 | |007
|
637 | |webtv
|
637 | |webtv
|
638 | |compuserve
|
638 | |compuserve
|
639 | )/ix |
639 | )/ix |
640 | )
|
640 | )
|
641 | {
|
641 | {
|
642 | $agent = $1; |
642 | $agent = $1; |
643 | }
|
643 | }
|
644 | else
|
644 | else
|
645 | {
|
645 | {
|
646 | ## Clean up unknown agents
|
646 | ## Clean up unknown agents
|
647 | if ( $raw =~ m!^(.*?)/! ) |
647 | if ( $raw =~ m!^(.*?)/! ) |
648 | {
|
648 | {
|
649 | $agent = $1; |
649 | $agent = $1; |
650 | }
|
650 | }
|
651 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
651 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
652 | {
|
652 | {
|
653 | $agent = $1; |
653 | $agent = $1; |
654 | }
|
654 | }
|
655 | }
|
655 | }
|
656 | 656 | ||
657 | $distinct_agent{$agent}++; |
657 | $distinct_agent{$agent}++; |
658 | return $agent; |
658 | return $agent; |
659 | }
|
659 | }
|
660 | 660 | ||
661 | ## Get all cross-posted newsgroups
|
661 | ## Get all cross-posted newsgroups
|
662 | for ( split /,/, $headers{"Newsgroups"} ) |
662 | for ( split /,/, $headers{"Newsgroups"} ) |
663 | {
|
663 | {
|
664 | $crossposts{$_}++; # bump count for each |
664 | $crossposts{$_}++; # bump count for each |
665 | }
|
665 | }
|
666 | 666 | ||
667 | ## Get threads
|
667 | ## Get threads
|
668 | my $thread = $headers{"Subject"}; |
668 | my $thread = $headers{"Subject"}; |
669 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
669 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
670 | $thread =~ s/\s+/ /g; # collapse whitespace |
670 | $thread =~ s/\s+/ /g; # collapse whitespace |
671 | $threads{$thread}{count} += 1; # bump count of this subject |
671 | $threads{$thread}{count} += 1; # bump count of this subject |
672 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
672 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
673 | 673 | ||
674 | ## Is this an original post or a reply?
|
674 | ## Is this an original post or a reply?
|
675 | if ( defined $headers{"References"} ) |
675 | if ( defined $headers{"References"} ) |
676 | {
|
676 | {
|
677 | $replies++; |
677 | $replies++; |
678 | }
|
678 | }
|
679 | else
|
679 | else
|
680 | {
|
680 | {
|
681 | $origposts++; |
681 | $origposts++; |
682 | }
|
682 | }
|
683 | 683 | ||
684 | ## Get the time zone
|
684 | ## Get the time zone
|
685 | $_ = $headers{"Date"}; |
685 | $_ = $headers{"Date"}; |
686 | my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
686 | my ($tz) = /\d\d:\d\d(?::\d\d)?\s+(.*)/; |
687 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
687 | if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) ) |
688 | {
|
688 | {
|
689 | $tz = "UTC"; |
689 | $tz = "UTC"; |
690 | }
|
690 | }
|
691 | $tz{$tz}++; |
691 | $tz{$tz}++; |
692 | 692 | ||
693 | #### Now analyse the body text ####
|
693 | #### Now analyse the body text ####
|
694 | my $insig = 0; |
694 | my $insig = 0; |
695 | for (@body) |
695 | for (@body) |
696 | {
|
696 | {
|
697 | $totbody += length($_); # bump total body size |
697 | $totbody += length($_); # bump total body size |
698 | next if (/^$>/); # don't count blank lines in body |
698 | next if (/^$>/); # don't count blank lines in body |
699 | if ( $insig == 1 ) |
699 | if ( $insig == 1 ) |
700 | {
|
700 | {
|
701 | $totsig += length($_); # bump total sig size |
701 | $totsig += length($_); # bump total sig size |
702 | 702 | ||
703 | ## Bill Unruh uses ] quotes, and another poster uses ::
|
703 | ## Bill Unruh uses ] quotes, and another poster uses ::
|
704 | }
|
704 | }
|
705 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
705 | elsif ( /^\s*[>\]]/ or /^\s*::/ ) |
706 | { # are we in a quote line? |
706 | { # are we in a quote line? |
707 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
707 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
708 | $totquoted += length($_); |
708 | $totquoted += length($_); |
709 | }
|
709 | }
|
710 | elsif (/-- /) |
710 | elsif (/-- /) |
711 | {
|
711 | {
|
712 | $insig = 1; |
712 | $insig = 1; |
713 | }
|
713 | }
|
714 | else
|
714 | else
|
715 | {
|
715 | {
|
716 | 716 | ||
717 | ## We must be processing an original line
|
717 | ## We must be processing an original line
|
718 | $data{$poster}{orig} += length($_); # bump count of original chrs |
718 | $data{$poster}{orig} += length($_); # bump count of original chrs |
719 | $totorig += length($_); |
719 | $totorig += length($_); |
720 | }
|
720 | }
|
721 | } # end for (@body) |
721 | } # end for (@body) |
722 | 722 | ||
723 | } # get_data |
723 | } # get_data |
724 | 724 | ||
725 | #########################################
|
725 | #########################################
|
726 | ## Count the User-Agents used, collapsing
|
726 | ## Count the User-Agents used, collapsing
|
727 | ## different versions into one per agent.
|
727 | ## different versions into one per agent.
|
728 | #########################################
|
728 | #########################################
|
729 | sub count_agents
|
729 | sub count_agents
|
730 | {
|
730 | {
|
731 | POSTER:
|
731 | POSTER:
|
732 | foreach my $poster ( keys %data ) |
732 | foreach my $poster ( keys %data ) |
733 | {
|
733 | {
|
734 | foreach my $agent_name ( keys %distinct_agent ) |
734 | foreach my $agent_name ( keys %distinct_agent ) |
735 | { # check against known ones |
735 | { # check against known ones |
736 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
736 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) |
737 | {
|
737 | {
|
738 | $agents{$agent_name}++; |
738 | $agents{$agent_name}++; |
739 | next POSTER; |
739 | next POSTER; |
740 | }
|
740 | }
|
741 | }
|
741 | }
|
742 | $agents{ $data{$poster}{agent} }++; |
742 | $agents{ $data{$poster}{agent} }++; |
743 | }
|
743 | }
|
744 | } # count_agents |
744 | } # count_agents |
745 | 745 | ||
746 | #############################################
|
746 | #############################################
|
747 | ## Set orig/total percentages for all posters
|
747 | ## Set orig/total percentages for all posters
|
748 | #############################################
|
748 | #############################################
|
749 | sub fix_percent
|
749 | sub fix_percent
|
750 | {
|
750 | {
|
751 | foreach my $poster ( keys %data ) |
751 | foreach my $poster ( keys %data ) |
752 | {
|
752 | {
|
753 | my $percent = 100; |
753 | my $percent = 100; |
754 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
754 | if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) ) |
755 | {
|
755 | {
|
756 | $percent = |
756 | $percent = |
757 | $data{$poster}{orig} * 100 / |
757 | $data{$poster}{orig} * 100 / |
758 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
758 | ( $data{$poster}{quoted} + $data{$poster}{orig} ); #/ |
759 | }
|
759 | }
|
760 | elsif ( $data{$poster}{orig} == 0 ) |
760 | elsif ( $data{$poster}{orig} == 0 ) |
761 | {
|
761 | {
|
762 | $percent = 0; |
762 | $percent = 0; |
763 | }
|
763 | }
|
764 | $data{$poster}{percent} = $percent; |
764 | $data{$poster}{percent} = $percent; |
765 | }
|
765 | }
|
766 | }
|
766 | }
|
767 | 767 | ||
768 | ###############################
|
768 | ###############################
|
769 | ## Right pad a string with '.'s
|
769 | ## Right pad a string with '.'s
|
770 | ###############################
|
770 | ###############################
|
771 | sub rpad
|
771 | sub rpad
|
772 | {
|
772 | {
|
773 | ## Get text to pad, length to pad, pad chr
|
773 | ## Get text to pad, length to pad, pad chr
|
774 | my ( $text, $pad_len, $pad_chr ) = @_; |
774 | my ( $text, $pad_len, $pad_chr ) = @_; |
775 | 775 | ||
776 | ## DEBUG
|
776 | ## DEBUG
|
777 | #printf "|%s| = %d\n", $text, length($text);
|
777 | #printf "|%s| = %d\n", $text, length($text);
|
778 | 778 | ||
779 | if ( length($text) > $pad_len ) |
779 | if ( length($text) > $pad_len ) |
780 | {
|
780 | {
|
781 | $text = substr( $text, 0, $pad_len ); |
781 | $text = substr( $text, 0, $pad_len ); |
782 | }
|
782 | }
|
783 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
783 | my $padded = $text . $pad_chr x ( $pad_len - length($text) ); |
784 | return $padded; |
784 | return $padded; |
785 | }
|
785 | }
|
786 | 786 | ||
787 | ##################
|
787 | ##################
|
788 | ## Centre a string
|
788 | ## Centre a string
|
789 | ##################
|
789 | ##################
|
790 | sub centred
|
790 | sub centred
|
791 | {
|
791 | {
|
792 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
792 | my ( $text, $width ) = @_; # text to centre, size of field to centre in |
793 | my $pad_len = ( $width - length($text) ) / 2; #/ |
793 | my $pad_len = ( $width - length($text) ) / 2; #/ |
794 | my $centred = " " x $pad_len . $text; |
794 | my $centred = " " x $pad_len . $text; |
795 | return $centred; |
795 | return $centred; |
796 | }
|
796 | }
|
797 | 797 | ||
798 | ###########################
|
798 | ###########################
|
799 | ## Put commas into a number
|
799 | ## Put commas into a number
|
800 | ###########################
|
800 | ###########################
|
801 | sub commify
|
801 | sub commify
|
802 | {
|
802 | {
|
803 | local $_ = shift; |
803 | local $_ = shift; |
804 | 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
804 | 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
805 | return $_; |
805 | return $_; |
806 | }
|
806 | }
|
807 | 807 | ||
808 | ################################################################
|
808 | ################################################################
|
809 | ## Returns a string with leading and trailing whitespace removed
|
809 | ## Returns a string with leading and trailing whitespace removed
|
810 | ################################################################
|
810 | ################################################################
|
811 | sub clean
|
811 | sub clean
|
812 | {
|
812 | {
|
813 | my $dirty = shift; |
813 | my $dirty = shift; |
814 | my $clean = $dirty; |
814 | my $clean = $dirty; |
815 | $clean =~ s/^\s*|\s*$//g; |
815 | $clean =~ s/^\s+|\s+$//g; |
816 | 816 | ||
817 | return $clean; |
817 | return $clean; |
818 | }
|
818 | }
|
819 | 819 | ||
820 | sub usage
|
820 | sub usage
|
821 | {
|
821 | {
|
822 | print "usage: newstat.pl newsgroupname\n"; |
822 | print "usage: newstat.pl newsgroupname\n"; |
823 | exit 1; |
823 | exit 1; |
824 | }
|
824 | }
|
825 | 825 | ||
826 | ##################################
|
826 | ##################################
|
827 | ## Write data structures to a file
|
827 | ## Write data structures to a file
|
828 | ##################################
|
828 | ##################################
|
829 | sub write_data
|
829 | sub write_data
|
830 | {
|
830 | {
|
831 | open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
831 | open my $OUTF, ">:encoding(UTF-8)", "/tmp/XDATA" |
832 | or die "Can't create XDATA: $!\n"; |
832 | or die "Can't create XDATA: $!\n"; |
833 | print $OUTF "Data collected from $newsgroup_name\n\n"; |
833 | print $OUTF "Data collected from $newsgroup_name\n\n"; |
834 | print $OUTF |
834 | print $OUTF |
835 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
835 | "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
836 | foreach my $name ( keys %data ) |
836 | foreach my $name ( keys %data ) |
837 | {
|
837 | {
|
838 | print $OUTF |
838 | print $OUTF |
839 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
839 | "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
840 | }
|
840 | }
|
841 | print $OUTF |
841 | print $OUTF |
842 | "============================================================================\n"; |
842 | "============================================================================\n"; |
843 | print $OUTF "Thread subjects\n"; |
843 | print $OUTF "Thread subjects\n"; |
844 | print $OUTF |
844 | print $OUTF |
845 | "----------------------------------------------------------------------------\n"; |
845 | "----------------------------------------------------------------------------\n"; |
846 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
846 | foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads ) |
847 | {
|
847 | {
|
848 | print $OUTF |
848 | print $OUTF |
849 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
849 | "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
850 | }
|
850 | }
|
851 | print $OUTF |
851 | print $OUTF |
852 | "============================================================================\n"; |
852 | "============================================================================\n"; |
853 | print $OUTF "Cross-posts\n"; |
853 | print $OUTF "Cross-posts\n"; |
854 | print $OUTF |
854 | print $OUTF |
855 | "----------------------------------------------------------------------------\n"; |
855 | "----------------------------------------------------------------------------\n"; |
856 | foreach my $name ( sort keys %crossposts ) |
856 | foreach my $name ( sort keys %crossposts ) |
857 | {
|
857 | {
|
858 | print $OUTF "$name : $crossposts{$name}\n"; |
858 | print $OUTF "$name : $crossposts{$name}\n"; |
859 | }
|
859 | }
|
860 | print $OUTF |
860 | print $OUTF |
861 | "============================================================================\n"; |
861 | "============================================================================\n"; |
862 | print $OUTF "User agents\n"; |
862 | print $OUTF "User agents\n"; |
863 | print $OUTF |
863 | print $OUTF |
864 | "----------------------------------------------------------------------------\n"; |
864 | "----------------------------------------------------------------------------\n"; |
865 | foreach my $name ( sort keys %agents ) |
865 | foreach my $name ( sort keys %agents ) |
866 | {
|
866 | {
|
867 | print $OUTF "$name : $agents{$name}\n"; |
867 | print $OUTF "$name : $agents{$name}\n"; |
868 | }
|
868 | }
|
869 | print $OUTF |
869 | print $OUTF |
870 | "============================================================================\n"; |
870 | "============================================================================\n"; |
871 | print $OUTF "Time zones\n"; |
871 | print $OUTF "Time zones\n"; |
872 | print $OUTF |
872 | print $OUTF |
873 | "----------------------------------------------------------------------------\n"; |
873 | "----------------------------------------------------------------------------\n"; |
874 | foreach my $name ( sort keys %tz ) |
874 | foreach my $name ( sort keys %tz ) |
875 | {
|
875 | {
|
876 | print $OUTF "$name : $tz{$name}\n"; |
876 | print $OUTF "$name : $tz{$name}\n"; |
877 | }
|
877 | }
|
878 | close $OUTF; |
878 | close $OUTF; |
879 | } # write_data |
879 | } # write_data |
880 | 880 |