Rev 5 | Rev 7 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 5 | Rev 6 | ||
---|---|---|---|
1 | #!/usr/bin/perl -w
|
1 | #!/usr/bin/perl -w
|
2 | use strict; |
2 | use strict; |
3 | #########################
|
3 | use warnings; |
4 | # newsstat.pl version 0.3
|
4 | use utf8; |
5 | 5 | use Encode; |
|
6 | 6 | ||
- | 7 | #########################
|
|
- | 8 | # newsstat.pl version 0.4
|
|
7 | 9 | ||
8 | ###################################################################
|
10 | ############################################################################
|
9 | # Collect statistics about the alt.os.linux.mandrake newsgroup.
|
11 | # Collect statistics about a newsgroup (specified by first argument) in
|
10 | # Check all articles in the last 7-day period. Rank posters by
|
12 | # the local news spool. Check all articles in the last 30-day period.
|
11 | # no. of posts and by volume of posts, report on top and bottom
|
13 | # Rank posters by number of posts and by volume of posts, report on top and
|
12 | # 20 posters. Show their name, no. posts, size of posts, percentage
|
14 | # bottom 20 posters. Show their name, number of posts, size of posts,
|
13 | # quoted lines. Rank user-agents used, by poster rather than by
|
15 | # percentage of quoted lines. Rank user-agents used, by poster rather than
|
14 | # post. Rank top 10 threads. Rank top 10 cross-posted groups.
|
16 | # by post. Rank top 20 threads. Rank top 10 cross-posted groups.
|
- | 17 | #
|
|
- | 18 | # (Numbers and paths can be configured below. -- PE)
|
|
15 | ###################################################################
|
19 | ############################################################################
|
16 | 20 | ||
17 | ##################################################################
|
21 | ############################################################################
|
18 | # RECENT CHANGES #
|
22 | # RECENT CHANGES #
|
- | 23 | # 2011-07-03 PE - Use Encode to decode/encode MIME encodings
|
|
- | 24 | # - Use warnings, utf8 (just in case)
|
|
- | 25 | # - Documentation update
|
|
- | 26 | # N/A NN - Take newsgroup name as argument
|
|
19 | # 2004/06/19 - newsgroup name is $ARGV[0]
|
27 | # 2004-06-19 NN - newsgroup name is $ARGV[0]
|
20 | # - Allow command line flags for subtracting
|
28 | # - Allow command line flags for subtracting
|
21 | # output if not pertinent for a group
|
29 | # output if not pertinent for a group
|
22 | # 2002/11/09 - Put Garry's writedata() function back in.
|
30 | # 2002-11-09 NN - Put Garry's writedata() function back in.
|
23 | # - added "rn" to my list of UA's
|
31 | # - added "rn" to my list of UA's
|
24 | # - Started using %distinct_agent for both User agent
|
32 | # - Started using %distinct_agent for both User agent
|
25 | # sections
|
33 | # sections
|
26 | # - named it newsstat.pl version 0.3
|
34 | # - named it newsstat.pl version 0.3
|
27 | # 2002/11/06 - Fixed the earliest/latest file problem by using
|
35 | # 2002-11-06 NN - Fixed the earliest/latest file problem by using
|
28 | # mtime rather than ctime, and simplifying the logic
|
36 | # mtime rather than ctime, and simplifying the logic
|
29 | # 2002/11/05 - moved user configurations to the top
|
37 | # 2002-11-05 NN - moved user configurations to the top
|
30 | # - fixed the cross-posting section
|
38 | # - fixed the cross-posting section
|
31 | # - introduced the $newsgroup_name variable which
|
39 | # - introduced the $newsgroup_name variable which
|
32 | # later becomes $news$group
|
40 | # later becomes $news$group
|
33 | # - changed $name to $agent_name in countagents()
|
41 | # - changed $name to $agent_name in countagents()
|
- | 42 | #
|
|
- | 43 | # Contributors
|
|
- | 44 | # -------------
|
|
- | 45 | # NN Nomen nominandum (name to be determined later)
|
|
- | 46 | # PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
|
|
34 | 47 | ||
35 | ########### NEXT #############
|
48 | ########### TODO #############
|
36 | # Commas in bottom section of report
|
49 | # Commas in bottom section of report
|
37 | # Show date the figures were compiled
|
50 | # Show date the figures were compiled
|
38 | # No. of HTML articles (Content-Type: text/html)
|
51 | # No. of HTML articles (Content-Type: text/html)
|
39 | # No. of quoted sigs (/>\s*-- /)
|
52 | # No. of quoted sigs (/>\s*-- /)
|
40 | # Per cent of top-posted articles
|
53 | # Per cent of top-posted articles
|
41 | # Top 10 cross-posters
|
54 | # Top 10 cross-posters
|
42 | # Top 20 news posting hosts (from Path)
|
55 | # Top 20 news posting hosts (from Path)
|
43 | # Count of certain subject words: newbie, kde, burner, sendmail, etc.
|
56 | # Count of certain subject words: newbie, kde, burner, sendmail, etc.
|
44 | # Count *all* User Agents that each poster uses
|
57 | # Count *all* User Agents that each poster uses
|
45 | # What do we do about Bill Unruh's ] quote style?
|
58 | # What do we do about Bill Unruh's ] quote style?
|
46 | # Change the way dates/times are checked
|
59 | # Change the way dates/times are checked
|
47 | # include % share in posters by no. of arts
|
60 | # include % share in posters by no. of arts
|
48 | # include % share in posters by size
|
61 | # include % share in posters by size
|
49 | # Total, orig & quoted lines by user agent with per cent
|
62 | # Total, orig & quoted lines by user agent with per cent
|
50 | # Take arguments, i.e. newsgroup name
|
63 | # Take more arguments
|
51 | #######################################################
|
64 | #######################################################
|
52 | 65 | ||
53 | ###################### USER CONFIGURATIONS ############################
|
66 | ###################### USER CONFIGURATIONS ############################
|
54 | 67 | ||
55 | # The name of the group to do stats for
|
68 | # The name of the group to do stats for
|
56 | my $newsgroup_name = $ARGV[0]; |
69 | my $newsgroup_name = $ARGV[0]; |
57 | $newsgroup_name or &usage; |
70 | $newsgroup_name or &usage; |
58 | 71 | ||
59 | # Check for removal flags
|
72 | # Check for removal flags
|
60 | my $ix; |
73 | my $ix; |
61 | my $j; |
74 | my $j; |
62 | my %skipSec; |
75 | my %skipSec; |
63 | my @skiplist; |
76 | my @skiplist; |
64 | my $args = @ARGV; |
77 | my $args = @ARGV; |
65 | for ( $ix = 1 ; $ix < $args ; $ix++ ) { |
78 | for ( $ix = 1 ; $ix < $args ; $ix++ ) { |
66 | $j = $ix + 1; |
79 | $j = $ix + 1; |
67 | if ( $ARGV[$ix] eq "-x" ) { |
80 | if ( $ARGV[$ix] eq "-x" ) { |
68 | @skiplist = split(",",$ARGV[$j]); |
81 | @skiplist = split(",",$ARGV[$j]); |
69 | } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { |
82 | } elsif ( $ARGV[$ix] =~ /-x(\d.*)/ ) { |
70 | @skiplist = split(",",$1); |
83 | @skiplist = split(",",$1); |
71 | }
|
84 | }
|
72 | }
|
85 | }
|
73 | foreach(@skiplist) { |
86 | foreach(@skiplist) { |
74 | $skipSec{$_} = 1; |
87 | $skipSec{$_} = 1; |
75 | }
|
88 | }
|
76 | 89 | ||
77 | # Leafnode users will want /var/spool/news for this variable.
|
90 | # Leafnode users will want /var/spool/news for this variable.
|
78 | my $news = "/var/spool/news/"; |
91 | my $news = "/var/spool/news/"; |
79 | 92 | ||
80 | # How many days are we doing statistics for?
|
93 | # How many days are we doing statistics for?
|
81 | my $numdays = 30; |
94 | my $numdays = 30; |
82 | 95 | ||
83 | # no. of agents we list
|
96 | # no. of agents we list
|
84 | my $topagents = 10; |
97 | my $topagents = 10; |
85 | 98 | ||
86 | # no. of threads we want to know about
|
99 | # no. of threads we want to know about
|
87 | my $topthreads = 20; |
100 | my $topthreads = 20; |
88 | 101 | ||
89 | # no. of top or bottom posters to show
|
102 | # no. of top or bottom posters to show
|
90 | my $topposters = 20; |
103 | my $topposters = 20; |
91 | 104 | ||
92 | # no. of cross-posted threads to show
|
105 | # no. of cross-posted threads to show
|
93 | my $topcrossposts = 10; |
106 | my $topcrossposts = 10; |
94 | 107 | ||
95 | # no. of time zones to show
|
108 | # no. of time zones to show
|
96 | my $toptz = 10; |
109 | my $toptz = 10; |
97 | 110 | ||
98 | 111 | ||
99 | 112 | ||
100 | ###################### DATA STRUCTURES ######################
|
113 | ###################### DATA STRUCTURES ######################
|
101 | my $group = $newsgroup_name; |
114 | my $group = $newsgroup_name; |
102 | $group =~ s!\.!/!g; |
115 | $group =~ s!\.!/!g; |
103 | my %data; # name, count, agent, total, orig, quoted |
116 | my %data; # name, count, agent, total, orig, quoted |
104 | my %threads; # subject, count |
117 | my %threads; # subject, count |
105 | my %crossposts; # group, count |
118 | my %crossposts; # group, count |
106 | my %tz; # timezones by count |
119 | my %tz; # timezones by count |
107 | my %headers; # holds header of current article |
120 | my %headers; # holds header of current article |
108 | my %lcheader; # holds lowercase headers |
121 | my %lcheader; # holds lowercase headers |
109 | my @body; # holds body of current article |
122 | my @body; # holds body of current article |
110 | my @sig; # holds sig text; |
123 | my @sig; # holds sig text; |
111 | my $totalposts; # total no. of posts considered |
124 | my $totalposts; # total no. of posts considered |
112 | my $filename; # name of current article file |
125 | my $filename; # name of current article file |
113 | my $filesize; # size of current article file |
126 | my $filesize; # size of current article file |
114 | my $earliest; # earliest article we have found |
127 | my $earliest; # earliest article we have found |
115 | my $latest; # latest article we have found |
128 | my $latest; # latest article we have found |
116 | my $poster; # poster we are dealing with |
129 | my $poster; # poster we are dealing with |
117 | my $totsize = 0; # holds total sizes of all files |
130 | my $totsize = 0; # holds total sizes of all files |
118 | my $totheader = 0; # total size of header material |
131 | my $totheader = 0; # total size of header material |
119 | my $totbody = 0; # total size of body material |
132 | my $totbody = 0; # total size of body material |
120 | my $totsig = 0; # total size of sig material |
133 | my $totsig = 0; # total size of sig material |
121 | my $totorig = 0; # total size of original material |
134 | my $totorig = 0; # total size of original material |
122 | my $totquoted = 0; # total size of quoted material |
135 | my $totquoted = 0; # total size of quoted material |
123 | my $origposts = 0; # total no. of original posts |
136 | my $origposts = 0; # total no. of original posts |
124 | my $replies = 0; # total no. of replies |
137 | my $replies = 0; # total no. of replies |
125 | my $i; # general purpose |
138 | my $i; # general purpose |
126 | my %distinct_agent; |
139 | my %distinct_agent; |
127 | my %agents = # used to hold counts of User Agents used |
140 | my %agents = # used to hold counts of User Agents used |
128 | ( "KNode" => 0, |
141 | ( "KNode" => 0, |
129 | "Pan" => 0, |
142 | "Pan" => 0, |
130 | "Mozilla" => 0, |
143 | "Mozilla" => 0, |
131 | "Sylpheed" => 0, |
144 | "Sylpheed" => 0, |
132 | "Gnus" => 0, |
145 | "Gnus" => 0, |
133 | "Forte Agent" => 0, |
146 | "Forte Agent" => 0, |
134 | "Forte Free Agent" => 0, |
147 | "Forte Free Agent" => 0, |
135 | "MicroPlanet Gravity" => 0, |
148 | "MicroPlanet Gravity" => 0, |
136 | "Microsoft Outlook Express" => 0, |
149 | "Microsoft Outlook Express" => 0, |
137 | "Xnews" => 0, |
150 | "Xnews" => 0, |
138 | "slrn" => 0, |
151 | "slrn" => 0, |
139 | "tin" => 0, |
152 | "tin" => 0, |
140 | "rn" => 0, |
153 | "rn" => 0, |
141 | "NN" => 0, |
154 | "NN" => 0, |
142 | "MacSOUP" => 0, |
155 | "MacSOUP" => 0, |
143 | "Foorum" => 0, |
156 | "Foorum" => 0, |
144 | "MT-NewsWatcher" => 0, |
157 | "MT-NewsWatcher" => 0, |
145 | "News Rover" => 0, |
158 | "News Rover" => 0, |
146 | "WebTV" => 0, |
159 | "WebTV" => 0, |
147 | "Compuserver" => 0, |
160 | "Compuserver" => 0, |
148 | "VSoup" => 0); |
161 | "VSoup" => 0); |
149 | 162 | ||
150 | ######################## MAIN CODE ########################
|
163 | ######################## MAIN CODE ########################
|
151 | $! = 1; |
164 | $! = 1; |
152 | 165 | ||
153 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
166 | chdir("$news$group") or die "Can't cd to $news$group: $!\n"; |
154 | opendir(DIR, ".") or die "Can't open $news$group directory: $!\n"; |
167 | opendir(DIR, ".") or die "Can't open $news$group directory: $!\n"; |
155 | while (defined($filename = readdir(DIR))) { |
168 | while (defined($filename = readdir(DIR))) { |
156 | %lcheader = (); |
169 | %lcheader = (); |
157 | next unless -f $filename; # only want real files |
170 | next unless -f $filename; # only want real files |
158 | next if ($filename eq ".overview"); # real articles only |
171 | next if ($filename eq ".overview"); # real articles only |
159 | next if (-M $filename > $numdays); # only want articles <= a certain age |
172 | next if (-M $filename > $numdays); # only want articles <= a certain age |
160 | $earliest = (stat $filename)[9] unless defined ($earliest); |
173 | $earliest = (stat $filename)[9] unless defined ($earliest); |
161 | $latest = (stat $filename)[9] unless defined ($latest); |
174 | $latest = (stat $filename)[9] unless defined ($latest); |
162 | &getarticle($filename); # read in the article |
175 | &getarticle($filename); # read in the article |
163 | &getdata; # grab the data from the article |
176 | &getdata; # grab the data from the article |
164 | $totalposts++; # bump count of articles considered |
177 | $totalposts++; # bump count of articles considered |
165 | }
|
178 | }
|
166 | closedir(DIR); # finished with the directory |
179 | closedir(DIR); # finished with the directory |
167 | # post-processing
|
180 | # post-processing
|
168 | &countagents; # count agents, collapsing versions |
181 | &countagents; # count agents, collapsing versions |
169 | &fixpercent; # check percentages orig/total for posters |
182 | &fixpercent; # check percentages orig/total for posters |
170 | 183 | ||
171 | &writedata; |
184 | &writedata; |
172 | 185 | ||
173 | #################### DISPLAY RESULTS #####################
|
186 | #################### DISPLAY RESULTS #####################
|
174 | print "=" x 76, "\n"; |
187 | print "=" x 76, "\n"; |
175 | printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
188 | printf "%s\n", ¢red("Analysis of posts to $newsgroup_name", 76); |
176 | print "=" x 76, "\n"; |
189 | print "=" x 76, "\n"; |
177 | printf "%s\n", ¢red("(stats compiled with a script by Garry Knight)", 76); |
190 | printf "%s\n", ¢red("(stats compiled with a script by Garry Knight et al.)", 76); |
178 | print "\n\n"; |
191 | print "\n\n"; |
179 | printf "Total posts considered: %s over %d days\n", |
192 | printf "Total posts considered: %s over %d days\n", |
180 | commify($totalposts), $numdays; |
193 | commify($totalposts), $numdays; |
181 | printf "Earliest article: %s\n", scalar localtime($earliest); |
194 | printf "Earliest article: %s\n", scalar localtime($earliest); |
182 | printf "Latest article: %s\n", scalar localtime($latest); |
195 | printf "Latest article: %s\n", scalar localtime($latest); |
183 | printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies); |
196 | printf "Original articles: %s, replies: %s\n", commify($origposts), commify($replies); |
184 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
197 | printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize), |
185 | commify(int($totsize / 1024)), $totsize / 1048576; # |
198 | commify(int($totsize / 1024)), $totsize / 1048576; # |
186 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
199 | printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n", |
187 | commify(int($totalposts / $numdays)), |
200 | commify(int($totalposts / $numdays)), |
188 | $totsize / $numdays / 1048576, commify(int($totsize / $totalposts)); |
201 | $totsize / $numdays / 1048576, commify(int($totsize / $totalposts)); |
189 | my $count = keys %data; |
202 | my $count = keys %data; |
190 | printf "Total headers: %s KB bodies: %s KB\n", |
203 | printf "Total headers: %s KB bodies: %s KB\n", |
191 | commify(int($totheader / 1024)), commify(int($totbody / 1024)); |
204 | commify(int($totheader / 1024)), commify(int($totbody / 1024)); |
192 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
205 | printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n", |
193 | commify(int($totquoted / 1024)), commify(int($totorig / 1024)), |
206 | commify(int($totquoted / 1024)), commify(int($totorig / 1024)), |
194 | ($totorig * 100) / ($totorig + $totquoted), commify(int($totsig / 1024)); |
207 | ($totorig * 100) / ($totorig + $totquoted), commify(int($totsig / 1024)); |
195 | printf "Total number of posters: %s, average %s bytes per poster\n", commify($count), |
208 | printf "Total number of posters: %s, average %s bytes per poster\n", commify($count), |
196 | commify(int($totsize / $count)); #/ |
209 | commify(int($totsize / $count)); #/ |
197 | $count = keys %threads; |
210 | $count = keys %threads; |
198 | printf "Total number of threads: %s, average %s bytes per thread\n", commify($count), |
211 | printf "Total number of threads: %s, average %s bytes per thread\n", commify($count), |
199 | commify(int($totsize / $count)); #/ |
212 | commify(int($totsize / $count)); #/ |
200 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
213 | printf "Total number of User-Agents: %d\n", scalar keys %agents; |
201 | print "\n", "=" x 76, "\n"; |
214 | print "\n", "=" x 76, "\n"; |
202 | 215 | ||
203 | ###############################
|
216 | ###############################
|
204 | # show posters by article count Sec 1;
|
217 | # show posters by article count Sec 1;
|
205 | ###############################
|
218 | ###############################
|
206 | unless ( $skipSec{1} ) { |
219 | unless ( $skipSec{1} ) { |
207 | if (keys %data < $topposters) { |
220 | if (keys %data < $topposters) { |
208 | $count = keys %data; |
221 | $count = keys %data; |
209 | } else { |
222 | } else { |
210 | $count = $topposters; |
223 | $count = $topposters; |
211 | }
|
224 | }
|
212 | printf "%s\n", ¢red("Top $count posters by number of articles", 76); |
225 | printf "%s\n", ¢red("Top $count posters by number of articles", 76); |
213 | print "=" x 76, "\n"; |
226 | print "=" x 76, "\n"; |
214 | $i = 0; |
227 | $i = 0; |
215 | foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) { |
228 | foreach $poster (sort {$data{$b}{count} <=> $data{$a}{count}} keys %data) { |
216 | my $name = substr($poster, 0, 65); |
229 | my $name = substr($poster, 0, 65); |
217 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count}; |
230 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{count}; |
218 | last if (++$i == $count); |
231 | last if (++$i == $count); |
219 | }
|
232 | }
|
220 | print "\n", "=" x 76, "\n"; |
233 | print "\n", "=" x 76, "\n"; |
221 | }
|
234 | }
|
222 | 235 | ||
223 | ################################
|
236 | ################################
|
224 | # show posters by size in Kbytes Sec 2;
|
237 | # show posters by size in Kbytes Sec 2;
|
225 | ################################
|
238 | ################################
|
226 | unless ( $skipSec{2} ) { |
239 | unless ( $skipSec{2} ) { |
227 | if (keys %data < $topposters) { |
240 | if (keys %data < $topposters) { |
228 | $count = keys %data; |
241 | $count = keys %data; |
229 | } else { |
242 | } else { |
230 | $count = $topposters; |
243 | $count = $topposters; |
231 | }
|
244 | }
|
232 | printf "%s\n", ¢red("Top $count posters by article size in Kbytes", 76); |
245 | printf "%s\n", ¢red("Top $count posters by article size in Kbytes", 76); |
233 | print "=" x 76, "\n"; |
246 | print "=" x 76, "\n"; |
234 | $i = 0; |
247 | $i = 0; |
235 | foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) { |
248 | foreach $poster (sort {$data{$b}{size} <=> $data{$a}{size}} keys %data) { |
236 | my $name = substr($poster, 0, 62); |
249 | my $name = substr($poster, 0, 62); |
237 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024; #/ |
250 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{size} / 1024; #/ |
238 | last if (++$i == $count); |
251 | last if (++$i == $count); |
239 | }
|
252 | }
|
240 | print "\n", "=" x 76, "\n"; |
253 | print "\n", "=" x 76, "\n"; |
241 | }
|
254 | }
|
242 | 255 | ||
243 | ####################################
|
256 | ####################################
|
244 | # show top posters for original text
|
257 | # show top posters for original text
|
245 | ####################################
|
258 | ####################################
|
246 | unless ( $skipSec{3} ) { |
259 | unless ( $skipSec{3} ) { |
247 | if (keys %data < $topposters) { |
260 | if (keys %data < $topposters) { |
248 | $count = keys %data; |
261 | $count = keys %data; |
249 | } else { |
262 | } else { |
250 | $count = $topposters; |
263 | $count = $topposters; |
251 | }
|
264 | }
|
252 | printf "%s\n", ¢red("Top $count responders by original text (> 5 posts)", 76); |
265 | printf "%s\n", ¢red("Top $count responders by original text (> 5 posts)", 76); |
253 | print "=" x 76, "\n"; |
266 | print "=" x 76, "\n"; |
254 | $i = 0; |
267 | $i = 0; |
255 | foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) { |
268 | foreach $poster (sort { $data{$b}{percent} <=> $data{$a}{percent} } keys %data) { |
256 | next if $data{$poster}{quoted} == 0; |
269 | next if $data{$poster}{quoted} == 0; |
257 | next if $data{$poster}{count} < 5; |
270 | next if $data{$poster}{count} < 5; |
258 | my $name = substr($poster, 0, 63); |
271 | my $name = substr($poster, 0, 63); |
259 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
272 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
260 | last if (++$i == $count); |
273 | last if (++$i == $count); |
261 | }
|
274 | }
|
262 | print "\n", "=" x 76, "\n"; |
275 | print "\n", "=" x 76, "\n"; |
263 | }
|
276 | }
|
264 | 277 | ||
265 | #######################################
|
278 | #######################################
|
266 | # show bottom posters for original text
|
279 | # show bottom posters for original text
|
267 | #######################################
|
280 | #######################################
|
268 | unless ( $skipSec{4} ) { |
281 | unless ( $skipSec{4} ) { |
269 | if (keys %data < $topposters) { |
282 | if (keys %data < $topposters) { |
270 | $count = keys %data; |
283 | $count = keys %data; |
271 | } else { |
284 | } else { |
272 | $count = $topposters; |
285 | $count = $topposters; |
273 | }
|
286 | }
|
274 | printf "%s\n", ¢red("Bottom $count responders by original text (> 5 posts)", 76); |
287 | printf "%s\n", ¢red("Bottom $count responders by original text (> 5 posts)", 76); |
275 | print "=" x 76, "\n"; |
288 | print "=" x 76, "\n"; |
276 | $i = 0; |
289 | $i = 0; |
277 | foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) { |
290 | foreach $poster (sort { $data{$a}{percent} <=> $data{$b}{percent} } keys %data) { |
278 | next if $data{$poster}{quoted} == 0; |
291 | next if $data{$poster}{quoted} == 0; |
279 | next if $data{$poster}{count} < 5; |
292 | next if $data{$poster}{count} < 5; |
280 | my $name = substr($poster, 0, 63); |
293 | my $name = substr($poster, 0, 63); |
281 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
294 | printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad($poster, 63, "."), $data{$poster}{percent}; |
282 | last if (++$i == $count); |
295 | last if (++$i == $count); |
283 | }
|
296 | }
|
284 | print "\n", "=" x 76, "\n"; |
297 | print "\n", "=" x 76, "\n"; |
285 | }
|
298 | }
|
286 | 299 | ||
287 | ####################################
|
300 | ####################################
|
288 | # show threads by number of articles
|
301 | # show threads by number of articles
|
289 | ####################################
|
302 | ####################################
|
290 | unless ( $skipSec{5} ) { |
303 | unless ( $skipSec{5} ) { |
291 | if (keys %threads < $topthreads) { |
304 | if (keys %threads < $topthreads) { |
292 | $count = keys %threads; |
305 | $count = keys %threads; |
293 | } else { |
306 | } else { |
294 | $count = $topthreads; |
307 | $count = $topthreads; |
295 | }
|
308 | }
|
296 | printf "%s\n", ¢red("Top $count threads by no. of articles", 76); |
309 | printf "%s\n", ¢red("Top $count threads by no. of articles", 76); |
297 | print "=" x 76, "\n"; |
310 | print "=" x 76, "\n"; |
298 | $i = 0; |
311 | $i = 0; |
299 | foreach my $thread (sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads) { |
312 | foreach my $thread (sort {$threads{$b}{count} <=> $threads{$a}{count}} keys %threads) { |
300 | my $name = substr($thread, 0, 65); |
313 | my $name = substr($thread, 0, 65); |
301 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{count}; |
314 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{count}; |
302 | last if (++$i == $count); |
315 | last if (++$i == $count); |
303 | }
|
316 | }
|
304 | print "\n", "=" x 76, "\n"; |
317 | print "\n", "=" x 76, "\n"; |
305 | }
|
318 | }
|
306 | ################################
|
319 | ################################
|
307 | # show threads by size in Kbytes
|
320 | # show threads by size in Kbytes
|
308 | ################################
|
321 | ################################
|
309 | unless ( $skipSec{6} ) { |
322 | unless ( $skipSec{6} ) { |
310 | if (keys %threads < $topthreads) { |
323 | if (keys %threads < $topthreads) { |
311 | $count = keys %threads; |
324 | $count = keys %threads; |
312 | } else { |
325 | } else { |
313 | $count = $topthreads; |
326 | $count = $topthreads; |
314 | }
|
327 | }
|
315 | printf "%s\n", ¢red("Top $count threads by size in KB", 76); |
328 | printf "%s\n", ¢red("Top $count threads by size in KB", 76); |
316 | print "=" x 76, "\n"; |
329 | print "=" x 76, "\n"; |
317 | $i = 0; |
330 | $i = 0; |
318 | foreach my $thread (sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads) { |
331 | foreach my $thread (sort {$threads{$b}{size} <=> $threads{$a}{size}} keys %threads) { |
319 | my $name = substr($thread, 0, 65); |
332 | my $name = substr($thread, 0, 65); |
320 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{size} / 1024; #/ |
333 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $threads{$thread}{size} / 1024; #/ |
321 | last if (++$i == $count); |
334 | last if (++$i == $count); |
322 | }
|
335 | }
|
323 | print "\n", "=" x 76, "\n"; |
336 | print "\n", "=" x 76, "\n"; |
324 | }
|
337 | }
|
325 | 338 | ||
326 | #################################
|
339 | #################################
|
327 | # show top 10 cross-posted groups
|
340 | # show top 10 cross-posted groups
|
328 | #################################
|
341 | #################################
|
329 | unless ( $skipSec{7} ) { |
342 | unless ( $skipSec{7} ) { |
330 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
343 | delete $crossposts{"$newsgroup_name"}; # don't include ours |
331 | if (keys %crossposts < $topcrossposts) { |
344 | if (keys %crossposts < $topcrossposts) { |
332 | $count = keys %crossposts; |
345 | $count = keys %crossposts; |
333 | } else { |
346 | } else { |
334 | $count = $topcrossposts; |
347 | $count = $topcrossposts; |
335 | }
|
348 | }
|
336 | printf "%s\n", ¢red("Top $count cross-posted groups", 76); |
349 | printf "%s\n", ¢red("Top $count cross-posted groups", 76); |
337 | print "=" x 76, "\n"; |
350 | print "=" x 76, "\n"; |
338 | $i = 0; |
351 | $i = 0; |
339 | foreach my $name (sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts) { |
352 | foreach my $name (sort {$crossposts{$b} <=> $crossposts{$a}} keys %crossposts) { |
340 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $crossposts{$name}; |
353 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($name, 63, "."), $crossposts{$name}; |
341 | last if (++$i == $count); |
354 | last if (++$i == $count); |
342 | }
|
355 | }
|
343 | print "\n", "=" x 76, "\n"; |
356 | print "\n", "=" x 76, "\n"; |
344 | }
|
357 | }
|
345 | #######################
|
358 | #######################
|
346 | #show agents and counts
|
359 | #show agents and counts
|
347 | #######################
|
360 | #######################
|
348 | unless ( $skipSec{8} ) { |
361 | unless ( $skipSec{8} ) { |
349 | if (keys %agents < $topagents) { |
362 | if (keys %agents < $topagents) { |
350 | $count = keys %agents; |
363 | $count = keys %agents; |
351 | } else { |
364 | } else { |
352 | $count = $topagents; |
365 | $count = $topagents; |
353 | }
|
366 | }
|
354 | printf "%s\n", ¢red("Top $count User Agents by poster", 76); |
367 | printf "%s\n", ¢red("Top $count User Agents by poster", 76); |
355 | print "=" x 76, "\n"; |
368 | print "=" x 76, "\n"; |
356 | $i = 0; |
369 | $i = 0; |
357 | foreach my $agent (sort {$agents{$b} <=> $agents{$a}} keys %agents) { |
370 | foreach my $agent (sort {$agents{$b} <=> $agents{$a}} keys %agents) { |
358 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($agent, 63, "."), $agents{$agent}; |
371 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($agent, 63, "."), $agents{$agent}; |
359 | last if (++$i == $count); |
372 | last if (++$i == $count); |
360 | }
|
373 | }
|
361 | print "\n", "=" x 76, "\n"; |
374 | print "\n", "=" x 76, "\n"; |
362 | }
|
375 | }
|
363 | 376 | ||
364 | #######################
|
377 | #######################
|
365 | #show distinct agents
|
378 | #show distinct agents
|
366 | #######################
|
379 | #######################
|
367 | unless ( $skipSec{9} ) { |
380 | unless ( $skipSec{9} ) { |
368 | if (keys %distinct_agent < $topagents) { |
381 | if (keys %distinct_agent < $topagents) { |
369 | $count = keys %distinct_agent; |
382 | $count = keys %distinct_agent; |
370 | } else { |
383 | } else { |
371 | $count = $topagents; |
384 | $count = $topagents; |
372 | }
|
385 | }
|
373 | printf "%s\n", ¢red("Top $count User Agents by number of posts", 76); |
386 | printf "%s\n", ¢red("Top $count User Agents by number of posts", 76); |
374 | print "=" x 76, "\n"; |
387 | print "=" x 76, "\n"; |
375 | $i = 0; |
388 | $i = 0; |
376 | foreach my $agent (sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent) { |
389 | foreach my $agent (sort {$distinct_agent{$b} <=> $distinct_agent{$a}} keys %distinct_agent) { |
377 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($agent, 58, "."), $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100); |
390 | printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad($agent, 58, "."), $distinct_agent{$agent}, (( $distinct_agent{$agent} / $totalposts ) * 100); |
378 | last if (++$i == $count); |
391 | last if (++$i == $count); |
379 | }
|
392 | }
|
380 | print "\n", "=" x 76, "\n"; |
393 | print "\n", "=" x 76, "\n"; |
381 | }
|
394 | }
|
382 | 395 | ||
383 | ##########################
|
396 | ##########################
|
384 | #show timezones and counts
|
397 | #show timezones and counts
|
385 | ##########################
|
398 | ##########################
|
386 | unless ( $skipSec{10} ) { |
399 | unless ( $skipSec{10} ) { |
387 | if (keys %tz < $toptz) { |
400 | if (keys %tz < $toptz) { |
388 | $count = keys %tz; |
401 | $count = keys %tz; |
389 | } else { |
402 | } else { |
390 | $count = $toptz; |
403 | $count = $toptz; |
391 | }
|
404 | }
|
392 | printf "%s\n", ¢red("Top 10 time zones", 76); |
405 | printf "%s\n", ¢red("Top 10 time zones", 76); |
393 | print "=" x 76, "\n"; |
406 | print "=" x 76, "\n"; |
394 | $i = 0; |
407 | $i = 0; |
395 | foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) { |
408 | foreach my $zone (sort {$tz{$b} <=> $tz{$a}} keys %tz) { |
396 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone}; |
409 | printf "%2d: %-63s : %6d\n", $i + 1, rpad($zone, 63, "."), $tz{$zone}; |
397 | last if (++$i == $count); |
410 | last if (++$i == $count); |
398 | }
|
411 | }
|
399 | print "\n", "=" x 76, "\n"; |
412 | print "\n", "=" x 76, "\n"; |
400 | }
|
413 | }
|
401 | 414 | ||
402 | 415 | ||
403 | ################################ SUBROUTINES ################################
|
416 | ################################ SUBROUTINES ################################
|
404 | 417 | ||
405 | 418 | ||
406 | #######################################
|
419 | #######################################
|
407 | # get current article's header and body
|
420 | # get current article's header and body
|
408 | #######################################
|
421 | #######################################
|
409 | sub getarticle { |
422 | sub getarticle { |
410 | %headers = (); # dump old headers |
423 | %headers = (); # dump old headers |
411 | my $filename = shift; # get the name of the file |
424 | my $filename = shift; # get the name of the file |
412 | # get stats about the file itself
|
425 | # get stats about the file itself
|
413 | $filesize = -s $filename; # get total size of file |
426 | $filesize = -s $filename; # get total size of file |
414 | $totsize += $filesize; # bump total sizes of all files |
427 | $totsize += $filesize; # bump total sizes of all files |
415 | 428 | ||
416 | my $mtime = (stat $filename)[9]; |
429 | my $mtime = (stat $filename)[9]; |
417 | if ( $mtime < $earliest ) { |
430 | if ( $mtime < $earliest ) { |
418 | $earliest = $mtime; |
431 | $earliest = $mtime; |
419 | } elsif ( $mtime > $latest ) { |
432 | } elsif ( $mtime > $latest ) { |
420 | $latest = $mtime; |
433 | $latest = $mtime; |
421 | }
|
434 | }
|
422 | 435 | ||
423 | # now read the file
|
436 | # now read the file
|
424 | open(FILE, $filename) or die "Can't open $filename: $!\n"; |
437 | open(FILE, $filename) or die "Can't open $filename: $!\n"; |
425 | while (<FILE>) { |
438 | while (<FILE>) { |
426 | $totheader += length($_); # bump total header size |
439 | $totheader += length($_); # bump total header size |
427 | last if (/^\s*$/); # end of header? |
440 | last if (/^\s*$/); # end of header? |
428 | if (/^([^:\s]*):\s+(.*)/) { |
441 | if (/^([^:\s]*):\s+(.*)/) { |
429 | my($key,$val) = ($1,$2); |
442 | my($key,$val) = ($1,$2); |
430 | $headers{$key} = $val; |
443 | $headers{$key} = decode('MIME-Header', $val); |
431 | $lcheader{clean(lc($key))} = clean($val); |
444 | $lcheader{clean(lc($key))} = clean($val); |
432 | }
|
445 | }
|
433 | }
|
446 | }
|
434 | @body = <FILE>; # slurp up body |
447 | @body = <FILE>; # slurp up body |
435 | close(FILE); |
448 | close(FILE); |
436 | } # getarticle |
449 | } # getarticle |
437 | 450 | ||
438 | ###################################
|
451 | ###################################
|
439 | # get data from the current article
|
452 | # get data from the current article
|
440 | ###################################
|
453 | ###################################
|
441 | sub getdata { |
454 | sub getdata { |
442 | #### First, analyse header fields ####
|
455 | #### First, analyse header fields ####
|
443 | 456 | ||
444 | # Set up this poster if not defined, get counts, sizes
|
457 | # Set up this poster if not defined, get counts, sizes
|
445 | $poster = $headers{From}; # get the poster's name |
458 | $poster = encode('UTF-8', $headers{From}); # get the poster's name |
446 | if (!defined($data{$poster})) { # seen this one before? |
459 | if (!defined($data{$poster})) { # seen this one before? |
447 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
460 | $data{$poster}{agent} = 'Unknown'; # comes after For: field |
448 | $data{$poster}{orig} = 0; |
461 | $data{$poster}{orig} = 0; |
449 | $data{$poster}{quoted} = 0; |
462 | $data{$poster}{quoted} = 0; |
450 | }
|
463 | }
|
451 | $data{$poster}{count}++; # bump count for this poster |
464 | $data{$poster}{count}++; # bump count for this poster |
452 | $data{$poster}{size} += $filesize; # total size of file |
465 | $data{$poster}{size} += $filesize; # total size of file |
453 | 466 | ||
454 | # The User-Agent and/or X-Newsreader fields
|
467 | # The User-Agent and/or X-Newsreader fields
|
455 | # for User-Agent by poster
|
468 | # for User-Agent by poster
|
456 | if (defined $lcheader{"user-agent"}) { |
469 | if (defined $lcheader{"user-agent"}) { |
457 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
470 | $data{$poster}{agent} = $lcheader{"user-agent"}; |
458 | }
|
471 | }
|
459 | if (defined $lcheader{"x-newsreader"}) { |
472 | if (defined $lcheader{"x-newsreader"}) { |
460 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
473 | $data{$poster}{agent} = $lcheader{"x-newsreader"}; |
461 | }
|
474 | }
|
462 | 475 | ||
463 | # The User Agent for User-Agent by number of posts
|
476 | # The User Agent for User-Agent by number of posts
|
464 | my $UA = "unknown"; |
477 | my $UA = "unknown"; |
465 | foreach my $keys ( keys %lcheader ) |
478 | foreach my $keys ( keys %lcheader ) |
466 | {
|
479 | {
|
467 | if (defined $lcheader{'user-agent'}) |
480 | if (defined $lcheader{'user-agent'}) |
468 | {
|
481 | {
|
469 | $UA = $lcheader{'user-agent'}; |
482 | $UA = $lcheader{'user-agent'}; |
470 | }
|
483 | }
|
471 | elsif (defined $lcheader{"x-newsreader"}) |
484 | elsif (defined $lcheader{"x-newsreader"}) |
472 | {
|
485 | {
|
473 | $UA = $lcheader{"x-newsreader"}; |
486 | $UA = $lcheader{"x-newsreader"}; |
474 | }
|
487 | }
|
475 | elsif (defined $lcheader{'x-mailer'}) |
488 | elsif (defined $lcheader{'x-mailer'}) |
476 | {
|
489 | {
|
477 | $UA = $lcheader{'x-mailer'}; |
490 | $UA = $lcheader{'x-mailer'}; |
478 | }
|
491 | }
|
479 | elsif ((defined $lcheader{'organization'}) && |
492 | elsif ((defined $lcheader{'organization'}) && |
480 | ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) |
493 | ($lcheader{'organization'} =~ /groups\.google|AOL|Supernews|WebTV|compuserve/)) |
481 | {
|
494 | {
|
482 | $UA = $lcheader{'organization'}; |
495 | $UA = $lcheader{'organization'}; |
483 | }
|
496 | }
|
484 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
497 | elsif ( $lcheader{'message-id'} =~ /pine/i ) |
485 | {
|
498 | {
|
486 | $UA = "Pine"; |
499 | $UA = "Pine"; |
487 | } ## Hopefully found UA, else set to unknown |
500 | } ## Hopefully found UA, else set to unknown |
488 | }
|
501 | }
|
489 | 502 | ||
490 | 503 | ||
491 | $UA = clean($UA); |
504 | $UA = clean($UA); |
492 | $UA = get_agent($UA); |
505 | $UA = get_agent($UA); |
493 | 506 | ||
494 | 507 | ||
495 | sub get_agent { |
508 | sub get_agent { |
496 | my $raw = shift; |
509 | my $raw = shift; |
497 | my $agent = $raw; |
510 | my $agent = $raw; |
498 | 511 | ||
499 | ## strip http
|
512 | ## strip http
|
500 | if ( $raw =~ /.*http.*/ ) { |
513 | if ( $raw =~ /.*http.*/ ) { |
501 | $raw =~ s!posted via!!i; |
514 | $raw =~ s!posted via!!i; |
502 | $raw =~ s!http://!!g; |
515 | $raw =~ s!http://!!g; |
503 | $raw =~ s!/!!g; |
516 | $raw =~ s!/!!g; |
504 | $raw =~ s! !!g; |
517 | $raw =~ s! !!g; |
505 | }
|
518 | }
|
506 | 519 | ||
507 | ## Fix Outlook from Mac
|
520 | ## Fix Outlook from Mac
|
508 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;} |
521 | if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g;} |
509 | 522 | ||
510 | ## Pick out the popular agents
|
523 | ## Pick out the popular agents
|
511 | if ( $raw =~ /(outlook express)/i || |
524 | if ( $raw =~ /(outlook express)/i || |
512 | $raw =~ /(microplanet gravity)/i || |
525 | $raw =~ /(microplanet gravity)/i || |
513 | $raw =~ /(news rover)/i || |
526 | $raw =~ /(news rover)/i || |
514 | $raw =~ /(forte agent)/i || |
527 | $raw =~ /(forte agent)/i || |
515 | $raw =~ /(forte free agent)/i |
528 | $raw =~ /(forte free agent)/i |
516 | )
|
529 | )
|
517 | {
|
530 | {
|
518 | $agent = $1; |
531 | $agent = $1; |
519 | }
|
532 | }
|
520 | elsif ( $raw =~ /^( |
533 | elsif ( $raw =~ /^( |
521 | pan |
534 | pan |
522 | |sylpheed
|
535 | |sylpheed
|
523 | |slrn
|
536 | |slrn
|
524 | |mozilla
|
537 | |mozilla
|
525 | |knode
|
538 | |knode
|
526 | |tin
|
539 | |tin
|
527 | |hamster
|
540 | |hamster
|
528 | |xrn
|
541 | |xrn
|
529 | |xnews
|
542 | |xnews
|
530 | |aol
|
543 | |aol
|
531 | |gnus
|
544 | |gnus
|
532 | |krn
|
545 | |krn
|
533 | |macsoup
|
546 | |macsoup
|
534 | |messenger
|
547 | |messenger
|
535 | |openxp
|
548 | |openxp
|
536 | |pine
|
549 | |pine
|
537 | |thoth
|
550 | |thoth
|
538 | |turnpike
|
551 | |turnpike
|
539 | |winvn
|
552 | |winvn
|
540 | |vsoup
|
553 | |vsoup
|
541 | |google
|
554 | |google
|
542 | |supernews
|
555 | |supernews
|
543 | |nn
|
556 | |nn
|
544 | |rn
|
557 | |rn
|
545 | |007
|
558 | |007
|
546 | |webtv
|
559 | |webtv
|
547 | |compuserve
|
560 | |compuserve
|
548 | )/ix ) |
561 | )/ix ) |
549 | {
|
562 | {
|
550 | $agent = $1; |
563 | $agent = $1; |
551 | }
|
564 | }
|
552 | else
|
565 | else
|
553 | {
|
566 | {
|
554 | ## Clean up unknown agents
|
567 | ## Clean up unknown agents
|
555 | if ( $raw =~ m!^(.*?)/! ) { |
568 | if ( $raw =~ m!^(.*?)/! ) { |
556 | $agent = $1; |
569 | $agent = $1; |
557 | }
|
570 | }
|
558 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
571 | elsif ( $raw =~ /^(\w*)\d.*/ ) |
559 | {
|
572 | {
|
560 | $agent = $1; |
573 | $agent = $1; |
561 | }
|
574 | }
|
562 | }
|
575 | }
|
563 | 576 | ||
564 | $distinct_agent{$agent}++; |
577 | $distinct_agent{$agent}++; |
565 | return $agent; |
578 | return $agent; |
566 | }
|
579 | }
|
567 | 580 | ||
568 | 581 | ||
569 | # Get all cross-posted newsgroups
|
582 | # Get all cross-posted newsgroups
|
570 | for (split /,/, $headers{"Newsgroups"}) { |
583 | for (split /,/, $headers{"Newsgroups"}) { |
571 | $crossposts{$_}++; # bump count for each |
584 | $crossposts{$_}++; # bump count for each |
572 | }
|
585 | }
|
573 | 586 | ||
574 | # Get threads
|
587 | # Get threads
|
575 | my $thread = $headers{"Subject"}; |
588 | my $thread = encode('UTF-8', $headers{"Subject"}); |
576 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
589 | $thread =~ s/^re: //i; # Remove Re: or re: at start |
577 | $thread =~ s/\s+/ /g; # collapse whitespace |
590 | $thread =~ s/\s+/ /g; # collapse whitespace |
578 | $threads{$thread}{count} += 1; # bump count of this subject |
591 | $threads{$thread}{count} += 1; # bump count of this subject |
579 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
592 | $threads{$thread}{size} += $filesize; # bump bytes for this thread |
580 | 593 | ||
581 | # Is this an original post or a reply?
|
594 | # Is this an original post or a reply?
|
582 | if (defined $headers{"References"}) { |
595 | if (defined $headers{"References"}) { |
583 | $replies++; |
596 | $replies++; |
584 | } else { |
597 | } else { |
585 | $origposts++; |
598 | $origposts++; |
586 | }
|
599 | }
|
587 | 600 | ||
588 | # Get the time zone
|
601 | # Get the time zone
|
589 | $_ = $headers{"Date"}; |
602 | $_ = $headers{"Date"}; |
590 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
603 | my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/; |
591 | if (($tz =~ /UTC/) or ($tz =~ /GMT/) or ($tz =~ /0000/)) { |
604 | if (($tz =~ /UTC/) or ($tz =~ /GMT/) or ($tz =~ /0000/)) { |
592 | $tz = "UTC"; |
605 | $tz = "UTC"; |
593 | }
|
606 | }
|
594 | $tz{$tz}++; |
607 | $tz{$tz}++; |
595 | 608 | ||
596 | #### Now analyse the body text ####
|
609 | #### Now analyse the body text ####
|
597 | my $insig = 0; |
610 | my $insig = 0; |
598 | for (@body) { |
611 | for (@body) { |
599 | $totbody += length($_); # bump total body size |
612 | $totbody += length($_); # bump total body size |
600 | next if (/^$>/); # don't count blank lines in body |
613 | next if (/^$>/); # don't count blank lines in body |
601 | if ($insig == 1) { |
614 | if ($insig == 1) { |
602 | $totsig += length($_); # bump total sig size |
615 | $totsig += length($_); # bump total sig size |
603 | # Bill Unruh uses ] quotes, and another poster uses ::
|
616 | # Bill Unruh uses ] quotes, and another poster uses ::
|
604 | } elsif (/^\s*[>\]]/ or /^\s*::/) { # are we in a quote line? |
617 | } elsif (/^\s*[>\]]/ or /^\s*::/) { # are we in a quote line? |
605 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
618 | $data{$poster}{quoted} += length($_); # bump count of quoted chrs |
606 | $totquoted += length($_); |
619 | $totquoted += length($_); |
607 | } elsif (/-- /) { |
620 | } elsif (/-- /) { |
608 | $insig = 1; |
621 | $insig = 1; |
609 | } else { |
622 | } else { |
610 | # we must be processing an original line
|
623 | # we must be processing an original line
|
611 | $data{$poster}{orig} += length($_); # bump count of original chrs |
624 | $data{$poster}{orig} += length($_); # bump count of original chrs |
612 | $totorig += length($_); |
625 | $totorig += length($_); |
613 | }
|
626 | }
|
614 | } # end for (@body) |
627 | } # end for (@body) |
615 | 628 | ||
616 | } # getdata |
629 | } # getdata |
617 | 630 | ||
618 | ########################################
|
631 | ########################################
|
619 | # Count the User-Agents used, collapsing
|
632 | # Count the User-Agents used, collapsing
|
620 | # different versions into one per agent.
|
633 | # different versions into one per agent.
|
621 | ########################################
|
634 | ########################################
|
622 | sub countagents { |
635 | sub countagents { |
623 | POSTER:
|
636 | POSTER:
|
624 | foreach $poster (keys %data) { |
637 | foreach $poster (keys %data) { |
625 | foreach my $agent_name (keys %distinct_agent) { # check against known ones |
638 | foreach my $agent_name (keys %distinct_agent) { # check against known ones |
626 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) { |
639 | if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ ) { |
627 | $agents{$agent_name}++; |
640 | $agents{$agent_name}++; |
628 | next POSTER; |
641 | next POSTER; |
629 | }
|
642 | }
|
630 | }
|
643 | }
|
631 | $agents{$data{$poster}{agent}}++; |
644 | $agents{$data{$poster}{agent}}++; |
632 | }
|
645 | }
|
633 | } # countagents |
646 | } # countagents |
634 | 647 | ||
635 | ############################################
|
648 | ############################################
|
636 | # set orig/total percentages for all posters
|
649 | # set orig/total percentages for all posters
|
637 | ############################################
|
650 | ############################################
|
638 | sub fixpercent { |
651 | sub fixpercent { |
639 | foreach $poster (keys %data) { |
652 | foreach $poster (keys %data) { |
640 | my $percent = 100; |
653 | my $percent = 100; |
641 | if (($data{$poster}{orig} != 0) and ($data{$poster}{quoted} != 0)) { |
654 | if (($data{$poster}{orig} != 0) and ($data{$poster}{quoted} != 0)) { |
642 | $percent = $data{$poster}{orig} * 100 / ($data{$poster}{quoted} + $data{$poster}{orig}); #/ |
655 | $percent = $data{$poster}{orig} * 100 / ($data{$poster}{quoted} + $data{$poster}{orig}); #/ |
643 | } elsif ($data{$poster}{orig} == 0) { |
656 | } elsif ($data{$poster}{orig} == 0) { |
644 | $percent = 0; |
657 | $percent = 0; |
645 | }
|
658 | }
|
646 | $data{$poster}{percent} = $percent; |
659 | $data{$poster}{percent} = $percent; |
647 | }
|
660 | }
|
648 | }
|
661 | }
|
649 | 662 | ||
650 | ##############################
|
663 | ##############################
|
651 | # right pad a string with '.'s
|
664 | # right pad a string with '.'s
|
652 | ##############################
|
665 | ##############################
|
653 | sub rpad { |
666 | sub rpad { |
654 | # get text to pad, length to pad, pad chr
|
667 | # get text to pad, length to pad, pad chr
|
655 | my ($text, $pad_len, $pad_chr) = @_; |
668 | my ($text, $pad_len, $pad_chr) = @_; |
656 | if (length($text) > $pad_len) { |
669 | if (length($text) > $pad_len) { |
657 | $text = substr($text, 0, $pad_len); |
670 | $text = substr($text, 0, $pad_len); |
658 | }
|
671 | }
|
659 | my $padded = $text . $pad_chr x ( $pad_len - length( $text ) ); |
672 | my $padded = $text . $pad_chr x ( $pad_len - length( $text ) ); |
660 | return $padded; |
673 | return $padded; |
661 | }
|
674 | }
|
662 | 675 | ||
663 | #################
|
676 | #################
|
664 | # centre a string
|
677 | # centre a string
|
665 | #################
|
678 | #################
|
666 | sub centred { |
679 | sub centred { |
667 | my ($text, $width) = @_; # text to centre, size of field to centre in |
680 | my ($text, $width) = @_; # text to centre, size of field to centre in |
668 | my $pad_len = ($width - length($text)) / 2; #/ |
681 | my $pad_len = ($width - length($text)) / 2; #/ |
669 | my $centred = " " x $pad_len . $text; |
682 | my $centred = " " x $pad_len . $text; |
670 | return $centred; |
683 | return $centred; |
671 | }
|
684 | }
|
672 | 685 | ||
673 | ##########################
|
686 | ##########################
|
674 | # put commas into a number
|
687 | # put commas into a number
|
675 | ##########################
|
688 | ##########################
|
676 | sub commify { |
689 | sub commify { |
677 | $_ = shift; |
690 | $_ = shift; |
678 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
691 | 1 while s/^(-?\d+)(\d{3})/$1,$2/; |
679 | return $_; |
692 | return $_; |
680 | }
|
693 | }
|
681 | 694 | ||
682 | #########################
|
695 | #########################
|
683 | # clean
|
696 | # clean
|
684 | #########################
|
697 | #########################
|
685 | sub clean { |
698 | sub clean { |
686 | my $dirty = shift; |
699 | my $dirty = shift; |
687 | my $clean = $dirty; |
700 | my $clean = $dirty; |
688 | $clean =~ s/^\s*//; |
701 | $clean =~ s/^\s*//; |
689 | $clean =~ s/\s*$//; |
702 | $clean =~ s/\s*$//; |
690 | 703 | ||
691 | return $clean; |
704 | return $clean; |
692 | }
|
705 | }
|
693 | 706 | ||
694 | 707 | ||
695 | sub usage { |
708 | sub usage { |
696 | 709 | ||
697 | print "usage: newstat.pl newsgroupname\n"; |
710 | print "usage: newstat.pl newsgroupname\n"; |
698 | exit 1; |
711 | exit 1; |
699 | }
|
712 | }
|
700 | 713 | ||
701 | ###################################
|
714 | ###################################
|
702 | # Write data structures to a file #
|
715 | # Write data structures to a file #
|
703 | ###################################
|
716 | ###################################
|
704 | sub writedata { |
717 | sub writedata { |
705 | open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
718 | open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n"; |
706 | print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
719 | print OUTF "Data collected from alt.os.linux.mandrake\n\n"; |
707 | print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
720 | print OUTF "Poster Data\nname : agent : count : size: orig : quoted : per cent\n"; |
708 | foreach my $name (keys %data) { |
721 | foreach my $name (keys %data) { |
709 | print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
722 | print OUTF "$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n"; |
710 | }
|
723 | }
|
711 | print OUTF "============================================================================\n"; |
724 | print OUTF "============================================================================\n"; |
712 | print OUTF "Thread subjects\n"; |
725 | print OUTF "Thread subjects\n"; |
713 | print OUTF "----------------------------------------------------------------------------\n"; |
726 | print OUTF "----------------------------------------------------------------------------\n"; |
714 | foreach my $thread (sort {"\L$a" cmp "\L$b"} keys %threads) { |
727 | foreach my $thread (sort {"\L$a" cmp "\L$b"} keys %threads) { |
715 | print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
728 | print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n"; |
716 | }
|
729 | }
|
717 | print OUTF "============================================================================\n"; |
730 | print OUTF "============================================================================\n"; |
718 | print OUTF "Cross-posts\n"; |
731 | print OUTF "Cross-posts\n"; |
719 | print OUTF "----------------------------------------------------------------------------\n"; |
732 | print OUTF "----------------------------------------------------------------------------\n"; |
720 | foreach my $name (sort keys %crossposts) { |
733 | foreach my $name (sort keys %crossposts) { |
721 | print OUTF "$name : $crossposts{$name}\n"; |
734 | print OUTF "$name : $crossposts{$name}\n"; |
722 | }
|
735 | }
|
723 | print OUTF |
736 | print OUTF |
724 | print OUTF "============================================================================\n"; |
737 | print OUTF "============================================================================\n"; |
725 | print OUTF "User agents\n"; |
738 | print OUTF "User agents\n"; |
726 | print OUTF "----------------------------------------------------------------------------\n"; |
739 | print OUTF "----------------------------------------------------------------------------\n"; |
727 | foreach my $name (sort keys %agents) { |
740 | foreach my $name (sort keys %agents) { |
728 | print OUTF "$name : $agents{$name}\n"; |
741 | print OUTF "$name : $agents{$name}\n"; |
729 | }
|
742 | }
|
730 | print OUTF "============================================================================\n"; |
743 | print OUTF "============================================================================\n"; |
731 | print OUTF "Time zones\n"; |
744 | print OUTF "Time zones\n"; |
732 | print OUTF "----------------------------------------------------------------------------\n"; |
745 | print OUTF "----------------------------------------------------------------------------\n"; |
733 | foreach my $name (sort keys %tz) { |
746 | foreach my $name (sort keys %tz) { |
734 | print OUTF "$name : $tz{$name}\n"; |
747 | print OUTF "$name : $tz{$name}\n"; |
735 | }
|
748 | }
|
736 | close OUTF; |
749 | close OUTF; |
737 | } # writedata |
750 | } # writedata |
738 | 751 |