WebSVN - LCARS - Blame - Rev 7 - /trunk/tools/network/news/newsstat/newsstat.pl

Rev	Author	Line No.	Line
7	PointedEar	1	#!/usr/bin/perl
5	PointedEar	2	use strict;
6	PointedEar	3	use warnings;
		4	use utf8;
		5	use Encode;
		6
5	PointedEar	7	#########################
6	PointedEar	8	# newsstat.pl version 0.4
5	PointedEar	9
6	PointedEar	10	############################################################################
		11	# Collect statistics about a newsgroup (specified by first argument) in
7	PointedEar	12	# the local news spool. Check all articles in the last 30-day period.
6	PointedEar	13	# Rank posters by number of posts and by volume of posts, report on top and
		14	# bottom 20 posters. Show their name, number of posts, size of posts,
		15	# percentage of quoted lines. Rank user-agents used, by poster rather than
		16	# by post. Rank top 20 threads. Rank top 10 cross-posted groups.
7	PointedEar	17	#
6	PointedEar	18	# (Numbers and paths can be configured below. -- PE)
		19	############################################################################
5	PointedEar	20
6	PointedEar	21	############################################################################
		22	# RECENT CHANGES #
		23	# 2011-07-03 PE - Use Encode to decode/encode MIME encodings
7	PointedEar	24	# - Use warnings, utf8 (just in case)
6	PointedEar	25	# - Documentation update
		26	# N/A NN - Take newsgroup name as argument
		27	# 2004-06-19 NN - newsgroup name is $ARGV[0]
		28	# - Allow command line flags for subtracting
		29	# output if not pertinent for a group
		30	# 2002-11-09 NN - Put Garry's writedata() function back in.
		31	# - added "rn" to my list of UA's
		32	# - Started using %distinct_agent for both User agent
		33	# sections
		34	# - named it newsstat.pl version 0.3
		35	# 2002-11-06 NN - Fixed the earliest/latest file problem by using
		36	# mtime rather than ctime, and simplifying the logic
		37	# 2002-11-05 NN - moved user configurations to the top
		38	# - fixed the cross-posting section
		39	# - introduced the $newsgroup_name variable which
		40	# later becomes $news$group
		41	# - changed $name to $agent_name in countagents()
7	PointedEar	42	#
6	PointedEar	43	# Contributors
		44	# -------------
		45	# NN Nomen nominandum (name to be determined later)
		46	# PE Thomas 'PointedEars' Lahn <startrek@PointedEars.de>
5	PointedEar	47
6	PointedEar	48	########### TODO #############
5	PointedEar	49	# Commas in bottom section of report
		50	# Show date the figures were compiled
		51	# No. of HTML articles (Content-Type: text/html)
		52	# No. of quoted sigs (/>\s*-- /)
		53	# Per cent of top-posted articles
		54	# Top 10 cross-posters
		55	# Top 20 news posting hosts (from Path)
		56	# Count of certain subject words: newbie, kde, burner, sendmail, etc.
		57	# Count all User Agents that each poster uses
		58	# What do we do about Bill Unruh's ] quote style?
		59	# Change the way dates/times are checked
		60	# include % share in posters by no. of arts
		61	# include % share in posters by size
		62	# Total, orig & quoted lines by user agent with per cent
6	PointedEar	63	# Take more arguments
5	PointedEar	64	#######################################################
		65
		66	###################### USER CONFIGURATIONS ############################
		67
		68	# The name of the group to do stats for
		69	my $newsgroup_name = $ARGV[0];
6	PointedEar	70	$newsgroup_name or &usage;
5	PointedEar	71
		72	# Check for removal flags
		73	my $ix;
		74	my $j;
		75	my %skipSec;
		76	my @skiplist;
		77	my $args = @ARGV;
7	PointedEar	78	for ( $ix = 1 ; $ix < $args ; $ix++ )
		79	{
		80	$j = $ix + 1;
		81	if ( $ARGV[$ix] eq "-x" )
		82	{
		83	@skiplist = split( ",", $ARGV[$j] );
		84	}
		85	elsif ( $ARGV[$ix] =~ /-x(\d.*)/ )
		86	{
		87	@skiplist = split( ",", $1 );
		88	}
5	PointedEar	89	}
7	PointedEar	90	foreach (@skiplist)
		91	{
5	PointedEar	92	$skipSec{$_} = 1;
		93	}
		94
		95	# Leafnode users will want /var/spool/news for this variable.
		96	my $news = "/var/spool/news/";
		97
		98	# How many days are we doing statistics for?
		99	my $numdays = 30;
		100
		101	# no. of agents we list
		102	my $topagents = 10;
		103
		104	# no. of threads we want to know about
		105	my $topthreads = 20;
		106
		107	# no. of top or bottom posters to show
		108	my $topposters = 20;
		109
		110	# no. of cross-posted threads to show
		111	my $topcrossposts = 10;
		112
		113	# no. of time zones to show
		114	my $toptz = 10;
		115
		116	###################### DATA STRUCTURES ######################
		117	my $group = $newsgroup_name;
		118	$group =~ s!\.!/!g;
7	PointedEar	119	my %data; # name, count, agent, total, orig, quoted
		120	my %threads; # subject, count
		121	my %crossposts; # group, count
		122	my %tz; # timezones by count
		123	my %headers; # holds header of current article
		124	my %lcheader; # holds lowercase headers
		125	my @body; # holds body of current article
		126	my @sig; # holds sig text;
		127	my $totalposts; # total no. of posts considered
		128	my $filename; # name of current article file
		129	my $filesize; # size of current article file
		130	my $earliest; # earliest article we have found
		131	my $latest; # latest article we have found
		132	my $poster; # poster we are dealing with
		133	my $totsize = 0; # holds total sizes of all files
		134	my $totheader = 0; # total size of header material
		135	my $totbody = 0; # total size of body material
		136	my $totsig = 0; # total size of sig material
		137	my $totorig = 0; # total size of original material
		138	my $totquoted = 0; # total size of quoted material
		139	my $origposts = 0; # total no. of original posts
		140	my $replies = 0; # total no. of replies
		141	my $i; # general purpose
5	PointedEar	142	my %distinct_agent;
7	PointedEar	143	my %agents = # used to hold counts of User Agents used
		144	(
		145	"KNode" => 0,
		146	"Pan" => 0,
		147	"Mozilla" => 0,
		148	"Sylpheed" => 0,
		149	"Gnus" => 0,
		150	"Forte Agent" => 0,
		151	"Forte Free Agent" => 0,
		152	"MicroPlanet Gravity" => 0,
		153	"Microsoft Outlook Express" => 0,
		154	"Xnews" => 0,
		155	"slrn" => 0,
		156	"tin" => 0,
		157	"rn" => 0,
		158	"NN" => 0,
		159	"MacSOUP" => 0,
		160	"Foorum" => 0,
		161	"MT-NewsWatcher" => 0,
		162	"News Rover" => 0,
		163	"WebTV" => 0,
		164	"Compuserver" => 0,
		165	"VSoup" => 0
		166	);
5	PointedEar	167
		168	######################## MAIN CODE ########################
		169	$! = 1;
		170
		171	chdir("$news$group") or die "Can't cd to $news$group: $!\n";
7	PointedEar	172	opendir( DIR, "." ) or die "Can't open $news$group directory: $!\n";
		173	while ( defined( $filename = readdir(DIR) ) )
		174	{
		175	%lcheader = ();
		176	next unless -f $filename; # only want real files
		177	next if ( $filename eq ".overview" ); # real articles only
		178	next if ( -M $filename > $numdays ); # only want articles <= a certain age
		179	$earliest = ( stat $filename )[9] unless defined($earliest);
		180	$latest = ( stat $filename )[9] unless defined($latest);
		181	&getarticle($filename); # read in the article
		182	&getdata; # grab the data from the article
		183	$totalposts++; # bump count of articles considered
5	PointedEar	184	}
7	PointedEar	185	closedir(DIR); # finished with the directory
		186
5	PointedEar	187	# post-processing
7	PointedEar	188	&countagents; # count agents, collapsing versions
		189	&fixpercent; # check percentages orig/total for posters
5	PointedEar	190
		191	&writedata;
		192
		193	#################### DISPLAY RESULTS #####################
		194	print "=" x 76, "\n";
7	PointedEar	195	printf "%s\n", &centred( "Analysis of posts to $newsgroup_name", 76 );
5	PointedEar	196	print "=" x 76, "\n";
7	PointedEar	197	printf "%s\n",
		198	&centred( "(stats compiled with a script by Garry Knight et al.)", 76 );
5	PointedEar	199	print "\n\n";
7	PointedEar	200	printf "Total posts considered: %s over %d days\n", commify($totalposts),
		201	$numdays;
		202	printf "Earliest article: %s\n", scalar localtime($earliest);
		203	printf "Latest article: %s\n", scalar localtime($latest);
		204	printf "Original articles: %s, replies: %s\n", commify($origposts),
		205	commify($replies);
5	PointedEar	206	printf "Total size of posts: %s bytes (%sK) (%.2fM)\n", commify($totsize),
7	PointedEar	207	commify( int( $totsize / 1024 ) ), $totsize / 1048576; #
5	PointedEar	208	printf "Average %s articles per day, %.2f MB per day, %s bytes per article\n",
7	PointedEar	209	commify( int( $totalposts / $numdays ) ), $totsize / $numdays / 1048576,
		210	commify( int( $totsize / $totalposts ) );
5	PointedEar	211	my $count = keys %data;
		212	printf "Total headers: %s KB bodies: %s KB\n",
7	PointedEar	213	commify( int( $totheader / 1024 ) ), commify( int( $totbody / 1024 ) );
5	PointedEar	214	printf "Body text - quoted: %s KB, original: %s KB = %02.2f%%, sigs: %s KB\n",
7	PointedEar	215	commify( int( $totquoted / 1024 ) ), commify( int( $totorig / 1024 ) ),
		216	( $totorig * 100 ) / ( $totorig + $totquoted ),
		217	commify( int( $totsig / 1024 ) );
		218	printf "Total number of posters: %s, average %s bytes per poster\n",
		219	commify($count), commify( int( $totsize / $count ) ); #/
5	PointedEar	220	$count = keys %threads;
7	PointedEar	221	printf "Total number of threads: %s, average %s bytes per thread\n",
		222	commify($count), commify( int( $totsize / $count ) ); #/
5	PointedEar	223	printf "Total number of User-Agents: %d\n", scalar keys %agents;
		224	print "\n", "=" x 76, "\n";
		225
		226	###############################
		227	# show posters by article count Sec 1;
		228	###############################
7	PointedEar	229	unless ( $skipSec{1} )
		230	{
		231	if ( keys %data < $topposters )
		232	{
		233	$count = keys %data;
		234	}
		235	else
		236	{
		237	$count = $topposters;
		238	}
		239	printf "%s\n", &centred( "Top $count posters by number of articles", 76 );
		240	print "=" x 76, "\n";
		241	$i = 0;
		242	foreach $poster ( sort { $data{$b}{count} <=> $data{$a}{count} } keys %data )
		243	{
		244	my $name = substr( $poster, 0, 65 );
		245	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
		246	$data{$poster}{count};
		247	last if ( ++$i == $count );
		248	}
		249	print "\n", "=" x 76, "\n";
5	PointedEar	250	}
		251
		252	################################
		253	# show posters by size in Kbytes Sec 2;
		254	################################
7	PointedEar	255	unless ( $skipSec{2} )
		256	{
		257	if ( keys %data < $topposters )
		258	{
5	PointedEar	259	$count = keys %data;
7	PointedEar	260	}
		261	else
		262	{
5	PointedEar	263	$count = $topposters;
		264	}
7	PointedEar	265	printf "%s\n", &centred( "Top $count posters by article size in Kbytes", 76 );
5	PointedEar	266	print "=" x 76, "\n";
		267	$i = 0;
7	PointedEar	268	foreach $poster ( sort { $data{$b}{size} <=> $data{$a}{size} } keys %data )
		269	{
		270	my $name = substr( $poster, 0, 62 );
		271	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $poster, 63, "." ),
		272	$data{$poster}{size} / 1024; #/
		273	last if ( ++$i == $count );
5	PointedEar	274	}
		275	print "\n", "=" x 76, "\n";
		276	}
		277
		278	####################################
		279	# show top posters for original text
		280	####################################
7	PointedEar	281	unless ( $skipSec{3} )
		282	{
		283	if ( keys %data < $topposters )
		284	{
		285	$count = keys %data;
		286	}
		287	else
		288	{
		289	$count = $topposters;
		290	}
		291	printf "%s\n",
		292	&centred( "Top $count responders by original text (> 5 posts)", 76 );
		293	print "=" x 76, "\n";
		294	$i = 0;
		295	foreach $poster ( sort { $data{$b}{percent} <=> $data{$a}{percent} }
		296	keys %data )
		297	{
		298	next if $data{$poster}{quoted} == 0;
		299	next if $data{$poster}{count} < 5;
		300	my $name = substr( $poster, 0, 63 );
		301	printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
		302	$data{$poster}{percent};
		303	last if ( ++$i == $count );
		304	}
		305	print "\n", "=" x 76, "\n";
5	PointedEar	306	}
		307
		308	#######################################
		309	# show bottom posters for original text
		310	#######################################
7	PointedEar	311	unless ( $skipSec{4} )
		312	{
		313	if ( keys %data < $topposters )
		314	{
5	PointedEar	315	$count = keys %data;
7	PointedEar	316	}
		317	else
		318	{
5	PointedEar	319	$count = $topposters;
		320	}
7	PointedEar	321	printf "%s\n",
		322	&centred( "Bottom $count responders by original text (> 5 posts)", 76 );
5	PointedEar	323	print "=" x 76, "\n";
		324	$i = 0;
7	PointedEar	325	foreach $poster ( sort { $data{$a}{percent} <=> $data{$b}{percent} }
		326	keys %data )
		327	{
5	PointedEar	328	next if $data{$poster}{quoted} == 0;
		329	next if $data{$poster}{count} < 5;
7	PointedEar	330	my $name = substr( $poster, 0, 63 );
		331	printf "%2d: %-63s : %02.2f%%\n", $i + 1, rpad( $poster, 63, "." ),
		332	$data{$poster}{percent};
		333	last if ( ++$i == $count );
5	PointedEar	334	}
		335	print "\n", "=" x 76, "\n";
		336	}
		337
		338	####################################
		339	# show threads by number of articles
		340	####################################
7	PointedEar	341	unless ( $skipSec{5} )
		342	{
		343	if ( keys %threads < $topthreads )
		344	{
5	PointedEar	345	$count = keys %threads;
7	PointedEar	346	}
		347	else
		348	{
5	PointedEar	349	$count = $topthreads;
		350	}
7	PointedEar	351	printf "%s\n", &centred( "Top $count threads by no. of articles", 76 );
5	PointedEar	352	print "=" x 76, "\n";
		353	$i = 0;
7	PointedEar	354	foreach my $thread ( sort { $threads{$b}{count} <=> $threads{$a}{count} }
		355	keys %threads )
		356	{
		357	my $name = substr( $thread, 0, 65 );
		358	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
		359	$threads{$thread}{count};
		360	last if ( ++$i == $count );
5	PointedEar	361	}
		362	print "\n", "=" x 76, "\n";
		363	}
		364	################################
		365	# show threads by size in Kbytes
		366	################################
7	PointedEar	367	unless ( $skipSec{6} )
		368	{
		369	if ( keys %threads < $topthreads )
		370	{
5	PointedEar	371	$count = keys %threads;
7	PointedEar	372	}
		373	else
		374	{
5	PointedEar	375	$count = $topthreads;
		376	}
7	PointedEar	377	printf "%s\n", &centred( "Top $count threads by size in KB", 76 );
5	PointedEar	378	print "=" x 76, "\n";
		379	$i = 0;
7	PointedEar	380	foreach my $thread ( sort { $threads{$b}{size} <=> $threads{$a}{size} }
		381	keys %threads )
		382	{
		383	my $name = substr( $thread, 0, 65 );
		384	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
		385	$threads{$thread}{size} / 1024; #/
		386	last if ( ++$i == $count );
5	PointedEar	387	}
		388	print "\n", "=" x 76, "\n";
		389	}
		390
		391	#################################
		392	# show top 10 cross-posted groups
		393	#################################
7	PointedEar	394	unless ( $skipSec{7} )
		395	{
		396	delete $crossposts{"$newsgroup_name"}; # don't include ours
		397	if ( keys %crossposts < $topcrossposts )
		398	{
		399	$count = keys %crossposts;
		400	}
		401	else
		402	{
		403	$count = $topcrossposts;
		404	}
		405	printf "%s\n", &centred( "Top $count cross-posted groups", 76 );
		406	print "=" x 76, "\n";
		407	$i = 0;
		408	foreach
		409	my $name ( sort { $crossposts{$b} <=> $crossposts{$a} } keys %crossposts )
		410	{
		411	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $name, 63, "." ),
		412	$crossposts{$name};
		413	last if ( ++$i == $count );
		414	}
		415	print "\n", "=" x 76, "\n";
5	PointedEar	416	}
		417	#######################
		418	#show agents and counts
		419	#######################
7	PointedEar	420	unless ( $skipSec{8} )
		421	{
		422	if ( keys %agents < $topagents )
		423	{
		424	$count = keys %agents;
		425	}
		426	else
		427	{
		428	$count = $topagents;
		429	}
		430	printf "%s\n", &centred( "Top $count User Agents by poster", 76 );
		431	print "=" x 76, "\n";
		432	$i = 0;
		433	foreach my $agent ( sort { $agents{$b} <=> $agents{$a} } keys %agents )
		434	{
		435	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $agent, 63, "." ),
		436	$agents{$agent};
		437	last if ( ++$i == $count );
		438	}
		439	print "\n", "=" x 76, "\n";
5	PointedEar	440	}
		441
		442	#######################
		443	#show distinct agents
		444	#######################
7	PointedEar	445	unless ( $skipSec{9} )
		446	{
		447	if ( keys %distinct_agent < $topagents )
		448	{
		449	$count = keys %distinct_agent;
		450	}
		451	else
		452	{
		453	$count = $topagents;
		454	}
		455	printf "%s\n", &centred( "Top $count User Agents by number of posts", 76 );
		456	print "=" x 76, "\n";
		457	$i = 0;
		458	foreach my $agent ( sort { $distinct_agent{$b} <=> $distinct_agent{$a} }
		459	keys %distinct_agent )
		460	{
		461	printf "%2d: %-58s : %5d (%2.f%%)\n", $i + 1, rpad( $agent, 58, "." ),
		462	$distinct_agent{$agent},
		463	( ( $distinct_agent{$agent} / $totalposts ) * 100 );
		464	last if ( ++$i == $count );
		465	}
		466	print "\n", "=" x 76, "\n";
5	PointedEar	467	}
		468
		469	##########################
		470	#show timezones and counts
		471	##########################
7	PointedEar	472	unless ( $skipSec{10} )
		473	{
		474	if ( keys %tz < $toptz )
		475	{
		476	$count = keys %tz;
		477	}
		478	else
		479	{
		480	$count = $toptz;
		481	}
		482	printf "%s\n", &centred( "Top 10 time zones", 76 );
		483	print "=" x 76, "\n";
		484	$i = 0;
		485	foreach my $zone ( sort { $tz{$b} <=> $tz{$a} } keys %tz )
		486	{
		487	printf "%2d: %-63s : %6d\n", $i + 1, rpad( $zone, 63, "." ), $tz{$zone};
		488	last if ( ++$i == $count );
		489	}
		490	print "\n", "=" x 76, "\n";
5	PointedEar	491	}
		492
		493	################################ SUBROUTINES ################################
		494
		495	#######################################
		496	# get current article's header and body
		497	#######################################
7	PointedEar	498	sub getarticle
		499	{
		500	%headers = (); # dump old headers
		501	my $filename = shift; # get the name of the file
5	PointedEar	502
7	PointedEar	503	# get stats about the file itself
		504	$filesize = -s $filename; # get total size of file
		505	$totsize += $filesize; # bump total sizes of all files
		506
		507	my $mtime = ( stat $filename )[9];
		508	if ( $mtime < $earliest )
		509	{
5	PointedEar	510	$earliest = $mtime;
7	PointedEar	511	}
		512	elsif ( $mtime > $latest )
		513	{
5	PointedEar	514	$latest = $mtime;
7	PointedEar	515	}
5	PointedEar	516
7	PointedEar	517	# now read the file
		518	open( FILE, $filename ) or die "Can't open $filename: $!\n";
		519	while (<FILE>)
		520	{
		521	$totheader += length($_); # bump total header size
		522	last if (/^\s*$/); # end of header?
		523	if (/^([^:\s]):\s+(.)/)
		524	{
		525	my ( $key, $val ) = ( $1, $2 );
		526	$headers{$key} = decode( 'MIME-Header', $val );
		527	$lcheader{ clean( lc($key) ) } = clean($val);
		528	}
5	PointedEar	529	}
7	PointedEar	530	@body = <FILE>; # slurp up body
		531	close(FILE);
		532	} # getarticle
5	PointedEar	533
		534	###################################
		535	# get data from the current article
		536	###################################
7	PointedEar	537	sub getdata
		538	{
5	PointedEar	539	#### First, analyse header fields ####
		540
7	PointedEar	541	# Set up this poster if not defined, get counts, sizes
		542	$poster = encode( 'UTF-8', $headers{From} ); # get the poster's name
		543	if ( !defined( $data{$poster} ) )
		544	{ # seen this one before?
		545	$data{$poster}{agent} = 'Unknown'; # comes after For: field
		546	$data{$poster}{orig} = 0;
		547	$data{$poster}{quoted} = 0;
		548	}
		549	$data{$poster}{count}++; # bump count for this poster
		550	$data{$poster}{size} += $filesize; # total size of file
5	PointedEar	551
7	PointedEar	552	# The User-Agent and/or X-Newsreader fields
		553	# for User-Agent by poster
		554	if ( defined $lcheader{"user-agent"} )
		555	{
		556	$data{$poster}{agent} = $lcheader{"user-agent"};
		557	}
		558	if ( defined $lcheader{"x-newsreader"} )
		559	{
		560	$data{$poster}{agent} = $lcheader{"x-newsreader"};
		561	}
5	PointedEar	562
7	PointedEar	563	# The User Agent for User-Agent by number of posts
		564	my $UA = "unknown";
		565	foreach my $keys ( keys %lcheader )
		566	{
		567	if ( defined $lcheader{'user-agent'} )
		568	{
		569	$UA = $lcheader{'user-agent'};
		570	}
		571	elsif ( defined $lcheader{"x-newsreader"} )
		572	{
		573	$UA = $lcheader{"x-newsreader"};
		574	}
		575	elsif ( defined $lcheader{'x-mailer'} )
		576	{
		577	$UA = $lcheader{'x-mailer'};
		578	}
		579	elsif (
		580	( defined $lcheader{'organization'} )
		581	&& ( $lcheader{'organization'} =~
		582	/groups\.google\|AOL\|Supernews\|WebTV\|compuserve/ )
		583	)
		584	{
		585	$UA = $lcheader{'organization'};
		586	}
		587	elsif ( $lcheader{'message-id'} =~ /pine/i )
		588	{
		589	$UA = "Pine";
		590	} ## Hopefully found UA, else set to unknown
		591	}
5	PointedEar	592
7	PointedEar	593	$UA = clean($UA);
		594	$UA = get_agent($UA);
5	PointedEar	595
7	PointedEar	596	sub get_agent
		597	{
		598	my $raw = shift;
		599	my $agent = $raw;
5	PointedEar	600
7	PointedEar	601	## strip http
		602	if ( $raw =~ /.http./ )
		603	{
		604	$raw =~ s!posted via!!i;
		605	$raw =~ s!http://!!g;
		606	$raw =~ s!/!!g;
		607	$raw =~ s! !!g;
		608	}
5	PointedEar	609
7	PointedEar	610	## Fix Outlook from Mac
		611	if ( $raw =~ /^microsoft/i ) { $raw =~ s/-/ /g; }
5	PointedEar	612
7	PointedEar	613	## Pick out the popular agents
		614	if ( $raw =~ /(outlook express)/i
		615	\|\| $raw =~ /(microplanet gravity)/i
		616	\|\| $raw =~ /(news rover)/i
		617	\|\| $raw =~ /(forte agent)/i
		618	\|\| $raw =~ /(forte free agent)/i )
		619	{
		620	$agent = $1;
		621	}
		622	elsif (
		623	$raw =~ /^(
5	PointedEar	624	pan
		625	\|sylpheed
		626	\|slrn
		627	\|mozilla
		628	\|knode
		629	\|tin
		630	\|hamster
		631	\|xrn
		632	\|xnews
		633	\|aol
		634	\|gnus
		635	\|krn
		636	\|macsoup
		637	\|messenger
		638	\|openxp
		639	\|pine
		640	\|thoth
		641	\|turnpike
		642	\|winvn
		643	\|vsoup
		644	\|google
		645	\|supernews
		646	\|nn
		647	\|rn
		648	\|007
		649	\|webtv
		650	\|compuserve
7	PointedEar	651	)/ix
		652	)
		653	{
		654	$agent = $1;
		655	}
		656	else
		657	{
		658	## Clean up unknown agents
		659	if ( $raw =~ m!^(.*?)/! )
		660	{
		661	$agent = $1;
		662	}
		663	elsif ( $raw =~ /^(\w)\d./ )
		664	{
		665	$agent = $1;
		666	}
		667	}
		668
		669	$distinct_agent{$agent}++;
		670	return $agent;
5	PointedEar	671	}
		672
7	PointedEar	673	# Get all cross-posted newsgroups
		674	for ( split /,/, $headers{"Newsgroups"} )
		675	{
		676	$crossposts{$_}++; # bump count for each
		677	}
5	PointedEar	678
7	PointedEar	679	# Get threads
		680	my $thread = encode( 'UTF-8', $headers{"Subject"} );
		681	$thread =~ s/^re: //i; # Remove Re: or re: at start
		682	$thread =~ s/\s+/ /g; # collapse whitespace
		683	$threads{$thread}{count} += 1; # bump count of this subject
		684	$threads{$thread}{size} += $filesize; # bump bytes for this thread
5	PointedEar	685
7	PointedEar	686	# Is this an original post or a reply?
		687	if ( defined $headers{"References"} )
		688	{
		689	$replies++;
		690	}
		691	else
		692	{
		693	$origposts++;
		694	}
5	PointedEar	695
7	PointedEar	696	# Get the time zone
		697	$_ = $headers{"Date"};
		698	my ($tz) = /\d\d:\d\d:\d\d\s+(.*)/;
		699	if ( ( $tz =~ /UTC/ ) or ( $tz =~ /GMT/ ) or ( $tz =~ /0000/ ) )
		700	{
		701	$tz = "UTC";
		702	}
		703	$tz{$tz}++;
5	PointedEar	704
7	PointedEar	705	#### Now analyse the body text ####
		706	my $insig = 0;
		707	for (@body)
		708	{
		709	$totbody += length($_); # bump total body size
		710	next if (/^$>/); # don't count blank lines in body
		711	if ( $insig == 1 )
		712	{
		713	$totsig += length($_); # bump total sig size
5	PointedEar	714
7	PointedEar	715	# Bill Unruh uses ] quotes, and another poster uses ::
		716	}
		717	elsif ( /^\s[>\]]/ or /^\s::/ )
		718	{ # are we in a quote line?
		719	$data{$poster}{quoted} += length($_); # bump count of quoted chrs
		720	$totquoted += length($_);
		721	}
		722	elsif (/-- /)
		723	{
		724	$insig = 1;
		725	}
		726	else
		727	{
5	PointedEar	728
7	PointedEar	729	# we must be processing an original line
		730	$data{$poster}{orig} += length($_); # bump count of original chrs
		731	$totorig += length($_);
		732	}
		733	} # end for (@body)
5	PointedEar	734
7	PointedEar	735	} # getdata
5	PointedEar	736
		737	########################################
		738	# Count the User-Agents used, collapsing
		739	# different versions into one per agent.
		740	########################################
7	PointedEar	741	sub countagents
		742	{
5	PointedEar	743	POSTER:
7	PointedEar	744	foreach $poster ( keys %data )
		745	{
		746	foreach my $agent_name ( keys %distinct_agent )
		747	{ # check against known ones
		748	if ( $data{$poster}{agent} =~ /\Q$agent_name\E/ )
		749	{
		750	$agents{$agent_name}++;
		751	next POSTER;
		752	}
		753	}
		754	$agents{ $data{$poster}{agent} }++;
		755	}
		756	} # countagents
5	PointedEar	757
		758	############################################
		759	# set orig/total percentages for all posters
		760	############################################
7	PointedEar	761	sub fixpercent
		762	{
		763	foreach $poster ( keys %data )
		764	{
		765	my $percent = 100;
		766	if ( ( $data{$poster}{orig} != 0 ) and ( $data{$poster}{quoted} != 0 ) )
		767	{
		768	$percent = $data{$poster}{orig} * 100 /
		769	( $data{$poster}{quoted} + $data{$poster}{orig} ); #/
		770	}
		771	elsif ( $data{$poster}{orig} == 0 )
		772	{
		773	$percent = 0;
		774	}
		775	$data{$poster}{percent} = $percent;
		776	}
5	PointedEar	777	}
		778
		779	##############################
		780	# right pad a string with '.'s
		781	##############################
7	PointedEar	782	sub rpad
		783	{
		784
		785	# get text to pad, length to pad, pad chr
		786	my ( $text, $pad_len, $pad_chr ) = @_;
		787	if ( length($text) > $pad_len )
		788	{
		789	$text = substr( $text, 0, $pad_len );
		790	}
		791	my $padded = $text . $pad_chr x ( $pad_len - length($text) );
		792	return $padded;
5	PointedEar	793	}
		794
		795	#################
		796	# centre a string
		797	#################
7	PointedEar	798	sub centred
		799	{
		800	my ( $text, $width ) = @_; # text to centre, size of field to centre in
		801	my $pad_len = ( $width - length($text) ) / 2; #/
		802	my $centred = " " x $pad_len . $text;
		803	return $centred;
5	PointedEar	804	}
		805
		806	##########################
		807	# put commas into a number
		808	##########################
7	PointedEar	809	sub commify
		810	{
		811	$_ = shift;
		812	1 while s/^(-?\d+)(\d{3})/$1,$2/;
		813	return $_;
5	PointedEar	814	}
		815
		816	#########################
		817	# clean
		818	#########################
7	PointedEar	819	sub clean
		820	{
		821	my $dirty = shift;
		822	my $clean = $dirty;
		823	$clean =~ s/^\s*//;
		824	$clean =~ s/\s*$//;
5	PointedEar	825
7	PointedEar	826	return $clean;
5	PointedEar	827	}
		828
7	PointedEar	829	sub usage
		830	{
5	PointedEar	831
7	PointedEar	832	print "usage: newstat.pl newsgroupname\n";
		833	exit 1;
5	PointedEar	834	}
		835
		836	###################################
		837	# Write data structures to a file #
		838	###################################
7	PointedEar	839	sub writedata
		840	{
		841	open OUTF, ">/tmp/XDATA" or die "Can't create XDATA: $!\n";
		842	print OUTF "Data collected from alt.os.linux.mandrake\n\n";
		843	print OUTF
		844	"Poster Data\nname : agent : count : size: orig : quoted : per cent\n";
		845	foreach my $name ( keys %data )
		846	{
		847	print OUTF
		848	"$name : $data{$name}{agent} : $data{$name}{count} : $data{$name}{size} : $data{$name}{orig} : $data{$name}{quoted} : $data{$name}{percent}\n";
		849	}
		850	print OUTF
		851	"============================================================================\n";
		852	print OUTF "Thread subjects\n";
		853	print OUTF
		854	"----------------------------------------------------------------------------\n";
		855	foreach my $thread ( sort { "\L$a" cmp "\L$b" } keys %threads )
		856	{
		857	print OUTF "$thread : $threads{$thread}{count} : $threads{$thread}{size}\n";
		858	}
		859	print OUTF
		860	"============================================================================\n";
		861	print OUTF "Cross-posts\n";
		862	print OUTF
		863	"----------------------------------------------------------------------------\n";
		864	foreach my $name ( sort keys %crossposts )
		865	{
		866	print OUTF "$name : $crossposts{$name}\n";
		867	}
		868	print OUTF print OUTF
		869	"============================================================================\n";
		870	print OUTF "User agents\n";
		871	print OUTF
		872	"----------------------------------------------------------------------------\n";
		873	foreach my $name ( sort keys %agents )
		874	{
		875	print OUTF "$name : $agents{$name}\n";
		876	}
		877	print OUTF
		878	"============================================================================\n";
		879	print OUTF "Time zones\n";
		880	print OUTF
		881	"----------------------------------------------------------------------------\n";
		882	foreach my $name ( sort keys %tz )
		883	{
		884	print OUTF "$name : $tz{$name}\n";
		885	}
		886	close OUTF;
		887	} # writedata

Subversion Repositories LCARS

(root)/trunk/tools/network/news/newsstat/newsstat.pl @ 33 - Rev 7