nkrgupta
11-02-2005, 06:42 AM
Hi,
I'll try my best to make this as understandable as possible. Please do ask if any clarification is required.
I had made this program to create a report from the apache access log files, of top entry pages and top exit pages of our site. What the program does is take a date as argument form the shell and search for log files of that date and parse them accordingly and produce the output. At that time we had 20 servers, (each server has a separate access log file), but the no. of servers has shot up to almost 150! (the traffic has increased about 5-7 times). Quite understandably, my program is not being able to handle the huge amount of data in the desired way due to which, after processing log files of about 120 odd servers, the program is terminating for want of memory on the server.
So, if anyone could go through it and suggest a better way of doing what is being done (maybe use of some module, or just a simpler and less memory intensive way, or any suggestion which would improve the efficiency of the program). I know the program is not bug-free, as the report generated may not be 100% correct, though most of it is reliable. But pointers as to increasing the processing capacity of the program are what i am looking for, more than those questioning the logic of the program (All are welcome though).
I am including the code as-is with comments, wherever needed (pardon me for not using strict!).
The point to be noted is that the program terminates in the first while loop only, as the initial hash is not built. So optimizations need to be done there.
#!/usr/bin/perl
use Date::Calc ( ":all" );
my ($yy1,$mm1,$dd1, $hh,$min,$sec, $doy,$dow,$dst) = Localtime();
if (!$ARGV[0])
{
my ($yy,$mm,$dd) = Add_Delta_Days($yy1,$mm1,$dd1,-1);
$mm = "0$mm" if($mm < 10);
$dd = "0$dd" if($dd < 10);
}
else
{
my ($mm,$dd,$yy) = split(/\//,$ARGV[0]);
}
my $file_date=$yy.$mm.$dd;
my $path="/reports/cgi-bin/ServerLogs"; ##path to all the log files
my $txt_format="e_$mm$dd$yy";
my $first_file="temp.txt"; ##temp file
my $final_file="e_$mm$dd$yy.txt"; ##final text file to be generated
open (DB, ">>$first_file") or print("Could not create txt File<br>\n\n");
undef %hash;
&servers;
foreach $server(@servers)
{
my $logfile="$path/www$server.access.$file_date.log"; ##format of log files names
print "www$server..\n";
open (F, $logfile) or print ("\tCould not open $logfile\n");
while (<F>)
{
chomp($line);
##parsing each line of log files
my ($host, $ident_user, $auth_user, $date, $time, $time_zone, $method, $url, $protocol, $status, $bytes, $referer, $agent) = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] \"(\S+) (.+?) (\S+)\" (\S+) (\S+) \"([^\"]+)\" \"([^\"]+)\"$/;
##this if block ignores all files prgrams which need to be ignored
if ($referer =~ /http:\/\/cards*./i || $url =~ /(\/view\/|\/internal\/auto\/|gotoad\.pl|TrackRecord\.pl|HpTrack\.pl|bottrack\.pl|mantle\.pl|uptime\.pl|customtrack\.pl|popup\.pl |\.dll\?|TrackThumb.pl|\.asp\?|nl\.pl|ah-ha\.pl|nc\.pl|\/cgi-bin\/cards\/help\/|searchmain\.pl|sprinks\.pl|popup4\.pl|ReturnCardData\.pl|\/cgi-bin\/cards\/|\/newcards\/|\/card\/\d\d\/\d\d\/\d\d\/\d\d\/Z|assocstats\.pl|putsp\.pl|newmember\.pl|putmail\.pl|\/interface\/|GetCount\.pl|RetDS\.pl|thankyou\.pl|prevcus\.pl|cust\.pl|lostpass\.pl|Return|putcus\.pl|chgencat\.p l|catcont\.pl|putgen\.pl|choosevent\.pl|mailtag\.pl)/i || $url !~ /(\.html|\/|\/\?|\.pl|\.pl\.new|\d{12,}|rd\/\?\/.+|rd1\/\?\/.+)$/i) {
$NbOfLinesDropped++;
next;
}
if ($url =~/^http:\/\/w+\.our-website\.com\//)
{
$url=~ s/http:\/\/w+\.our-website\.com\//\//; ##substituting sitename with '/'
}
elsif ($url =~/^http:\/\/w+\d{1,2}\.our-website\.com\//)
{
$url=~ s/http:\/\/w+\d{1,2}\.our-website\.com\//\//; ##substituting sitename with '/'
}
if ($url=~/\/card\//)
{
$url=~/(\/card\/\d{2}\/\d{2})(\/\d{2}\/\d{2}\/\w{2})(\d{1})(.*)/;
$url=$1."/".$3; ##doing another editing
}
$hash{$host}{$time}=$url; ##BUILDING THE HASH
}
close(F);
}
##THE following loop creates a file which has the hash created above, in a format which can be sorted according to time
foreach $ho(keys(%hash))
{
foreach $nt (keys %{$hash{$ho}})
{
my ($h,$m,$s)=split(':',$nt);
my $time=$h.$m.$s;
my $str=$ho."|^~".$time."|^~".$hash{$ho}{$nt};
print DB "$str\n";
}
}
undef %hash;
close(DB);
`sort $first_file -o $first_file`; ##sorting of the file according to time
open (R, "$first_file") or print("Could not create txt File<br>\n\n");
open (W, ">$final_file") or print("Could not create txt File<br>\n\n");
undef %final;
my $entry="Entry";
my $exit="Exit";
##The following while loop creates the final hash of all the entry and exit pages.
while (my $line=<R>)
{
chomp($line);
my ($ip,$time,$url)=split('\|\^\~',$line);
if (!exists $final{$ip}{$entry})
{
$final{$ip}{$entry}=$url;
}
$final{$ip}{$exit}=$url;
}
##the following loop writes all the entry and exit pages to a file
foreach $k (keys(%final))
{
if ($final{$k}{$entry} =~ /\.pl/)
{
$final{$k}{$entry}=~/(.*\/)(\w+\.pl)(.*)/;
$final{$k}{$entry}=$2;
}
if ($final{$k}{$exit}=~/\.pl/)
{
$final{$k}{$exit}=~/(.*\/)(\w+\.pl)(.*)/;
$final{$k}{$exit}=$2;
}
$string="$final{$k}{$entry}|^~$final{$k}{$exit}";
print W "$string\n";
}
undef %final;
close(R);
close(W);
open(RT,"$final_file") or print ("Error");
undef %entry;
undef %exit;
##Creating the indivisual hashes for antry and exit pages with the no. of times each page is called as the value of the hash
while (my $line=<RT>)
{
chomp($line);
my ($ent,$ext)=split('\|\^\~',$line);
$entry{$ent}++;
$exit{$ext}++;
}
close(RT);
my $ent_txt="ent_$mm$dd$yy";
my $ext_txt="ext_$mm$dd$yy";
open(ENT,">$ent_txt") or print ("Error");
open(EXT,">$ext_txt") or print ("Error");
##Writing entyr and exit pages with no. of times called in separate text files
foreach $k (keys %entry)
{
print ENT $k."|^~".$entry{$k}."\n";
}
undef %entry;
foreach $k (keys %exit)
{
print EXT $k."|^~".$exit{$k}."\n";
}
undef %exit;
close(ENT);
close(EXT);
print "Archiving $final_file\n";
my $tarname="/reports/accesslog_db/".$txt_format.".tgz";
my $execstring="tar -zcf ".$tarname." ".$ent_txt." ".$ext_txt;
print($execstring);
system($execstring);
print("\n");
unlink $first_file;
unlink $final_file;
unlink $ent_txt;
unlink $ext_txt;
print "$tarname created \n";
sub servers ## @servers for online server
{
undef @servers;
open READ,"servers.dat" or die "No Serverlist: $!";
my $line=<READ>;
close(READ);
@servers=split(/,/,$line);
}
Thank You Very much
Naveen (http://naveenhere.blogspot.com)
I'll try my best to make this as understandable as possible. Please do ask if any clarification is required.
I had made this program to create a report from the apache access log files, of top entry pages and top exit pages of our site. What the program does is take a date as argument form the shell and search for log files of that date and parse them accordingly and produce the output. At that time we had 20 servers, (each server has a separate access log file), but the no. of servers has shot up to almost 150! (the traffic has increased about 5-7 times). Quite understandably, my program is not being able to handle the huge amount of data in the desired way due to which, after processing log files of about 120 odd servers, the program is terminating for want of memory on the server.
So, if anyone could go through it and suggest a better way of doing what is being done (maybe use of some module, or just a simpler and less memory intensive way, or any suggestion which would improve the efficiency of the program). I know the program is not bug-free, as the report generated may not be 100% correct, though most of it is reliable. But pointers as to increasing the processing capacity of the program are what i am looking for, more than those questioning the logic of the program (All are welcome though).
I am including the code as-is with comments, wherever needed (pardon me for not using strict!).
The point to be noted is that the program terminates in the first while loop only, as the initial hash is not built. So optimizations need to be done there.
#!/usr/bin/perl
use Date::Calc ( ":all" );
my ($yy1,$mm1,$dd1, $hh,$min,$sec, $doy,$dow,$dst) = Localtime();
if (!$ARGV[0])
{
my ($yy,$mm,$dd) = Add_Delta_Days($yy1,$mm1,$dd1,-1);
$mm = "0$mm" if($mm < 10);
$dd = "0$dd" if($dd < 10);
}
else
{
my ($mm,$dd,$yy) = split(/\//,$ARGV[0]);
}
my $file_date=$yy.$mm.$dd;
my $path="/reports/cgi-bin/ServerLogs"; ##path to all the log files
my $txt_format="e_$mm$dd$yy";
my $first_file="temp.txt"; ##temp file
my $final_file="e_$mm$dd$yy.txt"; ##final text file to be generated
open (DB, ">>$first_file") or print("Could not create txt File<br>\n\n");
undef %hash;
&servers;
foreach $server(@servers)
{
my $logfile="$path/www$server.access.$file_date.log"; ##format of log files names
print "www$server..\n";
open (F, $logfile) or print ("\tCould not open $logfile\n");
while (<F>)
{
chomp($line);
##parsing each line of log files
my ($host, $ident_user, $auth_user, $date, $time, $time_zone, $method, $url, $protocol, $status, $bytes, $referer, $agent) = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] \"(\S+) (.+?) (\S+)\" (\S+) (\S+) \"([^\"]+)\" \"([^\"]+)\"$/;
##this if block ignores all files prgrams which need to be ignored
if ($referer =~ /http:\/\/cards*./i || $url =~ /(\/view\/|\/internal\/auto\/|gotoad\.pl|TrackRecord\.pl|HpTrack\.pl|bottrack\.pl|mantle\.pl|uptime\.pl|customtrack\.pl|popup\.pl |\.dll\?|TrackThumb.pl|\.asp\?|nl\.pl|ah-ha\.pl|nc\.pl|\/cgi-bin\/cards\/help\/|searchmain\.pl|sprinks\.pl|popup4\.pl|ReturnCardData\.pl|\/cgi-bin\/cards\/|\/newcards\/|\/card\/\d\d\/\d\d\/\d\d\/\d\d\/Z|assocstats\.pl|putsp\.pl|newmember\.pl|putmail\.pl|\/interface\/|GetCount\.pl|RetDS\.pl|thankyou\.pl|prevcus\.pl|cust\.pl|lostpass\.pl|Return|putcus\.pl|chgencat\.p l|catcont\.pl|putgen\.pl|choosevent\.pl|mailtag\.pl)/i || $url !~ /(\.html|\/|\/\?|\.pl|\.pl\.new|\d{12,}|rd\/\?\/.+|rd1\/\?\/.+)$/i) {
$NbOfLinesDropped++;
next;
}
if ($url =~/^http:\/\/w+\.our-website\.com\//)
{
$url=~ s/http:\/\/w+\.our-website\.com\//\//; ##substituting sitename with '/'
}
elsif ($url =~/^http:\/\/w+\d{1,2}\.our-website\.com\//)
{
$url=~ s/http:\/\/w+\d{1,2}\.our-website\.com\//\//; ##substituting sitename with '/'
}
if ($url=~/\/card\//)
{
$url=~/(\/card\/\d{2}\/\d{2})(\/\d{2}\/\d{2}\/\w{2})(\d{1})(.*)/;
$url=$1."/".$3; ##doing another editing
}
$hash{$host}{$time}=$url; ##BUILDING THE HASH
}
close(F);
}
##THE following loop creates a file which has the hash created above, in a format which can be sorted according to time
foreach $ho(keys(%hash))
{
foreach $nt (keys %{$hash{$ho}})
{
my ($h,$m,$s)=split(':',$nt);
my $time=$h.$m.$s;
my $str=$ho."|^~".$time."|^~".$hash{$ho}{$nt};
print DB "$str\n";
}
}
undef %hash;
close(DB);
`sort $first_file -o $first_file`; ##sorting of the file according to time
open (R, "$first_file") or print("Could not create txt File<br>\n\n");
open (W, ">$final_file") or print("Could not create txt File<br>\n\n");
undef %final;
my $entry="Entry";
my $exit="Exit";
##The following while loop creates the final hash of all the entry and exit pages.
while (my $line=<R>)
{
chomp($line);
my ($ip,$time,$url)=split('\|\^\~',$line);
if (!exists $final{$ip}{$entry})
{
$final{$ip}{$entry}=$url;
}
$final{$ip}{$exit}=$url;
}
##the following loop writes all the entry and exit pages to a file
foreach $k (keys(%final))
{
if ($final{$k}{$entry} =~ /\.pl/)
{
$final{$k}{$entry}=~/(.*\/)(\w+\.pl)(.*)/;
$final{$k}{$entry}=$2;
}
if ($final{$k}{$exit}=~/\.pl/)
{
$final{$k}{$exit}=~/(.*\/)(\w+\.pl)(.*)/;
$final{$k}{$exit}=$2;
}
$string="$final{$k}{$entry}|^~$final{$k}{$exit}";
print W "$string\n";
}
undef %final;
close(R);
close(W);
open(RT,"$final_file") or print ("Error");
undef %entry;
undef %exit;
##Creating the indivisual hashes for antry and exit pages with the no. of times each page is called as the value of the hash
while (my $line=<RT>)
{
chomp($line);
my ($ent,$ext)=split('\|\^\~',$line);
$entry{$ent}++;
$exit{$ext}++;
}
close(RT);
my $ent_txt="ent_$mm$dd$yy";
my $ext_txt="ext_$mm$dd$yy";
open(ENT,">$ent_txt") or print ("Error");
open(EXT,">$ext_txt") or print ("Error");
##Writing entyr and exit pages with no. of times called in separate text files
foreach $k (keys %entry)
{
print ENT $k."|^~".$entry{$k}."\n";
}
undef %entry;
foreach $k (keys %exit)
{
print EXT $k."|^~".$exit{$k}."\n";
}
undef %exit;
close(ENT);
close(EXT);
print "Archiving $final_file\n";
my $tarname="/reports/accesslog_db/".$txt_format.".tgz";
my $execstring="tar -zcf ".$tarname." ".$ent_txt." ".$ext_txt;
print($execstring);
system($execstring);
print("\n");
unlink $first_file;
unlink $final_file;
unlink $ent_txt;
unlink $ext_txt;
print "$tarname created \n";
sub servers ## @servers for online server
{
undef @servers;
open READ,"servers.dat" or die "No Serverlist: $!";
my $line=<READ>;
close(READ);
@servers=split(/,/,$line);
}
Thank You Very much
Naveen (http://naveenhere.blogspot.com)