...

View Full Version : returning 500 error with a valid url



Phil Jackson
01-24-2010, 06:31 PM
Hello, this is my second full day working with Perl. The following code is supposed to get all valid full paths to all pages on a site:



#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;     # provides get() for fetching page bodies
use Data::Dumper;    # NOTE(review): not used in the visible script
require LWP::UserAgent;
use DBI;             # NOTE(review): not used in the visible script

# User agent for status-code checks: 10s timeout, honour proxy environment
# variables, and max_redirect(0) so 301 responses are seen directly here
# instead of being followed automatically.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);

print "\n|--------------------------------------------------------------------|\n";
# The user types only the part of the address after "http://www.".
print "\nEnter a website: http://www.";
chomp(my $url = <>);
print "\n|--------------------------------------------------------------------|\n";

# try and find full path
# Resolve a link found on a page to a full site path.
#   $link        - the href value (may be "/"-, "./"- or "../"-prefixed)
#   $landingPage - the page the link was found on
# Returns the resolved path followed by "\n" (the trailing newline matches
# how the rest of the script stores queue entries).
sub findFullPath {
    my ($link, $landingPage) = @_;

    # strip "./" or "/" from the beginning of the link
    $link =~ s{^(?:\./|/)}{};

    if ( $link =~ m{^\.\./} ) {    # link destination backtracks to a parent folder
        # break the landing page into its path components
        my @folders = split /\//, $landingPage;

        # BUG FIX: the original did `delete $folders[$lastEntry]`, using the
        # file NAME as a numeric index (it coerces to 0 and removes the domain
        # component).  Drop the trailing file component instead.
        if ( @folders and $folders[-1] =~ m/(?:\.html|\.php|\.htm|\.asp|\.shtml|\/)$/ ) {
            pop @folders;
        }

        # climb one directory per "../" in the link
        my $amountOfBackFolders = () = $link =~ m{\.\./}g;
        if ( $amountOfBackFolders and @folders >= $amountOfBackFolders ) {
            splice @folders, -$amountOfBackFolders;
        }

        $landingPage = join '/', @folders;
        $link =~ s{\.\./}{}g;
        return $landingPage . "/" . $link . "\n";
    }
    else {
        if ( substr( $landingPage, -1 ) eq "/" ) {
            # landing page is a directory: append the link directly
            return $landingPage . $link . "\n";
        }
        else {
            # landing page ends in a file: drop the file name, then append
            my @splitLandingPage = split /\//, $landingPage;
            pop @splitLandingPage;
            return join( '/', @splitLandingPage ) . "/" . $link . "\n";
        }
    }
}

# get HTTP details for the entered domain
my $response = $ua->get('http://' . $url);
my $responseCode = $response->code;
my $responseLocation = $response->header( 'Location' );

# continue only if status code is 200 or 301
if( $responseCode != 200 && $responseCode != 301 ){
die "Domain name invalid, please use differnet domain name";
}

# change url if domain status eq 301: follow the redirect target manually,
# since max_redirect(0) stops LWP from following it for us
if( $responseCode == 301 ){
$url = $responseLocation;
}

# Crawl state: queue of pages to visit plus result buckets.
# NOTE(review): seeding with ('') means these arrays are never empty, so the
# "NULL" branches of the final report can never trigger.
my @pagesArray = ($url);
my @pagesScannedArray = ('');
my @mainPagesArray = ('');
my @pagesNotScanned = ('');

# Crawl loop: take pages off the queue, fetch each one, harvest its links,
# and record the page as scanned or not-scanned, until the queue empties.
while ( @pagesArray ) {
    # get the next in queue for proccessing
    my $page = shift @pagesArray;
    # BUG FIX: chomp() returns the NUMBER of characters removed, not the
    # trimmed string, so it must never be concatenated into the URL (the
    # original "'http://www' . chomp($page)" produced "http://www0"/"www1",
    # hence the bogus 500 responses).  Chomp first, then build the URL the
    # same way as the initial check above and the get() below.
    chomp $page;
    # check page http status
    $response = $ua->get('http://' . $page);
    $responseCode = $response->code;
    if( $responseCode == 200 || $responseCode == 301 ){
        # change page url if 301 redirect (strip the leading "http://", 7 chars)
        if( $responseCode == 301 ){
            $page = substr( $response->header( 'Location' ), 7 );
        }
        # connect to page and get contents
        if( my $pageData = get "http://" . $page ) {
            # get all links on page
            my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
            # foreach link on the page
            foreach( @pageLinksArray ) {
                my $link = $_;
                # if link is a format we are looking for
                if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
                    # if link is outbound
                    if( $link =~ m/^http:\/\// ) {
                        if( ! grep {$_ eq $link} @pagesNotScanned ) {
                            push ( @pagesNotScanned, $link );
                        }
                    }else{
                        # find full path for link
                        my $newUrl = findFullPath($link, $url);
                        # if link has not already been claimed to be a main page
                        if( ! grep {$_ eq $newUrl} @mainPagesArray ) {
                            # if link is not already in queue
                            if( ! grep {$_ eq $newUrl} @pagesArray ) {
                                push ( @pagesArray, $newUrl );
                            }
                        }
                    }
                }
            }
            if( ! grep {$_ eq $page} @mainPagesArray ) {
                push ( @mainPagesArray, $page );
            }
        }
    }else{
        print "Could not retrieve data for " . $url ." - HTTP Status " . $responseCode . "\n";
        if( ! grep {$_ eq $page} @pagesNotScanned ) {
            push ( @pagesNotScanned, $page );
        }
    }
}


# Report the crawl results: first the pages that were skipped, then the
# pages that were successfully scanned ("NULL" when a list is empty).
print "\nPages Not Scanned: \n";
if ( @pagesNotScanned ) {
    print "$_\n" for @pagesNotScanned;
}
else {
    print "NULL\n";
}

print "\nPages Scanned: \n";
if ( @mainPagesArray ) {
    print "$_\n" for @mainPagesArray;
}
else {
    print "NULL\n";
}


The problem is that on line 89 it returns a status code of 500, which is not true. I tried it with:

actwebdesigns.co.uk/
landscapers-mansfield.co.uk/
google.co.uk/

I think it has something to do with changing $page to the Location on a 301, which it didn't do previously. If anyone can see where I am going wrong, it would be much appreciated.

derekC
01-24-2010, 11:37 PM
I suspect the problem is in using the RETURN value of chomp (==number of characters removed) to create the URL, rather than $page after chomping. See also help about HTTP 500 internal server errors (http://www.getnetgoing.com/HTTP-500.html)

Phil Jackson
01-25-2010, 11:35 AM
I have now sorted it. Thought I'd share my first ever Perl script!



#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;    # provides get() for fetching page bodies
require LWP::UserAgent;
sub trim($);        # forward declaration so trim() can be used before its definition

# User agent for status checks: 10s timeout, proxy from environment, and
# max_redirect(0) so redirect responses are handled manually below.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);

# banner
print "##################################################\n";
print "# ACT Spider .php .html .asp .htm .shtml / #\n";
print "# FINDS ONLY VALID PAGE FOR PLUG N PLAY CMS #\n";
print "# CHANGE TO SUIT #\n";
print "# SUPORTS ONLY 301,302 AND 200 HTTP HEADER CODES #\n";
print "##################################################\n";
print "\n|--------------------------------------------------------------------|\n";
# The user types only the part of the address after "http://www.".
print "\nEnter a website: http://www.";
chomp(my $url = <>);
print "\n|--------------------------------------------------------------------|\n";

# Return a copy of the argument with leading and trailing whitespace removed.
sub trim($)
{
    my ($text) = @_;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

# try and find full path
# Resolve $link (an href value) against $landingPage and return the full
# site path followed by "\n".  Handles "../" backtracking links, links on
# directory-style landing pages, and links relative to a landing file.
sub findFullPath {

my($link, $landingPage) = @_;

# strip "./" and "/" from the beginning of the string
$link =~ s/^(?:(?:\/)|(?:\.\/))//g;

# find out whether the link is backtracking to a previous folder
if( $link =~ m/^\.\.\// ) { # link destination is backtracking

# case 1: landing page ends in a file name with a known extension
if( $landingPage =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml))$/g ) {
# find destination folder from landing page
my @folders = split( "/", $landingPage );
# find size of array
my $foldersSize = scalar @folders;
# drop the file name; deleting the LAST element shrinks the array
# (delete on a middle element would only leave a hole)
delete $folders[$foldersSize - 1];
$foldersSize = scalar @folders;
my @backFolders = ( $link =~ m/\.\.\//g ); # one list element per "../"
my $amountOfBackFolders = scalar @backFolders; # how many folders back
# climb one folder per "../", deleting from the end of @folders
for( my $x=0; $x < $amountOfBackFolders; $x++ ) {
my $numberToDelete = ($foldersSize - 1) - $x;
delete $folders[$numberToDelete];
}
$landingPage = join( "/", @folders );
$link =~ s/\.\.\///g;
return $landingPage . "/" . $link . "\n";
# case 2: landing page is a directory (trailing "/")
} elsif( $landingPage =~ m/(?:\/)$/g ) {
my @folders = split( "/", $landingPage );
# find size of array
my $foldersSize = scalar @folders;
# drop the last directory component
delete $folders[$foldersSize - 1];
$foldersSize = scalar @folders;
my @backFolders = ( $link =~ m/\.\.\//g ); # one list element per "../"
my $amountOfBackFolders = scalar @backFolders; # how many folders back
for( my $x=0; $x < $amountOfBackFolders; $x++ ) {
# NOTE(review): the first pass computes an out-of-range index (a no-op),
# which compensates for the directory already dropped above — fragile,
# but the net effect is one folder removed per "../"
my $numberToDelete = ($foldersSize) - $x;
delete $folders[$numberToDelete];
}
$landingPage = join( "/", @folders );
$link =~ s/\.\.\///g;
return $landingPage . "/" . $link . "\n";
} else {
# NOTE(review): empty branch — a backtracking link on a landing page with
# no known extension and no trailing "/" falls through, and the sub
# returns an unspecified value (effectively undef); callers then pass
# that to trim()/eq and trigger warnings
}

}else{
if( substr( $landingPage, -1) eq "/" ){
# landing page is a directory: append the link directly
return $landingPage . $link . "\n";
print "1"; # NOTE(review): unreachable — placed after the return
}else{
my @splitLandingPage = split( "/", $landingPage );
my $amountSplit = scalar @splitLandingPage;
my $toDelete = $amountSplit - 1;
my $lastEntry = $splitLandingPage[$toDelete];
# If the last component is a bare TLD, the landing page is the domain
# itself, so keep it whole; otherwise drop the trailing file part.
# NOTE(review): the stray space in "(?:name) |(?:eu)" is in the original
# and makes "name " (with a trailing space) the literal being matched
if( $lastEntry =~ m/(?:(?:com)|(?:co\.uk)|(?:net)|(?:org)|(?:cc)|(?:tv)|(?:info)|(?:org\.uk)|(?:me\.uk)|(?:biz)|(?:name) |(?:eu)|(?:uk\.com)|(?:eu\.com)|(?:gb\.com)|(?:gb\.net)|(?:uk\.net)|(?:me)|(?:mobi))$/g ) {
return join( "/", @splitLandingPage ) . "/" . $link . "\n";
}else{
delete $splitLandingPage[$toDelete];
return join( "/", @splitLandingPage ) . "/" . $link . "\n";
}
}
}
}

# get HTTP details for the entered domain
my $response = $ua->get('http://' . trim($url));
my $responseCode = $response->code;
my $responseLocation = $response->header( 'Location' );

# continue only if status code is 200, 301 or 302
if( $responseCode != 200 && $responseCode != 301 && $responseCode != 302 ){
die "Domain name invalid, please use differnet domain name: http status - " . $responseCode;
}

# Change url if the domain status is 301/302.  The stored $url never carries
# a scheme, so strip "http://www." (11 chars) or "http://" (7 chars) from the
# Location header, or resolve a relative Location against the current url.
if( $responseCode == 301 || $responseCode == 302 ){
if($response->header( 'Location' ) =~ m/^http:\/\/www\./g ) {
$url = substr( $response->header( 'Location' ), 11 );
}elsif($response->header( 'Location' ) =~ m/^http:\/\//g ) {
$url = substr( $response->header( 'Location' ), 7 );
}else{
$url = findFullPath($response->header( 'Location' ), $url);
}
}

# Crawl state: queue of pages to visit, result buckets, and $z as the
# iteration counter used to cap the crawl.
my @pagesArray = ($url);
my @pagesScannedArray;
my @mainPagesArray;
my @pagesNotScanned;
my $z = 0;

# Main crawl loop: breadth-first over @pagesArray, hard-capped at 200
# iterations by $z (which counts every pass, including skipped pages).
while ( @pagesArray && $z < 200 ) {
    # get the next in queue for proccessing
    my $page = trim(shift @pagesArray);
    # skip pages already ruled out
    if( ! grep {$_ eq trim($page)} @pagesNotScanned ) {
        # check page http status
        $response = $ua->get("http://" . trim($page));
        $responseCode = $response->code;
        if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ){
            # change page url if 301/302 redirect; $page is stored without a
            # scheme, so strip "http://www." (11 chars) or "http://" (7 chars)
            if( $responseCode == 301 || $responseCode == 302 ){
                if($response->header( 'Location' ) =~ m/^http:\/\/www\./g ) {
                    $page = substr( $response->header( 'Location' ), 11 );
                    print $page; # NOTE(review): looks like leftover debug output
                }elsif($response->header( 'Location' ) =~ m/^http:\/\//g ) {
                    $page = substr( $response->header( 'Location' ), 7 );
                }else{
                    # relative redirect: resolve against the site root
                    $page = findFullPath($response->header( 'Location' ), $url);
                }
            }
            # connect to page and get contents
            if( my $pageData = get "http://" . trim($page) ) {
                # get all links on page
                my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
                # foreach link on the page
                foreach( @pageLinksArray ) {
                    my $link = trim($_);
                    # Remove scheme/host when the link points at this same domain.
                    # BUG FIX: \Q...\E quotes regex metacharacters in $url, so the
                    # dots in the domain no longer match arbitrary characters (and
                    # a metacharacter in the input can no longer break the pattern).
                    $link =~ s/(?:http:\/\/)?(?:www\.)?\Q$url\E//g;
                    # if link is a format we are looking for
                    if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
                        # if link is outbound
                        if( $link =~ m/^http:\/\//g ) {
                            if( ! grep {$_ eq trim($link)} @pagesNotScanned ) {
                                if( ! grep {$_ eq trim($page)} @mainPagesArray ) {
                                    push ( @pagesNotScanned, trim($link) );
                                }
                            }
                        }else{
                            # find full path for link relative to the current page
                            my $newUrl = findFullPath(trim($link), trim($page));
                            # if link has not already been claimed to be a main page
                            if( ! grep {$_ eq trim($newUrl)} @mainPagesArray ) {
                                # if link is not already in queue
                                if( ! grep {$_ eq trim($newUrl)} @pagesArray ) {
                                    push ( @pagesArray, trim($newUrl) );
                                }
                            }
                        }
                    }
                }
                if( ! grep {$_ eq trim($page)} @mainPagesArray ) {
                    push ( @mainPagesArray, trim($page) );
                    print "[" . $z . "] http://www." . trim($page) . "\n";
                }
            }
        }else{
            if( ! grep {$_ eq trim($page)} @pagesNotScanned ) {
                if( ! grep {$_ eq trim($page)} @mainPagesArray ) {
                    push ( @pagesNotScanned, trim($page) );
                }
            }
        }
    }
    $z++;

}

# Write the scanned pages out as a plain-text sitemap named after the domain.
# BUG FIX: the original assigned the match to a plain scalar, which stored the
# match's SUCCESS FLAG (not the captured text) and then relied on $1; a
# list-context capture grabs the text before the first "/" directly.
my ($domain) = $url =~ m/(^[^\/]*)/;
my $fileName = "sitemap-" . $domain . ".txt";
# three-arg open with a lexical filehandle (the bareword two-arg form is unsafe)
open my $fh, '>>', "c:/perlscripts/$fileName" or die "can't open '$fileName': $!";
foreach( @mainPagesArray ) {
    print {$fh} "http://www." . trim($_) . "\n";
}
# check close on the write handle: buffered write errors only surface here
close $fh or die "can't close '$fileName': $!";
print "\n>>c:/perlscripts/$fileName successfuly saved\n";



EZ Archive Ads Plugin for vBulletin Copyright 2006 Computer Help Forum