Phil Jackson
01-24-2010, 05:31 PM
Hello, this is my second full day working with perl. The following code is supposed to get all valid full paths to all pages on a site;
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use Data::Dumper;
require LWP::UserAgent;
use DBI;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);
print "\n|--------------------------------------------------------------------|\n";
print "\nEnter a website: http://www.";
chomp(my $url = <>);
print "\n|--------------------------------------------------------------------|\n";
# try and find full path
sub findFullPath {
my($link, $landingPage) = @_;
# strip ./ and / from beggining of string
$link =~ s/^(?:(?:\/)|(?:\.\/))//g;
# find out whether link is backtracing to previous folder
if( $link =~ m/^\.\.\// ) { # link desination is back tracing
# find destination folder from landing page
my @folders = split( "/", $landingPage );
#find size of array
my $foldersSize = scalar @folders;
# get last entry in array
my $lastEntry = $folders[$foldersSize - 1];
if( $lastEntry =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/g ) {
delete $folders[$lastEntry];
}
my @backFolders = ( $link =~ m/\.\.\//g );
my $amountOfBackFolders = scalar @backFolders;
for( my $x=0; $x < $amountOfBackFolders; $x++ ) {
my $numberToDelete = $foldersSize - $x;
delete $folders[$numberToDelete - 1];
}
$landingPage = join( "/", @folders );
$link =~ s/\.\.\///g;
return $landingPage . "/" . $link . "\n";
}else{
if( substr( $landingPage, -1) eq "/" ){
return $landingPage . $link . "\n";
}else{
my @splitLandingPage = split( "/", $landingPage );
my $amountSplit = scalar @splitLandingPage;
my $toDelete = $amountSplit - 1;
delete $splitLandingPage[$toDelete];
return join( "/", @splitLandingPage ) . "/" . $link . "\n";
}
}
}
# get HTTP details
my $response = $ua->get('http://' . $url);
my $responseCode = $response->code;
my $responseLocation = $response->header( 'Location' );
# contintue only if status code is 200 or 301
if( $responseCode != 200 && $responseCode != 301 ){
die "Domain name invalid, please use differnet domain name";
}
# change url if domain status eq 301
if( $responseCode == 301 ){
$url = $responseLocation;
}
my @pagesArray = ($url);
my @pagesScannedArray = ('');
my @mainPagesArray = ('');
my @pagesNotScanned = ('');
while ( @pagesArray ) {
# get the next in queue for proccessing
my $page = shift @pagesArray;
# check page http status
$response = $ua->get('http://www' . chomp($page));
$responseCode = $response->code;
if( $responseCode == 200 || $responseCode == 301 ){
# change page url if 301 redirect
if( $responseCode == 301 ){
$page = substr( $response->header( 'Location' ), 7 );
}
# connect to page and get contents
if( my $pageData = get "http://" . $page ) {
# get all links on page
my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
# foreach link on the page
foreach( @pageLinksArray ) {
my $link = $_;
# if link is format we are looking for
if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
# if link is outbound
if( $link =~ m/^http:\/\// ) {
if( ! grep {$_ eq $link} @pagesNotScanned ) {
push ( @pagesNotScanned, $link );
}
}else{
# find full path for link
my $newUrl = &findFullPath($link, $url);
# if link has not already been claimed to be a main page
if( ! grep {$_ eq $newUrl} @mainPagesArray ) {
# if link is not already in queue
if( ! grep {$_ eq $newUrl} @pagesArray ) {
push ( @pagesArray, $newUrl );
}
}
}
}
}
if( ! grep {$_ eq $page} @mainPagesArray ) {
push ( @mainPagesArray, $page );
}
}
}else{
print "Could not retrieve data for " . $url ." - HTTP Status " . $responseCode . "\n";
if( ! grep {$_ eq $page} @pagesNotScanned ) {
push ( @pagesNotScanned, $page );
}
}
}
print "\nPages Not Scanned: \n";
if( scalar @pagesNotScanned == 0 ){
print "NULL\n";
}else{
foreach( @pagesNotScanned ){
print $_ . "\n";
}
}
print "\nPages Scanned: \n";
if( scalar @mainPagesArray == 0 ){
print "NULL\n";
}else{
foreach( @mainPagesArray ) {
print $_ . "\n";
}
}
problem being on line 89 it is returning a status code of 500 which is not true. tried with
actwebdesigns.co.uk/
landscapers-mansfield.co.uk/
google.co.uk/
I think it has something to do with changing the $page to the location if 301 which it didn't do previously. If anyone can see where i am going wrong it would be much appreciated.
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use Data::Dumper;
require LWP::UserAgent;
use DBI;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);
print "\n|--------------------------------------------------------------------|\n";
print "\nEnter a website: http://www.";
chomp(my $url = <>);
print "\n|--------------------------------------------------------------------|\n";
# try and find full path
sub findFullPath {
my($link, $landingPage) = @_;
# strip ./ and / from beggining of string
$link =~ s/^(?:(?:\/)|(?:\.\/))//g;
# find out whether link is backtracing to previous folder
if( $link =~ m/^\.\.\// ) { # link desination is back tracing
# find destination folder from landing page
my @folders = split( "/", $landingPage );
#find size of array
my $foldersSize = scalar @folders;
# get last entry in array
my $lastEntry = $folders[$foldersSize - 1];
if( $lastEntry =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/g ) {
delete $folders[$lastEntry];
}
my @backFolders = ( $link =~ m/\.\.\//g );
my $amountOfBackFolders = scalar @backFolders;
for( my $x=0; $x < $amountOfBackFolders; $x++ ) {
my $numberToDelete = $foldersSize - $x;
delete $folders[$numberToDelete - 1];
}
$landingPage = join( "/", @folders );
$link =~ s/\.\.\///g;
return $landingPage . "/" . $link . "\n";
}else{
if( substr( $landingPage, -1) eq "/" ){
return $landingPage . $link . "\n";
}else{
my @splitLandingPage = split( "/", $landingPage );
my $amountSplit = scalar @splitLandingPage;
my $toDelete = $amountSplit - 1;
delete $splitLandingPage[$toDelete];
return join( "/", @splitLandingPage ) . "/" . $link . "\n";
}
}
}
# get HTTP details
my $response = $ua->get('http://' . $url);
my $responseCode = $response->code;
my $responseLocation = $response->header( 'Location' );
# contintue only if status code is 200 or 301
if( $responseCode != 200 && $responseCode != 301 ){
die "Domain name invalid, please use differnet domain name";
}
# change url if domain status eq 301
if( $responseCode == 301 ){
$url = $responseLocation;
}
my @pagesArray = ($url);
my @pagesScannedArray = ('');
my @mainPagesArray = ('');
my @pagesNotScanned = ('');
while ( @pagesArray ) {
# get the next in queue for proccessing
my $page = shift @pagesArray;
# check page http status
$response = $ua->get('http://www' . chomp($page));
$responseCode = $response->code;
if( $responseCode == 200 || $responseCode == 301 ){
# change page url if 301 redirect
if( $responseCode == 301 ){
$page = substr( $response->header( 'Location' ), 7 );
}
# connect to page and get contents
if( my $pageData = get "http://" . $page ) {
# get all links on page
my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
# foreach link on the page
foreach( @pageLinksArray ) {
my $link = $_;
# if link is format we are looking for
if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
# if link is outbound
if( $link =~ m/^http:\/\// ) {
if( ! grep {$_ eq $link} @pagesNotScanned ) {
push ( @pagesNotScanned, $link );
}
}else{
# find full path for link
my $newUrl = &findFullPath($link, $url);
# if link has not already been claimed to be a main page
if( ! grep {$_ eq $newUrl} @mainPagesArray ) {
# if link is not already in queue
if( ! grep {$_ eq $newUrl} @pagesArray ) {
push ( @pagesArray, $newUrl );
}
}
}
}
}
if( ! grep {$_ eq $page} @mainPagesArray ) {
push ( @mainPagesArray, $page );
}
}
}else{
print "Could not retrieve data for " . $url ." - HTTP Status " . $responseCode . "\n";
if( ! grep {$_ eq $page} @pagesNotScanned ) {
push ( @pagesNotScanned, $page );
}
}
}
print "\nPages Not Scanned: \n";
if( scalar @pagesNotScanned == 0 ){
print "NULL\n";
}else{
foreach( @pagesNotScanned ){
print $_ . "\n";
}
}
print "\nPages Scanned: \n";
if( scalar @mainPagesArray == 0 ){
print "NULL\n";
}else{
foreach( @mainPagesArray ) {
print $_ . "\n";
}
}
problem being on line 89 it is returning a status code of 500 which is not true. tried with
actwebdesigns.co.uk/
landscapers-mansfield.co.uk/
google.co.uk/
I think it has something to do with changing the $page to the location if 301 which it didn't do previously. If anyone can see where i am going wrong it would be much appreciated.