Hello and welcome to our community! Is this your first visit?
Register
Enjoy an ad free experience by logging in. Not a member yet? Register.
Results 1 to 3 of 3
  1. #1
    Senior Coder
    Join Date
    Aug 2009
    Location
    Mansfield, Nottinghamshire, UK
    Posts
    1,555
    Thanks
    57
    Thanked 148 Times in 147 Posts

    Returning 500 error when valid URL

    Hello, this is my second full day working with perl. The following code is supposed to get all valid full paths to all pages on a site;

    Code:
    #!/usr/bin/perl
    use strict;
    use warnings;
    use LWP::Simple;
    use Data::Dumper;
    require LWP::UserAgent;
    use DBI;
    
    my $ua = LWP::UserAgent->new; # HTTP client used for the status checks further down
    $ua->timeout(10); # request timeout
    $ua->env_proxy; # take proxy settings from the environment
    $ua->max_redirect(0); # do not follow redirects automatically; 301 responses are inspected by hand
    
    print "\n|--------------------------------------------------------------------|\n";
    print "\nEnter a website: http://www."; # the prompt shows "http://www.", the code below prepends only "http://"
    chomp(my $url = <>); # read the site from input and drop the trailing newline
    print "\n|--------------------------------------------------------------------|\n";
    
    # Build a full path for $link as found on $landingPage; the result always ends in "\n".
    sub findFullPath {
    
       my($link, $landingPage) = @_;
        
       # strip one leading "/" or "./" from the link
       $link =~ s/^(?:(?:\/)|(?:\.\/))//g;
        
       # find out whether the link backtracks to a parent folder
       if( $link =~ m/^\.\.\// ) { # link destination is backtracking
          
          # find destination folder from landing page
          my @folders = split( "/", $landingPage );    
          # find size of array
          my $foldersSize = scalar @folders;
          # get last entry in array
          my $lastEntry = $folders[$foldersSize - 1];
          
          if( $lastEntry =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/g ) {
             delete $folders[$lastEntry]; # NOTE(review): $lastEntry is the element's value, not its index - confirm intent
          }
          
          my @backFolders = ( $link =~ m/\.\.\//g ); # one entry per "../" found anywhere in the link
          my $amountOfBackFolders = scalar @backFolders;
          for( my $x=0; $x < $amountOfBackFolders; $x++ ) {
             my $numberToDelete = $foldersSize - $x;
             delete $folders[$numberToDelete - 1]; # drop entries from the end, one per "../"
          }
       
          $landingPage = join( "/", @folders );
          $link =~ s/\.\.\///g; # remove every "../" from the link
          return $landingPage . "/" . $link . "\n";
           
       }else{
          if( substr( $landingPage, -1) eq "/" ){ # landing page is a folder: append the link directly
             return $landingPage . $link . "\n";
          }else{
             my @splitLandingPage = split( "/", $landingPage );
             my $amountSplit = scalar @splitLandingPage;
             my $toDelete = $amountSplit - 1;
             delete $splitLandingPage[$toDelete]; # drop the last entry before appending the link
             return join( "/", @splitLandingPage ) . "/" . $link . "\n";
          }
       }
    }
    
    # get HTTP details
    my $response = $ua->get('http://' . $url);
    my $responseCode = $response->code;
    my $responseLocation = $response->header( 'Location' );
    
    # continue only if status code is 200 or 301
    if( $responseCode != 200 && $responseCode != 301 ){
        die "Domain name invalid, please use differnet domain name";
    }
    
    # change url if domain status eq 301
    if( $responseCode == 301 ){
        $url = $responseLocation; # NOTE(review): the header value is stored unchanged, scheme included - confirm
    }
    
    my @pagesArray = ($url); # queue of pages still to be processed
    my @pagesScannedArray = (''); # not referenced again in the lines shown
    my @mainPagesArray = (''); # pages fetched successfully; starts with one empty string
    my @pagesNotScanned = (''); # outbound links and failed pages; starts with one empty string
    
    while ( @pagesArray ) {
       # get the next in queue for processing
       my $page = shift @pagesArray;
       # check page http status
       $response = $ua->get('http://www' . chomp($page)); # NOTE(review): chomp returns the number of characters removed, not the chomped string
       $responseCode = $response->code;
       if( $responseCode == 200 || $responseCode == 301 ){
          # change page url if 301 redirect
          if( $responseCode == 301 ){
             $page = substr( $response->header( 'Location' ), 7 ); # skip the first 7 characters (the length of "http://")
          }
          # connect to page and get contents
          if( my $pageData = get "http://" . $page ) {
             # get all links on page
             my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
             # foreach link on the page
             foreach( @pageLinksArray ) {
                 my $link = $_;
                 # if link is format we are looking for
                 if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
                   # if link is outbound
                   if( $link =~ m/^http:\/\// ) {
                      if( ! grep {$_ eq $link} @pagesNotScanned ) {
                         push ( @pagesNotScanned, $link );
                      }
                   }else{
                      # find full path for link
                      my $newUrl = &findFullPath($link, $url); # NOTE(review): resolved against $url, not the current $page - confirm
                      # if link has not already been claimed to be a main page
                      if( ! grep {$_ eq $newUrl} @mainPagesArray ) {
                         # if link is not already in queue
                         if( ! grep {$_ eq $newUrl} @pagesArray ) {
                            push ( @pagesArray, $newUrl );
                         }
                      }
                   }
                }
             }
             if( ! grep {$_ eq $page} @mainPagesArray ) {
                push ( @mainPagesArray, $page );
             }
          }
       }else{
          print "Could not retrieve data for " . $url ." - HTTP Status " . $responseCode . "\n"; # NOTE(review): reports $url although $page was requested
          if( ! grep {$_ eq $page} @pagesNotScanned ) {
             push ( @pagesNotScanned, $page );
          }
       }
    }
    
    # report: pages that were skipped, then pages that were scanned
    print "\nPages Not Scanned: \n";
    if( scalar @pagesNotScanned == 0 ){ # the array was initialised with one empty string, so it is never empty here
       print "NULL\n";
    }else{
       foreach( @pagesNotScanned ){
           print $_ . "\n";
       }
    }
    
    print "\nPages Scanned: \n";
    if( scalar @mainPagesArray == 0 ){ # same as above: initialised with one empty string
       print "NULL\n";
    }else{
       foreach( @mainPagesArray ) {
           print $_ . "\n";
       }
    }
    The problem is that on line 89 it returns a status code of 500, which is not correct. I tried it with:

    actwebdesigns.co.uk/
    landscapers-mansfield.co.uk/
    google.co.uk/

    I think it has something to do with changing $page to the Location header on a 301, which the script didn't do previously. If anyone can see where I am going wrong, it would be much appreciated.
    Website Design Mansfield
    PHP Code:
    function I_LOVE(){function b(&$b='P'){$b.='P';}function a($_){return $_++;}$b='P';define("B",'H');b($b=implode('',array($b=a($b),$b=a(B))));b($b);return $b;}
    echo 
    I_LOVE(); 

  • #2
    New to the CF scene
    Join Date
    Jan 2010
    Posts
    1
    Thanks
    0
    Thanked 0 Times in 0 Posts
    I suspect the problem is in using the RETURN value of chomp (==number of characters removed) to create the URL, rather than $page after chomping. See also help about HTTP 500 internal server errors

  • #3
    Senior Coder
    Join Date
    Aug 2009
    Location
    Mansfield, Nottinghamshire, UK
    Posts
    1,555
    Thanks
    57
    Thanked 148 Times in 147 Posts
    I have now sorted it. Thought I'd share my first ever Perl script!

    Code:
    #!/usr/bin/perl
    use strict;
    use warnings;
    use LWP::Simple;
    require LWP::UserAgent;
    sub trim($);
    
    # HTTP client for the status checks: 10 as the timeout, proxy settings
    # taken from the environment, and redirects not followed automatically so
    # that 301/302 responses can be handled by the script itself.
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->max_redirect(0);
    
    # Banner describing what the spider does.
    print join( "",
       "##################################################\n",
       "#      ACT Spider .php .html .asp .htm .shtml /  #\n",
       "#     FINDS ONLY VALID PAGE FOR PLUG N PLAY CMS  #\n",
       "#                   CHANGE TO SUIT               #\n",
       "# SUPORTS ONLY 301,302 AND 200 HTTP HEADER CODES #\n",
       "##################################################\n",
    );
    
    # Ask for the site to crawl, framed by two separator lines.
    print "\n|--------------------------------------------------------------------|\n",
          "\nEnter a website: http://www.";
    my $url = <>;
    chomp $url;
    print "\n|--------------------------------------------------------------------|\n";
    
    # Return a copy of the given string with leading and trailing whitespace
    # removed. The ($) prototype is kept so that the definition agrees with
    # the forward declaration "sub trim($);" earlier in the script.
    sub trim($)
    {
       my ($text) = @_;
       for ($text) {
          s/^\s+//;   # leading whitespace
          s/\s+$//;   # trailing whitespace
       }
       return $text;
    }
    
    # try and find full path
    # Resolve a link found on a page into a scheme-less full path.
    #
    # Arguments:
    #   $link        - href value as found in the page, e.g. "about.html",
    #                  "./img/", "../contact.php" or "/index.html"
    #   $landingPage - address of the page the link was found on, without the
    #                  "http://" prefix, e.g. "example.co.uk/dir/page.html"
    #
    # Returns the resolved address (host first, no scheme) followed by "\n".
    #
    # Resolution rules:
    #   * the first "/"-separated part of $landingPage is the host;
    #   * unless $landingPage ends in "/", its last path part is a file name
    #     and does not belong to the base folder;
    #   * a link starting with "/" is resolved from the site root;
    #   * "." parts are ignored and every ".." climbs one folder, but never
    #     above the host.
    sub findFullPath {
    
       my($link, $landingPage) = @_;
       $link        = '' unless defined $link;
       $landingPage = '' unless defined $landingPage;
    
       # split() drops trailing empty fields, so "host/dir/" gives (host, dir)
       my($host, @folders) = split( "/", $landingPage );
       $host = '' unless defined $host;
    
       # a landing page that does not end in "/" ends in a file name
       if( @folders && substr( $landingPage, -1 ) ne "/" ) {
          pop @folders;
       }
    
       # a root-relative link ignores the folders of the landing page
       if( $link =~ s/^\/// ) {
          @folders = ();
       }
    
       # the -1 limit keeps a trailing empty part, so "dir/" keeps its slash
       my @parts = split( "/", $link, -1 );
       my $lastPart = @parts ? pop @parts : '';
    
       foreach my $part ( @parts ) {
          if( $part eq "." ) {
             next;
          }elsif( $part eq ".." ) {
             pop @folders;   # does nothing once only the host is left
          }else{
             push( @folders, $part );
          }
       }
    
       return join( "/", $host, @folders, $lastPart ) . "\n";
    }
    
    # Request the start page once to find out whether the site answers and
    # whether it redirects. Redirects are not followed by $ua, so a 301/302
    # shows up here with its Location header.
    my $response = $ua->get('http://' . trim($url));
    my $responseCode = $response->code;
    my $responseLocation = $response->header( 'Location' );
    
    # continue only if status code is 200, 301 or 302
    if( $responseCode != 200 && $responseCode != 301 && $responseCode != 302 ){
        die "Domain name invalid, please use differnet domain name: http status - " . $responseCode;
    }
    
    # on a redirect, adopt the redirect target as the start URL; a redirect
    # without a Location header leaves $url as entered
    if( ( $responseCode == 301 || $responseCode == 302 ) && defined $responseLocation ){
       if( $responseLocation =~ s/^https?:\/\/(?:www\.)?// ) {
          # absolute target: keep everything after the scheme and "www."
          $url = trim($responseLocation);
       }else{
          # relative target: resolve it against the address that was requested;
          # findFullPath appends "\n", which must not end up inside $url
          $url = trim(findFullPath($responseLocation, trim($url)));
       }
    }
    
    # Crawl state:
    #   @pagesArray      - queue of scheme-less page addresses still to visit
    #   @mainPagesArray  - pages that were fetched successfully
    #   @pagesNotScanned - outbound links and pages that could not be fetched
    #   $z               - number of queue entries processed so far
    my @pagesArray = ($url);
    my @pagesScannedArray;
    my @mainPagesArray;
    my @pagesNotScanned;
    my $z = 0;
    
    # Host part of the start URL (everything before the first "/"); used to
    # recognise absolute links that point back at the site being crawled.
    my ($siteHost) = trim($url) =~ m/^([^\/]*)/;
    
    # Stop when the queue is empty or after 200 queue entries.
    while ( @pagesArray && $z < 200 ) {
       # get the next in queue for processing
       my $page = trim(shift @pagesArray);
       # skip pages that already failed or were already scanned
       if( ! grep {$_ eq $page} @pagesNotScanned, @mainPagesArray ) {
          # check page http status
          $response = $ua->get("http://" . $page);
          $responseCode = $response->code;
          if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ){
             # change page url if redirected
             if( $responseCode == 301 || $responseCode == 302 ){
                my $location = $response->header( 'Location' );
                if( defined $location ) {
                   if( $location =~ s/^https?:\/\/(?:www\.)?// ) {
                      # absolute target: keep everything after scheme and "www."
                      $page = trim($location);
                   }else{
                      # relative target: relative to the page that redirected
                      $page = trim(findFullPath($location, $page));
                   }
                }
             }
             # connect to page and get contents
             if( my $pageData = get("http://" . $page) ) {
                # get all links on page
                my @pageLinksArray = ( $pageData =~ m/href=["']([^"']*)["']/g );
                # foreach link on the page
                foreach my $rawLink ( @pageLinksArray ) {
                   my $link = trim($rawLink);
                   # turn an absolute link to this site into a root-relative one;
                   # \Q..\E keeps the dots in the host from acting as wildcards
                   if( length $siteHost ) {
                      $link =~ s/^(?:https?:\/\/)?(?:www\.)?\Q$siteHost\E(?=\/|$)//;
                   }
                   # if link is format we are looking for
                   if( $link =~ m/(?:(?:\.html)|(?:\.php)|(?:\.htm)|(?:\.asp)|(?:\.shtml)|(?:\/))$/ ) {
                      # if link is outbound
                      if( $link =~ m/^https?:\/\// ) {
                         if( ! grep {$_ eq $link} @pagesNotScanned ) {
                            if( ! grep {$_ eq $page} @mainPagesArray ) {
                               push ( @pagesNotScanned, $link );
                            }
                         }
                      }else{
                         # find full path for link
                         my $newUrl = findFullPath($link, $page);
                         $newUrl = defined $newUrl ? trim($newUrl) : '';
                         # nothing to queue if the link could not be resolved
                         next if $newUrl eq '';
                         # if link has not already been claimed to be a main page
                         if( ! grep {$_ eq $newUrl} @mainPagesArray ) {
                            # if link is not already in queue
                            if( ! grep {$_ eq $newUrl} @pagesArray ) {
                               push ( @pagesArray, $newUrl );
                            }
                         }
                      }
                   }
                }
                if( ! grep {$_ eq $page} @mainPagesArray ) {
                   push ( @mainPagesArray, $page );
                   print "[" . $z . "] http://www." . $page . "\n";
                }
             }
          }else{
             # any other status: remember the page so it is not requested again
             push ( @pagesNotScanned, $page );
          }
       }
       $z++;
       
    }
    
    # Append the scanned pages to c:/perlscripts/sitemap-<host>.txt, one
    # "http://www." address per line.
    #
    # The host is the part of $url before the first "/". Since it comes from
    # user input, every character other than letters, digits, ".", "_" and "-"
    # is replaced by "_" before it is used in a file name.
    my ($sitemapHost) = trim($url) =~ m/^([^\/]*)/;
    $sitemapHost =~ tr/A-Za-z0-9._-/_/c;
    my $fileName = "sitemap-" . $sitemapHost . ".txt";
    
    open( my $sitemapHandle, '>>', "c:/perlscripts/$fileName" )
       or die "can't open '$fileName': $!";
    foreach my $mainPage ( @mainPagesArray ) {
       print {$sitemapHandle} "http://www." . trim($mainPage) . "\n";
    }
    # buffered write errors only show up when the handle is closed
    close( $sitemapHandle ) or die "can't close '$fileName': $!";
    print "\n>>c:/perlscripts/$fileName successfuly saved\n";
    Website Design Mansfield
    PHP Code:
    function I_LOVE(){function b(&$b='P'){$b.='P';}function a($_){return $_++;}$b='P';define("B",'H');b($b=implode('',array($b=a($b),$b=a(B))));b($b);return $b;}
    echo 
    I_LOVE(); 


  •  

    Posting Permissions

    • You may not post new threads
    • You may not post replies
    • You may not post attachments
    • You may not edit your posts
    •