Darren06
05-31-2004, 10:00 AM
Anyone know how I would make a link extractor that only extracts internal links on a site I specify?
|
||||
Link ExtractorDarren06 05-31-2004, 10:00 AM Anyone know how I would make a link extractor that only extracts internal links on a site I specify? Ökii 05-31-2004, 10:46 AM The following code is a basic reciprocal tester, that scampers across a given site searching for a return link. The object (herein named $a) will hold all the links found on the site within some output arrays - the index named ['all_urls'] should be the onsite ones - anyway, hack around with the code until it does what you need.
<html>
<head>
<title>reciprocal test</title>
</head>
<body>
<?php
/**
 * Reciprocal-link tester.
 *
 * Starting at $site_base, fetches pages breadth-first (one "depth" level at a
 * time), collecting on-site links from <a href> and <frame src> tags, until a
 * link containing $return_link is found or the depth / page limits are hit.
 *
 * Results:
 *  - $all_urls       url => 1 for every page that was fetched (0 for the seed
 *                    until it has been fetched)
 *  - $page_urls      depth => array(url => page the url was found on)
 *  - getReturnData() summary of the return-link search
 */
class reciprocal
{
    var $max_depth;
    var $site_base;
    var $ret_link;
    var $depth;
    var $page_urls;
    var $all_urls;
    var $found_the_link;
    var $max_pages;
    var $count_pages;
    var $bad_things;

    /**
     * Configure the tester and run the crawl immediately.
     *
     * Uses __construct rather than a method named after the class: PHP 8 no
     * longer treats the latter as a constructor, so the crawl would never run.
     *
     * @param string $site_base   start URL; also the prefix that marks an absolute URL as on-site
     * @param string $return_link text to look for inside off-site hrefs
     * @param int    $max_depth   how many link levels below the start page to follow
     * @param int    $max_pages   maximum number of pages to fetch
     */
    function __construct($site_base, $return_link = 'http://www.teckis.com', $max_depth = 4, $max_pages = 150)
    {
        $this->max_depth = $max_depth + 1;
        $this->site_base = (substr($site_base, -1) == '/') ? $site_base : $site_base . '/';
        $this->ret_link = $return_link;
        $this->depth = 0;
        $this->page_urls = array();
        $this->all_urls = array();
        $this->page_urls[0][$site_base] = 0;
        $this->all_urls[$site_base] = 0;
        $this->found_the_link = array('found' => 'no', 'num_pages' => 0, 'on_page' => '', 'ret_href' => '');
        $this->max_pages = $max_pages;
        $this->count_pages = 0;
        $this->bad_things = array('.exe', '.jpg', '.jpeg', '.gif', '.png', '.mov', '.zip', '.gz', '.wmv', '.pdf', '.doc');

        $this->crawl();
        $this->found_the_link['num_pages'] = $this->count_pages;
    }

    /**
     * Walk the queued pages one depth level at a time.
     */
    function crawl()
    {
        while ($this->depth < $this->max_depth) {
            // a level may have no queued pages at all, so test before iterating
            if (!empty($this->page_urls[$this->depth]) && is_array($this->page_urls[$this->depth])) {
                foreach ($this->page_urls[$this->depth] as $test_page => $found_on) {
                    // checkSite() pushes depth past max_depth once the link is found
                    if ($this->depth >= $this->max_depth) {
                        break;
                    }
                    // strict limit: never fetch more than max_pages pages
                    if ($this->count_pages >= $this->max_pages) {
                        break;
                    }
                    if (isset($this->all_urls[$test_page]) && $this->all_urls[$test_page] == 1) {
                        continue; // already fetched
                    }
                    if ($this->isCrawlable($test_page)) {
                        $this->checkSite($test_page);
                    }
                }
            }
            ++$this->depth;
        }
    }

    /**
     * True when $url is an http(s) URL whose path does not end in one of the
     * blacklisted binary/media extensions.
     */
    function isCrawlable($url)
    {
        if (!$this->isHttpUrl($url)) {
            return false;
        }
        // test the extension on the path only, so "www.docs.com" is not mistaken for ".doc"
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path)) {
            return true;
        }
        $path = strtolower($path);
        foreach ($this->bad_things as $bad_ext) {
            if (substr($path, -strlen($bad_ext)) === $bad_ext) {
                return false;
            }
        }
        return true;
    }

    /**
     * True when $url starts with http:// or https://.
     */
    function isHttpUrl($url)
    {
        return preg_match('~^https?://~i', $url) === 1;
    }

    /**
     * Fetch one page and queue the links found on it.
     *
     * @param string $url_to_check absolute http(s) URL of the page to fetch
     */
    function checkSite($url_to_check = '')
    {
        // SECURITY: file() also opens local paths and other stream wrappers;
        // the URL ultimately comes from user input, so only allow http(s).
        if (!$this->isHttpUrl($url_to_check)) {
            return;
        }
        ++$this->count_pages;
        $this->all_urls[$url_to_check] = 1;

        // dead links are expected while crawling: suppress the warning and
        // treat a failed fetch as a page without links
        $lines = @file($url_to_check);
        if (!is_array($lines)) {
            return;
        }
        $this->parsePage($lines, $url_to_check);
    }

    /**
     * Scan the lines of a fetched page for <base>, <a> and <frame> tags.
     *
     * @param array  $lines    page content, one line per element
     * @param string $page_url URL the content was fetched from
     */
    function parsePage($lines, $page_url)
    {
        $base_href = $this->baseFolder($page_url);
        $done_base = false;

        foreach ($lines as $line) {
            if (!$done_base && stripos($line, '<base') !== false) {
                // a <base tag found - see if it carries an href
                $found = $this->extractAttr('href', $line);
                if (isset($found[0]) && $found[0] != '') {
                    $base_href = $found[0] . ((substr($found[0], -1) == '/') ? '' : '/');
                }
                $done_base = true;
            }
            if (stripos($line, '<a ') !== false) {
                foreach ($this->extractAttr('href', $line) as $href) {
                    $this->handleLink($href, $base_href, $page_url, true);
                }
            }
            if (stripos($line, '<frame ') !== false) {
                foreach ($this->extractAttr('src', $line) as $src) {
                    // frames are only followed, never tested for the return link
                    $this->handleLink($src, $base_href, $page_url, false);
                }
            }
        }
    }

    /**
     * Return every value of the attribute $attr ('href' or 'src') on $line.
     */
    function extractAttr($attr, $line)
    {
        $found = array();
        preg_match_all('|' . $attr . '=[\'"]?([^"\' >]+)|i', $line, $found);
        return isset($found[1]) ? $found[1] : array();
    }

    /**
     * Turn a page URL into its containing folder, with a trailing slash.
     */
    function baseFolder($url)
    {
        if (substr($url, -1) == '/') {
            return $url;
        }
        $bits = explode('/', $url);
        $num_bits = count($bits);
        // strpos() returns false (never -1) when there is no dot in the last segment
        if ($num_bits < 4 || strpos($bits[$num_bits - 1], '.') === false) {
            // root directory or folder
            return $url . '/';
        }
        // tis a file we thinks
        return substr($url, 0, strrpos($url, '/') + 1);
    }

    /**
     * Classify one link: queue it when it is on-site, or, when
     * $check_return is set, test an off-site link for the return link.
     */
    function handleLink($href, $base_href, $page_url, $check_return)
    {
        if (substr($href, 0, 2) == '//') {
            // protocol-relative url: borrow the scheme of the current page
            $m = array();
            $scheme = preg_match('~^(https?):~i', $base_href, $m) ? $m[1] : 'http';
            $href = $scheme . ':' . $href;
        }

        $has_scheme = preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href) === 1;

        if (!$has_scheme && substr($href, 0, 1) != '#') {
            // we have a relative path url :o| so add path
            if (substr($href, 0, 1) == '/') {
                // root-relative: resolve against the host, not the current folder
                $m = array();
                $root = preg_match('~^(https?://[^/?#]+)~i', $base_href, $m) ? $m[1] . '/' : $base_href;
                $this->queueUrl($root . substr($href, 1), $page_url);
            } else {
                $this->queueUrl($base_href . $href, $page_url);
            }
        } elseif (substr($href, 0, strlen($this->site_base)) == $this->site_base) {
            // we have a full on site http url :)
            $this->queueUrl($href, $page_url);
        } elseif ($check_return && $this->ret_link !== '' && strpos($href, $this->ret_link) !== false) {
            // an off site link that contains the return link
            $this->found_the_link['found'] = 'yes';
            $this->found_the_link['on_page'] = $page_url;
            $this->found_the_link['ret_href'] = $href;
            // jump past max_depth so crawl() stops
            $this->depth += $this->max_depth;
        }
    }

    /**
     * Queue $url for the next depth level unless it is already known.
     */
    function queueUrl($url, $found_on)
    {
        if (!isset($this->all_urls[$url])) {
            $this->page_urls[$this->depth + 1][$url] = $found_on;
        }
    }

    /**
     * @return array keys 'found' ('yes'/'no'), 'num_pages', 'on_page', 'ret_href'
     */
    function getReturnData()
    {
        return $this->found_the_link;
    }
}

if (isset($_POST['url']) && is_string($_POST['url']) && $_POST['url'] != '') {
    $checkfor = (isset($_POST['findme']) && is_string($_POST['findme']) && $_POST['findme'] != '')
        ? $_POST['findme']
        : 'http://www.teckis.com';
    $a = new reciprocal($_POST['url'], $checkfor, 3, 150);
    $b = $a->getReturnData();
    // everything echoed below is user- or remote-supplied, so escape it for HTML
    if ($b['found'] == 'yes') {
        echo 'A link with href ' . htmlspecialchars($b['ret_href'], ENT_QUOTES)
            . ' was found on page ' . htmlspecialchars($b['on_page'], ENT_QUOTES)
            . ' after testing ' . $b['num_pages'] . ' pages.';
    } else {
        echo 'No links containing the text ' . htmlspecialchars($checkfor, ENT_QUOTES)
            . ' were found on the site ' . htmlspecialchars($_POST['url'], ENT_QUOTES)
            . ' within a ' . $b['num_pages'] . ' page test.';
    }
    echo '<pre>';
    echo htmlspecialchars(print_r($a, true), ENT_QUOTES);
    echo '</pre>';
}
?>
<br /><br />
<form method="post" action="<?php echo htmlspecialchars(isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '', ENT_QUOTES); ?>">
Domain to test: <input type="text" name="url"><br />
Link to look for: <input type="text" name="findme"><br />
<input type="submit" value="find the link">
</form>
</body>
</html>
Darren06 05-31-2004, 12:14 PM Thanks, Ill try it. Darren06 06-01-2004, 10:52 PM Thanks for the code but how would I do it where I didnt look for anything but just specified the page and it display all links on that site? And the loop for each finding where would that be? firepages 06-01-2004, 11:51 PM the snoopy (http://snoppy.sourceforge.net) class has a fetchlinks() method , e.g. just fetch the links from within a specified page. 
not sure if it digs into frames like Mark's code though! |
| |||
EZ Archive Ads Plugin for vBulletin Copyright 2006 Computer Help Forum