...

View Full Version : Link Extractor



Darren06
05-31-2004, 10:00 AM
Does anyone know how I would make a link extractor that only extracts the internal links on a site I specify?

Íkii
05-31-2004, 10:46 AM
The following code is a basic reciprocal-link tester that scampers across a given site searching for a return link.
The object (herein named $a) will hold all the links found on the site within some output arrays - the index named ['all_urls'] should be the onsite ones - anyway, hack around with the code until it does what you need.



<html>
<head>
<title>reciprocal test</title>
</head>
<body>
<?php

/**
 * Reciprocal-link tester: crawls a site level by level, looking for a link
 * back to a given address.
 *
 * Constructing the object runs the whole crawl. Afterwards:
 *  - getReturnData() reports whether and where the return link was found;
 *  - $page_urls holds, per depth level, the on-site URLs that were discovered;
 *  - $all_urls holds the pages that were fetched.
 *
 * Per-page scratch values live in local variables, so dumping the object
 * does not expose the raw contents of fetched pages.
 */
class reciprocal
{
    /** @var int Number of depth levels to crawl: the requested depth plus one for the start page. */
    public $max_depth;

    /** @var string Site base with a trailing slash; absolute links must start with it to count as on-site. */
    public $site_base;

    /** @var string Text searched for inside off-site hrefs. */
    public $ret_link;

    /** @var int Depth level currently being crawled; pushed past $max_depth once the return link is found. */
    public $depth = 0;

    /** @var array Discovered on-site URLs: depth => (url => page the link was found on). */
    public $page_urls = array();

    /** @var array Handled URLs: url => 1 once fetched (the start URL is registered with 0 first). */
    public $all_urls = array();

    /** @var array Result record returned by getReturnData(). */
    public $found_the_link = array('found' => 'no', 'num_pages' => 0, 'on_page' => '', 'ret_href' => '');

    /** @var int Maximum number of pages to fetch. */
    public $max_pages;

    /** @var int Number of pages fetched (or attempted) so far. */
    public $count_pages = 0;

    /** @var array Lower-case file extensions (with leading dot) that are never fetched. */
    public $bad_things = array('.exe', '.jpg', '.jpeg', '.gif', '.png', '.mov', '.zip', '.gz', '.wmv', '.pdf', '.doc');

    /**
     * Crawls the site and records the outcome.
     *
     * Uses __construct rather than a method named after the class: PHP 8 no
     * longer treats the latter as a constructor, so the crawl would never run.
     *
     * @param string $site_base   Address to start from; also the prefix that defines "on-site".
     * @param string $return_link Text to look for inside off-site hrefs.
     * @param int    $max_depth   How many link levels below the start page to follow.
     * @param int    $max_pages   Maximum number of pages to fetch.
     */
    public function __construct($site_base, $return_link = 'http://www.teckis.com', $max_depth = 4, $max_pages = 150)
    {
        $this->max_depth = $max_depth + 1;
        $this->site_base = (substr($site_base, -1) === '/') ? $site_base : $site_base . '/';
        $this->ret_link = (string) $return_link;
        $this->max_pages = $max_pages;
        $this->page_urls[0][$site_base] = 0;
        $this->all_urls[$site_base] = 0;

        while ($this->depth < $this->max_depth) {
            // A level may be missing entirely when the previous one yielded no links.
            $level = isset($this->page_urls[$this->depth]) ? $this->page_urls[$this->depth] : array();
            foreach ($level as $test_page => $found_on) {
                // checkSite() pushes depth past max_depth once the link is found,
                // and both limits only ever grow, so stopping here is safe.
                if ($this->depth >= $this->max_depth || $this->count_pages >= $this->max_pages) {
                    break;
                }
                $test_page = (string) $test_page; // numeric-looking keys come back as ints
                $already_checked = isset($this->all_urls[$test_page]) && $this->all_urls[$test_page] === 1;
                if (!$already_checked && $this->isCrawlable($test_page)) {
                    $this->checkSite($test_page);
                }
            }
            ++$this->depth;
        }
        $this->found_the_link['num_pages'] = $this->count_pages;
    }

    /**
     * Fetches one page, queues its on-site links and frame sources for the
     * next depth level, and tests its off-site links against the return link.
     *
     * A page that cannot be fetched still counts towards the page total and
     * is marked as handled, but contributes no links.
     *
     * @param string $url_to_check Address of the page to fetch.
     * @return void
     */
    public function checkSite($url_to_check = '')
    {
        ++$this->count_pages;
        $page_url = (string) $url_to_check;
        $this->all_urls[$page_url] = 1;

        // file() throws on an empty path or a NUL byte in PHP 8, which @ cannot silence.
        if ($page_url === '' || strpos($page_url, "\0") !== false) {
            return;
        }
        // Best-effort fetch: unreachable pages are skipped rather than reported.
        $lines = @file($page_url);
        if ($lines === false) {
            return;
        }

        $base_href = $this->directoryOf($page_url);
        $done_base = false;

        foreach ($lines as $line) {
            // Only the first <base> tag on the page is honoured.
            if (!$done_base && stripos($line, '<base') !== false) {
                $base_values = $this->attributeValues('href', $line);
                if (isset($base_values[0]) && $base_values[0] !== '') {
                    $base_href = $base_values[0] . ((substr($base_values[0], -1) === '/') ? '' : '/');
                }
                $done_base = true;
            }

            if (stripos($line, '<a ') !== false) {
                foreach ($this->attributeValues('href', $line) as $href) {
                    $this->processLink($href, $base_href, $page_url, true);
                }
            }

            if (stripos($line, '<frame ') !== false) {
                foreach ($this->attributeValues('src', $line) as $src) {
                    // Frame sources are only followed, never tested as return links.
                    $this->processLink($src, $base_href, $page_url, false);
                }
            }
        }
    }

    /**
     * Returns the outcome of the crawl.
     *
     * @return array Keys: 'found' ('yes'|'no'), 'num_pages', 'on_page', 'ret_href'.
     */
    public function getReturnData()
    {
        return $this->found_the_link;
    }

    /**
     * Tells whether a URL should be fetched, judging by the extension of its
     * path only. Matching the extension anywhere in the URL would skip hosts
     * such as www.docs.com because they contain ".doc".
     */
    private function isCrawlable($url)
    {
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path) || $path === '') {
            return true;
        }
        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

        return $extension === '' || !in_array('.' . $extension, $this->bad_things, true);
    }

    /**
     * Returns the folder (with trailing slash) that relative links on the
     * given page resolve against.
     */
    private function directoryOf($url)
    {
        // A query string or fragment says nothing about the folder.
        $url = substr($url, 0, strcspn($url, '?#'));
        if (substr($url, -1) === '/') {
            return $url;
        }
        $segments = explode('/', $url);
        $last = $segments[count($segments) - 1];
        // Fewer than four segments means "scheme://host"; a last segment
        // without a dot is taken to be a folder name.
        if (count($segments) < 4 || strpos($last, '.') === false) {
            return $url . '/';
        }

        return substr($url, 0, strrpos($url, '/') + 1);
    }

    /**
     * Returns every value of the given attribute ('href' or 'src') on a line.
     */
    private function attributeValues($attribute, $line)
    {
        preg_match_all("|" . $attribute . "=['\"]?([^\"' >]+)|i", $line, $matches);

        return isset($matches[1]) ? $matches[1] : array();
    }

    /**
     * Classifies a raw href/src value.
     *
     * @return array Two elements: kind ('skip', 'onsite' or 'offsite') and the usable URL.
     */
    private function classifyLink($href, $base_href)
    {
        if ($href === '' || $href[0] === '#') {
            return array('skip', '');
        }
        if (substr($href, 0, 2) === '//') {
            // Protocol-relative link: borrow the scheme of the current base.
            $scheme = preg_match('~^([a-z][a-z0-9+.\-]*):~i', $base_href, $m) ? $m[1] : 'http';
            $href = $scheme . ':' . $href;
        }
        if (preg_match('~^https?://~i', $href)) {
            $on_site = strncmp($href, $this->site_base, strlen($this->site_base)) === 0;

            return array($on_site ? 'onsite' : 'offsite', $href);
        }
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
            // mailto:, javascript:, ftp: ... are neither pages to crawl nor return links.
            return array('skip', '');
        }
        if ($href[0] === '/') {
            // Root-relative link: resolve against the host, not the current folder.
            if (preg_match('~^[a-z][a-z0-9+.\-]*://[^/?#]+~i', $base_href, $m)) {
                return array('onsite', $m[0] . $href);
            }

            // The base has no host part (e.g. a local path): append to the base folder.
            return array('onsite', $base_href . substr($href, 1));
        }

        return array('onsite', $base_href . $href);
    }

    /**
     * Queues an on-site link for the next depth level or, when requested,
     * tests an off-site link against the return link.
     */
    private function processLink($href, $base_href, $page_url, $check_return_link)
    {
        list($kind, $url) = $this->classifyLink($href, $base_href);

        if ($kind === 'onsite') {
            // "page.html#top" and "page.html" are the same page.
            $url = substr($url, 0, strcspn($url, '#'));
            if (!isset($this->all_urls[$url])) {
                $this->page_urls[$this->depth + 1][$url] = $page_url;
            }
            return;
        }

        if ($kind === 'offsite' && $check_return_link
            && $this->ret_link !== '' && strpos($url, (string) $this->ret_link) !== false) {
            $this->found_the_link['found'] = 'yes';
            $this->found_the_link['on_page'] = $page_url;
            $this->found_the_link['ret_href'] = $url;
            // Push the depth past the limit so the crawl loop stops.
            $this->depth += $this->max_depth;
        }
    }
}


// Run the crawl only when the form was submitted with a non-empty URL.
if (isset($_POST['url']) && is_string($_POST['url']) && $_POST['url']) {
    $checkfor = (isset($_POST['findme']) && is_string($_POST['findme']) && $_POST['findme'] != '') ? $_POST['findme'] : 'http://www.teckis.com';
    $start_url = trim($_POST['url']);

    if (!preg_match('~^https?://~i', $start_url)) {
        // The crawler fetches pages with file(); without this check a visitor
        // could make it read local files on the server.
        echo 'Please enter the full address of the site to test, starting with http:// or https://';
    } else {
        $a = new reciprocal($start_url, $checkfor, 3, 150);
        $b = $a->getReturnData();

        // Everything echoed below comes from the visitor or from crawled
        // pages, so it is escaped before being written into the HTML.
        // ENT_SUBSTITUTE keeps output from pages that are not valid UTF-8.
        $flags = ENT_QUOTES | ENT_SUBSTITUTE;
        if ($b['found'] == 'yes') {
            echo 'A link with href ' . htmlspecialchars($b['ret_href'], $flags, 'UTF-8') . ' was found on page ' . htmlspecialchars($b['on_page'], $flags, 'UTF-8') . ' after testing ' . $b['num_pages'] . ' pages.';
        } else {
            echo 'No links containing the text ' . htmlspecialchars($checkfor, $flags, 'UTF-8') . ' were found on the site ' . htmlspecialchars($start_url, $flags, 'UTF-8') . ' within a ' . $b['num_pages'] . ' page test.';
        }

        // Debug dump of the crawler object (includes the discovered URLs).
        echo '<pre>';
        echo htmlspecialchars(print_r($a, true), $flags, 'UTF-8');
        echo '</pre>';
    }
}
?>
<br /><br />
<form method="post" action="<?php /* REQUEST_URI is visitor-controlled, so escape it before writing it into the attribute. */ echo htmlspecialchars($_SERVER['REQUEST_URI'], ENT_QUOTES, 'UTF-8'); ?>">
Domain to test: <input type="text" name="url"><br />
Link to look for: <input type="text" name="findme"><br />
<input type="submit" value="find the link">
</form>
</body>
</html>

Darren06
05-31-2004, 12:14 PM
Thanks, I'll try it.

Darren06
06-01-2004, 10:52 PM
Thanks for the code, but how would I do it if I didn't look for anything and just specified the page, and it displayed all the links on that site? And where would the loop for each link found be?

firepages
06-01-2004, 11:51 PM
The Snoopy (http://snoopy.sourceforge.net) class has a fetchlinks() method, i.e. it just fetches the links from within a specified page.

Not sure if it digs into frames like Mark's code, though!



EZ Archive Ads Plugin for vBulletin Copyright 2006 Computer Help Forum