rswyatt
05-24-2004, 08:28 PM
Alright,
I'm now using the curl_* set of functions to do my grabbing. The problem I'm having with the script is with sites that require a POST request. Here is an example:
// ---------------------------------------------------------------
// Scraper settings. Each *_IMMEDIATELY_BEFORE / *_IMMEDIATELY_AFTER pair is
// the literal text passed to get_between_text() as the left / right marker
// of the value to cut out of a fetched page.
// ---------------------------------------------------------------
$PRODUCT_PAGE_IMMEDIATELY_BEFORE = '/MyAccount/ViewProduct.asp?SID=1&Product_ID=';
$PRODUCT_PAGE_IMMEDIATELY_AFTER = '&TabID=1';
$IMAGE_URL_IMMEDIATELY_BEFORE = '/Images/Products/';
$IMAGE_URL_IMMEDIATELY_AFTER = '.gif" ALT="Product Image"';
// NOTE(review): the description markers are identical to the image markers,
// so both get_between_text() calls below receive the same arguments.
// Presumably a placeholder that still needs customizing -- confirm.
$DESCRIPTION_IMMEDIATELY_BEFORE = '/Images/Products/';
$DESCRIPTION_IMMEDIATELY_AFTER = '.gif" ALT="Product Image"';
$DEBUG = 1; // non-zero: print diagnostics and the generated SQL
$PUBLISHER = 'Ignatius Press';
////////////////////////////////////////////////////////////////
$body = '';
// Up to five products of this publisher that have no description yet.
$r = mysql_query("SELECT * from products where publisher like '$PUBLISHER' AND (descr = '' OR descr is NULL) LIMIT 5");
while ($row = mysql_fetch_array($r)) {
    // Search term: the part of the product name before the first ':'.
    $name_parts = explode(":", $row['name']);
    $name = $name_parts[0];

    // Build the form body. Every value is URL-encoded so that '%', ',', '&'
    // and other special characters in a product name cannot corrupt the
    // request (urlencode() also turns spaces into '+').
    $url = 'http://www.ignatius.com/Search.asp?';
    $fields = array(
        'CategoryType'    => '%', // presumably a match-all wildcard -- confirm
        'SearchCriteria'  => $name,
        'Page'            => '1',
        'ProductsPerPage' => '25',
        'SearchBy'        => 'P.Name,P.FullDescription,P.SKU,P.ISBN'
    );
    $pairs = array();
    foreach ($fields as $key => $value) {
        $pairs[] = $key . '=' . urlencode($value);
    }
    $params = implode('&', $pairs);

    // One cURL handle per product; it is reused for both requests below.
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);                  // search page
    curl_setopt($ch, CURLOPT_POST, 1);                    // submit as a form POST
    curl_setopt($ch, CURLOPT_POSTFIELDS, $params);        // encoded form body
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.01; Windows NT 5.1)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);          // return the body instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);          // follow redirects
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);          // check the certificate host name
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);      // certificate chain verification is DISABLED
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'my_cookies.txt'); // cookies are saved here when the handle is closed

    // The search is submitted twice and the first response is discarded.
    // NOTE(review): presumably the first request only primes the session
    // cookie -- confirm; if not needed, drop this call.
    curl_exec($ch);
    $page = curl_exec($ch);
    if ($page === false) {
        if ($DEBUG) {
            echo "<b>cURL error:</b> " . curl_error($ch) . "<br>\n";
        }
        curl_close($ch);
        continue;
    }

    // Put the page in a single line.
    $page = str_replace(array("\n", "\r", "\t"), '', $page);
    echo $page."<hr>";

    // Make sure it found the product
    // Customize this
    if (strpos($page, "No records were found for your search") !== false) {
        curl_close($ch); // release the handle before skipping this product
        continue;
    }

    $product_page_url = get_between_text ($page,$PRODUCT_PAGE_IMMEDIATELY_BEFORE,$PRODUCT_PAGE_IMMEDIATELY_AFTER);
    echo $product_page_url."<br>";
    // Customize this
    $product_page_url = "http://www.ignatius.com/MyAccount/ViewProduct.asp?SID=1&Product_ID=".$product_page_url."&TabID=1";
    echo "<br><br><b>URL:</b> $product_page_url";

    // The handle still has POST + POSTFIELDS set from the search; switch it
    // back to GET so the product page is not requested as a form submission.
    // All other options (user agent, cookies, SSL) persist on the handle.
    curl_setopt($ch, CURLOPT_HTTPGET, 1);
    curl_setopt($ch, CURLOPT_URL, $product_page_url);     // product detail page
    $page = curl_exec($ch);
    if ($page === false) {
        if ($DEBUG) {
            echo "<b>cURL error:</b> " . curl_error($ch) . "<br>\n";
        }
        curl_close($ch);
        continue;
    }
    curl_close($ch); // close the connection

    // Put the page in a single line.
    $page = str_replace(array("\n", "\r", "\t"), '', $page);

    $description = get_between_text ($page,$DESCRIPTION_IMMEDIATELY_BEFORE,$DESCRIPTION_IMMEDIATELY_AFTER);
    $image_url = get_between_text ($page,$IMAGE_URL_IMMEDIATELY_BEFORE,$IMAGE_URL_IMMEDIATELY_AFTER);
    // Customize this
    //$image_url = 'http://tyndalebooksellers.com/images/119_w/'.$image_url;
    $image_url = "http://www.ignatius.com/Images/Products/".$image_url.".gif";
    if ($DEBUG) {
        echo "$page<hr>\n\n\n";
        echo "<b>Description:</b> $description<br>\n\n<b>Image URL:</b> $image_url<br><br>\n\n\n";
    }

    list($image,$thumb) = get_images($image_url,$row['pid']);
    $sql = 'update products set '
        . 'thumb=' . db_prep($thumb)
        . ',image=' . db_prep($image)
        . ',descr=' . db_prep($description)
        . ' where pid=' . db_prep($row['pid']);
    if ($DEBUG) {
        echo "<b>SQL:</b> $sql\n\n";
    } else {
        // The UPDATE is currently disabled, so nothing is written to the
        // database even when $DEBUG is off.
        //d_mysql_query($sql);
    }
}
I'm now using the curl_* set of functions to do my grabbing. The problem I'm having with the script is with sites that require a POST request. Here is an example:
// ---------------------------------------------------------------
// Scraper settings. Each *_IMMEDIATELY_BEFORE / *_IMMEDIATELY_AFTER pair is
// the literal text passed to get_between_text() as the left / right marker
// of the value to cut out of a fetched page.
// ---------------------------------------------------------------
$PRODUCT_PAGE_IMMEDIATELY_BEFORE = '/MyAccount/ViewProduct.asp?SID=1&Product_ID=';
$PRODUCT_PAGE_IMMEDIATELY_AFTER = '&TabID=1';
$IMAGE_URL_IMMEDIATELY_BEFORE = '/Images/Products/';
$IMAGE_URL_IMMEDIATELY_AFTER = '.gif" ALT="Product Image"';
// NOTE(review): the description markers are identical to the image markers,
// so both get_between_text() calls below receive the same arguments.
// Presumably a placeholder that still needs customizing -- confirm.
$DESCRIPTION_IMMEDIATELY_BEFORE = '/Images/Products/';
$DESCRIPTION_IMMEDIATELY_AFTER = '.gif" ALT="Product Image"';
$DEBUG = 1; // non-zero: print diagnostics and the generated SQL
$PUBLISHER = 'Ignatius Press';
////////////////////////////////////////////////////////////////
$body = '';
// Up to five products of this publisher that have no description yet.
$r = mysql_query("SELECT * from products where publisher like '$PUBLISHER' AND (descr = '' OR descr is NULL) LIMIT 5");
while ($row = mysql_fetch_array($r)) {
    // Search term: the part of the product name before the first ':'.
    $name_parts = explode(":", $row['name']);
    $name = $name_parts[0];

    // Build the form body. Every value is URL-encoded so that '%', ',', '&'
    // and other special characters in a product name cannot corrupt the
    // request (urlencode() also turns spaces into '+').
    $url = 'http://www.ignatius.com/Search.asp?';
    $fields = array(
        'CategoryType'    => '%', // presumably a match-all wildcard -- confirm
        'SearchCriteria'  => $name,
        'Page'            => '1',
        'ProductsPerPage' => '25',
        'SearchBy'        => 'P.Name,P.FullDescription,P.SKU,P.ISBN'
    );
    $pairs = array();
    foreach ($fields as $key => $value) {
        $pairs[] = $key . '=' . urlencode($value);
    }
    $params = implode('&', $pairs);

    // One cURL handle per product; it is reused for both requests below.
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);                  // search page
    curl_setopt($ch, CURLOPT_POST, 1);                    // submit as a form POST
    curl_setopt($ch, CURLOPT_POSTFIELDS, $params);        // encoded form body
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.01; Windows NT 5.1)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);          // return the body instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);          // follow redirects
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);          // check the certificate host name
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);      // certificate chain verification is DISABLED
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'my_cookies.txt'); // cookies are saved here when the handle is closed

    // The search is submitted twice and the first response is discarded.
    // NOTE(review): presumably the first request only primes the session
    // cookie -- confirm; if not needed, drop this call.
    curl_exec($ch);
    $page = curl_exec($ch);
    if ($page === false) {
        if ($DEBUG) {
            echo "<b>cURL error:</b> " . curl_error($ch) . "<br>\n";
        }
        curl_close($ch);
        continue;
    }

    // Put the page in a single line.
    $page = str_replace(array("\n", "\r", "\t"), '', $page);
    echo $page."<hr>";

    // Make sure it found the product
    // Customize this
    if (strpos($page, "No records were found for your search") !== false) {
        curl_close($ch); // release the handle before skipping this product
        continue;
    }

    $product_page_url = get_between_text ($page,$PRODUCT_PAGE_IMMEDIATELY_BEFORE,$PRODUCT_PAGE_IMMEDIATELY_AFTER);
    echo $product_page_url."<br>";
    // Customize this
    $product_page_url = "http://www.ignatius.com/MyAccount/ViewProduct.asp?SID=1&Product_ID=".$product_page_url."&TabID=1";
    echo "<br><br><b>URL:</b> $product_page_url";

    // The handle still has POST + POSTFIELDS set from the search; switch it
    // back to GET so the product page is not requested as a form submission.
    // All other options (user agent, cookies, SSL) persist on the handle.
    curl_setopt($ch, CURLOPT_HTTPGET, 1);
    curl_setopt($ch, CURLOPT_URL, $product_page_url);     // product detail page
    $page = curl_exec($ch);
    if ($page === false) {
        if ($DEBUG) {
            echo "<b>cURL error:</b> " . curl_error($ch) . "<br>\n";
        }
        curl_close($ch);
        continue;
    }
    curl_close($ch); // close the connection

    // Put the page in a single line.
    $page = str_replace(array("\n", "\r", "\t"), '', $page);

    $description = get_between_text ($page,$DESCRIPTION_IMMEDIATELY_BEFORE,$DESCRIPTION_IMMEDIATELY_AFTER);
    $image_url = get_between_text ($page,$IMAGE_URL_IMMEDIATELY_BEFORE,$IMAGE_URL_IMMEDIATELY_AFTER);
    // Customize this
    //$image_url = 'http://tyndalebooksellers.com/images/119_w/'.$image_url;
    $image_url = "http://www.ignatius.com/Images/Products/".$image_url.".gif";
    if ($DEBUG) {
        echo "$page<hr>\n\n\n";
        echo "<b>Description:</b> $description<br>\n\n<b>Image URL:</b> $image_url<br><br>\n\n\n";
    }

    list($image,$thumb) = get_images($image_url,$row['pid']);
    $sql = 'update products set '
        . 'thumb=' . db_prep($thumb)
        . ',image=' . db_prep($image)
        . ',descr=' . db_prep($description)
        . ' where pid=' . db_prep($row['pid']);
    if ($DEBUG) {
        echo "<b>SQL:</b> $sql\n\n";
    } else {
        // The UPDATE is currently disabled, so nothing is written to the
        // database even when $DEBUG is off.
        //d_mysql_query($sql);
    }
}