diff --git a/lib/OpenDbSnoopy.class.php b/lib/OpenDbSnoopy.class.php index 561a6538..e8a0fcaf 100644 --- a/lib/OpenDbSnoopy.class.php +++ b/lib/OpenDbSnoopy.class.php @@ -37,15 +37,19 @@ class OpenDbSnoopy extends Snoopy { function OpenDbSnoopy($debug = FALSE) { // if file cache table is not installed, we cannot use file cache. $this->_file_cache_enabled = get_opendb_config_var ( 'http.cache', 'enable' ); - + // override user agent. - $this->agent = 'Mozilla/5.0 (X11; CentOS) Gecko/20100101 Firefox/50.0'; - + if (isset($_SERVER['HTTP_USER_AGENT'])) { + $this->agent = $_SERVER['HTTP_USER_AGENT']; + } else { + $this->agent = 'Mozilla/5.0 (X11; CentOS) Gecko/20100101 Firefox/75.0'; + } + // in how many cases is this going to work? $this->passcookies = FALSE; - + $this->_debug = $debug; - + $proxy_server_config_r = get_opendb_config_var ( 'http.proxy_server' ); if ($proxy_server_config_r ['enable'] == TRUE) { $this->proxy_host = $proxy_server_config_r ['host']; @@ -53,7 +57,7 @@ function OpenDbSnoopy($debug = FALSE) { $this->proxy_user = $proxy_server_config_r ['userid']; $this->proxy_pass = $proxy_server_config_r ['password']; } - + // the default curl path for snoopy is /usr/local/bin/curl - often however, it will reside in another path if(!empty($this->curl_path) || !@is_executable($this->curl_path)) { $curlpaths = array(); // variable for test-paths diff --git a/lib/Snoopy.class.php b/lib/Snoopy.class.php index fb54a082..a2995672 100644 --- a/lib/Snoopy.class.php +++ b/lib/Snoopy.class.php @@ -557,7 +557,7 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") if (empty ( $url )) $url = "/"; // GET ... header not needed for curl - //$headers[] = $http_method." ".$url." ".$this->_httpversion; + //$headers[] = $http_method." ".$url." ".$this->_httpversion; if (! 
empty ( $this->agent )) curl_setopt($curl, CURLOPT_USERAGENT, $this->agent); @@ -621,6 +621,8 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") if (! empty ( $this->user ) || ! empty ( $this->pass )) $headers [] = "Authorization: BASIC " . base64_encode ( $this->user . ":" . $this->pass ); + $headers[] = "Pragma: "; + curl_setopt($curl, CURLOPT_HTTPHEADER, $headers); if (! empty ( $body )) { @@ -636,6 +638,7 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0); curl_setopt($curl, CURLOPT_AUTOREFERER, 1); curl_setopt($curl, CURLOPT_HEADER, 0); // don't include headers in output + curl_setopt($curl, CURLOPT_ENCODING, "gzip,deflate"); $results = curl_exec($curl); $status = curl_getinfo($curl); diff --git a/lib/site/amazon.class.php b/lib/site/amazon.class.php index 4aaa2a7f..be6c2a6b 100644 --- a/lib/site/amazon.class.php +++ b/lib/site/amazon.class.php @@ -1,782 +1,422 @@ array('asinId' => 'amazonasin', 'url' => 'www.amazon.com'), - 'amazonuk' => array('asinId' => 'amazukasin', 'url' => 'www.amazon.co.uk'), - 'amazonfr' => array('asinId' => 'amazfrasin', 'url' => 'www.amazon.fr'), - 'amazonde' => array('asinId' => 'amazdeasin', 'url' => 'www.amazon.de') - ); - - function amazon($site_type) { - parent::SitePlugin($site_type); - - $this->asinId = $this->sites[$site_type]['asinId']; - $this->url = $this->sites[$site_type]['url']; - $this->_httpClient->agent = 'Mozilla/5.0 (X11; OpenDB) Gecko/20100101 Firefox/50.0'; - } - - function queryListing($page_no, $items_per_page, $offset, $s_item_type, $search_vars_r) { - if (strlen($search_vars_r[$this->asinId]) > 0) { - $this->addListingRow(NULL, NULL, NULL, array($this->asinId => $search_vars_r[$this->asinId])); - return TRUE; - } else { - 
//http://www.amazon.com/s/ref=sr_nr_p_n_format_browse-bi_mrr_0?rh=i%3Advd%2Ck%3Aguard%2Cp_n_format_browse-bin%3A2650304011&sort=movies-tv&keywords=guard&ie=UTF8&qid=1410661852&rnid=2650303011 - //http://www.amazon.com/s/ref=sr_nr_p_n_format_browse-bi_mrr_3?rh=i%3Advd%2Ck%3Aguard%2Cp_n_format_browse-bin%3A2650305011&sort=movies-tv&keywords=guard&ie=UTF8&qid=1410661852&rnid=2650303011 - // Get the mapped AMAZON index type - $index_type = ifempty($this->getConfigValue('item_type_to_index_map', $s_item_type), strtolower($s_item_type)); - - // amazon does not provide the ability to specify how many items per page, so $items_per_page is ignored! - $queryUrl = "https://" . $this->url . "/exec/obidos/external-search?index=" . $index_type . "&keyword=" . urlencode($search_vars_r['title']) . "&page=$page_no"; - - $pageBuffer = $this->fetchURI($queryUrl); - } - - if (strlen($pageBuffer) > 0) { - $amazonasin = FALSE; - - //
  • ISBN-10: 0812929985
  • - // - // check for an exact match, but not if this is second page of listings or more - if (!$this->isPreviousPage()) { - if (preg_match("/ASIN: (\w{10})<\/font>/", $pageBuffer, $regs)) { - $amazonasin = trim($regs[1]); - } else if (preg_match("/ASIN: (\w{10})/", strip_tags($pageBuffer), $regs)) { - $amazonasin = trim($regs[1]); - } else if (preg_match("!
  • ISBN-10:\s*([0-9]+)
  • !", $pageBuffer, $regs)) { // for books, ASIN is the same as ISBN - $amazonasin = trim($regs[1]); - } else if (preg_match_all("!
    ]*?name=\"([^\"])\"!", $pageBuffer, $regs)) { - if (count($regs[0]) == 1) { - $amazonasin = trim($regs[1]); - } - } else if (preg_match_all("!
    addListingRow(NULL, NULL, NULL, array($this->asinId => $amazonasin, 'search.title' => $search_vars_r['title'])); - - return TRUE; - } else { - // this is a severe memory hog!!! - $pageBuffer = preg_replace('/[\r\n]+/', ' ', $pageBuffer); - - //
    Showing 1 - 12 of 55 Results
    || class="resultCount">Showing 1 Result1-24 von 194 Ergebnissen - if ((preg_match("/ id=\"resultCount\">.*?.*?.[0-9]+[\s]+?-[\s]+?[0-9]+.*?([0-9,]+).*?<\//", $pageBuffer, $regs) || - preg_match("/ id=\"resultCount\">.*?.*?.([0-9]+).*?<\//", $pageBuffer, $regs) || - preg_match("/ id=.s-result-count.*?([0-9,]+) results? for/", $pageBuffer, $regs) )) { - // need to remove the commas from the total - $total = str_replace(",", "", $regs[1]); - - // store total count here. - $this->setTotalCount($total); - - // 2 = img, 1 = href, 3 = title - if (preg_match_all("/id=\"result_.*?href=\"(.*?)\">.*?(.*?)<\/a/i", $pageBuffer, $matches)) { - for ($i = 0; $i < count($matches[0]); $i++) { - - $imageuri = preg_replace('!(\/[^.]+\.)_[^.]+_\.!', "$1", $matches[2][$i]); - - if (preg_match("!/dp/([^/]+)/!", $matches[1][$i], $regs)) { - if (strpos($matches[2][$i], "no-img") !== FALSE) - $matches[2][$i] = NULL; - - if (!preg_match("!
    ]*/dp/".$regs[1]."/!i", $pageBuffer, $newregs)) { - $this->addListingRow($matches[3][$i], $imageuri, NULL, array($this->asinId => $regs[1], 'search.title' => $search_vars_r['title'])); - } - } - } - } - } - } - - //default - return TRUE; - } else { - return FALSE; - } - } - - function queryItem($search_attributes_r, $s_item_type) { - // assumes we have an exact match here - $pageBuffer = $this->fetchURI("https://" . $this->url . "/gp/product/" . $search_attributes_r[$this->asinId]); - - // no sense going any further here. - if (strlen($pageBuffer) == 0) - return FALSE; - - $pageBuffer = preg_replace('/[\r\n]+/', ' ', $pageBuffer); - $pageBuffer = preg_replace('/>[\s]*<', $pageBuffer); - - //Prometheus (Blu-ray/ DVD + Digital Copy) (2012) - //Homeland: The Dark Elf Trilogy, Part 1 (Forgotten Realms: The Legend of Drizzt, Book I) (Bk. 1) [Mass Market Paperback] Illustration School: Let's Draw Happy People Hardcover - //Men in Black 3 [Blu-ray] - if (preg_match("/]*>([^<]+)<\/?span/s", $pageBuffer, $regs) || - preg_match("/]*?>([^<]+)<\/span/s", $pageBuffer, $regs) || - //

    ... - preg_match("/]*?id=\"title\"[^>]*>([^<]+)([^<]+)<\/b>/s", $pageBuffer, $regs) || - preg_match("/([^<]+)