From 7a8aac59b79d2f6a4cdfcc92a479560c2a1492d3 Mon Sep 17 00:00:00 2001 From: Nathaniel Clark Date: Sat, 8 Sep 2018 16:35:48 -0400 Subject: [PATCH] Site::Amazon: use DOM to scrape instead of regex This removes most of the awful regular expressions and uses a much cleaner DOM interface. All white-space to spaces. Books and Movies are parsed. Games and Music are not yet. Add ISBN if we can tell search string is one. Better byline processing. Add genre to games and music Use Client HTTP agent when issuing requests to providers Signed-off-by: Nathaniel Clark --- lib/OpenDbSnoopy.class.php | 16 +- lib/Snoopy.class.php | 5 +- lib/site/amazon.class.php | 1190 +++++++++++++----------------------- lib/site/amazonutils.php | 227 ++----- lib/site_plugin.php | 4 +- lib/utils.php | 22 + 6 files changed, 516 insertions(+), 948 deletions(-) diff --git a/lib/OpenDbSnoopy.class.php b/lib/OpenDbSnoopy.class.php index 561a6538..e8a0fcaf 100644 --- a/lib/OpenDbSnoopy.class.php +++ b/lib/OpenDbSnoopy.class.php @@ -37,15 +37,19 @@ class OpenDbSnoopy extends Snoopy { function OpenDbSnoopy($debug = FALSE) { // if file cache table is not installed, we cannot use file cache. $this->_file_cache_enabled = get_opendb_config_var ( 'http.cache', 'enable' ); - + // override user agent. - $this->agent = 'Mozilla/5.0 (X11; CentOS) Gecko/20100101 Firefox/50.0'; - + if (isset($_SERVER['HTTP_USER_AGENT'])) { + $this->agent = $_SERVER['HTTP_USER_AGENT']; + } else { + $this->agent = 'Mozilla/5.0 (X11; CentOS) Gecko/20100101 Firefox/75.0'; + } + // in how many cases is this going to work? 
$this->passcookies = FALSE; - + $this->_debug = $debug; - + $proxy_server_config_r = get_opendb_config_var ( 'http.proxy_server' ); if ($proxy_server_config_r ['enable'] == TRUE) { $this->proxy_host = $proxy_server_config_r ['host']; @@ -53,7 +57,7 @@ function OpenDbSnoopy($debug = FALSE) { $this->proxy_user = $proxy_server_config_r ['userid']; $this->proxy_pass = $proxy_server_config_r ['password']; } - + // the default curl path for snoopy is /usr/local/bin/curl - often however, it will reside in another path if(!empty($this->curl_path) || !@is_executable($this->curl_path)) { $curlpaths = array(); // variable for test-paths diff --git a/lib/Snoopy.class.php b/lib/Snoopy.class.php index fb54a082..a2995672 100644 --- a/lib/Snoopy.class.php +++ b/lib/Snoopy.class.php @@ -557,7 +557,7 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") if (empty ( $url )) $url = "/"; // GET ... header not needed for curl - //$headers[] = $http_method." ".$url." ".$this->_httpversion; + //$headers[] = $http_method." ".$url." ".$this->_httpversion; if (! empty ( $this->agent )) curl_setopt($curl, CURLOPT_USERAGENT, $this->agent); @@ -621,6 +621,8 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") if (! empty ( $this->user ) || ! empty ( $this->pass )) $headers [] = "Authorization: BASIC " . base64_encode ( $this->user . ":" . $this->pass ); + $header[] = "Pragma: "; + curl_setopt($curl, CURLOPT_HTTPHEADER, $headers); if (! 
empty ( $body )) { @@ -636,6 +638,7 @@ function _httprequest($url, $URI, $http_method, $content_type = "", $body = "") curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0); curl_setopt($curl, CURLOPT_AUTOREFERER, 1); curl_setopt($curl, CURLOPT_HEADER, 0); // don't include headers in output + curl_setopt($curl, CURLOPT_ENCODING, "gzip,deflate"); $results = curl_exec($curl); $status = curl_getinfo($curl); diff --git a/lib/site/amazon.class.php b/lib/site/amazon.class.php index 4aaa2a7f..be6c2a6b 100644 --- a/lib/site/amazon.class.php +++ b/lib/site/amazon.class.php @@ -1,782 +1,422 @@ array('asinId' => 'amazonasin', 'url' => 'www.amazon.com'), - 'amazonuk' => array('asinId' => 'amazukasin', 'url' => 'www.amazon.co.uk'), - 'amazonfr' => array('asinId' => 'amazfrasin', 'url' => 'www.amazon.fr'), - 'amazonde' => array('asinId' => 'amazdeasin', 'url' => 'www.amazon.de') - ); - - function amazon($site_type) { - parent::SitePlugin($site_type); - - $this->asinId = $this->sites[$site_type]['asinId']; - $this->url = $this->sites[$site_type]['url']; - $this->_httpClient->agent = 'Mozilla/5.0 (X11; OpenDB) Gecko/20100101 Firefox/50.0'; - } - - function queryListing($page_no, $items_per_page, $offset, $s_item_type, $search_vars_r) { - if (strlen($search_vars_r[$this->asinId]) > 0) { - $this->addListingRow(NULL, NULL, NULL, array($this->asinId => $search_vars_r[$this->asinId])); - return TRUE; - } else { - //http://www.amazon.com/s/ref=sr_nr_p_n_format_browse-bi_mrr_0?rh=i%3Advd%2Ck%3Aguard%2Cp_n_format_browse-bin%3A2650304011&sort=movies-tv&keywords=guard&ie=UTF8&qid=1410661852&rnid=2650303011 - //http://www.amazon.com/s/ref=sr_nr_p_n_format_browse-bi_mrr_3?rh=i%3Advd%2Ck%3Aguard%2Cp_n_format_browse-bin%3A2650305011&sort=movies-tv&keywords=guard&ie=UTF8&qid=1410661852&rnid=2650303011 - // Get the mapped AMAZON index type - $index_type = ifempty($this->getConfigValue('item_type_to_index_map', $s_item_type), strtolower($s_item_type)); - - // amazon does not provide the ability to specify 
how many items per page, so $items_per_page is ignored! - $queryUrl = "https://" . $this->url . "/exec/obidos/external-search?index=" . $index_type . "&keyword=" . urlencode($search_vars_r['title']) . "&page=$page_no"; - - $pageBuffer = $this->fetchURI($queryUrl); - } - - if (strlen($pageBuffer) > 0) { - $amazonasin = FALSE; - - //
  • ISBN-10: 0812929985
  • - // - // check for an exact match, but not if this is second page of listings or more - if (!$this->isPreviousPage()) { - if (preg_match("/ASIN: (\w{10})<\/font>/", $pageBuffer, $regs)) { - $amazonasin = trim($regs[1]); - } else if (preg_match("/ASIN: (\w{10})/", strip_tags($pageBuffer), $regs)) { - $amazonasin = trim($regs[1]); - } else if (preg_match("!
  • ISBN-10:\s*([0-9]+)
  • !", $pageBuffer, $regs)) { // for books, ASIN is the same as ISBN - $amazonasin = trim($regs[1]); - } else if (preg_match_all("!
    ]*?name=\"([^\"])\"!", $pageBuffer, $regs)) { - if (count($regs[0]) == 1) { - $amazonasin = trim($regs[1]); - } - } else if (preg_match_all("!
    addListingRow(NULL, NULL, NULL, array($this->asinId => $amazonasin, 'search.title' => $search_vars_r['title'])); - - return TRUE; - } else { - // this is a severe memory hog!!! - $pageBuffer = preg_replace('/[\r\n]+/', ' ', $pageBuffer); - - //
    Showing 1 - 12 of 55 Results
    || class="resultCount">Showing 1 Result1-24 von 194 Ergebnissen - if ((preg_match("/ id=\"resultCount\">.*?.*?.[0-9]+[\s]+?-[\s]+?[0-9]+.*?([0-9,]+).*?<\//", $pageBuffer, $regs) || - preg_match("/ id=\"resultCount\">.*?.*?.([0-9]+).*?<\//", $pageBuffer, $regs) || - preg_match("/ id=.s-result-count.*?([0-9,]+) results? for/", $pageBuffer, $regs) )) { - // need to remove the commas from the total - $total = str_replace(",", "", $regs[1]); - - // store total count here. - $this->setTotalCount($total); - - // 2 = img, 1 = href, 3 = title - if (preg_match_all("/id=\"result_.*?href=\"(.*?)\">.*?(.*?)<\/a/i", $pageBuffer, $matches)) { - for ($i = 0; $i < count($matches[0]); $i++) { - - $imageuri = preg_replace('!(\/[^.]+\.)_[^.]+_\.!', "$1", $matches[2][$i]); - - if (preg_match("!/dp/([^/]+)/!", $matches[1][$i], $regs)) { - if (strpos($matches[2][$i], "no-img") !== FALSE) - $matches[2][$i] = NULL; - - if (!preg_match("!
    ]*/dp/".$regs[1]."/!i", $pageBuffer, $newregs)) { - $this->addListingRow($matches[3][$i], $imageuri, NULL, array($this->asinId => $regs[1], 'search.title' => $search_vars_r['title'])); - } - } - } - } - } - } - - //default - return TRUE; - } else { - return FALSE; - } - } - - function queryItem($search_attributes_r, $s_item_type) { - // assumes we have an exact match here - $pageBuffer = $this->fetchURI("https://" . $this->url . "/gp/product/" . $search_attributes_r[$this->asinId]); - - // no sense going any further here. - if (strlen($pageBuffer) == 0) - return FALSE; - - $pageBuffer = preg_replace('/[\r\n]+/', ' ', $pageBuffer); - $pageBuffer = preg_replace('/>[\s]*<', $pageBuffer); - - //Prometheus (Blu-ray/ DVD + Digital Copy) (2012) - //Homeland: The Dark Elf Trilogy, Part 1 (Forgotten Realms: The Legend of Drizzt, Book I) (Bk. 1) [Mass Market Paperback] Illustration School: Let's Draw Happy People Hardcover - //Men in Black 3 [Blu-ray] - if (preg_match("/]*>([^<]+)<\/?span/s", $pageBuffer, $regs) || - preg_match("/]*?>([^<]+)<\/span/s", $pageBuffer, $regs) || - //

    ... - preg_match("/]*?id=\"title\"[^>]*>([^<]+)([^<]+)<\/b>/s", $pageBuffer, $regs) || - preg_match("/([^<]+)