From 11b4e9ae38cc258155ffece70bd767d514d75dc3 Mon Sep 17 00:00:00 2001
From: Sahil Sunny
Date: Mon, 5 Jan 2026 14:16:50 +0530
Subject: [PATCH 1/3] Added Remaining APIs

- Changed the request method so that it can work with all of our APIs
- Added all APIs. Now it supports the HTML API, Google Search API, Amazon
  Search API, Amazon Product API, Walmart Search API, Walmart Product API,
  ChatGPT API, YouTube Search API, YouTube Metadata API, YouTube Trainability
  API, and YouTube Transcript API
- Added a method to call our usage endpoint
- Created a new method for accessing our HTML API so that, in the future, we
  can deprecate the get and post methods
---
 CHANGELOG.md               |  41 ++-
 README.md                  | 314 +++++++++++++------
 manual-test.py             | 605 +++++++++++++++++++++++++++++++++++++
 scrapingbee/__version__.py |   2 +-
 scrapingbee/client.py      | 393 +++++++++++++++++++++---
 scrapingbee/utils.py       |  18 --
 tests/test_client.py       | 544 +++++++++++++++++++++++++++++++--
 tests/test_utils.py        |  16 +-
 8 files changed, 1747 insertions(+), 186 deletions(-)
 create mode 100644 manual-test.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c031bcc..ebd1f79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,44 @@
 # Changelog
 
+## [2.1.0](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.2...v2.1.0) (2026-01-05)
+
+### Features
+
+- Added `html_api()` method for HTML API with unified GET/POST support
+- Added `google_search()` method for Google Search API
+- Added `amazon_search()` method for Amazon Search API
+- Added `amazon_product()` method for Amazon Product API
+- Added `walmart_search()` method for Walmart Search API
+- Added `walmart_product()` method for Walmart Product API
+- Added `youtube_search()` method for YouTube Search API
+- Added `youtube_metadata()` method for YouTube Metadata API
+- Added `youtube_transcript()` method for YouTube Transcript API
+- Added `youtube_trainability()` method for YouTube Trainability API
+- Added `chatgpt()` method for ChatGPT API
+- Added `usage()` method for Usage API
+- Refactored internal `request()` method to be API-agnostic
+
+### Deprecated
+
+- `get()` and `post()` methods are deprecated in favor of the `html_api()` method
+
+## [2.0.2](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.1...v2.0.2) (2025-10-02)
+
+### Features
+
+- Added support for `ai_extract_rules` parameter in `SpbParams`.
+- Added Python 3.11 support.
+
+### Changes
+
+- Dropped Python 3.7 support.
+
+## [2.0.1](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.0...v2.0.1) (2023-10-17)
+
+### Bugfix
+
+- Fix typos in `README.md` (`block_ressources` -> `block_resources`, `json_scenario` -> `js_scenario`).
+
 ## [2.0.0](https://github.com/ScrapingBee/scrapingbee-python/compare/v1.2.0...v2.0.0) (2023-10-03)
 
 ### Improvement
@@ -8,4 +47,4 @@
 
 ### Breaking change
 
-- No need to url encode params anymore.
+- No need to url encode params anymore.
\ No newline at end of file
diff --git a/README.md b/README.md
index 328dc4d..30386c6 100644
--- a/README.md
+++ b/README.md
@@ -16,130 +16,264 @@ pip install scrapingbee
 ```
 
 ## Usage
 
-The ScrapingBee Python SDK is a wrapper around the [requests](https://docs.python-requests.org/en/master/) library. ScrapingBee supports GET and POST requests.
+The ScrapingBee Python SDK is a wrapper around the [requests](https://docs.python-requests.org/en/master/) library. Sign up for ScrapingBee to [get your API key](https://app.scrapingbee.com/account/register) and some free credits to get started.
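+Every API below is exposed as a method on the same `ScrapingBeeClient` instance; each method takes the argument shown in its section plus, in most cases, an optional `params` dict and an optional `retries` count.
+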
-### Making a GET request +## Table of Contents + +- [HTML API](#html-api) +- [Google Search API](#google-search-api) +- [Amazon API](#amazon-api) +- [Walmart API](#walmart-api) +- [YouTube API](#youtube-api) +- [ChatGPT API](#chatgpt-api) +- [Usage API](#usage-api) + +--- + +## HTML API + +The HTML API allows you to scrape any webpage and get the HTML content. + +### Basic Request ```python ->>> from scrapingbee import ScrapingBeeClient +from scrapingbee import ScrapingBeeClient ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +client = ScrapingBeeClient(api_key='YOUR-API-KEY') ->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +response = client.html_api( + 'https://www.scrapingbee.com', params={ - # Block ads on the page you want to scrape - 'block_ads': False, - # Block images and CSS on the page you want to scrape - 'block_resources': True, - # Premium proxy geolocation - 'country_code': '', - # Control the device the request will be sent from - 'device': 'desktop', - # Use some data extraction rules - 'extract_rules': {'title': 'h1'}, - # Use AI to extract data from the page - 'ai_extract_rules': {'product_name': 'The name of the product', 'price': 'The price in USD'}, - # Wrap response in JSON - 'json_response': False, - # Interact with the webpage you want to scrape - 'js_scenario': { - "instructions": [ - {"wait_for": "#slow_button"}, - {"click": "#slow_button"}, - {"scroll_x": 1000}, - {"wait": 1000}, - {"scroll_x": 1000}, - {"wait": 1000}, - ] - }, - # Use premium proxies to bypass difficult to scrape websites (10-25 credits/request) - 'premium_proxy': False, - # Execute JavaScript code with a Headless Browser (5 credits/request) - 'render_js': True, - # Return the original HTML before the JavaScript rendering - 'return_page_source': False, - # Return page screenshot as a png image - 'screenshot': False, - # Take a full page screenshot without the window limitation - 'screenshot_full_page': False, - # Transparently return the same HTTP code of the page requested. - 'transparent_status_code': False, - # Wait, in miliseconds, before returning the response - 'wait': 0, - # Wait for CSS selector before returning the response, ex ".title" - 'wait_for': '', - # Set the browser window width in pixel - 'window_width': 1920, - # Set the browser window height in pixel - 'window_height': 1080 - }, - headers={ - # Forward custom headers to the target website - "key": "value" - }, - cookies={ - # Forward custom cookies to the target website - "name": "value" + 'render_js': False, } ) ->>> response.text -'...' + +print(response.content) ``` -ScrapingBee takes various parameters to render JavaScript, execute a custom JavaScript script, use a premium proxy from a specific geolocation and more. +### Making a POST request -You can find all the supported parameters on [ScrapingBee's documentation](https://www.scrapingbee.com/documentation/). +```python +response = client.html_api( + 'https://httpbin.org/post', + method='POST', + data={ + 'key': 'value' + } +) +``` -You can send custom cookies and headers like you would normally do with the requests library. +--- -## Screenshot +## Google Search API -Here a little exemple on how to retrieve and store a screenshot from the ScrapingBee blog in its mobile resolution. +Scrape Google search results in real-time. 
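+The `search` term is required; optional settings such as `language`, `country_code`, and `nb_results` are passed through `params`, as in the example below.
+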
```python ->>> from scrapingbee import ScrapingBeeClient +response = client.google_search( + search='web scraping tools', + params={ + 'language': 'en', + 'country_code': 'us', + 'nb_results': 10 + } +) ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +print(response.json()) +``` + +--- + +## Amazon API + +Scrape Amazon search results and product details. ->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +### Amazon Search + +```python +response = client.amazon_search( + query='laptop', params={ - # Take a screenshot - 'screenshot': True, - # Specify that we need the full height - 'screenshot_full_page': True, - # Specify a mobile width in pixel - 'window_width': 375 + 'domain': 'com', + 'language': 'en', + 'pages': 1 } ) ->>> if response.ok: - with open("./scrapingbee_mobile.png", "wb") as f: - f.write(response.content) +print(response.json()) ``` -## Using ScrapingBee with Scrapy +### Amazon Product + +```python +response = client.amazon_product( + query='B0D2Q9397Y', # ASIN + params={ + 'domain': 'com' + } +) -Scrapy is the most popular Python web scraping framework. You can easily [integrate ScrapingBee's API with the Scrapy middleware](https://github.com/ScrapingBee/scrapy-scrapingbee). +print(response.json()) +``` +--- -## Retries +## Walmart API -The client includes a retry mechanism for 5XX responses. +Scrape Walmart search results and product details. + +### Walmart Search + +```python +response = client.walmart_search( + query='laptop', + params={ + 'sort_by': 'best_match', + 'device': 'desktop' + } +) + +print(response.json()) +``` + +### Walmart Product ```python ->>> from scrapingbee import ScrapingBeeClient +response = client.walmart_product( + product_id='123456789', + params={ + 'device': 'desktop' + } +) + +print(response.json()) +``` + +--- ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +## YouTube API ->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +Scrape YouTube search results, video metadata, and transcripts. + +### YouTube Search + +```python +response = client.youtube_search( + search='web scraping tutorial', params={ - 'render_js': True, - }, - retries=5 + 'sort_by': 'relevance', + 'type': 'video' + } ) + +print(response.json()) ``` + +### YouTube Metadata + +```python +response = client.youtube_metadata(video_id='dQw4w9WgXcQ') +print(response.json()) +``` + +### YouTube Transcript + +```python +response = client.youtube_transcript( + video_id='dQw4w9WgXcQ', + params={'language': 'en'} +) +print(response.json()) +``` + +### YouTube Trainability + +```python +response = client.youtube_trainability(video_id='dQw4w9WgXcQ') +print(response.json()) +``` + +--- + +## ChatGPT API + +Use ChatGPT with optional web search. + +```python +response = client.chatgpt( + prompt='What is web scraping?', + params={ + 'search': True, + 'country_code': 'us' + } +) + +print(response.json()) +``` + +--- + +## Usage API + +Check your API credit usage. + +```python +response = client.usage() +print(response.json()) +# { +# "max_api_credit": 8000000, +# "used_api_credit": 1000023, +# "max_concurrency": 200, +# "current_concurrency": 1 +# } +``` + +--- + +## Legacy Methods (Deprecated) + +The `get()` and `post()` methods are deprecated and will be removed in a future version. Please use `html_api()` instead. 
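+
+Calling a deprecated method also emits a `DeprecationWarning` at runtime. Here is a minimal sketch of surfacing these warnings while you migrate; it uses only the standard `warnings` module, nothing SDK-specific:
+
+```python
+import warnings
+
+# Make sure DeprecationWarnings raised by the SDK are shown, not filtered out
+warnings.simplefilter("default", DeprecationWarning)
+
+client.get('https://example.com')  # warns: get() is deprecated...
+```
+
+The call-site migration itself is a one-line change: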
+
+```python
+# Deprecated
+client.get(url, params={...})
+
+# Use this instead
+client.html_api(url, method='GET', params={...})
+```
+
+## Screenshot
+
+Here is a small example of how to retrieve and store a full-page screenshot of the ScrapingBee homepage, rendered at a mobile width (375 px).
+
+```python
+from scrapingbee import ScrapingBeeClient
+
+client = ScrapingBeeClient(api_key='YOUR-API-KEY')
+
+response = client.html_api(
+    'https://www.scrapingbee.com/',
+    params={
+        'screenshot': True,
+        'screenshot_full_page': True,
+        'window_width': 375,
+    }
+)
+
+with open('screenshot.png', 'wb') as f:
+    f.write(response.content)
+```
+
+## Retries
+
+The client includes a retry mechanism for 5XX responses.
+
+```python
+client.html_api(url, params={...}, retries=5)
+```
+
+## Using ScrapingBee with Scrapy
+
+Scrapy is the most popular Python web scraping framework. You can easily [integrate ScrapingBee's API with the Scrapy middleware](https://github.com/ScrapingBee/scrapy-scrapingbee).
\ No newline at end of file
diff --git a/manual-test.py b/manual-test.py
new file mode 100644
index 0000000..b29fed8
--- /dev/null
+++ b/manual-test.py
@@ -0,0 +1,605 @@
+import os
+from scrapingbee import ScrapingBeeClient
+
+API_KEY = os.environ.get("SCRAPINGBEE_API_KEY")
+client = ScrapingBeeClient(API_KEY)
+
+
+# ============================================
+# Helper Functions
+# ============================================
+
+def assert_test(condition, message):
+    if not condition:
+        raise AssertionError(message)
+
+
+# ============================================
+# HTML API Tests (Legacy)
+# ============================================
+
+def test_html_get():
+    print("=== Testing HTML API - GET ===")
+    try:
+        response = client.get(
+            url="https://httpbin.org/get",
+            params={"render_js": False}
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+        assert_test("httpbin" in response.text, "Response does not contain expected content")
+
+        print(f"Status: {response.status_code}")
+        print("✅ HTML GET test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML GET test failed: {e}\n")
+        raise
+
+
+def test_html_post():
+    print("=== Testing HTML API - POST ===")
+    try:
+        response = client.post(
+            url="https://httpbin.org/post",
+            params={"render_js": False},
+            data={"test": "data"}
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+        assert_test("test" in response.text, "Response does not contain posted data")
+
+        print(f"Status: {response.status_code}")
+        print("✅ HTML POST test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML POST test failed: {e}\n")
+        raise
+
+
+# ============================================
+# HTML API Tests (New)
+# ============================================
+
+def test_html_api_get():
+    print("=== Testing HTML API (New) - GET ===")
+    try:
+        response = client.html_api(
+            url="https://httpbin.org/get",
+            method="GET",
+            params={"render_js": False}
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+        assert_test("httpbin" in response.text, "Response does not contain expected content")
+
+        print(f"Status: {response.status_code}")
+        print("✅ HTML API GET test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API GET test failed: {e}\n")
+        raise
+
+
+def test_html_api_post():
+    print("=== Testing HTML API (New) - POST ===")
+    try:
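+        # httpbin.org/post echoes the request body back, so the assertions
+        # below can verify that the HTML API forwarded `data` to the target site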
+        response = client.html_api(
+            url="https://httpbin.org/post",
+            method="POST",
+            params={"render_js": False},
+            data={"test": "data"}
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+        assert_test("test" in response.text, "Response does not contain posted data")
+
+        print(f"Status: {response.status_code}")
+        print("✅ HTML API POST test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API POST test failed: {e}\n")
+        raise
+
+
+def test_html_api_extract_rules():
+    print("=== Testing HTML API - Extract Rules ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com/blog/",
+            params={
+                "render_js": False,
+                "extract_rules": {
+                    "title": "h1",
+                    "posts": {
+                        "selector": ".container > div > div > div",
+                        "type": "list",
+                        "output": {
+                            "title": "h4",
+                            "link": "a@href"
+                        }
+                    }
+                }
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+
+        data = response.json()
+        assert_test(data.get("title"), "Extracted title is missing")
+        assert_test(isinstance(data.get("posts"), list), "Extracted posts is not a list")
+        assert_test(len(data.get("posts", [])) > 0, "No posts extracted")
+
+        print(f"Status: {response.status_code}")
+        print(f"Extracted title: {data.get('title')}")
+        print(f"Extracted posts count: {len(data.get('posts', []))}")
+        print("✅ HTML API Extract Rules test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API Extract Rules test failed: {e}\n")
+        raise
+
+
+def test_html_api_js_scenario():
+    print("=== Testing HTML API - JS Scenario ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com",
+            params={
+                "render_js": True,
+                "js_scenario": {
+                    "instructions": [
+                        {"wait": 1000},
+                        {"scroll_y": 500},
+                        {"wait": 500}
+                    ]
+                }
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+
+        print(f"Status: {response.status_code}")
+        print(f"Content: {response.text[:300]}")  # preview the first 300 characters of the rendered HTML
+        print("✅ HTML API JS Scenario test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API JS Scenario test failed: {e}\n")
+        raise
+
+
+def test_html_api_screenshot():
+    print("=== Testing HTML API - Screenshot ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com",
+            params={
+                "render_js": True,
+                "screenshot": True,
+                "window_width": 1920,
+                "window_height": 1080
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.content, "Response is empty")
+        assert_test(len(response.content) > 10000, "Screenshot seems too small")
+
+        # Check PNG signature
+        png_signature = b'\x89PNG\r\n\x1a\n'
+        assert_test(response.content[:8] == png_signature, "Response is not a valid PNG")
+
+        print(f"Status: {response.status_code}")
+        print(f"Screenshot size: {len(response.content)} bytes")
+        print("✅ HTML API Screenshot test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API Screenshot test failed: {e}\n")
+        raise
+
+
+def test_html_api_json_response():
+    print("=== Testing HTML API - JSON Response ===")
+    try:
+        response = client.html_api(
+            url="https://httpbin.org/get",
+            params={
+                "render_js": False,
+                "json_response": True
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+
+        data = response.json()
+        assert_test(data.get("body") is not None, "JSON 
response missing body field") + assert_test(data.get("xhr") is not None, "JSON response missing xhr field") + + # Handle body as string or object + body = data.get("body") + body_preview = body[:300] if isinstance(body, str) else str(body)[:300] + + print(f"Status: {response.status_code}") + print(f"Content: {body_preview}") + print("✅ HTML API JSON Response test passed!\n") + except Exception as e: + print(f"❌ HTML API JSON Response test failed: {e}\n") + raise + + +def test_html_api_with_headers(): + print("=== Testing HTML API - Custom Headers ===") + try: + response = client.html_api( + url="https://httpbin.org/headers", + params={"render_js": False}, + headers={"X-Custom-Header": "CustomValue123"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("CustomValue123" in response.text, "Custom header not forwarded") + + print(f"Status: {response.status_code}") + print("✅ HTML API Custom Headers test passed!\n") + except Exception as e: + print(f"❌ HTML API Custom Headers test failed: {e}\n") + raise + + +def test_html_api_with_cookies(): + print("=== Testing HTML API - Custom Cookies ===") + try: + response = client.html_api( + url="https://httpbin.org/cookies", + params={"render_js": False}, + cookies={"session_id": "abc123", "user_token": "xyz789"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("abc123" in response.text or "xyz789" in response.text, "Cookies not forwarded") + + print(f"Status: {response.status_code}") + print("✅ HTML API Custom Cookies test passed!\n") + except Exception as e: + print(f"❌ HTML API Custom Cookies test failed: {e}\n") + raise + + +def test_html_api_post_with_headers_and_cookies(): + print("=== Testing HTML API - POST with Headers + Cookies ===") + try: + response = client.html_api( + url="https://httpbin.org/post", + method="POST", + params={"render_js": False}, + headers={"X-Test-Header": "TestValue"}, + cookies={"session": "mysession123"}, + data={"action": "submit"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("submit" in response.text, "Posted data not in response") + + print(f"Status: {response.status_code}") + print("✅ HTML API POST with Headers + Cookies test passed!\n") + except Exception as e: + print(f"❌ HTML API POST with Headers + Cookies test failed: {e}\n") + raise + + +# ============================================ +# Google Search API +# ============================================ + +def test_google_search(): + print("=== Testing Google Search API ===") + try: + response = client.google_search( + search="scrapingbee", + params={"language": "en", "country_code": "us"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("organic_results"), "Missing organic_results in response") + assert_test(isinstance(data.get("organic_results"), list), "organic_results is not a list") + assert_test(len(data.get("organic_results", [])) > 0, "No organic results found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('organic_results', []))}") + print("✅ Google Search test passed!\n") + except Exception as e: + print(f"❌ Google Search test failed: {e}\n") + raise + + +# ============================================ +# Amazon API +# ============================================ + +def test_amazon_search(): + print("=== Testing 
Amazon Search API ===") + try: + response = client.amazon_search( + query="laptop", + params={"domain": "com", "pages": 1} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("products"), "Missing products in response") + assert_test(isinstance(data.get("products"), list), "products is not a list") + assert_test(len(data.get("products", [])) > 0, "No products found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('products', []))}") + print("✅ Amazon Search test passed!\n") + except Exception as e: + print(f"❌ Amazon Search test failed: {e}\n") + raise + + +def test_amazon_product(): + print("=== Testing Amazon Product API ===") + try: + response = client.amazon_product( + query="B0D2Q9397Y", + params={"domain": "com"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title"), "Missing product title in response") + + print(f"Status: {response.status_code}") + print(f"Product title: {data.get('title', '')[:50]}") + print("✅ Amazon Product test passed!\n") + except Exception as e: + print(f"❌ Amazon Product test failed: {e}\n") + raise + + +# ============================================ +# Walmart API +# ============================================ + +def test_walmart_search(): + print("=== Testing Walmart Search API ===") + try: + response = client.walmart_search( + query="laptop", + params={"device": "desktop", "sort_by": "best_match"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("products"), "Missing products in response") + assert_test(isinstance(data.get("products"), list), "products is not a list") + assert_test(len(data.get("products", [])) > 0, "No products found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('products', []))}") + print("✅ Walmart Search test passed!\n") + except Exception as e: + print(f"❌ Walmart Search test failed: {e}\n") + raise + + +def test_walmart_product(): + print("=== Testing Walmart Product API ===") + try: + response = client.walmart_product( + product_id="454408250", + params={"device": "desktop"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title"), "Missing product title in response") + + print(f"Status: {response.status_code}") + print(f"Product title: {data.get('title', '')[:50]}") + print("✅ Walmart Product test passed!\n") + except Exception as e: + print(f"❌ Walmart Product test failed: {e}\n") + raise + + +# ============================================ +# YouTube API +# ============================================ + +def test_youtube_search(): + print("=== Testing YouTube Search API ===") + try: + response = client.youtube_search( + search="web scraping tutorial", + params={"sort_by": "relevance", "type": "video"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("results"), "Missing results in response") + assert_test(isinstance(data.get("results"), list), "results is not a list") + assert_test(len(data.get("results", [])) > 0, "No results found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('results', []))}") + print("✅ 
YouTube Search test passed!\n") + except Exception as e: + print(f"❌ YouTube Search test failed: {e}\n") + raise + + +def test_youtube_metadata(): + print("=== Testing YouTube Metadata API ===") + try: + response = client.youtube_metadata(video_id="dQw4w9WgXcQ") + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title") or data.get("like_count") is not None, "Missing expected metadata fields") + + print(f"Status: {response.status_code}") + print(f"Like count: {data.get('like_count')}") + print("✅ YouTube Metadata test passed!\n") + except Exception as e: + print(f"❌ YouTube Metadata test failed: {e}\n") + raise + + +def test_youtube_transcript(): + print("=== Testing YouTube Transcript API ===") + try: + response = client.youtube_transcript( + video_id="sfyL4BswUeE", + params={"language": "en"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("text") or data.get("transcript"), "Missing transcript in response") + + transcript_preview = (data.get("text") or str(data.get("transcript", "")))[:100] + print(f"Status: {response.status_code}") + print(f"Transcript preview: {transcript_preview}") + print("✅ YouTube Transcript test passed!\n") + except Exception as e: + print(f"❌ YouTube Transcript test failed: {e}\n") + raise + + +def test_youtube_trainability(): + print("=== Testing YouTube Trainability API ===") + try: + response = client.youtube_trainability(video_id="dQw4w9WgXcQ") + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("permitted") is not None, "Missing permitted field in response") + + print(f"Status: {response.status_code}") + print(f"Permitted: {data.get('permitted')}") + print("✅ YouTube Trainability test passed!\n") + except Exception as e: + print(f"❌ YouTube Trainability test failed: {e}\n") + raise + + +# ============================================ +# ChatGPT API +# ============================================ + +def test_chatgpt(): + print("=== Testing ChatGPT API ===") + try: + response = client.chatgpt( + prompt="What is web scraping? 
Answer in one sentence.", + params={"search": True} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("results_text") or data.get("results_markdown"), "Missing response text") + + response_text = (data.get("results_text") or data.get("results_markdown", ""))[:100] + print(f"Status: {response.status_code}") + print(f"Response: {response_text}") + print("✅ ChatGPT test passed!\n") + except Exception as e: + print(f"❌ ChatGPT test failed: {e}\n") + raise + + +# ============================================ +# Usage API +# ============================================ + +def test_usage(): + print("=== Testing Usage API ===") + try: + response = client.usage() + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("max_api_credit") is not None, "Missing max_api_credit") + assert_test(data.get("used_api_credit") is not None, "Missing used_api_credit") + assert_test(data.get("max_concurrency") is not None, "Missing max_concurrency") + + print(f"Status: {response.status_code}") + print(f"Max API credits: {data.get('max_api_credit')}") + print(f"Used API credits: {data.get('used_api_credit')}") + print(f"Max concurrency: {data.get('max_concurrency')}") + print("✅ Usage test passed!\n") + except Exception as e: + print(f"❌ Usage test failed: {e}\n") + raise + + +# ============================================ +# Run All Tests +# ============================================ + +def run_tests(): + print("\n🚀 Starting ScrapingBee Python SDK Tests\n") + + tests = [ + # Legacy HTML API + test_html_get, + test_html_post, + + # New HTML API + test_html_api_get, + test_html_api_post, + test_html_api_extract_rules, + test_html_api_js_scenario, + test_html_api_screenshot, + test_html_api_json_response, + test_html_api_with_headers, + test_html_api_with_cookies, + test_html_api_post_with_headers_and_cookies, + + # Other APIs + test_google_search, + test_amazon_search, + test_amazon_product, + test_walmart_search, + test_walmart_product, + test_youtube_search, + test_youtube_metadata, + test_youtube_transcript, + test_youtube_trainability, + test_chatgpt, + test_usage, + ] + + passed = 0 + failed = 0 + + for test in tests: + try: + test() + passed += 1 + except Exception: + failed += 1 + + print("🏁 All tests completed!") + print(f"✅ Passed: {passed}") + print(f"❌ Failed: {failed}") + print(f"📊 Total: {len(tests)}\n") + + if failed > 0: + exit(1) + + +if __name__ == "__main__": + run_tests() diff --git a/scrapingbee/__version__.py b/scrapingbee/__version__.py index 0309ae2..9aa3f90 100644 --- a/scrapingbee/__version__.py +++ b/scrapingbee/__version__.py @@ -1 +1 @@ -__version__ = "2.0.2" +__version__ = "2.1.0" diff --git a/scrapingbee/client.py b/scrapingbee/client.py index 67f72c3..ee4aff1 100644 --- a/scrapingbee/client.py +++ b/scrapingbee/client.py @@ -1,56 +1,63 @@ -from typing import Optional - from requests import Response, Session from requests.adapters import HTTPAdapter from urllib3.util import Retry -from .utils import get_scrapingbee_url, process_headers +from .utils import process_headers, process_params class ScrapingBeeClient: - api_url = "https://app.scrapingbee.com/api/v1/" + # API Endpoints + HTML_API_URL = "https://app.scrapingbee.com/api/v1/" + GOOGLE_API_URL = "https://app.scrapingbee.com/api/v1/store/google" + AMAZON_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/amazon/search" + 
AMAZON_PRODUCT_API_URL = "https://app.scrapingbee.com/api/v1/amazon/product" + WALMART_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/walmart/search" + WALMART_PRODUCT_API_URL = "https://app.scrapingbee.com/api/v1/walmart/product" + YOUTUBE_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/youtube/search" + YOUTUBE_METADATA_API_URL = "https://app.scrapingbee.com/api/v1/youtube/metadata" + YOUTUBE_TRANSCRIPT_API_URL = "https://app.scrapingbee.com/api/v1/youtube/transcript" + YOUTUBE_TRAINABILITY_API_URL = "https://app.scrapingbee.com/api/v1/youtube/trainability" + CHATGPT_API_URL = "https://app.scrapingbee.com/api/v1/chatgpt" + USAGE_API_URL = "https://app.scrapingbee.com/api/v1/usage" def __init__(self, api_key: str): self.api_key = api_key + # ============================================ + # Core Request Method + # ============================================ + def request( self, method: str, url: str, - params: Optional[dict] = None, - data: Optional[dict] = None, - json: Optional[dict] = None, - headers: Optional[dict] = None, - cookies: Optional[dict] = None, - retries: Optional[int] = None, + params: dict, + headers: dict = None, + data: dict = None, + json: dict = None, + retries: int = None, **kwargs ) -> Response: - if not params: - params = {} - - # Process headers and set forward_headers - if headers: - params["forward_headers"] = True - headers = process_headers(headers) - - # Add cookies to params - if cookies: - # ScrapingBee reads cookies from url parameters - params["cookies"] = cookies - - # Get ScrapingBee API URL - spb_url = get_scrapingbee_url(self.api_url, self.api_key, url, params) + """Core request method - adds api_key and makes the HTTP call.""" + params["api_key"] = self.api_key session = Session() if retries: - # Retries if it is a network error or a 5xx error on an idempotent request (GET) - retries = Retry(total=retries, raise_on_status=False, status_forcelist=frozenset(range(500, 600))) - session.mount('https://', HTTPAdapter(max_retries=retries)) - session.mount('http://', HTTPAdapter(max_retries=retries)) + retry_strategy = Retry( + total=retries, + raise_on_status=False, + status_forcelist=frozenset(range(500, 600)) + ) + session.mount("https://", HTTPAdapter(max_retries=retry_strategy)) + session.mount("http://", HTTPAdapter(max_retries=retry_strategy)) - if not data and json is not None: - return session.request(method, spb_url, json=json, headers=headers, **kwargs) - return session.request(method, spb_url, data=data, headers=headers, **kwargs) + if json is not None: + return session.request(method, url, params=params, json=json, headers=headers, **kwargs) + return session.request(method, url, params=params, data=data, headers=headers, **kwargs) + + # ============================================ + # HTML API (Legacy - WILL BE REMOVED) + # ============================================ def get( self, @@ -58,10 +65,29 @@ def get( params: dict = None, headers: dict = None, cookies: dict = None, - retries: Optional[int] = None, + retries: int = None, **kwargs ) -> Response: - return self.request("GET", url, params=params, headers=headers, cookies=cookies, retries=retries, **kwargs) + """HTML API - GET request. 
DEPRECATED: Use html_api() instead.""" + if params is None: + params = {} + + params["url"] = url + if cookies: + params["cookies"] = cookies + + processed_headers = process_headers(headers) + if headers: + params["forward_headers"] = True + + return self.request( + method="GET", + url=self.HTML_API_URL, + params=process_params(params), + headers=processed_headers, + retries=retries, + **kwargs + ) def post( self, @@ -71,8 +97,305 @@ def post( json: dict = None, headers: dict = None, cookies: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """HTML API - POST request. DEPRECATED: Use html_api() instead.""" + if params is None: + params = {} + + params["url"] = url + if cookies: + params["cookies"] = cookies + + processed_headers = process_headers(headers) + if headers: + params["forward_headers"] = True + + return self.request( + method="POST", + url=self.HTML_API_URL, + params=process_params(params), + headers=processed_headers, + data=data, + json=json, + retries=retries, + **kwargs + ) + + # ============================================ + # HTML API (New) + # ============================================ + + def html_api( + self, + url: str, + method: str = "GET", + params: dict = None, + data: dict = None, + json: dict = None, + headers: dict = None, + cookies: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """HTML API - Scrape any webpage.""" + if params is None: + params = {} + + params["url"] = url + if cookies: + params["cookies"] = cookies + + processed_headers = process_headers(headers) + if headers: + params["forward_headers"] = True + + return self.request( + method=method, + url=self.HTML_API_URL, + params=process_params(params), + headers=processed_headers, + data=data, + json=json, + retries=retries, + **kwargs + ) + + # ============================================ + # Google Search API + # ============================================ + + def google_search( + self, + search: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Google Search API - Scrape Google search results.""" + if params is None: + params = {} + params["search"] = search + + return self.request( + method="GET", + url=self.GOOGLE_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # Amazon API + # ============================================ + + def amazon_search( + self, + query: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Amazon Search API - Scrape Amazon search results.""" + if params is None: + params = {} + params["query"] = query + + return self.request( + method="GET", + url=self.AMAZON_SEARCH_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def amazon_product( + self, + query: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Amazon Product API - Scrape Amazon product details.""" + if params is None: + params = {} + params["query"] = query + + return self.request( + method="GET", + url=self.AMAZON_PRODUCT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # Walmart API + # ============================================ + + def walmart_search( + self, + query: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Walmart Search API - Scrape Walmart search results.""" + if params is None: + params = {} + params["query"] = query + + return self.request( + method="GET", + 
url=self.WALMART_SEARCH_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def walmart_product( + self, + product_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Walmart Product API - Scrape Walmart product details.""" + if params is None: + params = {} + params["product_id"] = product_id + + return self.request( + method="GET", + url=self.WALMART_PRODUCT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # YouTube API + # ============================================ + + def youtube_search( + self, + search: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Search API - Scrape YouTube search results.""" + if params is None: + params = {} + params["search"] = search + + return self.request( + method="GET", + url=self.YOUTUBE_SEARCH_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_metadata( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Metadata API - Get YouTube video metadata.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_METADATA_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_transcript( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Transcript API - Get YouTube video transcript.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_TRANSCRIPT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_trainability( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Trainability API - Check video trainability.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_TRAINABILITY_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # ChatGPT API + # ============================================ + + def chatgpt( + self, + prompt: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """ChatGPT API - Use ChatGPT with optional web search.""" + if params is None: + params = {} + params["prompt"] = prompt + + return self.request( + method="GET", + url=self.CHATGPT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # Usage API + # ============================================ + + def usage( + self, + retries: int = None, **kwargs ) -> Response: + """Usage API - Check API credit usage and account limits.""" return self.request( - "POST", url, params=params, data=data, json=json, headers=headers, cookies=cookies, **kwargs + method="GET", + url=self.USAGE_API_URL, + params={}, + retries=retries, + **kwargs ) diff --git a/scrapingbee/utils.py b/scrapingbee/utils.py index 7501533..ffb3d7c 100644 --- a/scrapingbee/utils.py +++ b/scrapingbee/utils.py @@ -1,6 +1,5 @@ import base64 import json -import urllib from typing import Optional from .__version__ import __version__ @@ -53,20 +52,3 @@ def process_params(params: dict) -> dict: else: new_params[k] = v return new_params - - -def get_scrapingbee_url(api_url: str, api_key: str, url: str, params: dict) -> str: - all_params = { - 'api_key': api_key, - 'url': url - } - if 
params: - all_params.update(params) - - # Process params - spb_params = process_params(all_params) - - # Format url query string - qs = urllib.parse.urlencode(spb_params) - - return f'{api_url}?{qs}' diff --git a/tests/test_client.py b/tests/test_client.py index 2a3d0f8..a10d1a8 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -11,6 +11,10 @@ def client(): return ScrapingBeeClient(api_key='API_KEY') +# ============================================ +# Legacy HTML API Tests (get) +# ============================================ + @mock.patch('scrapingbee.client.Session') def test_get(mock_session, client): '''It should make a GET request with the url and API key''' @@ -18,8 +22,8 @@ def test_get(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, data=None, headers=DEFAULT_HEADERS ) @@ -27,13 +31,13 @@ def test_get(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_params(mock_session, client): - '''It should add parameters to the url''' + '''It should add parameters to the request''' client.get('https://httpbin.org', params={'render_js': True}) mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&render_js=True', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'render_js': True}, data=None, headers=DEFAULT_HEADERS, ) @@ -46,17 +50,16 @@ def test_get_with_headers(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&forward_headers=True', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'forward_headers': True}, data=None, - headers={'Spb-Content-Type': 'text/html; charset=utf-8', - **DEFAULT_HEADERS}, + headers={'Spb-Content-Type': 'text/html; charset=utf-8', **DEFAULT_HEADERS}, ) @mock.patch('scrapingbee.client.Session') def test_get_with_cookies(mock_session, client): - '''It should format the cookies and add them to the url''' + '''It should format the cookies and add them to the params''' client.get('https://httpbin.org', cookies={ 'name_1': 'value_1', 'name_2': 'value_2', @@ -64,8 +67,8 @@ def test_get_with_cookies(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&cookies=name_1%3Dvalue_1%3Bname_2%3Dvalue_2', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'cookies': 'name_1=value_1;name_2=value_2'}, data=None, headers=DEFAULT_HEADERS, ) @@ -73,7 +76,7 @@ def test_get_with_cookies(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_extract_rules(mock_session, client): - '''It should format the extract_rules and add them to the url''' + '''It should format the extract_rules and add them to the params''' client.get('https://httpbin.org', params={ 'extract_rules': { "title": "h1", @@ -83,10 +86,12 @@ def test_get_with_extract_rules(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - 
'?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'extract_rules=%7B%22title%22%3A+%22h1%22%2C+%22' - 'subtitle%22%3A+%22%23subtitle%22%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'extract_rules': '{"title": "h1", "subtitle": "#subtitle"}' + }, data=None, headers=DEFAULT_HEADERS, ) @@ -94,7 +99,7 @@ def test_get_with_extract_rules(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_js_scenario(mock_session, client): - '''It should format the extract_rules and add them to the url''' + '''It should format the js_scenario and add them to the params''' client.get('https://httpbin.org', params={ 'js_scenario': { 'instructions': [ @@ -105,9 +110,12 @@ def test_get_with_js_scenario(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'js_scenario=%7B%22instructions%22%3A+%5B%7B%22click%22%3A+%22%23buttonId%22%7D%5D%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'js_scenario': '{"instructions": [{"click": "#buttonId"}]}' + }, data=None, headers=DEFAULT_HEADERS, ) @@ -115,7 +123,7 @@ def test_get_with_js_scenario(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_ai_extract_rules(mock_session, client): - '''It should format the ai_extract_rules and add them to the url''' + '''It should format the ai_extract_rules and add them to the params''' client.get('https://httpbin.org', params={ 'ai_extract_rules': { "product_name": "The name of the product", @@ -125,15 +133,21 @@ def test_get_with_ai_extract_rules(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'ai_extract_rules=%7B%22product_name%22%3A+%22The+name+of+the+product%22%2C+%22' - 'price%22%3A+%22The+price+in+USD%22%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'ai_extract_rules': '{"product_name": "The name of the product", "price": "The price in USD"}' + }, data=None, headers=DEFAULT_HEADERS, ) +# ============================================ +# Legacy HTML API Tests (post) +# ============================================ + @mock.patch('scrapingbee.client.Session') def test_post(mock_session, client): '''It should make a POST request with some data''' @@ -141,7 +155,483 @@ def test_post(mock_session, client): mock_session.return_value.request.assert_called_with( 'POST', - 'https://app.scrapingbee.com/api/v1/?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data={'KEY_1': 'VALUE_1'}, + headers=DEFAULT_HEADERS + ) + + +# ============================================ +# New HTML API Tests (html_api) +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_html_api_get(mock_session, client): + '''It should make a GET request with html_api''' + client.html_api('https://httpbin.org') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_post(mock_session, client): + 
'''It should make a POST request with html_api''' + client.html_api('https://httpbin.org', method='POST') + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_params(mock_session, client): + '''It should add parameters to html_api request''' + client.html_api('https://httpbin.org', params={'render_js': True, 'premium_proxy': True}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'render_js': True, 'premium_proxy': True}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_headers(mock_session, client): + '''It should prefix header names with Spb- and set forward_headers''' + client.html_api('https://httpbin.org', headers={'Content-Type': 'text/html; charset=utf-8'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'forward_headers': True}, + data=None, + headers={'Spb-Content-Type': 'text/html; charset=utf-8', **DEFAULT_HEADERS}, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_cookies(mock_session, client): + '''It should format the cookies and add them to the params''' + client.html_api('https://httpbin.org', cookies={ + 'name_1': 'value_1', + 'name_2': 'value_2', + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'cookies': 'name_1=value_1;name_2=value_2'}, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_headers_and_cookies(mock_session, client): + '''It should handle headers and cookies in html_api''' + client.html_api( + 'https://httpbin.org', + method='POST', + headers={'X-Custom': 'value'}, + cookies={'session': 'abc123'} + ) + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'cookies': 'session=abc123', + 'forward_headers': True + }, + data=None, + headers={'Spb-X-Custom': 'value', **DEFAULT_HEADERS} + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_extract_rules(mock_session, client): + '''It should format the extract_rules and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'extract_rules': { + "title": "h1", + "subtitle": "#subtitle" + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'extract_rules': '{"title": "h1", "subtitle": "#subtitle"}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_js_scenario(mock_session, client): + '''It should format the js_scenario and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'js_scenario': { + 'instructions': [ + {"click": "#buttonId"} + ] + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 
'url': 'https://httpbin.org', + 'js_scenario': '{"instructions": [{"click": "#buttonId"}]}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_ai_extract_rules(mock_session, client): + '''It should format the ai_extract_rules and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'ai_extract_rules': { + "product_name": "The name of the product", + "price": "The price in USD" + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'ai_extract_rules': '{"product_name": "The name of the product", "price": "The price in USD"}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_post_with_data(mock_session, client): + '''It should make a POST request with some data''' + client.html_api('https://httpbin.org', method='POST', data={'KEY_1': 'VALUE_1'}) + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, data={'KEY_1': 'VALUE_1'}, headers=DEFAULT_HEADERS ) + + +# ============================================ +# Google Search API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_google_search(mock_session, client): + '''It should make a Google Search request''' + client.google_search('test query') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/store/google', + params={'api_key': 'API_KEY', 'search': 'test query'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_google_search_with_params(mock_session, client): + '''It should add parameters to Google Search request''' + client.google_search('test query', params={'language': 'en', 'country_code': 'us'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/store/google', + params={'api_key': 'API_KEY', 'search': 'test query', 'language': 'en', 'country_code': 'us'}, + data=None, + headers=None + ) + + +# ============================================ +# Amazon API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_amazon_search(mock_session, client): + '''It should make an Amazon Search request''' + client.amazon_search('laptop') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/search', + params={'api_key': 'API_KEY', 'query': 'laptop'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_amazon_search_with_params(mock_session, client): + '''It should add parameters to Amazon Search request''' + client.amazon_search('laptop', params={'domain': 'com', 'pages': 2}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/search', + params={'api_key': 'API_KEY', 'query': 'laptop', 'domain': 'com', 'pages': 2}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_amazon_product(mock_session, client): + '''It should make an Amazon Product request''' + client.amazon_product('B0D2Q9397Y') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/product', + 
+        params={'api_key': 'API_KEY', 'query': 'B0D2Q9397Y'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_amazon_product_with_params(mock_session, client):
+    '''It should add parameters to Amazon Product request'''
+    client.amazon_product('B0D2Q9397Y', params={'domain': 'com'})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/amazon/product',
+        params={'api_key': 'API_KEY', 'query': 'B0D2Q9397Y', 'domain': 'com'},
+        data=None,
+        headers=None
+    )
+
+
+# ============================================
+# Walmart API Tests
+# ============================================
+
+@mock.patch('scrapingbee.client.Session')
+def test_walmart_search(mock_session, client):
+    '''It should make a Walmart Search request'''
+    client.walmart_search('laptop')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/walmart/search',
+        params={'api_key': 'API_KEY', 'query': 'laptop'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_walmart_search_with_params(mock_session, client):
+    '''It should add parameters to Walmart Search request'''
+    client.walmart_search('laptop', params={'sort_by': 'best_match'})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/walmart/search',
+        params={'api_key': 'API_KEY', 'query': 'laptop', 'sort_by': 'best_match'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_walmart_product(mock_session, client):
+    '''It should make a Walmart Product request'''
+    client.walmart_product('123456789')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/walmart/product',
+        params={'api_key': 'API_KEY', 'product_id': '123456789'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_walmart_product_with_params(mock_session, client):
+    '''It should add parameters to Walmart Product request'''
+    client.walmart_product('123456789', params={'device': 'desktop'})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/walmart/product',
+        params={'api_key': 'API_KEY', 'product_id': '123456789', 'device': 'desktop'},
+        data=None,
+        headers=None
+    )
+
+
+# ============================================
+# YouTube API Tests
+# ============================================
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_search(mock_session, client):
+    '''It should make a YouTube Search request'''
+    client.youtube_search('web scraping')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/search',
+        params={'api_key': 'API_KEY', 'search': 'web scraping'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_search_with_params(mock_session, client):
+    '''It should add parameters to YouTube Search request'''
+    client.youtube_search('web scraping', params={'sort_by': 'relevance', 'type': 'video'})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/search',
+        params={'api_key': 'API_KEY', 'search': 'web scraping', 'sort_by': 'relevance', 'type': 'video'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_metadata(mock_session, client):
+    '''It should make a YouTube Metadata request'''
+    client.youtube_metadata('dQw4w9WgXcQ')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/metadata',
+        params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_transcript(mock_session, client):
+    '''It should make a YouTube Transcript request'''
+    client.youtube_transcript('dQw4w9WgXcQ')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/transcript',
+        params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_transcript_with_params(mock_session, client):
+    '''It should add parameters to YouTube Transcript request'''
+    client.youtube_transcript('dQw4w9WgXcQ', params={'language': 'en'})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/transcript',
+        params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ', 'language': 'en'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_youtube_trainability(mock_session, client):
+    '''It should make a YouTube Trainability request'''
+    client.youtube_trainability('dQw4w9WgXcQ')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/youtube/trainability',
+        params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'},
+        data=None,
+        headers=None
+    )
+
+
+# ============================================
+# ChatGPT API Tests
+# ============================================
+
+@mock.patch('scrapingbee.client.Session')
+def test_chatgpt(mock_session, client):
+    '''It should make a ChatGPT request'''
+    client.chatgpt('What is web scraping?')
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/chatgpt',
+        params={'api_key': 'API_KEY', 'prompt': 'What is web scraping?'},
+        data=None,
+        headers=None
+    )
+
+
+@mock.patch('scrapingbee.client.Session')
+def test_chatgpt_with_params(mock_session, client):
+    '''It should add parameters to ChatGPT request'''
+    client.chatgpt('What is web scraping?', params={'search': True})
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/chatgpt',
+        params={'api_key': 'API_KEY', 'prompt': 'What is web scraping?', 'search': True},
+        data=None,
+        headers=None
+    )
+
+
+# ============================================
+# Usage API Tests
+# ============================================
+
+@mock.patch('scrapingbee.client.Session')
+def test_usage(mock_session, client):
+    '''It should make a Usage request'''
+    client.usage()
+
+    mock_session.return_value.request.assert_called_with(
+        'GET',
+        'https://app.scrapingbee.com/api/v1/usage',
+        params={'api_key': 'API_KEY'},
+        data=None,
+        headers=None
+    )
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 583e497..aa6b13b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -3,8 +3,7 @@
     process_json_stringify_param,
     process_headers,
     process_cookies,
-    process_params,
-    get_scrapingbee_url,
+    process_params
 )
 
 
@@ -18,7 +17,7 @@ def test_process_headers():
     """It should add a Spb- prefix to header names"""
     output = process_headers({"Accept-Language": "En-US"})
     assert output == {
-        "User-Agent": "ScrapingBee-Python/2.0.2",
+        "User-Agent": "ScrapingBee-Python/2.1.0",
         "Spb-Accept-Language": "En-US",
     }
 
@@ -57,14 +56,3 @@ def test_process_params():
     """It should keep boolean parameters"""
     output = process_params({"render_js": True})
     assert output == {"render_js": True}
-
-
-def test_get_scrapingbee_url():
-    """It should generate a url"""
-    output = get_scrapingbee_url(
-        "https://app.scrapingbee.com/api/v1/", "API_KEY", "https://httpbin.org", {"render_js": True, "wait_for": "#foo"}
-    )
-    assert (
-        output == "https://app.scrapingbee.com/api/v1/"
-        "?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&render_js=True&wait_for=%23foo"
-    )

From 861fb20c2b408e8faaa8dc808ba87f7baf22de21 Mon Sep 17 00:00:00 2001
From: Sahil Sunny
Date: Thu, 22 Jan 2026 20:52:01 +0530
Subject: [PATCH 2/3] Added Deprecation Warning

---
 scrapingbee/client.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/scrapingbee/client.py b/scrapingbee/client.py
index ee4aff1..ebf57b1 100644
--- a/scrapingbee/client.py
+++ b/scrapingbee/client.py
@@ -1,9 +1,26 @@
+import warnings
+from functools import wraps
+
 from requests import Response, Session
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 
 from .utils import process_headers, process_params
 
+def deprecated(reason):
+    """Decorator to mark functions as deprecated."""
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            warnings.warn(
+                f"{func.__name__}() is deprecated. {reason}",
+                category=DeprecationWarning,
+                stacklevel=2
+            )
+            return func(*args, **kwargs)
+        return wrapper
+    return decorator
+
 class ScrapingBeeClient:
 
     # API Endpoints
@@ -59,6 +76,7 @@ def request(
     # HTML API (Legacy - WILL BE REMOVED)
     # ============================================
 
+    @deprecated("Please use html_api() instead. This method will be removed in a future release.")
     def get(
         self,
         url: str,
@@ -89,6 +107,7 @@ def get(
             **kwargs
         )
 
+    @deprecated("Please use html_api() instead. This method will be removed in a future release.")
     def post(
         self,
         url: str,

From 537774679da56aa1cedf3a4c2c2dc326df594b83 Mon Sep 17 00:00:00 2001
From: Sahil Sunny
Date: Thu, 22 Jan 2026 20:54:26 +0530
Subject: [PATCH 3/3] Minor change

Fixed a lint issue caused by incorrect spacing

---
 scrapingbee/client.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapingbee/client.py b/scrapingbee/client.py
index ebf57b1..766b60b 100644
--- a/scrapingbee/client.py
+++ b/scrapingbee/client.py
@@ -1,12 +1,15 @@
 import warnings
 from functools import wraps
+
 from requests import Response, Session
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
+
 from .utils import process_headers, process_params
+
 def deprecated(reason):
     """Decorator to mark functions as deprecated."""
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             warnings.warn(
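
As a usage note on PATCH 2/3: a minimal sketch of how the new deprecation warning surfaces to callers. This snippet is not part of the patches; it assumes this branch is installed and that `'YOUR-API-KEY'` is replaced with a real key, and the call below performs a live HTML API request.

```python
import warnings

from scrapingbee import ScrapingBeeClient

client = ScrapingBeeClient(api_key='YOUR-API-KEY')

# DeprecationWarning is ignored by default unless raised from __main__,
# so surface it explicitly before calling the legacy method.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always', DeprecationWarning)
    response = client.get('https://www.scrapingbee.com')

print(caught[0].message)
# get() is deprecated. Please use html_api() instead.
# This method will be removed in a future release.
```

Migrating away from the warning is mechanical: `get(url, ...)` becomes `html_api(url, ...)` and `post(url, ...)` becomes `html_api(url, method='POST', ...)`, matching the README examples added in PATCH 1/3.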