From 78cc4f9315b538624a0a12883ab2b7a616bab1ff Mon Sep 17 00:00:00 2001 From: mohammadehsanansari Date: Thu, 3 Jul 2025 12:07:28 +0530 Subject: [PATCH] updated docs --- .../endpoint/smartcrawler/get-status.mdx | 175 ++++++++++++++++++ api-reference/endpoint/smartcrawler/start.mdx | 106 +++++++++++ api-reference/openapi.json | 84 +++++++++ introduction.mdx | 1 + mint.json | 9 + services/smartcrawler.mdx | 15 -- 6 files changed, 375 insertions(+), 15 deletions(-) create mode 100644 api-reference/endpoint/smartcrawler/get-status.mdx create mode 100644 api-reference/endpoint/smartcrawler/start.mdx diff --git a/api-reference/endpoint/smartcrawler/get-status.mdx b/api-reference/endpoint/smartcrawler/get-status.mdx new file mode 100644 index 0000000..abdcb66 --- /dev/null +++ b/api-reference/endpoint/smartcrawler/get-status.mdx @@ -0,0 +1,175 @@ +--- +title: 'Get SmartCrawler Status' +api: 'GET /v1/crawl/{task_id}' +description: 'Get the status and results of a previous smartcrawl request' +--- +# Get SmartCrawl Result + +**GET** `/v1/crawl/{task_id}` + +Retrieve the result and status of a crawl job by its task ID. + +--- + +## Path Parameters + +- `task_id` (string, required): The ID of the crawl job task. + +--- + +## Response + +- **200 OK**: Returns the crawl job status, result, crawled URLs, and pages. +- **422 Unprocessable Entity**: Validation error. + +### Example Response +```json +{ + "status": "success", + "result": { + "status": "done", + "llm_result": { + "company": { + "name": "ScrapeGraphAI, Inc", + "description": "ScrapeGraphAI is a company that provides web scraping services using artificial intelligence, and also offers a powerful AI-driven API for web scraping. 
They transform websites into structured data, making it easy for AI agents and developers to collect data from websites.", + "features": [ + "AI Agent Ready", + "Universal Data Extraction", + "Intelligent Processing", + "Lightning Fast Setup", + "Enterprise Ready", + "Web scraping", + "Artificial intelligence", + "Data extraction", + "AI-driven web scraping", + "Structured data output", + "Easy integration with Python, JavaScript, and TypeScript", + "Handles website changes and maintenance", + "High performance, reliability, and scalability" + ], + "contact_email": "contact@scrapegraphai.com", + "social_links": { + "github": "https://github.com/ScrapeGraphAI/Scrapegraph-ai", + "linkedin": "https://www.linkedin.com/company/101881123", + "twitter": "https://x.com/scrapegraphai" + } + }, + "services": [ + { + "service_name": "Markdownify", + "description": "Convert webpage to markdown format", + "features": [ + "2 credits / Web page" + ] + }, + { + "service_name": "Smart Scraper", + "description": "Structured AI web scraping given an URL or html content", + "features": [ + "10 credits / Web page" + ] + }, + { + "service_name": "Search Scraper", + "description": "Structured AI scraping given a search query", + "features": [ + "30 credits / query" + ] + }, + { + "service_name": "Spidy Agent", + "description": "Generate code for Markdownify, Smart Scraper and Search Scraper services", + "features": [] + }, + { + "service_name": "Web Scraping", + "description": "ScrapeGraphAI provides web scraping services to extract data from websites.", + "features": [ + "Data extraction", + "Web crawling", + "Artificial intelligence" + ] + }, + { + "service_name": "Data Extraction", + "description": "ScrapeGraphAI provides data extraction services to extract specific data from websites. 
They also offer high-quality data extraction using AI to ensure accuracy and completeness.", + "features": [ + "Data mining", + "Web scraping", + "Artificial intelligence", + "High-quality data extraction", + "Handles large volumes of data", + "Customizable data output" + ] + }, + { + "service_name": "Web Scraping API", + "description": "ScrapeGraphAI's API provides a simple and efficient way to extract data from websites, using AI to handle complex web pages and structures.", + "features": [ + "Handles complex web pages and structures", + "Extracts data in a structured format", + "Easy to integrate with existing applications" + ] + } + ], + "legal": { + "privacy_policy": "https://scrapegraphai.com/privacy", + "terms_of_service": "https://scrapegraphai.com/terms" + } + }, + "crawled_urls": [ + "https://scrapegraphai.com/privacy/", + "https://scrapegraphai.com/privacy", + "https://scrapegraphai.com/welcome", + "https://scrapegraphai.com/", + "https://scrapegraphai.com/playground", + "https://scrapegraphai.com/pricing", + "https://scrapegraphai.com/terms/", + "https://scrapegraphai.com/terms", + "https://scrapegraphai.com/affiliate", + "https://scrapegraphai.com/oss" + ], + "pages": [ + { + "url": "https://scrapegraphai.com/", + "markdown": "# Transform Websites into Structured Data\n\n### Just One Prompt Away\n..." + }, + { + "url": "https://scrapegraphai.com/affiliate", + "markdown": "# Join Our Affiliate Program\n\nPartner with ScrapeGraphAI and earn generous commissions by promoting our powerful web scraping solutions\n..." + }, + { + "url": "https://scrapegraphai.com/terms", + "markdown": "# Terms of Service\n\nLast updated: March 15, 2024\n..." + }, + { + "url": "https://scrapegraphai.com/terms/", + "markdown": "# Terms of Service\n\nLast updated: March 15, 2024\n..." + }, + { + "url": "https://scrapegraphai.com/pricing", + "markdown": "## Simple, transparent pricing\n\nPay only for what you use. No hidden fees, no surprises. 
Start with our free tier and scale as you grow.\n..." + }, + { + "url": "https://scrapegraphai.com/oss", + "markdown": "# Transform Websites into Structured Data\n\n### Just One Prompt Away\n..." + }, + { + "url": "https://scrapegraphai.com/privacy", + "markdown": "# Privacy Policy\n\nLast updated: March 15, 2024\n..." + }, + { + "url": "https://scrapegraphai.com/privacy/", + "markdown": "# Privacy Policy\n\nLast updated: March 15, 2024\n..." + }, + { + "url": "https://scrapegraphai.com/welcome", + "markdown": "🎉\n\nWelcome to the ScrapeGraphai's Referral Program\n..." + }, + { + "url": "https://scrapegraphai.com/playground", + "markdown": "Loading...\n\n![ScrapeGraphAI Spider Logo](https://scrapegraphai.com/images/scrapegraphai_logo.svg)ScrapeGraphAI\n..." + } + ] + } +} diff --git a/api-reference/endpoint/smartcrawler/start.mdx b/api-reference/endpoint/smartcrawler/start.mdx new file mode 100644 index 0000000..21ed0ca --- /dev/null +++ b/api-reference/endpoint/smartcrawler/start.mdx @@ -0,0 +1,106 @@ +--- +title: 'Start SmartCrawler' +api: 'POST /v1/crawl' +description: 'Start a new AI-powered web crawl request' +--- + +# Start Crawl + +**POST** `/v1/crawl` + +Start a new crawl job using SmartCrawler. + +--- + +## Request Body + +Content-Type: `application/json` + +### Schema +```json +{ + "url": "string", + "prompt": "string", + "cache_website": "boolean", + "depth": "integer", + "max_pages": "integer", + "same_domain_only": "boolean", + "batch_size": "integer", + "schema": { /* JSON Schema object */ } +} +``` + +### Example +```json +{ + "url": "https://scrapegraphai.com/", + "prompt": "What does the company do? 
and I need text content from their privacy and terms", + "cache_website": true, + "depth": 2, + "max_pages": 2, + "same_domain_only": true, + "batch_size": 1, + "schema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ScrapeGraphAI Website Content", + "type": "object", + "properties": { + "company": { + "type": "object", + "properties": { + "name": { "type": "string" }, + "description": { "type": "string" }, + "features": { + "type": "array", + "items": { "type": "string" } + }, + "contact_email": { "type": "string", "format": "email" }, + "social_links": { + "type": "object", + "properties": { + "github": { "type": "string", "format": "uri" }, + "linkedin": { "type": "string", "format": "uri" }, + "twitter": { "type": "string", "format": "uri" } + }, + "additionalProperties": false + } + }, + "required": ["name", "description"] + }, + "services": { + "type": "array", + "items": { + "type": "object", + "properties": { + "service_name": { "type": "string" }, + "description": { "type": "string" }, + "features": { + "type": "array", + "items": { "type": "string" } + } + }, + "required": ["service_name", "description"] + } + }, + "legal": { + "type": "object", + "properties": { + "privacy_policy": { "type": "string" }, + "terms_of_service": { "type": "string" } + }, + "required": ["privacy_policy", "terms_of_service"] + } + }, + "required": ["company", "services", "legal"] + } +} +``` + +--- + +## Response + +- **200 OK**: Crawl started successfully. Returns `{ "task_id": "<task_id>" }`. Use this `task_id` to retrieve the crawl result from the [Get Crawl Result](./get-status) endpoint. +- **422 Unprocessable Entity**: Validation error. + +See the [Get Crawl Result](./get-status) endpoint for the full response structure. 
diff --git a/api-reference/openapi.json b/api-reference/openapi.json index b01ce02..fbc4a2d 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -505,8 +505,92 @@ } ] } + }, + "/v1/crawl": { + "post": { + "tags": [ + "Crawler" + ], + "summary": "Start Crawl", + "operationId": "start_crawl_v1_crawl_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CrawlJob" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/crawl/{task_id}": { + "get": { + "tags": [ + "Crawler" + ], + "summary": "Get Crawl Result", + "operationId": "get_crawl_result_v1_crawl__task_id__get", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "title": "Task Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } } }, + "components": { "schemas": { "CompletedMarkdownifyResponse": { diff --git a/introduction.mdx b/introduction.mdx index 9876c8c..dd520cd 100644 --- a/introduction.mdx +++ b/introduction.mdx @@ -65,6 +65,7 @@ ScrapeGraphAI is a powerful suite of LLM-driven web scraping tools designed to e - **SmartScraper**: AI-powered extraction for any webpage - **SearchScraper**: Find and extract any data using AI starting from a prompt +- **SmartCrawler**: AI-powered extraction across multiple pages by crawling a website from a starting URL - **Markdownify**: Convert web content to clean Markdown format diff 
--git a/mint.json b/mint.json index 9b2c131..d720fbc 100644 --- a/mint.json +++ b/mint.json @@ -84,6 +84,7 @@ "pages": [ "services/smartscraper", "services/searchscraper", + "services/smartcrawler", "services/markdownify", { "group": "Additional Parameters", @@ -148,6 +149,7 @@ "api-reference/endpoint/smartscraper/get-status" ] }, + { "group": "SearchScraper", "pages": [ @@ -155,6 +157,13 @@ "api-reference/endpoint/searchscraper/get-status" ] }, + { + "group": "SmartCrawler", + "pages": [ + "api-reference/endpoint/smartcrawler/start", + "api-reference/endpoint/smartcrawler/get-status" + ] + }, { "group": "Markdownify", diff --git a/services/smartcrawler.mdx b/services/smartcrawler.mdx index e0eeeff..40b06cd 100644 --- a/services/smartcrawler.mdx +++ b/services/smartcrawler.mdx @@ -254,21 +254,6 @@ const result = await scraper.getResult(task.taskId); -### Infinite Scroll Support - -SmartCrawler can handle infinite scroll pages by scrolling on each page before extraction. Use `number_of_scrolls` to control this. - -| Parameter | Type | Required | Description | -|-------------------|---------|----------|-------------| -| number_of_scrolls | int | No | Number of scrolls per page (default: 0) | - - -Infinite scroll is useful for: -- Social media feeds -- E-commerce listings -- News sites with continuous scroll - - ### Validation & Error Handling SmartCrawler performs advanced validation: