Official Python SDK for the ScrapeGraphAI API.

pip install scrapegraph-py
# or
uv add scrapegraph-py

from scrapegraph_py import ScrapeGraphAI
# reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI(api_key="...")
sgai = ScrapeGraphAI()
result = sgai.scrape("https://example.com")
if result.status == "success":
print(result.data["results"]["markdown"]["data"])
else:
print(result.error)

Every method returns ApiResult[T] — no exceptions to catch:
@dataclass
class ApiResult(Generic[T]):
status: Literal["success", "error"]
data: T | None
error: str | None
elapsed_ms: int

Scrape a webpage in multiple formats (markdown, html, screenshot, json, etc).
from scrapegraph_py import (
ScrapeGraphAI, FetchConfig,
MarkdownFormatConfig, ScreenshotFormatConfig, JsonFormatConfig
)
sgai = ScrapeGraphAI()
res = sgai.scrape(
"https://example.com",
formats=[
MarkdownFormatConfig(mode="reader"),
ScreenshotFormatConfig(full_page=True, width=1440, height=900),
JsonFormatConfig(prompt="Extract product info"),
],
content_type="text/html", # optional, auto-detected
fetch_config=FetchConfig( # optional
mode="js", # "auto" | "fast" | "js"
stealth=True,
timeout=30000,
wait=2000,
scrolls=3,
headers={"Accept-Language": "en"},
cookies={"session": "abc"},
country="us",
),
)

Formats:
- markdown — Clean markdown (modes: normal, reader, prune)
- html — Raw HTML (modes: normal, reader, prune)
- links — All links on the page
- images — All image URLs
- summary — AI-generated summary
- json — Structured extraction with prompt/schema
- branding — Brand colors, typography, logos
- screenshot — Page screenshot (full_page, width, height, quality)
Extract structured data from a URL, HTML, or markdown using AI.
from scrapegraph_py import ScrapeGraphAI
sgai = ScrapeGraphAI()
res = sgai.extract(
prompt="Extract product names and prices",
url="https://example.com",
schema={"type": "object", "properties": {...}}, # optional
mode="reader", # optional
# Or pass html/markdown directly instead of url
)

Search the web and optionally extract structured data.
from scrapegraph_py import ScrapeGraphAI
sgai = ScrapeGraphAI()
res = sgai.search(
"best programming languages 2024",
num_results=5, # 1-20, default 3
format="markdown", # "markdown" | "html"
prompt="Extract key points", # optional, for AI extraction
schema={...}, # optional
time_range="past_week", # optional
location_geo_code="us", # optional
)

Crawl a website and its linked pages.
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig
sgai = ScrapeGraphAI()
# Start a crawl
start = sgai.crawl.start(
"https://example.com",
formats=[MarkdownFormatConfig()],
max_pages=50,
max_depth=2,
max_links_per_page=10,
include_patterns=["/blog/*"],
exclude_patterns=["/admin/*"],
)
# Check status
status = sgai.crawl.get(start.data["id"])
# Control
sgai.crawl.stop(crawl_id)
sgai.crawl.resume(crawl_id)
sgai.crawl.delete(crawl_id)

Monitor a webpage for changes on a schedule.
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig
sgai = ScrapeGraphAI()
# Create a monitor
mon = sgai.monitor.create(
"https://example.com",
"0 * * * *", # cron expression
name="Price Monitor",
formats=[MarkdownFormatConfig()],
webhook_url="https://...", # optional
)
# Manage monitors
sgai.monitor.list()
sgai.monitor.get(cron_id)
sgai.monitor.update(cron_id, interval="0 */6 * * *")
sgai.monitor.pause(cron_id)
sgai.monitor.resume(cron_id)
sgai.monitor.delete(cron_id)

Fetch request history.
from scrapegraph_py import ScrapeGraphAI
sgai = ScrapeGraphAI()
history = sgai.history.list(
service="scrape", # optional filter
page=1,
limit=20,
)
entry = sgai.history.get("request-id")

from scrapegraph_py import ScrapeGraphAI
sgai = ScrapeGraphAI()
credits = sgai.credits()
# { remaining: 1000, used: 500, plan: "pro", jobs: { crawl: {...}, monitor: {...} } }
health = sgai.health()
# { status: "ok", uptime: 12345 }

All methods have async equivalents via AsyncScrapeGraphAI:
import asyncio
from scrapegraph_py import AsyncScrapeGraphAI
async def main():
async with AsyncScrapeGraphAI() as sgai:
result = await sgai.scrape("https://example.com")
if result.status == "success":
print(result.data["results"]["markdown"]["data"])
else:
print(result.error)
asyncio.run(main())

async with AsyncScrapeGraphAI() as sgai:
res = await sgai.extract(
prompt="Extract product names and prices",
url="https://example.com",
)

async with AsyncScrapeGraphAI() as sgai:
res = await sgai.search("best programming languages 2024", num_results=5)

async with AsyncScrapeGraphAI() as sgai:
start = await sgai.crawl.start("https://example.com", max_pages=50)
status = await sgai.crawl.get(start.data["id"])

async with AsyncScrapeGraphAI() as sgai:
mon = await sgai.monitor.create(
"https://example.com",
"0 * * * *",
name="Price Monitor",
)

| Service | Example | Description |
|---|---|---|
| scrape | scrape_basic.py | Basic markdown scraping |
| scrape | scrape_multi_format.py | Multiple formats |
| scrape | scrape_json_extraction.py | Structured JSON extraction |
| scrape | scrape_pdf.py | PDF document parsing |
| scrape | scrape_with_fetchconfig.py | JS rendering, stealth mode |
| extract | extract_basic.py | AI data extraction |
| extract | extract_with_schema.py | Extraction with JSON schema |
| search | search_basic.py | Web search |
| search | search_with_extraction.py | Search + AI extraction |
| crawl | crawl_basic.py | Start and monitor a crawl |
| crawl | crawl_with_formats.py | Crawl with formats |
| monitor | monitor_basic.py | Create a page monitor |
| monitor | monitor_with_webhook.py | Monitor with webhook |
| utilities | credits.py | Check credits and limits |
| utilities | health.py | API health check |
| utilities | history.py | Request history |
| Service | Example | Description |
|---|---|---|
| scrape | scrape_basic_async.py | Basic markdown scraping |
| scrape | scrape_multi_format_async.py | Multiple formats |
| scrape | scrape_json_extraction_async.py | Structured JSON extraction |
| scrape | scrape_pdf_async.py | PDF document parsing |
| scrape | scrape_with_fetchconfig_async.py | JS rendering, stealth mode |
| extract | extract_basic_async.py | AI data extraction |
| extract | extract_with_schema_async.py | Extraction with JSON schema |
| search | search_basic_async.py | Web search |
| search | search_with_extraction_async.py | Search + AI extraction |
| crawl | crawl_basic_async.py | Start and monitor a crawl |
| crawl | crawl_with_formats_async.py | Crawl with formats |
| monitor | monitor_basic_async.py | Create a page monitor |
| monitor | monitor_with_webhook_async.py | Monitor with webhook |
| utilities | credits_async.py | Check credits and limits |
| utilities | health_async.py | API health check |
| utilities | history_async.py | Request history |
| Variable | Description | Default |
|---|---|---|
| SGAI_API_KEY | Your ScrapeGraphAI API key | — |
| SGAI_API_URL | Override API base URL | https://v2-api.scrapegraphai.com/api |
| SGAI_DEBUG | Enable debug logging ("1") | off |
| SGAI_TIMEOUT | Request timeout in seconds | 120 |
uv sync
uv run pytest tests/ # unit tests
uv run pytest tests/test_integration.py # live API tests (requires SGAI_API_KEY)
uv run ruff check . # lint

MIT - ScrapeGraphAI