diff --git a/integrations/langchain.mdx b/integrations/langchain.mdx index aed504f..d6fb0e2 100644 --- a/integrations/langchain.mdx +++ b/integrations/langchain.mdx @@ -25,20 +25,20 @@ pip install langchain-scrapegraph ## Available Tools -### SmartScraperTool +### ExtractTool Extract structured data from any webpage using natural language prompts: ```python -from langchain_scrapegraph.tools import SmartScraperTool +from langchain_scrapegraph.tools import ExtractTool # Initialize the tool (uses SGAI_API_KEY from environment) -tool = SmartscraperTool() +tool = ExtractTool() # Extract information using natural language result = tool.invoke({ - "website_url": "https://www.example.com", - "user_prompt": "Extract the main heading and first paragraph" + "url": "https://www.example.com", + "prompt": "Extract the main heading and first paragraph" }) ``` @@ -46,60 +46,51 @@ result = tool.invoke({ Define the structure of the output using Pydantic models: ```python -from typing import List from pydantic import BaseModel, Field -from langchain_scrapegraph.tools import SmartScraperTool +from langchain_scrapegraph.tools import ExtractTool class WebsiteInfo(BaseModel): - title: str = Field(description="The main title of the webpage") - description: str = Field(description="The main description or first paragraph") - urls: List[str] = Field(description="The URLs inside the webpage") + title: str = Field(description="The main title of the page") + description: str = Field(description="The main description") -# Initialize with schema -tool = SmartScraperTool(llm_output_schema=WebsiteInfo) +# Initialize with output schema +tool = ExtractTool(llm_output_schema=WebsiteInfo) result = tool.invoke({ - "website_url": "https://www.example.com", - "user_prompt": "Extract the website information" + "url": "https://example.com", + "prompt": "Extract the title and description" }) ``` -### SearchScraperTool +### SearchTool -Process HTML content directly with AI extraction: +Search the web and extract structured results using AI: ```python -from langchain_scrapegraph.tools import SearchScraperTool +from langchain_scrapegraph.tools import SearchTool - -tool = SearchScraperTool() +tool = SearchTool() result = tool.invoke({ - "user_prompt": "Find the best restaurants in San Francisco", + "query": "Find the best restaurants in San Francisco", }) - ``` - -```python -from typing import Optional -from pydantic import BaseModel, Field -from langchain_scrapegraph.tools import SearchScraperTool +### ScrapeTool -class RestaurantInfo(BaseModel): - name: str = Field(description="The restaurant name") - address: str = Field(description="The restaurant address") - rating: float = Field(description="The restaurant rating") +Scrape a webpage and return it in the desired format. Supported formats: `markdown`, `html`, `screenshot`, `branding`, `links`, `images`, `summary`. +```python +from langchain_scrapegraph.tools import ScrapeTool -tool = SearchScraperTool(llm_output_schema=RestaurantInfo) +tool = ScrapeTool() -result = tool.invoke({ - "user_prompt": "Find the best restaurants in San Francisco" -}) +# Scrape as markdown (default) +result = tool.invoke({"url": "https://example.com"}) +# Scrape as HTML +result = tool.invoke({"url": "https://example.com", "format": "html"}) ``` - ### MarkdownifyTool @@ -112,34 +103,151 @@ tool = MarkdownifyTool() markdown = tool.invoke({"website_url": "https://example.com"}) ``` +### Crawl Tools + +Start and manage crawl jobs with `CrawlStartTool`, `CrawlStatusTool`, `CrawlStopTool`, and `CrawlResumeTool`: + +```python +import time +from langchain_scrapegraph.tools import CrawlStartTool, CrawlStatusTool + +start_tool = CrawlStartTool() +status_tool = CrawlStatusTool() + +# Start a crawl job +result = start_tool.invoke({ + "url": "https://example.com", + "max_depth": 2, + "max_pages": 5, + "format": "markdown", +}) +print("Crawl started:", result) + +# Check status +crawl_id = result.get("id") +if crawl_id: + time.sleep(5) + status = status_tool.invoke({"crawl_id": crawl_id}) + print("Crawl status:", status) +``` + +### Monitor Tools + +Create and manage monitors (replaces scheduled jobs) with `MonitorCreateTool`, `MonitorListTool`, `MonitorGetTool`, `MonitorPauseTool`, `MonitorResumeTool`, and `MonitorDeleteTool`: + +```python +from langchain_scrapegraph.tools import MonitorCreateTool, MonitorListTool + +create_tool = MonitorCreateTool() +list_tool = MonitorListTool() + +# Create a monitor (interval accepts cron expressions or shorthand like "1h", "30m") +result = create_tool.invoke({ + "url": "https://example.com/products", + "name": "Price Monitor", + "interval": "0 9 * * *", # Daily at 9 AM + "prompt": "Extract current product prices", # optional JSON extraction +}) +print("Monitor created:", result) + +# List all monitors +monitors = list_tool.invoke({}) +print("All monitors:", monitors) +``` + +### HistoryTool + +Retrieve request history, optionally filtered by service with pagination: + +```python +from langchain_scrapegraph.tools import HistoryTool + +tool = HistoryTool() + +# List the most recent requests +history = tool.invoke({}) + +# Filter to a specific service and page +history = tool.invoke({"service": "scrape", "page": 1, "limit": 20}) +``` + +### GetCreditsTool + +Check your remaining API credits: + +```python +from langchain_scrapegraph.tools import GetCreditsTool + +tool = GetCreditsTool() +credits = tool.invoke({}) +``` + ## Example Agent Create a research agent that can gather and analyze web data: ```python -from langchain.agents import initialize_agent, AgentType -from langchain_scrapegraph.tools import SmartScraperTool +from langchain.agents import AgentExecutor, create_openai_functions_agent +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_openai import ChatOpenAI +from langchain_scrapegraph.tools import ExtractTool, GetCreditsTool, SearchTool -# Initialize tools +# Initialize the tools tools = [ - SmartScraperTool(), + ExtractTool(), + GetCreditsTool(), + SearchTool(), ] -# Create an agent -agent = initialize_agent( - tools=tools, - llm=ChatOpenAI(temperature=0), - agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, - verbose=True -) - -# Use the agent -response = agent.run(""" - Visit example.com, make a summary of the content and extract the main heading and first paragraph -""") +# Create the prompt template +prompt = ChatPromptTemplate.from_messages([ + SystemMessage( + content=( + "You are a helpful AI assistant that can analyze websites and extract information. " + "You have access to tools that can help you scrape and process web content. " + "Always explain what you're doing before using a tool." + ) + ), + MessagesPlaceholder(variable_name="chat_history", optional=True), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), +]) + +# Initialize the LLM +llm = ChatOpenAI(temperature=0) + +# Create the agent +agent = create_openai_functions_agent(llm, tools, prompt) +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + +# Example usage +response = agent_executor.invoke({ + "input": "Extract the main products from https://www.scrapegraphai.com/" +}) +print(response["output"]) ``` +## Migration from v1 + +If you're upgrading from v1, here are the key changes: + +| v1 Tool | v2 Tool | +|---------|---------| +| `SmartScraperTool` | `ExtractTool` | +| `SearchScraperTool` | `SearchTool` | +| `SmartCrawlerTool` | `CrawlStartTool` / `CrawlStatusTool` / `CrawlStopTool` / `CrawlResumeTool` | +| `CreateScheduledJobTool` | `MonitorCreateTool` | +| `GetScheduledJobsTool` | `MonitorListTool` | +| `GetScheduledJobTool` | `MonitorGetTool` | +| `PauseScheduledJobTool` | `MonitorPauseTool` | +| `ResumeScheduledJobTool` | `MonitorResumeTool` | +| `DeleteScheduledJobTool` | `MonitorDeleteTool` | +| `MarkdownifyTool` | `MarkdownifyTool` (unchanged) | +| `GetCreditsTool` | `GetCreditsTool` (unchanged) | +| `AgenticScraperTool` | Removed | +| -- | `HistoryTool` (new) | + ## Configuration Set your ScrapeGraph API key in your environment: @@ -156,7 +264,7 @@ os.environ["SGAI_API_KEY"] = "your-api-key-here" ``` -Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) +Get your API key from the [dashboard](https://scrapegraphai.com/dashboard) ## Use Cases