-
Notifications
You must be signed in to change notification settings - Fork 4
feat: add Firecrawl scraping feature #45
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -256,3 +256,4 @@ conf/* | |
| !conf/.gitkeep | ||
| .run/Template Python tests.run.xml | ||
| /.run/ | ||
| .DS_Store | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| from dataclasses import dataclass | ||
| from typing import Any | ||
|
|
||
| from app.models.metadata_item import MetadataItem | ||
|
|
||
|
|
||
| @dataclass | ||
| class FirecrawlItem(MetadataItem): | ||
| """ | ||
| FirecrawlItem: Data class for scraped content from Firecrawl. | ||
| """ | ||
| id: str = "" | ||
| raw_content: str = "" | ||
|
|
||
| @staticmethod | ||
| def from_dict(obj: Any) -> "FirecrawlItem": | ||
| metadata_item = MetadataItem.from_dict(obj) | ||
| return FirecrawlItem( | ||
| url=metadata_item.url, | ||
| title=metadata_item.title, | ||
| author=metadata_item.author, | ||
| author_url=metadata_item.author_url, | ||
| telegraph_url=metadata_item.telegraph_url, | ||
| text=metadata_item.text, | ||
| content=metadata_item.content, | ||
| media_files=metadata_item.media_files, | ||
| category=metadata_item.category, | ||
| message_type=metadata_item.message_type, | ||
| id=obj.get("id", ""), | ||
| raw_content=obj.get("raw_content", ""), | ||
| ) | ||
|
|
||
| def to_dict(self) -> dict: | ||
| result: dict = super().to_dict() | ||
| result["id"] = self.id | ||
| result["raw_content"] = self.raw_content | ||
| return result |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import threading | ||
| from dataclasses import dataclass | ||
| from typing import Any, Dict, List, Optional | ||
|
|
||
| from firecrawl import Firecrawl | ||
|
|
||
| from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY, FIRECRAWL_TIMEOUT_SECONDS | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class FirecrawlSettings: | ||
| api_url: str | ||
| api_key: str | ||
| timeout_seconds: int = 60 # 你也可以在反代侧控制超时 | ||
|
|
||
|
|
||
| class FirecrawlClient: | ||
| """ | ||
| FirecrawlClient: 对 firecrawl python SDK 的封装 + 单例访问点。 | ||
|
|
||
| - 提供 scrape / crawl 等常用方法,方便其他模块调用 | ||
| - 线程安全单例(适合 Web 服务 / worker 多线程场景) | ||
| """ | ||
|
|
||
| _instance: Optional["FirecrawlClient"] = None | ||
| _lock = threading.Lock() | ||
|
|
||
| def __init__(self, config: FirecrawlSettings): | ||
| self._settings: FirecrawlSettings = config | ||
| self._app: Firecrawl = self._create_app(config) | ||
|
|
||
| @staticmethod | ||
| def _create_app(config: FirecrawlSettings) -> Firecrawl: | ||
| try: | ||
| return Firecrawl(api_url=config.api_url, api_key=config.api_key) | ||
| except TypeError: | ||
| return Firecrawl(api_url=config.api_url, api_key=config.api_key) | ||
|
|
||
| @classmethod | ||
| def get_instance(cls) -> "FirecrawlClient": | ||
| """ | ||
| 线程安全的单例获取。 | ||
| - 首次调用可传 settings | ||
| - 之后重复调用可不传 | ||
| """ | ||
| if cls._instance is not None: | ||
| return cls._instance | ||
|
|
||
| with cls._lock: | ||
| if cls._instance is not None: | ||
| return cls._instance | ||
|
|
||
| config = FirecrawlSettings( | ||
| api_url=FIRECRAWL_API_URL, | ||
| api_key=FIRECRAWL_API_KEY, | ||
| timeout_seconds=FIRECRAWL_TIMEOUT_SECONDS, | ||
| ) | ||
|
|
||
| cls._instance = cls(config) | ||
| return cls._instance | ||
|
|
||
| @classmethod | ||
| def reset_instance(cls) -> None: | ||
| """测试用:重置单例。""" | ||
| with cls._lock: | ||
| cls._instance = None | ||
|
|
||
| def scrape_url( | ||
| self, | ||
| url: str, | ||
| formats: Optional[List[str]] = None, | ||
| only_main_content: bool = True, | ||
| timeout_seconds: Optional[int] = None, | ||
| extra_params: Optional[Dict[str, Any]] = None, | ||
| ) -> Dict[str, Any]: | ||
| """ | ||
| 单页抓取(最常用) | ||
| """ | ||
| params: Dict[str, Any] = { | ||
| "formats": formats or ["markdown"], | ||
| "onlyMainContent": only_main_content, | ||
| } | ||
| if extra_params: | ||
| params.update(extra_params) | ||
|
|
||
| # if timeout_seconds is None: | ||
| # timeout_seconds = self._settings.timeout_seconds | ||
|
|
||
| try: | ||
| return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump( | ||
| exclude_none=True) | ||
|
Comment on lines
+70
to
+93
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: find . -type f -name "client.py" | grep firecrawlRepository: aturret/FastFetchBot Length of output: 114 🏁 Script executed: cat -n app/services/scrapers/firecrawl_client/client.pyRepository: aturret/FastFetchBot Length of output: 3450 🏁 Script executed: python3 << 'EOF'
import subprocess
result = subprocess.run(['pip', 'show', 'firecrawl'], capture_output=True, text=True)
print(result.stdout)
EOFRepository: aturret/FastFetchBot Length of output: 150 🏁 Script executed: # Search for how firecrawl scrape is used in the codebase
rg "scrape_url" --type py -A 3 -B 1Repository: aturret/FastFetchBot Length of output: 1059 🏁 Script executed: # Check if there are other usages or tests of the firecrawl SDK
rg "_app\.scrape|Firecrawl" --type pyRepository: aturret/FastFetchBot Length of output: 3710 🌐 Web query:
💡 Result: Summary of firecrawl-py /scrape method parameters (concise):
Sources: If you want, I can paste exact parameter names and example Python signatures from the SDK docs. Citations:
🌐 Web query:
💡 Result: The Firecrawl Python SDK scrape signature (synchronous) — from the SDK docs/source — is: def scrape( (Example usage and parameter details in the Firecrawl docs and the package docs mirror.) [1][2] Sources:
🏁 Script executed: # Check if there's a requirements.txt or setup.py that shows firecrawl version
find . -type f \( -name "requirements*.txt" -o -name "setup.py" -o -name "pyproject.toml" \) -exec cat {} \;Repository: aturret/FastFetchBot Length of output: 1668 🏁 Script executed: python3 << 'EOF'
import subprocess
import json
# Try to get info about firecrawl package
result = subprocess.run(['pip', 'show', 'firecrawl-py', '-f'], capture_output=True, text=True)
print("Package info:")
print(result.stdout)
print("\nStderr:")
print(result.stderr)
EOFRepository: aturret/FastFetchBot Length of output: 132 🌐 Web query:
💡 Result: Example — Python (firecrawl-py) using pageOptions.onlyMainContent: from firecrawl import FirecrawlApp References: Firecrawl docs — Page Options and Scrape examples. [1][2] 🏁 Script executed: # Check if scraper.py shows how scrape_url is being called
sed -n '1,50p' app/services/scrapers/firecrawl_client/scraper.pyRepository: aturret/FastFetchBot Length of output: 2048 🌐 Web query:
💡 Result: Pass the timeout as the timeout keyword when calling scrape; it expects milliseconds. Example: Python Docs: scrape endpoint timeout is in milliseconds and the Python SDK forwards that parameter. [1][2] Sources 🌐 Web query:
💡 Result: The firecrawl Python SDK's scrape method accepts options via keyword args (examples: formats, scrape_options, poll_interval). See the SDK docs and examples in the repository README. [1][2] Sources:
Wire the unused `timeout_seconds` parameter into the request. The method builds a `params` dict but never passes it to the SDK call, so `timeout_seconds` (and the configured default) is silently ignored. 🔧 Suggested direction (adjust based on SDK version's params handling) params: Dict[str, Any] = {
"formats": formats or ["markdown"],
"onlyMainContent": only_main_content,
}
+ if timeout_seconds is None:
+ timeout_seconds = self._settings.timeout_seconds
+ if timeout_seconds is not None:
+ params["timeout"] = timeout_seconds * 1000 # Convert seconds to milliseconds
if extra_params:
params.update(extra_params)
try:
- return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump(
- exclude_none=True)
+ return self._app.scrape(url, **params).model_dump(exclude_none=True)🧰 Tools🪛 Ruff (0.14.13)75-75: Unused method argument: (ARG002) 79-79: Docstring contains ambiguous (RUF002) 79-79: Docstring contains ambiguous (RUF002) 🤖 Prompt for AI Agents |
||
| except Exception as e: | ||
| raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,168 @@ | ||
| import hashlib | ||
| from urllib.parse import urlparse | ||
|
|
||
| from openai import AsyncOpenAI | ||
| from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam | ||
|
|
||
| from app.config import OPENAI_API_KEY | ||
| from app.models.metadata_item import MediaFile, MessageType | ||
| from app.services.scrapers.scraper import Scraper, DataProcessor | ||
| from app.services.scrapers.firecrawl_client import FirecrawlItem | ||
| from app.services.scrapers.firecrawl_client.client import FirecrawlClient | ||
| from app.utils.parse import get_html_text_length, wrap_text_into_html | ||
| from app.utils.logger import logger | ||
|
|
||
| FIRECRAWL_TEXT_LIMIT = 800 | ||
|
|
||
| # System prompt for LLM to extract article content | ||
| ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. | ||
|
|
||
| Instructions: | ||
| 1. Identify and extract ONLY the main article/post content | ||
| 2. Remove navigation, headers, footers, sidebars, ads, comments, and other non-article elements | ||
| 3. Preserve the article's structure (headings, paragraphs, lists, etc.) | ||
| 4. Keep important formatting like bold, italic, links, and images | ||
| 5. Return clean HTML containing only the article content | ||
| 6. If you cannot identify the main content, return the original HTML unchanged | ||
|
|
||
| Return ONLY the extracted HTML content, no explanations or markdown.""" | ||
|
|
||
|
|
||
| class FirecrawlDataProcessor(DataProcessor): | ||
| """ | ||
| FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. | ||
| """ | ||
|
|
||
| def __init__(self, url: str): | ||
| self.url: str = url | ||
| self._data: dict = {} | ||
| self.url_parser = urlparse(url) | ||
| self.id = hashlib.md5(url.encode()).hexdigest()[:16] | ||
| self._client: FirecrawlClient = FirecrawlClient.get_instance() | ||
|
|
||
| async def get_item(self) -> dict: | ||
| await self.process_data() | ||
| firecrawl_item = FirecrawlItem.from_dict(self._data) | ||
| return firecrawl_item.to_dict() | ||
|
|
||
| async def process_data(self) -> None: | ||
| await self._get_page_content() | ||
|
|
||
| async def _get_page_content(self) -> None: | ||
| try: | ||
| result = self._client.scrape_url( | ||
| url=self.url, | ||
| formats=["markdown", "html"], | ||
| only_main_content=True, | ||
| ) | ||
| await self._process_firecrawl_result(result) | ||
| except Exception as e: | ||
| logger.error(f"Failed to scrape URL with Firecrawl: {e}") | ||
| raise | ||
|
Comment on lines
+51
to
+61
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# First, let's find where FirecrawlClient is defined or imported
find . -type f -name "*.py" | xargs grep -l "class FirecrawlClient" 2>/dev/null || echo "No local FirecrawlClient class found"
# Search for scrape_url method definition in the repo
rg "def scrape_url|async def scrape_url" --type py -B 2 -A 10
# Also check the imports in the scraper.py file to understand where _client comes from
cat -n app/services/scrapers/firecrawl_client/scraper.py | head -50Repository: aturret/FastFetchBot Length of output: 3450 🏁 Script executed: #!/bin/bash
# Read the scrape_url method from client.py
cat -n app/services/scrapers/firecrawl_client/client.py | grep -A 30 "def scrape_url"Repository: aturret/FastFetchBot Length of output: 1153 🏁 Script executed: #!/bin/bash
# Read the FirecrawlClient class to see how _app is initialized
cat -n app/services/scrapers/firecrawl_client/client.py | head -70Repository: aturret/FastFetchBot Length of output: 2387 🌐 Web query:
💡 Result: Yes — the Firecrawl Python SDK includes async support via the AsyncFirecrawl class. Its methods mirror Firecrawl (awaitable versions of scrape, search, start_crawl, batch_scrape, watcher, etc.). Example usage and docs are on the official SDK pages. [1][2] Sources:
Use `AsyncFirecrawl` (or run the blocking call in an executor): `scrape_url` is synchronous, so calling it directly from the async `_get_page_content` blocks the event loop.
🤖 Prompt for AI Agents |
||
|
|
||
| @staticmethod | ||
| async def parsing_article_body_by_llm(html_content: str) -> str: | ||
| """ | ||
| Use LLM to extract the main article content from HTML. | ||
|
|
||
| Args: | ||
| html_content: Raw HTML content from Firecrawl | ||
|
|
||
| Returns: | ||
| Cleaned HTML containing only the main article content | ||
| """ | ||
| if not html_content: | ||
| return html_content | ||
|
|
||
| if not OPENAI_API_KEY: | ||
| logger.warning("OPENAI_API_KEY not configured, skipping LLM parsing") | ||
| return html_content | ||
|
|
||
| try: | ||
| client = AsyncOpenAI(api_key=OPENAI_API_KEY) | ||
|
|
||
| # Truncate content if too long to avoid token limits | ||
| max_content_length = 50000 | ||
| truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content | ||
|
|
||
| response = await client.chat.completions.create( | ||
| model="gpt-4o-mini", | ||
| messages=[ | ||
| ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT), | ||
| ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}") | ||
| ], | ||
| temperature=0.1, | ||
| max_tokens=16000, | ||
| ) | ||
|
|
||
| extracted_content = response.choices[0].message.content | ||
|
|
||
| if extracted_content: | ||
| logger.info("Successfully extracted article content using LLM") | ||
| return extracted_content.strip() | ||
| else: | ||
| logger.warning("LLM returned empty content, using original HTML") | ||
| return html_content | ||
|
|
||
| except Exception as e: | ||
| logger.error(f"Failed to parse article body with LLM: {e}") | ||
| return html_content | ||
|
|
||
| async def _process_firecrawl_result(self, result: dict) -> None: | ||
| metadata = result.get("metadata", {}) | ||
| markdown_content = result.get("markdown", "") | ||
| html_content = result.get("html", "") | ||
|
|
||
| # Extract metadata fields | ||
| title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url | ||
| author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc | ||
| description = metadata.get("description", "") or metadata.get("ogDescription", "") | ||
|
|
||
| item_data = { | ||
| "id": self.id, | ||
| "category": "other", | ||
| "url": self.url, | ||
| "title": title, | ||
| "author": author, | ||
| "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", | ||
| } | ||
|
|
||
| # Process text content - use description or first part of markdown | ||
| text = description if description else markdown_content[:500] | ||
| item_data["text"] = text | ||
|
|
||
| html_content = await self.parsing_article_body_by_llm(html_content) | ||
|
|
||
| # Process HTML content | ||
| if html_content: | ||
| content = wrap_text_into_html(html_content, is_html=True) | ||
| else: | ||
| content = wrap_text_into_html(markdown_content, is_html=False) | ||
| item_data["content"] = content | ||
| item_data["raw_content"] = markdown_content | ||
|
|
||
| # Process media files - extract og:image if available | ||
| media_files = [] | ||
| og_image = metadata.get("ogImage") | ||
| if og_image: | ||
| media_files.append(MediaFile(url=og_image, media_type="image")) | ||
|
|
||
| item_data["media_files"] = [m.to_dict() for m in media_files] | ||
|
|
||
| # Determine message type based on text length | ||
| item_data["message_type"] = ( | ||
| MessageType.LONG | ||
| if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT | ||
| else MessageType.SHORT | ||
| ) | ||
|
|
||
| self._data = item_data | ||
|
|
||
|
|
||
| class FirecrawlScraper(Scraper): | ||
| """ | ||
| FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. | ||
| """ | ||
|
|
||
| async def get_processor_by_url(self, url: str) -> DataProcessor: | ||
| return FirecrawlDataProcessor(url) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| from app.utils.logger import logger | ||
| from app.services.scrapers.bluesky.scraper import BlueskyScraper | ||
| from app.services.scrapers.weibo.scraper import WeiboScraper | ||
| from app.services.scrapers.firecrawl_client.scraper import FirecrawlScraper | ||
| from app.config import ( | ||
| BLUESKY_USERNAME, BLUESKY_PASSWORD | ||
| ) | ||
|
|
@@ -12,9 +13,12 @@ class ScraperManager: | |
|
|
||
| bluesky_scraper: Optional[BlueskyScraper] = None | ||
| weibo_scraper: Optional[WeiboScraper] = None | ||
| firecrawl_scraper: Optional[FirecrawlScraper] = None | ||
|
|
||
| scrapers = {"bluesky": bluesky_scraper, | ||
| "weibo": bluesky_scraper} | ||
| "weibo": weibo_scraper, | ||
| "other": firecrawl_scraper, | ||
| "unknown": firecrawl_scraper} | ||
|
Comment on lines
14
to
+21
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Class attributes never updated after scraper initialization - causes repeated re-initialization. The class attributes Additionally, when initializing for Proposed fix `@classmethod`
async def init_scraper(cls, category: str) -> None:
if category in cls.scrapers.keys():
scraper = None
if category == "bluesky" and not cls.bluesky_scraper:
scraper = await cls.init_bluesky_scraper()
+ cls.bluesky_scraper = scraper
elif category == "weibo" and not cls.weibo_scraper:
scraper = await cls.init_weibo_scraper()
+ cls.weibo_scraper = scraper
elif category in ["other", "unknown"] and not cls.firecrawl_scraper:
scraper = await cls.init_firecrawl_scraper()
+ cls.firecrawl_scraper = scraper
+ # Update both keys to use same instance
+ cls.scrapers["other"] = scraper
+ cls.scrapers["unknown"] = scraper
if scraper:
cls.scrapers[category] = scraper🧰 Tools🪛 Ruff (0.14.13)18-21: Mutable class attributes should be annotated with (RUF012) 🤖 Prompt for AI Agents |
||
|
|
||
| @classmethod | ||
| async def init_scrapers(cls): | ||
|
|
@@ -28,6 +32,8 @@ async def init_scraper(cls, category: str) -> None: | |
| scraper = await cls.init_bluesky_scraper() | ||
| elif category == "weibo" and not cls.weibo_scraper: | ||
| scraper = await cls.init_weibo_scraper() | ||
| elif category in ["other", "unknown"] and not cls.firecrawl_scraper: | ||
| scraper = await cls.init_firecrawl_scraper() | ||
| if scraper: | ||
| cls.scrapers[category] = scraper | ||
| else: | ||
|
|
@@ -44,3 +50,9 @@ async def init_bluesky_scraper(cls) -> BlueskyScraper: | |
| async def init_weibo_scraper(cls) -> WeiboScraper: | ||
| weibo_scraper = WeiboScraper() | ||
| return weibo_scraper | ||
|
|
||
| @classmethod | ||
| async def init_firecrawl_scraper(cls) -> FirecrawlScraper: | ||
| firecrawl_scraper = FirecrawlScraper() | ||
| return firecrawl_scraper | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Coerce `FIRECRAWL_TIMEOUT_SECONDS` to int for type safety. Environment values are strings; downstream expects an int.
🔧 Suggested fix
🤖 Prompt for AI Agents