Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,4 @@ conf/*
!conf/.gitkeep
.run/Template Python tests.run.xml
/.run/
.DS_Store
6 changes: 6 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,12 @@ def ban_list_resolver(ban_list_string: str) -> list:
# Open AI API environment variables
OPENAI_API_KEY = env.get("OPENAI_API_KEY", None)

# Firecrawl API environment variables
FIRECRAWL_ON = get_env_bool(env, "FIRECRAWL_ON", False)
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
# Values read from the environment are strings, while consumers annotate the
# timeout as an int. Coerce here and fall back to the default when the value
# is missing or not a valid integer.
try:
    FIRECRAWL_TIMEOUT_SECONDS = int(env.get("FIRECRAWL_TIMEOUT_SECONDS", 60))
except (TypeError, ValueError):
    FIRECRAWL_TIMEOUT_SECONDS = 60
Comment on lines +214 to +218
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Coerce FIRECRAWL_TIMEOUT_SECONDS to int for type safety.
Environment values are strings; downstream expects an int.

🔧 Suggested fix
-FIRECRAWL_TIMEOUT_SECONDS = env.get("FIRECRAWL_TIMEOUT_SECONDS", 60)
+FIRECRAWL_TIMEOUT_SECONDS = int(env.get("FIRECRAWL_TIMEOUT_SECONDS", 60)) or 60
🤖 Prompt for AI Agents
In `@app/config.py` around lines 214 - 218, FIRECRAWL_TIMEOUT_SECONDS is currently
assigned from env.get and may be a string; convert it to an int for type safety
by parsing the value (e.g., wrap the retrieved env value with int(...) or use an
existing helper like get_env_int) so downstream code receives an integer; update
the assignment of FIRECRAWL_TIMEOUT_SECONDS (the symbol in the diff) to
parse/coerce the env value to int and handle a missing/invalid value by falling
back to the default 60.


# Locale environment variables
localedir = os.path.join(os.path.dirname(__file__), "locale")
translation = gettext.translation("messages", localedir=localedir, fallback=True)
Expand Down
5 changes: 2 additions & 3 deletions app/services/scrapers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
inoreader
)
from app.services.file_export import video_download, document_export
from app.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads, \
bluesky
from app.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads
from app.services.scrapers.scraper_manager import ScraperManager
from app.database import save_instances
from app.utils.logger import logger
Expand Down Expand Up @@ -61,7 +60,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
self.kwargs["category"] = self.category
if not metadata_item:
try:
if self.category in ["bluesky", "weibo"]: # it is a workaround before the code refactor
if self.category in ["bluesky", "weibo", "other", "unknown"]: # it is a workaround before the code refactor
await ScraperManager.init_scraper(self.category)
item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url)
metadata_item = await item_data_processor.get_item()
Expand Down
37 changes: 37 additions & 0 deletions app/services/scrapers/firecrawl_client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from dataclasses import dataclass
from typing import Any

from app.models.metadata_item import MetadataItem


@dataclass
class FirecrawlItem(MetadataItem):
    """
    Metadata item for content scraped through Firecrawl.

    Adds two fields on top of MetadataItem: ``id`` and ``raw_content``.
    """

    id: str = ""
    raw_content: str = ""

    @staticmethod
    def from_dict(obj: Any) -> "FirecrawlItem":
        """Build a FirecrawlItem from a dict.

        The shared fields are parsed by ``MetadataItem.from_dict``; ``id`` and
        ``raw_content`` are read from ``obj`` directly and default to "".
        """
        base = MetadataItem.from_dict(obj)
        inherited_fields = (
            "url",
            "title",
            "author",
            "author_url",
            "telegraph_url",
            "text",
            "content",
            "media_files",
            "category",
            "message_type",
        )
        inherited = {name: getattr(base, name) for name in inherited_fields}
        return FirecrawlItem(
            **inherited,
            id=obj.get("id", ""),
            raw_content=obj.get("raw_content", ""),
        )

    def to_dict(self) -> dict:
        """Return the parent's dict representation plus ``id`` and ``raw_content``."""
        payload: dict = super().to_dict()
        payload.update(id=self.id, raw_content=self.raw_content)
        return payload
95 changes: 95 additions & 0 deletions app/services/scrapers/firecrawl_client/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from __future__ import annotations

import threading
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from firecrawl import Firecrawl

from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY, FIRECRAWL_TIMEOUT_SECONDS


@dataclass(frozen=True)
class FirecrawlSettings:
    """Immutable connection settings for the Firecrawl client."""

    api_url: str
    api_key: str
    # The timeout can also be enforced on the reverse-proxy side.
    timeout_seconds: int = 60


class FirecrawlClient:
"""
FirecrawlClient: 对 firecrawl python SDK 的封装 + 单例访问点。

- 提供 scrape / crawl 等常用方法,方便其他模块调用
- 线程安全单例(适合 Web 服务 / worker 多线程场景)
"""

_instance: Optional["FirecrawlClient"] = None
_lock = threading.Lock()

def __init__(self, config: FirecrawlSettings):
    """Keep the settings and build the underlying SDK client from them."""
    self._settings: FirecrawlSettings = config
    sdk_client: Firecrawl = self._create_app(config)
    self._app: Firecrawl = sdk_client

@staticmethod
def _create_app(config: FirecrawlSettings) -> Firecrawl:
    """Construct the Firecrawl SDK client from ``config``.

    Only ``api_url`` and ``api_key`` are forwarded to the SDK constructor;
    ``config.timeout_seconds`` is not used here.

    Any exception raised by the constructor (including ``TypeError``)
    propagates to the caller. Retrying the very same call in an
    ``except TypeError`` branch could never succeed, so no such fallback
    is attempted.
    """
    return Firecrawl(api_url=config.api_url, api_key=config.api_key)

@classmethod
def get_instance(cls) -> "FirecrawlClient":
"""
线程安全的单例获取。
- 首次调用可传 settings
- 之后重复调用可不传
"""
if cls._instance is not None:
return cls._instance

with cls._lock:
if cls._instance is not None:
return cls._instance

config = FirecrawlSettings(
api_url=FIRECRAWL_API_URL,
api_key=FIRECRAWL_API_KEY,
timeout_seconds=FIRECRAWL_TIMEOUT_SECONDS,
)

cls._instance = cls(config)
return cls._instance

@classmethod
def reset_instance(cls) -> None:
"""测试用:重置单例。"""
with cls._lock:
cls._instance = None

def scrape_url(
self,
url: str,
formats: Optional[List[str]] = None,
only_main_content: bool = True,
timeout_seconds: Optional[int] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
单页抓取(最常用)
"""
params: Dict[str, Any] = {
"formats": formats or ["markdown"],
"onlyMainContent": only_main_content,
}
if extra_params:
params.update(extra_params)

# if timeout_seconds is None:
# timeout_seconds = self._settings.timeout_seconds

try:
return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump(
exclude_none=True)
Comment on lines +70 to +93
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -type f -name "client.py" | grep firecrawl

Repository: aturret/FastFetchBot

Length of output: 114


🏁 Script executed:

cat -n app/services/scrapers/firecrawl_client/client.py

Repository: aturret/FastFetchBot

Length of output: 3450


🏁 Script executed:

python3 << 'EOF'
import subprocess
result = subprocess.run(['pip', 'show', 'firecrawl'], capture_output=True, text=True)
print(result.stdout)
EOF

Repository: aturret/FastFetchBot

Length of output: 150


🏁 Script executed:

# Search for how firecrawl scrape is used in the codebase
rg "scrape_url" --type py -A 3 -B 1

Repository: aturret/FastFetchBot

Length of output: 1059


🏁 Script executed:

# Check if there are other usages or tests of the firecrawl SDK
rg "_app\.scrape|Firecrawl" --type py

Repository: aturret/FastFetchBot

Length of output: 3710


🌐 Web query:

firecrawl-py SDK scrape method parameters API documentation

💡 Result:

Summary of firecrawl-py /scrape method parameters (concise):

  • url / url string — the target page to scrape. (docs.firecrawl.dev)
  • formats — array of output formats (e.g., "markdown", "html", "rawHtml", "summary", "json", "links", "images", "screenshot", "branding"). (docs.firecrawl.dev)
  • maxAge — cache freshness in ms (default ~172800000 ms = 2 days); set 0 to force fresh. (docs.firecrawl.dev)
  • storeInCache — boolean to prevent storing results in cache. (docs.firecrawl.dev)
  • location — object with country (ISO 3166-1 alpha-2) and languages to control proxy/language emulation. (docs.firecrawl.dev)
  • pageOptions — object for page parsing behavior (examples include onlyMainContent, parsePDF, etc.). (docs.firecrawl.dev)
  • parsers — array to force specific parsers (e.g., ["pdf"]). (docs.firecrawl.dev)
  • screenshot (when formats includes "screenshot") — options like fullPage, quality, viewport. (docs.firecrawl.dev)
  • JSON extraction / LLM schema — support for structured extraction using a JSON format or pydantic schema. (docs.firecrawl.dev)
  • Additional SDK params — SDK methods accept named params/objects (scrape(), scrape_url(), scrape_options / ScrapeOptions) and support sync/async clients. (docs.firecrawl.dev)

Sources:
[1] Firecrawl Scrape docs (Scrape endpoint, formats, cache, location, screenshots). (docs.firecrawl.dev)
[2] Firecrawl Advanced Scraping Guide (pageOptions, parsers, scrape options). (docs.firecrawl.dev)
[3] firecrawl-py SDK / PyPI / GitHub pages (SDK method names, ScrapeOptions, examples, async). (github.com)

If you want, I can paste exact parameter names and example Python signatures from the SDK docs.

Citations:


🌐 Web query:

firecrawl-py GitHub repository scrape method signature source code

💡 Result:

The Firecrawl Python SDK scrape signature (synchronous) — from the SDK docs/source — is:

def scrape(
url: str,
*,
formats: Optional[List[str]] = None,
headers: Optional[Dict[str, str]] = None,
include_tags: Optional[List[str]] = None,
exclude_tags: Optional[List[str]] = None,
wait_for: Optional[int] = None,
screenshot: Optional[bool] = None,
full_page_screenshot: Optional[bool] = None,
mobile: Optional[bool] = None,
**kwargs
) -> Document

(Example usage and parameter details in the Firecrawl docs and the package docs mirror.) [1][2]

Sources:

  • Firecrawl docs — Scrape endpoint / examples. [1]
  • firecrawl-py package docs (scraping API / signature listing). [2]

🏁 Script executed:

# Check if there's a requirements.txt or setup.py that shows firecrawl version
find . -type f \( -name "requirements*.txt" -o -name "setup.py" -o -name "pyproject.toml" \) -exec cat {} \;

Repository: aturret/FastFetchBot

Length of output: 1668


🏁 Script executed:

python3 << 'EOF'
import subprocess
import json

# Try to get info about firecrawl package
result = subprocess.run(['pip', 'show', 'firecrawl-py', '-f'], capture_output=True, text=True)
print("Package info:")
print(result.stdout)
print("\nStderr:")
print(result.stderr)
EOF

Repository: aturret/FastFetchBot

Length of output: 132


🌐 Web query:

firecrawl-py scrape method pageOptions onlyMainContent parameter example

💡 Result:

Example — Python (firecrawl-py) using pageOptions.onlyMainContent:

from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR_API_KEY")
resp = app.scrape_url(
"https://example.com",
params={
"pageOptions": {
"onlyMainContent": True,
"includeHtml": False
}
}
)
print(resp) # resp will contain the main page content without headers/navs/footers

References: Firecrawl docs — Page Options and Scrape examples. [1][2]


🏁 Script executed:

# Check if scraper.py shows how scrape_url is being called
sed -n '1,50p' app/services/scrapers/firecrawl_client/scraper.py

Repository: aturret/FastFetchBot

Length of output: 2048


🌐 Web query:

firecrawl-py SDK scrape method timeout parameter how to pass

💡 Result:

Pass the timeout as the timeout keyword when calling scrape; it expects milliseconds. Example:

Python
from firecrawl import Firecrawl
fc = Firecrawl(api_key="fc-YOUR-API-KEY")
doc = fc.scrape("https://example.com", formats=["markdown"], timeout=15000) # 15,000 ms = 15s

Docs: scrape endpoint timeout is in milliseconds and the Python SDK forwards that parameter. [1][2]

Sources
[1] Firecrawl Python SDK usage (scrape examples).
[2] Firecrawl Scrape endpoint — timeout (integer, milliseconds).


🌐 Web query:

site:github.com firecrawl-py scrape method implementation kwargs

💡 Result:

The firecrawl Python SDK's scrape method accepts options via keyword args (examples: formats, scrape_options, poll_interval). See the SDK docs and examples in the repository README. [1][2]

Sources:

  • firecrawl (main repo) README — example usage of firecrawl.scrape(url, formats=[...]). [1]
  • firecrawl-py (old Python SDK) README — scrape_url / scrape_url params examples. [2]

Wire unused params dict and timeout_seconds into the SDK call.

The method builds a params dict that includes extra_params, but then ignores it entirely when calling scrape(). Additionally, the timeout_seconds parameter is accepted but never applied—the code to use it is commented out. This causes callers' configuration to be silently dropped.

The timeout_seconds parameter is in seconds but the firecrawl SDK expects milliseconds. Consider either converting on input or passing the unconverted value if the SDK handles both.

🔧 Suggested direction (adjust based on SDK version's params handling)
         params: Dict[str, Any] = {
             "formats": formats or ["markdown"],
             "onlyMainContent": only_main_content,
         }
+        if timeout_seconds is None:
+            timeout_seconds = self._settings.timeout_seconds
+        if timeout_seconds is not None:
+            params["timeout"] = timeout_seconds * 1000  # Convert seconds to milliseconds
         if extra_params:
             params.update(extra_params)

         try:
-            return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump(
-                exclude_none=True)
+            return self._app.scrape(url, **params).model_dump(exclude_none=True)
🧰 Tools
🪛 Ruff (0.14.13)

75-75: Unused method argument: timeout_seconds

(ARG002)


79-79: Docstring contains ambiguous `（` (FULLWIDTH LEFT PARENTHESIS). Did you mean `(` (LEFT PARENTHESIS)?

(RUF002)


79-79: Docstring contains ambiguous `）` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)?

(RUF002)

🤖 Prompt for AI Agents
In `@app/services/scrapers/firecrawl_client/client.py` around lines 70 - 93,
scrape_url builds a params dict (including extra_params) and accepts
timeout_seconds but never uses either; update the call to self._app.scrape to
pass the assembled params (e.g., params=params) and wire the timeout_seconds
through — converting seconds to milliseconds if the SDK expects ms (timeout_ms =
int(timeout_seconds * 1000)) or passing raw seconds when appropriate; modify the
scrape invocation in scrape_url to use these values instead of the current
arguments (refer to scrape_url, params, timeout_seconds, and self._app.scrape)
so callers' options are honored.

except Exception as e:
raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e
168 changes: 168 additions & 0 deletions app/services/scrapers/firecrawl_client/scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import hashlib
from urllib.parse import urlparse

from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam

from app.config import OPENAI_API_KEY
from app.models.metadata_item import MediaFile, MessageType
from app.services.scrapers.scraper import Scraper, DataProcessor
from app.services.scrapers.firecrawl_client import FirecrawlItem
from app.services.scrapers.firecrawl_client.client import FirecrawlClient
from app.utils.parse import get_html_text_length, wrap_text_into_html
from app.utils.logger import logger

# Threshold compared against get_html_text_length(content): content above it
# is classified as MessageType.LONG, otherwise MessageType.SHORT.
FIRECRAWL_TEXT_LIMIT = 800

# System prompt for LLM to extract article content.
# Sent as the "system" message of the chat completion request.
ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML.

Instructions:
1. Identify and extract ONLY the main article/post content
2. Remove navigation, headers, footers, sidebars, ads, comments, and other non-article elements
3. Preserve the article's structure (headings, paragraphs, lists, etc.)
4. Keep important formatting like bold, italic, links, and images
5. Return clean HTML containing only the article content
6. If you cannot identify the main content, return the original HTML unchanged

Return ONLY the extracted HTML content, no explanations or markdown."""


class FirecrawlDataProcessor(DataProcessor):
"""
FirecrawlDataProcessor: Process URLs using Firecrawl to extract content.
"""

def __init__(self, url: str):
    """Prepare per-URL state and grab the shared Firecrawl client.

    ``id`` is the first 16 hex characters of the MD5 digest of the URL.
    """
    self.url: str = url
    self._data: dict = {}
    self.url_parser = urlparse(url)
    url_digest = hashlib.md5(url.encode()).hexdigest()
    self.id = url_digest[:16]
    self._client: FirecrawlClient = FirecrawlClient.get_instance()

async def get_item(self) -> dict:
    """Process the URL, then return the collected data as a FirecrawlItem dict."""
    await self.process_data()
    return FirecrawlItem.from_dict(self._data).to_dict()

async def process_data(self) -> None:
    """Fetch and process the page; the only step is ``_get_page_content``."""
    await self._get_page_content()

async def _get_page_content(self) -> None:
    """Scrape ``self.url`` through the Firecrawl client and process the result.

    ``FirecrawlClient.scrape_url`` is a synchronous, network-bound call, so
    it is run in the event loop's default executor rather than directly in
    this coroutine; calling it inline would stall every other task on the
    loop for the duration of the request.

    Raises:
        Exception: any error from scraping or result processing is logged
            and re-raised unchanged.
    """
    import asyncio
    import functools

    scrape_call = functools.partial(
        self._client.scrape_url,
        url=self.url,
        formats=["markdown", "html"],
        only_main_content=True,
    )
    try:
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, scrape_call)
        await self._process_firecrawl_result(result)
    except Exception as e:
        logger.error(f"Failed to scrape URL with Firecrawl: {e}")
        raise
Comment on lines +51 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, let's find where FirecrawlClient is defined or imported
find . -type f -name "*.py" | xargs grep -l "class FirecrawlClient" 2>/dev/null || echo "No local FirecrawlClient class found"

# Search for scrape_url method definition in the repo
rg "def scrape_url|async def scrape_url" --type py -B 2 -A 10

# Also check the imports in the scraper.py file to understand where _client comes from
cat -n app/services/scrapers/firecrawl_client/scraper.py | head -50

Repository: aturret/FastFetchBot

Length of output: 3450


🏁 Script executed:

#!/bin/bash
# Read the scrape_url method from client.py
cat -n app/services/scrapers/firecrawl_client/client.py | grep -A 30 "def scrape_url"

Repository: aturret/FastFetchBot

Length of output: 1153


🏁 Script executed:

#!/bin/bash
# Read the FirecrawlClient class to see how _app is initialized
cat -n app/services/scrapers/firecrawl_client/client.py | head -70

Repository: aturret/FastFetchBot

Length of output: 2387


🌐 Web query:

Firecrawl Python SDK async support AsyncFirecrawl

💡 Result:

Yes — the Firecrawl Python SDK includes async support via the AsyncFirecrawl class. Its methods mirror Firecrawl (awaitable versions of scrape, search, start_crawl, batch_scrape, watcher, etc.). Example usage and docs are on the official SDK pages. [1][2]

Sources:

  • Firecrawl Python SDK docs (AsyncFirecrawl examples and description). [1]
  • Firecrawl docs (Python SDK overview, async class usage and watcher example). [2]

Use AsyncFirecrawl instead of blocking the event loop with a synchronous call.

self._client.scrape_url() is a synchronous, blocking call that performs network I/O to Firecrawl. Calling it without await in this async method blocks the event loop, preventing other async operations from executing concurrently and defeating the purpose of async. The Firecrawl Python SDK provides an AsyncFirecrawl class with async methods (e.g., scrape()); use it to avoid blocking during the scraping request.

🤖 Prompt for AI Agents
In `@app/services/scrapers/firecrawl_client/scraper.py` around lines 51 - 61, The
_get_page_content coroutine is calling the synchronous self._client.scrape_url
which blocks the event loop; switch to Firecrawl's async API by replacing the
blocking call with the AsyncFirecrawl async client and its async scrape method
(e.g., create or ensure self._client is an AsyncFirecrawl instance and call
await self._client.scrape(...) with the same parameters), then await the
existing _process_firecrawl_result(result) call; update error handling to catch
exceptions from the awaited async call and rethrow as before.


@staticmethod
async def parsing_article_body_by_llm(html_content: str) -> str:
    """
    Ask the LLM to reduce a page's HTML to its main article content.

    Args:
        html_content: Raw HTML content from Firecrawl

    Returns:
        The stripped LLM response when one is produced. The input is
        returned unchanged when it is empty, when OPENAI_API_KEY is not
        configured, when the LLM response is empty, or when any error
        occurs during the request.
    """
    if not html_content:
        return html_content

    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY not configured, skipping LLM parsing")
        return html_content

    try:
        llm = AsyncOpenAI(api_key=OPENAI_API_KEY)

        # Cap the prompt size to avoid token limits; slicing leaves shorter
        # input untouched.
        max_content_length = 50000
        prompt_html = html_content[:max_content_length]

        system_message = ChatCompletionSystemMessageParam(
            role="system",
            content=ARTICLE_EXTRACTION_PROMPT,
        )
        user_message = ChatCompletionUserMessageParam(
            role="user",
            content=f"Extract the main article content from this HTML:\n\n{prompt_html}",
        )

        completion = await llm.chat.completions.create(
            model="gpt-4o-mini",
            messages=[system_message, user_message],
            temperature=0.1,
            max_tokens=16000,
        )

        extracted = completion.choices[0].message.content
        if not extracted:
            logger.warning("LLM returned empty content, using original HTML")
            return html_content

        logger.info("Successfully extracted article content using LLM")
        return extracted.strip()

    except Exception as e:
        # Best effort: a failed LLM call must not fail the whole scrape.
        logger.error(f"Failed to parse article body with LLM: {e}")
        return html_content

async def _process_firecrawl_result(self, result: dict) -> None:
    """Turn a Firecrawl scrape result into the item dict stored in ``self._data``.

    Reads the "metadata", "markdown" and "html" entries of ``result``, runs
    the HTML through ``parsing_article_body_by_llm``, and assembles the
    fields that ``get_item`` later feeds to ``FirecrawlItem.from_dict``.
    """
    meta = result.get("metadata", {})
    markdown_body = result.get("markdown", "")
    raw_html = result.get("html", "")

    # NOTE(review): the metadata keys below are camelCase ("ogTitle",
    # "ogImage", ...) - confirm this matches what the SDK's model_dump() emits.
    # Each field falls back through progressively less specific sources.
    title = meta.get("title", "") or meta.get("ogTitle", "") or self.url
    author = meta.get("author", "") or meta.get("ogSiteName", "") or self.url_parser.netloc
    description = meta.get("description", "") or meta.get("ogDescription", "")

    # Short text: the description, or the first 500 characters of markdown.
    summary_text = description or markdown_body[:500]

    # Full content: LLM-processed HTML when there is any, else the markdown.
    article_html = await self.parsing_article_body_by_llm(raw_html)
    if article_html:
        content = wrap_text_into_html(article_html, is_html=True)
    else:
        content = wrap_text_into_html(markdown_body, is_html=False)

    # The only media extracted is the og:image, when present.
    og_image = meta.get("ogImage")
    media_files = [MediaFile(url=og_image, media_type="image")] if og_image else []

    # Long vs. short message is decided by the text length of the content.
    if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT:
        message_type = MessageType.LONG
    else:
        message_type = MessageType.SHORT

    self._data = {
        "id": self.id,
        "category": "other",
        "url": self.url,
        "title": title,
        "author": author,
        "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
        "text": summary_text,
        "content": content,
        "raw_content": markdown_body,
        "media_files": [m.to_dict() for m in media_files],
        "message_type": message_type,
    }


class FirecrawlScraper(Scraper):
    """
    Scraper for generic URLs that hands each URL to a FirecrawlDataProcessor.
    """

    async def get_processor_by_url(self, url: str) -> DataProcessor:
        """Return a new FirecrawlDataProcessor for ``url``."""
        processor = FirecrawlDataProcessor(url)
        return processor
14 changes: 13 additions & 1 deletion app/services/scrapers/scraper_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from app.utils.logger import logger
from app.services.scrapers.bluesky.scraper import BlueskyScraper
from app.services.scrapers.weibo.scraper import WeiboScraper
from app.services.scrapers.firecrawl_client.scraper import FirecrawlScraper
from app.config import (
BLUESKY_USERNAME, BLUESKY_PASSWORD
)
Expand All @@ -12,9 +13,12 @@ class ScraperManager:

bluesky_scraper: Optional[BlueskyScraper] = None
weibo_scraper: Optional[WeiboScraper] = None
firecrawl_scraper: Optional[FirecrawlScraper] = None

scrapers = {"bluesky": bluesky_scraper,
"weibo": bluesky_scraper}
"weibo": weibo_scraper,
"other": firecrawl_scraper,
"unknown": firecrawl_scraper}
Comment on lines 14 to +21
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Class attributes never updated after scraper initialization - causes repeated re-initialization.

The class attributes bluesky_scraper, weibo_scraper, and firecrawl_scraper are used as guards (e.g., not cls.firecrawl_scraper) but are never assigned after initialization. This means every call to init_scraper() will re-initialize the scraper.

Additionally, when initializing for "other" category, cls.scrapers["other"] is updated but cls.scrapers["unknown"] still points to None, causing separate initializations.

Proposed fix
     @classmethod
     async def init_scraper(cls, category: str) -> None:
         if category in cls.scrapers.keys():
             scraper = None
             if category == "bluesky" and not cls.bluesky_scraper:
                 scraper = await cls.init_bluesky_scraper()
+                cls.bluesky_scraper = scraper
             elif category == "weibo" and not cls.weibo_scraper:
                 scraper = await cls.init_weibo_scraper()
+                cls.weibo_scraper = scraper
             elif category in ["other", "unknown"] and not cls.firecrawl_scraper:
                 scraper = await cls.init_firecrawl_scraper()
+                cls.firecrawl_scraper = scraper
+                # Update both keys to use same instance
+                cls.scrapers["other"] = scraper
+                cls.scrapers["unknown"] = scraper
             if scraper:
                 cls.scrapers[category] = scraper
🧰 Tools
🪛 Ruff (0.14.13)

18-21: Mutable class attributes should be annotated with typing.ClassVar

(RUF012)

🤖 Prompt for AI Agents
In `@app/services/scrapers/scraper_manager.py` around lines 14 - 21, The
class-level scraper attributes (bluesky_scraper, weibo_scraper,
firecrawl_scraper) are declared but never set after creating instances, causing
repeated re-initialization in init_scraper(); update init_scraper() so that when
you create a scraper instance you assign it back to the corresponding class
attribute (e.g., cls.firecrawl_scraper = instance, cls.bluesky_scraper =
instance, cls.weibo_scraper = instance) and ensure the scrapers mapping
(cls.scrapers) points to that same instance for all relevant keys (update both
"other" and "unknown" to reference cls.firecrawl_scraper or rebuild cls.scrapers
from the class attrs after initialization) so subsequent calls use the cached
instances.


@classmethod
async def init_scrapers(cls):
Expand All @@ -28,6 +32,8 @@ async def init_scraper(cls, category: str) -> None:
scraper = await cls.init_bluesky_scraper()
elif category == "weibo" and not cls.weibo_scraper:
scraper = await cls.init_weibo_scraper()
elif category in ["other", "unknown"] and not cls.firecrawl_scraper:
scraper = await cls.init_firecrawl_scraper()
if scraper:
cls.scrapers[category] = scraper
else:
Expand All @@ -44,3 +50,9 @@ async def init_bluesky_scraper(cls) -> BlueskyScraper:
async def init_weibo_scraper(cls) -> WeiboScraper:
weibo_scraper = WeiboScraper()
return weibo_scraper

@classmethod
async def init_firecrawl_scraper(cls) -> FirecrawlScraper:
    """Create and return a new FirecrawlScraper."""
    return FirecrawlScraper()

Loading