Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,4 @@ conf/*
.run/Template Python tests.run.xml
/.run/
.DS_Store
/.claude/
17 changes: 12 additions & 5 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,23 @@ def ban_list_resolver(ban_list_string: str) -> list:
INOREADER_EMAIL = env.get("INOREADER_EMAIL", None)
INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None)

# Open AI API environment variables
# Open AI API
OPENAI_API_KEY = env.get("OPENAI_API_KEY", None)

# Firecrawl API environment variables
FIRECRAWL_ON = get_env_bool(env, "FIRECRAWL_ON", False)
# General webpage scraping
GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False)
GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL")

# Firecrawl API
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
FIRECRAWL_TIMEOUT_SECONDS = env.get("FIRECRAWL_TIMEOUT_SECONDS", 60)
FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering


# Zyte API
ZYTE_API_KEY = env.get("ZYTE_API_KEY", None)

# Locale environment variables
# Locale directories environment variables
localedir = os.path.join(os.path.dirname(__file__), "locale")
translation = gettext.translation("messages", localedir=localedir, fallback=True)
_ = translation.gettext
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@


@dataclass
class FirecrawlItem(MetadataItem):
class GeneralItem(MetadataItem):
"""
FirecrawlItem: Data class for scraped content from Firecrawl.
GeneralItem: Data class for scraped content from general webpage scrapers.
"""
id: str = ""
raw_content: str = ""
scraper_type: str = "" # Which scraper was used (e.g., "firecrawl", "zyte", etc.)

@staticmethod
def from_dict(obj: Any) -> "FirecrawlItem":
def from_dict(obj: Any) -> "GeneralItem":
metadata_item = MetadataItem.from_dict(obj)
return FirecrawlItem(
return GeneralItem(
url=metadata_item.url,
title=metadata_item.title,
author=metadata_item.author,
Expand All @@ -28,10 +29,12 @@ def from_dict(obj: Any) -> "FirecrawlItem":
message_type=metadata_item.message_type,
id=obj.get("id", ""),
raw_content=obj.get("raw_content", ""),
scraper_type=obj.get("scraper_type", ""),
)

def to_dict(self) -> dict:
result: dict = super().to_dict()
result["id"] = self.id
result["raw_content"] = self.raw_content
result["scraper_type"] = self.scraper_type
return result
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
import hashlib
from abc import abstractmethod
from typing import Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup, Doctype
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam

from app.config import OPENAI_API_KEY
from app.models.metadata_item import MediaFile, MessageType
from app.services.scrapers.scraper import Scraper, DataProcessor
from app.services.scrapers.firecrawl_client import FirecrawlItem
from app.services.scrapers.firecrawl_client.client import FirecrawlClient
from app.services.scrapers.general import GeneralItem
from app.utils.parse import get_html_text_length, wrap_text_into_html
from app.utils.logger import logger

FIRECRAWL_TEXT_LIMIT = 800
GENERAL_TEXT_LIMIT = 800

DEFAULT_OPENAI_MODEL = "gpt-5-nano"

# System prompt for LLM to extract article content
ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML.
Expand All @@ -24,50 +28,134 @@
4. Keep important formatting like bold, italic, links, and images
5. Return clean HTML containing only the article content
6. If you cannot identify the main content, return the original HTML unchanged
7. remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body>
7. After all of the above, remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body>

Return ONLY the extracted HTML content, no explanations or markdown."""


class FirecrawlDataProcessor(DataProcessor):
class BaseGeneralDataProcessor(DataProcessor):
"""
FirecrawlDataProcessor: Process URLs using Firecrawl to extract content.
Base class for general webpage data processors.
Each specific scraper (Firecrawl, Zyte, etc.) should inherit from this class.
"""

def __init__(self, url: str):
self.url: str = url
self._data: dict = {}
self.url_parser = urlparse(url)
self.id = hashlib.md5(url.encode()).hexdigest()[:16]
self._client: FirecrawlClient = FirecrawlClient.get_instance()
self.scraper_type: str = "base"

async def get_item(self) -> dict:
await self.process_data()
firecrawl_item = FirecrawlItem.from_dict(self._data)
return firecrawl_item.to_dict()
general_item = GeneralItem.from_dict(self._data)
return general_item.to_dict()

async def process_data(self) -> None:
await self._get_page_content()

@abstractmethod
async def _get_page_content(self) -> None:
try:
result = self._client.scrape_url(
url=self.url,
formats=["markdown", "html"],
only_main_content=True,
)
await self._process_firecrawl_result(result)
except Exception as e:
logger.error(f"Failed to scrape URL with Firecrawl: {e}")
raise
"""Subclasses must implement this method to fetch page content."""
pass

async def _build_item_data(
    self,
    title: str,
    author: str,
    description: str,
    markdown_content: str,
    html_content: str,
    og_image: Optional[str] = None,
) -> None:
    """
    Common method to build item data from scraped content.

    Populates ``self._data`` (later consumed by ``get_item``) with the
    metadata, short text, full HTML content, media files and message type.

    Args:
        title: Page title; falls back to the URL when empty.
        author: Author or site name; falls back to the URL's host when empty.
        description: Short summary; falls back to a markdown prefix when empty.
        markdown_content: Raw markdown payload; stored as ``raw_content``.
        html_content: Raw HTML payload; cleaned by the LLM then sanitized.
        og_image: Optional Open Graph image URL, attached as an image media file.
    """
    item_data = {
        "id": self.id,
        "category": "other",
        "url": self.url,
        "title": title or self.url,
        "author": author or self.url_parser.netloc,
        "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
        "scraper_type": self.scraper_type,
    }

    # Process text content - use description or first part of markdown
    # Strip any HTML tags to ensure plain text for Telegram short messages
    text = description if description else (markdown_content or "")[:500]
    text = BeautifulSoup(text, "html.parser").get_text()
    item_data["text"] = text

    # Process HTML content with LLM if available, then sanitize deterministically
    if html_content:
        cleaned_html = await self.parsing_article_body_by_llm(html_content)
        cleaned_html = self.sanitize_html(cleaned_html)
        content = wrap_text_into_html(cleaned_html, is_html=True)
    else:
        # No HTML from the scraper — fall back to wrapping the markdown.
        content = wrap_text_into_html(markdown_content or "", is_html=False)
    item_data["content"] = content
    item_data["raw_content"] = markdown_content

    # Process media files - extract og:image if available
    media_files = []
    if og_image:
        media_files.append(MediaFile(url=og_image, media_type="image"))

    item_data["media_files"] = [m.to_dict() for m in media_files]

    # Determine the message type based on content length (not text length)
    item_data["message_type"] = (
        MessageType.LONG
        if get_html_text_length(content) > GENERAL_TEXT_LIMIT
        else MessageType.SHORT
    )

    self._data = item_data

@staticmethod
def sanitize_html(html_content: str) -> str:
    """
    Deterministic HTML sanitizer that removes all non-content tags.

    This runs AFTER the LLM extraction as a safety net — the LLM is unreliable,
    and when it fails (or when OPENAI_API_KEY is not set), raw Firecrawl HTML
    (including <!DOCTYPE>, <script>, etc.) passes through unchanged.

    Keeps content-meaningful tags: p, h1-h6, a, b/strong, i/em, u, ul, ol, li,
    blockquote, pre, code, img, br, table, tr, td, th, thead, tbody.

    Args:
        html_content: HTML string to sanitize; may be empty.

    Returns:
        The sanitized HTML, stripped of leading/trailing whitespace, or the
        input unchanged when it is falsy.
    """
    if not html_content:
        return html_content

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove DOCTYPE declarations. Iterate over a snapshot of the children:
    # .extract() mutates soup.contents, and removing nodes from the live
    # list while iterating it would skip adjacent Doctype nodes.
    for item in list(soup.contents):
        if isinstance(item, Doctype):
            item.extract()

    # Remove tags that should be destroyed with all their content
    for tag_name in ["script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Unwrap structural/layout tags — keep their text content, discard the tag itself
    for tag_name in ["html", "body", "div", "span", "section", "article", "nav",
                     "header", "footer", "main", "aside", "figure", "figcaption",
                     "details", "summary", "dd", "dt", "dl"]:
        for tag in soup.find_all(tag_name):
            tag.unwrap()

    return str(soup).strip()

@staticmethod
async def parsing_article_body_by_llm(html_content: str) -> str:
"""
Use LLM to extract the main article content from HTML.

Args:
html_content: Raw HTML content from Firecrawl
html_content: Raw HTML content from a scraper

Returns:
Cleaned HTML containing only the main article content
Expand All @@ -87,13 +175,13 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content

response = await client.chat.completions.create(
model="gpt-4o-mini",
model=DEFAULT_OPENAI_MODEL,
messages=[
ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT),
ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}")
],
temperature=0.1,
max_tokens=16000,
max_completion_tokens=10000,
)

extracted_content = response.choices[0].message.content
Expand All @@ -109,61 +197,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
logger.error(f"Failed to parse article body with LLM: {e}")
return html_content

async def _process_firecrawl_result(self, result: dict) -> None:
    """
    Convert a raw Firecrawl response dict into ``self._data``.

    Args:
        result: Firecrawl payload; reads the "metadata", "markdown" and
            "html" keys (all optional, defaulting to empty values).
    """
    metadata = result.get("metadata", {})
    markdown_content = result.get("markdown", "")
    html_content = result.get("html", "")

    # Extract metadata fields, falling back to Open Graph values, then the URL.
    title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url
    author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc
    # description = metadata.get("description", "") or metadata.get("ogDescription", "")

    item_data = {
        "id": self.id,
        "category": "other",
        "url": self.url,
        "title": title,
        "author": author,
        "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
    }

    # Short-message text: first FIRECRAWL_TEXT_LIMIT characters of the HTML.
    # NOTE(review): this slices raw HTML, not plain text — the short message
    # may contain markup fragments; confirm whether tags should be stripped.
    text = html_content[:FIRECRAWL_TEXT_LIMIT]
    item_data["text"] = text

    html_content = await self.parsing_article_body_by_llm(html_content)

    # Process HTML content; fall back to wrapping markdown when no HTML.
    if html_content:
        content = wrap_text_into_html(html_content, is_html=True)
    else:
        content = wrap_text_into_html(markdown_content, is_html=False)
    item_data["content"] = content
    item_data["raw_content"] = markdown_content

    # Process media files - extract og:image if available
    media_files = []
    og_image = metadata.get("ogImage")
    if og_image:
        media_files.append(MediaFile(url=og_image, media_type="image"))

    item_data["media_files"] = [m.to_dict() for m in media_files]

    # Determine message type based on rendered content length
    item_data["message_type"] = (
        MessageType.LONG
        if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT
        else MessageType.SHORT
    )

    self._data = item_data


class FirecrawlScraper(Scraper):
class BaseGeneralScraper(Scraper):
"""
FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping.
Base class for general webpage scrapers.
"""

@abstractmethod
async def get_processor_by_url(self, url: str) -> DataProcessor:
return FirecrawlDataProcessor(url)
pass
65 changes: 65 additions & 0 deletions app/services/scrapers/general/firecrawl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from app.config import FIRECRAWL_WAIT_FOR
from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper
from app.services.scrapers.general.firecrawl_client import FirecrawlClient
from app.services.scrapers.scraper import DataProcessor
from app.utils.logger import logger

# HTML tags to exclude from Firecrawl output at the source
FIRECRAWL_EXCLUDE_TAGS = [
"nav", "footer", "aside", "script", "style",
"noscript", "iframe", "svg", "form",
]


class FirecrawlDataProcessor(BaseGeneralDataProcessor):
"""
FirecrawlDataProcessor: Process URLs using Firecrawl to extract content.
"""

def __init__(self, url: str):
    """Initialize a Firecrawl-backed processor for *url*.

    Tags produced items with ``scraper_type="firecrawl"`` and grabs the
    shared Firecrawl client singleton instead of creating one per URL.
    """
    super().__init__(url)
    self.scraper_type = "firecrawl"
    # Reuse the process-wide client instance.
    self._client: FirecrawlClient = FirecrawlClient.get_instance()

async def _get_page_content(self) -> None:
    """Fetch the page via Firecrawl and populate ``self._data``.

    Raises:
        Exception: re-raises any failure from the Firecrawl client after
            logging it, so callers decide how to degrade.
    """
    try:
        # NOTE(review): scrape_url is awaited here; confirm FirecrawlClient
        # actually exposes an async API — if it wraps a sync SDK call, it
        # should be run via asyncio.to_thread to avoid blocking the loop.
        result = await self._client.scrape_url(
            url=self.url,
            formats=["markdown", "html"],  # markdown for raw_content, html for LLM cleanup
            only_main_content=True,
            exclude_tags=FIRECRAWL_EXCLUDE_TAGS,  # strip nav/footer/etc. at the source
            wait_for=FIRECRAWL_WAIT_FOR,  # give client-side JS time to render
        )
        await self._process_firecrawl_result(result)
    except Exception as e:
        logger.error(f"Failed to scrape URL with Firecrawl: {e}")
        raise
Comment on lines +24 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Synchronous scrape_url blocks the async event loop.

FirecrawlClient.scrape_url() is a synchronous method (it calls self._app.scrape(...) from the Firecrawl SDK). Calling it directly inside this async def _get_page_content will block the event loop for the duration of the HTTP request to Firecrawl.

Wrap the blocking call with asyncio.to_thread (or loop.run_in_executor):

Proposed fix
+import asyncio
+from functools import partial
+
 ...
 
     async def _get_page_content(self) -> None:
         try:
-            result = self._client.scrape_url(
-                url=self.url,
-                formats=["markdown", "html"],
-                only_main_content=True,
-                exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
-                wait_for=FIRECRAWL_WAIT_FOR,
-            )
+            result = await asyncio.to_thread(
+                partial(
+                    self._client.scrape_url,
+                    url=self.url,
+                    formats=["markdown", "html"],
+                    only_main_content=True,
+                    exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
+                    wait_for=FIRECRAWL_WAIT_FOR,
+                )
+            )
             await self._process_firecrawl_result(result)
         except Exception as e:
             logger.error(f"Failed to scrape URL with Firecrawl: {e}")
             raise
🤖 Prompt for AI Agents
In `@app/services/scrapers/general/firecrawl.py` around lines 24 - 36, The call to
the synchronous Firecrawl client in _get_page_content is blocking the event
loop; change the call to self._client.scrape_url so it runs on a background
thread (e.g., use asyncio.to_thread or loop.run_in_executor) and await that
result before passing it to _process_firecrawl_result; also add the necessary
import for asyncio at the top and keep the existing try/except and logger.error
behavior unchanged.


async def _process_firecrawl_result(self, result: dict) -> None:
    """Translate a raw Firecrawl response into item data.

    Pulls metadata (with Open Graph fallbacks) plus the markdown/html
    payloads out of *result* and hands them to the shared item builder.
    """
    meta = result.get("metadata", {})

    def _first(*keys: str) -> str:
        # Return the first truthy metadata value among *keys*, else "".
        for key in keys:
            value = meta.get(key, "")
            if value:
                return value
        return ""

    await self._build_item_data(
        title=_first("title", "ogTitle"),
        author=_first("author", "ogSiteName"),
        description=_first("description", "ogDescription"),
        markdown_content=result.get("markdown", ""),
        html_content=result.get("html", ""),
        og_image=meta.get("ogImage"),
    )


class FirecrawlScraper(BaseGeneralScraper):
    """General-purpose URL scraper backed by the Firecrawl service."""

    async def get_processor_by_url(self, url: str) -> DataProcessor:
        # A fresh processor per URL; processors are single-use.
        return FirecrawlDataProcessor(url)
Loading