Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions apps/api/src/services/scrapers/general/firecrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@

GENERAL_TEXT_LIMIT = 800

# If extracted text length is below this ratio of the raw HTML text length,
# the LLM likely truncated/summarized the content.
_TRUNCATION_RATIO_THRESHOLD = 0.4


def _is_content_truncated(extracted_html: str, raw_html: str) -> bool:
    """Detect truncation by comparing text lengths (tags stripped) of extracted vs raw content.

    Args:
        extracted_html: HTML produced by the LLM extraction step.
        raw_html: The original page HTML fetched by the scraper.

    Returns:
        True when the extracted text is suspiciously short relative to the raw
        page text (ratio below ``_TRUNCATION_RATIO_THRESHOLD``), which suggests
        the LLM truncated or summarized the content; False otherwise.
    """
    raw_text_len = get_html_text_length(raw_html)
    # An empty raw page gives no baseline to compare against (and would
    # divide by zero), so assume no truncation.
    if raw_text_len == 0:
        return False
    extracted_text_len = get_html_text_length(extracted_html)
    ratio = extracted_text_len / raw_text_len
    if ratio < _TRUNCATION_RATIO_THRESHOLD:
        # Lazy %-style args defer string formatting until the record is
        # actually emitted (no cost when INFO logging is disabled).
        logger.info(
            "Content truncation detected: extracted=%d, raw=%d, ratio=%.2f",
            extracted_text_len,
            raw_text_len,
            ratio,
        )
        return True
    return False


class FirecrawlDataProcessor(BaseGeneralDataProcessor):
"""
Expand Down Expand Up @@ -118,13 +138,19 @@ async def _process_json_extraction(
if og_image:
media_files.append(MediaFile(url=og_image, media_type="image"))

# Sanitize and wrap content HTML
if content_html:
content_html = self.sanitize_html(content_html)
# Sanitize and wrap content HTML, with truncation detection fallback
raw_html = full_result.get("html", "")
if not content_html or (raw_html and _is_content_truncated(content_html, raw_html)):
if content_html:
logger.warning(
"Firecrawl JSON extraction appears truncated, "
"falling back to raw HTML"
)
content_html = self.sanitize_html(raw_html) if raw_html else ""
content = wrap_text_into_html(content_html, is_html=True)
else:
markdown_content = full_result.get("markdown", "")
content = wrap_text_into_html(markdown_content, is_html=False)
content_html = self.sanitize_html(content_html)
content = wrap_text_into_html(content_html, is_html=True)

self._data = {
"id": self.id,
Expand All @@ -137,11 +163,13 @@ async def _process_json_extraction(
"content": content,
"raw_content": full_result.get("markdown", ""),
"media_files": [m.to_dict() for m in media_files],
"message_type": (
MessageType.LONG
if get_html_text_length(content) > GENERAL_TEXT_LIMIT
else MessageType.SHORT
),
"message_type": MessageType.LONG,
# (
# MessageType.LONG
# if get_html_text_length(content) > GENERAL_TEXT_LIMIT
# else MessageType.SHORT
# ),
# TODO: For now, we classify all JSON-extracted content as LONG to improve Telegram users' reading experience.
Comment on lines +166 to +172
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Hardcoding MessageType.LONG changes storage behavior for all JSON-extracted content.

Per context snippet 4 in apps/api/src/services/scrapers/common.py, MessageType.LONG triggers Telegraph storage for all articles. This means even short articles (e.g., a 200-character post) will now be stored in Telegraph unnecessarily, adding latency and external service dependency.

If the goal is better Telegram reading experience, consider a lower threshold than the original 800 characters rather than removing the threshold entirely. Alternatively, document this decision more explicitly for future maintainers.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/api/src/services/scrapers/general/firecrawl.py` around lines 166 - 172,
The hardcoded "message_type": MessageType.LONG forces Telegraph storage for all
JSON-extracted content; revert to a conditional that uses
get_html_text_length(content) compared to GENERAL_TEXT_LIMIT (from
scrapers.common) or pick a lower threshold (e.g., 200–800 chars) if you want
shorter content treated as LONG for Telegram; update the logic where
"message_type" is set in firecrawl.py to compute length via
get_html_text_length(content) and choose MessageType.LONG only when above the
chosen threshold, and add a short comment explaining the chosen threshold
decision for future maintainers.

"scraper_type": self.scraper_type,
}

Expand Down
12 changes: 8 additions & 4 deletions apps/api/src/services/scrapers/general/firecrawl_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ class ExtractedArticle(BaseModel):
content: str = Field(
default="",
description=(
"The main body content of the article as clean HTML. "
"Include headings, paragraphs, lists, links, and images. "
"The COMPLETE main body content of the article as clean HTML. "
"Include ALL text, headings, paragraphs, lists, links, and images. "
"Never truncate, shorten, or summarize the content. "
"Exclude navigation, ads, sidebars, and comments."
),
)
Expand All @@ -80,8 +81,11 @@ class ExtractedArticle(BaseModel):

FIRECRAWL_EXTRACTION_PROMPT = (
"Extract the main article or post content from this page. "
"For 'content', return the article body as clean HTML preserving "
"headings, paragraphs, lists, links, and embedded images. "
"For 'content', return the COMPLETE and FULL article body as clean HTML "
"preserving headings, paragraphs, lists, links, and embedded images. "
"Do NOT skip, truncate, summarize, or omit any part of the original content. "
"Do NOT add editorial notes like 'content continues' or 'text has been shortened'. "
"Include every paragraph and section of the article in full. "
"For 'text', provide a brief plain-text summary (under 500 chars). "
"For 'media_files', list all images and videos found in the "
"article body with their direct URLs and any captions. "
Expand Down
5 changes: 3 additions & 2 deletions packages/shared/fastfetchbot_shared/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@
}
# URL patterns we refuse to scrape: chat-share pages, Telegram links,
# GitHub repos, Discord invites, LinkedIn profiles, and Telegraph pages.
BANNED_PATTERNS = [
    r"chatgpt\.com\/share\/[A-Za-z0-9]+",
    r"gemini\.google\.com\/share\/[A-Za-z0-9]+",
    r"t\.me\/[A-Za-z0-9]+",
    r"github\.com\/[A-Za-z0-9_-]+\/[A-Za-z0-9_-]+",
    r"discord\.gg\/[A-Za-z0-9]+",
    # Fixed typo: was "linkin\.com", which never matched real LinkedIn URLs.
    # Username charset includes "_" and "-", both legal in LinkedIn slugs.
    r"linkedin\.com\/in\/[A-Za-z0-9_-]+",
    r"telegra\.ph",
]
]