Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions apps/api/src/services/scrapers/general/firecrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@

GENERAL_TEXT_LIMIT = 800

# If extracted text length is below this ratio of the raw HTML text length,
# the LLM likely truncated/summarized the content.
_TRUNCATION_RATIO_THRESHOLD = 0.4


def _is_content_truncated(extracted_html: str, raw_html: str) -> bool:
    """Detect truncation by comparing text lengths (tags stripped) of extracted vs raw content.

    Args:
        extracted_html: HTML produced by the LLM extraction step.
        raw_html: The original page HTML fetched by the scraper.

    Returns:
        True when the extracted text is suspiciously short relative to the raw
        page text (ratio below ``_TRUNCATION_RATIO_THRESHOLD``), which suggests
        the LLM truncated or summarized the content; False otherwise.
    """
    raw_text_len = get_html_text_length(raw_html)
    # An empty raw page gives no baseline to compare against (and would
    # divide by zero), so assume no truncation.
    if raw_text_len == 0:
        return False
    extracted_text_len = get_html_text_length(extracted_html)
    ratio = extracted_text_len / raw_text_len
    if ratio < _TRUNCATION_RATIO_THRESHOLD:
        # Lazy %-style args defer string formatting until the record is
        # actually emitted (no cost when INFO logging is disabled).
        logger.info(
            "Content truncation detected: extracted=%d, raw=%d, ratio=%.2f",
            extracted_text_len,
            raw_text_len,
            ratio,
        )
        return True
    return False


class FirecrawlDataProcessor(BaseGeneralDataProcessor):
"""
Expand Down Expand Up @@ -118,13 +138,19 @@ async def _process_json_extraction(
if og_image:
media_files.append(MediaFile(url=og_image, media_type="image"))

# Sanitize and wrap content HTML
if content_html:
content_html = self.sanitize_html(content_html)
# Sanitize and wrap content HTML, with truncation detection fallback
raw_html = full_result.get("html", "")
if not content_html or (raw_html and _is_content_truncated(content_html, raw_html)):
if content_html:
logger.warning(
"Firecrawl JSON extraction appears truncated, "
"falling back to raw HTML"
)
content_html = self.sanitize_html(raw_html) if raw_html else ""
content = wrap_text_into_html(content_html, is_html=True)
else:
markdown_content = full_result.get("markdown", "")
content = wrap_text_into_html(markdown_content, is_html=False)
content_html = self.sanitize_html(content_html)
content = wrap_text_into_html(content_html, is_html=True)

self._data = {
"id": self.id,
Expand All @@ -137,11 +163,13 @@ async def _process_json_extraction(
"content": content,
"raw_content": full_result.get("markdown", ""),
"media_files": [m.to_dict() for m in media_files],
"message_type": (
MessageType.LONG
if get_html_text_length(content) > GENERAL_TEXT_LIMIT
else MessageType.SHORT
),
"message_type": MessageType.LONG,
# (
# MessageType.LONG
# if get_html_text_length(content) > GENERAL_TEXT_LIMIT
# else MessageType.SHORT
# ),
# TODO: For now, we classify all JSON-extracted content as LONG to improve Telegram users' reading experience.
Comment on lines +166 to +172
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Hardcoding MessageType.LONG changes storage behavior for all JSON-extracted content.

Per context snippet 4 in apps/api/src/services/scrapers/common.py, MessageType.LONG triggers Telegraph storage for all articles. This means even short articles (e.g., a 200-character post) will now be stored in Telegraph unnecessarily, adding latency and external service dependency.

If the goal is better Telegram reading experience, consider a lower threshold than the original 800 characters rather than removing the threshold entirely. Alternatively, document this decision more explicitly for future maintainers.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/api/src/services/scrapers/general/firecrawl.py` around lines 166 - 172,
The hardcoded "message_type": MessageType.LONG forces Telegraph storage for all
JSON-extracted content; revert to a conditional that uses
get_html_text_length(content) compared to GENERAL_TEXT_LIMIT (from
scrapers.common) or pick a lower threshold (e.g., 200–800 chars) if you want
shorter content treated as LONG for Telegram; update the logic where
"message_type" is set in firecrawl.py to compute length via
get_html_text_length(content) and choose MessageType.LONG only when above the
chosen threshold, and add a short comment explaining the chosen threshold
decision for future maintainers.

"scraper_type": self.scraper_type,
}

Expand Down
12 changes: 8 additions & 4 deletions apps/api/src/services/scrapers/general/firecrawl_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ class ExtractedArticle(BaseModel):
content: str = Field(
default="",
description=(
"The main body content of the article as clean HTML. "
"Include headings, paragraphs, lists, links, and images. "
"The COMPLETE main body content of the article as clean HTML. "
"Include ALL text, headings, paragraphs, lists, links, and images. "
"Never truncate, shorten, or summarize the content. "
"Exclude navigation, ads, sidebars, and comments."
),
)
Expand All @@ -80,8 +81,11 @@ class ExtractedArticle(BaseModel):

FIRECRAWL_EXTRACTION_PROMPT = (
"Extract the main article or post content from this page. "
"For 'content', return the article body as clean HTML preserving "
"headings, paragraphs, lists, links, and embedded images. "
"For 'content', return the COMPLETE and FULL article body as clean HTML "
"preserving headings, paragraphs, lists, links, and embedded images. "
"Do NOT skip, truncate, summarize, or omit any part of the original content. "
"Do NOT add editorial notes like 'content continues' or 'text has been shortened'. "
"Include every paragraph and section of the article in full. "
"For 'text', provide a brief plain-text summary (under 500 chars). "
"For 'media_files', list all images and videos found in the "
"article body with their direct URLs and any captions. "
Expand Down
5 changes: 3 additions & 2 deletions packages/shared/fastfetchbot_shared/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@
}
# URL patterns we refuse to scrape: chat-share pages, Telegram links,
# GitHub repos, Discord invites, LinkedIn profiles, and Telegraph pages.
BANNED_PATTERNS = [
    r"chatgpt\.com\/share\/[A-Za-z0-9]+",
    r"gemini\.google\.com\/share\/[A-Za-z0-9]+",
    r"t\.me\/[A-Za-z0-9]+",
    r"github\.com\/[A-Za-z0-9_-]+\/[A-Za-z0-9_-]+",
    r"discord\.gg\/[A-Za-z0-9]+",
    # Fixed typo: was "linkin\.com", which never matched real LinkedIn URLs.
    # Username charset includes "_" and "-", both legal in LinkedIn slugs.
    r"linkedin\.com\/in\/[A-Za-z0-9_-]+",
    r"telegra\.ph",
]
]