diff --git a/apps/api/src/services/scrapers/general/firecrawl.py b/apps/api/src/services/scrapers/general/firecrawl.py
index c7e940a..e09fb57 100644
--- a/apps/api/src/services/scrapers/general/firecrawl.py
+++ b/apps/api/src/services/scrapers/general/firecrawl.py
@@ -20,6 +20,26 @@
 
 GENERAL_TEXT_LIMIT = 800
 
+# If extracted text length is below this ratio of the raw HTML text length,
+# the LLM likely truncated/summarized the content.
+_TRUNCATION_RATIO_THRESHOLD = 0.4
+
+
+def _is_content_truncated(extracted_html: str, raw_html: str) -> bool:
+    """Detect truncation by comparing text lengths (tags stripped) of extracted vs raw content."""
+    raw_text_len = get_html_text_length(raw_html)
+    if raw_text_len == 0:
+        return False
+    extracted_text_len = get_html_text_length(extracted_html)
+    ratio = extracted_text_len / raw_text_len
+    if ratio < _TRUNCATION_RATIO_THRESHOLD:
+        logger.info(
+            f"Content truncation detected: extracted={extracted_text_len}, "
+            f"raw={raw_text_len}, ratio={ratio:.2f}"
+        )
+        return True
+    return False
+
 
 class FirecrawlDataProcessor(BaseGeneralDataProcessor):
     """
@@ -118,13 +138,26 @@ async def _process_json_extraction(
         if og_image:
             media_files.append(MediaFile(url=og_image, media_type="image"))
 
+        # Prefer the LLM-extracted HTML; fall back to the raw page HTML when the
+        # extraction is missing or looks truncated.
+        raw_html = full_result.get("html") or ""
+        if content_html and raw_html and _is_content_truncated(content_html, raw_html):
+            logger.warning(
+                "Firecrawl JSON extraction appears truncated, "
+                "falling back to raw HTML"
+            )
+            content_html = raw_html
+        elif not content_html:
+            content_html = raw_html
+
         # Sanitize and wrap content HTML
         if content_html:
             content_html = self.sanitize_html(content_html)
             content = wrap_text_into_html(content_html, is_html=True)
         else:
+            # Neither extracted nor raw HTML is available: keep the markdown fallback.
             markdown_content = full_result.get("markdown", "")
             content = wrap_text_into_html(markdown_content, is_html=False)
 
         self._data = {
             "id": self.id,
@@ -137,11 +170,11 @@ async def _process_json_extraction(
             "content": content,
             "raw_content": full_result.get("markdown", ""),
             "media_files": [m.to_dict() for m in media_files],
-            "message_type": (
-                MessageType.LONG
-                if get_html_text_length(content) > GENERAL_TEXT_LIMIT
-                else MessageType.SHORT
-            ),
+            # TODO: For now, we classify all JSON-extracted content as LONG to
+            # improve Telegram users' reading experience. Previously this was
+            # LONG only when get_html_text_length(content) > GENERAL_TEXT_LIMIT,
+            # and SHORT otherwise.
+            "message_type": MessageType.LONG,
             "scraper_type": self.scraper_type,
         }
 
diff --git a/apps/api/src/services/scrapers/general/firecrawl_schema.py b/apps/api/src/services/scrapers/general/firecrawl_schema.py
index 4e3e8e5..90663a2 100644
--- a/apps/api/src/services/scrapers/general/firecrawl_schema.py
+++ b/apps/api/src/services/scrapers/general/firecrawl_schema.py
@@ -67,8 +67,9 @@ class ExtractedArticle(BaseModel):
     content: str = Field(
         default="",
         description=(
-            "The main body content of the article as clean HTML. "
-            "Include headings, paragraphs, lists, links, and images. "
+            "The COMPLETE main body content of the article as clean HTML. "
+            "Include ALL text, headings, paragraphs, lists, links, and images. "
+            "Never truncate, shorten, or summarize the content. "
             "Exclude navigation, ads, sidebars, and comments."
         ),
     )
@@ -80,8 +81,11 @@ class ExtractedArticle(BaseModel):
 
 FIRECRAWL_EXTRACTION_PROMPT = (
     "Extract the main article or post content from this page. "
-    "For 'content', return the article body as clean HTML preserving "
-    "headings, paragraphs, lists, links, and embedded images. "
+    "For 'content', return the COMPLETE and FULL article body as clean HTML "
+    "preserving headings, paragraphs, lists, links, and embedded images. "
+    "Do NOT skip, truncate, summarize, or omit any part of the original content. "
+    "Do NOT add editorial notes like 'content continues' or 'text has been shortened'. "
+    "Include every paragraph and section of the article in full. "
     "For 'text', provide a brief plain-text summary (under 500 chars). "
     "For 'media_files', list all images and videos found in the "
     "article body with their direct URLs and any captions. "
diff --git a/packages/shared/fastfetchbot_shared/utils/config.py b/packages/shared/fastfetchbot_shared/utils/config.py
index d88e159..635eeed 100644
--- a/packages/shared/fastfetchbot_shared/utils/config.py
+++ b/packages/shared/fastfetchbot_shared/utils/config.py
@@ -50,9 +50,10 @@
 }
 BANNED_PATTERNS = [
     r"chatgpt\.com\/share\/[A-Za-z0-9]+",
-    r"gemini\/share\/[A-Za-z0-9]+",
+    r"gemini\.google\.com\/share\/[A-Za-z0-9]+",
     r"t\.me\/[A-Za-z0-9]+",
     r"github\.com\/[A-Za-z0-9_-]+\/[A-Za-z0-9_-]+",
-    r"discord\.gg",
+    r"discord\.gg\/[A-Za-z0-9]+",
-    r"linkin\.com\/in\/[A-Za-z0-9]+",
+    r"linkedin\.com\/in\/[A-Za-z0-9]+",
+    r"telegra\.ph",
 ]