Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions app/services/scrapers/firecrawl_client/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ async def _process_firecrawl_result(self, result: dict) -> None:
# Extract metadata fields
title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url
author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc
description = metadata.get("description", "") or metadata.get("ogDescription", "")
# description = metadata.get("description", "") or metadata.get("ogDescription", "")

item_data = {
"id": self.id,
Expand All @@ -128,7 +128,7 @@ async def _process_firecrawl_result(self, result: dict) -> None:
}

# Process text content - use description or first part of markdown
text = description if description else markdown_content[:FIRECRAWL_TEXT_LIMIT]
text = html_content[:FIRECRAWL_TEXT_LIMIT]
item_data["text"] = text

html_content = await self.parsing_article_body_by_llm(html_content)
Expand Down
7 changes: 6 additions & 1 deletion app/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,9 @@
r"((www\.)?bilibili\.com\/video\/[A-Za-z0-9]+)",
r"b23\.tv\/[A-Za-z0-9]+",
],
}
}
# Regex fragments for URLs that must never be scraped. These are applied
# with re.search() in app/utils/parse.py, so a pattern may match anywhere
# inside the URL string — anchor each one on a concrete domain to avoid
# accidental hits on unrelated paths.
BANNED_PATTERNS = [
    # ChatGPT shared-conversation links
    r"chatgpt\.com\/share\/[A-Za-z0-9]+",
    # Gemini shared-conversation links: either the full domain or the
    # g.co short link. (The previous pattern, "gemini\/share\/...", had
    # no domain, so it missed gemini.google.com/share/... URLs entirely.)
    r"(gemini\.google\.com|g\.co\/gemini)\/share\/[A-Za-z0-9]+",
    # Telegram links (t.me channels / posts)
    r"t\.me\/[A-Za-z0-9]+",
]
8 changes: 7 additions & 1 deletion app/utils/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bs4 import BeautifulSoup

from app.models.url_metadata import UrlMetadata
from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS
from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS

TELEGRAM_TEXT_LIMIT = 900

Expand Down Expand Up @@ -89,6 +89,12 @@ async def get_url_metadata(url: str, ban_list: Optional[list] = None) -> UrlMeta
if source in ban_list:
source = "banned"
content_type = "banned"
else:
for item in BANNED_PATTERNS:
if re.search(item, url):
source = "banned"
content_type = "banned"
break
# TODO: check if the url is from Mastodon, according to the request cookie
return UrlMetadata(url=url, source=source, content_type=content_type)

Expand Down