From 3aa31ab7d5108a90d9e2d8737ffd37eef7af81a7 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 31 Jan 2026 21:08:22 -0600 Subject: [PATCH] feat: add ban list hotfix --- app/services/scrapers/firecrawl_client/scraper.py | 4 ++-- app/utils/config.py | 7 ++++++- app/utils/parse.py | 8 +++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/app/services/scrapers/firecrawl_client/scraper.py b/app/services/scrapers/firecrawl_client/scraper.py index c08fe8d..d801800 100644 --- a/app/services/scrapers/firecrawl_client/scraper.py +++ b/app/services/scrapers/firecrawl_client/scraper.py @@ -116,7 +116,7 @@ async def _process_firecrawl_result(self, result: dict) -> None: # Extract metadata fields title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc - description = metadata.get("description", "") or metadata.get("ogDescription", "") + # description = metadata.get("description", "") or metadata.get("ogDescription", "") item_data = { "id": self.id, @@ -128,7 +128,7 @@ async def _process_firecrawl_result(self, result: dict) -> None: } # Process text content - use description or first part of markdown - text = description if description else markdown_content[:FIRECRAWL_TEXT_LIMIT] + text = html_content[:FIRECRAWL_TEXT_LIMIT] item_data["text"] = text html_content = await self.parsing_article_body_by_llm(html_content) diff --git a/app/utils/config.py b/app/utils/config.py index 0d59b78..2c9b6a3 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -47,4 +47,9 @@ r"((www\.)?bilibili\.com\/video\/[A-Za-z0-9]+)", r"b23\.tv\/[A-Za-z0-9]+", ], -} \ No newline at end of file +} +BANNED_PATTERNS = [ + r"chatgpt\.com\/share\/[A-Za-z0-9]+", + r"gemini\/share\/[A-Za-z0-9]+", + r"t\.me\/[A-Za-z0-9]+" +] \ No newline at end of file diff --git a/app/utils/parse.py b/app/utils/parse.py index 149d3ee..03a15d6 100644 --- a/app/utils/parse.py +++ b/app/utils/parse.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup from app.models.url_metadata import UrlMetadata -from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS +from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS TELEGRAM_TEXT_LIMIT = 900 @@ -89,6 +89,12 @@ async def get_url_metadata(url: str, ban_list: Optional[list] = None) -> UrlMeta if source in ban_list: source = "banned" content_type = "banned" + else: + for item in BANNED_PATTERNS: + if re.search(item, url): + source = "banned" + content_type = "banned" + break # TODO: check if the url is from Mastodon, according to the request cookie return UrlMetadata(url=url, source=source, content_type=content_type)