Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions app/services/scrapers/firecrawl_client/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ async def _process_firecrawl_result(self, result: dict) -> None:
# Extract metadata fields
title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url
author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc
description = metadata.get("description", "") or metadata.get("ogDescription", "")
# description = metadata.get("description", "") or metadata.get("ogDescription", "")

item_data = {
"id": self.id,
Expand All @@ -128,7 +128,7 @@ async def _process_firecrawl_result(self, result: dict) -> None:
}

# Process text content - use description or first part of markdown
text = description if description else markdown_content[:FIRECRAWL_TEXT_LIMIT]
text = html_content[:FIRECRAWL_TEXT_LIMIT]
item_data["text"] = text

html_content = await self.parsing_article_body_by_llm(html_content)
Expand Down
7 changes: 6 additions & 1 deletion app/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,9 @@
r"((www\.)?bilibili\.com\/video\/[A-Za-z0-9]+)",
r"b23\.tv\/[A-Za-z0-9]+",
],
}
}
# Regex fragments for URLs that must never be scraped. These are applied
# with re.search() in app/utils/parse.py, so a pattern may match anywhere
# inside the URL string — anchor each one on a concrete domain to avoid
# accidental hits on unrelated paths.
BANNED_PATTERNS = [
    # ChatGPT shared-conversation links
    r"chatgpt\.com\/share\/[A-Za-z0-9]+",
    # Gemini shared-conversation links: either the full domain or the
    # g.co short link. (The previous pattern, "gemini\/share\/...", had
    # no domain, so it missed gemini.google.com/share/... URLs entirely.)
    r"(gemini\.google\.com|g\.co\/gemini)\/share\/[A-Za-z0-9]+",
    # Telegram links (t.me channels / posts)
    r"t\.me\/[A-Za-z0-9]+",
]
8 changes: 7 additions & 1 deletion app/utils/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bs4 import BeautifulSoup

from app.models.url_metadata import UrlMetadata
from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS
from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS

TELEGRAM_TEXT_LIMIT = 900

Expand Down Expand Up @@ -89,6 +89,12 @@ async def get_url_metadata(url: str, ban_list: Optional[list] = None) -> UrlMeta
if source in ban_list:
source = "banned"
content_type = "banned"
else:
for item in BANNED_PATTERNS:
if re.search(item, url):
source = "banned"
content_type = "banned"
break
# TODO: check if the url is from Mastodon, according to the request cookie
return UrlMetadata(url=url, source=source, content_type=content_type)

Expand Down