From 868896f579b45b51a7a3443cab50161b3b012bd1 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 14 Feb 2026 14:41:39 -0600 Subject: [PATCH 1/8] feat: add zyte API --- app/config.py | 16 +- .../{firecrawl_client => general}/__init__.py | 11 +- .../scraper.py => general/base.py} | 145 +++++++------- app/services/scrapers/general/firecrawl.py | 56 ++++++ .../client.py => general/firecrawl_client.py} | 27 +-- app/services/scrapers/general/scraper.py | 86 ++++++++ app/services/scrapers/general/zyte.py | 78 ++++++++ app/services/scrapers/scraper_manager.py | 18 +- app/services/telegram_bot/__init__.py | 8 +- app/templates/social_media_message.jinja2 | 6 +- poetry.lock | 187 +++++++++++++++++- pyproject.toml | 4 +- template.env | 18 ++ 13 files changed, 537 insertions(+), 123 deletions(-) rename app/services/scrapers/{firecrawl_client => general}/__init__.py (71%) rename app/services/scrapers/{firecrawl_client/scraper.py => general/base.py} (68%) create mode 100644 app/services/scrapers/general/firecrawl.py rename app/services/scrapers/{firecrawl_client/client.py => general/firecrawl_client.py} (68%) create mode 100644 app/services/scrapers/general/scraper.py create mode 100644 app/services/scrapers/general/zyte.py diff --git a/app/config.py b/app/config.py index 04cb826..1e0b3fc 100644 --- a/app/config.py +++ b/app/config.py @@ -208,16 +208,22 @@ def ban_list_resolver(ban_list_string: str) -> list: INOREADER_EMAIL = env.get("INOREADER_EMAIL", None) INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None) -# Open AI API environment variables +# Open AI API OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) -# Firecrawl API environment variables -FIRECRAWL_ON = get_env_bool(env, "FIRECRAWL_ON", False) +# General webpage scraping +GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) +GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") + +# Firecrawl API FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") -FIRECRAWL_TIMEOUT_SECONDS = env.get("FIRECRAWL_TIMEOUT_SECONDS", 60) -# Locale environment variables + +# Zyte API +ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) + +# Locale directories environment variables localedir = os.path.join(os.path.dirname(__file__), "locale") translation = gettext.translation("messages", localedir=localedir, fallback=True) _ = translation.gettext diff --git a/app/services/scrapers/firecrawl_client/__init__.py b/app/services/scrapers/general/__init__.py similarity index 71% rename from app/services/scrapers/firecrawl_client/__init__.py rename to app/services/scrapers/general/__init__.py index d26a87a..94c0402 100644 --- a/app/services/scrapers/firecrawl_client/__init__.py +++ b/app/services/scrapers/general/__init__.py @@ -5,17 +5,18 @@ @dataclass -class FirecrawlItem(MetadataItem): +class GeneralItem(MetadataItem): """ - FirecrawlItem: Data class for scraped content from Firecrawl. + GeneralItem: Data class for scraped content from general webpage scrapers. """ id: str = "" raw_content: str = "" + scraper_type: str = "" # Which scraper was used (e.g., "firecrawl", "zyte", etc.) @staticmethod - def from_dict(obj: Any) -> "FirecrawlItem": + def from_dict(obj: Any) -> "GeneralItem": metadata_item = MetadataItem.from_dict(obj) - return FirecrawlItem( + return GeneralItem( url=metadata_item.url, title=metadata_item.title, author=metadata_item.author, @@ -28,10 +29,12 @@ def from_dict(obj: Any) -> "FirecrawlItem": message_type=metadata_item.message_type, id=obj.get("id", ""), raw_content=obj.get("raw_content", ""), + scraper_type=obj.get("scraper_type", ""), ) def to_dict(self) -> dict: result: dict = super().to_dict() result["id"] = self.id result["raw_content"] = self.raw_content + result["scraper_type"] = self.scraper_type return result diff --git a/app/services/scrapers/firecrawl_client/scraper.py b/app/services/scrapers/general/base.py similarity index 68% rename from app/services/scrapers/firecrawl_client/scraper.py rename to app/services/scrapers/general/base.py index 8efc261..832ef29 100644 --- a/app/services/scrapers/firecrawl_client/scraper.py +++ b/app/services/scrapers/general/base.py @@ -1,4 +1,6 @@ import hashlib +from abc import abstractmethod +from typing import Optional from urllib.parse import urlparse from openai import AsyncOpenAI @@ -7,12 +9,13 @@ from app.config import OPENAI_API_KEY from app.models.metadata_item import MediaFile, MessageType from app.services.scrapers.scraper import Scraper, DataProcessor -from app.services.scrapers.firecrawl_client import FirecrawlItem -from app.services.scrapers.firecrawl_client.client import FirecrawlClient +from app.services.scrapers.general import GeneralItem from app.utils.parse import get_html_text_length, wrap_text_into_html from app.utils.logger import logger -FIRECRAWL_TEXT_LIMIT = 800 +GENERAL_TEXT_LIMIT = 800 + +DEFAULT_OPENAI_MODEL = "gpt-4o-mini" # System prompt for LLM to extract article content ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. @@ -28,9 +31,10 @@ Return ONLY the extracted HTML content, no explanations or markdown.""" -class FirecrawlDataProcessor(DataProcessor): +class BaseGeneralDataProcessor(DataProcessor): """ - FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. + Base class for general webpage data processors. + Each specific scraper (Firecrawl, Zyte, etc.) should inherit from this class. """ def __init__(self, url: str): @@ -38,27 +42,71 @@ def __init__(self, url: str): self._data: dict = {} self.url_parser = urlparse(url) self.id = hashlib.md5(url.encode()).hexdigest()[:16] - self._client: FirecrawlClient = FirecrawlClient.get_instance() + self.scraper_type: str = "base" async def get_item(self) -> dict: await self.process_data() - firecrawl_item = FirecrawlItem.from_dict(self._data) - return firecrawl_item.to_dict() + general_item = GeneralItem.from_dict(self._data) + return general_item.to_dict() async def process_data(self) -> None: await self._get_page_content() + @abstractmethod async def _get_page_content(self) -> None: - try: - result = self._client.scrape_url( - url=self.url, - formats=["markdown", "html"], - only_main_content=True, - ) - await self._process_firecrawl_result(result) - except Exception as e: - logger.error(f"Failed to scrape URL with Firecrawl: {e}") - raise + """Subclasses must implement this method to fetch page content.""" + pass + + async def _build_item_data( + self, + title: str, + author: str, + description: str, + markdown_content: str, + html_content: str, + og_image: Optional[str] = None, + ) -> None: + """ + Common method to build item data from scraped content. + """ + item_data = { + "id": self.id, + "category": "other", + "url": self.url, + "title": title or self.url, + "author": author or self.url_parser.netloc, + "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", + "scraper_type": self.scraper_type, + } + + # Process text content - use description or first part of markdown + text = description if description else markdown_content[:500] + item_data["text"] = text + + # Process HTML content with LLM if available + if html_content: + cleaned_html = await self.parsing_article_body_by_llm(html_content) + content = wrap_text_into_html(cleaned_html, is_html=True) + else: + content = wrap_text_into_html(markdown_content, is_html=False) + item_data["content"] = content + item_data["raw_content"] = markdown_content + + # Process media files - extract og:image if available + media_files = [] + if og_image: + media_files.append(MediaFile(url=og_image, media_type="image")) + + item_data["media_files"] = [m.to_dict() for m in media_files] + + # Determine the message type based on content length (not text length) + item_data["message_type"] = ( + MessageType.LONG + if get_html_text_length(content) > GENERAL_TEXT_LIMIT + else MessageType.SHORT + ) + + self._data = item_data @staticmethod async def parsing_article_body_by_llm(html_content: str) -> str: @@ -66,7 +114,7 @@ async def parsing_article_body_by_llm(html_content: str) -> str: Use LLM to extract the main article content from HTML. Args: - html_content: Raw HTML content from Firecrawl + html_content: Raw HTML content from a scraper Returns: Cleaned HTML containing only the main article content @@ -86,7 +134,7 @@ async def parsing_article_body_by_llm(html_content: str) -> str: truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content response = await client.chat.completions.create( - model="gpt-4o-mini", + model=DEFAULT_OPENAI_MODEL, messages=[ ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT), ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}") @@ -108,61 +156,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str: logger.error(f"Failed to parse article body with LLM: {e}") return html_content - async def _process_firecrawl_result(self, result: dict) -> None: - metadata = result.get("metadata", {}) - markdown_content = result.get("markdown", "") - html_content = result.get("html", "") - - # Extract metadata fields - title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url - author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc - description = metadata.get("description", "") or metadata.get("ogDescription", "") - - item_data = { - "id": self.id, - "category": "other", - "url": self.url, - "title": title, - "author": author, - "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", - } - - # Process text content - use description or first part of markdown - text = description if description else markdown_content[:500] - item_data["text"] = text - - html_content = await self.parsing_article_body_by_llm(html_content) - - # Process HTML content - if html_content: - content = wrap_text_into_html(html_content, is_html=True) - else: - content = wrap_text_into_html(markdown_content, is_html=False) - item_data["content"] = content - item_data["raw_content"] = markdown_content - - # Process media files - extract og:image if available - media_files = [] - og_image = metadata.get("ogImage") - if og_image: - media_files.append(MediaFile(url=og_image, media_type="image")) - - item_data["media_files"] = [m.to_dict() for m in media_files] - - # Determine message type based on text length - item_data["message_type"] = ( - MessageType.LONG - if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT - else MessageType.SHORT - ) - - self._data = item_data - -class FirecrawlScraper(Scraper): +class BaseGeneralScraper(Scraper): """ - FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. + Base class for general webpage scrapers. """ + @abstractmethod async def get_processor_by_url(self, url: str) -> DataProcessor: - return FirecrawlDataProcessor(url) + pass diff --git a/app/services/scrapers/general/firecrawl.py b/app/services/scrapers/general/firecrawl.py new file mode 100644 index 0000000..b1d9d40 --- /dev/null +++ b/app/services/scrapers/general/firecrawl.py @@ -0,0 +1,56 @@ +from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper +from app.services.scrapers.general.firecrawl_client import FirecrawlClient +from app.services.scrapers.scraper import DataProcessor +from app.utils.logger import logger + + +class FirecrawlDataProcessor(BaseGeneralDataProcessor): + """ + FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. + """ + + def __init__(self, url: str): + super().__init__(url) + self.scraper_type = "firecrawl" + self._client: FirecrawlClient = FirecrawlClient.get_instance() + + async def _get_page_content(self) -> None: + try: + result = self._client.scrape_url( + url=self.url, + formats=["markdown", "html"], + only_main_content=True, + ) + await self._process_firecrawl_result(result) + except Exception as e: + logger.error(f"Failed to scrape URL with Firecrawl: {e}") + raise + + async def _process_firecrawl_result(self, result: dict) -> None: + metadata = result.get("metadata", {}) + markdown_content = result.get("markdown", "") + html_content = result.get("html", "") + + # Extract metadata fields + title = metadata.get("title", "") or metadata.get("ogTitle", "") + author = metadata.get("author", "") or metadata.get("ogSiteName", "") + description = metadata.get("description", "") or metadata.get("ogDescription", "") + og_image = metadata.get("ogImage") + + await self._build_item_data( + title=title, + author=author, + description=description, + markdown_content=markdown_content, + html_content=html_content, + og_image=og_image, + ) + + +class FirecrawlScraper(BaseGeneralScraper): + """ + FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. + """ + + async def get_processor_by_url(self, url: str) -> DataProcessor: + return FirecrawlDataProcessor(url) diff --git a/app/services/scrapers/firecrawl_client/client.py b/app/services/scrapers/general/firecrawl_client.py similarity index 68% rename from app/services/scrapers/firecrawl_client/client.py rename to app/services/scrapers/general/firecrawl_client.py index 7389996..454ca63 100644 --- a/app/services/scrapers/firecrawl_client/client.py +++ b/app/services/scrapers/general/firecrawl_client.py @@ -6,14 +6,13 @@ from firecrawl import Firecrawl -from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY, FIRECRAWL_TIMEOUT_SECONDS +from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY @dataclass(frozen=True) class FirecrawlSettings: api_url: str api_key: str - timeout_seconds: int = 60 # 你也可以在反代侧控制超时 class FirecrawlClient: @@ -33,10 +32,7 @@ def __init__(self, config: FirecrawlSettings): @staticmethod def _create_app(config: FirecrawlSettings) -> Firecrawl: - try: - return Firecrawl(api_url=config.api_url, api_key=config.api_key) - except TypeError: - return Firecrawl(api_url=config.api_url, api_key=config.api_key) + return Firecrawl(api_url=config.api_url, api_key=config.api_key) @classmethod def get_instance(cls) -> "FirecrawlClient": @@ -55,7 +51,6 @@ def get_instance(cls) -> "FirecrawlClient": config = FirecrawlSettings( api_url=FIRECRAWL_API_URL, api_key=FIRECRAWL_API_KEY, - timeout_seconds=FIRECRAWL_TIMEOUT_SECONDS, ) cls._instance = cls(config) @@ -72,24 +67,14 @@ def scrape_url( url: str, formats: Optional[List[str]] = None, only_main_content: bool = True, - timeout_seconds: Optional[int] = None, - extra_params: Optional[Dict[str, Any]] = None, + timeout: Optional[int] = None, ) -> Dict[str, Any]: """ - 单页抓取(最常用) + timeout: milliseconds """ - params: Dict[str, Any] = { - "formats": formats or ["markdown"], - "onlyMainContent": only_main_content, - } - if extra_params: - params.update(extra_params) - - # if timeout_seconds is None: - # timeout_seconds = self._settings.timeout_seconds - try: - return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump( + return self._app.scrape(url, formats=formats, only_main_content=only_main_content, + timeout=timeout).model_dump( exclude_none=True) except Exception as e: raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e diff --git a/app/services/scrapers/general/scraper.py b/app/services/scrapers/general/scraper.py new file mode 100644 index 0000000..17d9c38 --- /dev/null +++ b/app/services/scrapers/general/scraper.py @@ -0,0 +1,86 @@ +from typing import Optional + +from app.config import GENERAL_SCRAPING_API +from app.services.scrapers.scraper import Scraper, DataProcessor +from app.services.scrapers.general.base import BaseGeneralScraper +from app.services.scrapers.general.firecrawl import FirecrawlScraper +from app.services.scrapers.general.zyte import ZyteScraper +from app.utils.logger import logger + + +class GeneralScraper(Scraper): + """ + GeneralScraper: A wrapper scraper that delegates to the configured scraper implementation. + + This class acts as a factory/facade that selects the appropriate scraper + based on the GENERAL_SCRAPING_API configuration. + + Supported scrapers: + - FIRECRAWL: Uses Firecrawl API for scraping + - ZYTE: Uses Zyte API for scraping + """ + + # Registry of available scrapers + SCRAPER_REGISTRY: dict[str, type[BaseGeneralScraper]] = { + "FIRECRAWL": FirecrawlScraper, + "ZYTE": ZyteScraper, + } + + def __init__(self, scraper_type: Optional[str] = None): + """ + Initialize the GeneralScraper with a specific scraper type. + + Args: + scraper_type: The type of scraper to use. If None, uses GENERAL_SCRAPING_API config. + """ + self.scraper_type = scraper_type or GENERAL_SCRAPING_API + self._scraper: Optional[BaseGeneralScraper] = None + self._init_scraper() + + def _init_scraper(self) -> None: + """Initialize the underlying scraper based on scraper_type.""" + scraper_class = self.SCRAPER_REGISTRY.get(self.scraper_type.upper()) + + if scraper_class is None: + available = ", ".join(self.SCRAPER_REGISTRY.keys()) + logger.error(f"Unknown scraper type: {self.scraper_type}. Available: {available}") + # Fall back to Firecrawl as default + logger.info("Falling back to FIRECRAWL scraper") + scraper_class = FirecrawlScraper + + self._scraper = scraper_class() + logger.info(f"Initialized GeneralScraper with {self.scraper_type} backend") + + async def get_processor_by_url(self, url: str) -> DataProcessor: + """ + Get the appropriate data processor for the given URL. + + Args: + url: The URL to scrape + + Returns: + DataProcessor instance for processing the URL + """ + return await self._scraper.get_processor_by_url(url) + + @classmethod + def register_scraper(cls, name: str, scraper_class: type[BaseGeneralScraper]) -> None: + """ + Register a new scraper type. + + Args: + name: The name to register the scraper under (e.g., "ZYTE") + scraper_class: The scraper class to register + """ + cls.SCRAPER_REGISTRY[name.upper()] = scraper_class + logger.info(f"Registered new scraper: {name}") + + @classmethod + def get_available_scrapers(cls) -> list[str]: + """ + Get a list of available scraper types. + + Returns: + List of registered scraper names + """ + return list(cls.SCRAPER_REGISTRY.keys()) diff --git a/app/services/scrapers/general/zyte.py b/app/services/scrapers/general/zyte.py new file mode 100644 index 0000000..1ff00a5 --- /dev/null +++ b/app/services/scrapers/general/zyte.py @@ -0,0 +1,78 @@ +from zyte_api import AsyncZyteAPI + +from app.config import ZYTE_API_KEY +from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper +from app.services.scrapers.scraper import DataProcessor +from app.utils.logger import logger + + +class ZyteDataProcessor(BaseGeneralDataProcessor): + """ + ZyteDataProcessor: Process URLs using Zyte API to extract content. + """ + + def __init__(self, url: str): + super().__init__(url) + self.scraper_type = "zyte" + + async def _get_page_content(self) -> None: + if not ZYTE_API_KEY: + raise RuntimeError("ZYTE_API_KEY is not configured") + + try: + client = AsyncZyteAPI(api_key=ZYTE_API_KEY) + result = await client.get( + { + "url": self.url, + "browserHtml": True, + "article": True, + "articleOptions": {"extractFrom": "browserHtml"}, + } + ) + await self._process_zyte_result(result) + except Exception as e: + logger.error(f"Failed to scrape URL with Zyte: {e}") + raise + + async def _process_zyte_result(self, result: dict) -> None: + article = result.get("article", {}) + browser_html = result.get("browserHtml", "") + + # Extract metadata fields from article + title = article.get("headline", "") or article.get("name", "") + + # Extract author information + authors = article.get("authors", []) + author = authors[0].get("name", "") if authors else "" + + description = article.get("description", "") or article.get("articleBodyRaw", "")[:500] + + # Get article body as HTML + article_body_html = article.get("articleBodyHtml", "") + article_body_raw = article.get("articleBodyRaw", "") + + # Use article body HTML if available, otherwise fall back to browser HTML + html_content = article_body_html if article_body_html else browser_html + markdown_content = article_body_raw + + # Extract main image + main_image = article.get("mainImage", {}) + og_image = main_image.get("url") if main_image else None + + await self._build_item_data( + title=title, + author=author, + description=description, + markdown_content=markdown_content, + html_content=html_content, + og_image=og_image, + ) + + +class ZyteScraper(BaseGeneralScraper): + """ + ZyteScraper: Scraper implementation using Zyte API for generic URL scraping. + """ + + async def get_processor_by_url(self, url: str) -> DataProcessor: + return ZyteDataProcessor(url) diff --git a/app/services/scrapers/scraper_manager.py b/app/services/scrapers/scraper_manager.py index b58bd72..aac7add 100644 --- a/app/services/scrapers/scraper_manager.py +++ b/app/services/scrapers/scraper_manager.py @@ -3,7 +3,7 @@ from app.utils.logger import logger from app.services.scrapers.bluesky.scraper import BlueskyScraper from app.services.scrapers.weibo.scraper import WeiboScraper -from app.services.scrapers.firecrawl_client.scraper import FirecrawlScraper +from app.services.scrapers.general.scraper import GeneralScraper from app.config import ( BLUESKY_USERNAME, BLUESKY_PASSWORD ) @@ -13,12 +13,12 @@ class ScraperManager: bluesky_scraper: Optional[BlueskyScraper] = None weibo_scraper: Optional[WeiboScraper] = None - firecrawl_scraper: Optional[FirecrawlScraper] = None + general_scraper: Optional[GeneralScraper] = None scrapers = {"bluesky": bluesky_scraper, "weibo": weibo_scraper, - "other": firecrawl_scraper, - "unknown": firecrawl_scraper} + "other": general_scraper, + "unknown": general_scraper} @classmethod async def init_scrapers(cls): @@ -32,8 +32,8 @@ async def init_scraper(cls, category: str) -> None: scraper = await cls.init_bluesky_scraper() elif category == "weibo" and not cls.weibo_scraper: scraper = await cls.init_weibo_scraper() - elif category in ["other", "unknown"] and not cls.firecrawl_scraper: - scraper = await cls.init_firecrawl_scraper() + elif category in ["other", "unknown"] and not cls.general_scraper: + scraper = await cls.init_general_scraper() if scraper: cls.scrapers[category] = scraper else: @@ -52,7 +52,7 @@ async def init_weibo_scraper(cls) -> WeiboScraper: return weibo_scraper @classmethod - async def init_firecrawl_scraper(cls) -> FirecrawlScraper: - firecrawl_scraper = FirecrawlScraper() - return firecrawl_scraper + async def init_general_scraper(cls) -> GeneralScraper: + general_scraper = GeneralScraper() + return general_scraper diff --git a/app/services/telegram_bot/__init__.py b/app/services/telegram_bot/__init__.py index dafd924..6fc9ace 100755 --- a/app/services/telegram_bot/__init__.py +++ b/app/services/telegram_bot/__init__.py @@ -68,7 +68,7 @@ JINJA2_ENV, OPENAI_API_KEY, DATABASE_ON, - TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, FIRECRAWL_ON, + TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, GENERAL_SCRAPING_ON, ) from app.services.telegram_bot.config import ( HTTPS_URL_REGEX, @@ -207,7 +207,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None: ) return if url_metadata.source == "unknown": - if FIRECRAWL_ON: + if GENERAL_SCRAPING_ON: await process_message.edit_text( text=f"Uncategorized url found. General webpage parser is on, Processing..." ) @@ -348,7 +348,7 @@ async def https_url_auto_process(update: Update, context: CallbackContext) -> No url_metadata = await get_url_metadata( url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST ) - if url_metadata.source == "unknown" and FIRECRAWL_ON: + if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON: metadata_item = await content_process_function(url_metadata=url_metadata) await send_item_message( metadata_item, chat_id=message.chat_id, message=message @@ -475,7 +475,7 @@ async def _create_choose_channel_keyboard(data: dict) -> list: async def invalid_buttons(update: Update, context: CallbackContext) -> None: await update.callback_query.answer("Invalid button!") await update.effective_message.edit_text( - "Sorry, Error Occured, I could not process this button click 😕." + "Sorry, Error Occurred, I could not process this button click 😕." ) diff --git a/app/templates/social_media_message.jinja2 b/app/templates/social_media_message.jinja2 index c6a282d..6e27fcb 100644 --- a/app/templates/social_media_message.jinja2 +++ b/app/templates/social_media_message.jinja2 @@ -1,9 +1,9 @@ {# templates/social_media_message.html #} {% if data.message_type == "short" %} - {% if data.title %} -{{ data.title }} - {% endif %} +{# {% if data.title %}#} +{#{{ data.title }}#} +{# {% endif %}#} {{ data.text }} {% if data.category in ['youtube', 'bilibili'] %} {% endif %} diff --git a/poetry.lock b/poetry.lock index 045b721..a0adab8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -568,6 +568,116 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version > [package.extras] crt = ["awscrt (==0.23.4)"] +[[package]] +name = "brotli" +version = "1.2.0" +description = "Python bindings for the Brotli compression library" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "brotli-1.2.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:99cfa69813d79492f0e5d52a20fd18395bc82e671d5d40bd5a91d13e75e468e8"}, + {file = "brotli-1.2.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3ebe801e0f4e56d17cd386ca6600573e3706ce1845376307f5d2cbd32149b69a"}, + {file = "brotli-1.2.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:a387225a67f619bf16bd504c37655930f910eb03675730fc2ad69d3d8b5e7e92"}, + {file = "brotli-1.2.0-cp27-cp27m-win32.whl", hash = "sha256:b908d1a7b28bc72dfb743be0d4d3f8931f8309f810af66c906ae6cd4127c93cb"}, + {file = "brotli-1.2.0-cp27-cp27m-win_amd64.whl", hash = "sha256:d206a36b4140fbb5373bf1eb73fb9de589bb06afd0d22376de23c5e91d0ab35f"}, + {file = "brotli-1.2.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7e9053f5fb4e0dfab89243079b3e217f2aea4085e4d58c5c06115fc34823707f"}, + {file = "brotli-1.2.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:4735a10f738cb5516905a121f32b24ce196ab82cfc1e4ba2e3ad1b371085fd46"}, + {file = "brotli-1.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b90b767916ac44e93a8e28ce6adf8d551e43affb512f2377c732d486ac6514e"}, + {file = "brotli-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6be67c19e0b0c56365c6a76e393b932fb0e78b3b56b711d180dd7013cb1fd984"}, + {file = "brotli-1.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0bbd5b5ccd157ae7913750476d48099aaf507a79841c0d04a9db4415b14842de"}, + {file = "brotli-1.2.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3f3c908bcc404c90c77d5a073e55271a0a498f4e0756e48127c35d91cf155947"}, + {file = "brotli-1.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b557b29782a643420e08d75aea889462a4a8796e9a6cf5621ab05a3f7da8ef2"}, + {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81da1b229b1889f25adadc929aeb9dbc4e922bd18561b65b08dd9343cfccca84"}, + {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ff09cd8c5eec3b9d02d2408db41be150d8891c5566addce57513bf546e3d6c6d"}, + {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a1778532b978d2536e79c05dac2d8cd857f6c55cd0c95ace5b03740824e0e2f1"}, + {file = "brotli-1.2.0-cp310-cp310-win32.whl", hash = "sha256:b232029d100d393ae3c603c8ffd7e3fe6f798c5e28ddca5feabb8e8fdb732997"}, + {file = "brotli-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:ef87b8ab2704da227e83a246356a2b179ef826f550f794b2c52cddb4efbd0196"}, + {file = "brotli-1.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:15b33fe93cedc4caaff8a0bd1eb7e3dab1c61bb22a0bf5bdfdfd97cd7da79744"}, + {file = "brotli-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:898be2be399c221d2671d29eed26b6b2713a02c2119168ed914e7d00ceadb56f"}, + {file = "brotli-1.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350c8348f0e76fff0a0fd6c26755d2653863279d086d3aa2c290a6a7251135dd"}, + {file = "brotli-1.2.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1ad3fda65ae0d93fec742a128d72e145c9c7a99ee2fcd667785d99eb25a7fe"}, + {file = "brotli-1.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:40d918bce2b427a0c4ba189df7a006ac0c7277c180aee4617d99e9ccaaf59e6a"}, + {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2a7f1d03727130fc875448b65b127a9ec5d06d19d0148e7554384229706f9d1b"}, + {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9c79f57faa25d97900bfb119480806d783fba83cd09ee0b33c17623935b05fa3"}, + {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:844a8ceb8483fefafc412f85c14f2aae2fb69567bf2a0de53cdb88b73e7c43ae"}, + {file = "brotli-1.2.0-cp311-cp311-win32.whl", hash = "sha256:aa47441fa3026543513139cb8926a92a8e305ee9c71a6209ef7a97d91640ea03"}, + {file = "brotli-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:022426c9e99fd65d9475dce5c195526f04bb8be8907607e27e747893f6ee3e24"}, + {file = "brotli-1.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:35d382625778834a7f3061b15423919aa03e4f5da34ac8e02c074e4b75ab4f84"}, + {file = "brotli-1.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7a61c06b334bd99bc5ae84f1eeb36bfe01400264b3c352f968c6e30a10f9d08b"}, + {file = "brotli-1.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:acec55bb7c90f1dfc476126f9711a8e81c9af7fb617409a9ee2953115343f08d"}, + {file = "brotli-1.2.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:260d3692396e1895c5034f204f0db022c056f9e2ac841593a4cf9426e2a3faca"}, + {file = "brotli-1.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:072e7624b1fc4d601036ab3f4f27942ef772887e876beff0301d261210bca97f"}, + {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adedc4a67e15327dfdd04884873c6d5a01d3e3b6f61406f99b1ed4865a2f6d28"}, + {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7a47ce5c2288702e09dc22a44d0ee6152f2c7eda97b3c8482d826a1f3cfc7da7"}, + {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:af43b8711a8264bb4e7d6d9a6d004c3a2019c04c01127a868709ec29962b6036"}, + {file = "brotli-1.2.0-cp312-cp312-win32.whl", hash = "sha256:e99befa0b48f3cd293dafeacdd0d191804d105d279e0b387a32054c1180f3161"}, + {file = "brotli-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:b35c13ce241abdd44cb8ca70683f20c0c079728a36a996297adb5334adfc1c44"}, + {file = "brotli-1.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9e5825ba2c9998375530504578fd4d5d1059d09621a02065d1b6bfc41a8e05ab"}, + {file = "brotli-1.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0cf8c3b8ba93d496b2fae778039e2f5ecc7cff99df84df337ca31d8f2252896c"}, + {file = "brotli-1.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8565e3cdc1808b1a34714b553b262c5de5fbda202285782173ec137fd13709f"}, + {file = "brotli-1.2.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:26e8d3ecb0ee458a9804f47f21b74845cc823fd1bb19f02272be70774f56e2a6"}, + {file = "brotli-1.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67a91c5187e1eec76a61625c77a6c8c785650f5b576ca732bd33ef58b0dff49c"}, + {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4ecdb3b6dc36e6d6e14d3a1bdc6c1057c8cbf80db04031d566eb6080ce283a48"}, + {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3e1b35d56856f3ed326b140d3c6d9db91740f22e14b06e840fe4bb1923439a18"}, + {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54a50a9dad16b32136b2241ddea9e4df159b41247b2ce6aac0b3276a66a8f1e5"}, + {file = "brotli-1.2.0-cp313-cp313-win32.whl", hash = "sha256:1b1d6a4efedd53671c793be6dd760fcf2107da3a52331ad9ea429edf0902f27a"}, + {file = "brotli-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:b63daa43d82f0cdabf98dee215b375b4058cce72871fd07934f179885aad16e8"}, + {file = "brotli-1.2.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:6c12dad5cd04530323e723787ff762bac749a7b256a5bece32b2243dd5c27b21"}, + {file = "brotli-1.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3219bd9e69868e57183316ee19c84e03e8f8b5a1d1f2667e1aa8c2f91cb061ac"}, + {file = "brotli-1.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:963a08f3bebd8b75ac57661045402da15991468a621f014be54e50f53a58d19e"}, + {file = "brotli-1.2.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9322b9f8656782414b37e6af884146869d46ab85158201d82bab9abbcb971dc7"}, + {file = "brotli-1.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cf9cba6f5b78a2071ec6fb1e7bd39acf35071d90a81231d67e92d637776a6a63"}, + {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7547369c4392b47d30a3467fe8c3330b4f2e0f7730e45e3103d7d636678a808b"}, + {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1530af5c3c275b8524f2e24841cbe2599d74462455e9bae5109e9ff42e9361"}, + {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2d085ded05278d1c7f65560aae97b3160aeb2ea2c0b3e26204856beccb60888"}, + {file = "brotli-1.2.0-cp314-cp314-win32.whl", hash = "sha256:832c115a020e463c2f67664560449a7bea26b0c1fdd690352addad6d0a08714d"}, + {file = "brotli-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:e7c0af964e0b4e3412a0ebf341ea26ec767fa0b4cf81abb5e897c9338b5ad6a3"}, + {file = "brotli-1.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:82676c2781ecf0ab23833796062786db04648b7aae8be139f6b8065e5e7b1518"}, + {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c16ab1ef7bb55651f5836e8e62db1f711d55b82ea08c3b8083ff037157171a69"}, + {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e85190da223337a6b7431d92c799fca3e2982abd44e7b8dec69938dcc81c8e9e"}, + {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d8c05b1dfb61af28ef37624385b0029df902ca896a639881f594060b30ffc9a7"}, + {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:465a0d012b3d3e4f1d6146ea019b5c11e3e87f03d1676da1cc3833462e672fb0"}, + {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:96fbe82a58cdb2f872fa5d87dedc8477a12993626c446de794ea025bbda625ea"}, + {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:1b71754d5b6eda54d16fbbed7fce2d8bc6c052a1b91a35c320247946ee103502"}, + {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:66c02c187ad250513c2f4fce973ef402d22f80e0adce734ee4e4efd657b6cb64"}, + {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:ba76177fd318ab7b3b9bf6522be5e84c2ae798754b6cc028665490f6e66b5533"}, + {file = "brotli-1.2.0-cp36-cp36m-win32.whl", hash = "sha256:c1702888c9f3383cc2f09eb3e88b8babf5965a54afb79649458ec7c3c7a63e96"}, + {file = "brotli-1.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f8d635cafbbb0c61327f942df2e3f474dde1cff16c3cd0580564774eaba1ee13"}, + {file = "brotli-1.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e80a28f2b150774844c8b454dd288be90d76ba6109670fe33d7ff54d96eb5cb8"}, + {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b1b799f45da91292ffaa21a473ab3a3054fa78560e8ff67082a185274431c8"}, + {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29b7e6716ee4ea0c59e3b241f682204105f7da084d6254ec61886508efeb43bc"}, + {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:640fe199048f24c474ec6f3eae67c48d286de12911110437a36a87d7c89573a6"}, + {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:92edab1e2fd6cd5ca605f57d4545b6599ced5dea0fd90b2bcdf8b247a12bd190"}, + {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7274942e69b17f9cef76691bcf38f2b2d4c8a5f5dba6ec10958363dcb3308a0a"}, + {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:a56ef534b66a749759ebd091c19c03ef81eb8cd96f0d1d16b59127eaf1b97a12"}, + {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5732eff8973dd995549a18ecbd8acd692ac611c5c0bb3f59fa3541ae27b33be3"}, + {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:598e88c736f63a0efec8363f9eb34e5b5536b7b6b1821e401afcb501d881f59a"}, + {file = "brotli-1.2.0-cp37-cp37m-win32.whl", hash = "sha256:7ad8cec81f34edf44a1c6a7edf28e7b7806dfb8886e371d95dcf789ccd4e4982"}, + {file = "brotli-1.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:865cedc7c7c303df5fad14a57bc5db1d4f4f9b2b4d0a7523ddd206f00c121a16"}, + {file = "brotli-1.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ac27a70bda257ae3f380ec8310b0a06680236bea547756c277b5dfe55a2452a8"}, + {file = "brotli-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e813da3d2d865e9793ef681d3a6b66fa4b7c19244a45b817d0cceda67e615990"}, + {file = "brotli-1.2.0-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9fe11467c42c133f38d42289d0861b6b4f9da31e8087ca2c0d7ebb4543625526"}, + {file = "brotli-1.2.0-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d6770111d1879881432f81c369de5cde6e9467be7c682a983747ec800544e2"}, + {file = "brotli-1.2.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:eda5a6d042c698e28bda2507a89b16555b9aa954ef1d750e1c20473481aff675"}, + {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3173e1e57cebb6d1de186e46b5680afbd82fd4301d7b2465beebe83ed317066d"}, + {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:71a66c1c9be66595d628467401d5976158c97888c2c9379c034e1e2312c5b4f5"}, + {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:1e68cdf321ad05797ee41d1d09169e09d40fdf51a725bb148bff892ce04583d7"}, + {file = "brotli-1.2.0-cp38-cp38-win32.whl", hash = "sha256:f16dace5e4d3596eaeb8af334b4d2c820d34b8278da633ce4a00020b2eac981c"}, + {file = "brotli-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:14ef29fc5f310d34fc7696426071067462c9292ed98b5ff5a27ac70a200e5470"}, + {file = "brotli-1.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8d4f47f284bdd28629481c97b5f29ad67544fa258d9091a6ed1fda47c7347cd1"}, + {file = "brotli-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2881416badd2a88a7a14d981c103a52a23a276a553a8aacc1346c2ff47c8dc17"}, + {file = "brotli-1.2.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2d39b54b968f4b49b5e845758e202b1035f948b0561ff5e6385e855c96625971"}, + {file = "brotli-1.2.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:95db242754c21a88a79e01504912e537808504465974ebb92931cfca2510469e"}, + {file = "brotli-1.2.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bba6e7e6cfe1e6cb6eb0b7c2736a6059461de1fa2c0ad26cf845de6c078d16c8"}, + {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:88ef7d55b7bcf3331572634c3fd0ed327d237ceb9be6066810d39020a3ebac7a"}, + {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7fa18d65a213abcfbb2f6cafbb4c58863a8bd6f2103d65203c520ac117d1944b"}, + {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:09ac247501d1909e9ee47d309be760c89c990defbb2e0240845c892ea5ff0de4"}, + {file = "brotli-1.2.0-cp39-cp39-win32.whl", hash = "sha256:c25332657dee6052ca470626f18349fc1fe8855a56218e19bd7a8c6ad4952c49"}, + {file = "brotli-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:1ce223652fd4ed3eb2b7f78fbea31c52314baecfac68db44037bb4167062a937"}, + {file = "brotli-1.2.0.tar.gz", hash = "sha256:e310f77e41941c13340a95976fe66a8a95b01e783d430eeaf7a2f87e0a57dd0a"}, +] + [[package]] name = "cachetools" version = "5.5.2" @@ -3402,6 +3512,45 @@ files = [ [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "runstats" +version = "2.0.0" +description = "Compute statistics and regression in one pass" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "runstats-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79efa663eb47eb480d75f12889590646f7f823169dda386c986be03310cfcc34"}, + {file = "runstats-2.0.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:09cf60f075b6e03d39fbcdfd14835d9fca985e78315334e589af4840e45e04f5"}, + {file = "runstats-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2ec49f15b276cce89ffddedebe95741136b0e309ed68108c1bf33f7295973143"}, + {file = "runstats-2.0.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:748d43cc2712b319e4244c9af275f4d78513e388ff23d14eb36ae30c1e15f2ec"}, + {file = "runstats-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:9d645bebdf788ea82c2c921462cce8b5d4bda72192bde511b81816082aca9c25"}, + {file = "runstats-2.0.0-cp36-cp36m-win32.whl", hash = "sha256:9741af3341f087686db4758e2266f26da36ad44bb49039dee43edc97930ac32e"}, + {file = "runstats-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5fb4f07a3bd665335c9e4f00389585fe98203b3ff32a0e743a1ce728c22855de"}, + {file = "runstats-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:200297eed4d7f0192eb324d3c634672c9268e2e603f06b372968a849a30c2dfd"}, + {file = "runstats-2.0.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:aa71c332ab533e482f62bc7308e0474a87582986c0aecb1015f3a922b5c8283e"}, + {file = "runstats-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7be16c6b781e27f0f931a2a3bc970b00c86790342b804a6a38041a88ef71ba63"}, + {file = "runstats-2.0.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:328e1ea2be82a264e09091bd6f4513a47bed131b3ab0f654f8153d853f2978c3"}, + {file = "runstats-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:52985af2b92bb080f886e911f7bc593970aa10fd8febadcfae2e14f8b0ff9b36"}, + {file = "runstats-2.0.0-cp37-cp37m-win32.whl", hash = "sha256:e52da241b932d56e9f9f947d6e0ab3d71fefc31fe27b610c220e82fc44b4383f"}, + {file = "runstats-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2b20f6aa911b812948ac3b886c0d78ea4c7acac5d615bffcf863d11711f91c52"}, + {file = "runstats-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3ca82450e7aaef6f0f0a6332e17bc8723f3beae8b430ce32df1fd7ea624b81a5"}, + {file = "runstats-2.0.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:7212a39a457c9858acdaf895f2e3a4f4cb5085c2f5d018498c8904ec83fcfcfb"}, + {file = "runstats-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:dc631f2f1640de2abbd6db48210e4804acc46e00104c6239435d240e67f94c2f"}, + {file = "runstats-2.0.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:51c903801765b97b657ccbbba5941a5ad10491ee4e1c071cce4025f20af4b0e8"}, + {file = "runstats-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:5da7acb950243c3215c4569773d4dc30da746d49c73f14916d83b3bcfc75d7ad"}, + {file = "runstats-2.0.0-cp38-cp38-win32.whl", hash = "sha256:c51efa5f1427445b0fdf404b133b407d7ebda2143c090ed60968b975903068c7"}, + {file = "runstats-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:deb75dd5966f6c0a944b4a4f65fbb8f6d67e1c479f6a6c666cdb7cfdec03a731"}, + {file = "runstats-2.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c8b2dee3c02c32efab95b0b615a1ba24400e56db4d71591f8367120066d62ffa"}, + {file = "runstats-2.0.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:8d47a09a5274f89e709853584527ef5eefbb7f10668c802eb17d82742533a7dc"}, + {file = "runstats-2.0.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:c133803edbb5d6f23cfb4ca05cea2e74d9ea35b1451a6b3de22987649fd0cf27"}, + {file = "runstats-2.0.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:ed6e1f1839ed73bfc35ae8fae2d0e6deb826dcbc993f30a620dbb83eb2f07556"}, + {file = "runstats-2.0.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:571dc4a6abc733da2b36e72b19b0b1743adab142882966e062ee7b5487a29d9a"}, + {file = "runstats-2.0.0-cp39-cp39-win32.whl", hash = "sha256:8c412ade7596f1afd6be5b5d634a55c4affd3c4305d05fcfa6a0accdf60edc16"}, + {file = "runstats-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:bb60dbf78a6270e89aad50708075ca57c3d0e07d2d91ae6b07f53fa9a4e91d13"}, + {file = "runstats-2.0.0.tar.gz", hash = "sha256:0f9a5e6cc9938bbac3474b17727ffc29fbf5895f33e55ce8843341e0821e77c2"}, +] + [[package]] name = "s3transfer" version = "0.11.3" @@ -3826,6 +3975,18 @@ dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx_rtd_theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=25.3.0,<25.4.0)", "pycodestyle (>=2.11.0,<2.12.0)"] +[[package]] +name = "w3lib" +version = "2.3.1" +description = "Library of web-related functions" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "w3lib-2.3.1-py3-none-any.whl", hash = "sha256:9ccd2ae10c8c41c7279cd8ad4fe65f834be894fe7bfdd7304b991fd69325847b"}, + {file = "w3lib-2.3.1.tar.gz", hash = "sha256:5c8ac02a3027576174c2b61eb9a2170ba1b197cae767080771b6f1febda249a4"}, +] + [[package]] name = "webencodings" version = "0.5.1" @@ -4216,10 +4377,34 @@ idna = ">=2.0" multidict = ">=4.0" propcache = ">=0.2.1" +[[package]] +name = "zyte-api" +version = "0.8.1" +description = "Python interface to Zyte API" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "zyte_api-0.8.1-py3-none-any.whl", hash = "sha256:59565fae3898ffbd1962f260b32e9de957900d89bb8c16ddf1578c22d4fecbb7"}, + {file = "zyte_api-0.8.1.tar.gz", hash = "sha256:38d78e11e528c8b3f86c786f3ec88b1bab9dd8365ed2bcfc54bb41113963c31e"}, +] + +[package.dependencies] +aiohttp = ">=3.8.0" +attrs = ">=20.1.0" +brotli = ">=0.5.2" +runstats = ">=0.0.1" +tenacity = ">=8.2.0" +tqdm = ">=4.16.0" +w3lib = ">=2.1.1" + +[package.extras] +x402 = ["eth-account (>=0.13.7)", "x402 (>=0.1.1)"] + [extras] windows = ["python-magic-bin"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "fd313f6c346780816956f7ae6c2d107600f08810c25798d517c8f4db226788e1" +content-hash = "a30e96ee89828e5798043b83f62b3f737bd9e12d0e92b6b8a0c5a280cfd97176" diff --git a/pyproject.toml b/pyproject.toml index fc79b60..da85308 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,14 +38,12 @@ markdown = "^3.8" asyncpraw = "^7.8.1" html-telegraph-poster-v2 = "^0.2.5" fastfetchbot-telegram-bot = "*" - pytest = "^8.3.5" firecrawl-py = "^4.13.0" +zyte-api = "^0.8.1" [tool.poetry.group.dev] optional = true [tool.poetry.group.dev.dependencies] -#html-telegraph-poster-v2 = { path = "../html-telegraph-poster-v2/" } -#fastfetchbot-telegram-bot = { path = "../FastFetchBot-Telegram-Bot/" } black = "^25.1.0" pytest = "^8.3.5" pytest-asyncio = "^0.26.0" diff --git a/template.env b/template.env index ab4da38..3c519a1 100644 --- a/template.env +++ b/template.env @@ -103,3 +103,21 @@ REDDIT_CLIENT_SECRET= REDDIT_PASSWORD= REDDIT_USERNAME= FXZHIHU_HOST= + +# General Webpage Scraping +# Enable general webpage scraping for unrecognized URLs. Default: `false` +GENERAL_SCRAPING_ON=false + +# The scraping API backend to use. Options: `FIRECRAWL`, `ZYTE`. Default: `FIRECRAWL` +GENERAL_SCRAPING_API=FIRECRAWL + +# Firecrawl API +# The URL of the Firecrawl API server. Default: `` +FIRECRAWL_API_URL= + +# The API key for Firecrawl. Default: `` +FIRECRAWL_API_KEY= + +# Zyte API +# The API key for Zyte. Default: `None` +ZYTE_API_KEY= From bd906af4c59b2978ac217c69c30aa2dda5efb786 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 14 Feb 2026 15:42:03 -0600 Subject: [PATCH 2/8] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e3354ca..d9fa219 100644 --- a/.gitignore +++ b/.gitignore @@ -257,3 +257,4 @@ conf/* .run/Template Python tests.run.xml /.run/ .DS_Store +/.claude/ From ad6ec6cb10db7438ee92e9bb7561875dfad0f8c1 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 14 Feb 2026 17:07:23 -0600 Subject: [PATCH 3/8] fix: html tag sanitizing for general scraping --- app/config.py | 1 + app/services/scrapers/general/base.py | 48 +++++++++++++++++-- app/services/scrapers/general/firecrawl.py | 9 ++++ .../scrapers/general/firecrawl_client.py | 21 ++++++-- app/utils/parse.py | 46 ++++++++++++------ template.env | 3 ++ 6 files changed, 105 insertions(+), 23 deletions(-) diff --git a/app/config.py b/app/config.py index 1e0b3fc..50fd18d 100644 --- a/app/config.py +++ b/app/config.py @@ -218,6 +218,7 @@ def ban_list_resolver(ban_list_string: str) -> list: # Firecrawl API FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") +FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering # Zyte API diff --git a/app/services/scrapers/general/base.py b/app/services/scrapers/general/base.py index 53620c8..ba1c944 100644 --- a/app/services/scrapers/general/base.py +++ b/app/services/scrapers/general/base.py @@ -3,6 +3,7 @@ from typing import Optional from urllib.parse import urlparse +from bs4 import BeautifulSoup, Doctype from openai import AsyncOpenAI from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam @@ -15,7 +16,7 @@ GENERAL_TEXT_LIMIT = 800 -DEFAULT_OPENAI_MODEL = "gpt-4o-mini" +DEFAULT_OPENAI_MODEL = "gpt-5-nano" # System prompt for LLM to extract article content ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. @@ -27,7 +28,7 @@ 4. Keep important formatting like bold, italic, links, and images 5. Return clean HTML containing only the article content 6. If you cannot identify the main content, return the original HTML unchanged -7. remove some basic HTML tags like , ,