From 868896f579b45b51a7a3443cab50161b3b012bd1 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 14:41:39 -0600
Subject: [PATCH 1/8] feat: add zyte API

---
 app/config.py                                 |  16 +-
 .../{firecrawl_client => general}/__init__.py |  11 +-
 .../scraper.py => general/base.py}            | 145 +++++++-------
 app/services/scrapers/general/firecrawl.py    |  56 ++++++
 .../client.py => general/firecrawl_client.py} |  27 +--
 app/services/scrapers/general/scraper.py      |  86 ++++++++
 app/services/scrapers/general/zyte.py         |  78 ++++++++
 app/services/scrapers/scraper_manager.py      |  18 +-
 app/services/telegram_bot/__init__.py         |   8 +-
 app/templates/social_media_message.jinja2     |   6 +-
 poetry.lock                                   | 187 +++++++++++++++++-
 pyproject.toml                                |   4 +-
 template.env                                  |  18 ++
 13 files changed, 537 insertions(+), 123 deletions(-)
 rename app/services/scrapers/{firecrawl_client => general}/__init__.py (71%)
 rename app/services/scrapers/{firecrawl_client/scraper.py => general/base.py} (68%)
 create mode 100644 app/services/scrapers/general/firecrawl.py
 rename app/services/scrapers/{firecrawl_client/client.py => general/firecrawl_client.py} (68%)
 create mode 100644 app/services/scrapers/general/scraper.py
 create mode 100644 app/services/scrapers/general/zyte.py

diff --git a/app/config.py b/app/config.py
index 04cb826..1e0b3fc 100644
--- a/app/config.py
+++ b/app/config.py
@@ -208,16 +208,22 @@ def ban_list_resolver(ban_list_string: str) -> list:
 INOREADER_EMAIL = env.get("INOREADER_EMAIL", None)
 INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None)
 
-# Open AI API environment variables
+# Open AI API
 OPENAI_API_KEY = env.get("OPENAI_API_KEY", None)
 
-# Firecrawl API environment variables
-FIRECRAWL_ON = get_env_bool(env, "FIRECRAWL_ON", False)
+# General webpage scraping
+GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False)
+GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL")
+
+# Firecrawl API
 FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "")
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
-FIRECRAWL_TIMEOUT_SECONDS = env.get("FIRECRAWL_TIMEOUT_SECONDS", 60)
 
-# Locale environment variables
+
+# Zyte API
+ZYTE_API_KEY = env.get("ZYTE_API_KEY", None)
+
+# Locale directory and gettext translation setup
 localedir = os.path.join(os.path.dirname(__file__), "locale")
 translation = gettext.translation("messages", localedir=localedir, fallback=True)
 _ = translation.gettext
diff --git a/app/services/scrapers/firecrawl_client/__init__.py b/app/services/scrapers/general/__init__.py
similarity index 71%
rename from app/services/scrapers/firecrawl_client/__init__.py
rename to app/services/scrapers/general/__init__.py
index d26a87a..94c0402 100644
--- a/app/services/scrapers/firecrawl_client/__init__.py
+++ b/app/services/scrapers/general/__init__.py
@@ -5,17 +5,18 @@
 
 
 @dataclass
-class FirecrawlItem(MetadataItem):
+class GeneralItem(MetadataItem):
     """
-    FirecrawlItem: Data class for scraped content from Firecrawl.
+    GeneralItem: Data class for scraped content from general webpage scrapers.
     """
     id: str = ""
     raw_content: str = ""
+    scraper_type: str = ""  # Which scraper was used (e.g., "firecrawl", "zyte", etc.)
 
     @staticmethod
-    def from_dict(obj: Any) -> "FirecrawlItem":
+    def from_dict(obj: Any) -> "GeneralItem":
         metadata_item = MetadataItem.from_dict(obj)
-        return FirecrawlItem(
+        return GeneralItem(
             url=metadata_item.url,
             title=metadata_item.title,
             author=metadata_item.author,
@@ -28,10 +29,12 @@ def from_dict(obj: Any) -> "FirecrawlItem":
             message_type=metadata_item.message_type,
             id=obj.get("id", ""),
             raw_content=obj.get("raw_content", ""),
+            scraper_type=obj.get("scraper_type", ""),
         )
 
     def to_dict(self) -> dict:
         result: dict = super().to_dict()
         result["id"] = self.id
         result["raw_content"] = self.raw_content
+        result["scraper_type"] = self.scraper_type
         return result
diff --git a/app/services/scrapers/firecrawl_client/scraper.py b/app/services/scrapers/general/base.py
similarity index 68%
rename from app/services/scrapers/firecrawl_client/scraper.py
rename to app/services/scrapers/general/base.py
index 8efc261..832ef29 100644
--- a/app/services/scrapers/firecrawl_client/scraper.py
+++ b/app/services/scrapers/general/base.py
@@ -1,4 +1,6 @@
 import hashlib
+from abc import abstractmethod
+from typing import Optional
 from urllib.parse import urlparse
 
 from openai import AsyncOpenAI
@@ -7,12 +9,13 @@
 from app.config import OPENAI_API_KEY
 from app.models.metadata_item import MediaFile, MessageType
 from app.services.scrapers.scraper import Scraper, DataProcessor
-from app.services.scrapers.firecrawl_client import FirecrawlItem
-from app.services.scrapers.firecrawl_client.client import FirecrawlClient
+from app.services.scrapers.general import GeneralItem
 from app.utils.parse import get_html_text_length, wrap_text_into_html
 from app.utils.logger import logger
 
-FIRECRAWL_TEXT_LIMIT = 800
+GENERAL_TEXT_LIMIT = 800
+
+DEFAULT_OPENAI_MODEL = "gpt-4o-mini"
 
 # System prompt for LLM to extract article content
 ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML.
@@ -28,9 +31,10 @@
 Return ONLY the extracted HTML content, no explanations or markdown."""
 
 
-class FirecrawlDataProcessor(DataProcessor):
+class BaseGeneralDataProcessor(DataProcessor):
     """
-    FirecrawlDataProcessor: Process URLs using Firecrawl to extract content.
+    Base class for general webpage data processors.
+    Each specific scraper (Firecrawl, Zyte, etc.) should inherit from this class.
     """
 
     def __init__(self, url: str):
@@ -38,27 +42,71 @@ def __init__(self, url: str):
         self._data: dict = {}
         self.url_parser = urlparse(url)
         self.id = hashlib.md5(url.encode()).hexdigest()[:16]
-        self._client: FirecrawlClient = FirecrawlClient.get_instance()
+        self.scraper_type: str = "base"
 
     async def get_item(self) -> dict:
         await self.process_data()
-        firecrawl_item = FirecrawlItem.from_dict(self._data)
-        return firecrawl_item.to_dict()
+        general_item = GeneralItem.from_dict(self._data)
+        return general_item.to_dict()
 
     async def process_data(self) -> None:
         await self._get_page_content()
 
+    @abstractmethod
     async def _get_page_content(self) -> None:
-        try:
-            result = self._client.scrape_url(
-                url=self.url,
-                formats=["markdown", "html"],
-                only_main_content=True,
-            )
-            await self._process_firecrawl_result(result)
-        except Exception as e:
-            logger.error(f"Failed to scrape URL with Firecrawl: {e}")
-            raise
+        """Subclasses must implement this method to fetch page content."""
+        pass
+
+    async def _build_item_data(
+        self,
+        title: str,
+        author: str,
+        description: str,
+        markdown_content: str,
+        html_content: str,
+        og_image: Optional[str] = None,
+    ) -> None:
+        """
+        Assemble the item dict from scraped fields and store it in ``self._data``.
+        """
+        item_data = {
+            "id": self.id,
+            "category": "other",
+            "url": self.url,
+            "title": title or self.url,  # empty title falls back to the URL
+            "author": author or self.url_parser.netloc,  # empty author falls back to the host
+            "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
+            "scraper_type": self.scraper_type,
+        }
+
+        # Short text: the description, else the first 500 characters of the markdown
+        text = description if description else markdown_content[:500]
+        item_data["text"] = text
+
+        # Run HTML through the LLM article extractor; without HTML, wrap the markdown instead
+        if html_content:
+            cleaned_html = await self.parsing_article_body_by_llm(html_content)
+            content = wrap_text_into_html(cleaned_html, is_html=True)
+        else:
+            content = wrap_text_into_html(markdown_content, is_html=False)
+        item_data["content"] = content
+        item_data["raw_content"] = markdown_content
+
+        # Media files: only the og:image (when provided) is attached
+        media_files = []
+        if og_image:
+            media_files.append(MediaFile(url=og_image, media_type="image"))
+
+        item_data["media_files"] = [m.to_dict() for m in media_files]
+
+        # LONG when get_html_text_length(content) exceeds GENERAL_TEXT_LIMIT, else SHORT
+        item_data["message_type"] = (
+            MessageType.LONG
+            if get_html_text_length(content) > GENERAL_TEXT_LIMIT
+            else MessageType.SHORT
+        )
+
+        self._data = item_data
 
     @staticmethod
     async def parsing_article_body_by_llm(html_content: str) -> str:
@@ -66,7 +114,7 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
         Use LLM to extract the main article content from HTML.
 
         Args:
-            html_content: Raw HTML content from Firecrawl
+            html_content: Raw HTML content from a scraper
 
         Returns:
             Cleaned HTML containing only the main article content
@@ -86,7 +134,7 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
             truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content
 
             response = await client.chat.completions.create(
-                model="gpt-4o-mini",
+                model=DEFAULT_OPENAI_MODEL,
                 messages=[
                     ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT),
                     ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}")
@@ -108,61 +156,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
             logger.error(f"Failed to parse article body with LLM: {e}")
             return html_content
 
-    async def _process_firecrawl_result(self, result: dict) -> None:
-        metadata = result.get("metadata", {})
-        markdown_content = result.get("markdown", "")
-        html_content = result.get("html", "")
-
-        # Extract metadata fields
-        title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url
-        author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc
-        description = metadata.get("description", "") or metadata.get("ogDescription", "")
-
-        item_data = {
-            "id": self.id,
-            "category": "other",
-            "url": self.url,
-            "title": title,
-            "author": author,
-            "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
-        }
-
-        # Process text content - use description or first part of markdown
-        text = description if description else markdown_content[:500]
-        item_data["text"] = text
-
-        html_content = await self.parsing_article_body_by_llm(html_content)
-
-        # Process HTML content
-        if html_content:
-            content = wrap_text_into_html(html_content, is_html=True)
-        else:
-            content = wrap_text_into_html(markdown_content, is_html=False)
-        item_data["content"] = content
-        item_data["raw_content"] = markdown_content
-
-        # Process media files - extract og:image if available
-        media_files = []
-        og_image = metadata.get("ogImage")
-        if og_image:
-            media_files.append(MediaFile(url=og_image, media_type="image"))
-
-        item_data["media_files"] = [m.to_dict() for m in media_files]
-
-        # Determine message type based on text length
-        item_data["message_type"] = (
-            MessageType.LONG
-            if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT
-            else MessageType.SHORT
-        )
-
-        self._data = item_data
-
 
-class FirecrawlScraper(Scraper):
+class BaseGeneralScraper(Scraper):
     """
-    FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping.
+    Base class for general webpage scrapers.
     """
 
+    @abstractmethod
     async def get_processor_by_url(self, url: str) -> DataProcessor:
-        return FirecrawlDataProcessor(url)
+        pass
diff --git a/app/services/scrapers/general/firecrawl.py b/app/services/scrapers/general/firecrawl.py
new file mode 100644
index 0000000..b1d9d40
--- /dev/null
+++ b/app/services/scrapers/general/firecrawl.py
@@ -0,0 +1,76 @@
+import asyncio
+
+from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper
+from app.services.scrapers.general.firecrawl_client import FirecrawlClient
+from app.services.scrapers.scraper import DataProcessor
+from app.utils.logger import logger
+
+
+class FirecrawlDataProcessor(BaseGeneralDataProcessor):
+    """
+    FirecrawlDataProcessor: Process URLs using Firecrawl to extract content.
+    """
+
+    def __init__(self, url: str):
+        super().__init__(url)
+        self.scraper_type = "firecrawl"
+        self._client: FirecrawlClient = FirecrawlClient.get_instance()
+
+    async def _get_page_content(self) -> None:
+        """
+        Scrape ``self.url`` with Firecrawl and build the item data from the result.
+
+        Raises:
+            Exception: any scraping/processing failure is logged and re-raised.
+        """
+        try:
+            # scrape_url is a blocking call; run it in a worker thread so the
+            # event loop stays responsive while Firecrawl fetches the page.
+            result = await asyncio.to_thread(
+                self._client.scrape_url,
+                url=self.url,
+                formats=["markdown", "html"],
+                only_main_content=True,
+            )
+            await self._process_firecrawl_result(result)
+        except Exception as e:
+            logger.error(f"Failed to scrape URL with Firecrawl: {e}")
+            raise
+
+    async def _process_firecrawl_result(self, result: dict) -> None:
+        """
+        Map a Firecrawl scrape result onto the common item fields.
+
+        Args:
+            result: Scrape result dict; the ``metadata``, ``markdown`` and
+                ``html`` keys are read, all of them optional.
+        """
+        # ``or`` also covers keys that are present but null/empty.
+        metadata = result.get("metadata") or {}
+        markdown_content = result.get("markdown") or ""
+        html_content = result.get("html") or ""
+
+        # Prefer the page's own fields, then the Open Graph equivalents.
+        title = metadata.get("title") or metadata.get("ogTitle") or ""
+        author = metadata.get("author") or metadata.get("ogSiteName") or ""
+        description = metadata.get("description") or metadata.get("ogDescription") or ""
+        og_image = metadata.get("ogImage")
+
+        await self._build_item_data(
+            title=title,
+            author=author,
+            description=description,
+            markdown_content=markdown_content,
+            html_content=html_content,
+            og_image=og_image,
+        )
+
+
+class FirecrawlScraper(BaseGeneralScraper):
+    """
+    FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping.
+    """
+
+    async def get_processor_by_url(self, url: str) -> DataProcessor:
+        """Return a ``FirecrawlDataProcessor`` bound to ``url``."""
+        return FirecrawlDataProcessor(url)
diff --git a/app/services/scrapers/firecrawl_client/client.py b/app/services/scrapers/general/firecrawl_client.py
similarity index 68%
rename from app/services/scrapers/firecrawl_client/client.py
rename to app/services/scrapers/general/firecrawl_client.py
index 7389996..454ca63 100644
--- a/app/services/scrapers/firecrawl_client/client.py
+++ b/app/services/scrapers/general/firecrawl_client.py
@@ -6,14 +6,13 @@
 
 from firecrawl import Firecrawl
 
-from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY, FIRECRAWL_TIMEOUT_SECONDS
+from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY
 
 
 @dataclass(frozen=True)
 class FirecrawlSettings:
     api_url: str
     api_key: str
-    timeout_seconds: int = 60  # 你也可以在反代侧控制超时
 
 
 class FirecrawlClient:
@@ -33,10 +32,7 @@ def __init__(self, config: FirecrawlSettings):
 
     @staticmethod
     def _create_app(config: FirecrawlSettings) -> Firecrawl:
-        try:
-            return Firecrawl(api_url=config.api_url, api_key=config.api_key)
-        except TypeError:
-            return Firecrawl(api_url=config.api_url, api_key=config.api_key)
+        return Firecrawl(api_url=config.api_url, api_key=config.api_key)
 
     @classmethod
     def get_instance(cls) -> "FirecrawlClient":
@@ -55,7 +51,6 @@ def get_instance(cls) -> "FirecrawlClient":
             config = FirecrawlSettings(
                 api_url=FIRECRAWL_API_URL,
                 api_key=FIRECRAWL_API_KEY,
-                timeout_seconds=FIRECRAWL_TIMEOUT_SECONDS,
             )
 
             cls._instance = cls(config)
@@ -72,24 +67,14 @@ def scrape_url(
             url: str,
             formats: Optional[List[str]] = None,
             only_main_content: bool = True,
-            timeout_seconds: Optional[int] = None,
-            extra_params: Optional[Dict[str, Any]] = None,
+            timeout: Optional[int] = None,
     ) -> Dict[str, Any]:
         """
-        单页抓取（最常用）
+        Scrape a single page; ``timeout`` is in milliseconds.
         """
-        params: Dict[str, Any] = {
-            "formats": formats or ["markdown"],
-            "onlyMainContent": only_main_content,
-        }
-        if extra_params:
-            params.update(extra_params)
-
-        # if timeout_seconds is None:
-        #     timeout_seconds = self._settings.timeout_seconds
-
         try:
-            return self._app.scrape(url, formats=formats, only_main_content=only_main_content).model_dump(
+            return self._app.scrape(url, formats=formats, only_main_content=only_main_content,
+                                    timeout=timeout).model_dump(
                 exclude_none=True)
         except Exception as e:
             raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e
diff --git a/app/services/scrapers/general/scraper.py b/app/services/scrapers/general/scraper.py
new file mode 100644
index 0000000..17d9c38
--- /dev/null
+++ b/app/services/scrapers/general/scraper.py
@@ -0,0 +1,93 @@
+from typing import Optional
+
+from app.config import GENERAL_SCRAPING_API
+from app.services.scrapers.scraper import Scraper, DataProcessor
+from app.services.scrapers.general.base import BaseGeneralScraper
+from app.services.scrapers.general.firecrawl import FirecrawlScraper
+from app.services.scrapers.general.zyte import ZyteScraper
+from app.utils.logger import logger
+
+
+class GeneralScraper(Scraper):
+    """
+    GeneralScraper: A wrapper scraper that delegates to the configured scraper implementation.
+
+    This class acts as a factory/facade that selects the appropriate scraper
+    based on the GENERAL_SCRAPING_API configuration.
+
+    Supported scrapers:
+    - FIRECRAWL: Uses Firecrawl API for scraping
+    - ZYTE: Uses Zyte API for scraping
+    """
+
+    # Registry of available scrapers
+    SCRAPER_REGISTRY: dict[str, type[BaseGeneralScraper]] = {
+        "FIRECRAWL": FirecrawlScraper,
+        "ZYTE": ZyteScraper,
+    }
+
+    def __init__(self, scraper_type: Optional[str] = None):
+        """
+        Initialize the GeneralScraper with a specific scraper type.
+
+        Args:
+            scraper_type: The type of scraper to use. If None, uses GENERAL_SCRAPING_API config.
+        """
+        self.scraper_type = scraper_type or GENERAL_SCRAPING_API
+        self._scraper: Optional[BaseGeneralScraper] = None
+        self._init_scraper()
+
+    def _init_scraper(self) -> None:
+        """
+        Initialize the underlying scraper based on scraper_type.
+
+        The registry lookup ignores case and surrounding whitespace. An unknown
+        type falls back to Firecrawl, and ``self.scraper_type`` is updated so the
+        attribute and the final log line name the backend actually in use.
+        """
+        scraper_class = self.SCRAPER_REGISTRY.get(self.scraper_type.strip().upper())
+
+        if scraper_class is None:
+            available = ", ".join(self.SCRAPER_REGISTRY)
+            logger.error(f"Unknown scraper type: {self.scraper_type}. Available: {available}")
+            # Fall back to Firecrawl as default
+            logger.info("Falling back to FIRECRAWL scraper")
+            scraper_class = FirecrawlScraper
+            self.scraper_type = "FIRECRAWL"
+
+        self._scraper = scraper_class()
+        logger.info(f"Initialized GeneralScraper with {self.scraper_type} backend")
+
+    async def get_processor_by_url(self, url: str) -> DataProcessor:
+        """
+        Get the appropriate data processor for the given URL.
+
+        Args:
+            url: The URL to scrape
+
+        Returns:
+            DataProcessor instance for processing the URL
+        """
+        return await self._scraper.get_processor_by_url(url)
+
+    @classmethod
+    def register_scraper(cls, name: str, scraper_class: type[BaseGeneralScraper]) -> None:
+        """
+        Register a new scraper type.
+
+        Args:
+            name: The name to register the scraper under (e.g., "ZYTE")
+            scraper_class: The scraper class to register
+        """
+        cls.SCRAPER_REGISTRY[name.strip().upper()] = scraper_class
+        logger.info(f"Registered new scraper: {name}")
+
+    @classmethod
+    def get_available_scrapers(cls) -> list[str]:
+        """
+        Get a list of available scraper types.
+
+        Returns:
+            List of registered scraper names
+        """
+        return list(cls.SCRAPER_REGISTRY)
diff --git a/app/services/scrapers/general/zyte.py b/app/services/scrapers/general/zyte.py
new file mode 100644
index 0000000..1ff00a5
--- /dev/null
+++ b/app/services/scrapers/general/zyte.py
@@ -0,0 +1,97 @@
+from zyte_api import AsyncZyteAPI
+
+from app.config import ZYTE_API_KEY
+from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper
+from app.services.scrapers.scraper import DataProcessor
+from app.utils.logger import logger
+
+
+class ZyteDataProcessor(BaseGeneralDataProcessor):
+    """
+    ZyteDataProcessor: Process URLs using Zyte API to extract content.
+    """
+
+    def __init__(self, url: str):
+        super().__init__(url)
+        self.scraper_type = "zyte"
+
+    async def _get_page_content(self) -> None:
+        """
+        Fetch ``self.url`` through the Zyte API and build the item data from the result.
+
+        Raises:
+            RuntimeError: if ZYTE_API_KEY is not configured.
+            Exception: any request/processing failure is logged and re-raised.
+        """
+        if not ZYTE_API_KEY:
+            raise RuntimeError("ZYTE_API_KEY is not configured")
+
+        try:
+            client = AsyncZyteAPI(api_key=ZYTE_API_KEY)
+            result = await client.get(
+                {
+                    "url": self.url,
+                    "browserHtml": True,
+                    "article": True,
+                    "articleOptions": {"extractFrom": "browserHtml"},
+                }
+            )
+            await self._process_zyte_result(result)
+        except Exception as e:
+            logger.error(f"Failed to scrape URL with Zyte: {e}")
+            raise
+
+    async def _process_zyte_result(self, result: dict) -> None:
+        """
+        Map a Zyte API response onto the common item fields.
+
+        Args:
+            result: Response dict; the ``article`` and ``browserHtml`` keys are
+                read. Missing or null values are treated as empty.
+        """
+        # ``or`` also covers keys that are present but null, which a plain
+        # ``.get(key, default)`` would pass through as None.
+        article = result.get("article") or {}
+        browser_html = result.get("browserHtml") or ""
+
+        # Extract metadata fields from article
+        title = article.get("headline") or article.get("name") or ""
+
+        # Extract author information (first listed author only)
+        authors = article.get("authors") or []
+        first_author = authors[0] if authors else None
+        author = (first_author or {}).get("name") or ""
+
+        # Plain-text body. NOTE(review): assumed to be exposed as ``articleBodyRaw``
+        # and/or ``articleBody`` - confirm the field names against the Zyte API reference.
+        article_body_raw = article.get("articleBodyRaw") or article.get("articleBody") or ""
+        article_body_html = article.get("articleBodyHtml") or ""
+
+        description = article.get("description") or article_body_raw[:500]
+
+        # Use article body HTML if available, otherwise fall back to browser HTML
+        html_content = article_body_html or browser_html
+        markdown_content = article_body_raw
+
+        # Extract main image
+        main_image = article.get("mainImage") or {}
+        og_image = main_image.get("url")
+
+        await self._build_item_data(
+            title=title,
+            author=author,
+            description=description,
+            markdown_content=markdown_content,
+            html_content=html_content,
+            og_image=og_image,
+        )
+
+
+class ZyteScraper(BaseGeneralScraper):
+    """
+    ZyteScraper: Scraper implementation using Zyte API for generic URL scraping.
+    """
+
+    async def get_processor_by_url(self, url: str) -> DataProcessor:
+        """Return a ``ZyteDataProcessor`` bound to ``url``."""
+        return ZyteDataProcessor(url)
diff --git a/app/services/scrapers/scraper_manager.py b/app/services/scrapers/scraper_manager.py
index b58bd72..aac7add 100644
--- a/app/services/scrapers/scraper_manager.py
+++ b/app/services/scrapers/scraper_manager.py
@@ -3,7 +3,7 @@
 from app.utils.logger import logger
 from app.services.scrapers.bluesky.scraper import BlueskyScraper
 from app.services.scrapers.weibo.scraper import WeiboScraper
-from app.services.scrapers.firecrawl_client.scraper import FirecrawlScraper
+from app.services.scrapers.general.scraper import GeneralScraper
 from app.config import (
     BLUESKY_USERNAME, BLUESKY_PASSWORD
 )
@@ -13,12 +13,12 @@ class ScraperManager:
 
     bluesky_scraper: Optional[BlueskyScraper] = None
     weibo_scraper: Optional[WeiboScraper] = None
-    firecrawl_scraper: Optional[FirecrawlScraper] = None
+    general_scraper: Optional[GeneralScraper] = None
 
     scrapers = {"bluesky": bluesky_scraper,
                 "weibo": weibo_scraper,
-                "other": firecrawl_scraper,
-                "unknown": firecrawl_scraper}
+                "other": general_scraper,
+                "unknown": general_scraper}
 
     @classmethod
     async def init_scrapers(cls):
@@ -32,8 +32,8 @@ async def init_scraper(cls, category: str) -> None:
                 scraper = await cls.init_bluesky_scraper()
             elif category == "weibo" and not cls.weibo_scraper:
                 scraper = await cls.init_weibo_scraper()
-            elif category in ["other", "unknown"] and not cls.firecrawl_scraper:
-                scraper = await cls.init_firecrawl_scraper()
+            elif category in ["other", "unknown"] and not cls.general_scraper:
+                scraper = await cls.init_general_scraper()
             if scraper:
                 cls.scrapers[category] = scraper
         else:
@@ -52,7 +52,7 @@ async def init_weibo_scraper(cls) -> WeiboScraper:
         return weibo_scraper
 
     @classmethod
-    async def init_firecrawl_scraper(cls) -> FirecrawlScraper:
-        firecrawl_scraper = FirecrawlScraper()
-        return firecrawl_scraper
+    async def init_general_scraper(cls) -> GeneralScraper:
+        general_scraper = GeneralScraper()
+        return general_scraper
 
diff --git a/app/services/telegram_bot/__init__.py b/app/services/telegram_bot/__init__.py
index dafd924..6fc9ace 100755
--- a/app/services/telegram_bot/__init__.py
+++ b/app/services/telegram_bot/__init__.py
@@ -68,7 +68,7 @@
     JINJA2_ENV,
     OPENAI_API_KEY,
     DATABASE_ON,
-    TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, FIRECRAWL_ON,
+    TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, GENERAL_SCRAPING_ON,
 )
 from app.services.telegram_bot.config import (
     HTTPS_URL_REGEX,
@@ -207,7 +207,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None:
             )
             return
         if url_metadata.source == "unknown":
-            if FIRECRAWL_ON:
+            if GENERAL_SCRAPING_ON:
                 await process_message.edit_text(
                     text=f"Uncategorized url found. General webpage parser is on, Processing..."
                 )
@@ -348,7 +348,7 @@ async def https_url_auto_process(update: Update, context: CallbackContext) -> No
         url_metadata = await get_url_metadata(
             url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST
         )
-        if url_metadata.source == "unknown" and FIRECRAWL_ON:
+        if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON:
             metadata_item = await content_process_function(url_metadata=url_metadata)
             await send_item_message(
                 metadata_item, chat_id=message.chat_id, message=message
@@ -475,7 +475,7 @@ async def _create_choose_channel_keyboard(data: dict) -> list:
 async def invalid_buttons(update: Update, context: CallbackContext) -> None:
     await update.callback_query.answer("Invalid button!")
     await update.effective_message.edit_text(
-        "Sorry, Error Occured, I could not process this button click 😕."
+        "Sorry, Error Occurred, I could not process this button click 😕."
     )
 
 
diff --git a/app/templates/social_media_message.jinja2 b/app/templates/social_media_message.jinja2
index c6a282d..6e27fcb 100644
--- a/app/templates/social_media_message.jinja2
+++ b/app/templates/social_media_message.jinja2
@@ -1,9 +1,9 @@
 {# templates/social_media_message.html #}
 
 {% if data.message_type == "short" %}
-   {% if data.title %}
-<b>{{ data.title }}</b>
-   {% endif %}
+{#   {% if data.title %}#}
+{#<b>{{ data.title }}</b>#}
+{#   {% endif %}#}
 {{ data.text }}
     {% if data.category in ['youtube', 'bilibili'] %}
     {% endif %}
diff --git a/poetry.lock b/poetry.lock
index 045b721..a0adab8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -568,6 +568,116 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >
 [package.extras]
 crt = ["awscrt (==0.23.4)"]
 
+[[package]]
+name = "brotli"
+version = "1.2.0"
+description = "Python bindings for the Brotli compression library"
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "brotli-1.2.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:99cfa69813d79492f0e5d52a20fd18395bc82e671d5d40bd5a91d13e75e468e8"},
+    {file = "brotli-1.2.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3ebe801e0f4e56d17cd386ca6600573e3706ce1845376307f5d2cbd32149b69a"},
+    {file = "brotli-1.2.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:a387225a67f619bf16bd504c37655930f910eb03675730fc2ad69d3d8b5e7e92"},
+    {file = "brotli-1.2.0-cp27-cp27m-win32.whl", hash = "sha256:b908d1a7b28bc72dfb743be0d4d3f8931f8309f810af66c906ae6cd4127c93cb"},
+    {file = "brotli-1.2.0-cp27-cp27m-win_amd64.whl", hash = "sha256:d206a36b4140fbb5373bf1eb73fb9de589bb06afd0d22376de23c5e91d0ab35f"},
+    {file = "brotli-1.2.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7e9053f5fb4e0dfab89243079b3e217f2aea4085e4d58c5c06115fc34823707f"},
+    {file = "brotli-1.2.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:4735a10f738cb5516905a121f32b24ce196ab82cfc1e4ba2e3ad1b371085fd46"},
+    {file = "brotli-1.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b90b767916ac44e93a8e28ce6adf8d551e43affb512f2377c732d486ac6514e"},
+    {file = "brotli-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6be67c19e0b0c56365c6a76e393b932fb0e78b3b56b711d180dd7013cb1fd984"},
+    {file = "brotli-1.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0bbd5b5ccd157ae7913750476d48099aaf507a79841c0d04a9db4415b14842de"},
+    {file = "brotli-1.2.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3f3c908bcc404c90c77d5a073e55271a0a498f4e0756e48127c35d91cf155947"},
+    {file = "brotli-1.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b557b29782a643420e08d75aea889462a4a8796e9a6cf5621ab05a3f7da8ef2"},
+    {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81da1b229b1889f25adadc929aeb9dbc4e922bd18561b65b08dd9343cfccca84"},
+    {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ff09cd8c5eec3b9d02d2408db41be150d8891c5566addce57513bf546e3d6c6d"},
+    {file = "brotli-1.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a1778532b978d2536e79c05dac2d8cd857f6c55cd0c95ace5b03740824e0e2f1"},
+    {file = "brotli-1.2.0-cp310-cp310-win32.whl", hash = "sha256:b232029d100d393ae3c603c8ffd7e3fe6f798c5e28ddca5feabb8e8fdb732997"},
+    {file = "brotli-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:ef87b8ab2704da227e83a246356a2b179ef826f550f794b2c52cddb4efbd0196"},
+    {file = "brotli-1.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:15b33fe93cedc4caaff8a0bd1eb7e3dab1c61bb22a0bf5bdfdfd97cd7da79744"},
+    {file = "brotli-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:898be2be399c221d2671d29eed26b6b2713a02c2119168ed914e7d00ceadb56f"},
+    {file = "brotli-1.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350c8348f0e76fff0a0fd6c26755d2653863279d086d3aa2c290a6a7251135dd"},
+    {file = "brotli-1.2.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1ad3fda65ae0d93fec742a128d72e145c9c7a99ee2fcd667785d99eb25a7fe"},
+    {file = "brotli-1.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:40d918bce2b427a0c4ba189df7a006ac0c7277c180aee4617d99e9ccaaf59e6a"},
+    {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2a7f1d03727130fc875448b65b127a9ec5d06d19d0148e7554384229706f9d1b"},
+    {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9c79f57faa25d97900bfb119480806d783fba83cd09ee0b33c17623935b05fa3"},
+    {file = "brotli-1.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:844a8ceb8483fefafc412f85c14f2aae2fb69567bf2a0de53cdb88b73e7c43ae"},
+    {file = "brotli-1.2.0-cp311-cp311-win32.whl", hash = "sha256:aa47441fa3026543513139cb8926a92a8e305ee9c71a6209ef7a97d91640ea03"},
+    {file = "brotli-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:022426c9e99fd65d9475dce5c195526f04bb8be8907607e27e747893f6ee3e24"},
+    {file = "brotli-1.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:35d382625778834a7f3061b15423919aa03e4f5da34ac8e02c074e4b75ab4f84"},
+    {file = "brotli-1.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7a61c06b334bd99bc5ae84f1eeb36bfe01400264b3c352f968c6e30a10f9d08b"},
+    {file = "brotli-1.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:acec55bb7c90f1dfc476126f9711a8e81c9af7fb617409a9ee2953115343f08d"},
+    {file = "brotli-1.2.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:260d3692396e1895c5034f204f0db022c056f9e2ac841593a4cf9426e2a3faca"},
+    {file = "brotli-1.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:072e7624b1fc4d601036ab3f4f27942ef772887e876beff0301d261210bca97f"},
+    {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adedc4a67e15327dfdd04884873c6d5a01d3e3b6f61406f99b1ed4865a2f6d28"},
+    {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7a47ce5c2288702e09dc22a44d0ee6152f2c7eda97b3c8482d826a1f3cfc7da7"},
+    {file = "brotli-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:af43b8711a8264bb4e7d6d9a6d004c3a2019c04c01127a868709ec29962b6036"},
+    {file = "brotli-1.2.0-cp312-cp312-win32.whl", hash = "sha256:e99befa0b48f3cd293dafeacdd0d191804d105d279e0b387a32054c1180f3161"},
+    {file = "brotli-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:b35c13ce241abdd44cb8ca70683f20c0c079728a36a996297adb5334adfc1c44"},
+    {file = "brotli-1.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9e5825ba2c9998375530504578fd4d5d1059d09621a02065d1b6bfc41a8e05ab"},
+    {file = "brotli-1.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0cf8c3b8ba93d496b2fae778039e2f5ecc7cff99df84df337ca31d8f2252896c"},
+    {file = "brotli-1.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8565e3cdc1808b1a34714b553b262c5de5fbda202285782173ec137fd13709f"},
+    {file = "brotli-1.2.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:26e8d3ecb0ee458a9804f47f21b74845cc823fd1bb19f02272be70774f56e2a6"},
+    {file = "brotli-1.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67a91c5187e1eec76a61625c77a6c8c785650f5b576ca732bd33ef58b0dff49c"},
+    {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4ecdb3b6dc36e6d6e14d3a1bdc6c1057c8cbf80db04031d566eb6080ce283a48"},
+    {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3e1b35d56856f3ed326b140d3c6d9db91740f22e14b06e840fe4bb1923439a18"},
+    {file = "brotli-1.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54a50a9dad16b32136b2241ddea9e4df159b41247b2ce6aac0b3276a66a8f1e5"},
+    {file = "brotli-1.2.0-cp313-cp313-win32.whl", hash = "sha256:1b1d6a4efedd53671c793be6dd760fcf2107da3a52331ad9ea429edf0902f27a"},
+    {file = "brotli-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:b63daa43d82f0cdabf98dee215b375b4058cce72871fd07934f179885aad16e8"},
+    {file = "brotli-1.2.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:6c12dad5cd04530323e723787ff762bac749a7b256a5bece32b2243dd5c27b21"},
+    {file = "brotli-1.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3219bd9e69868e57183316ee19c84e03e8f8b5a1d1f2667e1aa8c2f91cb061ac"},
+    {file = "brotli-1.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:963a08f3bebd8b75ac57661045402da15991468a621f014be54e50f53a58d19e"},
+    {file = "brotli-1.2.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9322b9f8656782414b37e6af884146869d46ab85158201d82bab9abbcb971dc7"},
+    {file = "brotli-1.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cf9cba6f5b78a2071ec6fb1e7bd39acf35071d90a81231d67e92d637776a6a63"},
+    {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7547369c4392b47d30a3467fe8c3330b4f2e0f7730e45e3103d7d636678a808b"},
+    {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1530af5c3c275b8524f2e24841cbe2599d74462455e9bae5109e9ff42e9361"},
+    {file = "brotli-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2d085ded05278d1c7f65560aae97b3160aeb2ea2c0b3e26204856beccb60888"},
+    {file = "brotli-1.2.0-cp314-cp314-win32.whl", hash = "sha256:832c115a020e463c2f67664560449a7bea26b0c1fdd690352addad6d0a08714d"},
+    {file = "brotli-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:e7c0af964e0b4e3412a0ebf341ea26ec767fa0b4cf81abb5e897c9338b5ad6a3"},
+    {file = "brotli-1.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:82676c2781ecf0ab23833796062786db04648b7aae8be139f6b8065e5e7b1518"},
+    {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c16ab1ef7bb55651f5836e8e62db1f711d55b82ea08c3b8083ff037157171a69"},
+    {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e85190da223337a6b7431d92c799fca3e2982abd44e7b8dec69938dcc81c8e9e"},
+    {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d8c05b1dfb61af28ef37624385b0029df902ca896a639881f594060b30ffc9a7"},
+    {file = "brotli-1.2.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:465a0d012b3d3e4f1d6146ea019b5c11e3e87f03d1676da1cc3833462e672fb0"},
+    {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:96fbe82a58cdb2f872fa5d87dedc8477a12993626c446de794ea025bbda625ea"},
+    {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:1b71754d5b6eda54d16fbbed7fce2d8bc6c052a1b91a35c320247946ee103502"},
+    {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:66c02c187ad250513c2f4fce973ef402d22f80e0adce734ee4e4efd657b6cb64"},
+    {file = "brotli-1.2.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:ba76177fd318ab7b3b9bf6522be5e84c2ae798754b6cc028665490f6e66b5533"},
+    {file = "brotli-1.2.0-cp36-cp36m-win32.whl", hash = "sha256:c1702888c9f3383cc2f09eb3e88b8babf5965a54afb79649458ec7c3c7a63e96"},
+    {file = "brotli-1.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f8d635cafbbb0c61327f942df2e3f474dde1cff16c3cd0580564774eaba1ee13"},
+    {file = "brotli-1.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e80a28f2b150774844c8b454dd288be90d76ba6109670fe33d7ff54d96eb5cb8"},
+    {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b1b799f45da91292ffaa21a473ab3a3054fa78560e8ff67082a185274431c8"},
+    {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29b7e6716ee4ea0c59e3b241f682204105f7da084d6254ec61886508efeb43bc"},
+    {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:640fe199048f24c474ec6f3eae67c48d286de12911110437a36a87d7c89573a6"},
+    {file = "brotli-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:92edab1e2fd6cd5ca605f57d4545b6599ced5dea0fd90b2bcdf8b247a12bd190"},
+    {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7274942e69b17f9cef76691bcf38f2b2d4c8a5f5dba6ec10958363dcb3308a0a"},
+    {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:a56ef534b66a749759ebd091c19c03ef81eb8cd96f0d1d16b59127eaf1b97a12"},
+    {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5732eff8973dd995549a18ecbd8acd692ac611c5c0bb3f59fa3541ae27b33be3"},
+    {file = "brotli-1.2.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:598e88c736f63a0efec8363f9eb34e5b5536b7b6b1821e401afcb501d881f59a"},
+    {file = "brotli-1.2.0-cp37-cp37m-win32.whl", hash = "sha256:7ad8cec81f34edf44a1c6a7edf28e7b7806dfb8886e371d95dcf789ccd4e4982"},
+    {file = "brotli-1.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:865cedc7c7c303df5fad14a57bc5db1d4f4f9b2b4d0a7523ddd206f00c121a16"},
+    {file = "brotli-1.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ac27a70bda257ae3f380ec8310b0a06680236bea547756c277b5dfe55a2452a8"},
+    {file = "brotli-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e813da3d2d865e9793ef681d3a6b66fa4b7c19244a45b817d0cceda67e615990"},
+    {file = "brotli-1.2.0-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9fe11467c42c133f38d42289d0861b6b4f9da31e8087ca2c0d7ebb4543625526"},
+    {file = "brotli-1.2.0-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d6770111d1879881432f81c369de5cde6e9467be7c682a983747ec800544e2"},
+    {file = "brotli-1.2.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:eda5a6d042c698e28bda2507a89b16555b9aa954ef1d750e1c20473481aff675"},
+    {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3173e1e57cebb6d1de186e46b5680afbd82fd4301d7b2465beebe83ed317066d"},
+    {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:71a66c1c9be66595d628467401d5976158c97888c2c9379c034e1e2312c5b4f5"},
+    {file = "brotli-1.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:1e68cdf321ad05797ee41d1d09169e09d40fdf51a725bb148bff892ce04583d7"},
+    {file = "brotli-1.2.0-cp38-cp38-win32.whl", hash = "sha256:f16dace5e4d3596eaeb8af334b4d2c820d34b8278da633ce4a00020b2eac981c"},
+    {file = "brotli-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:14ef29fc5f310d34fc7696426071067462c9292ed98b5ff5a27ac70a200e5470"},
+    {file = "brotli-1.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8d4f47f284bdd28629481c97b5f29ad67544fa258d9091a6ed1fda47c7347cd1"},
+    {file = "brotli-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2881416badd2a88a7a14d981c103a52a23a276a553a8aacc1346c2ff47c8dc17"},
+    {file = "brotli-1.2.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2d39b54b968f4b49b5e845758e202b1035f948b0561ff5e6385e855c96625971"},
+    {file = "brotli-1.2.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:95db242754c21a88a79e01504912e537808504465974ebb92931cfca2510469e"},
+    {file = "brotli-1.2.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bba6e7e6cfe1e6cb6eb0b7c2736a6059461de1fa2c0ad26cf845de6c078d16c8"},
+    {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:88ef7d55b7bcf3331572634c3fd0ed327d237ceb9be6066810d39020a3ebac7a"},
+    {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7fa18d65a213abcfbb2f6cafbb4c58863a8bd6f2103d65203c520ac117d1944b"},
+    {file = "brotli-1.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:09ac247501d1909e9ee47d309be760c89c990defbb2e0240845c892ea5ff0de4"},
+    {file = "brotli-1.2.0-cp39-cp39-win32.whl", hash = "sha256:c25332657dee6052ca470626f18349fc1fe8855a56218e19bd7a8c6ad4952c49"},
+    {file = "brotli-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:1ce223652fd4ed3eb2b7f78fbea31c52314baecfac68db44037bb4167062a937"},
+    {file = "brotli-1.2.0.tar.gz", hash = "sha256:e310f77e41941c13340a95976fe66a8a95b01e783d430eeaf7a2f87e0a57dd0a"},
+]
+
 [[package]]
 name = "cachetools"
 version = "5.5.2"
@@ -3402,6 +3512,45 @@ files = [
 [package.dependencies]
 requests = ">=2.0.1,<3.0.0"
 
+[[package]]
+name = "runstats"
+version = "2.0.0"
+description = "Compute statistics and regression in one pass"
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+    {file = "runstats-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79efa663eb47eb480d75f12889590646f7f823169dda386c986be03310cfcc34"},
+    {file = "runstats-2.0.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:09cf60f075b6e03d39fbcdfd14835d9fca985e78315334e589af4840e45e04f5"},
+    {file = "runstats-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2ec49f15b276cce89ffddedebe95741136b0e309ed68108c1bf33f7295973143"},
+    {file = "runstats-2.0.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:748d43cc2712b319e4244c9af275f4d78513e388ff23d14eb36ae30c1e15f2ec"},
+    {file = "runstats-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:9d645bebdf788ea82c2c921462cce8b5d4bda72192bde511b81816082aca9c25"},
+    {file = "runstats-2.0.0-cp36-cp36m-win32.whl", hash = "sha256:9741af3341f087686db4758e2266f26da36ad44bb49039dee43edc97930ac32e"},
+    {file = "runstats-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5fb4f07a3bd665335c9e4f00389585fe98203b3ff32a0e743a1ce728c22855de"},
+    {file = "runstats-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:200297eed4d7f0192eb324d3c634672c9268e2e603f06b372968a849a30c2dfd"},
+    {file = "runstats-2.0.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:aa71c332ab533e482f62bc7308e0474a87582986c0aecb1015f3a922b5c8283e"},
+    {file = "runstats-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7be16c6b781e27f0f931a2a3bc970b00c86790342b804a6a38041a88ef71ba63"},
+    {file = "runstats-2.0.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:328e1ea2be82a264e09091bd6f4513a47bed131b3ab0f654f8153d853f2978c3"},
+    {file = "runstats-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:52985af2b92bb080f886e911f7bc593970aa10fd8febadcfae2e14f8b0ff9b36"},
+    {file = "runstats-2.0.0-cp37-cp37m-win32.whl", hash = "sha256:e52da241b932d56e9f9f947d6e0ab3d71fefc31fe27b610c220e82fc44b4383f"},
+    {file = "runstats-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2b20f6aa911b812948ac3b886c0d78ea4c7acac5d615bffcf863d11711f91c52"},
+    {file = "runstats-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3ca82450e7aaef6f0f0a6332e17bc8723f3beae8b430ce32df1fd7ea624b81a5"},
+    {file = "runstats-2.0.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:7212a39a457c9858acdaf895f2e3a4f4cb5085c2f5d018498c8904ec83fcfcfb"},
+    {file = "runstats-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:dc631f2f1640de2abbd6db48210e4804acc46e00104c6239435d240e67f94c2f"},
+    {file = "runstats-2.0.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:51c903801765b97b657ccbbba5941a5ad10491ee4e1c071cce4025f20af4b0e8"},
+    {file = "runstats-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:5da7acb950243c3215c4569773d4dc30da746d49c73f14916d83b3bcfc75d7ad"},
+    {file = "runstats-2.0.0-cp38-cp38-win32.whl", hash = "sha256:c51efa5f1427445b0fdf404b133b407d7ebda2143c090ed60968b975903068c7"},
+    {file = "runstats-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:deb75dd5966f6c0a944b4a4f65fbb8f6d67e1c479f6a6c666cdb7cfdec03a731"},
+    {file = "runstats-2.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c8b2dee3c02c32efab95b0b615a1ba24400e56db4d71591f8367120066d62ffa"},
+    {file = "runstats-2.0.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:8d47a09a5274f89e709853584527ef5eefbb7f10668c802eb17d82742533a7dc"},
+    {file = "runstats-2.0.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:c133803edbb5d6f23cfb4ca05cea2e74d9ea35b1451a6b3de22987649fd0cf27"},
+    {file = "runstats-2.0.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:ed6e1f1839ed73bfc35ae8fae2d0e6deb826dcbc993f30a620dbb83eb2f07556"},
+    {file = "runstats-2.0.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:571dc4a6abc733da2b36e72b19b0b1743adab142882966e062ee7b5487a29d9a"},
+    {file = "runstats-2.0.0-cp39-cp39-win32.whl", hash = "sha256:8c412ade7596f1afd6be5b5d634a55c4affd3c4305d05fcfa6a0accdf60edc16"},
+    {file = "runstats-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:bb60dbf78a6270e89aad50708075ca57c3d0e07d2d91ae6b07f53fa9a4e91d13"},
+    {file = "runstats-2.0.0.tar.gz", hash = "sha256:0f9a5e6cc9938bbac3474b17727ffc29fbf5895f33e55ce8843341e0821e77c2"},
+]
+
 [[package]]
 name = "s3transfer"
 version = "0.11.3"
@@ -3826,6 +3975,18 @@ dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"]
 docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx_rtd_theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
 test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=25.3.0,<25.4.0)", "pycodestyle (>=2.11.0,<2.12.0)"]
 
+[[package]]
+name = "w3lib"
+version = "2.3.1"
+description = "Library of web-related functions"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "w3lib-2.3.1-py3-none-any.whl", hash = "sha256:9ccd2ae10c8c41c7279cd8ad4fe65f834be894fe7bfdd7304b991fd69325847b"},
+    {file = "w3lib-2.3.1.tar.gz", hash = "sha256:5c8ac02a3027576174c2b61eb9a2170ba1b197cae767080771b6f1febda249a4"},
+]
+
 [[package]]
 name = "webencodings"
 version = "0.5.1"
@@ -4216,10 +4377,34 @@ idna = ">=2.0"
 multidict = ">=4.0"
 propcache = ">=0.2.1"
 
+[[package]]
+name = "zyte-api"
+version = "0.8.1"
+description = "Python interface to Zyte API"
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "zyte_api-0.8.1-py3-none-any.whl", hash = "sha256:59565fae3898ffbd1962f260b32e9de957900d89bb8c16ddf1578c22d4fecbb7"},
+    {file = "zyte_api-0.8.1.tar.gz", hash = "sha256:38d78e11e528c8b3f86c786f3ec88b1bab9dd8365ed2bcfc54bb41113963c31e"},
+]
+
+[package.dependencies]
+aiohttp = ">=3.8.0"
+attrs = ">=20.1.0"
+brotli = ">=0.5.2"
+runstats = ">=0.0.1"
+tenacity = ">=8.2.0"
+tqdm = ">=4.16.0"
+w3lib = ">=2.1.1"
+
+[package.extras]
+x402 = ["eth-account (>=0.13.7)", "x402 (>=0.1.1)"]
+
 [extras]
 windows = ["python-magic-bin"]
 
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.11,<3.14"
-content-hash = "fd313f6c346780816956f7ae6c2d107600f08810c25798d517c8f4db226788e1"
+content-hash = "a30e96ee89828e5798043b83f62b3f737bd9e12d0e92b6b8a0c5a280cfd97176"
diff --git a/pyproject.toml b/pyproject.toml
index fc79b60..da85308 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,14 +38,12 @@ markdown = "^3.8"
 asyncpraw = "^7.8.1"
 html-telegraph-poster-v2 = "^0.2.5"
 fastfetchbot-telegram-bot = "*"
-
 pytest = "^8.3.5"
 firecrawl-py = "^4.13.0"
+zyte-api = "^0.8.1"
 [tool.poetry.group.dev]
 optional = true
 [tool.poetry.group.dev.dependencies]
-#html-telegraph-poster-v2 = { path = "../html-telegraph-poster-v2/" }
-#fastfetchbot-telegram-bot = { path = "../FastFetchBot-Telegram-Bot/" }
 black = "^25.1.0"
 pytest = "^8.3.5"
 pytest-asyncio = "^0.26.0"
diff --git a/template.env b/template.env
index ab4da38..3c519a1 100644
--- a/template.env
+++ b/template.env
@@ -103,3 +103,21 @@ REDDIT_CLIENT_SECRET=
 REDDIT_PASSWORD=
 REDDIT_USERNAME=
 FXZHIHU_HOST=
+
+# General Webpage Scraping
+# Enable general webpage scraping for unrecognized URLs. Default: `false`
+GENERAL_SCRAPING_ON=false
+
+# The scraping API backend to use. Options: `FIRECRAWL`, `ZYTE`. Default: `FIRECRAWL`
+GENERAL_SCRAPING_API=FIRECRAWL
+
+# Firecrawl API
+# The URL of the Firecrawl API server. Default: ``
+FIRECRAWL_API_URL=
+
+# The API key for Firecrawl. Default: ``
+FIRECRAWL_API_KEY=
+
+# Zyte API
+# The API key for Zyte. Default: `None`
+ZYTE_API_KEY=

From bd906af4c59b2978ac217c69c30aa2dda5efb786 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 15:42:03 -0600
Subject: [PATCH 2/8] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index e3354ca..d9fa219 100644
--- a/.gitignore
+++ b/.gitignore
@@ -257,3 +257,4 @@ conf/*
 .run/Template Python tests.run.xml
 /.run/
 .DS_Store
+/.claude/

From ad6ec6cb10db7438ee92e9bb7561875dfad0f8c1 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 17:07:23 -0600
Subject: [PATCH 3/8] fix: html tag sanitizing for general scraping

---
 app/config.py                                 |  1 +
 app/services/scrapers/general/base.py         | 48 +++++++++++++++++--
 app/services/scrapers/general/firecrawl.py    |  9 ++++
 .../scrapers/general/firecrawl_client.py      | 21 ++++++--
 app/utils/parse.py                            | 46 ++++++++++++------
 template.env                                  |  3 ++
 6 files changed, 105 insertions(+), 23 deletions(-)

diff --git a/app/config.py b/app/config.py
index 1e0b3fc..50fd18d 100644
--- a/app/config.py
+++ b/app/config.py
@@ -218,6 +218,7 @@ def ban_list_resolver(ban_list_string: str) -> list:
 # Firecrawl API
 FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "")
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
+FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", None) or 3000)  # milliseconds to wait for JS rendering; a blank/unset value falls back to 3000
 
 
 # Zyte API
diff --git a/app/services/scrapers/general/base.py b/app/services/scrapers/general/base.py
index 53620c8..ba1c944 100644
--- a/app/services/scrapers/general/base.py
+++ b/app/services/scrapers/general/base.py
@@ -3,6 +3,7 @@
 from typing import Optional
 from urllib.parse import urlparse
 
+from bs4 import BeautifulSoup, Doctype
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam
 
@@ -15,7 +16,7 @@
 
 GENERAL_TEXT_LIMIT = 800
 
-DEFAULT_OPENAI_MODEL = "gpt-4o-mini"
+DEFAULT_OPENAI_MODEL = "gpt-5-nano"
 
 # System prompt for LLM to extract article content
 ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML.
@@ -27,7 +28,7 @@
 4. Keep important formatting like bold, italic, links, and images
 5. Return clean HTML containing only the article content
 6. If you cannot identify the main content, return the original HTML unchanged
-7. remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body>
+7. After all of the above, remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body>
 
 Return ONLY the extracted HTML content, no explanations or markdown."""
 
@@ -81,12 +82,15 @@ async def _build_item_data(
         }
 
         # Process text content - use description or first part of markdown
+        # Strip any HTML tags to ensure plain text for Telegram short messages
         text = description if description else markdown_content[:500]
+        text = BeautifulSoup(text, "html.parser").get_text()
         item_data["text"] = text
 
-        # Process HTML content with LLM if available
+        # Process HTML content with LLM if available, then sanitize deterministically
         if html_content:
             cleaned_html = await self.parsing_article_body_by_llm(html_content)
+            cleaned_html = self.sanitize_html(cleaned_html)
             content = wrap_text_into_html(cleaned_html, is_html=True)
         else:
             content = wrap_text_into_html(markdown_content, is_html=False)
@@ -109,6 +113,42 @@ async def _build_item_data(
 
         self._data = item_data
 
+    @staticmethod
+    def sanitize_html(html_content: str) -> str:
+        """
+        Deterministic HTML sanitizer that strips a fixed set of non-content tags.
+
+        This runs AFTER the LLM extraction as a safety net — the LLM is unreliable,
+        and when it fails (or when OPENAI_API_KEY is not set), raw Firecrawl HTML
+        (including <!DOCTYPE>, <script>, etc.) passes through unchanged.
+
+        Blocklist-based: any tag not named below (p, h1-h6, a, b/strong, i/em, ul, ol,
+        li, blockquote, pre, code, img, br, table, ...) is kept, attributes untouched.
+        """
+        if not html_content:
+            return html_content
+
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove DOCTYPE declarations (iterate a copy: extract() mutates soup.contents)
+        for item in list(soup.contents):
+            if isinstance(item, Doctype):
+                item.extract()
+
+        # Remove tags that should be destroyed with all their content
+        for tag_name in ["script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Unwrap structural/layout tags — keep their text content, discard the tag itself
+        for tag_name in ["html", "body", "div", "span", "section", "article", "nav",
+                         "header", "footer", "main", "aside", "figure", "figcaption",
+                         "details", "summary", "dd", "dt", "dl"]:
+            for tag in soup.find_all(tag_name):
+                tag.unwrap()
+
+        return str(soup).strip()
+
     @staticmethod
     async def parsing_article_body_by_llm(html_content: str) -> str:
         """
@@ -141,7 +181,7 @@ async def parsing_article_body_by_llm(html_content: str) -> str:
                     ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}")
                 ],
                 temperature=0.1,
-                max_tokens=16000,
+                max_completion_tokens=10000,
             )
 
             extracted_content = response.choices[0].message.content
diff --git a/app/services/scrapers/general/firecrawl.py b/app/services/scrapers/general/firecrawl.py
index b1d9d40..197d18d 100644
--- a/app/services/scrapers/general/firecrawl.py
+++ b/app/services/scrapers/general/firecrawl.py
@@ -1,8 +1,15 @@
+from app.config import FIRECRAWL_WAIT_FOR
 from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper
 from app.services.scrapers.general.firecrawl_client import FirecrawlClient
 from app.services.scrapers.scraper import DataProcessor
 from app.utils.logger import logger
 
+# HTML tags to exclude from Firecrawl output at the source
+FIRECRAWL_EXCLUDE_TAGS = [
+    "nav", "footer", "aside", "script", "style",
+    "noscript", "iframe", "svg", "form",
+]
+
 
 class FirecrawlDataProcessor(BaseGeneralDataProcessor):
     """
@@ -20,6 +27,8 @@ async def _get_page_content(self) -> None:
                 url=self.url,
                 formats=["markdown", "html"],
                 only_main_content=True,
+                exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
+                wait_for=FIRECRAWL_WAIT_FOR,
             )
             await self._process_firecrawl_result(result)
         except Exception as e:
diff --git a/app/services/scrapers/general/firecrawl_client.py b/app/services/scrapers/general/firecrawl_client.py
index 454ca63..6bf2747 100644
--- a/app/services/scrapers/general/firecrawl_client.py
+++ b/app/services/scrapers/general/firecrawl_client.py
@@ -68,13 +68,26 @@ def scrape_url(
             formats: Optional[List[str]] = None,
             only_main_content: bool = True,
             timeout: Optional[int] = None,
+            exclude_tags: Optional[List[str]] = None,
+            wait_for: Optional[int] = None,
     ) -> Dict[str, Any]:
         """
-        timeout: milliseconds
+        Args:
+            url: The URL to scrape.
+            formats: Output formats (e.g. ["markdown", "html"]).
+            only_main_content: If True, extract only the main content.
+            timeout: Request timeout in milliseconds.
+            exclude_tags: HTML tag names to exclude from output (e.g. ["nav", "footer"]).
+            wait_for: Time in milliseconds to wait for JS rendering before scraping.
         """
         try:
-            return self._app.scrape(url, formats=formats, only_main_content=only_main_content,
-                                    timeout=timeout).model_dump(
-                exclude_none=True)
+            return self._app.scrape(
+                url,
+                formats=formats,
+                only_main_content=only_main_content,
+                timeout=timeout,
+                exclude_tags=exclude_tags,
+                wait_for=wait_for,
+            ).model_dump(exclude_none=True)
         except Exception as e:
             raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e
diff --git a/app/utils/parse.py b/app/utils/parse.py
index 7727767..53c55e4 100644
--- a/app/utils/parse.py
+++ b/app/utils/parse.py
@@ -116,37 +116,54 @@ def wrap_text_into_html(text: str, is_html: bool = False) -> str:
         for item in soup.find_all("br"):
             item.replace_with("\n")
         text = str(soup)
-        print(text)
-    split_pivot = "\n" if is_html is False else "<br>"
-    text_list = text.split(split_pivot)
+    text_list = text.split("\n")
     text_list = [f"<p>{item}</p>" for item in text_list if item.strip() != ""]
     text = "".join(text_list)
     return text
 
 
 def telegram_message_html_trim(html_content: str, trim_length: int = TELEGRAM_TEXT_LIMIT) -> str:
-    # remove all img tag
+    from bs4 import Doctype
+
     soup = BeautifulSoup(html_content, "html.parser")
-    for img in soup.find_all("img"):
-        img.decompose()
-    for div in soup.find_all("div"):
-        div.unwrap()
-    for script in soup.find_all("script"):
-        script.decompose()
-    html_content = str(soup)
+
+    # Remove DOCTYPE declarations (iterate over a copy: extract() mutates soup.contents)
+    for item in list(soup.contents):
+        if isinstance(item, Doctype):
+            item.extract()
+
+    # Decompose tags that should be removed entirely (with their content)
+    for tag_name in ["img", "script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]:
+        for tag in soup.find_all(tag_name):
+            tag.decompose()
+
+    # Unwrap structural/layout tags — keep their text, discard the wrapper
+    for tag_name in ["div", "span", "section", "article", "nav", "header", "footer",
+                     "main", "aside", "figure", "figcaption", "html", "body"]:
+        for tag in soup.find_all(tag_name):
+            tag.unwrap()
+
+    # Convert headings (h1-h6) to bold text; no line break is inserted
+    for level in range(1, 7):
+        for tag in soup.find_all(f"h{level}"):
+            tag.name = "b"
+
+    # Unwrap <p> tags (keep text content)
+    for tag in soup.find_all("p"):
+        tag.unwrap()
+
+    html_content = str(soup).strip()
 
     if len(html_content) <= trim_length:
         return html_content
 
-        # Initial trimming
+    # Initial trimming
     trimmed_content = html_content[:trim_length]
-    remaining_content = html_content[trim_length:]
 
     # Find the position of the last complete tag in the trimmed content
     last_complete_pos = trimmed_content.rfind('<')
     if last_complete_pos != -1:
         trimmed_content = trimmed_content[:last_complete_pos]
-        remaining_content = html_content[last_complete_pos:] + remaining_content
 
     # Remove any incomplete tags by ensuring each tag is closed
     cleaned_html = ''
@@ -182,7 +199,6 @@ def telegram_message_html_trim(html_content: str, trim_length: int = TELEGRAM_TE
     for tag in reversed(open_tags):
         cleaned_html += f'</{tag}>'
 
-    print(cleaned_html)
     return cleaned_html + ' ...'
 
 
diff --git a/template.env b/template.env
index 3c519a1..71c6a95 100644
--- a/template.env
+++ b/template.env
@@ -118,6 +118,9 @@ FIRECRAWL_API_URL=
 # The API key for Firecrawl. Default: ``
 FIRECRAWL_API_KEY=
 
+# Time in milliseconds to wait for JS rendering before scraping. Default: `3000`
+FIRECRAWL_WAIT_FOR=3000
+
 # Zyte API
 # The API key for Zyte. Default: `None`
 ZYTE_API_KEY=

From 32b08f83940ed2e988ec58f304ca04d18e67337e Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 18:27:46 -0600
Subject: [PATCH 4/8] chore: disable in group chat error message

---
 app/services/telegram_bot/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/app/services/telegram_bot/__init__.py b/app/services/telegram_bot/__init__.py
index 6fc9ace..5de80d8 100755
--- a/app/services/telegram_bot/__init__.py
+++ b/app/services/telegram_bot/__init__.py
@@ -353,7 +353,7 @@ async def https_url_auto_process(update: Update, context: CallbackContext) -> No
             await send_item_message(
                 metadata_item, chat_id=message.chat_id, message=message
             )
-        if url_metadata.source == "unknown" or url_metadata.source == "banned":
+        elif url_metadata.source == "unknown" or url_metadata.source == "banned":
             logger.debug(f"for the {i + 1}th url {url}, no supported url found.")
             return
         if url_metadata.to_dict().get("source") in SOCIAL_MEDIA_WEBSITE_PATTERNS.keys():
@@ -614,11 +614,11 @@ async def send_item_message(
     except Exception as e:
         logger.error(e)
         traceback.print_exc()
-        await application.bot.send_message(
-            chat_id=discussion_chat_id,
-            text="Error occurred while sending the item to the target 😕",
-            reply_to_message_id=message.message_id if message else None,
-        )
+        # await application.bot.send_message(
+        #     chat_id=discussion_chat_id,
+        #     text="Error occurred while sending the item to the target 😕",
+        #     reply_to_message_id=message.message_id if message else None,
+        # )
         await send_debug_channel(traceback.format_exc())
 
 

From 04382682d70bf064f0a23490cecde728075a39e2 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 18:28:07 -0600
Subject: [PATCH 5/8] fix: add fallback to text slice

---
 app/services/scrapers/general/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/services/scrapers/general/base.py b/app/services/scrapers/general/base.py
index ba1c944..1ab9360 100644
--- a/app/services/scrapers/general/base.py
+++ b/app/services/scrapers/general/base.py
@@ -83,7 +83,7 @@ async def _build_item_data(
 
         # Process text content - use description or first part of markdown
         # Strip any HTML tags to ensure plain text for Telegram short messages
-        text = description if description else markdown_content[:500]
+        text = description if description else (markdown_content or "")[:500]
         text = BeautifulSoup(text, "html.parser").get_text()
         item_data["text"] = text
 
@@ -93,7 +93,7 @@ async def _build_item_data(
             cleaned_html = self.sanitize_html(cleaned_html)
             content = wrap_text_into_html(cleaned_html, is_html=True)
         else:
-            content = wrap_text_into_html(markdown_content, is_html=False)
+            content = wrap_text_into_html(markdown_content or "", is_html=False)
         item_data["content"] = content
         item_data["raw_content"] = markdown_content
 

From ca6e9c048e858e504f1a27d7e4771834a4cf38c6 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 18:28:21 -0600
Subject: [PATCH 6/8] fix: fix class init

---
 app/services/scrapers/scraper_manager.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/app/services/scrapers/scraper_manager.py b/app/services/scrapers/scraper_manager.py
index aac7add..365bd41 100644
--- a/app/services/scrapers/scraper_manager.py
+++ b/app/services/scrapers/scraper_manager.py
@@ -42,17 +42,17 @@ async def init_scraper(cls, category: str) -> None:
 
     @classmethod
     async def init_bluesky_scraper(cls) -> BlueskyScraper:
-        bluesky_scraper = BlueskyScraper(username=BLUESKY_USERNAME, password=BLUESKY_PASSWORD)
-        await bluesky_scraper.init()
-        return bluesky_scraper
+        cls.bluesky_scraper = BlueskyScraper(username=BLUESKY_USERNAME, password=BLUESKY_PASSWORD)
+        await cls.bluesky_scraper.init()
+        return cls.bluesky_scraper
 
     @classmethod
     async def init_weibo_scraper(cls) -> WeiboScraper:
-        weibo_scraper = WeiboScraper()
-        return weibo_scraper
+        cls.weibo_scraper = WeiboScraper()
+        return cls.weibo_scraper
 
     @classmethod
     async def init_general_scraper(cls) -> GeneralScraper:
-        general_scraper = GeneralScraper()
-        return general_scraper
+        cls.general_scraper = GeneralScraper()
+        return cls.general_scraper
 

From 1155a6956d8ed0a0a6fa5e10b85ff8dd7fa30489 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 18:28:30 -0600
Subject: [PATCH 7/8] fix: fix Firecrawl SDK async

---
 app/services/scrapers/general/firecrawl.py        |  2 +-
 app/services/scrapers/general/firecrawl_client.py | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/app/services/scrapers/general/firecrawl.py b/app/services/scrapers/general/firecrawl.py
index 197d18d..093fe75 100644
--- a/app/services/scrapers/general/firecrawl.py
+++ b/app/services/scrapers/general/firecrawl.py
@@ -23,7 +23,7 @@ def __init__(self, url: str):
 
     async def _get_page_content(self) -> None:
         try:
-            result = self._client.scrape_url(
+            result = await self._client.scrape_url(
                 url=self.url,
                 formats=["markdown", "html"],
                 only_main_content=True,
diff --git a/app/services/scrapers/general/firecrawl_client.py b/app/services/scrapers/general/firecrawl_client.py
index 6bf2747..b92e3bd 100644
--- a/app/services/scrapers/general/firecrawl_client.py
+++ b/app/services/scrapers/general/firecrawl_client.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
-from firecrawl import Firecrawl
+from firecrawl import AsyncFirecrawl
 
 from app.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY
 
@@ -28,11 +28,11 @@ class FirecrawlClient:
 
     def __init__(self, config: FirecrawlSettings):
         self._settings: FirecrawlSettings = config
-        self._app: Firecrawl = self._create_app(config)
+        self._app: AsyncFirecrawl = self._create_app(config)
 
     @staticmethod
-    def _create_app(config: FirecrawlSettings) -> Firecrawl:
-        return Firecrawl(api_url=config.api_url, api_key=config.api_key)
+    def _create_app(config: FirecrawlSettings) -> AsyncFirecrawl:
+        return AsyncFirecrawl(api_url=config.api_url, api_key=config.api_key)
 
     @classmethod
     def get_instance(cls) -> "FirecrawlClient":
@@ -62,7 +62,7 @@ def reset_instance(cls) -> None:
         with cls._lock:
             cls._instance = None
 
-    def scrape_url(
+    async def scrape_url(
             self,
             url: str,
             formats: Optional[List[str]] = None,
@@ -81,13 +81,14 @@ def scrape_url(
             wait_for: Time in milliseconds to wait for JS rendering before scraping.
         """
         try:
-            return self._app.scrape(
+            result = await self._app.scrape(
                 url,
                 formats=formats,
                 only_main_content=only_main_content,
                 timeout=timeout,
                 exclude_tags=exclude_tags,
                 wait_for=wait_for,
-            ).model_dump(exclude_none=True)
+            )
+            return result.model_dump(exclude_none=True)
         except Exception as e:
             raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e

From 3d228c5ffffa2c830081829369139ee344714ddc Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 14 Feb 2026 18:41:17 -0600
Subject: [PATCH 8/8] fix: keep scraper manager in sync with general

---
 app/services/scrapers/scraper_manager.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/app/services/scrapers/scraper_manager.py b/app/services/scrapers/scraper_manager.py
index 365bd41..0a010e5 100644
--- a/app/services/scrapers/scraper_manager.py
+++ b/app/services/scrapers/scraper_manager.py
@@ -36,6 +36,10 @@ async def init_scraper(cls, category: str) -> None:
                 scraper = await cls.init_general_scraper()
             if scraper:
                 cls.scrapers[category] = scraper
+                # general_scraper serves both "other" and "unknown" — keep both keys in sync
+                if category in ["other", "unknown"]:
+                    cls.scrapers["other"] = scraper
+                    cls.scrapers["unknown"] = scraper
         else:
             logger.error(f"Scraper {category} is not supported")
             raise ValueError(f"Scraper {category} is not supported")