-
Notifications
You must be signed in to change notification settings - Fork 4
Feat: Refactor general webpage scraping and fix HTML sanitizing #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
868896f
ce2e1fd
bd906af
ad6ec6c
32b08f8
0438268
ca6e9c0
1155a69
3d228c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -257,3 +257,4 @@ conf/* | |
| .run/Template Python tests.run.xml | ||
| /.run/ | ||
| .DS_Store | ||
| /.claude/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,18 +1,22 @@ | ||
| import hashlib | ||
| from abc import abstractmethod | ||
| from typing import Optional | ||
| from urllib.parse import urlparse | ||
|
|
||
| from bs4 import BeautifulSoup, Doctype | ||
| from openai import AsyncOpenAI | ||
| from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam | ||
|
|
||
| from app.config import OPENAI_API_KEY | ||
| from app.models.metadata_item import MediaFile, MessageType | ||
| from app.services.scrapers.scraper import Scraper, DataProcessor | ||
| from app.services.scrapers.firecrawl_client import FirecrawlItem | ||
| from app.services.scrapers.firecrawl_client.client import FirecrawlClient | ||
| from app.services.scrapers.general import GeneralItem | ||
| from app.utils.parse import get_html_text_length, wrap_text_into_html | ||
| from app.utils.logger import logger | ||
|
|
||
| FIRECRAWL_TEXT_LIMIT = 800 | ||
| GENERAL_TEXT_LIMIT = 800 | ||
|
|
||
| DEFAULT_OPENAI_MODEL = "gpt-5-nano" | ||
|
|
||
| # System prompt for LLM to extract article content | ||
| ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. | ||
|
|
@@ -24,50 +28,134 @@ | |
| 4. Keep important formatting like bold, italic, links, and images | ||
| 5. Return clean HTML containing only the article content | ||
| 6. If you cannot identify the main content, return the original HTML unchanged | ||
| 7. remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body> | ||
| 7. After all of the above, remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body> | ||
|
|
||
| Return ONLY the extracted HTML content, no explanations or markdown.""" | ||
|
|
||
|
|
||
| class FirecrawlDataProcessor(DataProcessor): | ||
| class BaseGeneralDataProcessor(DataProcessor): | ||
| """ | ||
| FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. | ||
| Base class for general webpage data processors. | ||
| Each specific scraper (Firecrawl, Zyte, etc.) should inherit from this class. | ||
| """ | ||
|
|
||
| def __init__(self, url: str): | ||
| self.url: str = url | ||
| self._data: dict = {} | ||
| self.url_parser = urlparse(url) | ||
| self.id = hashlib.md5(url.encode()).hexdigest()[:16] | ||
| self._client: FirecrawlClient = FirecrawlClient.get_instance() | ||
| self.scraper_type: str = "base" | ||
|
|
||
| async def get_item(self) -> dict: | ||
| await self.process_data() | ||
| firecrawl_item = FirecrawlItem.from_dict(self._data) | ||
| return firecrawl_item.to_dict() | ||
| general_item = GeneralItem.from_dict(self._data) | ||
| return general_item.to_dict() | ||
|
|
||
| async def process_data(self) -> None: | ||
| await self._get_page_content() | ||
|
|
||
| @abstractmethod | ||
| async def _get_page_content(self) -> None: | ||
| try: | ||
| result = self._client.scrape_url( | ||
| url=self.url, | ||
| formats=["markdown", "html"], | ||
| only_main_content=True, | ||
| ) | ||
| await self._process_firecrawl_result(result) | ||
| except Exception as e: | ||
| logger.error(f"Failed to scrape URL with Firecrawl: {e}") | ||
| raise | ||
| """Subclasses must implement this method to fetch page content.""" | ||
| pass | ||
|
|
||
| async def _build_item_data( | ||
| self, | ||
| title: str, | ||
| author: str, | ||
| description: str, | ||
| markdown_content: str, | ||
| html_content: str, | ||
| og_image: Optional[str] = None, | ||
| ) -> None: | ||
| """ | ||
| Common method to build item data from scraped content. | ||
| """ | ||
| item_data = { | ||
| "id": self.id, | ||
| "category": "other", | ||
| "url": self.url, | ||
| "title": title or self.url, | ||
| "author": author or self.url_parser.netloc, | ||
| "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", | ||
| "scraper_type": self.scraper_type, | ||
| } | ||
|
|
||
| # Process text content - use description or first part of markdown | ||
| # Strip any HTML tags to ensure plain text for Telegram short messages | ||
| text = description if description else (markdown_content or "")[:500] | ||
| text = BeautifulSoup(text, "html.parser").get_text() | ||
| item_data["text"] = text | ||
|
|
||
| # Process HTML content with LLM if available, then sanitize deterministically | ||
| if html_content: | ||
| cleaned_html = await self.parsing_article_body_by_llm(html_content) | ||
| cleaned_html = self.sanitize_html(cleaned_html) | ||
| content = wrap_text_into_html(cleaned_html, is_html=True) | ||
| else: | ||
| content = wrap_text_into_html(markdown_content or "", is_html=False) | ||
| item_data["content"] = content | ||
| item_data["raw_content"] = markdown_content | ||
|
|
||
| # Process media files - extract og:image if available | ||
| media_files = [] | ||
| if og_image: | ||
| media_files.append(MediaFile(url=og_image, media_type="image")) | ||
|
|
||
| item_data["media_files"] = [m.to_dict() for m in media_files] | ||
|
|
||
| # Determine the message type based on content length (not text length) | ||
| item_data["message_type"] = ( | ||
| MessageType.LONG | ||
| if get_html_text_length(content) > GENERAL_TEXT_LIMIT | ||
| else MessageType.SHORT | ||
| ) | ||
|
|
||
| self._data = item_data | ||
|
|
||
| @staticmethod | ||
| def sanitize_html(html_content: str) -> str: | ||
| """ | ||
| Deterministic HTML sanitizer that removes all non-content tags. | ||
|
|
||
| This runs AFTER the LLM extraction as a safety net — the LLM is unreliable, | ||
| and when it fails (or when OPENAI_API_KEY is not set), raw Firecrawl HTML | ||
| (including <!DOCTYPE>, <script>, etc.) passes through unchanged. | ||
|
|
||
| Keeps content-meaningful tags: p, h1-h6, a, b/strong, i/em, u, ul, ol, li, | ||
| blockquote, pre, code, img, br, table, tr, td, th, thead, tbody. | ||
| """ | ||
| if not html_content: | ||
| return html_content | ||
|
|
||
| soup = BeautifulSoup(html_content, "html.parser") | ||
|
|
||
| # Remove DOCTYPE declarations | ||
| for item in soup.contents: | ||
| if isinstance(item, Doctype): | ||
| item.extract() | ||
|
Comment on lines
+133
to
+136
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same live-list mutation issue — iterate over a snapshot of `soup.contents`. Same issue as noted elsewhere in this review. Proposed fix # Remove DOCTYPE declarations
- for item in soup.contents:
+ for item in list(soup.contents):
if isinstance(item, Doctype):
item.extract()🤖 Prompt for AI Agents |
||
|
|
||
| # Remove tags that should be destroyed with all their content | ||
| for tag_name in ["script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]: | ||
| for tag in soup.find_all(tag_name): | ||
| tag.decompose() | ||
|
|
||
| # Unwrap structural/layout tags — keep their text content, discard the tag itself | ||
| for tag_name in ["html", "body", "div", "span", "section", "article", "nav", | ||
| "header", "footer", "main", "aside", "figure", "figcaption", | ||
| "details", "summary", "dd", "dt", "dl"]: | ||
| for tag in soup.find_all(tag_name): | ||
| tag.unwrap() | ||
|
|
||
| return str(soup).strip() | ||
|
|
||
| @staticmethod | ||
| async def parsing_article_body_by_llm(html_content: str) -> str: | ||
| """ | ||
| Use LLM to extract the main article content from HTML. | ||
|
|
||
| Args: | ||
| html_content: Raw HTML content from Firecrawl | ||
| html_content: Raw HTML content from a scraper | ||
|
|
||
| Returns: | ||
| Cleaned HTML containing only the main article content | ||
|
|
@@ -87,13 +175,13 @@ async def parsing_article_body_by_llm(html_content: str) -> str: | |
| truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content | ||
|
|
||
| response = await client.chat.completions.create( | ||
| model="gpt-4o-mini", | ||
| model=DEFAULT_OPENAI_MODEL, | ||
| messages=[ | ||
| ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT), | ||
| ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}") | ||
| ], | ||
| temperature=0.1, | ||
| max_tokens=16000, | ||
| max_completion_tokens=10000, | ||
| ) | ||
|
|
||
| extracted_content = response.choices[0].message.content | ||
|
|
@@ -109,61 +197,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str: | |
| logger.error(f"Failed to parse article body with LLM: {e}") | ||
| return html_content | ||
|
|
||
| async def _process_firecrawl_result(self, result: dict) -> None: | ||
| metadata = result.get("metadata", {}) | ||
| markdown_content = result.get("markdown", "") | ||
| html_content = result.get("html", "") | ||
|
|
||
| # Extract metadata fields | ||
| title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url | ||
| author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc | ||
| # description = metadata.get("description", "") or metadata.get("ogDescription", "") | ||
|
|
||
| item_data = { | ||
| "id": self.id, | ||
| "category": "other", | ||
| "url": self.url, | ||
| "title": title, | ||
| "author": author, | ||
| "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", | ||
| } | ||
|
|
||
| # Process text content - use description or first part of markdown | ||
| text = html_content[:FIRECRAWL_TEXT_LIMIT] | ||
| item_data["text"] = text | ||
|
|
||
| html_content = await self.parsing_article_body_by_llm(html_content) | ||
|
|
||
| # Process HTML content | ||
| if html_content: | ||
| content = wrap_text_into_html(html_content, is_html=True) | ||
| else: | ||
| content = wrap_text_into_html(markdown_content, is_html=False) | ||
| item_data["content"] = content | ||
| item_data["raw_content"] = markdown_content | ||
|
|
||
| # Process media files - extract og:image if available | ||
| media_files = [] | ||
| og_image = metadata.get("ogImage") | ||
| if og_image: | ||
| media_files.append(MediaFile(url=og_image, media_type="image")) | ||
|
|
||
| item_data["media_files"] = [m.to_dict() for m in media_files] | ||
|
|
||
| # Determine message type based on text length | ||
| item_data["message_type"] = ( | ||
| MessageType.LONG | ||
| if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT | ||
| else MessageType.SHORT | ||
| ) | ||
|
|
||
| self._data = item_data | ||
|
|
||
|
|
||
| class FirecrawlScraper(Scraper): | ||
| class BaseGeneralScraper(Scraper): | ||
| """ | ||
| FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. | ||
| Base class for general webpage scrapers. | ||
| """ | ||
|
|
||
| @abstractmethod | ||
| async def get_processor_by_url(self, url: str) -> DataProcessor: | ||
| return FirecrawlDataProcessor(url) | ||
| pass | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| from app.config import FIRECRAWL_WAIT_FOR | ||
| from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper | ||
| from app.services.scrapers.general.firecrawl_client import FirecrawlClient | ||
| from app.services.scrapers.scraper import DataProcessor | ||
| from app.utils.logger import logger | ||
|
|
||
| # HTML tags to exclude from Firecrawl output at the source | ||
| FIRECRAWL_EXCLUDE_TAGS = [ | ||
| "nav", "footer", "aside", "script", "style", | ||
| "noscript", "iframe", "svg", "form", | ||
| ] | ||
|
|
||
|
|
||
| class FirecrawlDataProcessor(BaseGeneralDataProcessor): | ||
| """ | ||
| FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. | ||
| """ | ||
|
|
||
| def __init__(self, url: str): | ||
| super().__init__(url) | ||
| self.scraper_type = "firecrawl" | ||
| self._client: FirecrawlClient = FirecrawlClient.get_instance() | ||
|
|
||
| async def _get_page_content(self) -> None: | ||
| try: | ||
| result = await self._client.scrape_url( | ||
| url=self.url, | ||
| formats=["markdown", "html"], | ||
| only_main_content=True, | ||
| exclude_tags=FIRECRAWL_EXCLUDE_TAGS, | ||
| wait_for=FIRECRAWL_WAIT_FOR, | ||
| ) | ||
| await self._process_firecrawl_result(result) | ||
| except Exception as e: | ||
| logger.error(f"Failed to scrape URL with Firecrawl: {e}") | ||
| raise | ||
|
Comment on lines
+24
to
+36
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Synchronous
Wrap the blocking call with Proposed fix+import asyncio
+from functools import partial
+
...
async def _get_page_content(self) -> None:
try:
- result = self._client.scrape_url(
- url=self.url,
- formats=["markdown", "html"],
- only_main_content=True,
- exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
- wait_for=FIRECRAWL_WAIT_FOR,
- )
+ result = await asyncio.to_thread(
+ partial(
+ self._client.scrape_url,
+ url=self.url,
+ formats=["markdown", "html"],
+ only_main_content=True,
+ exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
+ wait_for=FIRECRAWL_WAIT_FOR,
+ )
+ )
await self._process_firecrawl_result(result)
except Exception as e:
logger.error(f"Failed to scrape URL with Firecrawl: {e}")
raise🤖 Prompt for AI Agents |
||
|
|
||
| async def _process_firecrawl_result(self, result: dict) -> None: | ||
| metadata = result.get("metadata", {}) | ||
| markdown_content = result.get("markdown", "") | ||
| html_content = result.get("html", "") | ||
|
|
||
| # Extract metadata fields | ||
| title = metadata.get("title", "") or metadata.get("ogTitle", "") | ||
| author = metadata.get("author", "") or metadata.get("ogSiteName", "") | ||
| description = metadata.get("description", "") or metadata.get("ogDescription", "") | ||
| og_image = metadata.get("ogImage") | ||
|
|
||
| await self._build_item_data( | ||
| title=title, | ||
| author=author, | ||
| description=description, | ||
| markdown_content=markdown_content, | ||
| html_content=html_content, | ||
| og_image=og_image, | ||
| ) | ||
|
|
||
|
|
||
| class FirecrawlScraper(BaseGeneralScraper): | ||
| """ | ||
| FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. | ||
| """ | ||
|
|
||
| async def get_processor_by_url(self, url: str) -> DataProcessor: | ||
| return FirecrawlDataProcessor(url) | ||
Uh oh!
There was an error while loading. Please reload this page.