-
Notifications
You must be signed in to change notification settings - Fork 4
Feat: Refactor general webpage scraping and fix HTML sanitizing #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
868896f
ce2e1fd
bd906af
ad6ec6c
32b08f8
0438268
ca6e9c0
1155a69
3d228c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -257,3 +257,4 @@ conf/* | |
| .run/Template Python tests.run.xml | ||
| /.run/ | ||
| .DS_Store | ||
| /.claude/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,18 +1,22 @@ | ||
| import hashlib | ||
| from abc import abstractmethod | ||
| from typing import Optional | ||
| from urllib.parse import urlparse | ||
|
|
||
| from bs4 import BeautifulSoup, Doctype | ||
| from openai import AsyncOpenAI | ||
| from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam | ||
|
|
||
| from app.config import OPENAI_API_KEY | ||
| from app.models.metadata_item import MediaFile, MessageType | ||
| from app.services.scrapers.scraper import Scraper, DataProcessor | ||
| from app.services.scrapers.firecrawl_client import FirecrawlItem | ||
| from app.services.scrapers.firecrawl_client.client import FirecrawlClient | ||
| from app.services.scrapers.general import GeneralItem | ||
| from app.utils.parse import get_html_text_length, wrap_text_into_html | ||
| from app.utils.logger import logger | ||
|
|
||
| FIRECRAWL_TEXT_LIMIT = 800 | ||
| GENERAL_TEXT_LIMIT = 800 | ||
|
|
||
| DEFAULT_OPENAI_MODEL = "gpt-5-nano" | ||
|
|
||
| # System prompt for LLM to extract article content | ||
| ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. | ||
|
|
@@ -24,50 +28,134 @@ | |
| 4. Keep important formatting like bold, italic, links, and images | ||
| 5. Return clean HTML containing only the article content | ||
| 6. If you cannot identify the main content, return the original HTML unchanged | ||
| 7. remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body> | ||
| 7. After all of the above, remove some basic HTML tags like <!DOCTYPE>, <html>, <script>, <body> | ||
|
|
||
| Return ONLY the extracted HTML content, no explanations or markdown.""" | ||
|
|
||
|
|
||
| class FirecrawlDataProcessor(DataProcessor): | ||
| class BaseGeneralDataProcessor(DataProcessor): | ||
| """ | ||
| FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. | ||
| Base class for general webpage data processors. | ||
| Each specific scraper (Firecrawl, Zyte, etc.) should inherit from this class. | ||
| """ | ||
|
|
||
| def __init__(self, url: str): | ||
| self.url: str = url | ||
| self._data: dict = {} | ||
| self.url_parser = urlparse(url) | ||
| self.id = hashlib.md5(url.encode()).hexdigest()[:16] | ||
| self._client: FirecrawlClient = FirecrawlClient.get_instance() | ||
| self.scraper_type: str = "base" | ||
|
|
||
| async def get_item(self) -> dict: | ||
| await self.process_data() | ||
| firecrawl_item = FirecrawlItem.from_dict(self._data) | ||
| return firecrawl_item.to_dict() | ||
| general_item = GeneralItem.from_dict(self._data) | ||
| return general_item.to_dict() | ||
|
|
||
| async def process_data(self) -> None: | ||
| await self._get_page_content() | ||
|
|
||
| @abstractmethod | ||
| async def _get_page_content(self) -> None: | ||
| try: | ||
| result = self._client.scrape_url( | ||
| url=self.url, | ||
| formats=["markdown", "html"], | ||
| only_main_content=True, | ||
| ) | ||
| await self._process_firecrawl_result(result) | ||
| except Exception as e: | ||
| logger.error(f"Failed to scrape URL with Firecrawl: {e}") | ||
| raise | ||
| """Subclasses must implement this method to fetch page content.""" | ||
| pass | ||
|
|
||
| async def _build_item_data( | ||
| self, | ||
| title: str, | ||
| author: str, | ||
| description: str, | ||
| markdown_content: str, | ||
| html_content: str, | ||
| og_image: Optional[str] = None, | ||
| ) -> None: | ||
| """ | ||
| Common method to build item data from scraped content. | ||
| """ | ||
| item_data = { | ||
| "id": self.id, | ||
| "category": "other", | ||
| "url": self.url, | ||
| "title": title or self.url, | ||
| "author": author or self.url_parser.netloc, | ||
| "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", | ||
| "scraper_type": self.scraper_type, | ||
| } | ||
|
|
||
| # Process text content - use description or first part of markdown | ||
| # Strip any HTML tags to ensure plain text for Telegram short messages | ||
| text = description if description else (markdown_content or "")[:500] | ||
| text = BeautifulSoup(text, "html.parser").get_text() | ||
| item_data["text"] = text | ||
|
|
||
| # Process HTML content with LLM if available, then sanitize deterministically | ||
| if html_content: | ||
| cleaned_html = await self.parsing_article_body_by_llm(html_content) | ||
| cleaned_html = self.sanitize_html(cleaned_html) | ||
| content = wrap_text_into_html(cleaned_html, is_html=True) | ||
| else: | ||
| content = wrap_text_into_html(markdown_content or "", is_html=False) | ||
| item_data["content"] = content | ||
| item_data["raw_content"] = markdown_content | ||
|
|
||
| # Process media files - extract og:image if available | ||
| media_files = [] | ||
| if og_image: | ||
| media_files.append(MediaFile(url=og_image, media_type="image")) | ||
|
|
||
| item_data["media_files"] = [m.to_dict() for m in media_files] | ||
|
|
||
| # Determine the message type based on content length (not text length) | ||
| item_data["message_type"] = ( | ||
| MessageType.LONG | ||
| if get_html_text_length(content) > GENERAL_TEXT_LIMIT | ||
| else MessageType.SHORT | ||
| ) | ||
|
|
||
| self._data = item_data | ||
|
|
||
| @staticmethod | ||
| def sanitize_html(html_content: str) -> str: | ||
| """ | ||
| Deterministic HTML sanitizer that removes all non-content tags. | ||
|
|
||
| This runs AFTER the LLM extraction as a safety net — the LLM is unreliable, | ||
| and when it fails (or when OPENAI_API_KEY is not set), raw Firecrawl HTML | ||
| (including <!DOCTYPE>, <script>, etc.) passes through unchanged. | ||
|
|
||
| Keeps content-meaningful tags: p, h1-h6, a, b/strong, i/em, u, ul, ol, li, | ||
| blockquote, pre, code, img, br, table, tr, td, th, thead, tbody. | ||
| """ | ||
| if not html_content: | ||
| return html_content | ||
|
|
||
| soup = BeautifulSoup(html_content, "html.parser") | ||
|
|
||
| # Remove DOCTYPE declarations | ||
| for item in soup.contents: | ||
| if isinstance(item, Doctype): | ||
| item.extract() | ||
|
Comment on lines
+133
to
+136
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same live-list mutation issue — iterate over a snapshot of `soup.contents`. Same issue as noted elsewhere in this review. Proposed fix # Remove DOCTYPE declarations
- for item in soup.contents:
+ for item in list(soup.contents):
if isinstance(item, Doctype):
item.extract()🤖 Prompt for AI Agents |
||
|
|
||
| # Remove tags that should be destroyed with all their content | ||
| for tag_name in ["script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]: | ||
| for tag in soup.find_all(tag_name): | ||
| tag.decompose() | ||
|
|
||
| # Unwrap structural/layout tags — keep their text content, discard the tag itself | ||
| for tag_name in ["html", "body", "div", "span", "section", "article", "nav", | ||
| "header", "footer", "main", "aside", "figure", "figcaption", | ||
| "details", "summary", "dd", "dt", "dl"]: | ||
| for tag in soup.find_all(tag_name): | ||
| tag.unwrap() | ||
|
|
||
| return str(soup).strip() | ||
|
|
||
| @staticmethod | ||
| async def parsing_article_body_by_llm(html_content: str) -> str: | ||
| """ | ||
| Use LLM to extract the main article content from HTML. | ||
|
|
||
| Args: | ||
| html_content: Raw HTML content from Firecrawl | ||
| html_content: Raw HTML content from a scraper | ||
|
|
||
| Returns: | ||
| Cleaned HTML containing only the main article content | ||
|
|
@@ -87,13 +175,13 @@ async def parsing_article_body_by_llm(html_content: str) -> str: | |
| truncated_content = html_content[:max_content_length] if len(html_content) > max_content_length else html_content | ||
|
|
||
| response = await client.chat.completions.create( | ||
| model="gpt-4o-mini", | ||
| model=DEFAULT_OPENAI_MODEL, | ||
| messages=[ | ||
| ChatCompletionSystemMessageParam(role="system", content=ARTICLE_EXTRACTION_PROMPT), | ||
| ChatCompletionUserMessageParam(role="user", content=f"Extract the main article content from this HTML:\n\n{truncated_content}") | ||
| ], | ||
| temperature=0.1, | ||
| max_tokens=16000, | ||
| max_completion_tokens=10000, | ||
| ) | ||
|
|
||
| extracted_content = response.choices[0].message.content | ||
|
|
@@ -109,61 +197,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str: | |
| logger.error(f"Failed to parse article body with LLM: {e}") | ||
| return html_content | ||
|
|
||
| async def _process_firecrawl_result(self, result: dict) -> None: | ||
| metadata = result.get("metadata", {}) | ||
| markdown_content = result.get("markdown", "") | ||
| html_content = result.get("html", "") | ||
|
|
||
| # Extract metadata fields | ||
| title = metadata.get("title", "") or metadata.get("ogTitle", "") or self.url | ||
| author = metadata.get("author", "") or metadata.get("ogSiteName", "") or self.url_parser.netloc | ||
| # description = metadata.get("description", "") or metadata.get("ogDescription", "") | ||
|
|
||
| item_data = { | ||
| "id": self.id, | ||
| "category": "other", | ||
| "url": self.url, | ||
| "title": title, | ||
| "author": author, | ||
| "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", | ||
| } | ||
|
|
||
| # Process text content - use description or first part of markdown | ||
| text = html_content[:FIRECRAWL_TEXT_LIMIT] | ||
| item_data["text"] = text | ||
|
|
||
| html_content = await self.parsing_article_body_by_llm(html_content) | ||
|
|
||
| # Process HTML content | ||
| if html_content: | ||
| content = wrap_text_into_html(html_content, is_html=True) | ||
| else: | ||
| content = wrap_text_into_html(markdown_content, is_html=False) | ||
| item_data["content"] = content | ||
| item_data["raw_content"] = markdown_content | ||
|
|
||
| # Process media files - extract og:image if available | ||
| media_files = [] | ||
| og_image = metadata.get("ogImage") | ||
| if og_image: | ||
| media_files.append(MediaFile(url=og_image, media_type="image")) | ||
|
|
||
| item_data["media_files"] = [m.to_dict() for m in media_files] | ||
|
|
||
| # Determine message type based on text length | ||
| item_data["message_type"] = ( | ||
| MessageType.LONG | ||
| if get_html_text_length(content) > FIRECRAWL_TEXT_LIMIT | ||
| else MessageType.SHORT | ||
| ) | ||
|
|
||
| self._data = item_data | ||
|
|
||
|
|
||
| class FirecrawlScraper(Scraper): | ||
| class BaseGeneralScraper(Scraper): | ||
| """ | ||
| FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. | ||
| Base class for general webpage scrapers. | ||
| """ | ||
|
|
||
| @abstractmethod | ||
| async def get_processor_by_url(self, url: str) -> DataProcessor: | ||
| return FirecrawlDataProcessor(url) | ||
| pass | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| from app.config import FIRECRAWL_WAIT_FOR | ||
| from app.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper | ||
| from app.services.scrapers.general.firecrawl_client import FirecrawlClient | ||
| from app.services.scrapers.scraper import DataProcessor | ||
| from app.utils.logger import logger | ||
|
|
||
| # HTML tags to exclude from Firecrawl output at the source | ||
| FIRECRAWL_EXCLUDE_TAGS = [ | ||
| "nav", "footer", "aside", "script", "style", | ||
| "noscript", "iframe", "svg", "form", | ||
| ] | ||
|
|
||
|
|
||
| class FirecrawlDataProcessor(BaseGeneralDataProcessor): | ||
| """ | ||
| FirecrawlDataProcessor: Process URLs using Firecrawl to extract content. | ||
| """ | ||
|
|
||
| def __init__(self, url: str): | ||
| super().__init__(url) | ||
| self.scraper_type = "firecrawl" | ||
| self._client: FirecrawlClient = FirecrawlClient.get_instance() | ||
|
|
||
| async def _get_page_content(self) -> None: | ||
| try: | ||
| result = await self._client.scrape_url( | ||
| url=self.url, | ||
| formats=["markdown", "html"], | ||
| only_main_content=True, | ||
| exclude_tags=FIRECRAWL_EXCLUDE_TAGS, | ||
| wait_for=FIRECRAWL_WAIT_FOR, | ||
| ) | ||
| await self._process_firecrawl_result(result) | ||
| except Exception as e: | ||
| logger.error(f"Failed to scrape URL with Firecrawl: {e}") | ||
| raise | ||
|
Comment on lines
+24
to
+36
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Synchronous
Wrap the blocking call with Proposed fix+import asyncio
+from functools import partial
+
...
async def _get_page_content(self) -> None:
try:
- result = self._client.scrape_url(
- url=self.url,
- formats=["markdown", "html"],
- only_main_content=True,
- exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
- wait_for=FIRECRAWL_WAIT_FOR,
- )
+ result = await asyncio.to_thread(
+ partial(
+ self._client.scrape_url,
+ url=self.url,
+ formats=["markdown", "html"],
+ only_main_content=True,
+ exclude_tags=FIRECRAWL_EXCLUDE_TAGS,
+ wait_for=FIRECRAWL_WAIT_FOR,
+ )
+ )
await self._process_firecrawl_result(result)
except Exception as e:
logger.error(f"Failed to scrape URL with Firecrawl: {e}")
raise🤖 Prompt for AI Agents |
||
|
|
||
| async def _process_firecrawl_result(self, result: dict) -> None: | ||
| metadata = result.get("metadata", {}) | ||
| markdown_content = result.get("markdown", "") | ||
| html_content = result.get("html", "") | ||
|
|
||
| # Extract metadata fields | ||
| title = metadata.get("title", "") or metadata.get("ogTitle", "") | ||
| author = metadata.get("author", "") or metadata.get("ogSiteName", "") | ||
| description = metadata.get("description", "") or metadata.get("ogDescription", "") | ||
| og_image = metadata.get("ogImage") | ||
|
|
||
| await self._build_item_data( | ||
| title=title, | ||
| author=author, | ||
| description=description, | ||
| markdown_content=markdown_content, | ||
| html_content=html_content, | ||
| og_image=og_image, | ||
| ) | ||
|
|
||
|
|
||
| class FirecrawlScraper(BaseGeneralScraper): | ||
| """ | ||
| FirecrawlScraper: Scraper implementation using Firecrawl for generic URL scraping. | ||
| """ | ||
|
|
||
| async def get_processor_by_url(self, url: str) -> DataProcessor: | ||
| return FirecrawlDataProcessor(url) | ||
Uh oh!
There was an error while loading. Please reload this page.