diff --git a/.gitignore b/.gitignore index e3354ca..d9fa219 100644 --- a/.gitignore +++ b/.gitignore @@ -257,3 +257,4 @@ conf/* .run/Template Python tests.run.xml /.run/ .DS_Store +/.claude/ diff --git a/app/config.py b/app/config.py index 04cb826..50fd18d 100644 --- a/app/config.py +++ b/app/config.py @@ -208,16 +208,23 @@ def ban_list_resolver(ban_list_string: str) -> list: INOREADER_EMAIL = env.get("INOREADER_EMAIL", None) INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None) -# Open AI API environment variables +# Open AI API OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) -# Firecrawl API environment variables -FIRECRAWL_ON = get_env_bool(env, "FIRECRAWL_ON", False) +# General webpage scraping +GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) +GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") + +# Firecrawl API FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") -FIRECRAWL_TIMEOUT_SECONDS = env.get("FIRECRAWL_TIMEOUT_SECONDS", 60) +FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering + + +# Zyte API +ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) -# Locale environment variables +# Locale directories environment variables localedir = os.path.join(os.path.dirname(__file__), "locale") translation = gettext.translation("messages", localedir=localedir, fallback=True) _ = translation.gettext diff --git a/app/services/scrapers/firecrawl_client/__init__.py b/app/services/scrapers/general/__init__.py similarity index 71% rename from app/services/scrapers/firecrawl_client/__init__.py rename to app/services/scrapers/general/__init__.py index d26a87a..94c0402 100644 --- a/app/services/scrapers/firecrawl_client/__init__.py +++ b/app/services/scrapers/general/__init__.py @@ -5,17 +5,18 @@ @dataclass -class FirecrawlItem(MetadataItem): +class GeneralItem(MetadataItem): """ - FirecrawlItem: Data class for scraped content from Firecrawl. + GeneralItem: Data class for scraped content from general webpage scrapers. """ id: str = "" raw_content: str = "" + scraper_type: str = "" # Which scraper was used (e.g., "firecrawl", "zyte", etc.) @staticmethod - def from_dict(obj: Any) -> "FirecrawlItem": + def from_dict(obj: Any) -> "GeneralItem": metadata_item = MetadataItem.from_dict(obj) - return FirecrawlItem( + return GeneralItem( url=metadata_item.url, title=metadata_item.title, author=metadata_item.author, @@ -28,10 +29,12 @@ def from_dict(obj: Any) -> "FirecrawlItem": message_type=metadata_item.message_type, id=obj.get("id", ""), raw_content=obj.get("raw_content", ""), + scraper_type=obj.get("scraper_type", ""), ) def to_dict(self) -> dict: result: dict = super().to_dict() result["id"] = self.id result["raw_content"] = self.raw_content + result["scraper_type"] = self.scraper_type return result diff --git a/app/services/scrapers/firecrawl_client/scraper.py b/app/services/scrapers/general/base.py similarity index 53% rename from app/services/scrapers/firecrawl_client/scraper.py rename to app/services/scrapers/general/base.py index 6b14188..1ab9360 100644 --- a/app/services/scrapers/firecrawl_client/scraper.py +++ b/app/services/scrapers/general/base.py @@ -1,18 +1,22 @@ import hashlib +from abc import abstractmethod +from typing import Optional from urllib.parse import urlparse +from bs4 import BeautifulSoup, Doctype from openai import AsyncOpenAI from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam from app.config import OPENAI_API_KEY from app.models.metadata_item import MediaFile, MessageType from app.services.scrapers.scraper import Scraper, DataProcessor -from app.services.scrapers.firecrawl_client import FirecrawlItem -from app.services.scrapers.firecrawl_client.client import FirecrawlClient +from app.services.scrapers.general import GeneralItem from app.utils.parse import get_html_text_length, wrap_text_into_html from app.utils.logger import logger -FIRECRAWL_TEXT_LIMIT = 800 +GENERAL_TEXT_LIMIT = 800 + +DEFAULT_OPENAI_MODEL = "gpt-5-nano" # System prompt for LLM to extract article content ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. @@ -24,14 +28,15 @@ 4. Keep important formatting like bold, italic, links, and images 5. Return clean HTML containing only the article content 6. If you cannot identify the main content, return the original HTML unchanged -7. remove some basic HTML tags like , ,