diff --git a/CLAUDE.md b/CLAUDE.md index 4f0b52f..9246ebc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,7 +13,6 @@ FastFetchBot/ ├── apps/api/ # FastAPI server: scrapers, storage, routing ├── apps/telegram-bot/ # Telegram Bot: webhook/polling, message handling ├── apps/worker/ # Celery worker: async file operations (video, PDF, audio) -├── app/ # Legacy re-export wrappers (backward compatibility) ├── pyproject.toml # Root workspace configuration └── uv.lock # Lockfile for the entire workspace ``` @@ -54,10 +53,6 @@ The Telegram Bot communicates with the API server over HTTP (`API_SERVER_URL`). - **`models/`** — `classes.py` (NamedBytesIO), `metadata_item.py`, `telegraph_item.py`, `url_metadata.py` - **`utils/`** — `parse.py` (URL parsing, HTML processing, `get_env_bool`), `image.py`, `logger.py`, `network.py` -### Legacy `app/` Directory - -Re-export wrappers providing backward compatibility. Actual code lives in `apps/api/src/` and `packages/shared/`. For example, `app/config.py` imports `get_env_bool` from `fastfetchbot_shared.utils.parse`. - ## Development Commands ### Package Management @@ -143,8 +138,6 @@ GitHub Actions (`.github/workflows/ci.yml`) builds and pushes all three images o - `ghcr.io/aturret/fastfetchbot-tgbot:latest` - `ghcr.io/aturret/fastfetchbot-worker:latest` -Deployment is triggered via Watchtower webhook after builds complete. Include `[github-action]` in a commit message to skip the build. 
- ## Development Guidelines ### Adding a New Platform Scraper diff --git a/apps/api/pyproject.toml b/apps/api/pyproject.toml index 578c0bd..b3fda22 100644 --- a/apps/api/pyproject.toml +++ b/apps/api/pyproject.toml @@ -3,28 +3,17 @@ name = "fastfetchbot-api" version = "0.1.0" requires-python = ">=3.12,<3.13" dependencies = [ - "fastfetchbot-shared", + "fastfetchbot-shared[scrapers]", "fastapi>=0.115.12", "sentry-sdk[fastapi]>=2.27.0", "gunicorn>=23.0.0", "uvicorn>=0.34.2", - "jinja2>=3.1.6", "babel>=2.17.0", "beanie>=1.29.0", - "jmespath>=1.0.1", - "twitter-api-client-v2>=0.1.1", - "atproto>=0.0.61", - "asyncpraw>=7.8.1", "pillow>=10.0.0", "pydub>=0.25.1", "xhtml2pdf>=0.2.17", "aioboto3>=13.4.0", - "tenacity>=9.1.2", - "markdown>=3.8", - "openai>=2.15.0", - "html-telegraph-poster-v2>=0.2.5", - "firecrawl-py>=4.13.0", - "zyte-api>=0.8.1", "celery[redis]>=5.4.0", ] diff --git a/apps/api/src/config.py b/apps/api/src/config.py index 12a339b..ea04347 100644 --- a/apps/api/src/config.py +++ b/apps/api/src/config.py @@ -1,13 +1,8 @@ -import json import os import tempfile - -from jinja2 import Environment, FileSystemLoader import gettext import secrets -from fastfetchbot_shared.utils.cookie import read_json_cookies_to_string -from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.utils.parse import get_env_bool env = os.environ @@ -35,10 +30,6 @@ MONGODB_HOST = env.get("MONGODB_HOST", "localhost") MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}") -# Telegraph -telegraph_token_list = env.get("TELEGRAPH_TOKEN_LIST", "") -TELEGRAPH_TOKEN_LIST = telegraph_token_list.split(",") if telegraph_token_list else None - # File exporter toggle (used by telegram bot to show/hide buttons) FILE_EXPORTER_ON = get_env_bool(env, "FILE_EXPORTER_ON", True) DOWNLOAD_VIDEO_TIMEOUT = env.get("DOWNLOAD_VIDEO_TIMEOUT", 600) @@ -47,105 +38,6 @@ CELERY_BROKER_URL = env.get("CELERY_BROKER_URL", "redis://localhost:6379/0") CELERY_RESULT_BACKEND = 
env.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/1") -# Services environment variables -templates_directory = os.path.join(current_directory, "templates") -JINJA2_ENV = Environment( - loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True -) -TEMPLATE_LANGUAGE = env.get( - "TEMPLATE_LANGUAGE", "zh_CN" -) # It is a workaround for translation system - -# X-RapidAPI (for instagram) -X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None) - -# Twitter -TWITTER_EMAIL = env.get("TWITTER_EMAIL", None) -TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None) -TWITTER_USERNAME = env.get("TWITTER_USERNAME", None) -TWITTER_CT0 = env.get("TWITTER_CT0", None) -TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None) -TWITTER_COOKIES = { - "ct0": TWITTER_CT0, - "auth_token": TWITTER_AUTH_TOKEN, -} - -# Bluesky -BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", None) -BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None) - -# Weibo -weibo_cookies_path = os.path.join(conf_dir, "weibo_cookies.json") -if os.path.exists(weibo_cookies_path): - WEIBO_COOKIES = read_json_cookies_to_string(weibo_cookies_path) -else: - WEIBO_COOKIES = env.get("WEIBO_COOKIES", None) - -# Xiaohongshu -XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None) -XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None) -XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None) -XIAOHONGSHU_COOKIES = { - "a1": XIAOHONGSHU_A1, - "web_id": XIAOHONGSHU_WEBID, - "web_session": XIAOHONGSHU_WEBSESSION, -} -XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",") -XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",") -XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False) -XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True) - -# XHS sign server and cookie file -from fastfetchbot_shared.config import SIGN_SERVER_URL as XHS_SIGN_SERVER_URL -from fastfetchbot_shared.config import XHS_COOKIE_PATH as _XHS_COOKIE_PATH - -xhs_cookie_path = _XHS_COOKIE_PATH or 
os.path.join(conf_dir, "xhs_cookies.txt") - -# Load XHS cookies from file (similar to Zhihu cookie loading) -XHS_COOKIE_STRING = "" -if os.path.exists(xhs_cookie_path): - try: - with open(xhs_cookie_path, "r", encoding="utf-8") as f: - XHS_COOKIE_STRING = f.read().strip() - except (IOError, OSError) as e: - logger.error(f"Error reading XHS cookie file: {e}") - XHS_COOKIE_STRING = "" -else: - # Fallback: build cookie string from individual env vars (backward compat) - cookie_parts = [] - if XIAOHONGSHU_A1: - cookie_parts.append(f"a1={XIAOHONGSHU_A1}") - if XIAOHONGSHU_WEBID: - cookie_parts.append(f"web_id={XIAOHONGSHU_WEBID}") - if XIAOHONGSHU_WEBSESSION: - cookie_parts.append(f"web_session={XIAOHONGSHU_WEBSESSION}") - XHS_COOKIE_STRING = "; ".join(cookie_parts) - -# Zhihu -FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") -ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None) - -zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") -if os.path.exists(zhihu_cookie_path): - try: - with open(zhihu_cookie_path, "r") as f: - ZHIHU_COOKIES_JSON = json.load(f) - except json.JSONDecodeError: - print("Error: The file is not in a valid JSON format.") - ZHIHU_COOKIES_JSON = None - except FileNotFoundError: - print("Error: The file does not exist.") - ZHIHU_COOKIES_JSON = None -else: - print("Error: We cannot find it.") - ZHIHU_COOKIES_JSON = None - -# Reddit -REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None) -REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None) -REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None) -REDDIT_USERNAME = env.get("REDDIT_USERNAME", None) - # AWS storage AWS_STORAGE_ON = get_env_bool(env, "AWS_STORAGE_ON", False) AWS_ACCESS_KEY_ID = env.get("AWS_ACCESS_KEY_ID", None) @@ -155,28 +47,13 @@ AWS_DOMAIN_HOST = env.get("AWS_DOMAIN_HOST", None) if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_S3_BUCKET_NAME): AWS_STORAGE_ON = False + +# Inoreader INOREADER_APP_ID = env.get("INOREADER_APP_ID", None) INOREADER_APP_KEY = 
env.get("INOREADER_APP_KEY", None) INOREADER_EMAIL = env.get("INOREADER_EMAIL", None) INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None) -# Open AI API -OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) - -# General webpage scraping -GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) -GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") - -# Firecrawl API -FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") -FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") -FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering -FIRECRAWL_USE_JSON_EXTRACTION = get_env_bool(env, "FIRECRAWL_USE_JSON_EXTRACTION", False) - - -# Zyte API -ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) - # Locale directories environment variables localedir = os.path.join(os.path.dirname(__file__), "locale") translation = gettext.translation("messages", localedir=localedir, fallback=True) diff --git a/apps/api/src/services/file_export/video_download/__init__.py b/apps/api/src/services/file_export/video_download/__init__.py index aebda54..0dda8e1 100644 --- a/apps/api/src/services/file_export/video_download/__init__.py +++ b/apps/api/src/services/file_export/video_download/__init__.py @@ -10,7 +10,7 @@ from src.config import DOWNLOAD_VIDEO_TIMEOUT from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html from fastfetchbot_shared.utils.logger import logger -from src.config import JINJA2_ENV +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV video_info_template = JINJA2_ENV.get_template("video_info.jinja2") diff --git a/apps/api/src/services/scrapers/common.py b/apps/api/src/services/scrapers/common.py index 0ea0d7a..ec78f54 100644 --- a/apps/api/src/services/scrapers/common.py +++ b/apps/api/src/services/scrapers/common.py @@ -3,32 +3,21 @@ from src.models.database_model import Metadata from fastfetchbot_shared.models.url_metadata import UrlMetadata from 
fastfetchbot_shared.models.metadata_item import MessageType -from src.services import ( - telegraph, - inoreader -) +from fastfetchbot_shared.services.scrapers.common import InfoExtractService as CoreInfoExtractService +from fastfetchbot_shared.services.telegraph import Telegraph from src.services.file_export import video_download, document_export -from src.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads -from src.services.scrapers.scraper_manager import ScraperManager from src.database import save_instances from fastfetchbot_shared.utils.logger import logger from src.config import DATABASE_ON -class InfoExtractService(object): +class InfoExtractService(CoreInfoExtractService): + """API-layer service that adds Telegraph, PDF export, DB storage, and video download.""" + service_classes: dict = { - "twitter": twitter.Twitter, - "threads": threads.Threads, - "reddit": reddit.Reddit, - "weibo": weibo.Weibo, - "wechat": wechat.Wechat, - "instagram": instagram.Instagram, - "douban": douban.Douban, - "zhihu": zhihu.Zhihu, - "xiaohongshu": xiaohongshu.Xiaohongshu, + **CoreInfoExtractService.service_classes, "youtube": video_download.VideoDownloader, "bilibili": video_download.VideoDownloader, - "inoreader": inoreader.Inoreader, } def __init__( @@ -40,49 +29,21 @@ def __init__( store_document: Optional[bool] = False, **kwargs, ): - url_metadata = url_metadata.to_dict() - self.url = url_metadata["url"] - self.content_type = url_metadata["content_type"] - self.source = url_metadata["source"] - self.data = data - self.kwargs = kwargs - self.store_database = store_database - self.store_telegraph = store_telegraph - self.store_document = store_document - - @property - def category(self) -> str: - return self.source - - async def get_item(self, metadata_item: Optional[dict] = None) -> dict: - if self.content_type == "video": - if not self.kwargs.get("category"): - self.kwargs["category"] = self.category - if not 
metadata_item: - try: - if self.category in ["bluesky", "weibo", "other", "unknown"]: # it is a workaround before the code refactor - await ScraperManager.init_scraper(self.category) - item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url) - metadata_item = await item_data_processor.get_item() - else: - scraper_item = InfoExtractService.service_classes[self.category]( - url=self.url, data=self.data, **self.kwargs - ) - metadata_item = await scraper_item.get_item() - except Exception as e: - logger.error(f"Error while getting item: {e}") - raise e - logger.info(f"Got metadata item") - logger.debug(metadata_item) - metadata_item = await self.process_item(metadata_item) - return metadata_item + super().__init__( + url_metadata, + data=data, + store_database=store_database, + store_telegraph=store_telegraph, + store_document=store_document, + **kwargs, + ) async def process_item(self, metadata_item: dict) -> dict: if metadata_item.get("message_type") == MessageType.LONG: self.store_telegraph = True logger.info("message type is long, store in telegraph") if self.store_telegraph: - telegraph_item = telegraph.Telegraph.from_dict(metadata_item) + telegraph_item = Telegraph.from_dict(metadata_item) try: telegraph_url = await telegraph_item.get_telegraph() except Exception as e: diff --git a/apps/api/src/services/telegraph/__init__.py b/apps/api/src/services/telegraph/__init__.py index 218f992..b93bf83 100644 --- a/apps/api/src/services/telegraph/__init__.py +++ b/apps/api/src/services/telegraph/__init__.py @@ -1,74 +1,4 @@ -# TODO: copy the html-to-telegraph package and modify it to fit the asynchronous model -import random -import traceback -from typing import Any +# Re-export from shared package +from fastfetchbot_shared.services.telegraph import Telegraph -from html_telegraph_poster_v2.async_poster import ( - AsyncTelegraphPoster, -) -from html_telegraph_poster_v2.async_poster.utils import DocumentPreprocessor - -from 
src.config import TELEGRAPH_TOKEN_LIST -from fastfetchbot_shared.models.telegraph_item import TelegraphItem, from_str -from fastfetchbot_shared.utils.logger import logger - - -class Telegraph(TelegraphItem): - def __init__( - self, - title: str, - url: str, - author: str, - author_url: str, - category: str, - content: str, - ): - self.telegraph = AsyncTelegraphPoster(use_api=True) - self.title = title - self.url = url - self.author = author - self.author_url = author_url - self.category = category - self.content = content - - @staticmethod - def from_dict(obj: Any) -> "Telegraph": - assert isinstance(obj, dict) - title = from_str(obj.get("title")) - url = from_str(obj.get("url")) - author = from_str(obj.get("author")) - author_url = from_str(obj.get("author_url")) - category = from_str(obj.get("category")) - content = from_str(obj.get("content")) - return Telegraph(title, url, author, author_url, category, content) - - async def get_telegraph(self, upload_images: bool = True) -> str: - try: - if upload_images: - temp_html = DocumentPreprocessor(self.content, url=self.url) - logger.info("Telegraph: Uploading images to telegraph...") - await temp_html.upload_all_images() - self.content = temp_html.get_processed_html() - logger.info("Telegraph: Uploading to telegraph...") - if not TELEGRAPH_TOKEN_LIST: - await self.telegraph.create_api_token( - short_name=self.author[0:14], author_name=self.author - ) - else: - random_token = random.choice(TELEGRAPH_TOKEN_LIST) - await self.telegraph.set_token(random_token) - - telegraph_post = await self.telegraph.post( - title=self.title, - author=self.author, - author_url=self.author_url, - text=self.content, - ) - logger.info( - f"Telegraph: Uploaded to telegraph. 
Link: {telegraph_post['url']}" - ) - telegraph_url = telegraph_post["url"] - return telegraph_url - except Exception as e: - traceback.print_exc() - return "" +__all__ = ["Telegraph"] diff --git a/packages/shared/fastfetchbot_shared/services/__init__.py b/packages/shared/fastfetchbot_shared/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/__init__.py new file mode 100644 index 0000000..847a9af --- /dev/null +++ b/packages/shared/fastfetchbot_shared/services/scrapers/__init__.py @@ -0,0 +1,31 @@ +from fastfetchbot_shared.services.scrapers import ( + twitter, + weibo, + bluesky, + reddit, + xiaohongshu, + zhihu, + douban, + instagram, + threads, + wechat, + general, +) +from fastfetchbot_shared.services.scrapers.common import InfoExtractService +from fastfetchbot_shared.services.scrapers.scraper_manager import ScraperManager + +__all__ = [ + "InfoExtractService", + "ScraperManager", + "twitter", + "weibo", + "bluesky", + "reddit", + "xiaohongshu", + "zhihu", + "douban", + "instagram", + "threads", + "wechat", + "general", +] diff --git a/apps/api/src/services/scrapers/bluesky/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py similarity index 100% rename from apps/api/src/services/scrapers/bluesky/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py diff --git a/apps/api/src/services/scrapers/bluesky/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/config.py similarity index 100% rename from apps/api/src/services/scrapers/bluesky/config.py rename to packages/shared/fastfetchbot_shared/services/scrapers/bluesky/config.py diff --git a/apps/api/src/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py similarity index 96% rename from 
apps/api/src/services/scrapers/bluesky/scraper.py rename to packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py index fd3799a..2874fe1 100644 --- a/apps/api/src/services/scrapers/bluesky/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py @@ -5,11 +5,11 @@ from atproto_client.models.app.bsky.embed.record import ViewRecord from atproto_client.models.app.bsky.feed.defs import ThreadViewPost, PostView -from src.config import JINJA2_ENV +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType -from src.services.scrapers.scraper import Scraper, DataProcessor -from src.services.scrapers.bluesky import Bluesky -from src.services.scrapers.bluesky.config import BLUESKY_HOST, BLUESKY_MAX_LENGTH +from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor +from fastfetchbot_shared.services.scrapers.bluesky import Bluesky +from fastfetchbot_shared.services.scrapers.bluesky.config import BLUESKY_HOST, BLUESKY_MAX_LENGTH from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.utils.parse import wrap_text_into_html diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py new file mode 100644 index 0000000..d87e078 --- /dev/null +++ b/packages/shared/fastfetchbot_shared/services/scrapers/common.py @@ -0,0 +1,84 @@ +from typing import Optional, Any + +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from fastfetchbot_shared.services.scrapers import ( + twitter, + wechat, + reddit, + weibo, + zhihu, + douban, + instagram, + xiaohongshu, + threads, +) +from fastfetchbot_shared.services.scrapers.scraper_manager import ScraperManager +from fastfetchbot_shared.utils.logger import logger + + +class InfoExtractService(object): + """Core scraping service — routes URLs to the correct scraper and returns 
raw metadata. + + This base class handles only scraping. Telegraph publishing, PDF export, + DB storage, and video download are handled by subclasses (e.g. in the API app). + """ + + service_classes: dict = { + "twitter": twitter.Twitter, + "threads": threads.Threads, + "reddit": reddit.Reddit, + "weibo": weibo.Weibo, + "wechat": wechat.Wechat, + "instagram": instagram.Instagram, + "douban": douban.Douban, + "zhihu": zhihu.Zhihu, + "xiaohongshu": xiaohongshu.Xiaohongshu, + } + + def __init__( + self, + url_metadata: UrlMetadata, + data: Any = None, + store_database: Optional[bool] = False, + store_telegraph: Optional[bool] = True, + store_document: Optional[bool] = False, + **kwargs, + ): + url_metadata = url_metadata.to_dict() + self.url = url_metadata["url"] + self.content_type = url_metadata["content_type"] + self.source = url_metadata["source"] + self.data = data + self.kwargs = kwargs + self.store_database = store_database + self.store_telegraph = store_telegraph + self.store_document = store_document + + @property + def category(self) -> str: + return self.source + + async def get_item(self, metadata_item: Optional[dict] = None) -> dict: + if not metadata_item: + try: + if self.category in ["bluesky", "weibo", "other", "unknown"]: + await ScraperManager.init_scraper(self.category) + item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url) + metadata_item = await item_data_processor.get_item() + else: + scraper_item = self.service_classes[self.category]( + url=self.url, data=self.data, **self.kwargs + ) + metadata_item = await scraper_item.get_item() + except Exception as e: + logger.error(f"Error while getting item: {e}") + raise e + logger.info(f"Got metadata item") + logger.debug(metadata_item) + metadata_item = await self.process_item(metadata_item) + return metadata_item + + async def process_item(self, metadata_item: dict) -> dict: + """Base process_item — just strips title whitespace. 
Override for enrichment.""" + metadata_item["title"] = metadata_item["title"].strip() + return metadata_item diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/config.py new file mode 100644 index 0000000..3c28cb0 --- /dev/null +++ b/packages/shared/fastfetchbot_shared/services/scrapers/config.py @@ -0,0 +1,137 @@ +import json +import os +import tempfile + +from jinja2 import Environment, FileSystemLoader + +from fastfetchbot_shared.utils.cookie import read_json_cookies_to_string +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_env_bool + +env = os.environ + +# Filesystem environment variables +TEMP_DIR = env.get("TEMP_DIR", tempfile.gettempdir()) +WORK_DIR = env.get("WORK_DIR", os.getcwd()) +DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", os.path.join(WORK_DIR, "download")) +DEBUG_MODE = get_env_bool(env, "DEBUG_MODE", False) + +# Cookie/config file directory — defaults to <WORK_DIR>/conf but can be overridden via the CONF_DIR env var +CONF_DIR = env.get("CONF_DIR", os.path.join(WORK_DIR, "conf")) + +# Templates & Jinja2 +templates_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates") +JINJA2_ENV = Environment( + loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True +) +TEMPLATE_LANGUAGE = env.get("TEMPLATE_LANGUAGE", "zh_CN") + +# X-RapidAPI (shared by Twitter and Instagram scrapers) +X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None) + +# Twitter +TWITTER_EMAIL = env.get("TWITTER_EMAIL", None) +TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None) +TWITTER_USERNAME = env.get("TWITTER_USERNAME", None) +TWITTER_CT0 = env.get("TWITTER_CT0", None) +TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None) +TWITTER_COOKIES = { + "ct0": TWITTER_CT0, + "auth_token": TWITTER_AUTH_TOKEN, +} + +# Bluesky +BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", None) +BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None) + +# Weibo 
+weibo_cookies_path = os.path.join(CONF_DIR, "weibo_cookies.json") +if os.path.exists(weibo_cookies_path): + WEIBO_COOKIES = read_json_cookies_to_string(weibo_cookies_path) +else: + WEIBO_COOKIES = env.get("WEIBO_COOKIES", None) + +# Xiaohongshu +XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None) +XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None) +XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None) +XIAOHONGSHU_COOKIES = { + "a1": XIAOHONGSHU_A1, + "web_id": XIAOHONGSHU_WEBID, + "web_session": XIAOHONGSHU_WEBSESSION, +} +XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",") +XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",") +XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False) +XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True) + +# XHS sign server and cookie file +from fastfetchbot_shared.config import SIGN_SERVER_URL as XHS_SIGN_SERVER_URL +from fastfetchbot_shared.config import XHS_COOKIE_PATH as _XHS_COOKIE_PATH + +xhs_cookie_path = _XHS_COOKIE_PATH or os.path.join(CONF_DIR, "xhs_cookies.txt") + +XHS_COOKIE_STRING = "" +if os.path.exists(xhs_cookie_path): + try: + with open(xhs_cookie_path, "r", encoding="utf-8") as f: + XHS_COOKIE_STRING = f.read().strip() + except (IOError, OSError) as e: + logger.error(f"Error reading XHS cookie file: {e}") + XHS_COOKIE_STRING = "" +else: + cookie_parts = [] + if XIAOHONGSHU_A1: + cookie_parts.append(f"a1={XIAOHONGSHU_A1}") + if XIAOHONGSHU_WEBID: + cookie_parts.append(f"web_id={XIAOHONGSHU_WEBID}") + if XIAOHONGSHU_WEBSESSION: + cookie_parts.append(f"web_session={XIAOHONGSHU_WEBSESSION}") + XHS_COOKIE_STRING = "; ".join(cookie_parts) + +# Zhihu +FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") +ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None) + +zhihu_cookie_path = os.path.join(CONF_DIR, "zhihu_cookies.json") +if os.path.exists(zhihu_cookie_path): + try: + with open(zhihu_cookie_path, "r") as f: + ZHIHU_COOKIES_JSON = json.load(f) + except 
json.JSONDecodeError: + logger.error("Error: zhihu_cookies.json is not in a valid JSON format.") + ZHIHU_COOKIES_JSON = None + except FileNotFoundError: + logger.error("Error: zhihu_cookies.json does not exist.") + ZHIHU_COOKIES_JSON = None +else: + ZHIHU_COOKIES_JSON = None + +# Reddit +REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None) +REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None) +REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None) +REDDIT_USERNAME = env.get("REDDIT_USERNAME", None) + +# Open AI API +OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) + +# General webpage scraping +GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) +GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") + +# Firecrawl API +FIRECRAWL_API_URL = env.get("FIRECRAWL_API_URL", "") +FIRECRAWL_API_KEY = env.get("FIRECRAWL_API_KEY", "") +try: + FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR") or 3000) +except (ValueError, TypeError): + FIRECRAWL_WAIT_FOR = 3000 +FIRECRAWL_USE_JSON_EXTRACTION = get_env_bool(env, "FIRECRAWL_USE_JSON_EXTRACTION", False) + +# Zyte API +ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) + +# Telegraph +telegraph_token_list = env.get("TELEGRAPH_TOKEN_LIST", "") +TELEGRAPH_TOKEN_LIST = telegraph_token_list.split(",") if telegraph_token_list else None diff --git a/apps/api/src/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py similarity index 99% rename from apps/api/src/services/scrapers/douban/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py index 4ea8712..a273294 100644 --- a/apps/api/src/services/scrapers/douban/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py @@ -9,7 +9,7 @@ from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html from fastfetchbot_shared.utils.network import get_selector, HEADERS from 
fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType -from src.config import JINJA2_ENV +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV SHORT_LIMIT = 600 diff --git a/apps/api/src/services/scrapers/general/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py similarity index 100% rename from apps/api/src/services/scrapers/general/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py diff --git a/apps/api/src/services/scrapers/general/base.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py similarity index 97% rename from apps/api/src/services/scrapers/general/base.py rename to packages/shared/fastfetchbot_shared/services/scrapers/general/base.py index 8d454d6..aeffdd0 100644 --- a/apps/api/src/services/scrapers/general/base.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py @@ -7,10 +7,10 @@ from openai import AsyncOpenAI from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam -from src.config import OPENAI_API_KEY +from fastfetchbot_shared.services.scrapers.config import OPENAI_API_KEY from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType -from src.services.scrapers.scraper import Scraper, DataProcessor -from src.services.scrapers.general import GeneralItem +from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor +from fastfetchbot_shared.services.scrapers.general import GeneralItem from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html from fastfetchbot_shared.utils.logger import logger diff --git a/apps/api/src/services/scrapers/general/firecrawl.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py similarity index 94% rename from apps/api/src/services/scrapers/general/firecrawl.py rename to 
packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py index e09fb57..d27600e 100644 --- a/apps/api/src/services/scrapers/general/firecrawl.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py @@ -1,13 +1,13 @@ from typing import Optional -from src.config import FIRECRAWL_WAIT_FOR, FIRECRAWL_USE_JSON_EXTRACTION -from src.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper -from src.services.scrapers.general.firecrawl_client import FirecrawlClient -from src.services.scrapers.general.firecrawl_schema import ( +from fastfetchbot_shared.services.scrapers.config import FIRECRAWL_WAIT_FOR, FIRECRAWL_USE_JSON_EXTRACTION +from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper +from fastfetchbot_shared.services.scrapers.general.firecrawl_client import FirecrawlClient +from fastfetchbot_shared.services.scrapers.general.firecrawl_schema import ( ExtractedArticle, FIRECRAWL_EXTRACTION_PROMPT, ) -from src.services.scrapers.scraper import DataProcessor +from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html diff --git a/apps/api/src/services/scrapers/general/firecrawl_client.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py similarity index 97% rename from apps/api/src/services/scrapers/general/firecrawl_client.py rename to packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py index 1b05b8f..db4b519 100644 --- a/apps/api/src/services/scrapers/general/firecrawl_client.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py @@ -6,7 +6,7 @@ from firecrawl import AsyncFirecrawl -from src.config import FIRECRAWL_API_URL, 
FIRECRAWL_API_KEY +from fastfetchbot_shared.services.scrapers.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY @dataclass(frozen=True) diff --git a/apps/api/src/services/scrapers/general/firecrawl_schema.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_schema.py similarity index 100% rename from apps/api/src/services/scrapers/general/firecrawl_schema.py rename to packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_schema.py diff --git a/apps/api/src/services/scrapers/general/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py similarity index 87% rename from apps/api/src/services/scrapers/general/scraper.py rename to packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py index f2454a3..08d472b 100644 --- a/apps/api/src/services/scrapers/general/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py @@ -1,10 +1,10 @@ from typing import Optional -from src.config import GENERAL_SCRAPING_API -from src.services.scrapers.scraper import Scraper, DataProcessor -from src.services.scrapers.general.base import BaseGeneralScraper -from src.services.scrapers.general.firecrawl import FirecrawlScraper -from src.services.scrapers.general.zyte import ZyteScraper +from fastfetchbot_shared.services.scrapers.config import GENERAL_SCRAPING_API +from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor +from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralScraper +from fastfetchbot_shared.services.scrapers.general.firecrawl import FirecrawlScraper +from fastfetchbot_shared.services.scrapers.general.zyte import ZyteScraper from fastfetchbot_shared.utils.logger import logger diff --git a/apps/api/src/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py similarity index 90% rename from apps/api/src/services/scrapers/general/zyte.py rename to 
packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index 804f5b0..234dd5f 100644 --- a/apps/api/src/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -1,8 +1,8 @@ from zyte_api import AsyncZyteAPI -from src.config import ZYTE_API_KEY -from src.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper -from src.services.scrapers.scraper import DataProcessor +from fastfetchbot_shared.services.scrapers.config import ZYTE_API_KEY +from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper +from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.utils.logger import logger diff --git a/apps/api/src/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py similarity index 99% rename from apps/api/src/services/scrapers/instagram/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py index dfaad67..9b4408e 100644 --- a/apps/api/src/services/scrapers/instagram/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py @@ -10,7 +10,7 @@ from fastfetchbot_shared.utils.parse import get_html_text_length from fastfetchbot_shared.utils.logger import logger from .config import API_HEADERS_LIST, ALL_SCRAPERS -from src.config import X_RAPIDAPI_KEY +from fastfetchbot_shared.services.scrapers.config import X_RAPIDAPI_KEY class Instagram(MetadataItem): diff --git a/apps/api/src/services/scrapers/instagram/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/config.py similarity index 100% rename from apps/api/src/services/scrapers/instagram/config.py rename to packages/shared/fastfetchbot_shared/services/scrapers/instagram/config.py diff --git a/apps/api/src/services/scrapers/reddit/__init__.py 
b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py similarity index 98% rename from apps/api/src/services/scrapers/reddit/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py index 7f873fb..d44c1a9 100644 --- a/apps/api/src/services/scrapers/reddit/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile -from src.config import ( +from fastfetchbot_shared.services.scrapers.config import ( REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_PASSWORD, diff --git a/apps/api/src/services/scrapers/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/scraper.py similarity index 100% rename from apps/api/src/services/scrapers/scraper.py rename to packages/shared/fastfetchbot_shared/services/scrapers/scraper.py diff --git a/apps/api/src/services/scrapers/scraper_manager.py b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py similarity index 87% rename from apps/api/src/services/scrapers/scraper_manager.py rename to packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py index ebe3935..f7bbbef 100644 --- a/apps/api/src/services/scrapers/scraper_manager.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py @@ -1,10 +1,10 @@ from typing import Optional from fastfetchbot_shared.utils.logger import logger -from src.services.scrapers.bluesky.scraper import BlueskyScraper -from src.services.scrapers.weibo.scraper import WeiboScraper -from src.services.scrapers.general.scraper import GeneralScraper -from src.config import ( +from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper +from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboScraper +from fastfetchbot_shared.services.scrapers.general.scraper import GeneralScraper +from 
fastfetchbot_shared.services.scrapers.config import ( BLUESKY_USERNAME, BLUESKY_PASSWORD ) diff --git a/apps/api/src/templates/bluesky_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/bluesky_content.jinja2 similarity index 100% rename from apps/api/src/templates/bluesky_content.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/bluesky_content.jinja2 diff --git a/apps/api/src/templates/bluesky_telegram_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/bluesky_telegram_text.jinja2 similarity index 100% rename from apps/api/src/templates/bluesky_telegram_text.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/bluesky_telegram_text.jinja2 diff --git a/apps/api/src/templates/douban_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/douban_content.jinja2 similarity index 100% rename from apps/api/src/templates/douban_content.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/douban_content.jinja2 diff --git a/apps/api/src/templates/douban_short_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/douban_short_text.jinja2 similarity index 100% rename from apps/api/src/templates/douban_short_text.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/douban_short_text.jinja2 diff --git a/apps/api/src/templates/reddit_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/reddit_content.jinja2 similarity index 100% rename from apps/api/src/templates/reddit_content.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/reddit_content.jinja2 diff --git a/apps/api/src/templates/reddit_short_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/reddit_short_text.jinja2 similarity index 100% rename from apps/api/src/templates/reddit_short_text.jinja2 rename to 
packages/shared/fastfetchbot_shared/services/scrapers/templates/reddit_short_text.jinja2 diff --git a/apps/api/src/templates/video_info.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/video_info.jinja2 similarity index 100% rename from apps/api/src/templates/video_info.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/video_info.jinja2 diff --git a/apps/api/src/templates/weibo_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/weibo_content.jinja2 similarity index 100% rename from apps/api/src/templates/weibo_content.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/weibo_content.jinja2 diff --git a/apps/api/src/templates/weibo_short_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/weibo_short_text.jinja2 similarity index 100% rename from apps/api/src/templates/weibo_short_text.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/weibo_short_text.jinja2 diff --git a/apps/api/src/templates/xiaohongshu_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/xiaohongshu_content.jinja2 similarity index 100% rename from apps/api/src/templates/xiaohongshu_content.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/xiaohongshu_content.jinja2 diff --git a/apps/api/src/templates/xiaohongshu_short_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/xiaohongshu_short_text.jinja2 similarity index 100% rename from apps/api/src/templates/xiaohongshu_short_text.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/xiaohongshu_short_text.jinja2 diff --git a/apps/api/src/templates/zhihu_content.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/zhihu_content.jinja2 similarity index 100% rename from apps/api/src/templates/zhihu_content.jinja2 rename to 
packages/shared/fastfetchbot_shared/services/scrapers/templates/zhihu_content.jinja2 diff --git a/apps/api/src/templates/zhihu_short_text.jinja2 b/packages/shared/fastfetchbot_shared/services/scrapers/templates/zhihu_short_text.jinja2 similarity index 100% rename from apps/api/src/templates/zhihu_short_text.jinja2 rename to packages/shared/fastfetchbot_shared/services/scrapers/templates/zhihu_short_text.jinja2 diff --git a/apps/api/src/services/scrapers/threads/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py similarity index 100% rename from apps/api/src/services/scrapers/threads/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py diff --git a/apps/api/src/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py similarity index 99% rename from apps/api/src/services/scrapers/twitter/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py index a48437f..66d8019 100644 --- a/apps/api/src/services/scrapers/twitter/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py @@ -17,7 +17,7 @@ SCRAPER_INFO, SHORT_LIMIT, ) -from src.config import X_RAPIDAPI_KEY, TWITTER_COOKIES, DEBUG_MODE +from fastfetchbot_shared.services.scrapers.config import X_RAPIDAPI_KEY, TWITTER_COOKIES, DEBUG_MODE from fastfetchbot_shared.utils.logger import logger diff --git a/apps/api/src/services/scrapers/twitter/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/config.py similarity index 100% rename from apps/api/src/services/scrapers/twitter/config.py rename to packages/shared/fastfetchbot_shared/services/scrapers/twitter/config.py diff --git a/apps/api/src/services/scrapers/wechat/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py similarity index 100% rename from apps/api/src/services/scrapers/wechat/__init__.py rename to 
packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py diff --git a/apps/api/src/services/scrapers/weibo/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py similarity index 95% rename from apps/api/src/services/scrapers/weibo/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py index ac7eaee..c84494d 100644 --- a/apps/api/src/services/scrapers/weibo/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py @@ -18,7 +18,7 @@ WEIBO_HOST, WEIBO_TEXT_LIMIT, ) -from src.config import JINJA2_ENV, WEIBO_COOKIES +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV, WEIBO_COOKIES from fastfetchbot_shared.utils.logger import logger short_text_template = JINJA2_ENV.get_template("weibo_short_text.jinja2") diff --git a/apps/api/src/services/scrapers/weibo/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/config.py similarity index 100% rename from apps/api/src/services/scrapers/weibo/config.py rename to packages/shared/fastfetchbot_shared/services/scrapers/weibo/config.py diff --git a/apps/api/src/services/scrapers/weibo/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py similarity index 99% rename from apps/api/src/services/scrapers/weibo/scraper.py rename to packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py index bba152e..71215e2 100644 --- a/apps/api/src/services/scrapers/weibo/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py @@ -8,8 +8,8 @@ from lxml import html from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType -from src.services.scrapers.scraper import Scraper, DataProcessor -from src.services.scrapers.weibo import Weibo +from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor +from fastfetchbot_shared.services.scrapers.weibo import Weibo from 
fastfetchbot_shared.utils.network import get_response_json, get_random_user_agent from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html from .config import ( @@ -19,7 +19,7 @@ WEIBO_HOST, WEIBO_TEXT_LIMIT, ) -from src.config import JINJA2_ENV, WEIBO_COOKIES +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV, WEIBO_COOKIES from fastfetchbot_shared.utils.logger import logger short_text_template = JINJA2_ENV.get_template("weibo_short_text.jinja2") diff --git a/apps/api/src/services/scrapers/xiaohongshu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py similarity index 97% rename from apps/api/src/services/scrapers/xiaohongshu/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py index bd66cea..b586215 100644 --- a/apps/api/src/services/scrapers/xiaohongshu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py @@ -7,7 +7,7 @@ get_html_text_length, wrap_text_into_html, ) -from src.config import JINJA2_ENV, XHS_COOKIE_STRING, XHS_SIGN_SERVER_URL +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV, XHS_COOKIE_STRING, XHS_SIGN_SERVER_URL from .adaptar import XhsSinglePostAdapter environment = JINJA2_ENV diff --git a/apps/api/src/services/scrapers/xiaohongshu/adaptar.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py similarity index 100% rename from apps/api/src/services/scrapers/xiaohongshu/adaptar.py rename to packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py diff --git a/apps/api/src/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py similarity index 99% rename from apps/api/src/services/scrapers/zhihu/__init__.py rename to packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index b8c8a1a..17032d5 100644 --- 
a/apps/api/src/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -18,7 +18,7 @@ from fastfetchbot_shared.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \ get_content_async, get_response from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType -from src.config import JINJA2_ENV, FXZHIHU_HOST +from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV, FXZHIHU_HOST from .config import ( SHORT_LIMIT, ZHIHU_COLUMNS_API_HOST, diff --git a/apps/api/src/services/scrapers/zhihu/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py similarity index 92% rename from apps/api/src/services/scrapers/zhihu/config.py rename to packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py index 4d44fc8..d187797 100644 --- a/apps/api/src/services/scrapers/zhihu/config.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py @@ -1,4 +1,4 @@ -from src.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0 +from fastfetchbot_shared.services.scrapers.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0 SHORT_LIMIT = 600 ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api" diff --git a/apps/api/src/services/scrapers/zhihu/content_processing.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/content_processing.py similarity index 100% rename from apps/api/src/services/scrapers/zhihu/content_processing.py rename to packages/shared/fastfetchbot_shared/services/scrapers/zhihu/content_processing.py diff --git a/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py new file mode 100644 index 0000000..dbe429f --- /dev/null +++ b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py @@ -0,0 +1,74 @@ +# TODO: copy the html-to-telegraph package and modify it to fit the asynchronous model 
class Telegraph(TelegraphItem):
    """Publish a single article to Telegraph through ``AsyncTelegraphPoster``.

    An instance holds the page fields (title, source URL, author details,
    category and HTML content) together with the poster client used by
    :meth:`get_telegraph` to upload them.
    """

    def __init__(
        self,
        title: str,
        url: str,
        author: str,
        author_url: str,
        category: str,
        content: str,
    ):
        """Store the page fields and create the poster client.

        Args:
            title: Title passed to the poster when the page is created.
            url: Source URL, handed to ``DocumentPreprocessor`` when images
                are uploaded.
            author: Author name; also used to derive the short name of a
                freshly created API token.
            author_url: Author link passed to the poster.
            category: Stored on the instance; not read by this class.
            content: HTML body of the page.
        """
        self.telegraph = AsyncTelegraphPoster(use_api=True)
        self.title = title
        self.url = url
        self.author = author
        self.author_url = author_url
        self.category = category
        self.content = content

    @staticmethod
    def from_dict(obj: Any) -> "Telegraph":
        """Build a ``Telegraph`` from a dict keyed by its six field names.

        Each of ``title``, ``url``, ``author``, ``author_url``, ``category``
        and ``content`` is read with ``dict.get`` and passed through
        ``from_str`` before being handed to the constructor.

        Raises:
            AssertionError: If ``obj`` is not a ``dict``.
        """
        # Deliberately left as an assertion so existing callers keep seeing
        # AssertionError for non-dict input.
        assert isinstance(obj, dict)
        field_names = ("title", "url", "author", "author_url", "category", "content")
        return Telegraph(*(from_str(obj.get(name)) for name in field_names))

    async def get_telegraph(self, upload_images: bool = True) -> str:
        """Upload the article and return the URL of the created page.

        Args:
            upload_images: When true, the content is first run through
                ``DocumentPreprocessor`` to upload its images, and
                ``self.content`` is replaced by the processed HTML.

        Returns:
            The ``url`` entry of the post result, or ``""`` if any step of
            the upload raised.
        """
        try:
            if upload_images:
                temp_html = DocumentPreprocessor(self.content, url=self.url)
                logger.info("Telegraph: Uploading images to telegraph...")
                await temp_html.upload_all_images()
                self.content = temp_html.get_processed_html()
            logger.info("Telegraph: Uploading to telegraph...")
            if not TELEGRAPH_TOKEN_LIST:
                # No tokens configured: create a new API token whose short
                # name is the first 14 characters of the author name.
                await self.telegraph.create_api_token(
                    short_name=self.author[0:14], author_name=self.author
                )
            else:
                # Pick one of the configured tokens at random for this post.
                random_token = random.choice(TELEGRAPH_TOKEN_LIST)
                await self.telegraph.set_token(random_token)

            telegraph_post = await self.telegraph.post(
                title=self.title,
                author=self.author,
                author_url=self.author_url,
                text=self.content,
            )
            telegraph_url = telegraph_post["url"]
            logger.info(
                f"Telegraph: Uploaded to telegraph. Link: {telegraph_url}"
            )
            return telegraph_url
        except Exception:
            # Best-effort boundary: callers get "" instead of an exception.
            # logger.exception records the active traceback, so the cause of
            # the failure is kept in the log.
            logger.exception("Telegraph upload failed")
            return ""
">=5.4.0" }, { name = "fastapi", specifier = ">=0.115.12" }, - { name = "fastfetchbot-shared", editable = "packages/shared" }, - { name = "firecrawl-py", specifier = ">=4.13.0" }, + { name = "fastfetchbot-shared", extras = ["scrapers"], editable = "packages/shared" }, { name = "gunicorn", specifier = ">=23.0.0" }, - { name = "html-telegraph-poster-v2", specifier = ">=0.2.5" }, - { name = "jinja2", specifier = ">=3.1.6" }, - { name = "jmespath", specifier = ">=1.0.1" }, - { name = "markdown", specifier = ">=3.8" }, - { name = "openai", specifier = ">=2.15.0" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pydub", specifier = ">=0.25.1" }, { name = "sentry-sdk", extras = ["fastapi"], specifier = ">=2.27.0" }, - { name = "tenacity", specifier = ">=9.1.2" }, - { name = "twitter-api-client-v2", specifier = ">=0.1.1" }, { name = "uvicorn", specifier = ">=0.34.2" }, { name = "xhtml2pdf", specifier = ">=0.2.17" }, - { name = "zyte-api", specifier = ">=0.8.1" }, ] [[package]] @@ -868,6 +846,19 @@ migrate = [ postgres = [ { name = "asyncpg" }, ] +scrapers = [ + { name = "asyncpraw" }, + { name = "atproto" }, + { name = "firecrawl-py" }, + { name = "html-telegraph-poster-v2" }, + { name = "jinja2" }, + { name = "jmespath" }, + { name = "markdown" }, + { name = "openai" }, + { name = "tenacity" }, + { name = "twitter-api-client-v2" }, + { name = "zyte-api" }, +] [package.metadata] requires-dist = [ @@ -875,18 +866,29 @@ requires-dist = [ { name = "aiosqlite", specifier = ">=0.17.0" }, { name = "alembic", marker = "extra == 'migrate'", specifier = ">=1.15.0" }, { name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.30.0" }, + { name = "asyncpraw", marker = "extra == 'scrapers'", specifier = ">=7.8.1" }, + { name = "atproto", marker = "extra == 'scrapers'", specifier = ">=0.0.61" }, { name = "beautifulsoup4", specifier = ">=4.13.4" }, { name = "fake-useragent", specifier = ">=1.5.1" }, + { name = "firecrawl-py", marker = "extra == 'scrapers'", specifier = 
">=4.13.0" }, + { name = "html-telegraph-poster-v2", marker = "extra == 'scrapers'", specifier = ">=0.2.5" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "jinja2", marker = "extra == 'scrapers'", specifier = ">=3.1.6" }, + { name = "jmespath", marker = "extra == 'scrapers'", specifier = ">=1.0.1" }, { name = "loguru", specifier = ">=0.7.2" }, { name = "lxml", specifier = ">=5.4.0" }, + { name = "markdown", marker = "extra == 'scrapers'", specifier = ">=3.8" }, + { name = "openai", marker = "extra == 'scrapers'", specifier = ">=2.15.0" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "playwright", specifier = ">=1.52.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "python-magic", specifier = ">=0.4.27" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.0" }, + { name = "tenacity", marker = "extra == 'scrapers'", specifier = ">=9.1.2" }, + { name = "twitter-api-client-v2", marker = "extra == 'scrapers'", specifier = ">=0.1.1" }, + { name = "zyte-api", marker = "extra == 'scrapers'", specifier = ">=0.8.1" }, ] -provides-extras = ["postgres", "migrate"] +provides-extras = ["postgres", "migrate", "scrapers"] [[package]] name = "fastfetchbot-telegram-bot"