CLAUDE.md (7 changes: 0 additions & 7 deletions)
@@ -13,7 +13,6 @@ FastFetchBot/
├── apps/api/ # FastAPI server: scrapers, storage, routing
├── apps/telegram-bot/ # Telegram Bot: webhook/polling, message handling
├── apps/worker/ # Celery worker: async file operations (video, PDF, audio)
├── app/ # Legacy re-export wrappers (backward compatibility)
├── pyproject.toml # Root workspace configuration
└── uv.lock # Lockfile for the entire workspace
```
@@ -54,10 +53,6 @@ The Telegram Bot communicates with the API server over HTTP (`API_SERVER_URL`).
- **`models/`** — `classes.py` (NamedBytesIO), `metadata_item.py`, `telegraph_item.py`, `url_metadata.py`
- **`utils/`** — `parse.py` (URL parsing, HTML processing, `get_env_bool`), `image.py`, `logger.py`, `network.py`
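
As a rough sketch of the bot-to-API split described above (`API_SERVER_URL` is from the source; the endpoint path and payload shape are assumptions for illustration):

```python
# Hypothetical example of the Telegram Bot calling the API server over HTTP.
# Only API_SERVER_URL is documented above; "/scrape" and the JSON payload
# are invented for this sketch.
import os

import httpx

API_SERVER_URL = os.environ.get("API_SERVER_URL", "http://localhost:8000")


async def fetch_metadata(url: str) -> dict:
    """Ask the API server to scrape a URL on the bot's behalf."""
    async with httpx.AsyncClient(base_url=API_SERVER_URL, timeout=30) as client:
        resp = await client.post("/scrape", json={"url": url})
        resp.raise_for_status()
        return resp.json()
```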

### Legacy `app/` Directory

Re-export wrappers providing backward compatibility. Actual code lives in `apps/api/src/` and `packages/shared/`. For example, `app/config.py` imports `get_env_bool` from `fastfetchbot_shared.utils.parse`.
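
A minimal sketch of one such wrapper; the `get_env_bool` re-export is stated above, while the comments are illustrative:

```python
# app/config.py (legacy shim): re-export so old imports keep working.
# The actual implementation lives in packages/shared/.
from fastfetchbot_shared.utils.parse import get_env_bool  # noqa: F401
```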

## Development Commands

### Package Management
@@ -143,8 +138,6 @@ GitHub Actions (`.github/workflows/ci.yml`) builds and pushes all three images o
- `ghcr.io/aturret/fastfetchbot-tgbot:latest`
- `ghcr.io/aturret/fastfetchbot-worker:latest`

Deployment is triggered via Watchtower webhook after builds complete. Include `[github-action]` in a commit message to skip the build.

## Development Guidelines

### Adding a New Platform Scraper
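
The body of this guideline is collapsed in this diff view, but the service registry in `apps/api/src/services/scrapers/common.py` (below) shows how scrapers are invoked: `service_classes[name](url=..., data=..., **kwargs)` followed by `await .get_item()`. A hypothetical skeleton under those assumptions:

```python
# Hypothetical scraper skeleton; the class name and returned dict keys are
# assumptions. Only the constructor/get_item calling convention is taken
# from common.py below.
from typing import Any, Optional


class MyPlatform:
    def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
        self.url = url
        self.data = data
        self.kwargs = kwargs

    async def get_item(self) -> dict:
        # Fetch the page/post and normalize it into a metadata-item dict.
        return {"url": self.url, "content_type": "social_media"}
```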
apps/api/pyproject.toml (13 changes: 1 addition & 12 deletions)
@@ -3,28 +3,17 @@ name = "fastfetchbot-api"
version = "0.1.0"
requires-python = ">=3.12,<3.13"
dependencies = [
    "fastfetchbot-shared",
    "fastfetchbot-shared[scrapers]",
    "fastapi>=0.115.12",
    "sentry-sdk[fastapi]>=2.27.0",
    "gunicorn>=23.0.0",
    "uvicorn>=0.34.2",
    "jinja2>=3.1.6",
    "babel>=2.17.0",
    "beanie>=1.29.0",
    "jmespath>=1.0.1",
    "twitter-api-client-v2>=0.1.1",
    "atproto>=0.0.61",
    "asyncpraw>=7.8.1",
    "pillow>=10.0.0",
    "pydub>=0.25.1",
    "xhtml2pdf>=0.2.17",
    "aioboto3>=13.4.0",
    "tenacity>=9.1.2",
    "markdown>=3.8",
    "openai>=2.15.0",
    "html-telegraph-poster-v2>=0.2.5",
    "firecrawl-py>=4.13.0",
    "zyte-api>=0.8.1",
    "celery[redis]>=5.4.0",
]

apps/api/src/config.py (127 changes: 2 additions & 125 deletions)
@@ -1,13 +1,8 @@
import json
import os
import tempfile

from jinja2 import Environment, FileSystemLoader
import gettext
import secrets

from fastfetchbot_shared.utils.cookie import read_json_cookies_to_string
from fastfetchbot_shared.utils.logger import logger
from fastfetchbot_shared.utils.parse import get_env_bool

env = os.environ
@@ -35,10 +30,6 @@
MONGODB_HOST = env.get("MONGODB_HOST", "localhost")
MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}")

# Telegraph
telegraph_token_list = env.get("TELEGRAPH_TOKEN_LIST", "")
TELEGRAPH_TOKEN_LIST = telegraph_token_list.split(",") if telegraph_token_list else None

# File exporter toggle (used by telegram bot to show/hide buttons)
FILE_EXPORTER_ON = get_env_bool(env, "FILE_EXPORTER_ON", True)
DOWNLOAD_VIDEO_TIMEOUT = env.get("DOWNLOAD_VIDEO_TIMEOUT", 600)
@@ -47,105 +38,6 @@
CELERY_BROKER_URL = env.get("CELERY_BROKER_URL", "redis://localhost:6379/0")
CELERY_RESULT_BACKEND = env.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")

# Services environment variables
templates_directory = os.path.join(current_directory, "templates")
JINJA2_ENV = Environment(
    loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True
)
TEMPLATE_LANGUAGE = env.get(
    "TEMPLATE_LANGUAGE", "zh_CN"
)  # It is a workaround for translation system

# X-RapidAPI (for instagram)
X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None)

# Twitter
TWITTER_EMAIL = env.get("TWITTER_EMAIL", None)
TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None)
TWITTER_USERNAME = env.get("TWITTER_USERNAME", None)
TWITTER_CT0 = env.get("TWITTER_CT0", None)
TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None)
TWITTER_COOKIES = {
    "ct0": TWITTER_CT0,
    "auth_token": TWITTER_AUTH_TOKEN,
}

# Bluesky
BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", None)
BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None)

# Weibo
weibo_cookies_path = os.path.join(conf_dir, "weibo_cookies.json")
if os.path.exists(weibo_cookies_path):
    WEIBO_COOKIES = read_json_cookies_to_string(weibo_cookies_path)
else:
    WEIBO_COOKIES = env.get("WEIBO_COOKIES", None)

# Xiaohongshu
XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None)
XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None)
XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None)
XIAOHONGSHU_COOKIES = {
    "a1": XIAOHONGSHU_A1,
    "web_id": XIAOHONGSHU_WEBID,
    "web_session": XIAOHONGSHU_WEBSESSION,
}
XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",")
XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",")
XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False)
XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True)

# XHS sign server and cookie file
from fastfetchbot_shared.config import SIGN_SERVER_URL as XHS_SIGN_SERVER_URL
from fastfetchbot_shared.config import XHS_COOKIE_PATH as _XHS_COOKIE_PATH

xhs_cookie_path = _XHS_COOKIE_PATH or os.path.join(conf_dir, "xhs_cookies.txt")

# Load XHS cookies from file (similar to Zhihu cookie loading)
XHS_COOKIE_STRING = ""
if os.path.exists(xhs_cookie_path):
    try:
        with open(xhs_cookie_path, "r", encoding="utf-8") as f:
            XHS_COOKIE_STRING = f.read().strip()
    except (IOError, OSError) as e:
        logger.error(f"Error reading XHS cookie file: {e}")
        XHS_COOKIE_STRING = ""
else:
    # Fallback: build cookie string from individual env vars (backward compat)
    cookie_parts = []
    if XIAOHONGSHU_A1:
        cookie_parts.append(f"a1={XIAOHONGSHU_A1}")
    if XIAOHONGSHU_WEBID:
        cookie_parts.append(f"web_id={XIAOHONGSHU_WEBID}")
    if XIAOHONGSHU_WEBSESSION:
        cookie_parts.append(f"web_session={XIAOHONGSHU_WEBSESSION}")
    XHS_COOKIE_STRING = "; ".join(cookie_parts)

# Zhihu
FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com")
ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None)

zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json")
if os.path.exists(zhihu_cookie_path):
    try:
        with open(zhihu_cookie_path, "r") as f:
            ZHIHU_COOKIES_JSON = json.load(f)
    except json.JSONDecodeError:
        print("Error: The file is not in a valid JSON format.")
        ZHIHU_COOKIES_JSON = None
    except FileNotFoundError:
        print("Error: The file does not exist.")
        ZHIHU_COOKIES_JSON = None
else:
    print("Error: We cannot find it.")
    ZHIHU_COOKIES_JSON = None

# Reddit
REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None)
REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None)
REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None)
REDDIT_USERNAME = env.get("REDDIT_USERNAME", None)

# AWS storage
AWS_STORAGE_ON = get_env_bool(env, "AWS_STORAGE_ON", False)
AWS_ACCESS_KEY_ID = env.get("AWS_ACCESS_KEY_ID", None)
@@ -155,28 +47,13 @@
AWS_DOMAIN_HOST = env.get("AWS_DOMAIN_HOST", None)
if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_S3_BUCKET_NAME):
    AWS_STORAGE_ON = False

# Inoreader
INOREADER_APP_ID = env.get("INOREADER_APP_ID", None)
INOREADER_APP_KEY = env.get("INOREADER_APP_KEY", None)
INOREADER_EMAIL = env.get("INOREADER_EMAIL", None)
INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None)

# Open AI API
OPENAI_API_KEY = env.get("OPENAI_API_KEY", None)

# General webpage scraping
GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False)
GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL")

# Firecrawl API
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering
FIRECRAWL_USE_JSON_EXTRACTION = get_env_bool(env, "FIRECRAWL_USE_JSON_EXTRACTION", False)


# Zyte API
ZYTE_API_KEY = env.get("ZYTE_API_KEY", None)

# Locale directories environment variables
localedir = os.path.join(os.path.dirname(__file__), "locale")
translation = gettext.translation("messages", localedir=localedir, fallback=True)
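
config.py reads boolean flags through `get_env_bool(env, key, default)`. A sketch of that calling convention, assuming typical truthy-string parsing; the real implementation lives in `fastfetchbot_shared.utils.parse` and may differ:

```python
# Sketch of the helper's contract as used throughout config.py; the real
# implementation is in fastfetchbot_shared.utils.parse and may differ.
from typing import Mapping


def get_env_bool(env: Mapping[str, str], key: str, default: bool) -> bool:
    """Interpret an environment variable like "1"/"true"/"yes" as a boolean."""
    raw = env.get(key)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}
```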
(next file: name not shown in this view)
@@ -10,7 +10,7 @@
from src.config import DOWNLOAD_VIDEO_TIMEOUT
from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html
from fastfetchbot_shared.utils.logger import logger
from src.config import JINJA2_ENV
from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV

video_info_template = JINJA2_ENV.get_template("video_info.jinja2")

apps/api/src/services/scrapers/common.py (69 changes: 15 additions & 54 deletions)
@@ -3,32 +3,21 @@
from src.models.database_model import Metadata
from fastfetchbot_shared.models.url_metadata import UrlMetadata
from fastfetchbot_shared.models.metadata_item import MessageType
from src.services import (
    telegraph,
    inoreader
)
from fastfetchbot_shared.services.scrapers.common import InfoExtractService as CoreInfoExtractService
from fastfetchbot_shared.services.telegraph import Telegraph
from src.services.file_export import video_download, document_export
from src.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads
from src.services.scrapers.scraper_manager import ScraperManager
from src.database import save_instances
from fastfetchbot_shared.utils.logger import logger
from src.config import DATABASE_ON


class InfoExtractService(object):
class InfoExtractService(CoreInfoExtractService):
    """API-layer service that adds Telegraph, PDF export, DB storage, and video download."""

    service_classes: dict = {
        "twitter": twitter.Twitter,
        "threads": threads.Threads,
        "reddit": reddit.Reddit,
        "weibo": weibo.Weibo,
        "wechat": wechat.Wechat,
        "instagram": instagram.Instagram,
        "douban": douban.Douban,
        "zhihu": zhihu.Zhihu,
        "xiaohongshu": xiaohongshu.Xiaohongshu,
        **CoreInfoExtractService.service_classes,
        "youtube": video_download.VideoDownloader,
        "bilibili": video_download.VideoDownloader,
        "inoreader": inoreader.Inoreader,
    }

    def __init__(
@@ -40,49 +29,21 @@ def __init__(
        store_document: Optional[bool] = False,
        **kwargs,
    ):
        url_metadata = url_metadata.to_dict()
        self.url = url_metadata["url"]
        self.content_type = url_metadata["content_type"]
        self.source = url_metadata["source"]
        self.data = data
        self.kwargs = kwargs
        self.store_database = store_database
        self.store_telegraph = store_telegraph
        self.store_document = store_document

    @property
    def category(self) -> str:
        return self.source

    async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
        if self.content_type == "video":
            if not self.kwargs.get("category"):
                self.kwargs["category"] = self.category
        if not metadata_item:
            try:
                if self.category in ["bluesky", "weibo", "other", "unknown"]:  # it is a workaround before the code refactor
                    await ScraperManager.init_scraper(self.category)
                    item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url)
                    metadata_item = await item_data_processor.get_item()
                else:
                    scraper_item = InfoExtractService.service_classes[self.category](
                        url=self.url, data=self.data, **self.kwargs
                    )
                    metadata_item = await scraper_item.get_item()
            except Exception as e:
                logger.error(f"Error while getting item: {e}")
                raise e
        logger.info(f"Got metadata item")
        logger.debug(metadata_item)
        metadata_item = await self.process_item(metadata_item)
        return metadata_item
        super().__init__(
            url_metadata,
            data=data,
            store_database=store_database,
            store_telegraph=store_telegraph,
            store_document=store_document,
            **kwargs,
        )

    async def process_item(self, metadata_item: dict) -> dict:
        if metadata_item.get("message_type") == MessageType.LONG:
            self.store_telegraph = True
            logger.info("message type is long, store in telegraph")
        if self.store_telegraph:
            telegraph_item = telegraph.Telegraph.from_dict(metadata_item)
            telegraph_item = Telegraph.from_dict(metadata_item)
            try:
                telegraph_url = await telegraph_item.get_telegraph()
            except Exception as e:
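
The refactor keeps platform scrapers in the shared package and layers Telegraph, PDF, database, and video concerns on top via this subclass. A hedged usage sketch; the `UrlMetadata` constructor arguments and example values are assumptions, not taken from the diff:

```python
# Hedged usage sketch of the refactored InfoExtractService; constructor
# arguments for UrlMetadata and the example URL are assumptions.
from fastfetchbot_shared.models.url_metadata import UrlMetadata
from src.services.scrapers.common import InfoExtractService


async def demo() -> dict:
    url_metadata = UrlMetadata(
        url="https://example.com/post/1",
        content_type="social_media",
        source="twitter",
    )
    # Scraping itself is inherited from CoreInfoExtractService; this subclass's
    # process_item override adds Telegraph publishing and document export.
    service = InfoExtractService(url_metadata, store_telegraph=True)
    return await service.get_item()
```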