From 5dc7648a208c3532dfbc0903f2294a4e7756252f Mon Sep 17 00:00:00 2001 From: aturret Date: Wed, 18 Feb 2026 01:41:36 -0600 Subject: [PATCH 1/8] feat: keep old app folder --- app/config.py | 2 +- app/models/classes.py | 19 +- app/models/metadata_item.py | 135 +--- app/models/telegraph_item.py | 65 +- app/models/url_metadata.py | 57 +- app/routers/feed_push.py | 53 -- app/services/inoreader/telegram_process.py | 23 +- app/services/telegram_bot/__init__.py | 696 +------------------- app/services/telegram_bot/handlers.py | 359 ++++++++++ app/services/telegram_bot/message_sender.py | 345 ++++++++++ app/utils/config.py | 62 +- app/utils/image.py | 55 +- app/utils/logger.py | 20 +- app/utils/network.py | 215 +----- app/utils/parse.py | 240 +------ 15 files changed, 810 insertions(+), 1536 deletions(-) delete mode 100644 app/routers/feed_push.py create mode 100644 app/services/telegram_bot/handlers.py create mode 100644 app/services/telegram_bot/message_sender.py diff --git a/app/config.py b/app/config.py index 50fd18d..ca0db5d 100644 --- a/app/config.py +++ b/app/config.py @@ -6,7 +6,7 @@ import gettext import secrets -from app.utils.parse import get_env_bool +from fastfetchbot_shared.utils.parse import get_env_bool env = os.environ current_directory = os.path.dirname(os.path.abspath(__file__)) diff --git a/app/models/classes.py b/app/models/classes.py index 2fab80a..e89bcf7 100644 --- a/app/models/classes.py +++ b/app/models/classes.py @@ -1,17 +1,2 @@ -from io import BytesIO - - -class NamedBytesIO(BytesIO): - @property - def name(self): - return self._name - - def __init__(self, content=None, name=None): - super().__init__(content) - self._name = name - if content is not None: - self.size = self.getbuffer().nbytes - - @name.setter - def name(self, value): - self._name = value +# Re-export from shared package +from fastfetchbot_shared.models.classes import NamedBytesIO # noqa: F401 diff --git a/app/models/metadata_item.py b/app/models/metadata_item.py index 
6b5820d..5bce9fa 100644 --- a/app/models/metadata_item.py +++ b/app/models/metadata_item.py @@ -1,123 +1,12 @@ -from dataclasses import dataclass -from enum import Enum, unique -from typing import Any, List, TypeVar, Callable, Type, cast, Union, Optional - -from pydantic import BaseModel - -""" -MetadataItem is a dataclass that represents a single item for our services. It would be saved in the database. -The MetadataItem is used to send to the telegram bot. Users can use the metadata to define their own message template. -If the program doesn't find the attribute in the dict_data, it will use the default value in case of KeyError. -""" - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - if x is None: - return "" - assert isinstance(x, str) - return x - - -def from_list(f: Callable[[Any], T], x: Any) -> List[T]: - assert isinstance(x, list) - return [f(y) for y in x] - - -def to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@unique -class MessageType(str, Enum): - SHORT = "short" - LONG = "long" - - -@dataclass -class MediaFile: - media_type: str - url: str - original_url: Optional[str] = None - caption: Optional[str] = None - - @staticmethod - def from_dict(obj: Any) -> "MediaFile": - assert isinstance(obj, dict) - media_type = from_str(obj.get("media_type")) - url = from_str(obj.get("url")) - caption = from_str(obj.get("caption")) - return MediaFile(media_type, url, caption) - - def to_dict(self) -> dict: - result: dict = {} - result["media_type"] = from_str(self.media_type) - result["url"] = from_str(self.url) - result["caption"] = self.caption - return result - - -@dataclass -class MetadataItem: - url: str - telegraph_url: Optional[str] - content: Optional[str] - text: Optional[str] - media_files: List[MediaFile] - author: str - title: str - author_url: Optional[str] - category: str - message_type: Optional[MessageType] - - @staticmethod - def from_dict(obj: Any) -> "MetadataItem": - assert isinstance(obj, 
dict) - url = from_str(obj.get("url")) - telegraph_url = from_str(obj.get("telegraph_url")) - content = from_str(obj.get("content")) - text = from_str(obj.get("text")) - media_files = from_list(MediaFile.from_dict, obj.get("media_files")) - author = from_str(obj.get("author")) - title = from_str(obj.get("title")) - author_url = from_str(obj.get("author_url")) - category = from_str(obj.get("category")) - message_type = MessageType(obj.get("message_type")) - return MetadataItem( - url, - telegraph_url, - content, - text, - media_files, - author, - title, - author_url, - category, - message_type, - ) - - def to_dict(self) -> dict: - result: dict = { - "url": from_str(self.url), - "telegraph_url": "", "content": from_str(self.content), - "text": from_str(self.text), - "media_files": from_list( - lambda x: to_class(MediaFile, x), self.media_files - ), - "author": from_str(self.author), - "title": from_str(self.title), - "author_url": from_str(self.author_url), - "category": from_str(self.category), - "message_type": self.message_type.value - } - return result - - -def metadata_item_from_dict(s: Any) -> MetadataItem: - return MetadataItem.from_dict(s) - - -def metadata_item_to_dict(x: MetadataItem) -> Any: - return to_class(MetadataItem, x) +# Re-export from shared package +from fastfetchbot_shared.models.metadata_item import * # noqa: F401,F403 +from fastfetchbot_shared.models.metadata_item import ( # noqa: F401 + MetadataItem, + MediaFile, + MessageType, + from_str, + from_list, + to_class, + metadata_item_from_dict, + metadata_item_to_dict, +) diff --git a/app/models/telegraph_item.py b/app/models/telegraph_item.py index 04d5b77..2b4b2f0 100644 --- a/app/models/telegraph_item.py +++ b/app/models/telegraph_item.py @@ -1,58 +1,7 @@ -from dataclasses import dataclass -from typing import Any, TypeVar, Type, cast - -""" -The TelegraphItem is a class for generating a Telegraph page. 
-If the program doesn't find the attribute in the dict_data, it will use the default value in case of KeyError. -""" - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - assert isinstance(x, str) - return x - - -def to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@dataclass -class TelegraphItem: - title: str - url: str - author: str - author_url: str - category: str - content: str - - @staticmethod - def from_dict(obj: Any) -> 'TelegraphItem': - assert isinstance(obj, dict) - title = from_str(obj.get("title")) - url = from_str(obj.get("url")) - author = from_str(obj.get("author")) - author_url = from_str(obj.get("author_url")) - category = from_str(obj.get("category")) - content = from_str(obj.get("content")) - return TelegraphItem(title, url, author, author_url, category, content) - - def to_dict(self) -> dict: - result: dict = {} - result["title"] = from_str(self.title) - result["url"] = from_str(self.url) - result["author"] = from_str(self.author) - result["author_url"] = from_str(self.author_url) - result["category"] = from_str(self.category) - result["content"] = from_str(self.content) - return result - - -def telegraph_item_from_dict(s: Any) -> TelegraphItem: - return TelegraphItem.from_dict(s) - - -def telegraph_item_to_dict(x: TelegraphItem) -> Any: - return to_class(TelegraphItem, x) +# Re-export from shared package +from fastfetchbot_shared.models.telegraph_item import * # noqa: F401,F403 +from fastfetchbot_shared.models.telegraph_item import ( # noqa: F401 + TelegraphItem, + telegraph_item_from_dict, + telegraph_item_to_dict, +) diff --git a/app/models/url_metadata.py b/app/models/url_metadata.py index a581045..020d120 100644 --- a/app/models/url_metadata.py +++ b/app/models/url_metadata.py @@ -1,50 +1,7 @@ -import re -from dataclasses import dataclass -from typing import Any, TypeVar, Type, cast - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - assert isinstance(x, str) - return x - - -def 
to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@dataclass -class UrlMetadata: - url: str - source: str - content_type: str - - def __init__(self, url: str, source: str, content_type: str) -> None: - self.url = url - self.source = source - self.content_type = content_type - - @staticmethod - def from_dict(obj: Any) -> "UrlMetadata": - assert isinstance(obj, dict) - url = from_str(obj.get("url")) - source = from_str(obj.get("source")) - the_type = from_str(obj.get("type")) - return UrlMetadata(url, source, the_type) - - def to_dict(self) -> dict: - result: dict = {} - result["url"] = from_str(self.url) - result["source"] = from_str(self.source) - result["content_type"] = from_str(self.content_type) - return result - - -def url_metadata_from_dict(s: Any) -> UrlMetadata: - return UrlMetadata.from_dict(s) - - -def url_metadata_to_dict(x: UrlMetadata) -> Any: - return to_class(UrlMetadata, x) +# Re-export from shared package +from fastfetchbot_shared.models.url_metadata import * # noqa: F401,F403 +from fastfetchbot_shared.models.url_metadata import ( # noqa: F401 + UrlMetadata, + url_metadata_from_dict, + url_metadata_to_dict, +) diff --git a/app/routers/feed_push.py b/app/routers/feed_push.py deleted file mode 100644 index 997fdb1..0000000 --- a/app/routers/feed_push.py +++ /dev/null @@ -1,53 +0,0 @@ -# TODO: this script is now unused, will be removed in the future - -from fastapi import APIRouter -from fastapi.requests import Request - -from app.config import TELEGRAM_CHANNEL_ID -from app.services.telegram_bot import send_item_message -from app.services.scrapers.common import InfoExtractService -from fastapi import Security -from app.auth import verify_api_key -from app.utils.logger import logger -from app.utils.parse import get_url_metadata - -router = APIRouter(prefix="/feedPush") - - -async def get_feed_item(url: str, channel_id: str, **kwargs): - try: - channel_id = int(channel_id) if channel_id.startswith("-") 
else channel_id - url_metadata = await get_url_metadata(url) - item = InfoExtractService(url_metadata, **kwargs) - metadata_item = await item.get_item() - if channel_id not in TELEGRAM_CHANNEL_ID: - logger.error(f"channel_id {channel_id} not found") - return - await send_item_message(metadata_item, chat_id=channel_id) - except Exception as e: - logger.error(f"Error while getting item: {e}") - - -@router.post("/", dependencies=[Security(verify_api_key)]) -async def push_feed_item( - request: Request, -): - try: - data = await request.json() - params = request.query_params - url = ( - data.get("url") - or data.get("aurl") - or params.get("url") - or params.get("aurl") - ) - if not url: - return f"Error: url is required" - channel_id = data.get("channelId") or params.get("channelId") - if not channel_id: - return f"Error: channelId is required" - kwargs = data.get("kwargs", {}) - await get_feed_item(url, channel_id, **kwargs) - return "ok" - except Exception as e: - return f"Error: {e}" diff --git a/app/services/inoreader/telegram_process.py b/app/services/inoreader/telegram_process.py index a1102f5..975e894 100644 --- a/app/services/inoreader/telegram_process.py +++ b/app/services/inoreader/telegram_process.py @@ -1,22 +1,34 @@ -from typing import Union, Optional, Dict +from typing import Union, Optional, Dict, Callable, Awaitable from app.config import TELEGRAM_CHANNEL_ID from app.models.url_metadata import UrlMetadata from app.services.inoreader import Inoreader from app.services.scrapers.common import InfoExtractService -from app.services.telegram_bot import send_item_message from app.utils.logger import logger from app.utils.parse import get_url_metadata, get_bool default_telegram_channel_id = TELEGRAM_CHANNEL_ID[0] if TELEGRAM_CHANNEL_ID else None +# Type alias for the message callback +MessageCallback = Callable[[dict, Union[int, str]], Awaitable[None]] + + +async def _default_message_callback(metadata_item: dict, chat_id: Union[int, str]) -> None: + """Default 
callback that sends via Telegram bot. Used when no callback is provided.""" + from app.services.telegram_bot import send_item_message + await send_item_message(metadata_item, chat_id=chat_id) + async def process_inoreader_data( data: list, use_inoreader_content: bool, telegram_channel_id: Union[int, str] = default_telegram_channel_id, stream_id: str = None, + message_callback: MessageCallback = None, ): + if message_callback is None: + message_callback = _default_message_callback + for item in data: url_type_item = await get_url_metadata(item["aurl"]) url_type_dict = url_type_item.to_dict() @@ -46,7 +58,7 @@ async def process_inoreader_data( store_document=True, ) message_metadata_item = await metadata_item.get_item() - await send_item_message(message_metadata_item, chat_id=telegram_channel_id) + await message_callback(message_metadata_item, telegram_channel_id) if stream_id: await Inoreader.mark_all_as_read( stream_id=stream_id, timestamp=item["timestamp"] - 1 @@ -57,7 +69,7 @@ async def get_inoreader_item_async( data: Optional[Dict] = None, trigger: bool = False, params: Optional[Dict] = None, - # filters: Optional[Dict] = None, + message_callback: MessageCallback = None, ) -> None: stream_id = None use_inoreader_content = True @@ -83,7 +95,8 @@ async def get_inoreader_item_async( if type(data) is dict: data = [data] await process_inoreader_data( - data, use_inoreader_content, telegram_channel_id, stream_id + data, use_inoreader_content, telegram_channel_id, stream_id, + message_callback=message_callback, ) if stream_id: await Inoreader.mark_all_as_read(stream_id=stream_id) diff --git a/app/services/telegram_bot/__init__.py b/app/services/telegram_bot/__init__.py index 5de80d8..0fe696e 100755 --- a/app/services/telegram_bot/__init__.py +++ b/app/services/telegram_bot/__init__.py @@ -1,36 +1,15 @@ # TODO: Implement Telegram Service # example: https://docs.python-telegram-bot.org/en/stable/examples.customwebhookbot.html -import asyncio -import html -import json 
-import os import mimetypes -import aiofiles -import traceback -from io import BytesIO -from urllib.parse import urlparse -from urllib.request import url2pathname -from typing import Union mimetypes.init() from telegram import ( Update, MessageEntity, - InlineKeyboardButton, - InlineKeyboardMarkup, - Message, - InputMediaPhoto, - InputMediaVideo, - InputMediaDocument, - InputMediaAnimation, - InputMediaAudio, ) -from telegram.constants import ParseMode from telegram.ext import ( Application, - CallbackContext, - ContextTypes, MessageHandler, CallbackQueryHandler, filters, @@ -38,49 +17,31 @@ AIORateLimiter, ) -from app.database import save_instances -from app.models.metadata_item import MessageType -from app.models.telegram_chat import TelegramMessage, TelegramUser, TelegramChat -from app.services.scrapers.common import InfoExtractService -from app.utils.parse import get_url_metadata, telegram_message_html_trim -from app.utils.network import download_file_by_metadata_item -from app.utils.image import Image, image_compressing, check_image_type -from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS from app.utils.logger import logger from app.config import ( TELEGRAM_BOT_TOKEN, TELEGRAM_WEBHOOK_URL, TELEGRAM_BOT_SECRET_TOKEN, - TELEGRAM_CHANNEL_ID, - TELEGRAM_CHANNEL_ADMIN_LIST, - TELEBOT_DEBUG_CHANNEL, TELEBOT_API_SERVER, TELEBOT_API_SERVER_FILE, TELEBOT_LOCAL_FILE_MODE, TELEBOT_CONNECT_TIMEOUT, TELEBOT_READ_TIMEOUT, TELEBOT_WRITE_TIMEOUT, - TELEGRAM_IMAGE_DIMENSION_LIMIT, - TELEGRAM_IMAGE_SIZE_LIMIT, - TELEGRAM_GROUP_MESSAGE_BAN_LIST, - TELEGRAM_BOT_MESSAGE_BAN_LIST, - FILE_EXPORTER_ON, - JINJA2_ENV, - OPENAI_API_KEY, - DATABASE_ON, - TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, GENERAL_SCRAPING_ON, + TELEBOT_MAX_RETRY, ) -from app.services.telegram_bot.config import ( - HTTPS_URL_REGEX, - TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT, - TELEGRAM_FILE_UPLOAD_LIMIT, - TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API, - REFERER_REQUIRED, - TELEGRAM_TEXT_LIMIT, - 
TEMPLATE_TRANSLATION, + +# Re-export for external consumers +from app.services.telegram_bot.message_sender import send_item_message # noqa: F401 +from app.services.telegram_bot.handlers import ( # noqa: F401 + https_url_process, + https_url_auto_process, + all_messages_process, + buttons_process, + invalid_buttons, + error_process, + content_process_function, ) -from app.models.classes import NamedBytesIO -from app.models.url_metadata import UrlMetadata """ application and handlers initialization @@ -112,12 +73,6 @@ async def set_webhook() -> bool: else: logger.error("TELEGRAM_BOT_TOKEN is not set!") -environment = JINJA2_ENV -template = environment.get_template("social_media_message.jinja2") -template_text = TEMPLATE_TRANSLATION.get( - TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"] -) - async def startup() -> None: await application.initialize() @@ -187,630 +142,3 @@ async def process_telegram_update( update = Update.de_json(data=data, bot=application.bot) application.bot.insert_callback_data(update) await application.update_queue.put(update) - - -async def https_url_process(update: Update, context: CallbackContext) -> None: - message = update.message - welcome_message = await message.reply_text( - text="Processing...", - ) - url_dict: dict = message.parse_entities(types=["url"]) - await welcome_message.delete() - for i, url in enumerate(url_dict.values()): - process_message = await message.reply_text( - text=f"Processing the {i + 1}th url...", - ) - url_metadata = await get_url_metadata(url, ban_list=TELEGRAM_BOT_MESSAGE_BAN_LIST) - if url_metadata.source == "banned": - await process_message.edit_text( - text=f"For the {i + 1} th url, the url is banned." - ) - return - if url_metadata.source == "unknown": - if GENERAL_SCRAPING_ON: - await process_message.edit_text( - text=f"Uncategorized url found. General webpage parser is on, Processing..." 
- ) - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id - ) - await process_message.edit_text( - text=f"For the {i + 1} th url, no supported url found." - ) - return - else: - await process_message.edit_text( - text=f"{url_metadata.source} url found. Processing..." - ) - # create the inline keyboard - special_function_keyboard = [] - basic_function_keyboard = [] - if TELEGRAM_CHANNEL_ID and ( - TELEGRAM_CHANNEL_ADMIN_LIST - and str(message.from_user.id) in TELEGRAM_CHANNEL_ADMIN_LIST - ): - special_function_keyboard.append( - InlineKeyboardButton( - "Send to Channel", - callback_data={ - "type": "channel", - "metadata": url_metadata, - "extra_args": {"store_document": True}, - }, - ), - ) - # video content url buttons - if url_metadata.content_type == "video": - basic_function_keyboard.extend( - [ - InlineKeyboardButton( - "Get Info", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": {"download": False}, - }, - ), - InlineKeyboardButton( - "Download", - callback_data={ - "type": "video", - "metadata": url_metadata, - }, - ), - ] - ) - if FILE_EXPORTER_ON: - special_function_keyboard.extend( - [ - InlineKeyboardButton( - "Audio Only", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": { - "audio_only": True, - }, - }, - ), - InlineKeyboardButton( - "Download HD", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": {"hd": True}, - }, - ), - ] - ) - if OPENAI_API_KEY: - special_function_keyboard.append( - InlineKeyboardButton( - "Transcribe Text", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": { - "audio_only": True, - "transcribe": True, - "store_document": True, - }, - }, - ), - ) - elif url_metadata.content_type == "social_media": - basic_function_keyboard.extend( - [ - InlineKeyboardButton( - "Send to Me", - callback_data={"type": "private", 
"metadata": url_metadata}, - ), - InlineKeyboardButton( - "Force Send in Chat", - callback_data={"type": "force", "metadata": url_metadata}, - ), - ] - ) - if FILE_EXPORTER_ON: - special_function_keyboard.append( - InlineKeyboardButton( - "Send with PDF", - callback_data={ - "type": "pdf", - "metadata": url_metadata, - "extra_args": {"store_document": True}, - }, - ), - ) - basic_function_keyboard.append( - InlineKeyboardButton( - "Cancel", - callback_data={"type": "cancel"}, - ), - ) - keyboard = [ - special_function_keyboard, - basic_function_keyboard, - ] - reply_markup = InlineKeyboardMarkup(keyboard) - await process_message.reply_text( - f"For the {i + 1}th url: {url}, please choose the function you want to use:", - reply_markup=reply_markup, - ) - await process_message.delete() - - -async def https_url_auto_process(update: Update, context: CallbackContext) -> None: - message = update.message - url_dict = message.parse_entities(types=["url"]) - for i, url in enumerate(url_dict.values()): - url_metadata = await get_url_metadata( - url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST - ) - if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON: - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - elif url_metadata.source == "unknown" or url_metadata.source == "banned": - logger.debug(f"for the {i + 1}th url {url}, no supported url found.") - return - if url_metadata.to_dict().get("source") in SOCIAL_MEDIA_WEBSITE_PATTERNS.keys(): - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - if url_metadata.to_dict().get("source") in VIDEO_WEBSITE_PATTERNS.keys(): - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - - -async def 
all_messages_process(update: Update, context: CallbackContext) -> None: - message = update.message - logger.debug(message) - if message and DATABASE_ON: - telegram_chat = TelegramChat.construct(**message.chat.to_dict()) - telegram_user = TelegramUser.construct(**message.from_user.to_dict()) - telegram_message = TelegramMessage( - datetime=message.date, - chat=telegram_chat, - user=telegram_user, - text=message.text or "", - ) - await save_instances(telegram_message) - - -async def buttons_process(update: Update, context: CallbackContext) -> None: - query = update.callback_query - data = query.data - chat_id = None - if data["type"] == "cancel": - await query.answer("Canceled") - else: - if data["type"] == "private" or data["type"] == "force": - await query.answer("Sending to you...") - if data["type"] == "channel": - if data.get("channel_id") or len(TELEGRAM_CHANNEL_ID) == 1: - channel_chat = await application.bot.get_chat( - chat_id=data.get("channel_id") - if data.get("channel_id") - else TELEGRAM_CHANNEL_ID[0] - ) - await query.answer("Sending to channel...") - if channel_chat.type == "channel": - chat_id = channel_chat.id - else: - await query.message.reply_text( - text="Sorry, the provided channel id does not exist or is not a channel." 
- ) - chat_id = query.message.chat_id - elif len(TELEGRAM_CHANNEL_ID) > 1: - choose_channel_keyboard = await _create_choose_channel_keyboard( - data=data - ) - await query.message.reply_text( - text="Please choose the channel you want to send:", - reply_markup=InlineKeyboardMarkup(choose_channel_keyboard), - ) - await query.message.delete() - context.drop_callback_data(query) - return - else: - chat_id = query.message.chat_id - if data["type"] == "video": - await query.answer("Video processing...") - replying_message = await query.message.reply_text( - text=f"Item processing...", - ) - extra_args = data["extra_args"] if "extra_args" in data else {} - metadata_item = await content_process_function( - url_metadata=data["metadata"], **extra_args - ) - await replying_message.edit_text( - text=f"Item processed. Sending to the target...", - ) - if data["type"] == "force": - metadata_item["message_type"] = MessageType.SHORT - await send_item_message(metadata_item, chat_id=chat_id) - if data["type"] == "channel": - await query.message.reply_text( - text=f"Item sent to the channel.", - ) - await replying_message.delete() - await query.message.delete() - context.drop_callback_data(query) - - -async def _create_choose_channel_keyboard(data: dict) -> list: - choose_channel_keyboard = [] - for i, channel_id in enumerate(TELEGRAM_CHANNEL_ID): - channel_chat = await application.bot.get_chat(chat_id=channel_id) - choose_channel_keyboard.append( - [ - InlineKeyboardButton( - channel_chat.title, - callback_data={ - "type": "channel", - "metadata": data["metadata"], - "extra_args": data["extra_args"], - "channel_id": channel_id, - }, - ) - ] - ) - choose_channel_keyboard.append( - [ - InlineKeyboardButton( - "Cancel", - callback_data={"type": "cancel"}, - ) - ] - ) - return choose_channel_keyboard - - -async def invalid_buttons(update: Update, context: CallbackContext) -> None: - await update.callback_query.answer("Invalid button!") - await update.effective_message.edit_text( - 
"Sorry, Error Occurred, I could not process this button click 😕." - ) - - -async def content_process_function(url_metadata: UrlMetadata, **kwargs) -> dict: - item = InfoExtractService(url_metadata, **kwargs) - metadata_item = await item.get_item() - return metadata_item - - -async def send_item_message( - data: dict, chat_id: Union[int, str] = None, message: Message = None -) -> None: - """ - :param data: (dict) metadata of the item - :param chat_id: (int) any chat id for sending - :param message: (Message) any message to reply - :return: - """ - logger.debug(f"send_item_message: {data}, {chat_id}, {message}") - if not chat_id and not message: - raise ValueError("must provide chat_id or message") - if ( - not chat_id - ) and message: # this function supports direct reply to a message even if the chat_id is None - chat_id = message.chat.id - discussion_chat_id = chat_id - the_chat = await application.bot.get_chat(chat_id=chat_id) - logger.debug(f"the chat of sending message: {the_chat}") - if the_chat.type == "channel" and the_chat.linked_chat_id: - discussion_chat_id = the_chat.linked_chat_id - try: - caption_text = message_formatting(data) - if len(data["media_files"]) > 0: - # if the message type is short and there are some media files, send media group - reply_to_message_id = None - media_message_group, file_message_group = await media_files_packaging( - media_files=data["media_files"], data=data - ) - if ( - len(media_message_group) > 0 - ): # if there are some media groups to send, send it - for i, media_group in enumerate(media_message_group): - caption_text = ( - caption_text - if i == 0 - else f"the {i + 1}th part of the media item:" - ) - logger.debug(f"media group: {media_group}") - logger.debug( - f"caption text: {caption_text},length={len(caption_text)}" - ) - sent_media_files_message = await application.bot.send_media_group( - chat_id=chat_id, - media=media_group, - parse_mode=ParseMode.HTML, - caption=caption_text, - 
write_timeout=TELEBOT_WRITE_TIMEOUT, - reply_to_message_id=message.message_id if message else None, - ) - if sent_media_files_message is tuple: - reply_to_message_id = sent_media_files_message[0].message_id - elif sent_media_files_message is Message: - reply_to_message_id = sent_media_files_message.message_id - logger.debug(f"sent media files message: {sent_media_files_message}") - else: - sent_message = await application.bot.send_message( - chat_id=chat_id, - text=caption_text, - parse_mode=ParseMode.HTML, - reply_to_message_id=message.message_id if message else None, - disable_web_page_preview=True - if data["message_type"] == MessageType.SHORT - else False, - disable_notification=True, - ) - if discussion_chat_id != chat_id: - await asyncio.sleep( - 3 - ) # wait for several seconds to avoid missing the target message - # if the chat is a channel, get the latest pinned message from the channel and reply to it - group_chat = await application.bot.get_chat(chat_id=discussion_chat_id) - logger.debug(f"the group chat: {group_chat}") - pinned_message = group_chat.pinned_message - logger.debug(f"the pinned message: {pinned_message}") - if len(media_message_group) > 0: - if ( - pinned_message.forward_origin.message_id - == sent_media_files_message[-1].message_id - ): - reply_to_message_id = ( - group_chat.pinned_message.id - - len(sent_media_files_message) - + 1 - ) - else: - reply_to_message_id = group_chat.pinned_message.id + 1 - elif pinned_message.forward_origin.message_id == sent_message.message_id: - reply_to_message_id = group_chat.pinned_message.id - else: - reply_to_message_id = group_chat.pinned_message.id + 1 - if ( - len(file_message_group) > 0 - ): # to send files, the files messages should be replied to the message sent before - logger.debug(f"reply_to_message_id: {reply_to_message_id}") - for file_group in file_message_group: - logger.debug(f"file group: {file_group}") - await application.bot.send_media_group( - chat_id=discussion_chat_id, - 
media=file_group, - reply_to_message_id=reply_to_message_id, - parse_mode=ParseMode.HTML, - disable_notification=True, - ) - else: - await application.bot.send_message( - chat_id=chat_id, - text=caption_text, - parse_mode=ParseMode.HTML, - reply_to_message_id=message.message_id if message else None, - disable_web_page_preview=True - if data["message_type"] == "short" - else False, - disable_notification=True, - ) - # except BadRequest as e: - # logger.error(e) - # except RetryAfter as e: - # logger.error(e) - # except TimedOut as e: - # logger.error(e) - # await application.bot.send_message( - # chat_id=discussion_chat_id, - # text="Timed out while sending the item to the target 😕", - # reply_to_message_id=message.message_id if message else None, - # ) - except Exception as e: - logger.error(e) - traceback.print_exc() - # await application.bot.send_message( - # chat_id=discussion_chat_id, - # text="Error occurred while sending the item to the target 😕", - # reply_to_message_id=message.message_id if message else None, - # ) - await send_debug_channel(traceback.format_exc()) - - -async def error_process(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: - logger.error("Exception while handling an update:", exc_info=context.error) - tb_list = traceback.format_exception( - None, context.error, context.error.__traceback__ - ) - tb_string = "".join(tb_list) - update_str = update.to_dict() if isinstance(update, Update) else str(update) - message = ( - f"An exception was raised while handling an update\n" - f"
update = {html.escape(json.dumps(update_str, indent=2, ensure_ascii=False))}"
-        "
\n\n" - f"
context.chat_data = {html.escape(str(context.chat_data))}
\n\n" - f"
context.user_data = {html.escape(str(context.user_data))}
\n\n" - f"
{html.escape(tb_string)}
" - ) - debug_chat_id = update.message.chat_id - if TELEBOT_DEBUG_CHANNEL is not None: - debug_chat_id = TELEBOT_DEBUG_CHANNEL - await context.bot.send_message( - chat_id=debug_chat_id, text=message, parse_mode=ParseMode.HTML - ) - - -async def send_debug_channel(message: str) -> None: - if TELEBOT_DEBUG_CHANNEL is not None: - await application.bot.send_message( - chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML - ) - - -def message_formatting(data: dict) -> str: - """ - Format the message to be sent to the user. - :param data: - :return: text (str) the formatted text for telegram bot api sending message. - """ - if data["message_type"] == "short": - data["text"] = telegram_message_html_trim(data["text"]) - message_template = template - text = message_template.render(data=data, template_text=template_text) - logger.debug(f"message text: \n{text}") - return text - - -async def media_files_packaging(media_files: list, data: dict) -> tuple: - """ - Download the media files from data["media_files"] and package them into a list of media group or file group for - sending them by send_media_group method or send_document method. - :param data: (dict) metadata of the item - :param media_files: (list) a list of media files, - :param caption_text: (str) the caption text - :return: (tuple) a tuple of media group and file group - media_message_group: (list) a list of media items, the type of each item is InputMediaPhoto or InputMediaVideo - file_group: (list) a list of file items, the type of each item is InputFile - TODO: It's not a good practice for this function. This method will still download all the media files even when - media files are too large and it can be memory consuming even if we use a database to store the media files. - The function should be optimized to resolve the media files one group by one group and send each group - immediately after it is resolved. - This processing method should be optimized in the future. 
- """ - media_counter, file_counter = 0, 0 - media_message_group, media_group, file_message_group, file_group = [], [], [], [] - for ( - media_item - ) in media_files: # To traverse all media items in the media files list - # check if we need to create a new media group - if media_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: - # the limitation of media item for a single telegram media group message is 10 - media_message_group.append(media_group) - media_group = [] - media_counter = 0 - if file_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: - # the limitation of media item for a single telegram media group message is 10 - file_message_group.append(file_group) - file_group = [] - file_counter = 0 - if not ( - media_item["media_type"] in ["image", "gif", "video"] - and data["message_type"] == "long" - ): - # check the url validity - url_parser = urlparse(media_item["url"]) - if url_parser.scheme in [ - "http", - "https", - ]: # if the url is a http url, download the file - file_format = "mp4" if media_item["media_type"] == "video" else None - io_object = await download_file_by_metadata_item( - media_item["url"], data=data, file_format=file_format - ) - filename = io_object.name - file_size = io_object.size - else: # if the url is a local file path, just add it to the media group - try: - file_path = url2pathname(media_item["url"]) - async with aiofiles.open(file_path, mode="rb") as f: - filename = os.path.basename(file_path) - content = await f.read() - io_object = NamedBytesIO(content=content, name=filename) - file_size = io_object.size - except Exception as e: # the url is not a valid file path - logger.error(e) - continue - # check the file size - if ( - not TELEBOT_API_SERVER - ): # the official telegram bot api server only supports 50MB file - if file_size > TELEGRAM_FILE_UPLOAD_LIMIT: - # if the size is over 50MB, skip this file - continue - else: - if file_size > TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API: - # for local api sever, if the size is over 2GB, skip 
this file - continue - # check media files' type and process them by their type - if media_item["media_type"] == "image": - image_url = media_item["url"] - ext = await check_image_type(io_object) - # jpg to jpeg, ignore case - if ext.lower() == "jpg": - ext = "JPEG" - io_object.seek(0) - image = Image.open(io_object, formats=[ext]) - img_width, img_height = image.size - ratio = float(max(img_height, img_width)) / float( - min(img_height, img_width) - ) - # don't try to resize image if the ratio is too large - if ( - ratio < 5 - or max(img_height, img_width) < TELEGRAM_IMAGE_DIMENSION_LIMIT - ): - image = image_compressing(image, TELEGRAM_IMAGE_DIMENSION_LIMIT) - with BytesIO() as buffer: - # mime_type file format - image.save(buffer, format=ext) - buffer.seek(0) - resized_ratio = max(image.height, image.width) / min( - image.height, image.width - ) - logger.debug( - f"resized image size: {buffer.getbuffer().nbytes}, ratio: {resized_ratio}, width: {image.width}, height: {image.height}" - ) - media_group.append(InputMediaPhoto(buffer, filename=filename)) - # the image is not able to get json serialized - logger.debug( - f"image size: {file_size}, ratio: {ratio}, width: {img_width}, height: {img_height}" - ) - if ( - file_size > TELEGRAM_IMAGE_SIZE_LIMIT - or img_width > TELEGRAM_IMAGE_DIMENSION_LIMIT - or img_height > TELEGRAM_IMAGE_DIMENSION_LIMIT - ) and data["category"] not in ["xiaohongshu"]: - io_object = await download_file_by_metadata_item( - url=image_url, data=data - ) - if not io_object.name.endswith(".gif"): - if not io_object.name.endswith(ext.lower()): - io_object.name = io_object.name + "." + ext.lower() - # TODO: it is not a good way to judge whether it is a gif... 
- file_group.append( - InputMediaDocument(io_object, parse_mode=ParseMode.HTML) - ) - file_counter += 1 - elif media_item["media_type"] == "gif": - io_object = await download_file_by_metadata_item( - url=media_item["url"], - data=data, - file_name="gif_image-" + str(media_counter) + ".gif", - ) - io_object.name = io_object.name + ".gif" - media_group.append(InputMediaAnimation(io_object)) - elif media_item["media_type"] == "video": - media_group.append(InputMediaVideo(io_object, supports_streaming=True)) - # TODO: not have any services to store audio files for now, just a placeholder - elif media_item["media_type"] == "audio": - media_group.append(InputMediaAudio(io_object)) - elif media_item["media_type"] == "document": - file_group.append( - InputMediaDocument(io_object, parse_mode=ParseMode.HTML) - ) - file_counter += 1 - media_counter += 1 - logger.info( - f"get the {media_counter}th media item,type: {media_item['media_type']}, url: {media_item['url']}" - ) - # check if the media group is empty, if it is, return None - if len(media_group) > 0: # append the last media group - media_message_group.append(media_group) - if len(file_group) > 0: - file_message_group.append(file_group) - return media_message_group, file_message_group diff --git a/app/services/telegram_bot/handlers.py b/app/services/telegram_bot/handlers.py new file mode 100644 index 0000000..73bd5b9 --- /dev/null +++ b/app/services/telegram_bot/handlers.py @@ -0,0 +1,359 @@ +import html +import json +import traceback + +from telegram import ( + Update, + MessageEntity, + InlineKeyboardButton, + InlineKeyboardMarkup, +) +from telegram.constants import ParseMode +from telegram.ext import ( + CallbackContext, + ContextTypes, +) + +from app.database import save_instances +from app.models.metadata_item import MessageType +from app.models.telegram_chat import TelegramMessage, TelegramUser, TelegramChat +from app.models.url_metadata import UrlMetadata +from app.services.scrapers.common import 
InfoExtractService +from app.services.telegram_bot.message_sender import send_item_message +from app.utils.parse import get_url_metadata +from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS +from app.utils.logger import logger +from app.config import ( + TELEGRAM_CHANNEL_ID, + TELEGRAM_CHANNEL_ADMIN_LIST, + TELEBOT_DEBUG_CHANNEL, + TELEGRAM_GROUP_MESSAGE_BAN_LIST, + TELEGRAM_BOT_MESSAGE_BAN_LIST, + FILE_EXPORTER_ON, + OPENAI_API_KEY, + DATABASE_ON, + GENERAL_SCRAPING_ON, +) + + +async def content_process_function(url_metadata: UrlMetadata, **kwargs) -> dict: + item = InfoExtractService(url_metadata, **kwargs) + metadata_item = await item.get_item() + return metadata_item + + +async def https_url_process(update: Update, context: CallbackContext) -> None: + message = update.message + welcome_message = await message.reply_text( + text="Processing...", + ) + url_dict: dict = message.parse_entities(types=["url"]) + await welcome_message.delete() + for i, url in enumerate(url_dict.values()): + process_message = await message.reply_text( + text=f"Processing the {i + 1}th url...", + ) + url_metadata = await get_url_metadata(url, ban_list=TELEGRAM_BOT_MESSAGE_BAN_LIST) + if url_metadata.source == "banned": + await process_message.edit_text( + text=f"For the {i + 1} th url, the url is banned." + ) + return + if url_metadata.source == "unknown": + if GENERAL_SCRAPING_ON: + await process_message.edit_text( + text=f"Uncategorized url found. General webpage parser is on, Processing..." + ) + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id + ) + await process_message.edit_text( + text=f"For the {i + 1} th url, no supported url found." + ) + return + else: + await process_message.edit_text( + text=f"{url_metadata.source} url found. Processing..." 
+ ) + # create the inline keyboard + special_function_keyboard = [] + basic_function_keyboard = [] + if TELEGRAM_CHANNEL_ID and ( + TELEGRAM_CHANNEL_ADMIN_LIST + and str(message.from_user.id) in TELEGRAM_CHANNEL_ADMIN_LIST + ): + special_function_keyboard.append( + InlineKeyboardButton( + "Send to Channel", + callback_data={ + "type": "channel", + "metadata": url_metadata, + "extra_args": {"store_document": True}, + }, + ), + ) + # video content url buttons + if url_metadata.content_type == "video": + basic_function_keyboard.extend( + [ + InlineKeyboardButton( + "Get Info", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": {"download": False}, + }, + ), + InlineKeyboardButton( + "Download", + callback_data={ + "type": "video", + "metadata": url_metadata, + }, + ), + ] + ) + if FILE_EXPORTER_ON: + special_function_keyboard.extend( + [ + InlineKeyboardButton( + "Audio Only", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": { + "audio_only": True, + }, + }, + ), + InlineKeyboardButton( + "Download HD", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": {"hd": True}, + }, + ), + ] + ) + if OPENAI_API_KEY: + special_function_keyboard.append( + InlineKeyboardButton( + "Transcribe Text", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": { + "audio_only": True, + "transcribe": True, + "store_document": True, + }, + }, + ), + ) + elif url_metadata.content_type == "social_media": + basic_function_keyboard.extend( + [ + InlineKeyboardButton( + "Send to Me", + callback_data={"type": "private", "metadata": url_metadata}, + ), + InlineKeyboardButton( + "Force Send in Chat", + callback_data={"type": "force", "metadata": url_metadata}, + ), + ] + ) + if FILE_EXPORTER_ON: + special_function_keyboard.append( + InlineKeyboardButton( + "Send with PDF", + callback_data={ + "type": "pdf", + "metadata": url_metadata, + "extra_args": {"store_document": True}, + }, + 
), + ) + basic_function_keyboard.append( + InlineKeyboardButton( + "Cancel", + callback_data={"type": "cancel"}, + ), + ) + keyboard = [ + special_function_keyboard, + basic_function_keyboard, + ] + reply_markup = InlineKeyboardMarkup(keyboard) + await process_message.reply_text( + f"For the {i + 1}th url: {url}, please choose the function you want to use:", + reply_markup=reply_markup, + ) + await process_message.delete() + + +async def https_url_auto_process(update: Update, context: CallbackContext) -> None: + message = update.message + url_dict = message.parse_entities(types=["url"]) + for i, url in enumerate(url_dict.values()): + url_metadata = await get_url_metadata( + url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST + ) + if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON: + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + elif url_metadata.source == "unknown" or url_metadata.source == "banned": + logger.debug(f"for the {i + 1}th url {url}, no supported url found.") + return + if url_metadata.to_dict().get("source") in SOCIAL_MEDIA_WEBSITE_PATTERNS.keys(): + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + if url_metadata.to_dict().get("source") in VIDEO_WEBSITE_PATTERNS.keys(): + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + + +async def all_messages_process(update: Update, context: CallbackContext) -> None: + message = update.message + logger.debug(message) + if message and DATABASE_ON: + telegram_chat = TelegramChat.construct(**message.chat.to_dict()) + telegram_user = TelegramUser.construct(**message.from_user.to_dict()) + telegram_message = TelegramMessage( + datetime=message.date, + chat=telegram_chat, 
+ user=telegram_user, + text=message.text or "", + ) + await save_instances(telegram_message) + + +async def buttons_process(update: Update, context: CallbackContext) -> None: + from app.services.telegram_bot import application + + query = update.callback_query + data = query.data + chat_id = None + if data["type"] == "cancel": + await query.answer("Canceled") + else: + if data["type"] == "private" or data["type"] == "force": + await query.answer("Sending to you...") + if data["type"] == "channel": + if data.get("channel_id") or len(TELEGRAM_CHANNEL_ID) == 1: + channel_chat = await application.bot.get_chat( + chat_id=data.get("channel_id") + if data.get("channel_id") + else TELEGRAM_CHANNEL_ID[0] + ) + await query.answer("Sending to channel...") + if channel_chat.type == "channel": + chat_id = channel_chat.id + else: + await query.message.reply_text( + text="Sorry, the provided channel id does not exist or is not a channel." + ) + chat_id = query.message.chat_id + elif len(TELEGRAM_CHANNEL_ID) > 1: + choose_channel_keyboard = await _create_choose_channel_keyboard( + data=data + ) + await query.message.reply_text( + text="Please choose the channel you want to send:", + reply_markup=InlineKeyboardMarkup(choose_channel_keyboard), + ) + await query.message.delete() + context.drop_callback_data(query) + return + else: + chat_id = query.message.chat_id + if data["type"] == "video": + await query.answer("Video processing...") + replying_message = await query.message.reply_text( + text=f"Item processing...", + ) + extra_args = data["extra_args"] if "extra_args" in data else {} + metadata_item = await content_process_function( + url_metadata=data["metadata"], **extra_args + ) + await replying_message.edit_text( + text=f"Item processed. 
Sending to the target...", + ) + if data["type"] == "force": + metadata_item["message_type"] = MessageType.SHORT + await send_item_message(metadata_item, chat_id=chat_id) + if data["type"] == "channel": + await query.message.reply_text( + text=f"Item sent to the channel.", + ) + await replying_message.delete() + await query.message.delete() + context.drop_callback_data(query) + + +async def _create_choose_channel_keyboard(data: dict) -> list: + from app.services.telegram_bot import application + + choose_channel_keyboard = [] + for i, channel_id in enumerate(TELEGRAM_CHANNEL_ID): + channel_chat = await application.bot.get_chat(chat_id=channel_id) + choose_channel_keyboard.append( + [ + InlineKeyboardButton( + channel_chat.title, + callback_data={ + "type": "channel", + "metadata": data["metadata"], + "extra_args": data["extra_args"], + "channel_id": channel_id, + }, + ) + ] + ) + choose_channel_keyboard.append( + [ + InlineKeyboardButton( + "Cancel", + callback_data={"type": "cancel"}, + ) + ] + ) + return choose_channel_keyboard + + +async def invalid_buttons(update: Update, context: CallbackContext) -> None: + await update.callback_query.answer("Invalid button!") + await update.effective_message.edit_text( + "Sorry, Error Occurred, I could not process this button click 😕." + ) + + +async def error_process(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: + logger.error("Exception while handling an update:", exc_info=context.error) + tb_list = traceback.format_exception( + None, context.error, context.error.__traceback__ + ) + tb_string = "".join(tb_list) + update_str = update.to_dict() if isinstance(update, Update) else str(update) + message = ( + f"An exception was raised while handling an update\n" + f"
update = {html.escape(json.dumps(update_str, indent=2, ensure_ascii=False))}"
+        "
\n\n" + f"
context.chat_data = {html.escape(str(context.chat_data))}
\n\n" + f"
context.user_data = {html.escape(str(context.user_data))}
\n\n" + f"
{html.escape(tb_string)}
" + ) + debug_chat_id = update.message.chat_id + if TELEBOT_DEBUG_CHANNEL is not None: + debug_chat_id = TELEBOT_DEBUG_CHANNEL + await context.bot.send_message( + chat_id=debug_chat_id, text=message, parse_mode=ParseMode.HTML + ) diff --git a/app/services/telegram_bot/message_sender.py b/app/services/telegram_bot/message_sender.py new file mode 100644 index 0000000..8b60f8f --- /dev/null +++ b/app/services/telegram_bot/message_sender.py @@ -0,0 +1,345 @@ +import asyncio +import os +import traceback +from io import BytesIO +from urllib.parse import urlparse +from urllib.request import url2pathname +from typing import Union + +import aiofiles +from telegram import ( + Message, + InputMediaPhoto, + InputMediaVideo, + InputMediaDocument, + InputMediaAnimation, + InputMediaAudio, +) +from telegram.constants import ParseMode + +from app.models.metadata_item import MessageType +from app.models.classes import NamedBytesIO +from app.utils.parse import telegram_message_html_trim +from app.utils.network import download_file_by_metadata_item +from app.utils.image import Image, image_compressing, check_image_type +from app.utils.logger import logger +from app.config import ( + TELEBOT_API_SERVER, + TELEBOT_WRITE_TIMEOUT, + TELEGRAM_IMAGE_DIMENSION_LIMIT, + TELEGRAM_IMAGE_SIZE_LIMIT, + JINJA2_ENV, + TEMPLATE_LANGUAGE, +) +from app.services.telegram_bot.config import ( + TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT, + TELEGRAM_FILE_UPLOAD_LIMIT, + TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API, + TEMPLATE_TRANSLATION, +) + +environment = JINJA2_ENV +template = environment.get_template("social_media_message.jinja2") +template_text = TEMPLATE_TRANSLATION.get( + TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"] +) + + +def _get_application(): + """Lazy import to avoid circular dependency.""" + from app.services.telegram_bot import application + return application + + +async def send_item_message( + data: dict, chat_id: Union[int, str] = None, message: Message = None +) -> None: + """ + :param data: 
(dict) metadata of the item + :param chat_id: (int) any chat id for sending + :param message: (Message) any message to reply + :return: + """ + application = _get_application() + logger.debug(f"send_item_message: {data}, {chat_id}, {message}") + if not chat_id and not message: + raise ValueError("must provide chat_id or message") + if ( + not chat_id + ) and message: # this function supports direct reply to a message even if the chat_id is None + chat_id = message.chat.id + discussion_chat_id = chat_id + the_chat = await application.bot.get_chat(chat_id=chat_id) + logger.debug(f"the chat of sending message: {the_chat}") + if the_chat.type == "channel" and the_chat.linked_chat_id: + discussion_chat_id = the_chat.linked_chat_id + try: + caption_text = message_formatting(data) + if len(data["media_files"]) > 0: + # if the message type is short and there are some media files, send media group + reply_to_message_id = None + media_message_group, file_message_group = await media_files_packaging( + media_files=data["media_files"], data=data + ) + if ( + len(media_message_group) > 0 + ): # if there are some media groups to send, send it + for i, media_group in enumerate(media_message_group): + caption_text = ( + caption_text + if i == 0 + else f"the {i + 1}th part of the media item:" + ) + logger.debug(f"media group: {media_group}") + logger.debug( + f"caption text: {caption_text},length={len(caption_text)}" + ) + sent_media_files_message = await application.bot.send_media_group( + chat_id=chat_id, + media=media_group, + parse_mode=ParseMode.HTML, + caption=caption_text, + write_timeout=TELEBOT_WRITE_TIMEOUT, + reply_to_message_id=message.message_id if message else None, + ) + if sent_media_files_message is tuple: + reply_to_message_id = sent_media_files_message[0].message_id + elif sent_media_files_message is Message: + reply_to_message_id = sent_media_files_message.message_id + logger.debug(f"sent media files message: {sent_media_files_message}") + else: + sent_message = 
await application.bot.send_message( + chat_id=chat_id, + text=caption_text, + parse_mode=ParseMode.HTML, + reply_to_message_id=message.message_id if message else None, + disable_web_page_preview=True + if data["message_type"] == MessageType.SHORT + else False, + disable_notification=True, + ) + if discussion_chat_id != chat_id: + await asyncio.sleep( + 3 + ) # wait for several seconds to avoid missing the target message + # if the chat is a channel, get the latest pinned message from the channel and reply to it + group_chat = await application.bot.get_chat(chat_id=discussion_chat_id) + logger.debug(f"the group chat: {group_chat}") + pinned_message = group_chat.pinned_message + logger.debug(f"the pinned message: {pinned_message}") + if len(media_message_group) > 0: + if ( + pinned_message.forward_origin.message_id + == sent_media_files_message[-1].message_id + ): + reply_to_message_id = ( + group_chat.pinned_message.id + - len(sent_media_files_message) + + 1 + ) + else: + reply_to_message_id = group_chat.pinned_message.id + 1 + elif pinned_message.forward_origin.message_id == sent_message.message_id: + reply_to_message_id = group_chat.pinned_message.id + else: + reply_to_message_id = group_chat.pinned_message.id + 1 + if ( + len(file_message_group) > 0 + ): # to send files, the files messages should be replied to the message sent before + logger.debug(f"reply_to_message_id: {reply_to_message_id}") + for file_group in file_message_group: + logger.debug(f"file group: {file_group}") + await application.bot.send_media_group( + chat_id=discussion_chat_id, + media=file_group, + reply_to_message_id=reply_to_message_id, + parse_mode=ParseMode.HTML, + disable_notification=True, + ) + else: + await application.bot.send_message( + chat_id=chat_id, + text=caption_text, + parse_mode=ParseMode.HTML, + reply_to_message_id=message.message_id if message else None, + disable_web_page_preview=True + if data["message_type"] == "short" + else False, + disable_notification=True, + ) + 
except Exception as e: + logger.error(e) + traceback.print_exc() + await send_debug_channel(traceback.format_exc()) + + +async def send_debug_channel(message: str) -> None: + from app.config import TELEBOT_DEBUG_CHANNEL + application = _get_application() + if TELEBOT_DEBUG_CHANNEL is not None: + await application.bot.send_message( + chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML + ) + + +def message_formatting(data: dict) -> str: + """ + Format the message to be sent to the user. + :param data: + :return: text (str) the formatted text for telegram bot api sending message. + """ + if data["message_type"] == "short": + data["text"] = telegram_message_html_trim(data["text"]) + message_template = template + text = message_template.render(data=data, template_text=template_text) + logger.debug(f"message text: \n{text}") + return text + + +async def media_files_packaging(media_files: list, data: dict) -> tuple: + """ + Download the media files from data["media_files"] and package them into a list of media group or file group for + sending them by send_media_group method or send_document method. + :param data: (dict) metadata of the item + :param media_files: (list) a list of media files, + :return: (tuple) a tuple of media group and file group + media_message_group: (list) a list of media items, the type of each item is InputMediaPhoto or InputMediaVideo + file_group: (list) a list of file items, the type of each item is InputFile + TODO: It's not a good practice for this function. This method will still download all the media files even when + media files are too large and it can be memory consuming even if we use a database to store the media files. + The function should be optimized to resolve the media files one group by one group and send each group + immediately after it is resolved. + This processing method should be optimized in the future. 
+ """ + media_counter, file_counter = 0, 0 + media_message_group, media_group, file_message_group, file_group = [], [], [], [] + for ( + media_item + ) in media_files: # To traverse all media items in the media files list + # check if we need to create a new media group + if media_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: + # the limitation of media item for a single telegram media group message is 10 + media_message_group.append(media_group) + media_group = [] + media_counter = 0 + if file_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: + # the limitation of media item for a single telegram media group message is 10 + file_message_group.append(file_group) + file_group = [] + file_counter = 0 + if not ( + media_item["media_type"] in ["image", "gif", "video"] + and data["message_type"] == "long" + ): + # check the url validity + url_parser = urlparse(media_item["url"]) + if url_parser.scheme in [ + "http", + "https", + ]: # if the url is a http url, download the file + file_format = "mp4" if media_item["media_type"] == "video" else None + io_object = await download_file_by_metadata_item( + media_item["url"], data=data, file_format=file_format + ) + filename = io_object.name + file_size = io_object.size + else: # if the url is a local file path, just add it to the media group + try: + file_path = url2pathname(media_item["url"]) + async with aiofiles.open(file_path, mode="rb") as f: + filename = os.path.basename(file_path) + content = await f.read() + io_object = NamedBytesIO(content=content, name=filename) + file_size = io_object.size + except Exception as e: # the url is not a valid file path + logger.error(e) + continue + # check the file size + if ( + not TELEBOT_API_SERVER + ): # the official telegram bot api server only supports 50MB file + if file_size > TELEGRAM_FILE_UPLOAD_LIMIT: + # if the size is over 50MB, skip this file + continue + else: + if file_size > TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API: + # for local api sever, if the size is over 2GB, skip 
this file + continue + # check media files' type and process them by their type + if media_item["media_type"] == "image": + image_url = media_item["url"] + ext = await check_image_type(io_object) + # jpg to jpeg, ignore case + if ext.lower() == "jpg": + ext = "JPEG" + io_object.seek(0) + image = Image.open(io_object, formats=[ext]) + img_width, img_height = image.size + ratio = float(max(img_height, img_width)) / float( + min(img_height, img_width) + ) + # don't try to resize image if the ratio is too large + if ( + ratio < 5 + or max(img_height, img_width) < TELEGRAM_IMAGE_DIMENSION_LIMIT + ): + image = image_compressing(image, TELEGRAM_IMAGE_DIMENSION_LIMIT) + with BytesIO() as buffer: + # mime_type file format + image.save(buffer, format=ext) + buffer.seek(0) + resized_ratio = max(image.height, image.width) / min( + image.height, image.width + ) + logger.debug( + f"resized image size: {buffer.getbuffer().nbytes}, ratio: {resized_ratio}, width: {image.width}, height: {image.height}" + ) + media_group.append(InputMediaPhoto(buffer, filename=filename)) + # the image is not able to get json serialized + logger.debug( + f"image size: {file_size}, ratio: {ratio}, width: {img_width}, height: {img_height}" + ) + if ( + file_size > TELEGRAM_IMAGE_SIZE_LIMIT + or img_width > TELEGRAM_IMAGE_DIMENSION_LIMIT + or img_height > TELEGRAM_IMAGE_DIMENSION_LIMIT + ) and data["category"] not in ["xiaohongshu"]: + io_object = await download_file_by_metadata_item( + url=image_url, data=data + ) + if not io_object.name.endswith(".gif"): + if not io_object.name.endswith(ext.lower()): + io_object.name = io_object.name + "." + ext.lower() + # TODO: it is not a good way to judge whether it is a gif... 
+ file_group.append( + InputMediaDocument(io_object, parse_mode=ParseMode.HTML) + ) + file_counter += 1 + elif media_item["media_type"] == "gif": + io_object = await download_file_by_metadata_item( + url=media_item["url"], + data=data, + file_name="gif_image-" + str(media_counter) + ".gif", + ) + io_object.name = io_object.name + ".gif" + media_group.append(InputMediaAnimation(io_object)) + elif media_item["media_type"] == "video": + media_group.append(InputMediaVideo(io_object, supports_streaming=True)) + # TODO: not have any services to store audio files for now, just a placeholder + elif media_item["media_type"] == "audio": + media_group.append(InputMediaAudio(io_object)) + elif media_item["media_type"] == "document": + file_group.append( + InputMediaDocument(io_object, parse_mode=ParseMode.HTML) + ) + file_counter += 1 + media_counter += 1 + logger.info( + f"get the {media_counter}th media item,type: {media_item['media_type']}, url: {media_item['url']}" + ) + # check if the media group is empty, if it is, return None + if len(media_group) > 0: # append the last media group + media_message_group.append(media_group) + if len(file_group) > 0: + file_message_group.append(file_group) + return media_message_group, file_message_group diff --git a/app/utils/config.py b/app/utils/config.py index 2c9b6a3..ad3d691 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -1,55 +1,7 @@ -""" -patterns for check url type -""" -SOCIAL_MEDIA_WEBSITE_PATTERNS = { - "weibo": [ - r"(m\.)?weibo.cn\/(status\/)?[0-9a-zA-Z]+", - r"(www\.)?weibo\.com\/(status\/)?[0-9a-zA-Z]+", - ], - "twitter": [r"(twitter|x)\.com\/[^\/]+\/status\/[0-9]+"], - "instagram": [r"(www\.)?instagram\.com(\/share)?\/(p|reel)\/[A-Za-z0-9_-]+"], - "zhihu": [ - r"(www\.)?zhihu\.com\/question\/[0-9]+\/answer\/[0-9]+", - r"(www\.)?zhihu\.com\/answer\/[0-9]+", - r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+", - r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+", - r"(www\.)?zhihu\.com\/pin\/[0-9]+", - 
r"zhuanlan\.zhihu\.com\/p\/[0-9]+", - ], - "douban": [ - r"(game|music|movie|book)?\.douban\.com\/review\/[0-9]+", - r"((www|m)\.)?douban\.com\/note\/[0-9]+", - r"((www|m)\.)?douban\.com\/people\/[^\/]+\/status\/[0-9]+", - r"((www|m)\.)?douban\.com\/group\/topic\/[0-9]+", - r"((www|m)\.)?douban\.com\/(game|music|movie|book)\/review\/[0-9]+", - ], - "wechat": [r"mp\.weixin\.qq\.com\/s", r"mp\.weixin\.qq\.com\/mp\/appmsg\/show"], - "threads": [r"(www\.)?threads\.net\/@[a-zA-Z0-9]+\/post"], - "xiaohongshu": [ - r"(www\.)?xiaohongshu\.com\/(discovery\/item|explore)\/[0-9a-zA-Z_-]+", - r"(www\.)?xhslink\.com\/[0-9a-zA-Z_-]+", - ], - "reddit": [ - r"(www\.)?reddit\.com\/r\/[a-zA-Z0-9_-]+\/comments\/[a-zA-Z0-9_-]+", - r"(www\.)?reddit\.com\/r\/[a-zA-Z0-9_-]+\/s\/[a-zA-Z0-9_-]+", - ], - "bluesky": [ - r"(www\.)?bsky\.app\/profile/[a-zA-Z0-9\.]+\/post\/[a-zA-Z0-9\-_]+", - ] -} -VIDEO_WEBSITE_PATTERNS = { - "youtube": [ - r"((m|www)\.)youtube\.com\/watch", - r"youtu\.be\/[A-Za-z0-9_-]+", - r"youtube\.com\/shorts\/[A-Za-z0-9_-]+", - ], - "bilibili": [ - r"((www\.)?bilibili\.com\/video\/[A-Za-z0-9]+)", - r"b23\.tv\/[A-Za-z0-9]+", - ], -} -BANNED_PATTERNS = [ - r"chatgpt\.com\/share\/[A-Za-z0-9]+", - r"gemini\/share\/[A-Za-z0-9]+", - r"t\.me\/[A-Za-z0-9]+" -] \ No newline at end of file +# Re-export from shared package +from fastfetchbot_shared.utils.config import * # noqa: F401,F403 +from fastfetchbot_shared.utils.config import ( # noqa: F401 + SOCIAL_MEDIA_WEBSITE_PATTERNS, + VIDEO_WEBSITE_PATTERNS, + BANNED_PATTERNS, +) diff --git a/app/utils/image.py b/app/utils/image.py index 1e0a4af..500afcd 100644 --- a/app/utils/image.py +++ b/app/utils/image.py @@ -1,46 +1,9 @@ -import mimetypes -from io import BytesIO - -import magic -from PIL import Image -import asyncio -from app.config import env - -DEFAULT_IMAGE_LIMITATION = env.get("DEFAULT_IMAGE_LIMITATION", 1600) - - -def get_image_dimension(image_file: str): - image = Image.open(image_file) - return image.size - - -def 
image_compressing(image: Image, limitation: int = DEFAULT_IMAGE_LIMITATION): - new_image = image - if image.size[0] > limitation or image.size[1] > limitation: - if image.size[0] > image.size[1]: - new_image = image.resize( - (limitation, int(image.size[1] * limitation / image.size[0])), - Image.Resampling.LANCZOS, - ) - else: - new_image = image.resize( - (int(image.size[0] * limitation / image.size[1]), limitation), - Image.Resampling.LANCZOS, - ) - return new_image - - -async def check_image_type(io_object: BytesIO): - loop = asyncio.get_running_loop() - mime_type = await loop.run_in_executor( - None, lambda: magic.from_buffer(io_object.read(), mime=True) - ) - if mime_type == "image/webp": - ext = "webp" - else: - ext = mimetypes.guess_extension(mime_type, strict=True) - if ext is None: - ext = "webp" - else: - ext = ext[1:] - return ext +# Re-export from shared package +from fastfetchbot_shared.utils.image import * # noqa: F401,F403 +from fastfetchbot_shared.utils.image import ( # noqa: F401 + Image, + get_image_dimension, + image_compressing, + check_image_type, + DEFAULT_IMAGE_LIMITATION, +) diff --git a/app/utils/logger.py b/app/utils/logger.py index b7e2d46..1d4ac5f 100644 --- a/app/utils/logger.py +++ b/app/utils/logger.py @@ -1,18 +1,2 @@ -import logging -import os - -from loguru import logger - -from app.config import LOG_LEVEL, LOG_FILE_PATH - -log_path = os.path.join(LOG_FILE_PATH, "app.log") - -logger.add( - log_path, - level=LOG_LEVEL, - rotation="1 week", - retention="10 days", - compression="zip", -) -logger.debug(f"Logger initialized with level: {LOG_LEVEL}") -logger.debug(f"Logger initialized with log file path: {log_path}") +# Re-export from shared package +from fastfetchbot_shared.utils.logger import logger # noqa: F401 diff --git a/app/utils/network.py b/app/utils/network.py index ff7ec1f..bb422db 100644 --- a/app/utils/network.py +++ b/app/utils/network.py @@ -1,202 +1,13 @@ -import asyncio -import datetime -import json -import os -import 
uuid -from typing import Optional - -import aiofiles -import httpx -import traceback - -from lxml import etree -from fake_useragent import UserAgent -from playwright.async_api import async_playwright - -from app.models.classes import NamedBytesIO -from app.config import HTTP_REQUEST_TIMEOUT, DOWNLOAD_DIR -from app.utils.image import check_image_type -from app.utils.logger import logger - - -async def get_response( - url: str, headers: dict = None, params: dict = None, client: httpx.AsyncClient = None -) -> httpx.Response: - if headers is None: - headers = HEADERS - if client: - resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT - ) - return resp - else: - async with httpx.AsyncClient() as client: - resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT - ) - return resp - - -async def get_response_json(url: str, headers=None, client: httpx.AsyncClient = None) -> dict: - try: - response = await get_response(url, headers=headers, client=client) - json_result = response.json() - except Exception as e: - print(e, traceback.format_exc()) - json_result = None - return json_result - - - -async def get_selector( - url: str, headers: dict, follow_redirects: bool = True -) -> etree.HTML: - """ - A function to get etree.HTML selector according to url and headers. - We can use this function to do additional parsing works. 
- :param follow_redirects: - :param url: the target webpage url - :param headers: the headers of the request - :return: the selector of the target webpage parsed by etree.HTML - """ - async with httpx.AsyncClient() as client: - resp = await client.get( - url, - headers=headers, - follow_redirects=follow_redirects, - timeout=HTTP_REQUEST_TIMEOUT, - ) - if ( - resp.history - ): # if there is a redirect, the request will have a response chain - print("Request was redirected") - for h in resp.history: - print(h.status_code, h.url) - # if code is 302, do not follow the redirect - if h.status_code == 302: - selector = await get_selector( - h.url, headers=headers, follow_redirects=False - ) - return selector - print("Final destination:", resp.status_code, resp.url) - selector = etree.HTML(resp.text) # the content of the final destination - return selector - - -async def get_redirect_url(url: str, headers: Optional[dict] = None) -> str: - if not headers: - headers = HEADERS - async with httpx.AsyncClient() as client: - resp = await client.get(url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT) - if resp.status_code == 302 or resp.status_code == 301: - return resp.headers["Location"] - else: - return url - - -async def get_content_async(url): - async with async_playwright() as p: - browser = await p.firefox.launch() - context = await browser.new_context(viewport={"width": 1920, "height": 1080}) - page = await context.new_page() - - async def scroll_to_end(page): - # Scrolls to the bottom of the page - await page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) { - document.scrollingElement.scrollTop += 100; // Adjust the scroll amount - await delay(100); // Adjust the delay time - } - } - """) - - async def wait_for_network_idle(): - async with page.expect_response("**/api/content") as response_info: - response = 
await response_info.value - if response.status == 200: - print("Content loaded") - - await page.goto(url) - await wait_for_network_idle() - await scroll_to_end(page) - content = await page.content() - await browser.close() - return content - - -async def download_file_by_metadata_item( - url: str, - data: dict, - file_name: str = None, - file_format: str = None, - headers: dict = None, -) -> NamedBytesIO: - """ - A customized function to download a file from url and return a NamedBytesIO object. - :param file_format: - :param data: - :param url: - :param file_name: - :param headers: - :return: - """ - try: - if headers is None: - headers = HEADERS - headers["User-Agent"] = get_random_user_agent() - headers["referer"] = data["url"] - if data["category"] in ["reddit"]: - headers["Accept"] = "image/avif,image/webp,*/*" - async with httpx.AsyncClient() as client: - response = await client.get( - url=url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT - ) - # if redirect 302, get the final url - if response.status_code == 302 or response.status_code == 301: - url = response.headers["Location"] - file_data = response.content - if file_name is None: - file_format = file_format if file_format else url.split(".")[-1] - file_name = "media-" + str(uuid.uuid1())[:8] + "." + file_format - io_object = NamedBytesIO(file_data, name=file_name) - return io_object - except Exception as e: - await asyncio.sleep(2) - logger.error(f"Failed to download {url}, {e}") - - -async def download_file_to_local( - url: str, - file_path: str = None, - dir_path: str = DOWNLOAD_DIR, - file_name: str = "", - headers: dict = None, - referer: str = None, -) -> str: - io_object = await download_file_by_metadata_item(url=url, data={}, file_name=file_name, headers=headers) - ext = await check_image_type(io_object) - io_object.seek(0) - file_name = file_name + uuid.uuid4().hex + "." 
+ ext - logger.info(f"Downloading {file_name}") - if file_path is None and dir_path is not None: - file_path = os.path.join(dir_path, file_name) - async with aiofiles.open(file_path, "wb") as f: - await f.write(io_object.read()) - return file_path - - -def get_random_user_agent() -> str: - ua = UserAgent() - return ua.random - - -""" -default headers -""" - -HEADERS = { - "User-Agent": get_random_user_agent(), - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", -} +# Re-export from shared package +from fastfetchbot_shared.utils.network import * # noqa: F401,F403 +from fastfetchbot_shared.utils.network import ( # noqa: F401 + get_response, + get_response_json, + get_selector, + get_redirect_url, + get_content_async, + download_file_by_metadata_item, + download_file_to_local, + get_random_user_agent, + HEADERS, +) diff --git a/app/utils/parse.py b/app/utils/parse.py index 53c55e4..8843e7e 100644 --- a/app/utils/parse.py +++ b/app/utils/parse.py @@ -1,224 +1,16 @@ -import datetime -import os -import re -import mimetypes -from typing import Optional -from urllib.parse import urlparse, unquote - -from bs4 import BeautifulSoup - -from app.models.url_metadata import UrlMetadata -from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS - -TELEGRAM_TEXT_LIMIT = 900 - -mimetypes.init() - - -def get_html_text_length(html: str) -> int: - if html is None: - return 0 - soup = BeautifulSoup(html, "html.parser") - text = soup.get_text() - return len(text) - - -def format_telegram_short_text(soup: BeautifulSoup) -> BeautifulSoup: - decompose_list = ["br"] - unwrap_list = ["span", "div", "blockquote", "h2", "ol", "ul"] - new_line_list = ["p", "li"] - for decompose in decompose_list: - for item in soup.find_all(decompose): - item.decompose() - for unwrap in unwrap_list: - for item in soup.find_all(unwrap): - item.unwrap() - for ( - new_line - ) in ( - 
new_line_list - ): # add a new line after each

and

  • tag and then remove the tag(unwrapping) - for item in soup.find_all(new_line): - item.append(BeautifulSoup("
    ", "html.parser")) - item.unwrap() - return soup - - -def unix_timestamp_to_utc(timestamp: int) -> str | None: - if not timestamp: - return None - utc_time = datetime.datetime.utcfromtimestamp(timestamp) - beijing_time = utc_time + datetime.timedelta(hours=8) - return beijing_time.strftime("%Y-%m-%d %H:%M") - - -def second_to_time(second: int) -> str: - m, s = divmod(second, 60) - h, m = divmod(m, 60) - return "{:02d}:{:02d}:{:02d}".format(h, m, s) - - -def string_to_list(string: str, divider: str = ",") -> list: - if string is None: - return [] - return string.split(divider) - - -async def get_url_metadata(url: str, ban_list: Optional[list] = None) -> UrlMetadata: - if not ban_list: - ban_list = [] - url_parser = urlparse(url) - url_main = str(url_parser.hostname) + str(url_parser.path) - source, content_type = "unknown", "unknown" - # check if the url is a social media platform website - for website, patterns in SOCIAL_MEDIA_WEBSITE_PATTERNS.items(): - for pattern in patterns: - if re.search(pattern, url_main): - source = website - content_type = "social_media" - # check if the url is a video website - if source == "unknown": - for website, patterns in VIDEO_WEBSITE_PATTERNS.items(): - for pattern in patterns: - if re.search(pattern, url_main): - source = website - content_type = "video" - # clear the url query - if source not in ["youtube", "bilibili", "wechat"]: - url = url_parser.scheme + "://" + url_parser.netloc + url_parser.path - if source in ban_list: - source = "banned" - content_type = "banned" - else: - for item in BANNED_PATTERNS: - if re.search(item, url): - source = "banned" - content_type = "banned" - break - # TODO: check if the url is from Mastodon, according to the request cookie - return UrlMetadata(url=url, source=source, content_type=content_type) - - -def get_ext_from_url(url: str) -> str: - url_object = urlparse(url) - filename = unquote(url_object.path) - ext = os.path.splitext(filename)[1] - # check if ext in mimetypes.types_map - if 
ext in mimetypes.types_map: - return ext - else: - return None - - -def wrap_text_into_html(text: str, is_html: bool = False) -> str: - if is_html: - soup = BeautifulSoup(text, "html.parser") - for item in soup.find_all("br"): - item.replace_with("\n") - text = str(soup) - text_list = text.split("\n") - text_list = [f"

    {item}

    " for item in text_list if item.strip() != ""] - text = "".join(text_list) - return text - - -def telegram_message_html_trim(html_content: str, trim_length: int = TELEGRAM_TEXT_LIMIT) -> str: - from bs4 import Doctype - - soup = BeautifulSoup(html_content, "html.parser") - - # Remove DOCTYPE declarations - for item in soup.contents: - if isinstance(item, Doctype): - item.extract() - - # Decompose tags that should be removed entirely (with their content) - for tag_name in ["img", "script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]: - for tag in soup.find_all(tag_name): - tag.decompose() - - # Unwrap structural/layout tags — keep their text, discard the wrapper - for tag_name in ["div", "span", "section", "article", "nav", "header", "footer", - "main", "aside", "figure", "figcaption", "html", "body"]: - for tag in soup.find_all(tag_name): - tag.unwrap() - - # Convert headings to bold text with line break - for level in range(1, 7): - for tag in soup.find_all(f"h{level}"): - tag.name = "b" - - # Unwrap

    tags (keep text content) - for tag in soup.find_all("p"): - tag.unwrap() - - html_content = str(soup).strip() - - if len(html_content) <= trim_length: - return html_content - - # Initial trimming - trimmed_content = html_content[:trim_length] - - # Find the position of the last complete tag in the trimmed content - last_complete_pos = trimmed_content.rfind('<') - if last_complete_pos != -1: - trimmed_content = trimmed_content[:last_complete_pos] - - # Remove any incomplete tags by ensuring each tag is closed - cleaned_html = '' - open_tags = [] - - tag_pattern = re.compile(r'<(/?)([a-zA-Z0-9]+)([^>]*)>') - pos = 0 - - while pos < len(trimmed_content): - match = tag_pattern.search(trimmed_content, pos) - if not match: - break - - start, end = match.span() - cleaned_html += trimmed_content[pos:start] - - closing, tag_name, attributes = match.groups() - - if closing: - if open_tags and open_tags[-1] == tag_name: - open_tags.pop() - cleaned_html += match.group(0) - else: - if not attributes.endswith('/'): - open_tags.append(tag_name) - cleaned_html += match.group(0) - - pos = end - - cleaned_html += trimmed_content[pos:] - - # Ensure to close all open tags - for tag in reversed(open_tags): - cleaned_html += f'' - - return cleaned_html + ' ...' 
- - -def get_bool(value: Optional[str], default: bool = True) -> bool: - true_values = ("True", "true", "1", "yes", "on") - false_values = ("False", "false", "0", "no", "off") - - if value is None: - return default - value = value.lower() - - if value in true_values: - return True - elif value in false_values: - return False - else: - return default - - -def get_env_bool(env, var_name: Optional[str], default: bool = False): - """Retrieve environment variable as a boolean.""" - value = env.get(var_name, "").lower() - return get_bool(value, default) +# Re-export from shared package +from fastfetchbot_shared.utils.parse import * # noqa: F401,F403 +from fastfetchbot_shared.utils.parse import ( # noqa: F401 + get_html_text_length, + format_telegram_short_text, + unix_timestamp_to_utc, + second_to_time, + string_to_list, + get_url_metadata, + get_ext_from_url, + wrap_text_into_html, + telegram_message_html_trim, + get_bool, + get_env_bool, + TELEGRAM_TEXT_LIMIT, +) From e4ee85e4b081a71a5ca6f76eec05a7c38ef2b3f7 Mon Sep 17 00:00:00 2001 From: aturret Date: Wed, 18 Feb 2026 01:42:06 -0600 Subject: [PATCH 2/8] feat: refactor the codebase --- apps/api/Dockerfile | 88 ++ apps/api/pyproject.toml | 41 + apps/api/src/__init__.py | 0 apps/api/src/auth.py | 19 + apps/api/src/config.py | 154 ++++ apps/api/src/database.py | 37 + apps/api/src/main.py | 55 ++ apps/api/src/models/__init__.py | 0 apps/api/src/models/database_model.py | 41 + apps/api/src/routers/__init__.py | 0 apps/api/src/routers/inoreader.py | 38 + apps/api/src/routers/scraper.py | 37 + apps/api/src/routers/scraper_routers.py | 6 + apps/api/src/routers/wechat.py | 29 + apps/api/src/services/__init__.py | 0 apps/api/src/services/amazon/__init__.py | 0 apps/api/src/services/amazon/s3.py | 67 ++ apps/api/src/services/file_export/__init__.py | 0 .../file_export/audio_transcribe/__init__.py | 30 + .../file_export/document_export/__init__.py | 10 + .../file_export/document_export/pdf_export.py | 89 ++ 
.../file_export/video_download/__init__.py | 232 +++++ apps/api/src/services/inoreader/__init__.py | 168 ++++ apps/api/src/services/inoreader/process.py | 108 +++ apps/api/src/services/scrapers/__init__.py | 0 .../src/services/scrapers/bluesky/__init__.py | 45 + .../src/services/scrapers/bluesky/config.py | 3 + .../src/services/scrapers/bluesky/scraper.py | 191 +++++ apps/api/src/services/scrapers/common.py | 114 +++ .../src/services/scrapers/douban/__init__.py | 230 +++++ .../src/services/scrapers/general/__init__.py | 40 + .../api/src/services/scrapers/general/base.py | 208 +++++ .../services/scrapers/general/firecrawl.py | 65 ++ .../scrapers/general/firecrawl_client.py | 94 +++ .../src/services/scrapers/general/scraper.py | 86 ++ .../api/src/services/scrapers/general/zyte.py | 78 ++ .../services/scrapers/instagram/__init__.py | 271 ++++++ .../src/services/scrapers/instagram/config.py | 33 + .../src/services/scrapers/reddit/__init__.py | 124 +++ apps/api/src/services/scrapers/scraper.py | 19 + .../src/services/scrapers/scraper_manager.py | 61 ++ .../src/services/scrapers/threads/__init__.py | 191 +++++ .../src/services/scrapers/twitter/__init__.py | 381 +++++++++ .../src/services/scrapers/twitter/config.py | 31 + .../src/services/scrapers/wechat/__init__.py | 102 +++ .../src/services/scrapers/weibo/__init__.py | 54 ++ .../api/src/services/scrapers/weibo/config.py | 5 + .../src/services/scrapers/weibo/scraper.py | 501 +++++++++++ .../services/scrapers/xiaohongshu/__init__.py | 153 ++++ .../scrapers/xiaohongshu/xhs/__init__.py | 2 + .../scrapers/xiaohongshu/xhs/base_crawler.py | 35 + .../scrapers/xiaohongshu/xhs/client.py | 217 +++++ .../services/scrapers/xiaohongshu/xhs/core.py | 225 +++++ .../scrapers/xiaohongshu/xhs/exception.py | 9 + .../scrapers/xiaohongshu/xhs/field.py | 72 ++ .../services/scrapers/xiaohongshu/xhs/help.py | 262 ++++++ .../scrapers/xiaohongshu/xhs/login.py | 132 +++ .../xiaohongshu/xhs/proxy_account_pool.py | 132 +++ 
.../scrapers/xiaohongshu/xhs/utils.py | 146 ++++ .../src/services/scrapers/zhihu/__init__.py | 792 ++++++++++++++++++ .../api/src/services/scrapers/zhihu/config.py | 23 + apps/api/src/services/telegraph/__init__.py | 74 ++ apps/api/src/templates/bluesky_content.jinja2 | 19 + .../templates/bluesky_telegram_text.jinja2 | 1 + apps/api/src/templates/douban_content.jinja2 | 5 + .../src/templates/douban_short_text.jinja2 | 11 + apps/api/src/templates/reddit_content.jinja2 | 7 + .../src/templates/reddit_short_text.jinja2 | 3 + apps/api/src/templates/video_info.jinja2 | 6 + apps/api/src/templates/weibo_content.jinja2 | 11 + .../api/src/templates/weibo_short_text.jinja2 | 5 + .../src/templates/xiaohongshu_content.jinja2 | 10 + .../templates/xiaohongshu_short_text.jinja2 | 2 + apps/api/src/templates/zhihu_content.jinja2 | 47 ++ .../api/src/templates/zhihu_short_text.jinja2 | 11 + apps/telegram-bot/Dockerfile | 50 ++ apps/telegram-bot/core/__init__.py | 0 apps/telegram-bot/core/api_client.py | 34 + apps/telegram-bot/core/config.py | 136 +++ apps/telegram-bot/core/database.py | 37 + apps/telegram-bot/core/handlers/__init__.py | 0 apps/telegram-bot/core/handlers/buttons.py | 118 +++ apps/telegram-bot/core/handlers/messages.py | 58 ++ .../telegram-bot/core/handlers/url_process.py | 225 +++++ apps/telegram-bot/core/main.py | 13 + apps/telegram-bot/core/models/__init__.py | 0 .../core/models/database_model.py | 4 + .../telegram-bot/core/models/telegram_chat.py | 33 + apps/telegram-bot/core/services/__init__.py | 0 apps/telegram-bot/core/services/bot_app.py | 183 ++++ apps/telegram-bot/core/services/constants.py | 40 + .../core/services/message_sender.py | 345 ++++++++ .../templates/social_media_message.jinja2 | 32 + apps/telegram-bot/core/webhook/__init__.py | 0 apps/telegram-bot/core/webhook/server.py | 87 ++ apps/telegram-bot/pyproject.toml | 27 + docker-compose.template.yml | 26 +- .../shared/fastfetchbot_shared/__init__.py | 0 packages/shared/fastfetchbot_shared/config.py | 19 
+ .../fastfetchbot_shared/models/__init__.py | 0 .../fastfetchbot_shared/models/classes.py | 17 + .../models/metadata_item.py | 123 +++ .../models/telegraph_item.py | 58 ++ .../models/url_metadata.py | 50 ++ .../fastfetchbot_shared/utils/__init__.py | 0 .../fastfetchbot_shared/utils/config.py | 55 ++ .../shared/fastfetchbot_shared/utils/image.py | 46 + .../fastfetchbot_shared/utils/logger.py | 17 + .../fastfetchbot_shared/utils/network.py | 200 +++++ .../shared/fastfetchbot_shared/utils/parse.py | 224 +++++ packages/shared/pyproject.toml | 20 + pyproject.toml | 8 +- template.env | 8 + uv.lock | 121 ++- 114 files changed, 8988 insertions(+), 14 deletions(-) create mode 100644 apps/api/Dockerfile create mode 100644 apps/api/pyproject.toml create mode 100644 apps/api/src/__init__.py create mode 100644 apps/api/src/auth.py create mode 100644 apps/api/src/config.py create mode 100644 apps/api/src/database.py create mode 100644 apps/api/src/main.py create mode 100644 apps/api/src/models/__init__.py create mode 100644 apps/api/src/models/database_model.py create mode 100644 apps/api/src/routers/__init__.py create mode 100644 apps/api/src/routers/inoreader.py create mode 100644 apps/api/src/routers/scraper.py create mode 100644 apps/api/src/routers/scraper_routers.py create mode 100644 apps/api/src/routers/wechat.py create mode 100644 apps/api/src/services/__init__.py create mode 100644 apps/api/src/services/amazon/__init__.py create mode 100644 apps/api/src/services/amazon/s3.py create mode 100644 apps/api/src/services/file_export/__init__.py create mode 100644 apps/api/src/services/file_export/audio_transcribe/__init__.py create mode 100644 apps/api/src/services/file_export/document_export/__init__.py create mode 100644 apps/api/src/services/file_export/document_export/pdf_export.py create mode 100644 apps/api/src/services/file_export/video_download/__init__.py create mode 100644 apps/api/src/services/inoreader/__init__.py create mode 100644 
apps/api/src/services/inoreader/process.py create mode 100644 apps/api/src/services/scrapers/__init__.py create mode 100644 apps/api/src/services/scrapers/bluesky/__init__.py create mode 100644 apps/api/src/services/scrapers/bluesky/config.py create mode 100644 apps/api/src/services/scrapers/bluesky/scraper.py create mode 100644 apps/api/src/services/scrapers/common.py create mode 100644 apps/api/src/services/scrapers/douban/__init__.py create mode 100644 apps/api/src/services/scrapers/general/__init__.py create mode 100644 apps/api/src/services/scrapers/general/base.py create mode 100644 apps/api/src/services/scrapers/general/firecrawl.py create mode 100644 apps/api/src/services/scrapers/general/firecrawl_client.py create mode 100644 apps/api/src/services/scrapers/general/scraper.py create mode 100644 apps/api/src/services/scrapers/general/zyte.py create mode 100644 apps/api/src/services/scrapers/instagram/__init__.py create mode 100644 apps/api/src/services/scrapers/instagram/config.py create mode 100644 apps/api/src/services/scrapers/reddit/__init__.py create mode 100644 apps/api/src/services/scrapers/scraper.py create mode 100644 apps/api/src/services/scrapers/scraper_manager.py create mode 100644 apps/api/src/services/scrapers/threads/__init__.py create mode 100644 apps/api/src/services/scrapers/twitter/__init__.py create mode 100644 apps/api/src/services/scrapers/twitter/config.py create mode 100644 apps/api/src/services/scrapers/wechat/__init__.py create mode 100644 apps/api/src/services/scrapers/weibo/__init__.py create mode 100644 apps/api/src/services/scrapers/weibo/config.py create mode 100644 apps/api/src/services/scrapers/weibo/scraper.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/__init__.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/__init__.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/base_crawler.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/client.py create 
mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/core.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/exception.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/field.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/help.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/login.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/proxy_account_pool.py create mode 100644 apps/api/src/services/scrapers/xiaohongshu/xhs/utils.py create mode 100644 apps/api/src/services/scrapers/zhihu/__init__.py create mode 100644 apps/api/src/services/scrapers/zhihu/config.py create mode 100644 apps/api/src/services/telegraph/__init__.py create mode 100644 apps/api/src/templates/bluesky_content.jinja2 create mode 100644 apps/api/src/templates/bluesky_telegram_text.jinja2 create mode 100644 apps/api/src/templates/douban_content.jinja2 create mode 100644 apps/api/src/templates/douban_short_text.jinja2 create mode 100644 apps/api/src/templates/reddit_content.jinja2 create mode 100644 apps/api/src/templates/reddit_short_text.jinja2 create mode 100644 apps/api/src/templates/video_info.jinja2 create mode 100644 apps/api/src/templates/weibo_content.jinja2 create mode 100644 apps/api/src/templates/weibo_short_text.jinja2 create mode 100644 apps/api/src/templates/xiaohongshu_content.jinja2 create mode 100644 apps/api/src/templates/xiaohongshu_short_text.jinja2 create mode 100644 apps/api/src/templates/zhihu_content.jinja2 create mode 100644 apps/api/src/templates/zhihu_short_text.jinja2 create mode 100644 apps/telegram-bot/Dockerfile create mode 100644 apps/telegram-bot/core/__init__.py create mode 100644 apps/telegram-bot/core/api_client.py create mode 100644 apps/telegram-bot/core/config.py create mode 100644 apps/telegram-bot/core/database.py create mode 100644 apps/telegram-bot/core/handlers/__init__.py create mode 100644 apps/telegram-bot/core/handlers/buttons.py create mode 100644 
apps/telegram-bot/core/handlers/messages.py create mode 100644 apps/telegram-bot/core/handlers/url_process.py create mode 100644 apps/telegram-bot/core/main.py create mode 100644 apps/telegram-bot/core/models/__init__.py create mode 100644 apps/telegram-bot/core/models/database_model.py create mode 100644 apps/telegram-bot/core/models/telegram_chat.py create mode 100644 apps/telegram-bot/core/services/__init__.py create mode 100644 apps/telegram-bot/core/services/bot_app.py create mode 100644 apps/telegram-bot/core/services/constants.py create mode 100644 apps/telegram-bot/core/services/message_sender.py create mode 100644 apps/telegram-bot/core/templates/social_media_message.jinja2 create mode 100644 apps/telegram-bot/core/webhook/__init__.py create mode 100644 apps/telegram-bot/core/webhook/server.py create mode 100644 apps/telegram-bot/pyproject.toml create mode 100644 packages/shared/fastfetchbot_shared/__init__.py create mode 100644 packages/shared/fastfetchbot_shared/config.py create mode 100644 packages/shared/fastfetchbot_shared/models/__init__.py create mode 100644 packages/shared/fastfetchbot_shared/models/classes.py create mode 100644 packages/shared/fastfetchbot_shared/models/metadata_item.py create mode 100644 packages/shared/fastfetchbot_shared/models/telegraph_item.py create mode 100644 packages/shared/fastfetchbot_shared/models/url_metadata.py create mode 100644 packages/shared/fastfetchbot_shared/utils/__init__.py create mode 100644 packages/shared/fastfetchbot_shared/utils/config.py create mode 100644 packages/shared/fastfetchbot_shared/utils/image.py create mode 100644 packages/shared/fastfetchbot_shared/utils/logger.py create mode 100644 packages/shared/fastfetchbot_shared/utils/network.py create mode 100644 packages/shared/fastfetchbot_shared/utils/parse.py create mode 100644 packages/shared/pyproject.toml diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile new file mode 100644 index 0000000..ef4d33c --- /dev/null +++ b/apps/api/Dockerfile 
@@ -0,0 +1,88 @@ + +# `python-base` sets up all our shared environment variables +FROM python:3.12-slim AS python-base + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + # uv settings + UV_PROJECT_ENVIRONMENT="/opt/pysetup/.venv" \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + # paths + PYSETUP_PATH="/opt/pysetup" \ + VENV_PATH="/opt/pysetup/.venv" \ + PLAYWRIGHT_BROWSERS_PATH="/opt/playwright-browsers" + +# prepend venv to path +ENV PATH="$VENV_PATH/bin:$PATH" + + +# `builder-base` stage is used to build deps + create our virtual environment +FROM python-base AS builder-base + +# install uv from the official image +COPY --from=ghcr.io/astral-sh/uv:0.10.4 /uv /usr/local/bin/uv + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + curl \ + ffmpeg \ + libmagic1 \ + # deps for weasyprint + libpango-1.0-0 \ + libpangoft2-1.0-0 \ + libjpeg-dev \ + libopenjp2-7-dev \ + libffi-dev \ + build-essential \ + fonts-wqy-microhei \ + fonts-wqy-zenhei \ + fonts-noto-cjk \ + fonts-noto-cjk-extra + +# copy workspace files for dependency resolution +WORKDIR $PYSETUP_PATH +COPY pyproject.toml uv.lock ./ +COPY packages/ packages/ +COPY apps/api/ apps/api/ + +# install runtime deps +RUN uv sync --frozen --no-dev --no-install-project --package fastfetchbot-api + +# install the browser dependencies for playwright +RUN uv run playwright install --with-deps + + +# `production` image used for runtime +FROM python-base AS production +ENV FASTAPI_ENV=production +ENV PYTHONPATH=/app/apps/api:$PYTHONPATH +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + curl \ + ffmpeg \ + libmagic1 \ + # deps for weasyprint + libpango-1.0-0 \ + libpangoft2-1.0-0 \ + libjpeg-dev \ + libopenjp2-7-dev \ + libffi-dev \ + fonts-wqy-microhei \ + fonts-wqy-zenhei \ + fonts-noto-cjk \ + fonts-noto-cjk-extra \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libatspi2.0-0 \ + libxcomposite1 \ + libxdamage1 +COPY --from=builder-base 
api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False)


def verify_key(input_key: str, true_key: str) -> None:
    """Constant-time comparison of a caller-supplied API key.

    Args:
        input_key: key extracted from the request (None when absent,
            because the APIKeyQuery above uses auto_error=False).
        true_key: the configured server-side key.

    Raises:
        HTTPException: 401 when the key is missing or does not match.
    """
    # BUG FIX: the original tested `api_key_query is None`, i.e. the
    # module-level APIKeyQuery *object*, which is never None.  The intent
    # is to reject a missing key; previously a missing key reached
    # secrets.compare_digest(None, ...) and raised TypeError (HTTP 500
    # instead of 401).
    if input_key is None or not secrets.compare_digest(input_key, true_key):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="API Key Invalid"
        )


def verify_api_key(api_key_query: str = Security(api_key_query)) -> None:
    """FastAPI dependency: validate the API key query parameter."""
    verify_key(api_key_query, API_KEY)
env.get("DOWNLOAD_VIDEO_TIMEOUT", 600) + +# Services environment variables +templates_directory = os.path.join(current_directory, "templates") +JINJA2_ENV = Environment( + loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True +) +TEMPLATE_LANGUAGE = env.get( + "TEMPLATE_LANGUAGE", "zh_CN" +) # It is a workaround for translation system + +# X-RapidAPI (for instagram) +X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None) + +# Twitter +TWITTER_EMAIL = env.get("TWITTER_EMAIL", None) +TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None) +TWITTER_USERNAME = env.get("TWITTER_USERNAME", None) +TWITTER_CT0 = env.get("TWITTER_CT0", None) +TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None) +TWITTER_COOKIES = { + "ct0": TWITTER_CT0, + "auth_token": TWITTER_AUTH_TOKEN, +} + +# Bluesky +BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", None) +BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None) + +# Weibo +WEIBO_COOKIES = env.get("WEIBO_COOKIES", None) + +# Xiaohongshu +XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None) +XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None) +XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None) +XIAOHONGSHU_COOKIES = { + "a1": XIAOHONGSHU_A1, + "web_id": XIAOHONGSHU_WEBID, + "web_session": XIAOHONGSHU_WEBSESSION, +} +XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",") +XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",") +XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False) +XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True) + +# Zhihu +FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") + +zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") +if os.path.exists(zhihu_cookie_path): + try: + with open(zhihu_cookie_path, "r") as f: + ZHIHU_COOKIES_JSON = json.load(f) + except json.JSONDecodeError: + print("Error: The file is not in a valid JSON format.") + ZHIHU_COOKIES_JSON = None + except FileNotFoundError: + print("Error: The file does not 
exist.") + ZHIHU_COOKIES_JSON = None +else: + print("Error: We cannot find it.") + ZHIHU_COOKIES_JSON = None + +# Reddit +REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None) +REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None) +REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None) +REDDIT_USERNAME = env.get("REDDIT_USERNAME", None) + +# AWS storage +AWS_STORAGE_ON = get_env_bool(env, "AWS_STORAGE_ON", False) +AWS_ACCESS_KEY_ID = env.get("AWS_ACCESS_KEY_ID", None) +AWS_SECRET_ACCESS_KEY = env.get("AWS_SECRET_ACCESS_KEY", None) +AWS_S3_BUCKET_NAME = env.get("AWS_S3_BUCKET_NAME", "") +AWS_REGION_NAME = env.get("AWS_REGION_NAME", "") +AWS_DOMAIN_HOST = env.get("AWS_DOMAIN_HOST", None) +if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_S3_BUCKET_NAME): + AWS_STORAGE_ON = False +INOREADER_APP_ID = env.get("INOREADER_APP_ID", None) +INOREADER_APP_KEY = env.get("INOREADER_APP_KEY", None) +INOREADER_EMAIL = env.get("INOREADER_EMAIL", None) +INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None) + +# Open AI API +OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) + +# General webpage scraping +GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) +GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") + +# Firecrawl API +FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") +FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") +FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering + + +# Zyte API +ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) + +# Locale directories environment variables +localedir = os.path.join(os.path.dirname(__file__), "locale") +translation = gettext.translation("messages", localedir=localedir, fallback=True) +_ = translation.gettext + +# Utils environment variables +HTTP_REQUEST_TIMEOUT = env.get("HTTP_REQUEST_TIMEOUT", 30) + +# Telegram Bot callback URL (for inter-service communication) +TELEGRAM_BOT_CALLBACK_URL = env.get("TELEGRAM_BOT_CALLBACK_URL", 
"http://telegram-bot:10451") diff --git a/apps/api/src/database.py b/apps/api/src/database.py new file mode 100644 index 0000000..5a4387e --- /dev/null +++ b/apps/api/src/database.py @@ -0,0 +1,37 @@ +from typing import Optional, Union, List + +from motor.motor_asyncio import AsyncIOMotorClient +from beanie import init_beanie, Document, Indexed + +from src.config import MONGODB_URL +from src.models.database_model import document_list +from fastfetchbot_shared.utils.logger import logger + + +async def startup() -> None: + client = AsyncIOMotorClient(MONGODB_URL) + await init_beanie(database=client["telegram_bot"], document_models=document_list) + + +async def shutdown() -> None: + pass + + +async def save_instances(instances: Union[Document, List[Document]], *args) -> None: + if instances is None: + raise TypeError("instances must be a Model or a list of Model") + + if isinstance(instances, Document): + instance_type = type(instances) + await instance_type.insert(instances) + elif isinstance(instances, list): + instance_type = type(instances[0]) + await instance_type.insert_many(instances) + else: + raise TypeError("instances must be a Model or a list of Model") + + for arg in args: + if not isinstance(arg, Document): + raise TypeError("args must be a Model") + instance_type = type(arg) + await instance_type.insert_one(arg) diff --git a/apps/api/src/main.py b/apps/api/src/main.py new file mode 100644 index 0000000..2a712be --- /dev/null +++ b/apps/api/src/main.py @@ -0,0 +1,55 @@ +import sentry_sdk + +from fastapi import FastAPI, Request +from contextlib import asynccontextmanager +from starlette.middleware.base import BaseHTTPMiddleware + +from src import database +from src.routers import inoreader, scraper_routers, scraper +from src.config import DATABASE_ON +from fastfetchbot_shared.utils.logger import logger + +SENTRY_DSN = "" + +# https://docs.sentry.io/platforms/python/guides/fastapi/ +sentry_sdk.init( + dsn=SENTRY_DSN, + # Set traces_sample_rate to 1.0 to 
class Metadata(Document):
    """Scraped-page metadata persisted in MongoDB via Beanie."""

    title: str = Field(default="untitled")
    message_type: MessageType = MessageType.SHORT
    url: str
    author: Optional[str] = None
    author_url: Optional[str] = None
    text: Optional[str] = None
    # BUG FIX: `Field(ge=0)` without a default makes the field *required*
    # at validation time, even though both lengths are computed in the
    # before-insert hook below — so Metadata(**obj) failed whenever the
    # caller omitted them.  Give an explicit None default.
    text_length: Optional[int] = Field(default=None, ge=0)
    content: Optional[str] = None
    content_length: Optional[int] = Field(default=None, ge=0)
    category: Optional[str] = None
    source: Optional[str] = None
    media_files: Optional[list[MediaFile]] = None
    telegraph_url: Optional[str] = None
    # NOTE(review): kept naive-UTC for compatibility with existing stored
    # documents; datetime.utcnow is deprecated in Python 3.12 — consider
    # datetime.now(timezone.utc) once stored data can be migrated.
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    scrape_status: bool = False

    @before_event(Insert)
    def get_text_length(self):
        """Compute plain-text lengths of `text`/`content` just before insert."""
        self.text_length = get_html_text_length(self.text)
        self.content_length = get_html_text_length(self.content)

    @staticmethod
    def from_dict(obj: Any) -> "Metadata":
        """Build a Metadata document from a plain dict (pydantic-validated)."""
        assert isinstance(obj, dict)
        return Metadata(**obj)


# Registered with Beanie at startup (see src/database.py).
document_list = [Metadata]
default_telegram_channel_id) + await process_inoreader_data(data=data, use_inoreader_content=True, telegram_channel_id=telegram_channel_id) + return "ok" diff --git a/apps/api/src/routers/scraper.py b/apps/api/src/routers/scraper.py new file mode 100644 index 0000000..b02be9c --- /dev/null +++ b/apps/api/src/routers/scraper.py @@ -0,0 +1,37 @@ +import asyncio + +from fastapi import APIRouter +from fastapi.requests import Request + +from src.config import API_KEY_NAME +from src.services.scrapers.common import InfoExtractService +from fastapi import Security +from src.auth import verify_api_key +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_url_metadata + +router = APIRouter(prefix="/scraper") + + +@router.post("/getItem", dependencies=[Security(verify_api_key)]) +async def get_item_route(request: Request): + logger.debug("A scraper getItem request received") + query_params = dict(request.query_params) + url = query_params.pop("url") + ban_list = query_params.pop("ban_list", None) + logger.debug(f"get_item_route: url: {url}, query_params: {query_params}") + if API_KEY_NAME in query_params: + query_params.pop(API_KEY_NAME) + url_metadata = await get_url_metadata(url, ban_list) + item = InfoExtractService(url_metadata, **query_params) + result = await item.get_item() + logger.debug(f"getItem result: {result}") + return result + + +@router.post("/getUrlMetadata", dependencies=[Security(verify_api_key)]) +async def get_url_metadata_route(request: Request): + url = request.query_params.get("url") + ban_list = request.query_params.get("ban_list") + url_metadata = await get_url_metadata(url, ban_list) + return url_metadata.to_dict() diff --git a/apps/api/src/routers/scraper_routers.py b/apps/api/src/routers/scraper_routers.py new file mode 100644 index 0000000..66316c7 --- /dev/null +++ b/apps/api/src/routers/scraper_routers.py @@ -0,0 +1,6 @@ +from .wechat import router as wechat_router + + +scraper_routers = [ + 
router = APIRouter(prefix="/wechat")


@router.post("/gzh", dependencies=[Security(verify_api_key)])
async def wechat_gzh_scrape(request: Request):
    """Scrape a WeChat official-account (gzh) article.

    Accepts either a `url` query parameter, or a JSON body that is a full
    UrlMetadata dict.  Returns the scraped item, or an error string when
    neither is provided.
    """
    url = request.query_params.get("url")
    if url:
        url_metadata = UrlMetadata.from_dict({
            "url": url,
            "type": "social_media",
            "source": "wechat",
        })
    else:
        # BUG FIX: Request.json() is a coroutine and was not awaited; the
        # coroutine object is always truthy, so UrlMetadata.from_dict was
        # handed a coroutine instead of the parsed body.
        customized_url_metadata = await request.json()
        if customized_url_metadata:
            url_metadata = UrlMetadata.from_dict(customized_url_metadata)
        else:
            return "url or url metadata not found"
    item = InfoExtractService(url_metadata)
    result = await item.get_item()
    return result
image_url_host = (
    AWS_DOMAIN_HOST
    if AWS_DOMAIN_HOST
    else f"{AWS_S3_BUCKET_NAME}.s3.{AWS_REGION_NAME}.amazonaws.com"
)


async def download_and_upload(url: str, referer: str = None, suite: str = "test") -> str:
    """Download a remote file to local disk, upload it to S3, and clean up.

    Returns the public S3 URL, or "" when the download failed.
    """
    urlparser = urlparse(url)
    file_name = (urlparser.netloc + urlparser.path).replace("/", "-")
    local_path = await download_file_to_local(url=url, referer=referer, file_name=file_name)
    # BUG FIX: the failure check originally ran *after* Path() conversion,
    # and Path("") is PosixPath('.') which is truthy — so a failed download
    # slipped through.  Check the raw return value first.
    if not local_path:
        return ""
    local_path = Path(local_path)
    file_name = local_path.name
    s3_path = await upload(
        suite=suite,
        staging_path=local_path,
        file_name=file_name,
    )
    await aiofiles.os.remove(local_path)
    return s3_path


async def upload(
    staging_path: Path,
    bucket: str = AWS_S3_BUCKET_NAME,
    suite: str = "test",
    release: str = None,
    file_name: str = None,
) -> str:
    """Upload a local file to S3 under ``{suite}/{release}/{file_name}``.

    Returns the (path-quoted) public URL, or "" on failure.
    """
    # BUG FIX: the original default `datetime.now().strftime(...)` was
    # evaluated once at import time, so long-running workers filed every
    # upload under the process start date.  Compute per call instead.
    if release is None:
        release = datetime.now().strftime("%Y-%m-%d")
    if not file_name:
        file_name = uuid.uuid4().hex
    blob_s3_key = f"{suite}/{release}/{file_name}"
    async with session.client("s3") as s3:
        try:
            with staging_path.open("rb") as spfp:
                logger.info(f"Uploading {blob_s3_key}")
                await s3.upload_fileobj(
                    spfp,
                    bucket,
                    blob_s3_key,
                )
            logger.info(f"Uploaded {file_name} to {suite}/{release}")
        except Exception as e:
            logger.error(f"Failed to upload {file_name} to {suite}/{release}, {e}")
            return ""
    image_url = f"https://{image_url_host}/{blob_s3_key}"
    # Quote only the path component so the scheme/host stay intact.
    urlparser = urlparse(image_url)
    quoted_url = urlparser.scheme + "://" + urlparser.netloc + quote(urlparser.path)
    return quoted_url
FILE_EXPORTER_URL, DOWNLOAD_VIDEO_TIMEOUT +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import wrap_text_into_html + +TRANSCRIBE_MODEL = "whisper-1" +SEGMENT_LENGTH = 5 * 60 + + +class AudioTranscribe: + def __init__(self, audio_file: str): + self.audio_file = audio_file + + async def transcribe(self): + return await self._get_audio_text(self.audio_file) + + @staticmethod + async def _get_audio_text(audio_file: str): + async with httpx.AsyncClient() as client: + body = { + "audio_file": audio_file, + "openai_api_key": OPENAI_API_KEY, + } + request_url = FILE_EXPORTER_URL + "/transcribe" + response = await client.post( + url=request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT + ) + transcript = response.json().get("transcript") + return transcript diff --git a/apps/api/src/services/file_export/document_export/__init__.py b/apps/api/src/services/file_export/document_export/__init__.py new file mode 100644 index 0000000..282167d --- /dev/null +++ b/apps/api/src/services/file_export/document_export/__init__.py @@ -0,0 +1,10 @@ +from . 
current_directory = os.path.dirname(os.path.abspath(__file__))

PDF_STYLESHEET = os.path.join(current_directory, "pdf_export.css")


async def upload_file_to_s3(output_filename):
    """Upload an exported document to S3 under the `documents` suite."""
    return await upload_to_s3(
        staging_path=output_filename,
        suite="documents",
        file_name=output_filename.name,
    )


class PdfExport:
    """Render an HTML string to PDF via the external file-exporter service."""

    def __init__(self, title: str, html_string: str = None):
        self.title = title
        self.html_string = html_string

    async def export(self, method: str = "file") -> str:
        """Export to PDF; returns the output filename (S3 URL when enabled).

        Args:
            method: "file" writes the HTML to a shared temp file and sends
                its path; "string" sends the HTML inline.
        """
        body = {
            "method": method
        }
        html_string = self.wrap_html_string(self.html_string)
        html_file = None
        if method == "string":
            # BUG FIX: the original `body["html_string"] = html_string,`
            # had a trailing comma, silently wrapping the payload in a
            # 1-tuple.
            body["html_string"] = html_string
            logger.debug(
                f"""
                html_string: {html_string}
                """
            )
        elif method == "file":
            filename = f"{self.title}-{uuid.uuid4()}.html"
            filename = os.path.join(TEMP_DIR, filename)
            async with aiofiles.open(
                filename, "w", encoding="utf-8"
            ) as f:
                await f.write(html_string)
            html_file = filename
            logger.debug(html_file)
            body["html_file"] = html_file
        output_filename = f"{self.title}-{uuid.uuid4()}.pdf"
        body["output_filename"] = output_filename

        async with httpx.AsyncClient() as client:
            request_url = FILE_EXPORTER_URL + "/pdfExport"
            logger.info(f"requesting pdf export from pdf server: {body}")
            resp = await client.post(
                request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT
            )
            output_filename = resp.json().get("output_filename")
            logger.info(f"pdf export success: {output_filename}")
            # BUG FIX: `html_file` was unbound for method == "string" and
            # aiofiles.os.remove then raised NameError; only remove the
            # temp file when one was actually written.
            if html_file is not None:
                await aiofiles.os.remove(html_file)
            if AWS_STORAGE_ON:
                local_filename = output_filename
                output_filename = await upload_file_to_s3(Path(output_filename))
                await aiofiles.os.remove(local_filename)
            return output_filename

    @staticmethod
    def wrap_html_string(html_string: str) -> str:
        """Wrap a fragment in a minimal document and strip inline styles."""
        # NOTE(review): the skeleton literal was garbled in the patch text
        # (tags stripped); a minimal <html><body> document is required for
        # soup.body below — confirm against the original source.
        soup = BeautifulSoup(
            "<html><head></head>"
            "<body></body></html>",
            "html.parser",
        )
        soup.body.append(BeautifulSoup(html_string, "html.parser"))
        for tag in soup.find_all(True):
            if "style" in tag.attrs:
                del tag["style"]
        for style_tag in soup.find_all("style"):
            style_tag.decompose()
        return soup.prettify()
True, + audio_only: bool = False, + hd: bool = False, + transcribe: bool = False, + **kwargs, + ): + self.extractor = category + self.url = url + self.author_url = "" + self.download = download + self.audio_only = audio_only + self.transcribe = transcribe + self.hd = hd + self.message_type = MessageType.SHORT + self.file_path = None + # metadata variables + self.category = category + self.media_files = [] + # auxiliary variables + self.created = None + self.duration = None + + @classmethod + async def create(cls, *args, **kwargs): + instance = cls(*args, **kwargs) + instance.url = await instance._parse_url(instance.url) + return instance + + async def get_item(self) -> dict: + self.url = await self._parse_url(self.url) + await self.get_video() + return self.to_dict() + + async def get_video(self) -> None: + content_info = await self.get_video_info() + self.file_path = content_info["file_path"] + video_info_funcs = { + "youtube": self._youtube_info_parse, + "bilibili": self._bilibili_info_parse, + } + meta_info = video_info_funcs[self.extractor](content_info) + self._video_info_formatting(meta_info) + # AI transcribe + if self.transcribe: + audio_content_info = await self.get_video_info(audio_only=True) + audio_file_path = audio_content_info["file_path"] + audio_transcribe = AudioTranscribe(audio_file_path) + transcribe_text = await audio_transcribe.transcribe() + if self.download is False: + self.message_type = MessageType.LONG + self.text += "\nAI全文摘录:" + transcribe_text + self.content += "


    " + wrap_text_into_html(transcribe_text) + + async def _parse_url(self, url: str) -> str: + async def _get_redirected_url(original_url: str) -> str: + async with httpx.AsyncClient(follow_redirects=False) as client: + resp = await client.get(original_url) + if resp.status_code == 200: + original_url = resp.url + elif resp.status_code == 302: + original_url = resp.headers["Location"] + return original_url + + def _remove_youtube_link_tracing(original_url: str) -> str: + original_url_parser = urlparse(original_url) + original_url_hostname = str(original_url_parser.hostname) + + if "youtu.be" in original_url_hostname: + # remove all queries + original_url = original_url.split("?")[0] + if "youtube.com" in original_url_hostname: + # remove all queries except "?v=" part + original_url = original_url_parser.scheme + "://" + original_url_parser.netloc + original_url_parser.path + if original_url_parser.query: + v_part_query = [item for item in original_url_parser.query.split("&") if "v=" in item] + if v_part_query: + original_url += "?" 
+ v_part_query[0] + return original_url + + def _remove_bilibili_link_tracing(original_url: str) -> str: + original_url_parser = urlparse(original_url) + original_url_hostname = str(original_url_parser.hostname) + query_dict = parse_qs(original_url_parser.query) + bilibili_p_query_string = "?p=" + query_dict["p"][0] if 'p' in query_dict else "" + + if "bilibili.com" in original_url_hostname: + original_url = original_url_parser.scheme + "://" + original_url_parser.netloc + original_url_parser.path + return original_url + bilibili_p_query_string + + logger.info(f"parsing original video url: {url} for {self.extractor}") + + url_parser = urlparse(url) + url_hostname = str(url_parser.hostname) + + if self.extractor == "bilibili": + if "b23.tv" in url_hostname: + url = await _get_redirected_url(url) + if "m.bilibili.com" in url_hostname: + url = url.replace("m.bilibili.com", "www.bilibili.com") + url = _remove_bilibili_link_tracing(url) + elif self.extractor == "youtube": + if "youtu.be" in url_hostname: + url = await _get_redirected_url(url) + url = _remove_youtube_link_tracing(url) + + logger.info(f"parsed video url: {url} for {self.extractor}") + return url + + async def get_video_info( + self, + url: str = None, + download: bool = None, + extractor: str = None, + audio_only: bool = None, + hd: bool = None, + ) -> dict: + """ + make a request to youtube-dl server to get video info + :return: video info dict + """ + if url is None: + url = self.url + if download is None: + download = self.download + if extractor is None: + extractor = self.extractor + if audio_only is None: + audio_only = self.audio_only + if hd is None: + hd = self.hd + async with httpx.AsyncClient() as client: + body = { + "url": url, + "download": download, + "extractor": extractor, + "audio_only": audio_only, + "hd": hd, + } + request_url = FILE_EXPORTER_URL + "/videoDownload" + logger.info(f"requesting video info from youtube-dl server: {body}") + if download is True: + logger.info(f"video 
downloading... it may take a while") + if hd is True: + logger.info(f"downloading HD video, it may take longer") + elif audio_only is True: + logger.info(f"downloading audio only") + logger.debug(f"downloading video timeout: {DOWNLOAD_VIDEO_TIMEOUT}") + resp = await client.post( + request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT + ) + content_info = resp.json().get("content_info") + file_path = resp.json().get("file_path") + content_info["file_path"] = file_path + return content_info + + def _video_info_formatting(self, meta_info: dict): + self.title = meta_info["title"] + self.author = meta_info["author"] + self.author_url = meta_info["author_url"] + if len(meta_info["description"]) > 800: + meta_info["description"] = meta_info["description"][:800] + "..." + self.created = meta_info["upload_date"] + self.duration = meta_info["duration"] + self.text = video_info_template.render( + data={ + "url": self.url, + "title": self.title, + "author": self.author, + "author_url": self.author_url, + "duration": self.duration, + "created": self.created, + "playback_data": meta_info["playback_data"], + "description": meta_info["description"], + } + ) + self.content = self.text.replace("\n", "
    ") + if self.download: + media_type = "video" + if self.audio_only: + media_type = "audio" + self.media_files = [MediaFile(media_type, self.file_path, "")] + + @staticmethod + def _youtube_info_parse(video_info: dict) -> dict: + return { + "id": video_info["id"], + "title": video_info["title"], + "author": video_info["uploader"], + "author_url": video_info["uploader_url"] or video_info["channel_url"], + "description": video_info["description"], + "playback_data": f"视频播放量:{video_info['view_count']} 评论数:{video_info['comment_count']}", + "author_avatar": video_info["thumbnail"], + "upload_date": str(video_info["upload_date"]), + "duration": second_to_time(round(video_info["duration"])), + } + + @staticmethod + def _bilibili_info_parse(video_info: dict) -> dict: + return { + "id": video_info["id"], + "title": video_info["title"], + "author": video_info["uploader"], + "author_url": "https://space.bilibili.com/" + + str(video_info["uploader_id"]), + "author_avatar": video_info["thumbnail"], + "ext": video_info["ext"], + "description": video_info["description"], + "playback_data": f"视频播放量:{video_info['view_count']} 弹幕数:{video_info['comment_count']} 点赞数:{video_info['like_count']}", + "upload_date": unix_timestamp_to_utc(video_info["timestamp"]), + "duration": second_to_time(round(video_info["duration"])), + } diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py new file mode 100644 index 0000000..1343079 --- /dev/null +++ b/apps/api/src/services/inoreader/__init__.py @@ -0,0 +1,168 @@ +from typing import Optional +from urllib.parse import quote + +import httpx +from bs4 import BeautifulSoup +import jmespath +from httpx import Response + +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.utils.network import HEADERS +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_html_text_length +from src.config import 
( + INOREADER_APP_ID, + INOREADER_APP_KEY, + INOREADER_EMAIL, + INOREADER_PASSWORD, +) + +INOREADER_CONTENT_URL = "https://www.inoreader.com/reader/api/0/stream/contents/" +TAG_PATH = "user/-/label/" +OTHER_PATH = "user/-/state/com.google/" +INOREADER_LOGIN_URL = "https://www.inoreader.com/accounts/ClientLogin" + + +class Inoreader(MetadataItem): + def __init__(self, url: str = None, data: dict = None, **kwargs): + if url: + self.url = url + if data: + self.title = data.get("title", "") + self.message = data.get("message", "") + self.author = data.get("author", "") + self.author_url = data.get("author_url", "") + self.category = data.get("category", "") + self.raw_content = data.get("content", "") + self.content = self.raw_content + if kwargs.get("category"): + self.category = kwargs["category"] + self.media_files = [] + self.message_type = MessageType.LONG + + def _from_data(self, data: dict): + self.title = data.get("title", "") + self.message = data.get("message", "") + self.author = data.get("author", "") + self.author_url = data.get("author_url", "") + self.category = data.get("category", "") + self.raw_content = data.get("content", "") + self.content = self.raw_content + + async def get_item(self, api: bool = False) -> dict: + if api: + data = await self.get_api_item_data() + self._resolve_media_files() + if get_html_text_length(self.content) < 400: + self.message_type = MessageType.SHORT + metadata_dict = self.to_dict() + metadata_dict["message"] = self.message + return metadata_dict + + def _resolve_media_files(self): + soup = BeautifulSoup(self.raw_content, "html.parser") + for img in soup.find_all("img"): + self.media_files.append(MediaFile(url=img["src"], media_type="image")) + img.extract() + for video in soup.find_all("video"): + self.media_files.append(MediaFile(url=video["src"], media_type="video")) + video.extract() + for tags in soup.find_all(["p", "span"]): + tags.unwrap() + self.text = str(soup) + self.text = '' + self.author + ": " + self.text + 
+ @staticmethod + def get_stream_id( + stream_type: str = "broadcast", tag: str = None, feed: str = None + ) -> str: + if stream_type == "feed": + stream_id = feed + elif stream_type == "tag": + stream_id = TAG_PATH + tag + else: + stream_id = OTHER_PATH + stream_type + stream_id = quote(stream_id) + return stream_id + + @staticmethod + async def mark_all_as_read(stream_id: str, timestamp: int = 0) -> None: + request_url = "https://www.inoreader.com/reader/api/0/mark-all-as-read" + params = {"s": stream_id, "ts": timestamp} + resp = await Inoreader.get_api_info(url=request_url, params=params) + logger.debug(resp.text) + + @staticmethod + async def get_api_item_data( + stream_type: str = "broadcast", + tag: str = None, + feed: str = None, + params: dict = None, + ) -> Optional[dict | list]: + stream_id = Inoreader.get_stream_id(stream_type=stream_type, tag=tag, feed=feed) + request_url = INOREADER_CONTENT_URL + stream_id + default_params = { + "comments": 1, + "n": 10, + "r": "o", + "xt": "user/-/state/com.google/read", + } + if params: + default_params.update(params) + params = default_params + resp = await Inoreader.get_api_info(url=request_url, params=params) + logger.debug(resp.text) + data = resp.json() + data = await Inoreader.process_items_data(data) + return data + + @staticmethod + async def process_items_data(data: dict) -> Optional[dict | list]: + expression = """ + items[].{ + "aurl": canonical[0].href, + "title": title, + "author": origin.title, + "author_url": origin.htmlUrl, + "content": summary.content, + "category": categories[-1], + "message": comments[0].commentBody, + "timestamp": updated + } + """ + data = jmespath.search(expression, data) + for item in data: + item["category"] = item["category"].split("/")[-1] + return data + + @staticmethod + async def get_api_info( + url: str, + params=None, + ) -> Response: + async with httpx.AsyncClient() as client: + resp = await client.post( + INOREADER_LOGIN_URL, + params={ + "Email": INOREADER_EMAIL, + 
"Passwd": INOREADER_PASSWORD, + }, + ) + authorization = resp.text.split("\n")[2].split("=")[1] + + async with httpx.AsyncClient() as client: + headers = HEADERS + headers["Authorization"] = f"GoogleLogin auth={authorization}" + params = params or {} + params.update( + { + "AppId": INOREADER_APP_ID, + "AppKey": INOREADER_APP_KEY, + } + ) + resp = await client.get( + url=url, + params=params, + headers=headers, + ) + return resp diff --git a/apps/api/src/services/inoreader/process.py b/apps/api/src/services/inoreader/process.py new file mode 100644 index 0000000..7fc16e3 --- /dev/null +++ b/apps/api/src/services/inoreader/process.py @@ -0,0 +1,108 @@ +from typing import Union, Optional, Dict, Callable, Awaitable + +import httpx + +from src.config import TELEGRAM_BOT_CALLBACK_URL +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from src.services.inoreader import Inoreader +from src.services.scrapers.common import InfoExtractService +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_url_metadata, get_bool + +default_telegram_channel_id = None + +# Type alias for the message callback +MessageCallback = Callable[[dict, Union[int, str]], Awaitable[None]] + + +async def _default_message_callback(metadata_item: dict, chat_id: Union[int, str]) -> None: + """Default callback that sends via HTTP to the Telegram bot service.""" + async with httpx.AsyncClient() as client: + await client.post( + f"{TELEGRAM_BOT_CALLBACK_URL}/send_message", + json={"data": metadata_item, "chat_id": str(chat_id)}, + timeout=120, + ) + + +async def process_inoreader_data( + data: list, + use_inoreader_content: bool, + telegram_channel_id: Union[int, str] = default_telegram_channel_id, + stream_id: str = None, + message_callback: MessageCallback = None, +): + if message_callback is None: + message_callback = _default_message_callback + + for item in data: + url_type_item = await get_url_metadata(item["aurl"]) + url_type_dict = 
url_type_item.to_dict() + logger.debug(f"ino original: {use_inoreader_content}") + if ( + use_inoreader_content is True + or url_type_dict["content_type"] == "unknown" + ): + is_video = url_type_dict["content_type"] == "video" + content_type = url_type_dict["content_type"] if is_video else "social_media" + source = url_type_dict["source"] if is_video else "inoreader" + url_metadata = UrlMetadata( + url=item["aurl"], + content_type=content_type, + source=source, + ) + metadata_item = InfoExtractService( + url_metadata=url_metadata, + data=item, + store_document=True, + category=item["category"], + ) + else: + metadata_item = InfoExtractService( + url_metadata=url_type_item, + data=item, + store_document=True, + ) + message_metadata_item = await metadata_item.get_item() + await message_callback(message_metadata_item, telegram_channel_id) + if stream_id: + await Inoreader.mark_all_as_read( + stream_id=stream_id, timestamp=item["timestamp"] - 1 + ) + + +async def get_inoreader_item_async( + data: Optional[Dict] = None, + trigger: bool = False, + params: Optional[Dict] = None, + message_callback: MessageCallback = None, +) -> None: + stream_id = None + use_inoreader_content = True + telegram_channel_id = default_telegram_channel_id + if trigger and params and not data: + logger.debug(f"params:{params}") + use_inoreader_content = get_bool(params.get("useInoreaderContent"), True) + stream_type = params.get("streamType", "broadcast") + telegram_channel_id = params.get("channelId", default_telegram_channel_id) + tag = params.get("tag", None) + feed = params.get("feed", None) + the_remaining_params = { + k: v + for k, v in params.items() + if k not in ["streamType", "channelId", "tag", "feed"] + } + data = await Inoreader.get_api_item_data( + stream_type=stream_type, tag=tag, params=the_remaining_params, feed=feed + ) + if not data: + return + stream_id = Inoreader.get_stream_id(stream_type=stream_type, tag=tag, feed=feed) + if type(data) is dict: + data = [data] + await 
process_inoreader_data( + data, use_inoreader_content, telegram_channel_id, stream_id, + message_callback=message_callback, + ) + if stream_id: + await Inoreader.mark_all_as_read(stream_id=stream_id) diff --git a/apps/api/src/services/scrapers/__init__.py b/apps/api/src/services/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/services/scrapers/bluesky/__init__.py b/apps/api/src/services/scrapers/bluesky/__init__.py new file mode 100644 index 0000000..274d049 --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/__init__.py @@ -0,0 +1,45 @@ +import traceback +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Dict, Optional, Any + +import httpx +import jmespath + +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html + + +@dataclass +class Bluesky(MetadataItem): + cid: str = "" + author_did: str = "" + retweet_post: Optional["Bluesky"] = None + + @staticmethod + def from_dict(obj: Any) -> "Bluesky": + bluesky_item = MetadataItem.from_dict(obj) + bluesky_item.cid = obj.get("cid") + bluesky_item.author_did = obj.get("author_did") + return Bluesky( + url=bluesky_item.url, + title=bluesky_item.title, + author=bluesky_item.author, + author_url=bluesky_item.author_url, + telegraph_url=bluesky_item.telegraph_url, + text=bluesky_item.text, + content=bluesky_item.content, + media_files=bluesky_item.media_files, + category=bluesky_item.category, + message_type=bluesky_item.message_type, + cid=bluesky_item.cid, + author_did=bluesky_item.author_did, + ) + + def to_dict(self) -> dict: + result: dict = super().to_dict() + result["cid"] = self.cid + result["author_did"] = self.author_did + if self.retweet_post: + result["retweet_post"] = self.retweet_post.to_dict() + return result diff --git a/apps/api/src/services/scrapers/bluesky/config.py 
b/apps/api/src/services/scrapers/bluesky/config.py new file mode 100644 index 0000000..3183639 --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/config.py @@ -0,0 +1,3 @@ +BLUESKY_HOST = "https://bsky.app" + +BLUESKY_MAX_LENGTH = 800 diff --git a/apps/api/src/services/scrapers/bluesky/scraper.py b/apps/api/src/services/scrapers/bluesky/scraper.py new file mode 100644 index 0000000..fd3799a --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/scraper.py @@ -0,0 +1,191 @@ +from typing import Optional +from urllib.parse import urlparse + +from atproto import AsyncClient, IdResolver, AtUri +from atproto_client.models.app.bsky.embed.record import ViewRecord +from atproto_client.models.app.bsky.feed.defs import ThreadViewPost, PostView + +from src.config import JINJA2_ENV +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from src.services.scrapers.scraper import Scraper, DataProcessor +from src.services.scrapers.bluesky import Bluesky +from src.services.scrapers.bluesky.config import BLUESKY_HOST, BLUESKY_MAX_LENGTH +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import wrap_text_into_html + +telegram_text_template = JINJA2_ENV.get_template("bluesky_telegram_text.jinja2") +content_template = JINJA2_ENV.get_template("bluesky_content.jinja2") + + +class BlueskyPost: + def __init__(self, bluesky_url: str): + self.url: str = bluesky_url + bluesky_url_parser = urlparse(bluesky_url) + self.bluesky_host: Optional[str] = bluesky_url_parser.netloc + bluesky_path = bluesky_url_parser.path + self.handle: Optional[str] = bluesky_path.split("/")[2] + self.post_rkey: Optional[str] = bluesky_path.split("/")[-1] + self.did: str = BlueskyScraper.id_resolver.handle.resolve(self.handle) + + +class BlueskyDataProcessor(DataProcessor): + + def __init__(self, url: str, bluesky_thread_data: ThreadViewPost): + self.url: str = url + self.bluesky_thread_data: ThreadViewPost = bluesky_thread_data + logger.debug( 
+ f"BlueskyDataProcessor initialized with url: {url}\n and bluesky_thread_data: \n{bluesky_thread_data}") + self._data: dict = {} + + async def get_item(self) -> dict: + await self.process_data() + bluesky_item = Bluesky.from_dict(self._data) + return bluesky_item.to_dict() + pass + + async def process_data(self): + await self._resolve_thread_data() + + async def _resolve_thread_data(self) -> None: + base_post_view_data = await BlueskyDataProcessor._resolve_single_post_data(self.bluesky_thread_data.post) + base_post_view_data["url"] = self.url + + post_author_did = base_post_view_data["author_did"] + + parent_posts_text = "" + parent_posts_content = "" + parent_posts_media_files = [] + replies_posts_text = "" + replies_posts_content = "" + replies_posts_media_files = [] + # get post data from the parent posts whose author is the same as the base post author + if self.bluesky_thread_data.parent: + parent_posts_data = [] + parent_post_view = self.bluesky_thread_data.parent + await BlueskyDataProcessor._get_parent_posts_data(parent_post_view, parent_posts_data) + if parent_posts_data: + for post_data in parent_posts_data: + parent_posts_text += "\n" + post_data["text"] + parent_posts_content += post_data["content"] + parent_posts_media_files.extend(post_data["media_files"]) + # get post data from the replies whose author is the same as the base post author + if self.bluesky_thread_data.replies: + replies_posts_data = [] + for post_thread_view in self.bluesky_thread_data.replies: + post_view = post_thread_view.post + if post_author_did == post_view.author.did: + post_data = await BlueskyDataProcessor._resolve_single_post_data(post_view) + replies_posts_data.append(post_data) + if replies_posts_data: + for post_data in replies_posts_data: + replies_posts_text += "\n" + post_data["text"] + replies_posts_content += post_data["content"] + replies_posts_media_files.extend(post_data["media_files"]) + base_post_view_data["text"] = parent_posts_text + 
base_post_view_data["text"] + replies_posts_text + base_post_view_data["content"] = parent_posts_content + base_post_view_data["content"] + replies_posts_content + base_post_view_data["media_files"] = parent_posts_media_files + base_post_view_data[ + "media_files"] + replies_posts_media_files + + if len(base_post_view_data["text"]) > BLUESKY_MAX_LENGTH: + base_post_view_data["message_type"] = MessageType.LONG + else: + base_post_view_data["message_type"] = MessageType.SHORT + + self._data = base_post_view_data + + @staticmethod + async def _get_parent_posts_data(parent_post_view: ThreadViewPost, parent_posts_data_list: list) -> None: + parent_post_data = await BlueskyDataProcessor._resolve_single_post_data(parent_post_view.post) + parent_posts_data_list.append(parent_post_data) + if parent_post_view.parent: + await BlueskyDataProcessor._get_parent_posts_data(parent_post_view.parent, parent_posts_data_list) + + @staticmethod + async def _resolve_single_post_data(post_data: PostView) -> dict: + at_uri = AtUri.from_str(post_data.uri) + url = BLUESKY_HOST + "/profile/" + post_data.author.handle + "/post/" + at_uri.rkey + author = post_data.author.display_name + author_url = BLUESKY_HOST + "/profile/" + post_data.author.handle + author_did = post_data.author.did + text = post_data.record.text + created_at = post_data.record.created_at + + parsed_post_data = { + "url": url, + "title": author + "\'s Bluesky post", + "author": author, + "author_url": author_url, + "text": text, + "category": "bluesky", + "media_files": [], + "created_at": created_at, + "author_did": author_did, + } + + media_files = [] + if post_data.embed is not None: + # images and videos + if "images" in post_data.embed.__dict__: + for image in post_data.embed.images: + img_url = image.fullsize + img_item = { + "media_type": "image", + "url": img_url, + "caption": "", + } + media_files.append(img_item) + # TODO: handle video, which is in m3u8 format that needs to be downloaded and converted to mp4 + 
parsed_post_data["media_files"] = media_files + # retweet post + if "record" in post_data.embed.__dict__ and post_data.embed.record is ViewRecord: + retweet_post_data = await BlueskyDataProcessor._resolve_single_post_data(post_data.embed.record) + parsed_post_data["retweet_post"] = retweet_post_data + + content = await BlueskyDataProcessor._generate_html_content(parsed_post_data) + text = await BlueskyDataProcessor._generate_telegram_text(parsed_post_data) + parsed_post_data["content"] = content + parsed_post_data["text"] = text + + return parsed_post_data + + @staticmethod + async def _generate_html_content(data: dict) -> str: + html_content_text = wrap_text_into_html(data["text"]) + data["html_content_text"] = html_content_text + content = content_template.render(data=data) + return content + + @staticmethod + async def _generate_telegram_text(data: dict) -> str: + text = telegram_text_template.render(data=data) + return text + + +class BlueskyScraper(Scraper): + id_resolver = IdResolver() + + def __init__(self, username: Optional[str] = None, password: Optional[str] = None): + self.client: AsyncClient = AsyncClient() + self.username: Optional[str] = username + self.password: Optional[str] = password + self.did: Optional[str] = None + + async def init(self): + if self.username and self.password: + await self.client.login(self.username, self.password) + # self.did = await self.client.com + + async def get_processor_by_url(self, url: str) -> BlueskyDataProcessor: + bluesky_post = BlueskyPost(url) + bluesky_post_data = await self._request_post_data(bluesky_post) + return BlueskyDataProcessor(url, bluesky_post_data) + + async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost: + profile_identify = bluesky_post.did or bluesky_post.handle + try: + post_data = await self.client.get_post(profile_identify=profile_identify, post_rkey=bluesky_post.post_rkey) + post_uri = post_data.uri + post_thread_data = await 
self.client.get_post_thread(uri=post_uri) + return post_thread_data.thread + except Exception as e: + logger.error(f"Error while getting post data: {e}") diff --git a/apps/api/src/services/scrapers/common.py b/apps/api/src/services/scrapers/common.py new file mode 100644 index 0000000..d7b83cf --- /dev/null +++ b/apps/api/src/services/scrapers/common.py @@ -0,0 +1,114 @@ +from typing import Optional, Any + +from src.models.database_model import Metadata +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from fastfetchbot_shared.models.metadata_item import MessageType +from src.services import ( + telegraph, + inoreader +) +from src.services.file_export import video_download, document_export +from src.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads +from src.services.scrapers.scraper_manager import ScraperManager +from src.database import save_instances +from fastfetchbot_shared.utils.logger import logger +from src.config import DATABASE_ON + + +class InfoExtractService(object): + service_classes: dict = { + "twitter": twitter.Twitter, + "threads": threads.Threads, + "reddit": reddit.Reddit, + "weibo": weibo.Weibo, + "wechat": wechat.Wechat, + "instagram": instagram.Instagram, + "douban": douban.Douban, + "zhihu": zhihu.Zhihu, + "xiaohongshu": xiaohongshu.Xiaohongshu, + "youtube": video_download.VideoDownloader, + "bilibili": video_download.VideoDownloader, + "inoreader": inoreader.Inoreader, + } + + def __init__( + self, + url_metadata: UrlMetadata, + data: Any = None, + store_database: Optional[bool] = DATABASE_ON, + store_telegraph: Optional[bool] = True, + store_document: Optional[bool] = False, + **kwargs, + ): + url_metadata = url_metadata.to_dict() + self.url = url_metadata["url"] + self.content_type = url_metadata["content_type"] + self.source = url_metadata["source"] + self.data = data + self.kwargs = kwargs + self.store_database = store_database + self.store_telegraph = store_telegraph 
+ self.store_document = store_document + + @property + def category(self) -> str: + return self.source + + async def get_item(self, metadata_item: Optional[dict] = None) -> dict: + if self.content_type == "video": + if not self.kwargs.get("category"): + self.kwargs["category"] = self.category + if not metadata_item: + try: + if self.category in ["bluesky", "weibo", "other", "unknown"]: # it is a workaround before the code refactor + await ScraperManager.init_scraper(self.category) + item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url) + metadata_item = await item_data_processor.get_item() + else: + scraper_item = InfoExtractService.service_classes[self.category]( + url=self.url, data=self.data, **self.kwargs + ) + metadata_item = await scraper_item.get_item() + except Exception as e: + logger.error(f"Error while getting item: {e}") + raise e + logger.info(f"Got metadata item") + logger.debug(metadata_item) + metadata_item = await self.process_item(metadata_item) + return metadata_item + + async def process_item(self, metadata_item: dict) -> dict: + if metadata_item.get("message_type") == MessageType.LONG: + self.store_telegraph = True + logger.info("message type is long, store in telegraph") + if self.store_telegraph: + telegraph_item = telegraph.Telegraph.from_dict(metadata_item) + try: + telegraph_url = await telegraph_item.get_telegraph() + except Exception as e: + logger.error(f"Error while getting telegraph: {e}") + telegraph_url = "" + metadata_item["telegraph_url"] = telegraph_url + if self.store_document or ( + not self.store_document and metadata_item["telegraph_url"] == "" + ): + logger.info("store in document") + try: + pdf_document = document_export.pdf_export.PdfExport( + title=metadata_item["title"], html_string=metadata_item["content"] + ) + output_filename = await pdf_document.export(method="file") + metadata_item["media_files"].append( + { + "media_type": "document", + "url": output_filename, + 
"caption": "", + } + ) + except Exception as e: + logger.error(f"Error while exporting document: {e}") + metadata_item["title"] = metadata_item["title"].strip() + if self.store_database: + logger.info("store in database") + await save_instances(Metadata.model_construct(**metadata_item)) + return metadata_item diff --git a/apps/api/src/services/scrapers/douban/__init__.py b/apps/api/src/services/scrapers/douban/__init__.py new file mode 100644 index 0000000..4ea8712 --- /dev/null +++ b/apps/api/src/services/scrapers/douban/__init__.py @@ -0,0 +1,230 @@ +import re +from typing import Dict, Optional, Any +from enum import Enum +from urllib.parse import urlparse + +from bs4 import BeautifulSoup +from lxml import etree + +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.network import get_selector, HEADERS +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from src.config import JINJA2_ENV + +SHORT_LIMIT = 600 + +short_text_template = JINJA2_ENV.get_template("douban_short_text.jinja2") +content_template = JINJA2_ENV.get_template("douban_content.jinja2") + + +class DoubanType(str, Enum): + MOVIE_REVIEW = "movie_review" + BOOK_REVIEW = "book_review" + NOTE = "note" + STATUS = "status" + GROUP = "group" + UNKNOWN = "unknown" + + +class Douban(MetadataItem): + item_title: Optional[str] + item_url: Optional[str] + group_name: Optional[str] + group_url: Optional[str] + douban_type: DoubanType + text_group: Optional[str] + raw_content: Optional[str] + date: Optional[str] + + def __init__(self, url: str, data: Optional[Any] = None, **kwargs): + # metadata fields + self.url = url + self.title = "" + self.author = "" + self.author_url = "" + self.text = "" + self.content = "" + self.media_files = [] + self.category = "douban" + self.message_type = MessageType.SHORT + # auxiliary fields + self.item_title: Optional[str] = None + self.item_url: Optional[str] = None + 
self.group_name: Optional[str] = None + self.group_url: Optional[str] = None + self.douban_type: DoubanType = DoubanType.UNKNOWN + self.text_group: Optional[str] = None + self.raw_content: Optional[str] = None + self.date: Optional[str] = None + # reqeust fields + self.headers = HEADERS + self.headers["Cookie"] = kwargs.get("cookie", "") + + async def get_item(self) -> dict: + await self.get_douban() + return self.to_dict() + + async def get_douban(self) -> None: + self.check_douban_type() + await self.get_douban_item() + + def check_douban_type(self): + urlparser = urlparse(self.url) + host = urlparser.netloc + path = urlparser.path + if host.find("m.douban") != -1: # parse the m.douban url + host = host.replace("m.douban", "douban") + if path.startswith("/movie/review"): + self.douban_type = DoubanType.MOVIE_REVIEW + host = host.replace("douban", "movie.douban") + path = path.replace("/movie/", "/") + elif path.startswith("/book/review"): + self.douban_type = DoubanType.BOOK_REVIEW + host = host.replace("douban", "book.douban") + path = path.replace("/book/", "/") + if path.startswith("/note/"): + self.douban_type = DoubanType.NOTE + elif path.startswith("/status/") or re.match(r"/people/\d+/status/\d+", path): + self.douban_type = DoubanType.STATUS + elif path.startswith("/group/topic/"): + self.douban_type = DoubanType.GROUP + elif host.startswith("movie.douban") and path.startswith("/review/"): + self.douban_type = DoubanType.MOVIE_REVIEW + elif host.startswith("book.douban") and path.startswith("/review/"): + self.douban_type = DoubanType.BOOK_REVIEW + else: + self.douban_type = DoubanType.UNKNOWN + self.url = f"https://{host}{path}" + + async def get_douban_item(self): + function_dict = { + DoubanType.MOVIE_REVIEW: self._get_douban_movie_review, + DoubanType.BOOK_REVIEW: self._get_douban_book_review, + DoubanType.NOTE: self._get_douban_note, + DoubanType.STATUS: self._get_douban_status, + DoubanType.GROUP: self._get_douban_group_article, + 
DoubanType.UNKNOWN: None, + } + await function_dict[self.douban_type]() + short_text = self._douban_short_text_process() + if short_text.endswith("\n"): + short_text = short_text[:-1] + data = self.__dict__ + data["short_text"] = short_text + self.text = short_text_template.render(data=data) + self.raw_content = self.raw_content_to_html(self.raw_content) + self.content = wrap_text_into_html( + content_template.render(data=data), is_html=True + ) + if get_html_text_length(self.content) > SHORT_LIMIT: + self.message_type = MessageType.LONG + else: + self.message_type = MessageType.SHORT + + async def _get_douban_movie_review(self): + selector = await get_selector(url=self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1//span)') + self.author = selector.xpath('string(//header[@class="main-hd"]//span)') + self.author_url = selector.xpath('string(//header[@class="main-hd"]/a/@href)') + self.item_title = selector.xpath('string(//header[@class="main-hd"]/a[2])') + self.item_url = selector.xpath('string(//header[@class="main-hd"]/a[2]/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath("//div[contains(@class,'review-content')]")[0], + encoding="utf-8", + ), + encoding="utf-8", + ) + + async def _get_douban_book_review(self): + selector = await get_selector(self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1//span)') + self.author = selector.xpath('string(//header[@class="main-hd"]//span)') + self.author_url = selector.xpath('string(//header[@class="main-hd"]/a/@href)') + self.item_title = selector.xpath('string(//header[@class="main-hd"]/a[2])') + self.item_url = selector.xpath('string(//header[@class="main-hd"]/a[2]/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + async def _get_douban_note(self): + selector = await get_selector(self.url, headers=self.headers) + 
self.title = selector.xpath("string(//h1)") + self.author = selector.xpath('string(//div[@class="content"]/a)') + self.author_url = selector.xpath('string(//div[@class="content"]/a/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + async def _get_douban_status(self): + selector = await get_selector(self.url, headers=self.headers) + self.author = selector.xpath('string(//div[@class="content"]/a)') + self.author_url = selector.xpath('string(//div[@class="content"]/a/@href)') + self.title = self.author + "的广播" + self.raw_content = ( + str( + etree.tostring( + selector.xpath('//div[@class="status-saying"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + .replace("
    ", "") + .replace("
    ", "") + .replace(">+<", "><") + .replace(" ", "
    ") + ) + + async def _get_douban_group_article(self): + selector = await get_selector(self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1)') + self.title = self.title.replace("\n", "").strip() + self.author = selector.xpath('string(//span[@class="from"]//a)') + self.author_url = selector.xpath('string(//span[@class="from"]//a/@href)') + self.group_name = selector.xpath( + 'string(//div[@id="g-side-info"]//div[@class="title"]/a)' + ) + self.group_url = selector.xpath( + 'string(//div[@id="g-side-info"]//div[@class="title"]/a/@href)' + ) + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + def _douban_short_text_process(self) -> str: + soup = BeautifulSoup(self.raw_content, "html.parser") + for img in soup.find_all("img"): + media_item = {"media_type": "image", "url": img["src"], "caption": ""} + self.media_files.append(MediaFile.from_dict(media_item)) + img.extract() + for item in soup.find_all(["p", "span", "div"]): + item.unwrap() + for item in soup.find_all(["link", "script"]): + item.decompose() + for item in soup.find_all("a"): + if item.get("title") == "查看原图": + item.decompose() + short_text = str(soup) + short_text = re.sub(r"\n{2,}", "\n", short_text) + short_text = re.sub(r"", "\n", short_text) + return short_text + + @staticmethod + def raw_content_to_html(raw_content: str) -> str: + # Split the text into paragraphs based on double newlines + print(raw_content) + paragraphs = raw_content.split('
    \n') + # Wrap each paragraph with

    tags + print(paragraphs) + html_paragraphs = [f'

    {paragraph.strip()}

    ' for paragraph in paragraphs] + # Join the paragraphs to form the final HTML string + html_string = ''.join(html_paragraphs) + return html_string diff --git a/apps/api/src/services/scrapers/general/__init__.py b/apps/api/src/services/scrapers/general/__init__.py new file mode 100644 index 0000000..f256512 --- /dev/null +++ b/apps/api/src/services/scrapers/general/__init__.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import Any + +from fastfetchbot_shared.models.metadata_item import MetadataItem + + +@dataclass +class GeneralItem(MetadataItem): + """ + GeneralItem: Data class for scraped content from general webpage scrapers. + """ + id: str = "" + raw_content: str = "" + scraper_type: str = "" # Which scraper was used (e.g., "firecrawl", "zyte", etc.) + + @staticmethod + def from_dict(obj: Any) -> "GeneralItem": + metadata_item = MetadataItem.from_dict(obj) + return GeneralItem( + url=metadata_item.url, + title=metadata_item.title, + author=metadata_item.author, + author_url=metadata_item.author_url, + telegraph_url=metadata_item.telegraph_url, + text=metadata_item.text, + content=metadata_item.content, + media_files=metadata_item.media_files, + category=metadata_item.category, + message_type=metadata_item.message_type, + id=obj.get("id", ""), + raw_content=obj.get("raw_content", ""), + scraper_type=obj.get("scraper_type", ""), + ) + + def to_dict(self) -> dict: + result: dict = super().to_dict() + result["id"] = self.id + result["raw_content"] = self.raw_content + result["scraper_type"] = self.scraper_type + return result diff --git a/apps/api/src/services/scrapers/general/base.py b/apps/api/src/services/scrapers/general/base.py new file mode 100644 index 0000000..8d454d6 --- /dev/null +++ b/apps/api/src/services/scrapers/general/base.py @@ -0,0 +1,208 @@ +import hashlib +from abc import abstractmethod +from typing import Optional +from urllib.parse import urlparse + +from bs4 import BeautifulSoup, Doctype +from openai import 
AsyncOpenAI +from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam + +from src.config import OPENAI_API_KEY +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from src.services.scrapers.scraper import Scraper, DataProcessor +from src.services.scrapers.general import GeneralItem +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.logger import logger + +GENERAL_TEXT_LIMIT = 800 + +DEFAULT_OPENAI_MODEL = "gpt-5-nano" + +# System prompt for LLM to extract article content +ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. + +Instructions: +1. Identify and extract ONLY the main article/post content +2. Remove navigation, headers, footers, sidebars, ads, comments, and other non-article elements +3. Preserve the article's structure (headings, paragraphs, lists, etc.) +4. Keep important formatting like bold, italic, links, and images +5. Return clean HTML containing only the article content +6. If you cannot identify the main content, return the original HTML unchanged +7. After all of the above, remove some basic HTML tags like , ,