diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cb669e..77d7f80 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,65 +5,74 @@ on: branches: - main -env: - APP_NAME: fastfetchbot - DOCKERHUB_REPO: aturret/fastfetchbot -# APP_VERSION: latest - -concurrency: +concurrency: group: fastfetchbot cancel-in-progress: true jobs: - docker: + build: runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + matrix: + include: + - service: api + dockerfile: apps/api/Dockerfile + image_suffix: api + - service: telegram-bot + dockerfile: apps/telegram-bot/Dockerfile + image_suffix: telegram-bot steps: - - - name: Checkout - uses: actions/checkout@v2 - - - name: Check commit message + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Check commit message id: check_message run: | MESSAGE=$(git log --format=%B -n 1 ${{ github.sha }}) if [[ "$MESSAGE" == *"[github-action]"* ]]; then - echo "::set-output name=skip::true" + echo "skip=true" >> "$GITHUB_OUTPUT" else - echo "::set-output name=skip::false" + echo "skip=false" >> "$GITHUB_OUTPUT" fi - - - name: Set up QEMU - uses: docker/setup-qemu-action@v1 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to DockerHub - uses: docker/login-action@v1 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Generate App Version - run: echo APP_VERSION=`git describe --tags --always` >> $GITHUB_ENV - - - name: Build and push + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate App Version + run: echo "APP_VERSION=$(git describe --tags --always)" >> "$GITHUB_ENV" + + - name: Build and 
push if: steps.check_message.outputs.skip == 'false' - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v6 with: context: . - platforms: | - linux/amd64 + file: ${{ matrix.dockerfile }} + platforms: linux/amd64 push: true build-args: | - APP_NAME=${{ env.APP_NAME }} APP_VERSION=${{ env.APP_VERSION }} tags: | - ${{ env.DOCKERHUB_REPO }}:latest -# ${{ env.DOCKERHUB_REPO }}:${{ env.APP_VERSION }} - - - name: send curl request - run: | - curl -H 'Authorization: Bearer ${{ secrets.WATCHTOWER_TOKEN }}' ${{ secrets.WATCHTOWER_WEBHOOK_URL }} + ghcr.io/${{ github.repository_owner }}/fastfetchbot-${{ matrix.image_suffix }}:latest + deploy: + needs: build + runs-on: ubuntu-latest + steps: + - name: Trigger Watchtower deployment + run: | + curl -H "Authorization: Bearer ${{ secrets.WATCHTOWER_TOKEN }}" ${{ secrets.WATCHTOWER_WEBHOOK_URL }} diff --git a/README.md b/README.md index 77cff15..a0c0a49 100644 --- a/README.md +++ b/README.md @@ -2,128 +2,191 @@ Demo: https://t.me/aturretrss_bot # FastFetchBot -A social media fetch API based on [FastAPI](https://fastapi.tiangolo.com/), with Telegram Bot as the default client. +A social media content fetching service with a Telegram Bot client, built as a monorepo with two microservices. -Supported most mainstream social media platforms. You can get a permanent copy of the content by just sending the url to the bot. +Send a social media URL to the bot, and it fetches and archives the content for you. Supports most mainstream social media platforms. 
-Other separated microservices for this project: +## Architecture -- [FastFileExporter](https://github.com/aturret/FastFileExporter) -- [FastFetchBot-Telegram-Bot](https://github.com/aturret/FastFetchBot-Telegram-Bot) +FastFetchBot is organized as a UV workspace monorepo with three packages: +``` +FastFetchBot/ +├── packages/shared/ # fastfetchbot-shared: common models, utilities, logger +├── apps/api/ # FastAPI server: scrapers, storage, routing +├── apps/telegram-bot/ # Telegram Bot: webhook/polling, message handling +├── app/ # Legacy re-export wrappers (backward compatibility) +├── pyproject.toml # Root workspace configuration +└── uv.lock # Lockfile for the entire workspace +``` -## Installation - -### Docker (Recommended) - -Download the docker-compose.yml file and set the environment variables as the following section. +| Service | Port | Description | +|---------|------|-------------| +| **API Server** (`apps/api/`) | 10450 | FastAPI app with all platform scrapers, file export, and storage | +| **Telegram Bot** (`apps/telegram-bot/`) | 10451 | Receives messages via webhook or long polling, calls the API server | -#### Env +The Telegram Bot communicates with the API server over HTTP. In Docker, this is `http://api:10450`. -Create a `.env` file at the same directory and set the [environment variables](#envrionment-variables). +## Installation -#### Local Telegram API Sever +### Docker (Recommended) -If you want to send documents that larger than 50MB, you need to run a local telegram api server. The `docker-compose.yml` file has already give you an example. You just need to fill the `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` in the yml file. If you don't need it, just comment it out. +1. Copy `docker-compose.template.yml` to `docker-compose.yml`. +2. Create a `.env` file from `template.env` and fill in the [environment variables](#environment-variables). +3. 
If you need large file support (>50 MB), fill in `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` in the compose file for the local Telegram Bot API server. Otherwise, comment out the `telegram-bot-api` service. ```bash docker-compose up -d ``` -### Python (Not Recommended) +The compose file pulls pre-built images from GitHub Container Registry: -Local Telegram API sever and video download function is not supported in this way. If you do really need these functions, you can run the telegram api server and [the file export server](https://github.com/aturret/FastFileExporter) manually. +- `ghcr.io/aturret/fastfetchbot-api:latest` +- `ghcr.io/aturret/fastfetchbot-telegram-bot:latest` -We use [Poetry](https://python-poetry.org/) as the package manager for this project. You can install it by the following command. +To build locally instead, uncomment the `build:` blocks and comment out the `image:` lines in `docker-compose.yml`. -```bash -pip install poetry -``` +### Local Development -Then, install the dependencies. +Requires Python 3.12 and [uv](https://docs.astral.sh/uv/). ```bash -poetry install -``` +# Install all dependencies (including dev) +uv sync -Finally, run the server. +# Run the API server +cd apps/api +uv run gunicorn -k uvicorn.workers.UvicornWorker src.main:app --preload -```bash -poetry run gunicorn -k uvicorn.workers.UvicornWorker app.main:app --preload +# Run the Telegram Bot (in a separate terminal) +cd apps/telegram-bot +uv run python -m core.main ``` -## Environment Variables +### Telegram Bot Modes + +The bot supports two modes, controlled by the `TELEGRAM_BOT_MODE` environment variable: -Note: Many of the services requires cookies to fetch content. You can get your cookies by browser extension [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) and set the cookies as environment variables. 
+| Mode | Value | Use Case | +|------|-------|----------| +| **Long Polling** | `polling` (default) | Local development, simple deployments without a reverse proxy | +| **Webhook** | `webhook` | Production with a public HTTPS URL | +In both modes, the bot runs an HTTP server on port 10451 for the `/send_message` callback endpoint (used by Inoreader integration) and `/health`. -### Required Variables +## Development -- `BASE_URL`: The base url of the server. example: `example.com` -- `TELEGRAM_BOT_TOKEN`: The token of the telegram bot. -- `TELEGRAM_CHAT_ID`: The chat id of the telegram bot. +### Commands -### Optional Variables +```bash +uv sync # Install all dependencies +uv run pytest # Run tests +uv run pytest -v # Run tests with verbose output +uv run black . # Format code +``` -#### FastAPI +### Adding a New Platform Scraper -- `PORT`: Default: `10450` -- `API_KEY`: The api key for the FastAPI server. It would be generated automatically if not set. +1. Create a new scraper module in `apps/api/src/services/scrapers/<platform>/` +2. Implement the scraper class following existing patterns +3. Add a platform-specific router in `apps/api/src/routers/` +4. Register the scraper in `ScraperManager` +5. Add configuration variables in `apps/api/src/config.py` +6. Create tests in `tests/cases/` -#### Telegram +### Docker Build -- `TELEBOT_API_SERVER_HOST`: The host of the telegram bot api server. Default: `telegram-bot-api` -- `TELEBOT_API_SERVER_PORT`: The port of the telegram bot api server. Default: `8081` -- `TELEGRAM_CHANNEL_ID`: The channel id of the telegram bot. Default: `None` -- `TELEGRAM_CHANNEL_ADMIN_LIST`: The id list of the users who can send message to targeted telegram channel, divided by `,`. You cannot send message to the channel if you are not in the list. Default: `None` +```bash +# Build both services locally +docker-compose build -#### Twitter +# Or build individually +docker build -f apps/api/Dockerfile -t fastfetchbot-api . 
+docker build -f apps/telegram-bot/Dockerfile -t fastfetchbot-telegram-bot . +``` -Must set cookies variables if you want to fetch twitter content. +> **Note:** Both Dockerfiles use the repository root as the build context (`.`) because they need access to `pyproject.toml`, `uv.lock`, and `packages/shared/`. -- `TWITTER_CT0`: The ct0 cookie of twitter. Default: `None` -- `TWITTER_AUTH_TOKEN`: The auth token of twitter. Default: `None` +## Environment Variables -#### Reddit +Many scrapers require authentication cookies. You can extract cookies using the browser extension [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc). -We use `read_only` mode of `praw` to fetch reddit content. We still need to set the `client_id` , `client_secret` , `username` and `password` of your reddit api account. +See `template.env` for a complete reference with comments. -- `REDDIT_CLIENT_ID`: The client id of reddit. Default: `None` -- `REDDIT_CLIENT_SECRET`: The client secret of reddit. Default: `None` -- `REDDIT_USERNAME`: The username of reddit. Default: `None` -- `REDDIT_PASSWORD`: The password of reddit. Default: `None` +### Required -#### Weibo +| Variable | Description | +|----------|-------------| +| `BASE_URL` | Public domain of the server (e.g. `example.com`). Used for webhook URL construction. | +| `TELEGRAM_BOT_TOKEN` | Bot token from [@BotFather](https://t.me/BotFather) | +| `TELEGRAM_CHAT_ID` | Default chat ID for the bot | -- `WEIBO_COOKIES`: The cookie of weibo. For some unknown reasons, some weibo posts may be not accessible if you don't are not logged in. Just copy the cookie from your browser and set it. Default: `None` +### Service Communication (Docker) -#### Xiaohongshu +| Variable | Default | Description | +|----------|---------|-------------| +| `API_SERVER_URL` | `http://localhost:10450` | URL the Telegram Bot uses to call the API server. Set to `http://api:10450` in Docker. 
| +| `TELEGRAM_BOT_CALLBACK_URL` | `http://localhost:10451` | URL the API server uses to call the Telegram Bot. Set to `http://telegram-bot:10451` in Docker. | +| `TELEGRAM_BOT_MODE` | `polling` | `polling` or `webhook` | -- `XIAOHONGSHU_A1`: The a1 cookie of xiaohongshu. Default: `None` -- `XIAOHONGSHU_WEBID`: The webid cookie of xiaohongshu. Default: `None` -- `XIAOHONGSHU_WEBSESSION`: The websession cookie of xiaohongshu. Default: `None` -#### OpenAI +### Optional -You can set the api key of OpenAI to use the transcription function. +#### API Server -- `OPENAI_API_KEY`: The api key of OpenAI. Default: `None` +| Variable | Default | Description | +|----------|---------|-------------| +| `PORT` | `10450` | API server port | +| `API_KEY` | auto-generated | API key for authentication | -#### Amazon S3 Picture Storage +#### Telegram -- `AWS_ACCESS_KEY_ID`: The access key id of Amazon S3. Default: `None` -- `AWS_SECRET_ACCESS_KEY`: The secret access key of Amazon S3. Default: `None` -- `AWS_S3_BUCKET_NAME`: The bucket name of Amazon S3. Default: `None` -- `AWS_S3_REGION_NAME`: The region name of Amazon S3. Default: `None` -- `AWS_DOMAIN_HOST`: The domain bound to the bucket. The picture upload function would generate images url by bucket name if customized host not set. 
Default: `None` +| Variable | Default | Description | +|----------|---------|-------------| +| `TELEBOT_API_SERVER_HOST` | `None` | Local Telegram Bot API server host | +| `TELEBOT_API_SERVER_PORT` | `None` | Local Telegram Bot API server port | +| `TELEGRAM_CHANNEL_ID` | `None` | Channel ID(s) for the bot, comma-separated | +| `TELEGRAM_CHANNEL_ADMIN_LIST` | `None` | User IDs allowed to post to the channel, comma-separated | + +#### Platform Cookies & Credentials + +| Platform | Variables | +|----------|-----------| +| Twitter | `TWITTER_CT0`, `TWITTER_AUTH_TOKEN` | +| Reddit | `REDDIT_CLIENT_ID`, `REDDIT_CLIENT_SECRET`, `REDDIT_USERNAME`, `REDDIT_PASSWORD` | +| Weibo | `WEIBO_COOKIES` | +| Xiaohongshu | `XIAOHONGSHU_A1`, `XIAOHONGSHU_WEBID`, `XIAOHONGSHU_WEBSESSION` | +| Instagram | `X_RAPIDAPI_KEY` | +| Zhihu | Store cookies in `conf/zhihu_cookies.json` | + +#### Cloud Services + +| Variable | Description | +|----------|-------------| +| `OPENAI_API_KEY` | OpenAI API key for audio transcription | +| `AWS_ACCESS_KEY_ID` | Amazon S3 access key | +| `AWS_SECRET_ACCESS_KEY` | Amazon S3 secret key | +| `AWS_S3_BUCKET_NAME` | S3 bucket name | +| `AWS_S3_REGION_NAME` | S3 region | +| `AWS_DOMAIN_HOST` | Custom domain bound to the S3 bucket | + +#### General Webpage Scraping + +| Variable | Default | Description | +|----------|---------|-------------| +| `GENERAL_SCRAPING_ON` | `false` | Enable scraping for unrecognized URLs | +| `GENERAL_SCRAPING_API` | `FIRECRAWL` | Backend: `FIRECRAWL` or `ZYTE` | +| `FIRECRAWL_API_URL` | | Firecrawl API server URL | +| `FIRECRAWL_API_KEY` | | Firecrawl API key | +| `ZYTE_API_KEY` | | Zyte API key | ## Supported Content Types -### Social Media Content +### Social Media - [x] Twitter - [x] Bluesky (Beta, only supports part of posts) - [x] Instagram -- [ ] Threads +- [ ] Threads - [x] Reddit (Beta, only supports part of posts) - [ ] Quora - [x] Weibo @@ -132,11 +195,18 @@ You can set the api key of OpenAI to use the transcription 
function. - [x] Douban - [ ] Xiaohongshu -### Video Content +### Video -- [x] Youtube +- [x] YouTube - [x] Bilibili +## CI/CD + +The GitHub Actions pipeline (`.github/workflows/ci.yml`) automatically builds and pushes both microservice images to GitHub Container Registry on every push to `main`: + +- `ghcr.io/aturret/fastfetchbot-api:latest` +- `ghcr.io/aturret/fastfetchbot-telegram-bot:latest` + ## Acknowledgements The HTML to Telegra.ph converter function is based on [html-telegraph-poster](https://github.com/mercuree/html-telegraph-poster). I separated it from this project as an independent Python package: [html-telegraph-poster-v2](https://github.com/aturret/html-telegraph-poster-v2). diff --git a/app/config.py b/app/config.py index 50fd18d..ca0db5d 100644 --- a/app/config.py +++ b/app/config.py @@ -6,7 +6,7 @@ import gettext import secrets -from app.utils.parse import get_env_bool +from fastfetchbot_shared.utils.parse import get_env_bool env = os.environ current_directory = os.path.dirname(os.path.abspath(__file__)) diff --git a/app/models/classes.py b/app/models/classes.py index 2fab80a..e89bcf7 100644 --- a/app/models/classes.py +++ b/app/models/classes.py @@ -1,17 +1,2 @@ -from io import BytesIO - - -class NamedBytesIO(BytesIO): - @property - def name(self): - return self._name - - def __init__(self, content=None, name=None): - super().__init__(content) - self._name = name - if content is not None: - self.size = self.getbuffer().nbytes - - @name.setter - def name(self, value): - self._name = value +# Re-export from shared package +from fastfetchbot_shared.models.classes import NamedBytesIO # noqa: F401 diff --git a/app/models/metadata_item.py b/app/models/metadata_item.py index 6b5820d..5bce9fa 100644 --- a/app/models/metadata_item.py +++ b/app/models/metadata_item.py @@ -1,123 +1,12 @@ -from dataclasses import dataclass -from enum import Enum, unique -from typing import Any, List, TypeVar, Callable, Type, cast, Union, Optional - -from pydantic import 
BaseModel - -""" -MetadataItem is a dataclass that represents a single item for our services. It would be saved in the database. -The MetadataItem is used to send to the telegram bot. Users can use the metadata to define their own message template. -If the program doesn't find the attribute in the dict_data, it will use the default value in case of KeyError. -""" - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - if x is None: - return "" - assert isinstance(x, str) - return x - - -def from_list(f: Callable[[Any], T], x: Any) -> List[T]: - assert isinstance(x, list) - return [f(y) for y in x] - - -def to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@unique -class MessageType(str, Enum): - SHORT = "short" - LONG = "long" - - -@dataclass -class MediaFile: - media_type: str - url: str - original_url: Optional[str] = None - caption: Optional[str] = None - - @staticmethod - def from_dict(obj: Any) -> "MediaFile": - assert isinstance(obj, dict) - media_type = from_str(obj.get("media_type")) - url = from_str(obj.get("url")) - caption = from_str(obj.get("caption")) - return MediaFile(media_type, url, caption) - - def to_dict(self) -> dict: - result: dict = {} - result["media_type"] = from_str(self.media_type) - result["url"] = from_str(self.url) - result["caption"] = self.caption - return result - - -@dataclass -class MetadataItem: - url: str - telegraph_url: Optional[str] - content: Optional[str] - text: Optional[str] - media_files: List[MediaFile] - author: str - title: str - author_url: Optional[str] - category: str - message_type: Optional[MessageType] - - @staticmethod - def from_dict(obj: Any) -> "MetadataItem": - assert isinstance(obj, dict) - url = from_str(obj.get("url")) - telegraph_url = from_str(obj.get("telegraph_url")) - content = from_str(obj.get("content")) - text = from_str(obj.get("text")) - media_files = from_list(MediaFile.from_dict, obj.get("media_files")) - author = from_str(obj.get("author")) 
- title = from_str(obj.get("title")) - author_url = from_str(obj.get("author_url")) - category = from_str(obj.get("category")) - message_type = MessageType(obj.get("message_type")) - return MetadataItem( - url, - telegraph_url, - content, - text, - media_files, - author, - title, - author_url, - category, - message_type, - ) - - def to_dict(self) -> dict: - result: dict = { - "url": from_str(self.url), - "telegraph_url": "", "content": from_str(self.content), - "text": from_str(self.text), - "media_files": from_list( - lambda x: to_class(MediaFile, x), self.media_files - ), - "author": from_str(self.author), - "title": from_str(self.title), - "author_url": from_str(self.author_url), - "category": from_str(self.category), - "message_type": self.message_type.value - } - return result - - -def metadata_item_from_dict(s: Any) -> MetadataItem: - return MetadataItem.from_dict(s) - - -def metadata_item_to_dict(x: MetadataItem) -> Any: - return to_class(MetadataItem, x) +# Re-export from shared package +from fastfetchbot_shared.models.metadata_item import * # noqa: F401,F403 +from fastfetchbot_shared.models.metadata_item import ( # noqa: F401 + MetadataItem, + MediaFile, + MessageType, + from_str, + from_list, + to_class, + metadata_item_from_dict, + metadata_item_to_dict, +) diff --git a/app/models/telegraph_item.py b/app/models/telegraph_item.py index 04d5b77..2b4b2f0 100644 --- a/app/models/telegraph_item.py +++ b/app/models/telegraph_item.py @@ -1,58 +1,7 @@ -from dataclasses import dataclass -from typing import Any, TypeVar, Type, cast - -""" -The TelegraphItem is a class for generating a Telegraph page. -If the program doesn't find the attribute in the dict_data, it will use the default value in case of KeyError. 
-""" - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - assert isinstance(x, str) - return x - - -def to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@dataclass -class TelegraphItem: - title: str - url: str - author: str - author_url: str - category: str - content: str - - @staticmethod - def from_dict(obj: Any) -> 'TelegraphItem': - assert isinstance(obj, dict) - title = from_str(obj.get("title")) - url = from_str(obj.get("url")) - author = from_str(obj.get("author")) - author_url = from_str(obj.get("author_url")) - category = from_str(obj.get("category")) - content = from_str(obj.get("content")) - return TelegraphItem(title, url, author, author_url, category, content) - - def to_dict(self) -> dict: - result: dict = {} - result["title"] = from_str(self.title) - result["url"] = from_str(self.url) - result["author"] = from_str(self.author) - result["author_url"] = from_str(self.author_url) - result["category"] = from_str(self.category) - result["content"] = from_str(self.content) - return result - - -def telegraph_item_from_dict(s: Any) -> TelegraphItem: - return TelegraphItem.from_dict(s) - - -def telegraph_item_to_dict(x: TelegraphItem) -> Any: - return to_class(TelegraphItem, x) +# Re-export from shared package +from fastfetchbot_shared.models.telegraph_item import * # noqa: F401,F403 +from fastfetchbot_shared.models.telegraph_item import ( # noqa: F401 + TelegraphItem, + telegraph_item_from_dict, + telegraph_item_to_dict, +) diff --git a/app/models/url_metadata.py b/app/models/url_metadata.py index a581045..020d120 100644 --- a/app/models/url_metadata.py +++ b/app/models/url_metadata.py @@ -1,50 +1,7 @@ -import re -from dataclasses import dataclass -from typing import Any, TypeVar, Type, cast - -T = TypeVar("T") - - -def from_str(x: Any) -> str: - assert isinstance(x, str) - return x - - -def to_class(c: Type[T], x: Any) -> dict: - assert isinstance(x, c) - return cast(Any, x).to_dict() - - -@dataclass 
-class UrlMetadata: - url: str - source: str - content_type: str - - def __init__(self, url: str, source: str, content_type: str) -> None: - self.url = url - self.source = source - self.content_type = content_type - - @staticmethod - def from_dict(obj: Any) -> "UrlMetadata": - assert isinstance(obj, dict) - url = from_str(obj.get("url")) - source = from_str(obj.get("source")) - the_type = from_str(obj.get("type")) - return UrlMetadata(url, source, the_type) - - def to_dict(self) -> dict: - result: dict = {} - result["url"] = from_str(self.url) - result["source"] = from_str(self.source) - result["content_type"] = from_str(self.content_type) - return result - - -def url_metadata_from_dict(s: Any) -> UrlMetadata: - return UrlMetadata.from_dict(s) - - -def url_metadata_to_dict(x: UrlMetadata) -> Any: - return to_class(UrlMetadata, x) +# Re-export from shared package +from fastfetchbot_shared.models.url_metadata import * # noqa: F401,F403 +from fastfetchbot_shared.models.url_metadata import ( # noqa: F401 + UrlMetadata, + url_metadata_from_dict, + url_metadata_to_dict, +) diff --git a/app/services/inoreader/telegram_process.py b/app/services/inoreader/telegram_process.py index a1102f5..975e894 100644 --- a/app/services/inoreader/telegram_process.py +++ b/app/services/inoreader/telegram_process.py @@ -1,22 +1,34 @@ -from typing import Union, Optional, Dict +from typing import Union, Optional, Dict, Callable, Awaitable from app.config import TELEGRAM_CHANNEL_ID from app.models.url_metadata import UrlMetadata from app.services.inoreader import Inoreader from app.services.scrapers.common import InfoExtractService -from app.services.telegram_bot import send_item_message from app.utils.logger import logger from app.utils.parse import get_url_metadata, get_bool default_telegram_channel_id = TELEGRAM_CHANNEL_ID[0] if TELEGRAM_CHANNEL_ID else None +# Type alias for the message callback +MessageCallback = Callable[[dict, Union[int, str]], Awaitable[None]] + + +async def 
_default_message_callback(metadata_item: dict, chat_id: Union[int, str]) -> None: + """Default callback that sends via Telegram bot. Used when no callback is provided.""" + from app.services.telegram_bot import send_item_message + await send_item_message(metadata_item, chat_id=chat_id) + async def process_inoreader_data( data: list, use_inoreader_content: bool, telegram_channel_id: Union[int, str] = default_telegram_channel_id, stream_id: str = None, + message_callback: MessageCallback = None, ): + if message_callback is None: + message_callback = _default_message_callback + for item in data: url_type_item = await get_url_metadata(item["aurl"]) url_type_dict = url_type_item.to_dict() @@ -46,7 +58,7 @@ async def process_inoreader_data( store_document=True, ) message_metadata_item = await metadata_item.get_item() - await send_item_message(message_metadata_item, chat_id=telegram_channel_id) + await message_callback(message_metadata_item, telegram_channel_id) if stream_id: await Inoreader.mark_all_as_read( stream_id=stream_id, timestamp=item["timestamp"] - 1 @@ -57,7 +69,7 @@ async def get_inoreader_item_async( data: Optional[Dict] = None, trigger: bool = False, params: Optional[Dict] = None, - # filters: Optional[Dict] = None, + message_callback: MessageCallback = None, ) -> None: stream_id = None use_inoreader_content = True @@ -83,7 +95,8 @@ async def get_inoreader_item_async( if type(data) is dict: data = [data] await process_inoreader_data( - data, use_inoreader_content, telegram_channel_id, stream_id + data, use_inoreader_content, telegram_channel_id, stream_id, + message_callback=message_callback, ) if stream_id: await Inoreader.mark_all_as_read(stream_id=stream_id) diff --git a/app/services/telegram_bot/__init__.py b/app/services/telegram_bot/__init__.py index 5de80d8..0fe696e 100755 --- a/app/services/telegram_bot/__init__.py +++ b/app/services/telegram_bot/__init__.py @@ -1,36 +1,15 @@ # TODO: Implement Telegram Service # example: 
https://docs.python-telegram-bot.org/en/stable/examples.customwebhookbot.html -import asyncio -import html -import json -import os import mimetypes -import aiofiles -import traceback -from io import BytesIO -from urllib.parse import urlparse -from urllib.request import url2pathname -from typing import Union mimetypes.init() from telegram import ( Update, MessageEntity, - InlineKeyboardButton, - InlineKeyboardMarkup, - Message, - InputMediaPhoto, - InputMediaVideo, - InputMediaDocument, - InputMediaAnimation, - InputMediaAudio, ) -from telegram.constants import ParseMode from telegram.ext import ( Application, - CallbackContext, - ContextTypes, MessageHandler, CallbackQueryHandler, filters, @@ -38,49 +17,31 @@ AIORateLimiter, ) -from app.database import save_instances -from app.models.metadata_item import MessageType -from app.models.telegram_chat import TelegramMessage, TelegramUser, TelegramChat -from app.services.scrapers.common import InfoExtractService -from app.utils.parse import get_url_metadata, telegram_message_html_trim -from app.utils.network import download_file_by_metadata_item -from app.utils.image import Image, image_compressing, check_image_type -from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS from app.utils.logger import logger from app.config import ( TELEGRAM_BOT_TOKEN, TELEGRAM_WEBHOOK_URL, TELEGRAM_BOT_SECRET_TOKEN, - TELEGRAM_CHANNEL_ID, - TELEGRAM_CHANNEL_ADMIN_LIST, - TELEBOT_DEBUG_CHANNEL, TELEBOT_API_SERVER, TELEBOT_API_SERVER_FILE, TELEBOT_LOCAL_FILE_MODE, TELEBOT_CONNECT_TIMEOUT, TELEBOT_READ_TIMEOUT, TELEBOT_WRITE_TIMEOUT, - TELEGRAM_IMAGE_DIMENSION_LIMIT, - TELEGRAM_IMAGE_SIZE_LIMIT, - TELEGRAM_GROUP_MESSAGE_BAN_LIST, - TELEGRAM_BOT_MESSAGE_BAN_LIST, - FILE_EXPORTER_ON, - JINJA2_ENV, - OPENAI_API_KEY, - DATABASE_ON, - TEMPLATE_LANGUAGE, TELEBOT_MAX_RETRY, GENERAL_SCRAPING_ON, + TELEBOT_MAX_RETRY, ) -from app.services.telegram_bot.config import ( - HTTPS_URL_REGEX, - TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT, 
- TELEGRAM_FILE_UPLOAD_LIMIT, - TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API, - REFERER_REQUIRED, - TELEGRAM_TEXT_LIMIT, - TEMPLATE_TRANSLATION, + +# Re-export for external consumers +from app.services.telegram_bot.message_sender import send_item_message # noqa: F401 +from app.services.telegram_bot.handlers import ( # noqa: F401 + https_url_process, + https_url_auto_process, + all_messages_process, + buttons_process, + invalid_buttons, + error_process, + content_process_function, ) -from app.models.classes import NamedBytesIO -from app.models.url_metadata import UrlMetadata """ application and handlers initialization @@ -112,12 +73,6 @@ async def set_webhook() -> bool: else: logger.error("TELEGRAM_BOT_TOKEN is not set!") -environment = JINJA2_ENV -template = environment.get_template("social_media_message.jinja2") -template_text = TEMPLATE_TRANSLATION.get( - TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"] -) - async def startup() -> None: await application.initialize() @@ -187,630 +142,3 @@ async def process_telegram_update( update = Update.de_json(data=data, bot=application.bot) application.bot.insert_callback_data(update) await application.update_queue.put(update) - - -async def https_url_process(update: Update, context: CallbackContext) -> None: - message = update.message - welcome_message = await message.reply_text( - text="Processing...", - ) - url_dict: dict = message.parse_entities(types=["url"]) - await welcome_message.delete() - for i, url in enumerate(url_dict.values()): - process_message = await message.reply_text( - text=f"Processing the {i + 1}th url...", - ) - url_metadata = await get_url_metadata(url, ban_list=TELEGRAM_BOT_MESSAGE_BAN_LIST) - if url_metadata.source == "banned": - await process_message.edit_text( - text=f"For the {i + 1} th url, the url is banned." - ) - return - if url_metadata.source == "unknown": - if GENERAL_SCRAPING_ON: - await process_message.edit_text( - text=f"Uncategorized url found. General webpage parser is on, Processing..." 
- ) - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id - ) - await process_message.edit_text( - text=f"For the {i + 1} th url, no supported url found." - ) - return - else: - await process_message.edit_text( - text=f"{url_metadata.source} url found. Processing..." - ) - # create the inline keyboard - special_function_keyboard = [] - basic_function_keyboard = [] - if TELEGRAM_CHANNEL_ID and ( - TELEGRAM_CHANNEL_ADMIN_LIST - and str(message.from_user.id) in TELEGRAM_CHANNEL_ADMIN_LIST - ): - special_function_keyboard.append( - InlineKeyboardButton( - "Send to Channel", - callback_data={ - "type": "channel", - "metadata": url_metadata, - "extra_args": {"store_document": True}, - }, - ), - ) - # video content url buttons - if url_metadata.content_type == "video": - basic_function_keyboard.extend( - [ - InlineKeyboardButton( - "Get Info", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": {"download": False}, - }, - ), - InlineKeyboardButton( - "Download", - callback_data={ - "type": "video", - "metadata": url_metadata, - }, - ), - ] - ) - if FILE_EXPORTER_ON: - special_function_keyboard.extend( - [ - InlineKeyboardButton( - "Audio Only", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": { - "audio_only": True, - }, - }, - ), - InlineKeyboardButton( - "Download HD", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": {"hd": True}, - }, - ), - ] - ) - if OPENAI_API_KEY: - special_function_keyboard.append( - InlineKeyboardButton( - "Transcribe Text", - callback_data={ - "type": "video", - "metadata": url_metadata, - "extra_args": { - "audio_only": True, - "transcribe": True, - "store_document": True, - }, - }, - ), - ) - elif url_metadata.content_type == "social_media": - basic_function_keyboard.extend( - [ - InlineKeyboardButton( - "Send to Me", - callback_data={"type": "private", 
"metadata": url_metadata}, - ), - InlineKeyboardButton( - "Force Send in Chat", - callback_data={"type": "force", "metadata": url_metadata}, - ), - ] - ) - if FILE_EXPORTER_ON: - special_function_keyboard.append( - InlineKeyboardButton( - "Send with PDF", - callback_data={ - "type": "pdf", - "metadata": url_metadata, - "extra_args": {"store_document": True}, - }, - ), - ) - basic_function_keyboard.append( - InlineKeyboardButton( - "Cancel", - callback_data={"type": "cancel"}, - ), - ) - keyboard = [ - special_function_keyboard, - basic_function_keyboard, - ] - reply_markup = InlineKeyboardMarkup(keyboard) - await process_message.reply_text( - f"For the {i + 1}th url: {url}, please choose the function you want to use:", - reply_markup=reply_markup, - ) - await process_message.delete() - - -async def https_url_auto_process(update: Update, context: CallbackContext) -> None: - message = update.message - url_dict = message.parse_entities(types=["url"]) - for i, url in enumerate(url_dict.values()): - url_metadata = await get_url_metadata( - url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST - ) - if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON: - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - elif url_metadata.source == "unknown" or url_metadata.source == "banned": - logger.debug(f"for the {i + 1}th url {url}, no supported url found.") - return - if url_metadata.to_dict().get("source") in SOCIAL_MEDIA_WEBSITE_PATTERNS.keys(): - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - if url_metadata.to_dict().get("source") in VIDEO_WEBSITE_PATTERNS.keys(): - metadata_item = await content_process_function(url_metadata=url_metadata) - await send_item_message( - metadata_item, chat_id=message.chat_id, message=message - ) - - -async def 
all_messages_process(update: Update, context: CallbackContext) -> None: - message = update.message - logger.debug(message) - if message and DATABASE_ON: - telegram_chat = TelegramChat.construct(**message.chat.to_dict()) - telegram_user = TelegramUser.construct(**message.from_user.to_dict()) - telegram_message = TelegramMessage( - datetime=message.date, - chat=telegram_chat, - user=telegram_user, - text=message.text or "", - ) - await save_instances(telegram_message) - - -async def buttons_process(update: Update, context: CallbackContext) -> None: - query = update.callback_query - data = query.data - chat_id = None - if data["type"] == "cancel": - await query.answer("Canceled") - else: - if data["type"] == "private" or data["type"] == "force": - await query.answer("Sending to you...") - if data["type"] == "channel": - if data.get("channel_id") or len(TELEGRAM_CHANNEL_ID) == 1: - channel_chat = await application.bot.get_chat( - chat_id=data.get("channel_id") - if data.get("channel_id") - else TELEGRAM_CHANNEL_ID[0] - ) - await query.answer("Sending to channel...") - if channel_chat.type == "channel": - chat_id = channel_chat.id - else: - await query.message.reply_text( - text="Sorry, the provided channel id does not exist or is not a channel." 
- ) - chat_id = query.message.chat_id - elif len(TELEGRAM_CHANNEL_ID) > 1: - choose_channel_keyboard = await _create_choose_channel_keyboard( - data=data - ) - await query.message.reply_text( - text="Please choose the channel you want to send:", - reply_markup=InlineKeyboardMarkup(choose_channel_keyboard), - ) - await query.message.delete() - context.drop_callback_data(query) - return - else: - chat_id = query.message.chat_id - if data["type"] == "video": - await query.answer("Video processing...") - replying_message = await query.message.reply_text( - text=f"Item processing...", - ) - extra_args = data["extra_args"] if "extra_args" in data else {} - metadata_item = await content_process_function( - url_metadata=data["metadata"], **extra_args - ) - await replying_message.edit_text( - text=f"Item processed. Sending to the target...", - ) - if data["type"] == "force": - metadata_item["message_type"] = MessageType.SHORT - await send_item_message(metadata_item, chat_id=chat_id) - if data["type"] == "channel": - await query.message.reply_text( - text=f"Item sent to the channel.", - ) - await replying_message.delete() - await query.message.delete() - context.drop_callback_data(query) - - -async def _create_choose_channel_keyboard(data: dict) -> list: - choose_channel_keyboard = [] - for i, channel_id in enumerate(TELEGRAM_CHANNEL_ID): - channel_chat = await application.bot.get_chat(chat_id=channel_id) - choose_channel_keyboard.append( - [ - InlineKeyboardButton( - channel_chat.title, - callback_data={ - "type": "channel", - "metadata": data["metadata"], - "extra_args": data["extra_args"], - "channel_id": channel_id, - }, - ) - ] - ) - choose_channel_keyboard.append( - [ - InlineKeyboardButton( - "Cancel", - callback_data={"type": "cancel"}, - ) - ] - ) - return choose_channel_keyboard - - -async def invalid_buttons(update: Update, context: CallbackContext) -> None: - await update.callback_query.answer("Invalid button!") - await update.effective_message.edit_text( - 
"Sorry, Error Occurred, I could not process this button click 😕." - ) - - -async def content_process_function(url_metadata: UrlMetadata, **kwargs) -> dict: - item = InfoExtractService(url_metadata, **kwargs) - metadata_item = await item.get_item() - return metadata_item - - -async def send_item_message( - data: dict, chat_id: Union[int, str] = None, message: Message = None -) -> None: - """ - :param data: (dict) metadata of the item - :param chat_id: (int) any chat id for sending - :param message: (Message) any message to reply - :return: - """ - logger.debug(f"send_item_message: {data}, {chat_id}, {message}") - if not chat_id and not message: - raise ValueError("must provide chat_id or message") - if ( - not chat_id - ) and message: # this function supports direct reply to a message even if the chat_id is None - chat_id = message.chat.id - discussion_chat_id = chat_id - the_chat = await application.bot.get_chat(chat_id=chat_id) - logger.debug(f"the chat of sending message: {the_chat}") - if the_chat.type == "channel" and the_chat.linked_chat_id: - discussion_chat_id = the_chat.linked_chat_id - try: - caption_text = message_formatting(data) - if len(data["media_files"]) > 0: - # if the message type is short and there are some media files, send media group - reply_to_message_id = None - media_message_group, file_message_group = await media_files_packaging( - media_files=data["media_files"], data=data - ) - if ( - len(media_message_group) > 0 - ): # if there are some media groups to send, send it - for i, media_group in enumerate(media_message_group): - caption_text = ( - caption_text - if i == 0 - else f"the {i + 1}th part of the media item:" - ) - logger.debug(f"media group: {media_group}") - logger.debug( - f"caption text: {caption_text},length={len(caption_text)}" - ) - sent_media_files_message = await application.bot.send_media_group( - chat_id=chat_id, - media=media_group, - parse_mode=ParseMode.HTML, - caption=caption_text, - 
write_timeout=TELEBOT_WRITE_TIMEOUT, - reply_to_message_id=message.message_id if message else None, - ) - if sent_media_files_message is tuple: - reply_to_message_id = sent_media_files_message[0].message_id - elif sent_media_files_message is Message: - reply_to_message_id = sent_media_files_message.message_id - logger.debug(f"sent media files message: {sent_media_files_message}") - else: - sent_message = await application.bot.send_message( - chat_id=chat_id, - text=caption_text, - parse_mode=ParseMode.HTML, - reply_to_message_id=message.message_id if message else None, - disable_web_page_preview=True - if data["message_type"] == MessageType.SHORT - else False, - disable_notification=True, - ) - if discussion_chat_id != chat_id: - await asyncio.sleep( - 3 - ) # wait for several seconds to avoid missing the target message - # if the chat is a channel, get the latest pinned message from the channel and reply to it - group_chat = await application.bot.get_chat(chat_id=discussion_chat_id) - logger.debug(f"the group chat: {group_chat}") - pinned_message = group_chat.pinned_message - logger.debug(f"the pinned message: {pinned_message}") - if len(media_message_group) > 0: - if ( - pinned_message.forward_origin.message_id - == sent_media_files_message[-1].message_id - ): - reply_to_message_id = ( - group_chat.pinned_message.id - - len(sent_media_files_message) - + 1 - ) - else: - reply_to_message_id = group_chat.pinned_message.id + 1 - elif pinned_message.forward_origin.message_id == sent_message.message_id: - reply_to_message_id = group_chat.pinned_message.id - else: - reply_to_message_id = group_chat.pinned_message.id + 1 - if ( - len(file_message_group) > 0 - ): # to send files, the files messages should be replied to the message sent before - logger.debug(f"reply_to_message_id: {reply_to_message_id}") - for file_group in file_message_group: - logger.debug(f"file group: {file_group}") - await application.bot.send_media_group( - chat_id=discussion_chat_id, - 
media=file_group, - reply_to_message_id=reply_to_message_id, - parse_mode=ParseMode.HTML, - disable_notification=True, - ) - else: - await application.bot.send_message( - chat_id=chat_id, - text=caption_text, - parse_mode=ParseMode.HTML, - reply_to_message_id=message.message_id if message else None, - disable_web_page_preview=True - if data["message_type"] == "short" - else False, - disable_notification=True, - ) - # except BadRequest as e: - # logger.error(e) - # except RetryAfter as e: - # logger.error(e) - # except TimedOut as e: - # logger.error(e) - # await application.bot.send_message( - # chat_id=discussion_chat_id, - # text="Timed out while sending the item to the target 😕", - # reply_to_message_id=message.message_id if message else None, - # ) - except Exception as e: - logger.error(e) - traceback.print_exc() - # await application.bot.send_message( - # chat_id=discussion_chat_id, - # text="Error occurred while sending the item to the target 😕", - # reply_to_message_id=message.message_id if message else None, - # ) - await send_debug_channel(traceback.format_exc()) - - -async def error_process(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: - logger.error("Exception while handling an update:", exc_info=context.error) - tb_list = traceback.format_exception( - None, context.error, context.error.__traceback__ - ) - tb_string = "".join(tb_list) - update_str = update.to_dict() if isinstance(update, Update) else str(update) - message = ( - f"An exception was raised while handling an update\n" - f"
<pre>update = {html.escape(json.dumps(update_str, indent=2, ensure_ascii=False))}"
-        "</pre>\n\n"
-        f"<pre>context.chat_data = {html.escape(str(context.chat_data))}</pre>\n\n"
-        f"<pre>context.user_data = {html.escape(str(context.user_data))}</pre>\n\n"
-        f"<pre>{html.escape(tb_string)}</pre>
" - ) - debug_chat_id = update.message.chat_id - if TELEBOT_DEBUG_CHANNEL is not None: - debug_chat_id = TELEBOT_DEBUG_CHANNEL - await context.bot.send_message( - chat_id=debug_chat_id, text=message, parse_mode=ParseMode.HTML - ) - - -async def send_debug_channel(message: str) -> None: - if TELEBOT_DEBUG_CHANNEL is not None: - await application.bot.send_message( - chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML - ) - - -def message_formatting(data: dict) -> str: - """ - Format the message to be sent to the user. - :param data: - :return: text (str) the formatted text for telegram bot api sending message. - """ - if data["message_type"] == "short": - data["text"] = telegram_message_html_trim(data["text"]) - message_template = template - text = message_template.render(data=data, template_text=template_text) - logger.debug(f"message text: \n{text}") - return text - - -async def media_files_packaging(media_files: list, data: dict) -> tuple: - """ - Download the media files from data["media_files"] and package them into a list of media group or file group for - sending them by send_media_group method or send_document method. - :param data: (dict) metadata of the item - :param media_files: (list) a list of media files, - :param caption_text: (str) the caption text - :return: (tuple) a tuple of media group and file group - media_message_group: (list) a list of media items, the type of each item is InputMediaPhoto or InputMediaVideo - file_group: (list) a list of file items, the type of each item is InputFile - TODO: It's not a good practice for this function. This method will still download all the media files even when - media files are too large and it can be memory consuming even if we use a database to store the media files. - The function should be optimized to resolve the media files one group by one group and send each group - immediately after it is resolved. - This processing method should be optimized in the future. 
- """ - media_counter, file_counter = 0, 0 - media_message_group, media_group, file_message_group, file_group = [], [], [], [] - for ( - media_item - ) in media_files: # To traverse all media items in the media files list - # check if we need to create a new media group - if media_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: - # the limitation of media item for a single telegram media group message is 10 - media_message_group.append(media_group) - media_group = [] - media_counter = 0 - if file_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: - # the limitation of media item for a single telegram media group message is 10 - file_message_group.append(file_group) - file_group = [] - file_counter = 0 - if not ( - media_item["media_type"] in ["image", "gif", "video"] - and data["message_type"] == "long" - ): - # check the url validity - url_parser = urlparse(media_item["url"]) - if url_parser.scheme in [ - "http", - "https", - ]: # if the url is a http url, download the file - file_format = "mp4" if media_item["media_type"] == "video" else None - io_object = await download_file_by_metadata_item( - media_item["url"], data=data, file_format=file_format - ) - filename = io_object.name - file_size = io_object.size - else: # if the url is a local file path, just add it to the media group - try: - file_path = url2pathname(media_item["url"]) - async with aiofiles.open(file_path, mode="rb") as f: - filename = os.path.basename(file_path) - content = await f.read() - io_object = NamedBytesIO(content=content, name=filename) - file_size = io_object.size - except Exception as e: # the url is not a valid file path - logger.error(e) - continue - # check the file size - if ( - not TELEBOT_API_SERVER - ): # the official telegram bot api server only supports 50MB file - if file_size > TELEGRAM_FILE_UPLOAD_LIMIT: - # if the size is over 50MB, skip this file - continue - else: - if file_size > TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API: - # for local api sever, if the size is over 2GB, skip 
this file - continue - # check media files' type and process them by their type - if media_item["media_type"] == "image": - image_url = media_item["url"] - ext = await check_image_type(io_object) - # jpg to jpeg, ignore case - if ext.lower() == "jpg": - ext = "JPEG" - io_object.seek(0) - image = Image.open(io_object, formats=[ext]) - img_width, img_height = image.size - ratio = float(max(img_height, img_width)) / float( - min(img_height, img_width) - ) - # don't try to resize image if the ratio is too large - if ( - ratio < 5 - or max(img_height, img_width) < TELEGRAM_IMAGE_DIMENSION_LIMIT - ): - image = image_compressing(image, TELEGRAM_IMAGE_DIMENSION_LIMIT) - with BytesIO() as buffer: - # mime_type file format - image.save(buffer, format=ext) - buffer.seek(0) - resized_ratio = max(image.height, image.width) / min( - image.height, image.width - ) - logger.debug( - f"resized image size: {buffer.getbuffer().nbytes}, ratio: {resized_ratio}, width: {image.width}, height: {image.height}" - ) - media_group.append(InputMediaPhoto(buffer, filename=filename)) - # the image is not able to get json serialized - logger.debug( - f"image size: {file_size}, ratio: {ratio}, width: {img_width}, height: {img_height}" - ) - if ( - file_size > TELEGRAM_IMAGE_SIZE_LIMIT - or img_width > TELEGRAM_IMAGE_DIMENSION_LIMIT - or img_height > TELEGRAM_IMAGE_DIMENSION_LIMIT - ) and data["category"] not in ["xiaohongshu"]: - io_object = await download_file_by_metadata_item( - url=image_url, data=data - ) - if not io_object.name.endswith(".gif"): - if not io_object.name.endswith(ext.lower()): - io_object.name = io_object.name + "." + ext.lower() - # TODO: it is not a good way to judge whether it is a gif... 
- file_group.append( - InputMediaDocument(io_object, parse_mode=ParseMode.HTML) - ) - file_counter += 1 - elif media_item["media_type"] == "gif": - io_object = await download_file_by_metadata_item( - url=media_item["url"], - data=data, - file_name="gif_image-" + str(media_counter) + ".gif", - ) - io_object.name = io_object.name + ".gif" - media_group.append(InputMediaAnimation(io_object)) - elif media_item["media_type"] == "video": - media_group.append(InputMediaVideo(io_object, supports_streaming=True)) - # TODO: not have any services to store audio files for now, just a placeholder - elif media_item["media_type"] == "audio": - media_group.append(InputMediaAudio(io_object)) - elif media_item["media_type"] == "document": - file_group.append( - InputMediaDocument(io_object, parse_mode=ParseMode.HTML) - ) - file_counter += 1 - media_counter += 1 - logger.info( - f"get the {media_counter}th media item,type: {media_item['media_type']}, url: {media_item['url']}" - ) - # check if the media group is empty, if it is, return None - if len(media_group) > 0: # append the last media group - media_message_group.append(media_group) - if len(file_group) > 0: - file_message_group.append(file_group) - return media_message_group, file_message_group diff --git a/app/services/telegram_bot/handlers.py b/app/services/telegram_bot/handlers.py new file mode 100644 index 0000000..73bd5b9 --- /dev/null +++ b/app/services/telegram_bot/handlers.py @@ -0,0 +1,359 @@ +import html +import json +import traceback + +from telegram import ( + Update, + MessageEntity, + InlineKeyboardButton, + InlineKeyboardMarkup, +) +from telegram.constants import ParseMode +from telegram.ext import ( + CallbackContext, + ContextTypes, +) + +from app.database import save_instances +from app.models.metadata_item import MessageType +from app.models.telegram_chat import TelegramMessage, TelegramUser, TelegramChat +from app.models.url_metadata import UrlMetadata +from app.services.scrapers.common import 
InfoExtractService +from app.services.telegram_bot.message_sender import send_item_message +from app.utils.parse import get_url_metadata +from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS +from app.utils.logger import logger +from app.config import ( + TELEGRAM_CHANNEL_ID, + TELEGRAM_CHANNEL_ADMIN_LIST, + TELEBOT_DEBUG_CHANNEL, + TELEGRAM_GROUP_MESSAGE_BAN_LIST, + TELEGRAM_BOT_MESSAGE_BAN_LIST, + FILE_EXPORTER_ON, + OPENAI_API_KEY, + DATABASE_ON, + GENERAL_SCRAPING_ON, +) + + +async def content_process_function(url_metadata: UrlMetadata, **kwargs) -> dict: + item = InfoExtractService(url_metadata, **kwargs) + metadata_item = await item.get_item() + return metadata_item + + +async def https_url_process(update: Update, context: CallbackContext) -> None: + message = update.message + welcome_message = await message.reply_text( + text="Processing...", + ) + url_dict: dict = message.parse_entities(types=["url"]) + await welcome_message.delete() + for i, url in enumerate(url_dict.values()): + process_message = await message.reply_text( + text=f"Processing the {i + 1}th url...", + ) + url_metadata = await get_url_metadata(url, ban_list=TELEGRAM_BOT_MESSAGE_BAN_LIST) + if url_metadata.source == "banned": + await process_message.edit_text( + text=f"For the {i + 1} th url, the url is banned." + ) + return + if url_metadata.source == "unknown": + if GENERAL_SCRAPING_ON: + await process_message.edit_text( + text=f"Uncategorized url found. General webpage parser is on, Processing..." + ) + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id + ) + await process_message.edit_text( + text=f"For the {i + 1} th url, no supported url found." + ) + return + else: + await process_message.edit_text( + text=f"{url_metadata.source} url found. Processing..." 
+ ) + # create the inline keyboard + special_function_keyboard = [] + basic_function_keyboard = [] + if TELEGRAM_CHANNEL_ID and ( + TELEGRAM_CHANNEL_ADMIN_LIST + and str(message.from_user.id) in TELEGRAM_CHANNEL_ADMIN_LIST + ): + special_function_keyboard.append( + InlineKeyboardButton( + "Send to Channel", + callback_data={ + "type": "channel", + "metadata": url_metadata, + "extra_args": {"store_document": True}, + }, + ), + ) + # video content url buttons + if url_metadata.content_type == "video": + basic_function_keyboard.extend( + [ + InlineKeyboardButton( + "Get Info", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": {"download": False}, + }, + ), + InlineKeyboardButton( + "Download", + callback_data={ + "type": "video", + "metadata": url_metadata, + }, + ), + ] + ) + if FILE_EXPORTER_ON: + special_function_keyboard.extend( + [ + InlineKeyboardButton( + "Audio Only", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": { + "audio_only": True, + }, + }, + ), + InlineKeyboardButton( + "Download HD", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": {"hd": True}, + }, + ), + ] + ) + if OPENAI_API_KEY: + special_function_keyboard.append( + InlineKeyboardButton( + "Transcribe Text", + callback_data={ + "type": "video", + "metadata": url_metadata, + "extra_args": { + "audio_only": True, + "transcribe": True, + "store_document": True, + }, + }, + ), + ) + elif url_metadata.content_type == "social_media": + basic_function_keyboard.extend( + [ + InlineKeyboardButton( + "Send to Me", + callback_data={"type": "private", "metadata": url_metadata}, + ), + InlineKeyboardButton( + "Force Send in Chat", + callback_data={"type": "force", "metadata": url_metadata}, + ), + ] + ) + if FILE_EXPORTER_ON: + special_function_keyboard.append( + InlineKeyboardButton( + "Send with PDF", + callback_data={ + "type": "pdf", + "metadata": url_metadata, + "extra_args": {"store_document": True}, + }, + 
), + ) + basic_function_keyboard.append( + InlineKeyboardButton( + "Cancel", + callback_data={"type": "cancel"}, + ), + ) + keyboard = [ + special_function_keyboard, + basic_function_keyboard, + ] + reply_markup = InlineKeyboardMarkup(keyboard) + await process_message.reply_text( + f"For the {i + 1}th url: {url}, please choose the function you want to use:", + reply_markup=reply_markup, + ) + await process_message.delete() + + +async def https_url_auto_process(update: Update, context: CallbackContext) -> None: + message = update.message + url_dict = message.parse_entities(types=["url"]) + for i, url in enumerate(url_dict.values()): + url_metadata = await get_url_metadata( + url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST + ) + if url_metadata.source == "unknown" and GENERAL_SCRAPING_ON: + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + elif url_metadata.source == "unknown" or url_metadata.source == "banned": + logger.debug(f"for the {i + 1}th url {url}, no supported url found.") + return + if url_metadata.to_dict().get("source") in SOCIAL_MEDIA_WEBSITE_PATTERNS.keys(): + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + if url_metadata.to_dict().get("source") in VIDEO_WEBSITE_PATTERNS.keys(): + metadata_item = await content_process_function(url_metadata=url_metadata) + await send_item_message( + metadata_item, chat_id=message.chat_id, message=message + ) + + +async def all_messages_process(update: Update, context: CallbackContext) -> None: + message = update.message + logger.debug(message) + if message and DATABASE_ON: + telegram_chat = TelegramChat.construct(**message.chat.to_dict()) + telegram_user = TelegramUser.construct(**message.from_user.to_dict()) + telegram_message = TelegramMessage( + datetime=message.date, + chat=telegram_chat, 
+ user=telegram_user, + text=message.text or "", + ) + await save_instances(telegram_message) + + +async def buttons_process(update: Update, context: CallbackContext) -> None: + from app.services.telegram_bot import application + + query = update.callback_query + data = query.data + chat_id = None + if data["type"] == "cancel": + await query.answer("Canceled") + else: + if data["type"] == "private" or data["type"] == "force": + await query.answer("Sending to you...") + if data["type"] == "channel": + if data.get("channel_id") or len(TELEGRAM_CHANNEL_ID) == 1: + channel_chat = await application.bot.get_chat( + chat_id=data.get("channel_id") + if data.get("channel_id") + else TELEGRAM_CHANNEL_ID[0] + ) + await query.answer("Sending to channel...") + if channel_chat.type == "channel": + chat_id = channel_chat.id + else: + await query.message.reply_text( + text="Sorry, the provided channel id does not exist or is not a channel." + ) + chat_id = query.message.chat_id + elif len(TELEGRAM_CHANNEL_ID) > 1: + choose_channel_keyboard = await _create_choose_channel_keyboard( + data=data + ) + await query.message.reply_text( + text="Please choose the channel you want to send:", + reply_markup=InlineKeyboardMarkup(choose_channel_keyboard), + ) + await query.message.delete() + context.drop_callback_data(query) + return + else: + chat_id = query.message.chat_id + if data["type"] == "video": + await query.answer("Video processing...") + replying_message = await query.message.reply_text( + text=f"Item processing...", + ) + extra_args = data["extra_args"] if "extra_args" in data else {} + metadata_item = await content_process_function( + url_metadata=data["metadata"], **extra_args + ) + await replying_message.edit_text( + text=f"Item processed. 
Sending to the target...", + ) + if data["type"] == "force": + metadata_item["message_type"] = MessageType.SHORT + await send_item_message(metadata_item, chat_id=chat_id) + if data["type"] == "channel": + await query.message.reply_text( + text=f"Item sent to the channel.", + ) + await replying_message.delete() + await query.message.delete() + context.drop_callback_data(query) + + +async def _create_choose_channel_keyboard(data: dict) -> list: + from app.services.telegram_bot import application + + choose_channel_keyboard = [] + for i, channel_id in enumerate(TELEGRAM_CHANNEL_ID): + channel_chat = await application.bot.get_chat(chat_id=channel_id) + choose_channel_keyboard.append( + [ + InlineKeyboardButton( + channel_chat.title, + callback_data={ + "type": "channel", + "metadata": data["metadata"], + "extra_args": data["extra_args"], + "channel_id": channel_id, + }, + ) + ] + ) + choose_channel_keyboard.append( + [ + InlineKeyboardButton( + "Cancel", + callback_data={"type": "cancel"}, + ) + ] + ) + return choose_channel_keyboard + + +async def invalid_buttons(update: Update, context: CallbackContext) -> None: + await update.callback_query.answer("Invalid button!") + await update.effective_message.edit_text( + "Sorry, Error Occurred, I could not process this button click 😕." + ) + + +async def error_process(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: + logger.error("Exception while handling an update:", exc_info=context.error) + tb_list = traceback.format_exception( + None, context.error, context.error.__traceback__ + ) + tb_string = "".join(tb_list) + update_str = update.to_dict() if isinstance(update, Update) else str(update) + message = ( + f"An exception was raised while handling an update\n" + f"
<pre>update = {html.escape(json.dumps(update_str, indent=2, ensure_ascii=False))}"
+        "</pre>\n\n"
+        f"<pre>context.chat_data = {html.escape(str(context.chat_data))}</pre>\n\n"
+        f"<pre>context.user_data = {html.escape(str(context.user_data))}</pre>\n\n"
+        f"<pre>{html.escape(tb_string)}</pre>
" + ) + debug_chat_id = update.message.chat_id + if TELEBOT_DEBUG_CHANNEL is not None: + debug_chat_id = TELEBOT_DEBUG_CHANNEL + await context.bot.send_message( + chat_id=debug_chat_id, text=message, parse_mode=ParseMode.HTML + ) diff --git a/app/services/telegram_bot/message_sender.py b/app/services/telegram_bot/message_sender.py new file mode 100644 index 0000000..8b60f8f --- /dev/null +++ b/app/services/telegram_bot/message_sender.py @@ -0,0 +1,345 @@ +import asyncio +import os +import traceback +from io import BytesIO +from urllib.parse import urlparse +from urllib.request import url2pathname +from typing import Union + +import aiofiles +from telegram import ( + Message, + InputMediaPhoto, + InputMediaVideo, + InputMediaDocument, + InputMediaAnimation, + InputMediaAudio, +) +from telegram.constants import ParseMode + +from app.models.metadata_item import MessageType +from app.models.classes import NamedBytesIO +from app.utils.parse import telegram_message_html_trim +from app.utils.network import download_file_by_metadata_item +from app.utils.image import Image, image_compressing, check_image_type +from app.utils.logger import logger +from app.config import ( + TELEBOT_API_SERVER, + TELEBOT_WRITE_TIMEOUT, + TELEGRAM_IMAGE_DIMENSION_LIMIT, + TELEGRAM_IMAGE_SIZE_LIMIT, + JINJA2_ENV, + TEMPLATE_LANGUAGE, +) +from app.services.telegram_bot.config import ( + TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT, + TELEGRAM_FILE_UPLOAD_LIMIT, + TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API, + TEMPLATE_TRANSLATION, +) + +environment = JINJA2_ENV +template = environment.get_template("social_media_message.jinja2") +template_text = TEMPLATE_TRANSLATION.get( + TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"] +) + + +def _get_application(): + """Lazy import to avoid circular dependency.""" + from app.services.telegram_bot import application + return application + + +async def send_item_message( + data: dict, chat_id: Union[int, str] = None, message: Message = None +) -> None: + """ + :param data: 
(dict) metadata of the item + :param chat_id: (int) any chat id for sending + :param message: (Message) any message to reply + :return: + """ + application = _get_application() + logger.debug(f"send_item_message: {data}, {chat_id}, {message}") + if not chat_id and not message: + raise ValueError("must provide chat_id or message") + if ( + not chat_id + ) and message: # this function supports direct reply to a message even if the chat_id is None + chat_id = message.chat.id + discussion_chat_id = chat_id + the_chat = await application.bot.get_chat(chat_id=chat_id) + logger.debug(f"the chat of sending message: {the_chat}") + if the_chat.type == "channel" and the_chat.linked_chat_id: + discussion_chat_id = the_chat.linked_chat_id + try: + caption_text = message_formatting(data) + if len(data["media_files"]) > 0: + # if the message type is short and there are some media files, send media group + reply_to_message_id = None + media_message_group, file_message_group = await media_files_packaging( + media_files=data["media_files"], data=data + ) + if ( + len(media_message_group) > 0 + ): # if there are some media groups to send, send it + for i, media_group in enumerate(media_message_group): + caption_text = ( + caption_text + if i == 0 + else f"the {i + 1}th part of the media item:" + ) + logger.debug(f"media group: {media_group}") + logger.debug( + f"caption text: {caption_text},length={len(caption_text)}" + ) + sent_media_files_message = await application.bot.send_media_group( + chat_id=chat_id, + media=media_group, + parse_mode=ParseMode.HTML, + caption=caption_text, + write_timeout=TELEBOT_WRITE_TIMEOUT, + reply_to_message_id=message.message_id if message else None, + ) + if sent_media_files_message is tuple: + reply_to_message_id = sent_media_files_message[0].message_id + elif sent_media_files_message is Message: + reply_to_message_id = sent_media_files_message.message_id + logger.debug(f"sent media files message: {sent_media_files_message}") + else: + sent_message = 
await application.bot.send_message( + chat_id=chat_id, + text=caption_text, + parse_mode=ParseMode.HTML, + reply_to_message_id=message.message_id if message else None, + disable_web_page_preview=True + if data["message_type"] == MessageType.SHORT + else False, + disable_notification=True, + ) + if discussion_chat_id != chat_id: + await asyncio.sleep( + 3 + ) # wait for several seconds to avoid missing the target message + # if the chat is a channel, get the latest pinned message from the channel and reply to it + group_chat = await application.bot.get_chat(chat_id=discussion_chat_id) + logger.debug(f"the group chat: {group_chat}") + pinned_message = group_chat.pinned_message + logger.debug(f"the pinned message: {pinned_message}") + if len(media_message_group) > 0: + if ( + pinned_message.forward_origin.message_id + == sent_media_files_message[-1].message_id + ): + reply_to_message_id = ( + group_chat.pinned_message.id + - len(sent_media_files_message) + + 1 + ) + else: + reply_to_message_id = group_chat.pinned_message.id + 1 + elif pinned_message.forward_origin.message_id == sent_message.message_id: + reply_to_message_id = group_chat.pinned_message.id + else: + reply_to_message_id = group_chat.pinned_message.id + 1 + if ( + len(file_message_group) > 0 + ): # to send files, the files messages should be replied to the message sent before + logger.debug(f"reply_to_message_id: {reply_to_message_id}") + for file_group in file_message_group: + logger.debug(f"file group: {file_group}") + await application.bot.send_media_group( + chat_id=discussion_chat_id, + media=file_group, + reply_to_message_id=reply_to_message_id, + parse_mode=ParseMode.HTML, + disable_notification=True, + ) + else: + await application.bot.send_message( + chat_id=chat_id, + text=caption_text, + parse_mode=ParseMode.HTML, + reply_to_message_id=message.message_id if message else None, + disable_web_page_preview=True + if data["message_type"] == "short" + else False, + disable_notification=True, + ) + 
except Exception as e: + logger.error(e) + traceback.print_exc() + await send_debug_channel(traceback.format_exc()) + + +async def send_debug_channel(message: str) -> None: + from app.config import TELEBOT_DEBUG_CHANNEL + application = _get_application() + if TELEBOT_DEBUG_CHANNEL is not None: + await application.bot.send_message( + chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML + ) + + +def message_formatting(data: dict) -> str: + """ + Format the message to be sent to the user. + :param data: + :return: text (str) the formatted text for telegram bot api sending message. + """ + if data["message_type"] == "short": + data["text"] = telegram_message_html_trim(data["text"]) + message_template = template + text = message_template.render(data=data, template_text=template_text) + logger.debug(f"message text: \n{text}") + return text + + +async def media_files_packaging(media_files: list, data: dict) -> tuple: + """ + Download the media files from data["media_files"] and package them into a list of media group or file group for + sending them by send_media_group method or send_document method. + :param data: (dict) metadata of the item + :param media_files: (list) a list of media files, + :return: (tuple) a tuple of media group and file group + media_message_group: (list) a list of media items, the type of each item is InputMediaPhoto or InputMediaVideo + file_group: (list) a list of file items, the type of each item is InputFile + TODO: It's not a good practice for this function. This method will still download all the media files even when + media files are too large and it can be memory consuming even if we use a database to store the media files. + The function should be optimized to resolve the media files one group by one group and send each group + immediately after it is resolved. + This processing method should be optimized in the future. 
+ """ + media_counter, file_counter = 0, 0 + media_message_group, media_group, file_message_group, file_group = [], [], [], [] + for ( + media_item + ) in media_files: # To traverse all media items in the media files list + # check if we need to create a new media group + if media_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: + # the limitation of media item for a single telegram media group message is 10 + media_message_group.append(media_group) + media_group = [] + media_counter = 0 + if file_counter == TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT: + # the limitation of media item for a single telegram media group message is 10 + file_message_group.append(file_group) + file_group = [] + file_counter = 0 + if not ( + media_item["media_type"] in ["image", "gif", "video"] + and data["message_type"] == "long" + ): + # check the url validity + url_parser = urlparse(media_item["url"]) + if url_parser.scheme in [ + "http", + "https", + ]: # if the url is a http url, download the file + file_format = "mp4" if media_item["media_type"] == "video" else None + io_object = await download_file_by_metadata_item( + media_item["url"], data=data, file_format=file_format + ) + filename = io_object.name + file_size = io_object.size + else: # if the url is a local file path, just add it to the media group + try: + file_path = url2pathname(media_item["url"]) + async with aiofiles.open(file_path, mode="rb") as f: + filename = os.path.basename(file_path) + content = await f.read() + io_object = NamedBytesIO(content=content, name=filename) + file_size = io_object.size + except Exception as e: # the url is not a valid file path + logger.error(e) + continue + # check the file size + if ( + not TELEBOT_API_SERVER + ): # the official telegram bot api server only supports 50MB file + if file_size > TELEGRAM_FILE_UPLOAD_LIMIT: + # if the size is over 50MB, skip this file + continue + else: + if file_size > TELEGRAM_FILE_UPLOAD_LIMIT_LOCAL_API: + # for local api sever, if the size is over 2GB, skip 
this file + continue + # check media files' type and process them by their type + if media_item["media_type"] == "image": + image_url = media_item["url"] + ext = await check_image_type(io_object) + # jpg to jpeg, ignore case + if ext.lower() == "jpg": + ext = "JPEG" + io_object.seek(0) + image = Image.open(io_object, formats=[ext]) + img_width, img_height = image.size + ratio = float(max(img_height, img_width)) / float( + min(img_height, img_width) + ) + # don't try to resize image if the ratio is too large + if ( + ratio < 5 + or max(img_height, img_width) < TELEGRAM_IMAGE_DIMENSION_LIMIT + ): + image = image_compressing(image, TELEGRAM_IMAGE_DIMENSION_LIMIT) + with BytesIO() as buffer: + # mime_type file format + image.save(buffer, format=ext) + buffer.seek(0) + resized_ratio = max(image.height, image.width) / min( + image.height, image.width + ) + logger.debug( + f"resized image size: {buffer.getbuffer().nbytes}, ratio: {resized_ratio}, width: {image.width}, height: {image.height}" + ) + media_group.append(InputMediaPhoto(buffer, filename=filename)) + # the image is not able to get json serialized + logger.debug( + f"image size: {file_size}, ratio: {ratio}, width: {img_width}, height: {img_height}" + ) + if ( + file_size > TELEGRAM_IMAGE_SIZE_LIMIT + or img_width > TELEGRAM_IMAGE_DIMENSION_LIMIT + or img_height > TELEGRAM_IMAGE_DIMENSION_LIMIT + ) and data["category"] not in ["xiaohongshu"]: + io_object = await download_file_by_metadata_item( + url=image_url, data=data + ) + if not io_object.name.endswith(".gif"): + if not io_object.name.endswith(ext.lower()): + io_object.name = io_object.name + "." + ext.lower() + # TODO: it is not a good way to judge whether it is a gif... 
+            file_group.append(
+                InputMediaDocument(io_object, parse_mode=ParseMode.HTML)
+            )
+            file_counter += 1
+        elif media_item["media_type"] == "gif":
+            # BUG FIX: the download already names the file "gif_image-N.gif";
+            # the original then appended ".gif" a second time, producing
+            # "gif_image-N.gif.gif". Keep the single extension.
+            io_object = await download_file_by_metadata_item(
+                url=media_item["url"],
+                data=data,
+                file_name="gif_image-" + str(media_counter) + ".gif",
+            )
+            media_group.append(InputMediaAnimation(io_object))
+        elif media_item["media_type"] == "video":
+            media_group.append(InputMediaVideo(io_object, supports_streaming=True))
+        # TODO: not have any services to store audio files for now, just a placeholder
+        elif media_item["media_type"] == "audio":
+            media_group.append(InputMediaAudio(io_object))
+        elif media_item["media_type"] == "document":
+            file_group.append(
+                InputMediaDocument(io_object, parse_mode=ParseMode.HTML)
+            )
+            file_counter += 1
+        media_counter += 1
+        logger.info(
+            f"get the {media_counter}th media item,type: {media_item['media_type']}, url: {media_item['url']}"
+        )
+    # check if the media group is empty, if it is, return None
+    if len(media_group) > 0:  # append the last media group
+        media_message_group.append(media_group)
+    if len(file_group) > 0:
+        file_message_group.append(file_group)
+    return media_message_group, file_message_group
diff --git a/app/utils/config.py b/app/utils/config.py
index 2c9b6a3..ad3d691 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -1,55 +1,7 @@
-"""
-patterns for check url type
-"""
-SOCIAL_MEDIA_WEBSITE_PATTERNS = {
-    "weibo": [
-        r"(m\.)?weibo.cn\/(status\/)?[0-9a-zA-Z]+",
-        r"(www\.)?weibo\.com\/(status\/)?[0-9a-zA-Z]+",
-    ],
-    "twitter": [r"(twitter|x)\.com\/[^\/]+\/status\/[0-9]+"],
-    "instagram": [r"(www\.)?instagram\.com(\/share)?\/(p|reel)\/[A-Za-z0-9_-]+"],
-    "zhihu": [
-        r"(www\.)?zhihu\.com\/question\/[0-9]+\/answer\/[0-9]+",
-        r"(www\.)?zhihu\.com\/answer\/[0-9]+",
-        r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+",
-        r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+",
-        r"(www\.)?zhihu\.com\/pin\/[0-9]+",
r"zhuanlan\.zhihu\.com\/p\/[0-9]+", - ], - "douban": [ - r"(game|music|movie|book)?\.douban\.com\/review\/[0-9]+", - r"((www|m)\.)?douban\.com\/note\/[0-9]+", - r"((www|m)\.)?douban\.com\/people\/[^\/]+\/status\/[0-9]+", - r"((www|m)\.)?douban\.com\/group\/topic\/[0-9]+", - r"((www|m)\.)?douban\.com\/(game|music|movie|book)\/review\/[0-9]+", - ], - "wechat": [r"mp\.weixin\.qq\.com\/s", r"mp\.weixin\.qq\.com\/mp\/appmsg\/show"], - "threads": [r"(www\.)?threads\.net\/@[a-zA-Z0-9]+\/post"], - "xiaohongshu": [ - r"(www\.)?xiaohongshu\.com\/(discovery\/item|explore)\/[0-9a-zA-Z_-]+", - r"(www\.)?xhslink\.com\/[0-9a-zA-Z_-]+", - ], - "reddit": [ - r"(www\.)?reddit\.com\/r\/[a-zA-Z0-9_-]+\/comments\/[a-zA-Z0-9_-]+", - r"(www\.)?reddit\.com\/r\/[a-zA-Z0-9_-]+\/s\/[a-zA-Z0-9_-]+", - ], - "bluesky": [ - r"(www\.)?bsky\.app\/profile/[a-zA-Z0-9\.]+\/post\/[a-zA-Z0-9\-_]+", - ] -} -VIDEO_WEBSITE_PATTERNS = { - "youtube": [ - r"((m|www)\.)youtube\.com\/watch", - r"youtu\.be\/[A-Za-z0-9_-]+", - r"youtube\.com\/shorts\/[A-Za-z0-9_-]+", - ], - "bilibili": [ - r"((www\.)?bilibili\.com\/video\/[A-Za-z0-9]+)", - r"b23\.tv\/[A-Za-z0-9]+", - ], -} -BANNED_PATTERNS = [ - r"chatgpt\.com\/share\/[A-Za-z0-9]+", - r"gemini\/share\/[A-Za-z0-9]+", - r"t\.me\/[A-Za-z0-9]+" -] \ No newline at end of file +# Re-export from shared package +from fastfetchbot_shared.utils.config import * # noqa: F401,F403 +from fastfetchbot_shared.utils.config import ( # noqa: F401 + SOCIAL_MEDIA_WEBSITE_PATTERNS, + VIDEO_WEBSITE_PATTERNS, + BANNED_PATTERNS, +) diff --git a/app/utils/image.py b/app/utils/image.py index 1e0a4af..500afcd 100644 --- a/app/utils/image.py +++ b/app/utils/image.py @@ -1,46 +1,9 @@ -import mimetypes -from io import BytesIO - -import magic -from PIL import Image -import asyncio -from app.config import env - -DEFAULT_IMAGE_LIMITATION = env.get("DEFAULT_IMAGE_LIMITATION", 1600) - - -def get_image_dimension(image_file: str): - image = Image.open(image_file) - return image.size - - -def 
image_compressing(image: Image, limitation: int = DEFAULT_IMAGE_LIMITATION): - new_image = image - if image.size[0] > limitation or image.size[1] > limitation: - if image.size[0] > image.size[1]: - new_image = image.resize( - (limitation, int(image.size[1] * limitation / image.size[0])), - Image.Resampling.LANCZOS, - ) - else: - new_image = image.resize( - (int(image.size[0] * limitation / image.size[1]), limitation), - Image.Resampling.LANCZOS, - ) - return new_image - - -async def check_image_type(io_object: BytesIO): - loop = asyncio.get_running_loop() - mime_type = await loop.run_in_executor( - None, lambda: magic.from_buffer(io_object.read(), mime=True) - ) - if mime_type == "image/webp": - ext = "webp" - else: - ext = mimetypes.guess_extension(mime_type, strict=True) - if ext is None: - ext = "webp" - else: - ext = ext[1:] - return ext +# Re-export from shared package +from fastfetchbot_shared.utils.image import * # noqa: F401,F403 +from fastfetchbot_shared.utils.image import ( # noqa: F401 + Image, + get_image_dimension, + image_compressing, + check_image_type, + DEFAULT_IMAGE_LIMITATION, +) diff --git a/app/utils/logger.py b/app/utils/logger.py index b7e2d46..1d4ac5f 100644 --- a/app/utils/logger.py +++ b/app/utils/logger.py @@ -1,18 +1,2 @@ -import logging -import os - -from loguru import logger - -from app.config import LOG_LEVEL, LOG_FILE_PATH - -log_path = os.path.join(LOG_FILE_PATH, "app.log") - -logger.add( - log_path, - level=LOG_LEVEL, - rotation="1 week", - retention="10 days", - compression="zip", -) -logger.debug(f"Logger initialized with level: {LOG_LEVEL}") -logger.debug(f"Logger initialized with log file path: {log_path}") +# Re-export from shared package +from fastfetchbot_shared.utils.logger import logger # noqa: F401 diff --git a/app/utils/network.py b/app/utils/network.py index ff7ec1f..bb422db 100644 --- a/app/utils/network.py +++ b/app/utils/network.py @@ -1,202 +1,13 @@ -import asyncio -import datetime -import json -import os -import 
uuid -from typing import Optional - -import aiofiles -import httpx -import traceback - -from lxml import etree -from fake_useragent import UserAgent -from playwright.async_api import async_playwright - -from app.models.classes import NamedBytesIO -from app.config import HTTP_REQUEST_TIMEOUT, DOWNLOAD_DIR -from app.utils.image import check_image_type -from app.utils.logger import logger - - -async def get_response( - url: str, headers: dict = None, params: dict = None, client: httpx.AsyncClient = None -) -> httpx.Response: - if headers is None: - headers = HEADERS - if client: - resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT - ) - return resp - else: - async with httpx.AsyncClient() as client: - resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT - ) - return resp - - -async def get_response_json(url: str, headers=None, client: httpx.AsyncClient = None) -> dict: - try: - response = await get_response(url, headers=headers, client=client) - json_result = response.json() - except Exception as e: - print(e, traceback.format_exc()) - json_result = None - return json_result - - - -async def get_selector( - url: str, headers: dict, follow_redirects: bool = True -) -> etree.HTML: - """ - A function to get etree.HTML selector according to url and headers. - We can use this function to do additional parsing works. 
- :param follow_redirects: - :param url: the target webpage url - :param headers: the headers of the request - :return: the selector of the target webpage parsed by etree.HTML - """ - async with httpx.AsyncClient() as client: - resp = await client.get( - url, - headers=headers, - follow_redirects=follow_redirects, - timeout=HTTP_REQUEST_TIMEOUT, - ) - if ( - resp.history - ): # if there is a redirect, the request will have a response chain - print("Request was redirected") - for h in resp.history: - print(h.status_code, h.url) - # if code is 302, do not follow the redirect - if h.status_code == 302: - selector = await get_selector( - h.url, headers=headers, follow_redirects=False - ) - return selector - print("Final destination:", resp.status_code, resp.url) - selector = etree.HTML(resp.text) # the content of the final destination - return selector - - -async def get_redirect_url(url: str, headers: Optional[dict] = None) -> str: - if not headers: - headers = HEADERS - async with httpx.AsyncClient() as client: - resp = await client.get(url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT) - if resp.status_code == 302 or resp.status_code == 301: - return resp.headers["Location"] - else: - return url - - -async def get_content_async(url): - async with async_playwright() as p: - browser = await p.firefox.launch() - context = await browser.new_context(viewport={"width": 1920, "height": 1080}) - page = await context.new_page() - - async def scroll_to_end(page): - # Scrolls to the bottom of the page - await page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) { - document.scrollingElement.scrollTop += 100; // Adjust the scroll amount - await delay(100); // Adjust the delay time - } - } - """) - - async def wait_for_network_idle(): - async with page.expect_response("**/api/content") as response_info: - response = 
await response_info.value - if response.status == 200: - print("Content loaded") - - await page.goto(url) - await wait_for_network_idle() - await scroll_to_end(page) - content = await page.content() - await browser.close() - return content - - -async def download_file_by_metadata_item( - url: str, - data: dict, - file_name: str = None, - file_format: str = None, - headers: dict = None, -) -> NamedBytesIO: - """ - A customized function to download a file from url and return a NamedBytesIO object. - :param file_format: - :param data: - :param url: - :param file_name: - :param headers: - :return: - """ - try: - if headers is None: - headers = HEADERS - headers["User-Agent"] = get_random_user_agent() - headers["referer"] = data["url"] - if data["category"] in ["reddit"]: - headers["Accept"] = "image/avif,image/webp,*/*" - async with httpx.AsyncClient() as client: - response = await client.get( - url=url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT - ) - # if redirect 302, get the final url - if response.status_code == 302 or response.status_code == 301: - url = response.headers["Location"] - file_data = response.content - if file_name is None: - file_format = file_format if file_format else url.split(".")[-1] - file_name = "media-" + str(uuid.uuid1())[:8] + "." + file_format - io_object = NamedBytesIO(file_data, name=file_name) - return io_object - except Exception as e: - await asyncio.sleep(2) - logger.error(f"Failed to download {url}, {e}") - - -async def download_file_to_local( - url: str, - file_path: str = None, - dir_path: str = DOWNLOAD_DIR, - file_name: str = "", - headers: dict = None, - referer: str = None, -) -> str: - io_object = await download_file_by_metadata_item(url=url, data={}, file_name=file_name, headers=headers) - ext = await check_image_type(io_object) - io_object.seek(0) - file_name = file_name + uuid.uuid4().hex + "." 
+ ext - logger.info(f"Downloading {file_name}") - if file_path is None and dir_path is not None: - file_path = os.path.join(dir_path, file_name) - async with aiofiles.open(file_path, "wb") as f: - await f.write(io_object.read()) - return file_path - - -def get_random_user_agent() -> str: - ua = UserAgent() - return ua.random - - -""" -default headers -""" - -HEADERS = { - "User-Agent": get_random_user_agent(), - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", -} +# Re-export from shared package +from fastfetchbot_shared.utils.network import * # noqa: F401,F403 +from fastfetchbot_shared.utils.network import ( # noqa: F401 + get_response, + get_response_json, + get_selector, + get_redirect_url, + get_content_async, + download_file_by_metadata_item, + download_file_to_local, + get_random_user_agent, + HEADERS, +) diff --git a/app/utils/parse.py b/app/utils/parse.py index 53c55e4..8843e7e 100644 --- a/app/utils/parse.py +++ b/app/utils/parse.py @@ -1,224 +1,16 @@ -import datetime -import os -import re -import mimetypes -from typing import Optional -from urllib.parse import urlparse, unquote - -from bs4 import BeautifulSoup - -from app.models.url_metadata import UrlMetadata -from app.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS - -TELEGRAM_TEXT_LIMIT = 900 - -mimetypes.init() - - -def get_html_text_length(html: str) -> int: - if html is None: - return 0 - soup = BeautifulSoup(html, "html.parser") - text = soup.get_text() - return len(text) - - -def format_telegram_short_text(soup: BeautifulSoup) -> BeautifulSoup: - decompose_list = ["br"] - unwrap_list = ["span", "div", "blockquote", "h2", "ol", "ul"] - new_line_list = ["p", "li"] - for decompose in decompose_list: - for item in soup.find_all(decompose): - item.decompose() - for unwrap in unwrap_list: - for item in soup.find_all(unwrap): - item.unwrap() - for ( - new_line - ) in ( - 
new_line_list - ): # add a new line after each

and

  • tag and then remove the tag(unwrapping) - for item in soup.find_all(new_line): - item.append(BeautifulSoup("
    ", "html.parser")) - item.unwrap() - return soup - - -def unix_timestamp_to_utc(timestamp: int) -> str | None: - if not timestamp: - return None - utc_time = datetime.datetime.utcfromtimestamp(timestamp) - beijing_time = utc_time + datetime.timedelta(hours=8) - return beijing_time.strftime("%Y-%m-%d %H:%M") - - -def second_to_time(second: int) -> str: - m, s = divmod(second, 60) - h, m = divmod(m, 60) - return "{:02d}:{:02d}:{:02d}".format(h, m, s) - - -def string_to_list(string: str, divider: str = ",") -> list: - if string is None: - return [] - return string.split(divider) - - -async def get_url_metadata(url: str, ban_list: Optional[list] = None) -> UrlMetadata: - if not ban_list: - ban_list = [] - url_parser = urlparse(url) - url_main = str(url_parser.hostname) + str(url_parser.path) - source, content_type = "unknown", "unknown" - # check if the url is a social media platform website - for website, patterns in SOCIAL_MEDIA_WEBSITE_PATTERNS.items(): - for pattern in patterns: - if re.search(pattern, url_main): - source = website - content_type = "social_media" - # check if the url is a video website - if source == "unknown": - for website, patterns in VIDEO_WEBSITE_PATTERNS.items(): - for pattern in patterns: - if re.search(pattern, url_main): - source = website - content_type = "video" - # clear the url query - if source not in ["youtube", "bilibili", "wechat"]: - url = url_parser.scheme + "://" + url_parser.netloc + url_parser.path - if source in ban_list: - source = "banned" - content_type = "banned" - else: - for item in BANNED_PATTERNS: - if re.search(item, url): - source = "banned" - content_type = "banned" - break - # TODO: check if the url is from Mastodon, according to the request cookie - return UrlMetadata(url=url, source=source, content_type=content_type) - - -def get_ext_from_url(url: str) -> str: - url_object = urlparse(url) - filename = unquote(url_object.path) - ext = os.path.splitext(filename)[1] - # check if ext in mimetypes.types_map - if 
ext in mimetypes.types_map: - return ext - else: - return None - - -def wrap_text_into_html(text: str, is_html: bool = False) -> str: - if is_html: - soup = BeautifulSoup(text, "html.parser") - for item in soup.find_all("br"): - item.replace_with("\n") - text = str(soup) - text_list = text.split("\n") - text_list = [f"

    {item}

    " for item in text_list if item.strip() != ""] - text = "".join(text_list) - return text - - -def telegram_message_html_trim(html_content: str, trim_length: int = TELEGRAM_TEXT_LIMIT) -> str: - from bs4 import Doctype - - soup = BeautifulSoup(html_content, "html.parser") - - # Remove DOCTYPE declarations - for item in soup.contents: - if isinstance(item, Doctype): - item.extract() - - # Decompose tags that should be removed entirely (with their content) - for tag_name in ["img", "script", "style", "head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]: - for tag in soup.find_all(tag_name): - tag.decompose() - - # Unwrap structural/layout tags — keep their text, discard the wrapper - for tag_name in ["div", "span", "section", "article", "nav", "header", "footer", - "main", "aside", "figure", "figcaption", "html", "body"]: - for tag in soup.find_all(tag_name): - tag.unwrap() - - # Convert headings to bold text with line break - for level in range(1, 7): - for tag in soup.find_all(f"h{level}"): - tag.name = "b" - - # Unwrap

    tags (keep text content) - for tag in soup.find_all("p"): - tag.unwrap() - - html_content = str(soup).strip() - - if len(html_content) <= trim_length: - return html_content - - # Initial trimming - trimmed_content = html_content[:trim_length] - - # Find the position of the last complete tag in the trimmed content - last_complete_pos = trimmed_content.rfind('<') - if last_complete_pos != -1: - trimmed_content = trimmed_content[:last_complete_pos] - - # Remove any incomplete tags by ensuring each tag is closed - cleaned_html = '' - open_tags = [] - - tag_pattern = re.compile(r'<(/?)([a-zA-Z0-9]+)([^>]*)>') - pos = 0 - - while pos < len(trimmed_content): - match = tag_pattern.search(trimmed_content, pos) - if not match: - break - - start, end = match.span() - cleaned_html += trimmed_content[pos:start] - - closing, tag_name, attributes = match.groups() - - if closing: - if open_tags and open_tags[-1] == tag_name: - open_tags.pop() - cleaned_html += match.group(0) - else: - if not attributes.endswith('/'): - open_tags.append(tag_name) - cleaned_html += match.group(0) - - pos = end - - cleaned_html += trimmed_content[pos:] - - # Ensure to close all open tags - for tag in reversed(open_tags): - cleaned_html += f'' - - return cleaned_html + ' ...' 
- - -def get_bool(value: Optional[str], default: bool = True) -> bool: - true_values = ("True", "true", "1", "yes", "on") - false_values = ("False", "false", "0", "no", "off") - - if value is None: - return default - value = value.lower() - - if value in true_values: - return True - elif value in false_values: - return False - else: - return default - - -def get_env_bool(env, var_name: Optional[str], default: bool = False): - """Retrieve environment variable as a boolean.""" - value = env.get(var_name, "").lower() - return get_bool(value, default) +# Re-export from shared package +from fastfetchbot_shared.utils.parse import * # noqa: F401,F403 +from fastfetchbot_shared.utils.parse import ( # noqa: F401 + get_html_text_length, + format_telegram_short_text, + unix_timestamp_to_utc, + second_to_time, + string_to_list, + get_url_metadata, + get_ext_from_url, + wrap_text_into_html, + telegram_message_html_trim, + get_bool, + get_env_bool, + TELEGRAM_TEXT_LIMIT, +) diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile new file mode 100644 index 0000000..ef4d33c --- /dev/null +++ b/apps/api/Dockerfile @@ -0,0 +1,88 @@ + +# `python-base` sets up all our shared environment variables +FROM python:3.12-slim AS python-base + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + # uv settings + UV_PROJECT_ENVIRONMENT="/opt/pysetup/.venv" \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + # paths + PYSETUP_PATH="/opt/pysetup" \ + VENV_PATH="/opt/pysetup/.venv" \ + PLAYWRIGHT_BROWSERS_PATH="/opt/playwright-browsers" + +# prepend venv to path +ENV PATH="$VENV_PATH/bin:$PATH" + + +# `builder-base` stage is used to build deps + create our virtual environment +FROM python-base AS builder-base + +# install uv from the official image +COPY --from=ghcr.io/astral-sh/uv:0.10.4 /uv /usr/local/bin/uv + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + curl \ + ffmpeg \ + libmagic1 \ + # deps for weasyprint + libpango-1.0-0 \ + libpangoft2-1.0-0 \ + libjpeg-dev 
\ + libopenjp2-7-dev \ + libffi-dev \ + build-essential \ + fonts-wqy-microhei \ + fonts-wqy-zenhei \ + fonts-noto-cjk \ + fonts-noto-cjk-extra + +# copy workspace files for dependency resolution +WORKDIR $PYSETUP_PATH +COPY pyproject.toml uv.lock ./ +COPY packages/ packages/ +COPY apps/api/ apps/api/ + +# install runtime deps +RUN uv sync --frozen --no-dev --no-install-project --package fastfetchbot-api + +# install the browser dependencies for playwright +RUN uv run playwright install --with-deps + + +# `production` image used for runtime +FROM python-base AS production +ENV FASTAPI_ENV=production +ENV PYTHONPATH=/app/apps/api:$PYTHONPATH +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + curl \ + ffmpeg \ + libmagic1 \ + # deps for weasyprint + libpango-1.0-0 \ + libpangoft2-1.0-0 \ + libjpeg-dev \ + libopenjp2-7-dev \ + libffi-dev \ + fonts-wqy-microhei \ + fonts-wqy-zenhei \ + fonts-noto-cjk \ + fonts-noto-cjk-extra \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libatspi2.0-0 \ + libxcomposite1 \ + libxdamage1 +COPY --from=builder-base $PYSETUP_PATH $PYSETUP_PATH +COPY --from=builder-base $PLAYWRIGHT_BROWSERS_PATH $PLAYWRIGHT_BROWSERS_PATH +COPY packages/ /app/packages/ +COPY apps/api/ /app/apps/api/ +WORKDIR /app/apps/api +CMD ["gunicorn", "-k", "uvicorn.workers.UvicornWorker", "src.main:app", "--preload"] diff --git a/apps/api/pyproject.toml b/apps/api/pyproject.toml new file mode 100644 index 0000000..dc07911 --- /dev/null +++ b/apps/api/pyproject.toml @@ -0,0 +1,41 @@ +[project] +name = "fastfetchbot-api" +version = "0.1.0" +requires-python = ">=3.12,<3.13" +dependencies = [ + "fastfetchbot-shared", + "fastapi>=0.115.12", + "sentry-sdk[fastapi]>=2.27.0", + "gunicorn>=23.0.0", + "uvicorn>=0.34.2", + "jinja2>=3.1.6", + "babel>=2.17.0", + "beanie>=1.29.0", + "jmespath>=1.0.1", + "twitter-api-client-v2>=0.1.1", + "atproto>=0.0.61", + "asyncpraw>=7.8.1", + "pillow>=10.0.0", + "pydub>=0.25.1", + 
"xhtml2pdf>=0.2.17",
+    "aioboto3>=13.4.0",
+    "tenacity>=9.1.2",
+    "markdown>=3.8",
+    "openai>=2.15.0",
+    "html-telegraph-poster-v2>=0.2.5",
+    "firecrawl-py>=4.13.0",
+    "zyte-api>=0.8.1",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src"]
+
+[tool.uv]
+package = false
+
+[tool.uv.sources]
+fastfetchbot-shared = { workspace = true }
diff --git a/apps/api/src/__init__.py b/apps/api/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/api/src/auth.py b/apps/api/src/auth.py
new file mode 100644
index 0000000..c9600d8
--- /dev/null
+++ b/apps/api/src/auth.py
@@ -0,0 +1,19 @@
+import secrets
+
+from fastapi import HTTPException, Security, status
+from fastapi.security.api_key import APIKeyQuery
+
+from src.config import API_KEY_NAME, API_KEY
+
+api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False)
+
+
+def verify_key(input_key: str, true_key: str):
+    # BUG FIX: the original checked `api_key_query is None` — that is the
+    # module-level APIKeyQuery Security object, which is never None. With
+    # auto_error=False a request without a key passes input_key=None straight
+    # into secrets.compare_digest, raising TypeError (HTTP 500) instead of the
+    # intended 401. Check the supplied key itself.
+    if input_key is None or not secrets.compare_digest(input_key, true_key):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED, detail="API Key Invalid"
+        )
+
+
+def verify_api_key(api_key_query: str = Security(api_key_query)):
+    verify_key(api_key_query, API_KEY)
diff --git a/apps/api/src/config.py b/apps/api/src/config.py
new file mode 100644
index 0000000..aa1e7de
--- /dev/null
+++ b/apps/api/src/config.py
@@ -0,0 +1,154 @@
+import json
+import os
+import tempfile
+
+from jinja2 import Environment, FileSystemLoader
+import gettext
+import secrets
+
+from fastfetchbot_shared.utils.parse import get_env_bool
+
+env = os.environ
+current_directory = os.path.dirname(os.path.abspath(__file__))
+conf_dir = os.path.join(current_directory, "..", "conf")
+
+# FastAPI environment variables
+BASE_URL = env.get("BASE_URL", "localhost")
+API_KEY_NAME = env.get("API_KEY_NAME", "pwd")
+API_KEY = env.get("API_KEY", secrets.token_urlsafe(32))
+
+# Filesystem environment variables
+TEMP_DIR = 
env.get("TEMP_DIR", tempfile.gettempdir()) +WORK_DIR = env.get("WORK_DIR", os.getcwd()) +DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", os.path.join(WORK_DIR, "download")) +DEBUG_MODE = get_env_bool(env, "DEBUG_MODE", False) + +# Logging environment variables +LOG_FILE_PATH = env.get("LOG_FILE_PATH", TEMP_DIR) +LOG_LEVEL = env.get("LOG_LEVEL", "DEBUG") + +# MongoDB environment variables +DATABASE_ON = get_env_bool(env, "DATABASE_ON", False) +MONGODB_PORT = int(env.get("MONGODB_PORT", 27017)) or 27017 +MONGODB_HOST = env.get("MONGODB_HOST", "localhost") +MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}") + +# Telegraph +telegraph_token_list = env.get("TELEGRAPH_TOKEN_LIST", "") +TELEGRAPH_TOKEN_LIST = telegraph_token_list.split(",") if telegraph_token_list else None + +# Youtube-dl environment variables +FILE_EXPORTER_ON = get_env_bool(env, "FILE_EXPORTER_ON", True) +FILE_EXPORTER_HOST = env.get("FILE_EXPORTER_HOST", "fast-yt-downloader") +FILE_EXPORTER_PORT = env.get("FILE_EXPORTER_PORT", "4000") +FILE_EXPORTER_URL = f"http://{FILE_EXPORTER_HOST}:{FILE_EXPORTER_PORT}" +DOWNLOAD_VIDEO_TIMEOUT = env.get("DOWNLOAD_VIDEO_TIMEOUT", 600) + +# Services environment variables +templates_directory = os.path.join(current_directory, "templates") +JINJA2_ENV = Environment( + loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True +) +TEMPLATE_LANGUAGE = env.get( + "TEMPLATE_LANGUAGE", "zh_CN" +) # It is a workaround for translation system + +# X-RapidAPI (for instagram) +X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None) + +# Twitter +TWITTER_EMAIL = env.get("TWITTER_EMAIL", None) +TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None) +TWITTER_USERNAME = env.get("TWITTER_USERNAME", None) +TWITTER_CT0 = env.get("TWITTER_CT0", None) +TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None) +TWITTER_COOKIES = { + "ct0": TWITTER_CT0, + "auth_token": TWITTER_AUTH_TOKEN, +} + +# Bluesky +BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", 
None) +BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None) + +# Weibo +WEIBO_COOKIES = env.get("WEIBO_COOKIES", None) + +# Xiaohongshu +XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None) +XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None) +XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None) +XIAOHONGSHU_COOKIES = { + "a1": XIAOHONGSHU_A1, + "web_id": XIAOHONGSHU_WEBID, + "web_session": XIAOHONGSHU_WEBSESSION, +} +XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",") +XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",") +XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False) +XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True) + +# Zhihu +FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") + +zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") +if os.path.exists(zhihu_cookie_path): + try: + with open(zhihu_cookie_path, "r") as f: + ZHIHU_COOKIES_JSON = json.load(f) + except json.JSONDecodeError: + print("Error: The file is not in a valid JSON format.") + ZHIHU_COOKIES_JSON = None + except FileNotFoundError: + print("Error: The file does not exist.") + ZHIHU_COOKIES_JSON = None +else: + print("Error: We cannot find it.") + ZHIHU_COOKIES_JSON = None + +# Reddit +REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None) +REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None) +REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None) +REDDIT_USERNAME = env.get("REDDIT_USERNAME", None) + +# AWS storage +AWS_STORAGE_ON = get_env_bool(env, "AWS_STORAGE_ON", False) +AWS_ACCESS_KEY_ID = env.get("AWS_ACCESS_KEY_ID", None) +AWS_SECRET_ACCESS_KEY = env.get("AWS_SECRET_ACCESS_KEY", None) +AWS_S3_BUCKET_NAME = env.get("AWS_S3_BUCKET_NAME", "") +AWS_REGION_NAME = env.get("AWS_REGION_NAME", "") +AWS_DOMAIN_HOST = env.get("AWS_DOMAIN_HOST", None) +if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_S3_BUCKET_NAME): + AWS_STORAGE_ON = False +INOREADER_APP_ID = env.get("INOREADER_APP_ID", None) 
+INOREADER_APP_KEY = env.get("INOREADER_APP_KEY", None) +INOREADER_EMAIL = env.get("INOREADER_EMAIL", None) +INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None) + +# Open AI API +OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) + +# General webpage scraping +GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) +GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") + +# Firecrawl API +FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "") +FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "") +FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR", 3000)) # milliseconds to wait for JS rendering + + +# Zyte API +ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) + +# Locale directories environment variables +localedir = os.path.join(os.path.dirname(__file__), "locale") +translation = gettext.translation("messages", localedir=localedir, fallback=True) +_ = translation.gettext + +# Utils environment variables +HTTP_REQUEST_TIMEOUT = env.get("HTTP_REQUEST_TIMEOUT", 30) + +# Telegram Bot callback URL (for inter-service communication) +TELEGRAM_BOT_CALLBACK_URL = env.get("TELEGRAM_BOT_CALLBACK_URL", "http://telegram-bot:10451") diff --git a/apps/api/src/database.py b/apps/api/src/database.py new file mode 100644 index 0000000..5a4387e --- /dev/null +++ b/apps/api/src/database.py @@ -0,0 +1,37 @@ +from typing import Optional, Union, List + +from motor.motor_asyncio import AsyncIOMotorClient +from beanie import init_beanie, Document, Indexed + +from src.config import MONGODB_URL +from src.models.database_model import document_list +from fastfetchbot_shared.utils.logger import logger + + +async def startup() -> None: + client = AsyncIOMotorClient(MONGODB_URL) + await init_beanie(database=client["telegram_bot"], document_models=document_list) + + +async def shutdown() -> None: + pass + + +async def save_instances(instances: Union[Document, List[Document]], *args) -> None: + if instances is None: + raise TypeError("instances must be a Model or a 
list of Model") + + if isinstance(instances, Document): + instance_type = type(instances) + await instance_type.insert(instances) + elif isinstance(instances, list): + instance_type = type(instances[0]) + await instance_type.insert_many(instances) + else: + raise TypeError("instances must be a Model or a list of Model") + + for arg in args: + if not isinstance(arg, Document): + raise TypeError("args must be a Model") + instance_type = type(arg) + await instance_type.insert_one(arg) diff --git a/apps/api/src/main.py b/apps/api/src/main.py new file mode 100644 index 0000000..2a712be --- /dev/null +++ b/apps/api/src/main.py @@ -0,0 +1,55 @@ +import sentry_sdk + +from fastapi import FastAPI, Request +from contextlib import asynccontextmanager +from starlette.middleware.base import BaseHTTPMiddleware + +from src import database +from src.routers import inoreader, scraper_routers, scraper +from src.config import DATABASE_ON +from fastfetchbot_shared.utils.logger import logger + +SENTRY_DSN = "" + +# https://docs.sentry.io/platforms/python/guides/fastapi/ +sentry_sdk.init( + dsn=SENTRY_DSN, + # Set traces_sample_rate to 1.0 to capture 100% + # of transactions for performance monitoring. 
+ # We recommend adjusting this value in production, + traces_sample_rate=1.0, +) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + if DATABASE_ON: + await database.startup() + try: + yield + finally: + if DATABASE_ON: + await database.shutdown() + + +class LogMiddleware(BaseHTTPMiddleware): + def __init__(self, app): + super().__init__(app) + + async def dispatch(self, request: Request, call_next): + logger.info(f"{request.method} {request.url}") + response = await call_next(request) + return response + + +def create_app(): + fastapi_app = FastAPI(lifespan=lifespan) + fastapi_app.add_middleware(LogMiddleware) + fastapi_app.include_router(inoreader.router) + fastapi_app.include_router(scraper.router) + for router in scraper_routers.scraper_routers: + fastapi_app.include_router(router) + return fastapi_app + + +app = create_app() diff --git a/apps/api/src/models/__init__.py b/apps/api/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/models/database_model.py b/apps/api/src/models/database_model.py new file mode 100644 index 0000000..049756f --- /dev/null +++ b/apps/api/src/models/database_model.py @@ -0,0 +1,41 @@ +from typing import Optional, Any +from datetime import datetime + +from pydantic import BaseModel, Field +from beanie import Document, Indexed, Insert, after_event, before_event + +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_html_text_length + + +class Metadata(Document): + title: str = Field(default="untitled") + message_type: MessageType = MessageType.SHORT + url: str + author: Optional[str] = None + author_url: Optional[str] = None + text: Optional[str] = None + text_length: Optional[int] = Field(ge=0) + content: Optional[str] = None + content_length: Optional[int] = Field(ge=0) + category: Optional[str] = None + source: Optional[str] = None + media_files: 
Optional[list[MediaFile]] = None + telegraph_url: Optional[str] = None + timestamp: datetime = Field(default_factory=datetime.utcnow) + scrape_status: bool = False + + @before_event(Insert) + def get_text_length(self): + self.text_length = get_html_text_length(self.text) + self.content_length = get_html_text_length(self.content) + + # + @staticmethod + def from_dict(obj: Any) -> "Metadata": + assert isinstance(obj, dict) + return Metadata(**obj) + + +document_list = [Metadata] diff --git a/apps/api/src/routers/__init__.py b/apps/api/src/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/routers/inoreader.py b/apps/api/src/routers/inoreader.py new file mode 100644 index 0000000..521adaa --- /dev/null +++ b/apps/api/src/routers/inoreader.py @@ -0,0 +1,38 @@ +from fastapi import APIRouter +from fastapi.requests import Request + +from src.config import INOREADER_APP_ID, INOREADER_APP_KEY +from src.services.inoreader import Inoreader +from src.services.inoreader.process import ( + get_inoreader_item_async, + process_inoreader_data, + default_telegram_channel_id +) +from fastapi import Security +from src.auth import verify_api_key + +router = APIRouter(prefix="/inoreader") + + +async def get_inoreader_webhook_data(data: dict): + result = data["items"] + return result + + +@router.post("/triggerAsync", dependencies=[Security(verify_api_key)]) +async def inoreader_trigger_webhook(request: Request): + if not INOREADER_APP_ID or not INOREADER_APP_KEY: + return "inoreader app id or key not set" + params = request.query_params + await get_inoreader_item_async(trigger=True, params=params) + return "ok" + + +@router.post("/webhook", dependencies=[Security(verify_api_key)]) +async def inoreader_tag_webhook(request: Request): + data = await request.json() + data = await Inoreader.process_items_data(data) + params = request.query_params + telegram_channel_id = params.get("channel_id", default_telegram_channel_id) + await 
process_inoreader_data(data=data, use_inoreader_content=True, telegram_channel_id=telegram_channel_id) + return "ok" diff --git a/apps/api/src/routers/scraper.py b/apps/api/src/routers/scraper.py new file mode 100644 index 0000000..b02be9c --- /dev/null +++ b/apps/api/src/routers/scraper.py @@ -0,0 +1,37 @@ +import asyncio + +from fastapi import APIRouter +from fastapi.requests import Request + +from src.config import API_KEY_NAME +from src.services.scrapers.common import InfoExtractService +from fastapi import Security +from src.auth import verify_api_key +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_url_metadata + +router = APIRouter(prefix="/scraper") + + +@router.post("/getItem", dependencies=[Security(verify_api_key)]) +async def get_item_route(request: Request): + logger.debug("A scraper getItem request received") + query_params = dict(request.query_params) + url = query_params.pop("url") + ban_list = query_params.pop("ban_list", None) + logger.debug(f"get_item_route: url: {url}, query_params: {query_params}") + if API_KEY_NAME in query_params: + query_params.pop(API_KEY_NAME) + url_metadata = await get_url_metadata(url, ban_list) + item = InfoExtractService(url_metadata, **query_params) + result = await item.get_item() + logger.debug(f"getItem result: {result}") + return result + + +@router.post("/getUrlMetadata", dependencies=[Security(verify_api_key)]) +async def get_url_metadata_route(request: Request): + url = request.query_params.get("url") + ban_list = request.query_params.get("ban_list") + url_metadata = await get_url_metadata(url, ban_list) + return url_metadata.to_dict() diff --git a/apps/api/src/routers/scraper_routers.py b/apps/api/src/routers/scraper_routers.py new file mode 100644 index 0000000..66316c7 --- /dev/null +++ b/apps/api/src/routers/scraper_routers.py @@ -0,0 +1,6 @@ +from .wechat import router as wechat_router + + +scraper_routers = [ + wechat_router, +] diff --git 
a/apps/api/src/routers/wechat.py b/apps/api/src/routers/wechat.py new file mode 100644 index 0000000..3f66b55 --- /dev/null +++ b/apps/api/src/routers/wechat.py @@ -0,0 +1,29 @@ +from fastapi import APIRouter +from fastapi.requests import Request + +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from src.services.scrapers.common import InfoExtractService +from fastapi import Security +from src.auth import verify_api_key + +router = APIRouter(prefix="/wechat") + + +@router.post("/gzh", dependencies=[Security(verify_api_key)]) +async def wechat_gzh_scrape(request: Request): + url = request.query_params.get("url") + if url: + url_metadata = UrlMetadata.from_dict({ + "url": url, + "type": "social_media", + "source": "wechat", + }) + else: + customized_url_metadata = request.json() + if customized_url_metadata: + url_metadata = UrlMetadata.from_dict(customized_url_metadata) + else: + return "url or url metadata not found" + item = InfoExtractService(url_metadata) + result = await item.get_item() + return result diff --git a/apps/api/src/services/__init__.py b/apps/api/src/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/services/amazon/__init__.py b/apps/api/src/services/amazon/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/services/amazon/s3.py b/apps/api/src/services/amazon/s3.py new file mode 100644 index 0000000..e0e13aa --- /dev/null +++ b/apps/api/src/services/amazon/s3.py @@ -0,0 +1,67 @@ +import asyncio +import uuid +from datetime import datetime +from urllib.parse import urlparse, quote + +import aiofiles.os +from pathlib import Path + +import aioboto3 +from botocore.exceptions import ClientError + +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.network import download_file_to_local +from src.config import AWS_S3_BUCKET_NAME, AWS_REGION_NAME, AWS_DOMAIN_HOST + +session = aioboto3.Session() +image_url_host = ( + AWS_DOMAIN_HOST + if 
AWS_DOMAIN_HOST + else f"{AWS_S3_BUCKET_NAME}.s3.{AWS_REGION_NAME}.amazonaws.com" +) + + +async def download_and_upload(url: str, referer: str = None, suite: str = "test") -> str: + urlparser = urlparse(url) + file_name = (urlparser.netloc + urlparser.path).replace("/", "-") + local_path = await download_file_to_local(url=url, referer=referer, file_name=file_name) + local_path = Path(local_path) + file_name = local_path.name + if not local_path: + return "" + s3_path = await upload( + suite=suite, + staging_path=local_path, + file_name=file_name, + ) + await aiofiles.os.remove(local_path) + return s3_path + + +async def upload( + staging_path: Path, + bucket: str = AWS_S3_BUCKET_NAME, + suite: str = "test", + release: str = datetime.now().strftime("%Y-%m-%d"), + file_name: str = None, +) -> str: + if not file_name: + file_name = uuid.uuid4().hex + blob_s3_key = f"{suite}/{release}/{file_name}" + async with session.client("s3") as s3: + try: + with staging_path.open("rb") as spfp: + logger.info(f"Uploading {blob_s3_key}") + await s3.upload_fileobj( + spfp, + bucket, + blob_s3_key, + ) + logger.info(f"Uploaded {file_name} to {suite}/{release}") + except Exception as e: + logger.error(f"Failed to upload {file_name} to {suite}/{release}, {e}") + return "" + image_url = f"https://{image_url_host}/{blob_s3_key}" + urlparser = urlparse(image_url) + quoted_url = urlparser.scheme + "://" + urlparser.netloc + quote(urlparser.path) + return quoted_url diff --git a/apps/api/src/services/file_export/__init__.py b/apps/api/src/services/file_export/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/services/file_export/audio_transcribe/__init__.py b/apps/api/src/services/file_export/audio_transcribe/__init__.py new file mode 100644 index 0000000..5088ff7 --- /dev/null +++ b/apps/api/src/services/file_export/audio_transcribe/__init__.py @@ -0,0 +1,30 @@ +import httpx + +from src.config import OPENAI_API_KEY, FILE_EXPORTER_URL, DOWNLOAD_VIDEO_TIMEOUT 
+from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import wrap_text_into_html + +TRANSCRIBE_MODEL = "whisper-1" +SEGMENT_LENGTH = 5 * 60 + + +class AudioTranscribe: + def __init__(self, audio_file: str): + self.audio_file = audio_file + + async def transcribe(self): + return await self._get_audio_text(self.audio_file) + + @staticmethod + async def _get_audio_text(audio_file: str): + async with httpx.AsyncClient() as client: + body = { + "audio_file": audio_file, + "openai_api_key": OPENAI_API_KEY, + } + request_url = FILE_EXPORTER_URL + "/transcribe" + response = await client.post( + url=request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT + ) + transcript = response.json().get("transcript") + return transcript diff --git a/apps/api/src/services/file_export/document_export/__init__.py b/apps/api/src/services/file_export/document_export/__init__.py new file mode 100644 index 0000000..282167d --- /dev/null +++ b/apps/api/src/services/file_export/document_export/__init__.py @@ -0,0 +1,10 @@ +from . 
import pdf_export + + +class DocumentExport(object): + def __init__(self, document): + self.document = document + + def export(self): + if self.document["type"] == "pdf": + return pdf_export.PdfExport(self.document["content"]).export() diff --git a/apps/api/src/services/file_export/document_export/pdf_export.py b/apps/api/src/services/file_export/document_export/pdf_export.py new file mode 100644 index 0000000..88fd0b5 --- /dev/null +++ b/apps/api/src/services/file_export/document_export/pdf_export.py @@ -0,0 +1,89 @@ +import asyncio +import functools + +# import gc +import os +import uuid +from pathlib import Path + +import aiofiles +import aiofiles.os +import httpx +from bs4 import BeautifulSoup + +from src.config import DOWNLOAD_DIR, FILE_EXPORTER_URL, DOWNLOAD_VIDEO_TIMEOUT, TEMP_DIR, AWS_STORAGE_ON +from src.services.amazon.s3 import upload as upload_to_s3 +from fastfetchbot_shared.utils.logger import logger + +current_directory = os.path.dirname(os.path.abspath(__file__)) + +PDF_STYLESHEET = os.path.join(current_directory, "pdf_export.css") + + +async def upload_file_to_s3(output_filename): + return await upload_to_s3( + staging_path=output_filename, + suite="documents", + file_name=output_filename.name, + ) + + +class PdfExport: + def __init__(self, title: str, html_string: str = None): + self.title = title + self.html_string = html_string + + async def export(self, method: str = "file") -> str: + body = { + "method": method + } + html_string = self.wrap_html_string(self.html_string) + if method == "string": + body["html_string"] = html_string, + logger.debug( + f""" + html_string: {html_string} + """ + ) + elif method == "file": + filename = f"{self.title}-{uuid.uuid4()}.html" + filename = os.path.join(TEMP_DIR, filename) + async with aiofiles.open( + filename, "w", encoding="utf-8" + ) as f: + await f.write(html_string) + html_file = filename + logger.debug(html_file) + body["html_file"] = html_file + output_filename = f"{self.title}-{uuid.uuid4()}.pdf" + 
body["output_filename"] = output_filename + + async with httpx.AsyncClient() as client: + request_url = FILE_EXPORTER_URL + "/pdfExport" + logger.info(f"requesting pdf export from pdf server: {body}") + resp = await client.post( + request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT + ) + output_filename = resp.json().get("output_filename") + logger.info(f"pdf export success: {output_filename}") + await aiofiles.os.remove(html_file) + if AWS_STORAGE_ON: + local_filename = output_filename + output_filename = await upload_file_to_s3(Path(output_filename)) + await aiofiles.os.remove(local_filename) + return output_filename + + @staticmethod + def wrap_html_string(html_string: str) -> str: + soup = BeautifulSoup( + '' + '', + "html.parser", + ) + soup.body.append(BeautifulSoup(html_string, "html.parser")) + for tag in soup.find_all(True): + if "style" in tag.attrs: + del tag["style"] + for style_tag in soup.find_all("style"): + style_tag.decompose() + return soup.prettify() diff --git a/apps/api/src/services/file_export/video_download/__init__.py b/apps/api/src/services/file_export/video_download/__init__.py new file mode 100644 index 0000000..01f95b6 --- /dev/null +++ b/apps/api/src/services/file_export/video_download/__init__.py @@ -0,0 +1,232 @@ +from typing import Any, Optional + +import httpx +from urllib.parse import urlparse, parse_qs + +from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile +from src.services.file_export.audio_transcribe import AudioTranscribe +from src.config import FILE_EXPORTER_URL, DOWNLOAD_VIDEO_TIMEOUT +from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html +from fastfetchbot_shared.utils.logger import logger +from src.config import JINJA2_ENV + +video_info_template = JINJA2_ENV.get_template("video_info.jinja2") + + +class VideoDownloader(MetadataItem): + def __init__( + self, + url: str, + category: str, + data: Optional[Any] = None, + download: bool = 
True, + audio_only: bool = False, + hd: bool = False, + transcribe: bool = False, + **kwargs, + ): + self.extractor = category + self.url = url + self.author_url = "" + self.download = download + self.audio_only = audio_only + self.transcribe = transcribe + self.hd = hd + self.message_type = MessageType.SHORT + self.file_path = None + # metadata variables + self.category = category + self.media_files = [] + # auxiliary variables + self.created = None + self.duration = None + + @classmethod + async def create(cls, *args, **kwargs): + instance = cls(*args, **kwargs) + instance.url = await instance._parse_url(instance.url) + return instance + + async def get_item(self) -> dict: + self.url = await self._parse_url(self.url) + await self.get_video() + return self.to_dict() + + async def get_video(self) -> None: + content_info = await self.get_video_info() + self.file_path = content_info["file_path"] + video_info_funcs = { + "youtube": self._youtube_info_parse, + "bilibili": self._bilibili_info_parse, + } + meta_info = video_info_funcs[self.extractor](content_info) + self._video_info_formatting(meta_info) + # AI transcribe + if self.transcribe: + audio_content_info = await self.get_video_info(audio_only=True) + audio_file_path = audio_content_info["file_path"] + audio_transcribe = AudioTranscribe(audio_file_path) + transcribe_text = await audio_transcribe.transcribe() + if self.download is False: + self.message_type = MessageType.LONG + self.text += "\nAI全文摘录:" + transcribe_text + self.content += "


    " + wrap_text_into_html(transcribe_text) + + async def _parse_url(self, url: str) -> str: + async def _get_redirected_url(original_url: str) -> str: + async with httpx.AsyncClient(follow_redirects=False) as client: + resp = await client.get(original_url) + if resp.status_code == 200: + original_url = resp.url + elif resp.status_code == 302: + original_url = resp.headers["Location"] + return original_url + + def _remove_youtube_link_tracing(original_url: str) -> str: + original_url_parser = urlparse(original_url) + original_url_hostname = str(original_url_parser.hostname) + + if "youtu.be" in original_url_hostname: + # remove all queries + original_url = original_url.split("?")[0] + if "youtube.com" in original_url_hostname: + # remove all queries except "?v=" part + original_url = original_url_parser.scheme + "://" + original_url_parser.netloc + original_url_parser.path + if original_url_parser.query: + v_part_query = [item for item in original_url_parser.query.split("&") if "v=" in item] + if v_part_query: + original_url += "?" 
+ v_part_query[0] + return original_url + + def _remove_bilibili_link_tracing(original_url: str) -> str: + original_url_parser = urlparse(original_url) + original_url_hostname = str(original_url_parser.hostname) + query_dict = parse_qs(original_url_parser.query) + bilibili_p_query_string = "?p=" + query_dict["p"][0] if 'p' in query_dict else "" + + if "bilibili.com" in original_url_hostname: + original_url = original_url_parser.scheme + "://" + original_url_parser.netloc + original_url_parser.path + return original_url + bilibili_p_query_string + + logger.info(f"parsing original video url: {url} for {self.extractor}") + + url_parser = urlparse(url) + url_hostname = str(url_parser.hostname) + + if self.extractor == "bilibili": + if "b23.tv" in url_hostname: + url = await _get_redirected_url(url) + if "m.bilibili.com" in url_hostname: + url = url.replace("m.bilibili.com", "www.bilibili.com") + url = _remove_bilibili_link_tracing(url) + elif self.extractor == "youtube": + if "youtu.be" in url_hostname: + url = await _get_redirected_url(url) + url = _remove_youtube_link_tracing(url) + + logger.info(f"parsed video url: {url} for {self.extractor}") + return url + + async def get_video_info( + self, + url: str = None, + download: bool = None, + extractor: str = None, + audio_only: bool = None, + hd: bool = None, + ) -> dict: + """ + make a request to youtube-dl server to get video info + :return: video info dict + """ + if url is None: + url = self.url + if download is None: + download = self.download + if extractor is None: + extractor = self.extractor + if audio_only is None: + audio_only = self.audio_only + if hd is None: + hd = self.hd + async with httpx.AsyncClient() as client: + body = { + "url": url, + "download": download, + "extractor": extractor, + "audio_only": audio_only, + "hd": hd, + } + request_url = FILE_EXPORTER_URL + "/videoDownload" + logger.info(f"requesting video info from youtube-dl server: {body}") + if download is True: + logger.info(f"video 
downloading... it may take a while") + if hd is True: + logger.info(f"downloading HD video, it may take longer") + elif audio_only is True: + logger.info(f"downloading audio only") + logger.debug(f"downloading video timeout: {DOWNLOAD_VIDEO_TIMEOUT}") + resp = await client.post( + request_url, json=body, timeout=DOWNLOAD_VIDEO_TIMEOUT + ) + content_info = resp.json().get("content_info") + file_path = resp.json().get("file_path") + content_info["file_path"] = file_path + return content_info + + def _video_info_formatting(self, meta_info: dict): + self.title = meta_info["title"] + self.author = meta_info["author"] + self.author_url = meta_info["author_url"] + if len(meta_info["description"]) > 800: + meta_info["description"] = meta_info["description"][:800] + "..." + self.created = meta_info["upload_date"] + self.duration = meta_info["duration"] + self.text = video_info_template.render( + data={ + "url": self.url, + "title": self.title, + "author": self.author, + "author_url": self.author_url, + "duration": self.duration, + "created": self.created, + "playback_data": meta_info["playback_data"], + "description": meta_info["description"], + } + ) + self.content = self.text.replace("\n", "
    ") + if self.download: + media_type = "video" + if self.audio_only: + media_type = "audio" + self.media_files = [MediaFile(media_type, self.file_path, "")] + + @staticmethod + def _youtube_info_parse(video_info: dict) -> dict: + return { + "id": video_info["id"], + "title": video_info["title"], + "author": video_info["uploader"], + "author_url": video_info["uploader_url"] or video_info["channel_url"], + "description": video_info["description"], + "playback_data": f"视频播放量:{video_info['view_count']} 评论数:{video_info['comment_count']}", + "author_avatar": video_info["thumbnail"], + "upload_date": str(video_info["upload_date"]), + "duration": second_to_time(round(video_info["duration"])), + } + + @staticmethod + def _bilibili_info_parse(video_info: dict) -> dict: + return { + "id": video_info["id"], + "title": video_info["title"], + "author": video_info["uploader"], + "author_url": "https://space.bilibili.com/" + + str(video_info["uploader_id"]), + "author_avatar": video_info["thumbnail"], + "ext": video_info["ext"], + "description": video_info["description"], + "playback_data": f"视频播放量:{video_info['view_count']} 弹幕数:{video_info['comment_count']} 点赞数:{video_info['like_count']}", + "upload_date": unix_timestamp_to_utc(video_info["timestamp"]), + "duration": second_to_time(round(video_info["duration"])), + } diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py new file mode 100644 index 0000000..1343079 --- /dev/null +++ b/apps/api/src/services/inoreader/__init__.py @@ -0,0 +1,168 @@ +from typing import Optional +from urllib.parse import quote + +import httpx +from bs4 import BeautifulSoup +import jmespath +from httpx import Response + +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.utils.network import HEADERS +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_html_text_length +from src.config import 
( + INOREADER_APP_ID, + INOREADER_APP_KEY, + INOREADER_EMAIL, + INOREADER_PASSWORD, +) + +INOREADER_CONTENT_URL = "https://www.inoreader.com/reader/api/0/stream/contents/" +TAG_PATH = "user/-/label/" +OTHER_PATH = "user/-/state/com.google/" +INOREADER_LOGIN_URL = "https://www.inoreader.com/accounts/ClientLogin" + + +class Inoreader(MetadataItem): + def __init__(self, url: str = None, data: dict = None, **kwargs): + if url: + self.url = url + if data: + self.title = data.get("title", "") + self.message = data.get("message", "") + self.author = data.get("author", "") + self.author_url = data.get("author_url", "") + self.category = data.get("category", "") + self.raw_content = data.get("content", "") + self.content = self.raw_content + if kwargs.get("category"): + self.category = kwargs["category"] + self.media_files = [] + self.message_type = MessageType.LONG + + def _from_data(self, data: dict): + self.title = data.get("title", "") + self.message = data.get("message", "") + self.author = data.get("author", "") + self.author_url = data.get("author_url", "") + self.category = data.get("category", "") + self.raw_content = data.get("content", "") + self.content = self.raw_content + + async def get_item(self, api: bool = False) -> dict: + if api: + data = await self.get_api_item_data() + self._resolve_media_files() + if get_html_text_length(self.content) < 400: + self.message_type = MessageType.SHORT + metadata_dict = self.to_dict() + metadata_dict["message"] = self.message + return metadata_dict + + def _resolve_media_files(self): + soup = BeautifulSoup(self.raw_content, "html.parser") + for img in soup.find_all("img"): + self.media_files.append(MediaFile(url=img["src"], media_type="image")) + img.extract() + for video in soup.find_all("video"): + self.media_files.append(MediaFile(url=video["src"], media_type="video")) + video.extract() + for tags in soup.find_all(["p", "span"]): + tags.unwrap() + self.text = str(soup) + self.text = '' + self.author + ": " + self.text + 
+ @staticmethod + def get_stream_id( + stream_type: str = "broadcast", tag: str = None, feed: str = None + ) -> str: + if stream_type == "feed": + stream_id = feed + elif stream_type == "tag": + stream_id = TAG_PATH + tag + else: + stream_id = OTHER_PATH + stream_type + stream_id = quote(stream_id) + return stream_id + + @staticmethod + async def mark_all_as_read(stream_id: str, timestamp: int = 0) -> None: + request_url = "https://www.inoreader.com/reader/api/0/mark-all-as-read" + params = {"s": stream_id, "ts": timestamp} + resp = await Inoreader.get_api_info(url=request_url, params=params) + logger.debug(resp.text) + + @staticmethod + async def get_api_item_data( + stream_type: str = "broadcast", + tag: str = None, + feed: str = None, + params: dict = None, + ) -> Optional[dict | list]: + stream_id = Inoreader.get_stream_id(stream_type=stream_type, tag=tag, feed=feed) + request_url = INOREADER_CONTENT_URL + stream_id + default_params = { + "comments": 1, + "n": 10, + "r": "o", + "xt": "user/-/state/com.google/read", + } + if params: + default_params.update(params) + params = default_params + resp = await Inoreader.get_api_info(url=request_url, params=params) + logger.debug(resp.text) + data = resp.json() + data = await Inoreader.process_items_data(data) + return data + + @staticmethod + async def process_items_data(data: dict) -> Optional[dict | list]: + expression = """ + items[].{ + "aurl": canonical[0].href, + "title": title, + "author": origin.title, + "author_url": origin.htmlUrl, + "content": summary.content, + "category": categories[-1], + "message": comments[0].commentBody, + "timestamp": updated + } + """ + data = jmespath.search(expression, data) + for item in data: + item["category"] = item["category"].split("/")[-1] + return data + + @staticmethod + async def get_api_info( + url: str, + params=None, + ) -> Response: + async with httpx.AsyncClient() as client: + resp = await client.post( + INOREADER_LOGIN_URL, + params={ + "Email": INOREADER_EMAIL, + 
"Passwd": INOREADER_PASSWORD, + }, + ) + authorization = resp.text.split("\n")[2].split("=")[1] + + async with httpx.AsyncClient() as client: + headers = HEADERS + headers["Authorization"] = f"GoogleLogin auth={authorization}" + params = params or {} + params.update( + { + "AppId": INOREADER_APP_ID, + "AppKey": INOREADER_APP_KEY, + } + ) + resp = await client.get( + url=url, + params=params, + headers=headers, + ) + return resp diff --git a/apps/api/src/services/inoreader/process.py b/apps/api/src/services/inoreader/process.py new file mode 100644 index 0000000..7fc16e3 --- /dev/null +++ b/apps/api/src/services/inoreader/process.py @@ -0,0 +1,108 @@ +from typing import Union, Optional, Dict, Callable, Awaitable + +import httpx + +from src.config import TELEGRAM_BOT_CALLBACK_URL +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from src.services.inoreader import Inoreader +from src.services.scrapers.common import InfoExtractService +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import get_url_metadata, get_bool + +default_telegram_channel_id = None + +# Type alias for the message callback +MessageCallback = Callable[[dict, Union[int, str]], Awaitable[None]] + + +async def _default_message_callback(metadata_item: dict, chat_id: Union[int, str]) -> None: + """Default callback that sends via HTTP to the Telegram bot service.""" + async with httpx.AsyncClient() as client: + await client.post( + f"{TELEGRAM_BOT_CALLBACK_URL}/send_message", + json={"data": metadata_item, "chat_id": str(chat_id)}, + timeout=120, + ) + + +async def process_inoreader_data( + data: list, + use_inoreader_content: bool, + telegram_channel_id: Union[int, str] = default_telegram_channel_id, + stream_id: str = None, + message_callback: MessageCallback = None, +): + if message_callback is None: + message_callback = _default_message_callback + + for item in data: + url_type_item = await get_url_metadata(item["aurl"]) + url_type_dict = 
url_type_item.to_dict() + logger.debug(f"ino original: {use_inoreader_content}") + if ( + use_inoreader_content is True + or url_type_dict["content_type"] == "unknown" + ): + is_video = url_type_dict["content_type"] == "video" + content_type = url_type_dict["content_type"] if is_video else "social_media" + source = url_type_dict["source"] if is_video else "inoreader" + url_metadata = UrlMetadata( + url=item["aurl"], + content_type=content_type, + source=source, + ) + metadata_item = InfoExtractService( + url_metadata=url_metadata, + data=item, + store_document=True, + category=item["category"], + ) + else: + metadata_item = InfoExtractService( + url_metadata=url_type_item, + data=item, + store_document=True, + ) + message_metadata_item = await metadata_item.get_item() + await message_callback(message_metadata_item, telegram_channel_id) + if stream_id: + await Inoreader.mark_all_as_read( + stream_id=stream_id, timestamp=item["timestamp"] - 1 + ) + + +async def get_inoreader_item_async( + data: Optional[Dict] = None, + trigger: bool = False, + params: Optional[Dict] = None, + message_callback: MessageCallback = None, +) -> None: + stream_id = None + use_inoreader_content = True + telegram_channel_id = default_telegram_channel_id + if trigger and params and not data: + logger.debug(f"params:{params}") + use_inoreader_content = get_bool(params.get("useInoreaderContent"), True) + stream_type = params.get("streamType", "broadcast") + telegram_channel_id = params.get("channelId", default_telegram_channel_id) + tag = params.get("tag", None) + feed = params.get("feed", None) + the_remaining_params = { + k: v + for k, v in params.items() + if k not in ["streamType", "channelId", "tag", "feed"] + } + data = await Inoreader.get_api_item_data( + stream_type=stream_type, tag=tag, params=the_remaining_params, feed=feed + ) + if not data: + return + stream_id = Inoreader.get_stream_id(stream_type=stream_type, tag=tag, feed=feed) + if type(data) is dict: + data = [data] + await 
process_inoreader_data( + data, use_inoreader_content, telegram_channel_id, stream_id, + message_callback=message_callback, + ) + if stream_id: + await Inoreader.mark_all_as_read(stream_id=stream_id) diff --git a/apps/api/src/services/scrapers/__init__.py b/apps/api/src/services/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/src/services/scrapers/bluesky/__init__.py b/apps/api/src/services/scrapers/bluesky/__init__.py new file mode 100644 index 0000000..274d049 --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/__init__.py @@ -0,0 +1,45 @@ +import traceback +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Dict, Optional, Any + +import httpx +import jmespath + +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html + + +@dataclass +class Bluesky(MetadataItem): + cid: str = "" + author_did: str = "" + retweet_post: Optional["Bluesky"] = None + + @staticmethod + def from_dict(obj: Any) -> "Bluesky": + bluesky_item = MetadataItem.from_dict(obj) + bluesky_item.cid = obj.get("cid") + bluesky_item.author_did = obj.get("author_did") + return Bluesky( + url=bluesky_item.url, + title=bluesky_item.title, + author=bluesky_item.author, + author_url=bluesky_item.author_url, + telegraph_url=bluesky_item.telegraph_url, + text=bluesky_item.text, + content=bluesky_item.content, + media_files=bluesky_item.media_files, + category=bluesky_item.category, + message_type=bluesky_item.message_type, + cid=bluesky_item.cid, + author_did=bluesky_item.author_did, + ) + + def to_dict(self) -> dict: + result: dict = super().to_dict() + result["cid"] = self.cid + result["author_did"] = self.author_did + if self.retweet_post: + result["retweet_post"] = self.retweet_post.to_dict() + return result diff --git a/apps/api/src/services/scrapers/bluesky/config.py 
b/apps/api/src/services/scrapers/bluesky/config.py new file mode 100644 index 0000000..3183639 --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/config.py @@ -0,0 +1,3 @@ +BLUESKY_HOST = "https://bsky.app" + +BLUESKY_MAX_LENGTH = 800 diff --git a/apps/api/src/services/scrapers/bluesky/scraper.py b/apps/api/src/services/scrapers/bluesky/scraper.py new file mode 100644 index 0000000..fd3799a --- /dev/null +++ b/apps/api/src/services/scrapers/bluesky/scraper.py @@ -0,0 +1,191 @@ +from typing import Optional +from urllib.parse import urlparse + +from atproto import AsyncClient, IdResolver, AtUri +from atproto_client.models.app.bsky.embed.record import ViewRecord +from atproto_client.models.app.bsky.feed.defs import ThreadViewPost, PostView + +from src.config import JINJA2_ENV +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from src.services.scrapers.scraper import Scraper, DataProcessor +from src.services.scrapers.bluesky import Bluesky +from src.services.scrapers.bluesky.config import BLUESKY_HOST, BLUESKY_MAX_LENGTH +from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.utils.parse import wrap_text_into_html + +telegram_text_template = JINJA2_ENV.get_template("bluesky_telegram_text.jinja2") +content_template = JINJA2_ENV.get_template("bluesky_content.jinja2") + + +class BlueskyPost: + def __init__(self, bluesky_url: str): + self.url: str = bluesky_url + bluesky_url_parser = urlparse(bluesky_url) + self.bluesky_host: Optional[str] = bluesky_url_parser.netloc + bluesky_path = bluesky_url_parser.path + self.handle: Optional[str] = bluesky_path.split("/")[2] + self.post_rkey: Optional[str] = bluesky_path.split("/")[-1] + self.did: str = BlueskyScraper.id_resolver.handle.resolve(self.handle) + + +class BlueskyDataProcessor(DataProcessor): + + def __init__(self, url: str, bluesky_thread_data: ThreadViewPost): + self.url: str = url + self.bluesky_thread_data: ThreadViewPost = bluesky_thread_data + logger.debug( 
+ f"BlueskyDataProcessor initialized with url: {url}\n and bluesky_thread_data: \n{bluesky_thread_data}") + self._data: dict = {} + + async def get_item(self) -> dict: + await self.process_data() + bluesky_item = Bluesky.from_dict(self._data) + return bluesky_item.to_dict() + pass + + async def process_data(self): + await self._resolve_thread_data() + + async def _resolve_thread_data(self) -> None: + base_post_view_data = await BlueskyDataProcessor._resolve_single_post_data(self.bluesky_thread_data.post) + base_post_view_data["url"] = self.url + + post_author_did = base_post_view_data["author_did"] + + parent_posts_text = "" + parent_posts_content = "" + parent_posts_media_files = [] + replies_posts_text = "" + replies_posts_content = "" + replies_posts_media_files = [] + # get post data from the parent posts whose author is the same as the base post author + if self.bluesky_thread_data.parent: + parent_posts_data = [] + parent_post_view = self.bluesky_thread_data.parent + await BlueskyDataProcessor._get_parent_posts_data(parent_post_view, parent_posts_data) + if parent_posts_data: + for post_data in parent_posts_data: + parent_posts_text += "\n" + post_data["text"] + parent_posts_content += post_data["content"] + parent_posts_media_files.extend(post_data["media_files"]) + # get post data from the replies whose author is the same as the base post author + if self.bluesky_thread_data.replies: + replies_posts_data = [] + for post_thread_view in self.bluesky_thread_data.replies: + post_view = post_thread_view.post + if post_author_did == post_view.author.did: + post_data = await BlueskyDataProcessor._resolve_single_post_data(post_view) + replies_posts_data.append(post_data) + if replies_posts_data: + for post_data in replies_posts_data: + replies_posts_text += "\n" + post_data["text"] + replies_posts_content += post_data["content"] + replies_posts_media_files.extend(post_data["media_files"]) + base_post_view_data["text"] = parent_posts_text + 
base_post_view_data["text"] + replies_posts_text + base_post_view_data["content"] = parent_posts_content + base_post_view_data["content"] + replies_posts_content + base_post_view_data["media_files"] = parent_posts_media_files + base_post_view_data[ + "media_files"] + replies_posts_media_files + + if len(base_post_view_data["text"]) > BLUESKY_MAX_LENGTH: + base_post_view_data["message_type"] = MessageType.LONG + else: + base_post_view_data["message_type"] = MessageType.SHORT + + self._data = base_post_view_data + + @staticmethod + async def _get_parent_posts_data(parent_post_view: ThreadViewPost, parent_posts_data_list: list) -> None: + parent_post_data = await BlueskyDataProcessor._resolve_single_post_data(parent_post_view.post) + parent_posts_data_list.append(parent_post_data) + if parent_post_view.parent: + await BlueskyDataProcessor._get_parent_posts_data(parent_post_view.parent, parent_posts_data_list) + + @staticmethod + async def _resolve_single_post_data(post_data: PostView) -> dict: + at_uri = AtUri.from_str(post_data.uri) + url = BLUESKY_HOST + "/profile/" + post_data.author.handle + "/post/" + at_uri.rkey + author = post_data.author.display_name + author_url = BLUESKY_HOST + "/profile/" + post_data.author.handle + author_did = post_data.author.did + text = post_data.record.text + created_at = post_data.record.created_at + + parsed_post_data = { + "url": url, + "title": author + "\'s Bluesky post", + "author": author, + "author_url": author_url, + "text": text, + "category": "bluesky", + "media_files": [], + "created_at": created_at, + "author_did": author_did, + } + + media_files = [] + if post_data.embed is not None: + # images and videos + if "images" in post_data.embed.__dict__: + for image in post_data.embed.images: + img_url = image.fullsize + img_item = { + "media_type": "image", + "url": img_url, + "caption": "", + } + media_files.append(img_item) + # TODO: handle video, which is in m3u8 format that needs to be downloaded and converted to mp4 + 
parsed_post_data["media_files"] = media_files + # retweet post + if "record" in post_data.embed.__dict__ and post_data.embed.record is ViewRecord: + retweet_post_data = await BlueskyDataProcessor._resolve_single_post_data(post_data.embed.record) + parsed_post_data["retweet_post"] = retweet_post_data + + content = await BlueskyDataProcessor._generate_html_content(parsed_post_data) + text = await BlueskyDataProcessor._generate_telegram_text(parsed_post_data) + parsed_post_data["content"] = content + parsed_post_data["text"] = text + + return parsed_post_data + + @staticmethod + async def _generate_html_content(data: dict) -> str: + html_content_text = wrap_text_into_html(data["text"]) + data["html_content_text"] = html_content_text + content = content_template.render(data=data) + return content + + @staticmethod + async def _generate_telegram_text(data: dict) -> str: + text = telegram_text_template.render(data=data) + return text + + +class BlueskyScraper(Scraper): + id_resolver = IdResolver() + + def __init__(self, username: Optional[str] = None, password: Optional[str] = None): + self.client: AsyncClient = AsyncClient() + self.username: Optional[str] = username + self.password: Optional[str] = password + self.did: Optional[str] = None + + async def init(self): + if self.username and self.password: + await self.client.login(self.username, self.password) + # self.did = await self.client.com + + async def get_processor_by_url(self, url: str) -> BlueskyDataProcessor: + bluesky_post = BlueskyPost(url) + bluesky_post_data = await self._request_post_data(bluesky_post) + return BlueskyDataProcessor(url, bluesky_post_data) + + async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost: + profile_identify = bluesky_post.did or bluesky_post.handle + try: + post_data = await self.client.get_post(profile_identify=profile_identify, post_rkey=bluesky_post.post_rkey) + post_uri = post_data.uri + post_thread_data = await 
self.client.get_post_thread(uri=post_uri) + return post_thread_data.thread + except Exception as e: + logger.error(f"Error while getting post data: {e}") diff --git a/apps/api/src/services/scrapers/common.py b/apps/api/src/services/scrapers/common.py new file mode 100644 index 0000000..d7b83cf --- /dev/null +++ b/apps/api/src/services/scrapers/common.py @@ -0,0 +1,114 @@ +from typing import Optional, Any + +from src.models.database_model import Metadata +from fastfetchbot_shared.models.url_metadata import UrlMetadata +from fastfetchbot_shared.models.metadata_item import MessageType +from src.services import ( + telegraph, + inoreader +) +from src.services.file_export import video_download, document_export +from src.services.scrapers import twitter, wechat, reddit, weibo, zhihu, douban, instagram, xiaohongshu, threads +from src.services.scrapers.scraper_manager import ScraperManager +from src.database import save_instances +from fastfetchbot_shared.utils.logger import logger +from src.config import DATABASE_ON + + +class InfoExtractService(object): + service_classes: dict = { + "twitter": twitter.Twitter, + "threads": threads.Threads, + "reddit": reddit.Reddit, + "weibo": weibo.Weibo, + "wechat": wechat.Wechat, + "instagram": instagram.Instagram, + "douban": douban.Douban, + "zhihu": zhihu.Zhihu, + "xiaohongshu": xiaohongshu.Xiaohongshu, + "youtube": video_download.VideoDownloader, + "bilibili": video_download.VideoDownloader, + "inoreader": inoreader.Inoreader, + } + + def __init__( + self, + url_metadata: UrlMetadata, + data: Any = None, + store_database: Optional[bool] = DATABASE_ON, + store_telegraph: Optional[bool] = True, + store_document: Optional[bool] = False, + **kwargs, + ): + url_metadata = url_metadata.to_dict() + self.url = url_metadata["url"] + self.content_type = url_metadata["content_type"] + self.source = url_metadata["source"] + self.data = data + self.kwargs = kwargs + self.store_database = store_database + self.store_telegraph = store_telegraph 
+ self.store_document = store_document + + @property + def category(self) -> str: + return self.source + + async def get_item(self, metadata_item: Optional[dict] = None) -> dict: + if self.content_type == "video": + if not self.kwargs.get("category"): + self.kwargs["category"] = self.category + if not metadata_item: + try: + if self.category in ["bluesky", "weibo", "other", "unknown"]: # it is a workaround before the code refactor + await ScraperManager.init_scraper(self.category) + item_data_processor = await ScraperManager.scrapers[self.category].get_processor_by_url(url=self.url) + metadata_item = await item_data_processor.get_item() + else: + scraper_item = InfoExtractService.service_classes[self.category]( + url=self.url, data=self.data, **self.kwargs + ) + metadata_item = await scraper_item.get_item() + except Exception as e: + logger.error(f"Error while getting item: {e}") + raise e + logger.info(f"Got metadata item") + logger.debug(metadata_item) + metadata_item = await self.process_item(metadata_item) + return metadata_item + + async def process_item(self, metadata_item: dict) -> dict: + if metadata_item.get("message_type") == MessageType.LONG: + self.store_telegraph = True + logger.info("message type is long, store in telegraph") + if self.store_telegraph: + telegraph_item = telegraph.Telegraph.from_dict(metadata_item) + try: + telegraph_url = await telegraph_item.get_telegraph() + except Exception as e: + logger.error(f"Error while getting telegraph: {e}") + telegraph_url = "" + metadata_item["telegraph_url"] = telegraph_url + if self.store_document or ( + not self.store_document and metadata_item["telegraph_url"] == "" + ): + logger.info("store in document") + try: + pdf_document = document_export.pdf_export.PdfExport( + title=metadata_item["title"], html_string=metadata_item["content"] + ) + output_filename = await pdf_document.export(method="file") + metadata_item["media_files"].append( + { + "media_type": "document", + "url": output_filename, + 
"caption": "", + } + ) + except Exception as e: + logger.error(f"Error while exporting document: {e}") + metadata_item["title"] = metadata_item["title"].strip() + if self.store_database: + logger.info("store in database") + await save_instances(Metadata.model_construct(**metadata_item)) + return metadata_item diff --git a/apps/api/src/services/scrapers/douban/__init__.py b/apps/api/src/services/scrapers/douban/__init__.py new file mode 100644 index 0000000..4ea8712 --- /dev/null +++ b/apps/api/src/services/scrapers/douban/__init__.py @@ -0,0 +1,230 @@ +import re +from typing import Dict, Optional, Any +from enum import Enum +from urllib.parse import urlparse + +from bs4 import BeautifulSoup +from lxml import etree + +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.network import get_selector, HEADERS +from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from src.config import JINJA2_ENV + +SHORT_LIMIT = 600 + +short_text_template = JINJA2_ENV.get_template("douban_short_text.jinja2") +content_template = JINJA2_ENV.get_template("douban_content.jinja2") + + +class DoubanType(str, Enum): + MOVIE_REVIEW = "movie_review" + BOOK_REVIEW = "book_review" + NOTE = "note" + STATUS = "status" + GROUP = "group" + UNKNOWN = "unknown" + + +class Douban(MetadataItem): + item_title: Optional[str] + item_url: Optional[str] + group_name: Optional[str] + group_url: Optional[str] + douban_type: DoubanType + text_group: Optional[str] + raw_content: Optional[str] + date: Optional[str] + + def __init__(self, url: str, data: Optional[Any] = None, **kwargs): + # metadata fields + self.url = url + self.title = "" + self.author = "" + self.author_url = "" + self.text = "" + self.content = "" + self.media_files = [] + self.category = "douban" + self.message_type = MessageType.SHORT + # auxiliary fields + self.item_title: Optional[str] = None + self.item_url: Optional[str] = None + 
self.group_name: Optional[str] = None + self.group_url: Optional[str] = None + self.douban_type: DoubanType = DoubanType.UNKNOWN + self.text_group: Optional[str] = None + self.raw_content: Optional[str] = None + self.date: Optional[str] = None + # reqeust fields + self.headers = HEADERS + self.headers["Cookie"] = kwargs.get("cookie", "") + + async def get_item(self) -> dict: + await self.get_douban() + return self.to_dict() + + async def get_douban(self) -> None: + self.check_douban_type() + await self.get_douban_item() + + def check_douban_type(self): + urlparser = urlparse(self.url) + host = urlparser.netloc + path = urlparser.path + if host.find("m.douban") != -1: # parse the m.douban url + host = host.replace("m.douban", "douban") + if path.startswith("/movie/review"): + self.douban_type = DoubanType.MOVIE_REVIEW + host = host.replace("douban", "movie.douban") + path = path.replace("/movie/", "/") + elif path.startswith("/book/review"): + self.douban_type = DoubanType.BOOK_REVIEW + host = host.replace("douban", "book.douban") + path = path.replace("/book/", "/") + if path.startswith("/note/"): + self.douban_type = DoubanType.NOTE + elif path.startswith("/status/") or re.match(r"/people/\d+/status/\d+", path): + self.douban_type = DoubanType.STATUS + elif path.startswith("/group/topic/"): + self.douban_type = DoubanType.GROUP + elif host.startswith("movie.douban") and path.startswith("/review/"): + self.douban_type = DoubanType.MOVIE_REVIEW + elif host.startswith("book.douban") and path.startswith("/review/"): + self.douban_type = DoubanType.BOOK_REVIEW + else: + self.douban_type = DoubanType.UNKNOWN + self.url = f"https://{host}{path}" + + async def get_douban_item(self): + function_dict = { + DoubanType.MOVIE_REVIEW: self._get_douban_movie_review, + DoubanType.BOOK_REVIEW: self._get_douban_book_review, + DoubanType.NOTE: self._get_douban_note, + DoubanType.STATUS: self._get_douban_status, + DoubanType.GROUP: self._get_douban_group_article, + 
DoubanType.UNKNOWN: None, + } + await function_dict[self.douban_type]() + short_text = self._douban_short_text_process() + if short_text.endswith("\n"): + short_text = short_text[:-1] + data = self.__dict__ + data["short_text"] = short_text + self.text = short_text_template.render(data=data) + self.raw_content = self.raw_content_to_html(self.raw_content) + self.content = wrap_text_into_html( + content_template.render(data=data), is_html=True + ) + if get_html_text_length(self.content) > SHORT_LIMIT: + self.message_type = MessageType.LONG + else: + self.message_type = MessageType.SHORT + + async def _get_douban_movie_review(self): + selector = await get_selector(url=self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1//span)') + self.author = selector.xpath('string(//header[@class="main-hd"]//span)') + self.author_url = selector.xpath('string(//header[@class="main-hd"]/a/@href)') + self.item_title = selector.xpath('string(//header[@class="main-hd"]/a[2])') + self.item_url = selector.xpath('string(//header[@class="main-hd"]/a[2]/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath("//div[contains(@class,'review-content')]")[0], + encoding="utf-8", + ), + encoding="utf-8", + ) + + async def _get_douban_book_review(self): + selector = await get_selector(self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1//span)') + self.author = selector.xpath('string(//header[@class="main-hd"]//span)') + self.author_url = selector.xpath('string(//header[@class="main-hd"]/a/@href)') + self.item_title = selector.xpath('string(//header[@class="main-hd"]/a[2])') + self.item_url = selector.xpath('string(//header[@class="main-hd"]/a[2]/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + async def _get_douban_note(self): + selector = await get_selector(self.url, headers=self.headers) + 
self.title = selector.xpath("string(//h1)") + self.author = selector.xpath('string(//div[@class="content"]/a)') + self.author_url = selector.xpath('string(//div[@class="content"]/a/@href)') + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + async def _get_douban_status(self): + selector = await get_selector(self.url, headers=self.headers) + self.author = selector.xpath('string(//div[@class="content"]/a)') + self.author_url = selector.xpath('string(//div[@class="content"]/a/@href)') + self.title = self.author + "的广播" + self.raw_content = ( + str( + etree.tostring( + selector.xpath('//div[@class="status-saying"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + .replace("
    ", "") + .replace("
    ", "") + .replace(">+<", "><") + .replace(" ", "
    ") + ) + + async def _get_douban_group_article(self): + selector = await get_selector(self.url, headers=self.headers) + self.title = selector.xpath('string(//div[@id="content"]//h1)') + self.title = self.title.replace("\n", "").strip() + self.author = selector.xpath('string(//span[@class="from"]//a)') + self.author_url = selector.xpath('string(//span[@class="from"]//a/@href)') + self.group_name = selector.xpath( + 'string(//div[@id="g-side-info"]//div[@class="title"]/a)' + ) + self.group_url = selector.xpath( + 'string(//div[@id="g-side-info"]//div[@class="title"]/a/@href)' + ) + self.raw_content = str( + etree.tostring( + selector.xpath('//div[@id="link-report"]')[0], encoding="utf-8" + ), + encoding="utf-8", + ) + + def _douban_short_text_process(self) -> str: + soup = BeautifulSoup(self.raw_content, "html.parser") + for img in soup.find_all("img"): + media_item = {"media_type": "image", "url": img["src"], "caption": ""} + self.media_files.append(MediaFile.from_dict(media_item)) + img.extract() + for item in soup.find_all(["p", "span", "div"]): + item.unwrap() + for item in soup.find_all(["link", "script"]): + item.decompose() + for item in soup.find_all("a"): + if item.get("title") == "查看原图": + item.decompose() + short_text = str(soup) + short_text = re.sub(r"\n{2,}", "\n", short_text) + short_text = re.sub(r"", "\n", short_text) + return short_text + + @staticmethod + def raw_content_to_html(raw_content: str) -> str: + # Split the text into paragraphs based on double newlines + print(raw_content) + paragraphs = raw_content.split('
    \n') + # Wrap each paragraph with

    tags + print(paragraphs) + html_paragraphs = [f'

    {paragraph.strip()}

    ' for paragraph in paragraphs] + # Join the paragraphs to form the final HTML string + html_string = ''.join(html_paragraphs) + return html_string diff --git a/apps/api/src/services/scrapers/general/__init__.py b/apps/api/src/services/scrapers/general/__init__.py new file mode 100644 index 0000000..f256512 --- /dev/null +++ b/apps/api/src/services/scrapers/general/__init__.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import Any + +from fastfetchbot_shared.models.metadata_item import MetadataItem + + +@dataclass +class GeneralItem(MetadataItem): + """ + GeneralItem: Data class for scraped content from general webpage scrapers. + """ + id: str = "" + raw_content: str = "" + scraper_type: str = "" # Which scraper was used (e.g., "firecrawl", "zyte", etc.) + + @staticmethod + def from_dict(obj: Any) -> "GeneralItem": + metadata_item = MetadataItem.from_dict(obj) + return GeneralItem( + url=metadata_item.url, + title=metadata_item.title, + author=metadata_item.author, + author_url=metadata_item.author_url, + telegraph_url=metadata_item.telegraph_url, + text=metadata_item.text, + content=metadata_item.content, + media_files=metadata_item.media_files, + category=metadata_item.category, + message_type=metadata_item.message_type, + id=obj.get("id", ""), + raw_content=obj.get("raw_content", ""), + scraper_type=obj.get("scraper_type", ""), + ) + + def to_dict(self) -> dict: + result: dict = super().to_dict() + result["id"] = self.id + result["raw_content"] = self.raw_content + result["scraper_type"] = self.scraper_type + return result diff --git a/apps/api/src/services/scrapers/general/base.py b/apps/api/src/services/scrapers/general/base.py new file mode 100644 index 0000000..8d454d6 --- /dev/null +++ b/apps/api/src/services/scrapers/general/base.py @@ -0,0 +1,208 @@ +import hashlib +from abc import abstractmethod +from typing import Optional +from urllib.parse import urlparse + +from bs4 import BeautifulSoup, Doctype +from openai import 
AsyncOpenAI +from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam + +from src.config import OPENAI_API_KEY +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from src.services.scrapers.scraper import Scraper, DataProcessor +from src.services.scrapers.general import GeneralItem +from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.logger import logger + +GENERAL_TEXT_LIMIT = 800 + +DEFAULT_OPENAI_MODEL = "gpt-5-nano" + +# System prompt for LLM to extract article content +ARTICLE_EXTRACTION_PROMPT = """You are an expert content extractor. Your task is to extract the main article content from the provided HTML. + +Instructions: +1. Identify and extract ONLY the main article/post content +2. Remove navigation, headers, footers, sidebars, ads, comments, and other non-article elements +3. Preserve the article's structure (headings, paragraphs, lists, etc.) +4. Keep important formatting like bold, italic, links, and images +5. Return clean HTML containing only the article content +6. If you cannot identify the main content, return the original HTML unchanged +7. After all of the above, remove some basic HTML tags like , ,