diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..bdccd27 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,24 @@ +coverage: + status: + # 1. Project Coverage: Enforces rules on the entire codebase + project: + default: + target: auto # 'auto' means the PR cannot decrease the overall coverage + threshold: 1% # Allows a 1% drop margin for acceptable fluctuations + + # 2. Patch Coverage: Enforces rules ONLY on the lines of code modified in the PR + patch: + default: + target: 80% # Require at least 80% coverage on new/changed lines in the PR + +# 3. Ignore paths: Exclude test files and configs from coverage calculations +ignore: + - "tests/**/*" + - ".github/**/*" + - "**/__init__.py" + +# Optional: Configure the Codecov PR comment bot +comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: false # Always post a comment, even if coverage didn't change \ No newline at end of file diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml new file mode 100644 index 0000000..178fe10 --- /dev/null +++ b/.github/workflows/pr-gate.yml @@ -0,0 +1,43 @@ +name: pr-gate + +on: + pull_request: + branches: + - main + +concurrency: + group: pr-gate-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + test: + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.10.4" + + - name: Set up Python + run: uv python install 3.12 + + - name: Install dependencies + run: uv sync + + - name: Run tests with coverage + run: uv run pytest --cov --cov-report=xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + files: coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: true diff --git a/apps/telegram-bot/tests/conftest.py b/apps/telegram-bot/tests/conftest.py deleted file mode 100644 index d006314..0000000 ---
a/apps/telegram-bot/tests/conftest.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Test fixtures for the telegram-bot app. - -All bot service calls are mocked — tests verify routing, auth, and -request handling without touching the real Telegram API. -""" - -import os - -import pytest -import pytest_asyncio -from contextlib import asynccontextmanager -from unittest.mock import patch - -from httpx import AsyncClient, ASGITransport - -TEST_TELEGRAM_SECRET = "test-telegram-secret-token" - -os.environ["TELEGRAM_BOT_SECRET_TOKEN"] = TEST_TELEGRAM_SECRET -os.environ["TELEGRAM_BOT_TOKEN"] = "000000000:AAFakeTokenForTesting" -os.environ["DATABASE_ON"] = "false" -os.environ["BASE_URL"] = "localhost" - - -@pytest.fixture(scope="session") -def anyio_backend(): - return "asyncio" - - -@pytest_asyncio.fixture(scope="module") -async def app(): - """ - Create a Starlette webhook_app with a no-op lifespan - so we can test routes without real bot initialization. - """ - @asynccontextmanager - async def mock_lifespan(app): - yield - - with patch("core.webhook.server.lifespan", mock_lifespan): - from starlette.applications import Starlette - from starlette.routing import Route - from core.webhook.server import telegram_webhook, send_message_endpoint, health - - test_app = Starlette( - routes=[ - Route("/webhook", telegram_webhook, methods=["POST"]), - Route("/send_message", send_message_endpoint, methods=["POST"]), - Route("/health", health, methods=["GET"]), - ], - lifespan=mock_lifespan, - ) - yield test_app - - -@pytest_asyncio.fixture(scope="module") -async def client(app): - """Async HTTP client hitting the Starlette app via ASGI transport.""" - transport = ASGITransport(app=app) - async with AsyncClient(transport=transport, base_url="http://test") as ac: - yield ac - - -@pytest.fixture -def telegram_auth_headers(): - """Headers dict with valid Telegram secret token.""" - return {"X-Telegram-Bot-Api-Secret-Token": TEST_TELEGRAM_SECRET} diff --git 
a/apps/telegram-bot/tests/test_webhook.py b/apps/telegram-bot/tests/test_webhook.py deleted file mode 100644 index e1ddd93..0000000 --- a/apps/telegram-bot/tests/test_webhook.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -Tests for the telegram-bot webhook endpoint. - -Verifies: - - Auth: valid/missing/wrong secret token handling - - Processing: updates are dispatched via asyncio.create_task (fire-and-forget) - - Response: 200 with {"status": "ok"} for valid requests -""" - -import asyncio - -import pytest -from unittest.mock import AsyncMock, patch - -from tests.conftest import TEST_TELEGRAM_SECRET - - -SAMPLE_UPDATE = { - "update_id": 123456, - "message": { - "message_id": 1, - "text": "/start", - "chat": {"id": 789, "type": "private"}, - }, -} - - -class TestTelegramWebhook: - """Tests for POST /webhook""" - - @pytest.mark.asyncio - async def test_webhook_accepts_valid_update(self, client, telegram_auth_headers): - """Valid secret token + JSON body -> 200, process_telegram_update is called.""" - with patch( - "core.webhook.server.process_telegram_update", - new_callable=AsyncMock, - ) as mock_process: - resp = await client.post( - "/webhook", - json=SAMPLE_UPDATE, - headers=telegram_auth_headers, - ) - - assert resp.status_code == 200 - assert resp.json() == {"status": "ok"} - - # Allow the create_task coroutine to run - await asyncio.sleep(0) - mock_process.assert_called_once_with(SAMPLE_UPDATE) - - @pytest.mark.asyncio - async def test_webhook_rejects_missing_token(self, client): - """No secret token header -> 401.""" - resp = await client.post("/webhook", json=SAMPLE_UPDATE) - assert resp.status_code == 401 - assert resp.json() == {"error": "unauthorized"} - - @pytest.mark.asyncio - async def test_webhook_rejects_wrong_token(self, client): - """Wrong secret token -> 401.""" - resp = await client.post( - "/webhook", - json=SAMPLE_UPDATE, - headers={"X-Telegram-Bot-Api-Secret-Token": "wrong-token"}, - ) - assert resp.status_code == 401 - assert resp.json() == 
{"error": "unauthorized"} - - @pytest.mark.asyncio - async def test_webhook_responds_before_processing_completes( - self, client, telegram_auth_headers - ): - """ - The webhook must return 200 immediately, before the update - processing finishes. This is the fire-and-forget behavior that - prevents Telegram from timing out on slow handlers. - """ - processing_started = asyncio.Event() - processing_gate = asyncio.Event() - - async def slow_process(data): - processing_started.set() - await processing_gate.wait() # Block until test releases - - with patch( - "core.webhook.server.process_telegram_update", - side_effect=slow_process, - ): - resp = await client.post( - "/webhook", - json=SAMPLE_UPDATE, - headers=telegram_auth_headers, - ) - - # Response arrived while processing is still blocked - assert resp.status_code == 200 - assert resp.json() == {"status": "ok"} - - # Let the background task finish to avoid warnings - processing_gate.set() - await asyncio.sleep(0) diff --git a/packages/shared/tests/test_user_setting.py b/packages/shared/tests/test_user_setting.py deleted file mode 100644 index 4acebcd..0000000 --- a/packages/shared/tests/test_user_setting.py +++ /dev/null @@ -1,115 +0,0 @@ -import pytest -import pytest_asyncio -from sqlalchemy import select -from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine - -from fastfetchbot_shared.database.base import Base -from fastfetchbot_shared.database.models.user_setting import UserSetting - - -@pytest_asyncio.fixture -async def db_session(): - """In-memory SQLite session for testing.""" - engine = create_async_engine("sqlite+aiosqlite://", echo=False) - async with engine.begin() as conn: - await conn.run_sync(Base.metadata.create_all) - session_factory = async_sessionmaker(engine, expire_on_commit=False) - async with session_factory() as session: - yield session - await engine.dispose() - - -@pytest.mark.asyncio -async def test_create_user_setting(db_session): - setting = 
UserSetting(telegram_user_id=123456789, auto_fetch_in_dm=True) - db_session.add(setting) - await db_session.commit() - - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == 123456789) - ) - fetched = result.scalar_one() - assert fetched.auto_fetch_in_dm is True - assert fetched.created_at is not None - assert fetched.updated_at is not None - - -@pytest.mark.asyncio -async def test_toggle_user_setting(db_session): - setting = UserSetting(telegram_user_id=123456789, auto_fetch_in_dm=True) - db_session.add(setting) - await db_session.commit() - - setting.auto_fetch_in_dm = False - await db_session.commit() - - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == 123456789) - ) - fetched = result.scalar_one() - assert fetched.auto_fetch_in_dm is False - - -@pytest.mark.asyncio -async def test_default_auto_fetch_is_true(db_session): - setting = UserSetting(telegram_user_id=999999) - db_session.add(setting) - await db_session.commit() - - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == 999999) - ) - fetched = result.scalar_one() - assert fetched.auto_fetch_in_dm is True - - -@pytest.mark.asyncio -async def test_no_record_returns_none(db_session): - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == 888888) - ) - assert result.scalar_one_or_none() is None - - -@pytest.mark.asyncio -async def test_ensure_user_settings_creates_row(db_session): - """ensure pattern: first call creates row with defaults, second is a no-op.""" - user_id = 777777 - - # No row yet - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == user_id) - ) - assert result.scalar_one_or_none() is None - - # Simulate ensure: create if missing - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == user_id) - ) - if result.scalar_one_or_none() is None: - 
db_session.add(UserSetting(telegram_user_id=user_id)) - await db_session.commit() - - # Row exists with defaults - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == user_id) - ) - setting = result.scalar_one() - assert setting.auto_fetch_in_dm is True - assert setting.created_at is not None - - # Second ensure is a no-op — row unchanged - original_created_at = setting.created_at - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == user_id) - ) - if result.scalar_one_or_none() is None: - db_session.add(UserSetting(telegram_user_id=user_id)) - await db_session.commit() - - result = await db_session.execute( - select(UserSetting).where(UserSetting.telegram_user_id == user_id) - ) - setting = result.scalar_one() - assert setting.auto_fetch_in_dm is True - assert setting.created_at == original_created_at diff --git a/pyproject.toml b/pyproject.toml index 43c35c4..3100a79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "pytest>=8.3.5,<9.0.0", "pytest-asyncio>=0.26.0,<0.27.0", "celery-types>=0.24.0", + "pytest-cov>=7.1.0", ] [build-system] @@ -74,3 +75,4 @@ fastfetchbot-file-export = { workspace = true } [tool.pytest.ini_options] asyncio_default_fixture_loop_scope = "module" +testpaths = ["tests"] diff --git a/apps/telegram-bot/tests/__init__.py b/tests/integration/__init__.py similarity index 100% rename from apps/telegram-bot/tests/__init__.py rename to tests/integration/__init__.py diff --git a/tests/routers/test_scraper.py b/tests/routers/test_scraper.py deleted file mode 100644 index 4aebbef..0000000 --- a/tests/routers/test_scraper.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Tests for /scraper router endpoints. - -Endpoints: - POST /scraper/getItem — Scrape content from a URL - POST /scraper/getUrlMetadata — Get URL metadata without scraping - -All downstream services (InfoExtractService, get_url_metadata) are mocked. 
-We only test: routing, auth, parameter parsing, and response shape. -""" - -import pytest -from unittest.mock import AsyncMock, patch - -from tests.conftest import TEST_API_KEY, TEST_API_KEY_NAME - -# NOTE on "no API key" tests: -# auth.py has a bug where verify_key checks `api_key_query is None` (module-level -# variable, always not None) instead of checking `input_key is None`. When no key -# is provided, secrets.compare_digest(None, str) raises TypeError which propagates -# as an unhandled exception. These tests are marked xfail to document this known bug. -# Once auth.py is fixed, remove xfail and assert 401. - - -# ─── POST /scraper/getItem ─────────────────────────────────────────── - - -class TestGetItem: - """Tests for POST /scraper/getItem""" - - @pytest.mark.asyncio - async def test_returns_scraped_data( - self, client, auth_params, mock_get_url_metadata, mock_info_extract_service - ): - """Happy path: valid API key + valid url → returns scraped result.""" - _, mock_result = mock_info_extract_service - params = {**auth_params, "url": "https://twitter.com/user/status/123"} - - resp = await client.post("/scraper/getItem", params=params) - - assert resp.status_code == 200 - assert resp.json() == mock_result - - @pytest.mark.asyncio - async def test_rejects_with_wrong_api_key(self, client): - """Wrong API key → 401.""" - resp = await client.post( - "/scraper/getItem", - params={TEST_API_KEY_NAME: "wrong-key", "url": "https://example.com"}, - ) - assert resp.status_code == 401 - - @pytest.mark.xfail( - reason="auth.py bug: verify_key checks wrong variable for None, " - "TypeError propagates instead of returning 401", - raises=TypeError, - strict=True, - ) - @pytest.mark.asyncio - async def test_no_api_key_returns_401(self, client): - """No API key → should be 401. 
Blocked by auth.py bug.""" - resp = await client.post( - "/scraper/getItem", params={"url": "https://example.com"} - ) - assert resp.status_code == 401 - - @pytest.mark.xfail( - reason="scraper.py does dict.pop('url') without default → unhandled KeyError", - raises=KeyError, - strict=True, - ) - @pytest.mark.asyncio - async def test_missing_url_returns_error( - self, client, auth_params, mock_get_url_metadata, mock_info_extract_service - ): - """No url param → should return 4xx, but KeyError propagates unhandled.""" - resp = await client.post("/scraper/getItem", params=auth_params) - assert resp.status_code in (400, 422) - - @pytest.mark.asyncio - async def test_strips_api_key_from_downstream_params( - self, client, auth_params, mock_get_url_metadata, mock_info_extract_service - ): - """ - API_KEY_NAME should be stripped from query_params before passing - to InfoExtractService. Extra params should pass through. - """ - mock_cls, _ = mock_info_extract_service - params = { - **auth_params, - "url": "https://twitter.com/user/status/123", - "extra_option": "value", - } - - resp = await client.post("/scraper/getItem", params=params) - - assert resp.status_code == 200 - # InfoExtractService(url_metadata, **query_params) — verify call - call_args, call_kwargs = mock_cls.call_args - # API key name must NOT be in kwargs - assert TEST_API_KEY_NAME not in call_kwargs - # extra_option MUST be in kwargs - assert call_kwargs.get("extra_option") == "value" - - @pytest.mark.asyncio - async def test_passes_ban_list_to_metadata( - self, client, auth_params, mock_get_url_metadata, mock_info_extract_service - ): - """ban_list param should be forwarded to get_url_metadata.""" - mock_fn, _ = mock_get_url_metadata - params = { - **auth_params, - "url": "https://twitter.com/user/status/123", - "ban_list": "twitter,weibo", - } - - resp = await client.post("/scraper/getItem", params=params) - - assert resp.status_code == 200 - mock_fn.assert_called_once_with( - 
"https://twitter.com/user/status/123", "twitter,weibo" - ) - - -# ─── POST /scraper/getUrlMetadata ──────────────────────────────────── - - -class TestGetUrlMetadata: - """Tests for POST /scraper/getUrlMetadata""" - - @pytest.mark.asyncio - async def test_returns_metadata_dict( - self, client, auth_params, mock_get_url_metadata - ): - """Happy path: returns UrlMetadata.to_dict() result.""" - params = {**auth_params, "url": "https://twitter.com/user/status/123"} - - resp = await client.post("/scraper/getUrlMetadata", params=params) - - assert resp.status_code == 200 - data = resp.json() - assert data["source"] == "twitter" - assert data["content_type"] == "social_media" - assert "url" in data - - @pytest.mark.asyncio - async def test_rejects_with_wrong_api_key(self, client): - """Wrong API key → 401.""" - resp = await client.post( - "/scraper/getUrlMetadata", - params={TEST_API_KEY_NAME: "wrong-key", "url": "https://example.com"}, - ) - assert resp.status_code == 401 - - @pytest.mark.xfail( - reason="auth.py bug: verify_key checks wrong variable for None", - raises=TypeError, - strict=True, - ) - @pytest.mark.asyncio - async def test_no_api_key_returns_401(self, client): - """No API key → should be 401. 
Blocked by auth.py bug.""" - resp = await client.post( - "/scraper/getUrlMetadata", params={"url": "https://example.com"} - ) - assert resp.status_code == 401 - - @pytest.mark.asyncio - async def test_metadata_url_and_ban_list_passthrough( - self, client, auth_params, mock_get_url_metadata - ): - """url and ban_list params reach get_url_metadata unchanged.""" - mock_fn, _ = mock_get_url_metadata - test_url = "https://weibo.com/some/post/456" - params = {**auth_params, "url": test_url, "ban_list": "reddit"} - - await client.post("/scraper/getUrlMetadata", params=params) - - mock_fn.assert_called_once() - args = mock_fn.call_args[0] - assert args[0] == test_url diff --git a/tests/routers/test_telegram_bot.py b/tests/routers/test_telegram_bot.py deleted file mode 100644 index 8673c19..0000000 --- a/tests/routers/test_telegram_bot.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Tests for /telegram router endpoints. - -Endpoints: - POST /telegram/bot/webhook — Receive Telegram updates - GET /telegram/bot/set_webhook — Set the webhook URL - -All Telegram service calls are mocked. -""" - -import pytest -from unittest.mock import AsyncMock, patch - -from tests.conftest import ( - TEST_API_KEY, - TEST_API_KEY_NAME, - TEST_TELEGRAM_SECRET, -) - - -class TestTelegramWebhook: - """Tests for POST /telegram/bot/webhook""" - - @pytest.mark.asyncio - async def test_webhook_accepts_valid_update( - self, client, telegram_auth_headers - ): - """ - Valid secret token + JSON body → 200, background task queued. 
- """ - with patch( - "app.routers.telegram_bot.process_telegram_update", - new_callable=AsyncMock, - ): - update_data = { - "update_id": 123456, - "message": { - "message_id": 1, - "text": "/start", - "chat": {"id": 789, "type": "private"}, - }, - } - - resp = await client.post( - "/telegram/bot/webhook", - json=update_data, - headers=telegram_auth_headers, - ) - - assert resp.status_code == 200 - assert resp.json() == "ok" - # Background task should have been called with the update data - # Note: BackgroundTasks in test mode may execute synchronously - # The key assertion is that the endpoint accepted the request - - @pytest.mark.asyncio - async def test_webhook_rejects_missing_token(self, client): - """No secret token header → 401.""" - resp = await client.post( - "/telegram/bot/webhook", - json={"update_id": 1}, - ) - assert resp.status_code == 401 - - @pytest.mark.asyncio - async def test_webhook_rejects_wrong_token(self, client): - """Wrong secret token → 401.""" - resp = await client.post( - "/telegram/bot/webhook", - json={"update_id": 1}, - headers={"X-Telegram-Bot-Api-Secret-Token": "wrong-token"}, - ) - assert resp.status_code == 401 - - -class TestSetWebhook: - """Tests for GET /telegram/bot/set_webhook""" - - @pytest.mark.asyncio - async def test_set_webhook_success(self, client, auth_params): - """set_webhook returns True → 200 'ok'.""" - with patch( - "app.routers.telegram_bot.set_webhook", - new_callable=AsyncMock, - return_value=True, - ): - resp = await client.get( - "/telegram/bot/set_webhook", params=auth_params - ) - assert resp.status_code == 200 - assert resp.json() == "ok" - - @pytest.mark.asyncio - async def test_set_webhook_failure(self, client, auth_params): - """set_webhook returns False → 500.""" - with patch( - "app.routers.telegram_bot.set_webhook", - new_callable=AsyncMock, - return_value=False, - ): - resp = await client.get( - "/telegram/bot/set_webhook", params=auth_params - ) - assert resp.status_code == 500 - - 
@pytest.mark.asyncio - async def test_set_webhook_wrong_api_key(self, client): - """Wrong API key → 401.""" - with patch( - "app.routers.telegram_bot.set_webhook", - new_callable=AsyncMock, - return_value=True, - ): - resp = await client.get( - "/telegram/bot/set_webhook", - params={TEST_API_KEY_NAME: "bad-key"}, - ) - assert resp.status_code == 401 - - @pytest.mark.xfail( - reason="auth.py bug: verify_key checks wrong variable for None", - raises=TypeError, - strict=True, - ) - @pytest.mark.asyncio - async def test_set_webhook_no_api_key_returns_401(self, client): - """No API key → should be 401. Blocked by auth.py bug.""" - with patch( - "app.routers.telegram_bot.set_webhook", - new_callable=AsyncMock, - return_value=True, - ): - resp = await client.get("/telegram/bot/set_webhook") - assert resp.status_code == 401 diff --git a/tests/routers/test_twitter.py b/tests/routers/test_twitter.py deleted file mode 100644 index f7d9e6e..0000000 --- a/tests/routers/test_twitter.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Tests for /twitter router endpoints. - -Endpoints: - POST /twitter/repost — Handle twitter repost webhook - -NOTE: twitter router is NOT registered in production app (main.py). - It's included in the test app via conftest.py for testing purposes. - This is either an oversight or intentional — flag for review. - -InfoExtractService is mocked — we don't make real Twitter API calls in tests. 
-""" - -import pytest -from unittest.mock import AsyncMock, patch - -from tests.conftest import TEST_API_KEY, TEST_API_KEY_NAME - - -class TestTwitterRepost: - """Tests for POST /twitter/repost""" - - @pytest.mark.asyncio - async def test_repost_returns_ok(self, client, auth_params): - """Happy path: valid url → InfoExtractService called → returns 'ok'.""" - with patch( - "app.routers.twitter.InfoExtractService" - ) as MockCls: - instance = MockCls.return_value - instance.get_item = AsyncMock(return_value={"text": "mocked"}) - - params = {**auth_params, "url": "https://twitter.com/user/status/999"} - resp = await client.post("/twitter/repost", params=params) - - assert resp.status_code == 200 - assert resp.json() == "ok" - - # Verify InfoExtractService was constructed with correct metadata dict - call_args = MockCls.call_args[0][0] - assert call_args["url"] == "https://twitter.com/user/status/999" - assert call_args["source"] == "twitter" - assert call_args["type"] == "social_media" - - # Verify get_item was actually called - instance.get_item.assert_awaited_once() - - @pytest.mark.asyncio - async def test_repost_rejects_wrong_api_key(self, client): - """Wrong API key → 401.""" - resp = await client.post( - "/twitter/repost", - params={ - TEST_API_KEY_NAME: "totally-wrong-key", - "url": "https://twitter.com/x/status/1", - }, - ) - assert resp.status_code == 401 - - @pytest.mark.xfail( - reason="auth.py bug: verify_key checks wrong variable for None", - raises=TypeError, - strict=True, - ) - @pytest.mark.asyncio - async def test_repost_no_api_key_returns_401(self, client): - """No API key → should be 401. 
Blocked by auth.py bug.""" - resp = await client.post( - "/twitter/repost", - params={"url": "https://twitter.com/x/status/1"}, - ) - assert resp.status_code == 401 - - @pytest.mark.asyncio - async def test_repost_missing_url(self, client, auth_params): - """Missing url param → 422 (FastAPI validation error for required param).""" - resp = await client.post("/twitter/repost", params=auth_params) - assert resp.status_code == 422 diff --git a/tests/test_bluesky.py b/tests/test_bluesky.py deleted file mode 100644 index 9639cc5..0000000 --- a/tests/test_bluesky.py +++ /dev/null @@ -1,81 +0,0 @@ -from typing import Tuple - -import pytest -import pytest_asyncio - -from app.services.scrapers.bluesky.scraper import BlueskyScraper -from app.services.scrapers.scraper_manager import ScraperManager -from app.utils.logger import logger -from tests.cases.bluesky import bluesky_cases - - -@pytest_asyncio.fixture(scope="module", autouse=True) -async def bluesky_scraper(): - bluesky_scraper = await ScraperManager.init_bluesky_scraper() - return bluesky_scraper - - -async def get_item_from_url(bluesky_scraper: BlueskyScraper, url: str) -> dict: - data_processor = await bluesky_scraper.get_processor_by_url(url) - item = await data_processor.get_item() - return item - - -async def get_test_data(bluesky_scraper: BlueskyScraper, case: str) -> Tuple[dict, dict]: - data = await get_item_from_url(bluesky_scraper=bluesky_scraper, url=bluesky_cases[case]["url"]) - return data, bluesky_cases[case]["expected"] - - -@pytest.mark.asyncio -async def test_bluesky_init(bluesky_scraper: BlueskyScraper): - assert bluesky_scraper is not None - assert isinstance(bluesky_scraper, BlueskyScraper) - - -@pytest.mark.asyncio -async def test_bluesky_pure_text_post(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "pure_text") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_text_with_media_post(bluesky_scraper: 
BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "text_with_media") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_text_with_text_repost_post(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "text_with_text_repost") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_single_video_post(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "single_video_2") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_post_in_middle_of_thread(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "post_in_middle_of_thread") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_post_as_first_of_thread(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "post_as_first_of_thread") - assert True - # assert data == expected - - -@pytest.mark.asyncio -async def test_bluesky_post_as_last_of_thread(bluesky_scraper: BlueskyScraper): - data, expected = await get_test_data(bluesky_scraper, "post_as_last_of_thread") - assert True - # assert data == expected diff --git a/tests/test_weibo.py b/tests/test_weibo.py deleted file mode 100644 index ae36f13..0000000 --- a/tests/test_weibo.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Tuple - -import pytest -import pytest_asyncio - -from app.services.scrapers.weibo.scraper import WeiboScraper -from app.services.scrapers.scraper_manager import ScraperManager -from app.utils.logger import logger -from tests.cases.weibo import weibo_cases - - -@pytest_asyncio.fixture(scope="module", autouse=True) -async def weibo_scraper(): - weibo_scraper = await ScraperManager.init_weibo_scraper() - return weibo_scraper - - -async def get_item_from_url(weibo_scraper: WeiboScraper, url: str) -> dict: - data_processor = await 
weibo_scraper.get_processor_by_url(url) - item = await data_processor.get_item() - return item - - -async def get_test_data(weibo_scraper: WeiboScraper, case: str) -> Tuple[dict, dict]: - data = await get_item_from_url(weibo_scraper=weibo_scraper, url=weibo_cases[case]["url"]) - return data, weibo_cases[case]["expected"] - - -@pytest.mark.asyncio -async def test_pure_short_text(weibo_scraper: WeiboScraper): - data, expected = await get_test_data(weibo_scraper, "pure_short_text") - assert True diff --git a/tests/test_zhihu_content_processing.py b/tests/test_zhihu_content_processing.py deleted file mode 100644 index 4e773c1..0000000 --- a/tests/test_zhihu_content_processing.py +++ /dev/null @@ -1,58 +0,0 @@ -import sys -import os - -# Import content_processing directly to avoid pulling in the full zhihu scraper -# which has heavy dependencies (fastfetchbot_shared, httpx, etc.) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "apps", "api", "src", "services", "scrapers", "zhihu")) -from content_processing import ( - fix_images_and_links, - extract_references, - unmask_zhihu_links, -) - - -def test_fix_images_replaces_data_actualsrc(): - html = '' - result = fix_images_and_links(html) - assert 'src="https://real.jpg"' in result - assert "data-actualsrc" not in result - - -def test_fix_images_preserves_normal_src(): - html = '' - result = fix_images_and_links(html) - assert 'src="https://normal.jpg"' in result - - -def test_fix_images_removes_u_tags(): - html = "

Hello world

" - result = fix_images_and_links(html) - assert "" not in result - assert "world" in result - - -def test_extract_references_with_refs(): - html = '

Text[1]

' - result = extract_references(html) - assert "参考" in result - assert "Ref 1" in result - assert "https://example.com" in result - - -def test_extract_references_empty(): - html = "

No references here

" - result = extract_references(html) - assert result == "" - - -def test_unmask_zhihu_links(): - html = 'link' - result = unmask_zhihu_links(html) - assert "https://example.com" in result - assert "link.zhihu.com" not in result - - -def test_unmask_preserves_normal_links(): - html = 'link' - result = unmask_zhihu_links(html) - assert 'href="https://example.com"' in result diff --git a/packages/shared/tests/__init__.py b/tests/unit/__init__.py similarity index 100% rename from packages/shared/tests/__init__.py rename to tests/unit/__init__.py diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000..ed23746 --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,97 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from fastfetchbot_shared.models.url_metadata import UrlMetadata + + +@pytest.fixture +def make_url_metadata(): + """Factory fixture to create UrlMetadata instances.""" + + def _make(source="twitter", url="https://example.com", content_type=""): + return UrlMetadata(url=url, source=source, content_type=content_type) + + return _make + + +@pytest.fixture +def sample_metadata_item_dict(): + """Minimal valid metadata_item dict.""" + return { + "url": "https://example.com/post/1", + "telegraph_url": "", + "content": "

Test content

", + "text": "Test content", + "media_files": [], + "author": "testuser", + "title": "Test Title", + "author_url": "https://example.com/testuser", + "category": "twitter", + "message_type": "short", + } + + +@pytest.fixture(autouse=True) +def reset_scraper_manager(): + """Reset ScraperManager class-level state after each test.""" + yield + from fastfetchbot_shared.services.scrapers.scraper_manager import ScraperManager + + ScraperManager.bluesky_scraper = None + ScraperManager.weibo_scraper = None + ScraperManager.general_scraper = None + ScraperManager.scrapers = { + "bluesky": None, + "weibo": None, + "other": None, + "unknown": None, + } + + +@pytest.fixture +def mock_jinja2_env(): + """Patch JINJA2_ENV to return a mock template.""" + mock_template = MagicMock() + mock_template.render.return_value = "

rendered

" + mock_env = MagicMock() + mock_env.get_template.return_value = mock_template + with patch( + "fastfetchbot_shared.services.scrapers.config.JINJA2_ENV", mock_env + ) as m: + yield m + + +@pytest.fixture +def mock_get_response_json(): + """Patch network.get_response_json.""" + with patch( + "fastfetchbot_shared.utils.network.get_response_json", new_callable=AsyncMock + ) as m: + yield m + + +@pytest.fixture +def mock_get_selector(): + """Patch network.get_selector.""" + with patch( + "fastfetchbot_shared.utils.network.get_selector", new_callable=AsyncMock + ) as m: + yield m + + +@pytest.fixture +def mock_get_response(): + """Patch network.get_response.""" + with patch( + "fastfetchbot_shared.utils.network.get_response", new_callable=AsyncMock + ) as m: + yield m + + +@pytest.fixture +def mock_get_redirect_url(): + """Patch network.get_redirect_url.""" + with patch( + "fastfetchbot_shared.utils.network.get_redirect_url", new_callable=AsyncMock + ) as m: + yield m diff --git a/tests/routers/__init__.py b/tests/unit/scrapers/__init__.py similarity index 100% rename from tests/routers/__init__.py rename to tests/unit/scrapers/__init__.py diff --git a/tests/unit/scrapers/test_bluesky.py b/tests/unit/scrapers/test_bluesky.py new file mode 100644 index 0000000..a5cbf1c --- /dev/null +++ b/tests/unit/scrapers/test_bluesky.py @@ -0,0 +1,656 @@ +"""Unit tests for bluesky scraper: Bluesky dataclass, BlueskyPost, BlueskyDataProcessor, BlueskyScraper.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock +from dataclasses import dataclass + +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType + + +# --------------------------------------------------------------------------- +# Helpers – lightweight fakes for atproto types +# --------------------------------------------------------------------------- + +def _make_author(handle="alice.bsky.social", display_name="Alice", did="did:plc:abc123"): + author = MagicMock() + 
author.handle = handle + author.display_name = display_name + author.did = did + return author + + +def _make_post_view( + uri="at://did:plc:abc123/app.bsky.feed.post/rkey123", + text="Hello world", + author=None, + embed=None, + created_at="2024-01-01T00:00:00Z", +): + if author is None: + author = _make_author() + post = MagicMock() + post.uri = uri + post.author = author + post.record = MagicMock() + post.record.text = text + post.record.created_at = created_at + post.embed = embed + return post + + +def _make_thread(post=None, parent=None, replies=None): + thread = MagicMock() + thread.post = post or _make_post_view() + thread.parent = parent + thread.replies = replies + return thread + + +# --------------------------------------------------------------------------- +# Bluesky dataclass tests (bluesky/__init__.py) +# --------------------------------------------------------------------------- + +class TestBlueskyDataclass: + + def test_from_dict_basic(self): + """from_dict should populate cid/author_did from the dict.""" + obj = { + "url": "https://bsky.app/profile/alice/post/123", + "telegraph_url": "", + "content": "

hi

", + "text": "hi", + "media_files": [], + "author": "Alice", + "title": "Alice's Bluesky post", + "author_url": "https://bsky.app/profile/alice", + "category": "bluesky", + "message_type": "short", + "cid": "cidvalue", + "author_did": "did:plc:abc", + } + from fastfetchbot_shared.services.scrapers.bluesky import Bluesky + + item = Bluesky.from_dict(obj) + assert item.cid == "cidvalue" + assert item.author_did == "did:plc:abc" + assert item.url == "https://bsky.app/profile/alice/post/123" + assert item.author == "Alice" + + def test_to_dict_without_retweet(self): + from fastfetchbot_shared.services.scrapers.bluesky import Bluesky + + item = Bluesky( + url="https://bsky.app/profile/alice/post/123", + telegraph_url="", + content="

hi

", + text="hi", + media_files=[], + author="Alice", + title="Alice's Bluesky post", + author_url="https://bsky.app/profile/alice", + category="bluesky", + message_type=MessageType.SHORT, + cid="cidvalue", + author_did="did:plc:abc", + retweet_post=None, + ) + d = item.to_dict() + assert d["cid"] == "cidvalue" + assert d["author_did"] == "did:plc:abc" + assert "retweet_post" not in d + + def test_to_dict_with_retweet(self): + from fastfetchbot_shared.services.scrapers.bluesky import Bluesky + + retweet = Bluesky( + url="https://bsky.app/profile/bob/post/456", + telegraph_url="", + content="

retweet

", + text="retweet", + media_files=[], + author="Bob", + title="Bob's Bluesky post", + author_url="https://bsky.app/profile/bob", + category="bluesky", + message_type=MessageType.SHORT, + cid="cid2", + author_did="did:plc:bob", + retweet_post=None, + ) + item = Bluesky( + url="https://bsky.app/profile/alice/post/123", + telegraph_url="", + content="

hi

", + text="hi", + media_files=[], + author="Alice", + title="Alice's Bluesky post", + author_url="https://bsky.app/profile/alice", + category="bluesky", + message_type=MessageType.SHORT, + cid="cid1", + author_did="did:plc:alice", + retweet_post=retweet, + ) + d = item.to_dict() + assert "retweet_post" in d + assert d["retweet_post"]["cid"] == "cid2" + + +# --------------------------------------------------------------------------- +# Bluesky config tests +# --------------------------------------------------------------------------- + +class TestBlueskyConfig: + + def test_constants(self): + from fastfetchbot_shared.services.scrapers.bluesky.config import ( + BLUESKY_HOST, + BLUESKY_MAX_LENGTH, + ) + + assert BLUESKY_HOST == "https://bsky.app" + assert BLUESKY_MAX_LENGTH == 800 + + +# --------------------------------------------------------------------------- +# BlueskyPost tests +# --------------------------------------------------------------------------- + +class TestBlueskyPost: + + @patch("fastfetchbot_shared.services.scrapers.bluesky.scraper.BlueskyScraper") + def test_init_parses_url(self, mock_scraper_cls): + """BlueskyPost should parse handle, post_rkey, and resolve DID.""" + mock_resolver = MagicMock() + mock_resolver.handle.resolve.return_value = "did:plc:resolved" + mock_scraper_cls.id_resolver = mock_resolver + + # Patch at class level before import + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.BlueskyScraper.id_resolver", + mock_resolver, + ): + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyPost + + post = BlueskyPost("https://bsky.app/profile/alice.bsky.social/post/rkey123") + assert post.handle == "alice.bsky.social" + assert post.post_rkey == "rkey123" + assert post.bluesky_host == "bsky.app" + assert post.did == "did:plc:resolved" + + +# --------------------------------------------------------------------------- +# BlueskyDataProcessor tests +# 
--------------------------------------------------------------------------- + +class TestBlueskyDataProcessor: + + @pytest.fixture(autouse=True) + def _patch_templates(self): + mock_tpl = MagicMock() + mock_tpl.render.return_value = "

rendered

" + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.telegram_text_template", + mock_tpl, + ), patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.content_template", + mock_tpl, + ): + self.mock_tpl = mock_tpl + yield + + @pytest.fixture + def _patch_at_uri(self): + mock_at_uri = MagicMock() + mock_at_uri.rkey = "rkey123" + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AtUri" + ) as at_uri_cls: + at_uri_cls.from_str.return_value = mock_at_uri + yield at_uri_cls + + @pytest.mark.asyncio + async def test_get_item_short_text(self, _patch_at_uri): + """get_item should return dict with SHORT message_type for short text.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + post = _make_post_view(text="short") + thread = _make_thread(post=post, parent=None, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + + assert isinstance(result, dict) + assert result["category"] == "bluesky" + assert result["message_type"] == "short" + + @pytest.mark.asyncio + async def test_get_item_long_text(self, _patch_at_uri): + """Text longer than BLUESKY_MAX_LENGTH should set LONG message type.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + # The rendered template returns "

rendered

" which is short, + # but we need the combined text to exceed 800 chars. + # We mock template to return long text. + self.mock_tpl.render.return_value = "x" * 900 + + post = _make_post_view(text="x" * 900) + thread = _make_thread(post=post, parent=None, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + + assert result["message_type"] == "long" + + @pytest.mark.asyncio + async def test_resolve_thread_with_parent(self, _patch_at_uri): + """Parent posts should be collected recursively.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + grandparent_post = _make_post_view(text="grandparent") + grandparent_thread = _make_thread(post=grandparent_post, parent=None, replies=None) + + parent_post = _make_post_view(text="parent") + parent_thread = _make_thread(post=parent_post, parent=grandparent_thread, replies=None) + + base_post = _make_post_view(text="base") + thread = _make_thread(post=base_post, parent=parent_thread, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert isinstance(result, dict) + + @pytest.mark.asyncio + async def test_resolve_thread_with_replies_same_author(self, _patch_at_uri): + """Replies by the same author should be included in the combined text.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + author = _make_author(did="did:plc:abc123") + base_post = _make_post_view(text="base", author=author) + + reply_post = _make_post_view(text="reply", author=author) + reply_thread = _make_thread(post=reply_post) + + thread = _make_thread(post=base_post, parent=None, replies=[reply_thread]) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert isinstance(result, dict) + + @pytest.mark.asyncio + 
async def test_resolve_thread_with_replies_different_author(self, _patch_at_uri): + """Replies by a different author should be excluded.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + base_author = _make_author(did="did:plc:abc123") + other_author = _make_author(did="did:plc:other") + + base_post = _make_post_view(text="base", author=base_author) + reply_post = _make_post_view(text="other reply", author=other_author) + reply_thread = _make_thread(post=reply_post) + + thread = _make_thread(post=base_post, parent=None, replies=[reply_thread]) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert isinstance(result, dict) + + @pytest.mark.asyncio + async def test_resolve_single_post_with_images(self, _patch_at_uri): + """Posts with image embeds should have media_files populated.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + image_mock = MagicMock() + image_mock.fullsize = "https://cdn.bsky.app/img/feed/abc/image.jpg" + + # Use a simple namespace object instead of MagicMock to avoid __dict__ conflicts + class FakeEmbed: + def __init__(self): + self.images = [image_mock] + self.record = None + + embed = FakeEmbed() + + post = _make_post_view(text="photo post", embed=embed) + thread = _make_thread(post=post, parent=None, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert len(result["media_files"]) == 1 + assert result["media_files"][0]["media_type"] == "image" + + @pytest.mark.asyncio + async def test_resolve_single_post_with_retweet(self, _patch_at_uri): + """Posts with embed.record as ViewRecord should resolve retweet.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + from atproto_client.models.app.bsky.embed.record import ViewRecord + + # Use a 
simple namespace to avoid MagicMock __dict__ issues + class FakeEmbed: + def __init__(self): + self.images = [] + self.record = ViewRecord # identity check: `is ViewRecord` + + embed = FakeEmbed() + + post = _make_post_view(text="check this out", embed=embed) + thread = _make_thread(post=post, parent=None, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + # Mock _resolve_single_post_data entirely to avoid calling into ViewRecord as PostView + call_count = 0 + + async def side_effect(post_data): + nonlocal call_count + call_count += 1 + if call_count == 1: + return { + "url": "https://bsky.app/profile/alice/post/rkey123", + "title": "Alice's Bluesky post", + "author": "Alice", + "author_url": "https://bsky.app/profile/alice", + "text": "check this out", + "category": "bluesky", + "media_files": [], + "created_at": "2024-01-01T00:00:00Z", + "author_did": "did:plc:abc123", + "content": "

rendered

", + "retweet_post": { + "url": "https://bsky.app/profile/bob/post/456", + "title": "Bob's post", + "author": "Bob", + "author_url": "https://bsky.app/profile/bob", + "text": "original post", + "category": "bluesky", + "media_files": [], + "author_did": "did:plc:bob", + "content": "

original

", + }, + } + return { + "url": "https://bsky.app/profile/bob/post/456", + "title": "Bob's post", + "author": "Bob", + "author_url": "https://bsky.app/profile/bob", + "text": "original post", + "category": "bluesky", + "media_files": [], + "author_did": "did:plc:bob", + "content": "

original

", + } + + with patch.object( + BlueskyDataProcessor, + "_resolve_single_post_data", + side_effect=side_effect, + ): + result = await processor.get_item() + assert isinstance(result, dict) + + @pytest.mark.asyncio + async def test_resolve_single_post_retweet_branch_executed(self, _patch_at_uri): + """Directly test _resolve_single_post_data with embed.record is ViewRecord to cover lines 141-142.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + from atproto_client.models.app.bsky.embed.record import ViewRecord + + class FakeEmbed: + def __init__(self): + self.images = [] + self.record = ViewRecord # `is ViewRecord` will be True + + embed = FakeEmbed() + post = _make_post_view(text="quoting post", embed=embed) + + # Mock the recursive call to _resolve_single_post_data for the retweet + original_method = BlueskyDataProcessor._resolve_single_post_data + call_count = 0 + + async def patched_resolve(post_data): + nonlocal call_count + call_count += 1 + if call_count > 1: + # This is the recursive call for the retweet record + return { + "url": "https://bsky.app/profile/bob/post/456", + "title": "Bob's post", + "author": "Bob", + "author_url": "https://bsky.app/profile/bob", + "text": "retweeted content", + "category": "bluesky", + "media_files": [], + "author_did": "did:plc:bob", + "content": "

retweeted

", + "created_at": "2024-01-01", + } + return await original_method(post_data) + + with patch.object( + BlueskyDataProcessor, + "_resolve_single_post_data", + side_effect=patched_resolve, + ): + result = await BlueskyDataProcessor._resolve_single_post_data(post) + assert "retweet_post" in result + assert result["retweet_post"]["author"] == "Bob" + + @pytest.mark.asyncio + async def test_resolve_single_post_no_embed(self, _patch_at_uri): + """Post without embed should have empty media_files.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + post = _make_post_view(text="text only", embed=None) + thread = _make_thread(post=post, parent=None, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert result["media_files"] == [] + + @pytest.mark.asyncio + async def test_empty_parent_posts_data_list(self, _patch_at_uri): + """When parent exists but parent_posts_data is empty after collection, no text is prepended.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + # Create parent with a post + parent_post = _make_post_view(text="parent text") + parent_thread = _make_thread(post=parent_post, parent=None, replies=None) + + base_post = _make_post_view(text="base text") + thread = _make_thread(post=base_post, parent=parent_thread, replies=None) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert isinstance(result, dict) + + @pytest.mark.asyncio + async def test_empty_replies_posts_data_list(self, _patch_at_uri): + """When replies exist but none match author, replies data is empty.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyDataProcessor + + base_author = _make_author(did="did:plc:abc123") + other_author = _make_author(did="did:plc:other") + + base_post = 
_make_post_view(text="base", author=base_author) + reply_post = _make_post_view(text="different author reply", author=other_author) + reply_thread = _make_thread(post=reply_post) + + thread = _make_thread(post=base_post, parent=None, replies=[reply_thread]) + + processor = BlueskyDataProcessor("https://bsky.app/profile/alice/post/rkey123", thread) + result = await processor.get_item() + assert isinstance(result, dict) + + +# --------------------------------------------------------------------------- +# BlueskyScraper tests +# --------------------------------------------------------------------------- + +class TestBlueskyScraper: + + @pytest.fixture(autouse=True) + def _patch_deps(self): + """Patch atproto classes and templates at module level.""" + mock_tpl = MagicMock() + mock_tpl.render.return_value = "

rendered

" + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.telegram_text_template", + mock_tpl, + ), patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.content_template", + mock_tpl, + ): + yield + + @pytest.mark.asyncio + async def test_init_with_credentials(self): + """init() should call client.login when username and password are provided.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls: + mock_client = AsyncMock() + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper(username="user", password="pass") + await scraper.init() + mock_client.login.assert_awaited_once_with("user", "pass") + + @pytest.mark.asyncio + async def test_init_without_credentials(self): + """init() should not call login when credentials are missing.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls: + mock_client = AsyncMock() + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper() + await scraper.init() + mock_client.login.assert_not_awaited() + + @pytest.mark.asyncio + async def test_get_processor_by_url(self): + """get_processor_by_url should return a BlueskyDataProcessor.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import ( + BlueskyScraper, + BlueskyDataProcessor, + ) + + mock_resolver = MagicMock() + mock_resolver.handle.resolve.return_value = "did:plc:resolved" + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls, patch.object( + BlueskyScraper, "id_resolver", mock_resolver + ): + mock_client = AsyncMock() + mock_post_data = MagicMock() + mock_post_data.uri = "at://did:plc:resolved/app.bsky.feed.post/rkey123" + mock_client.get_post.return_value = 
mock_post_data + + mock_thread_data = MagicMock() + mock_thread_data.thread = _make_thread() + mock_client.get_post_thread.return_value = mock_thread_data + + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper() + processor = await scraper.get_processor_by_url( + "https://bsky.app/profile/alice.bsky.social/post/rkey123" + ) + assert isinstance(processor, BlueskyDataProcessor) + + @pytest.mark.asyncio + async def test_request_post_data_uses_did_when_available(self): + """_request_post_data should use did as profile_identify when available.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper + + mock_resolver = MagicMock() + mock_resolver.handle.resolve.return_value = "did:plc:resolved" + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls, patch.object( + BlueskyScraper, "id_resolver", mock_resolver + ): + mock_client = AsyncMock() + mock_post_data = MagicMock() + mock_post_data.uri = "at://did:plc:resolved/app.bsky.feed.post/rkey123" + mock_client.get_post.return_value = mock_post_data + + mock_thread_response = MagicMock() + mock_thread_response.thread = _make_thread() + mock_client.get_post_thread.return_value = mock_thread_response + + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper() + + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyPost + + bluesky_post = MagicMock(spec=BlueskyPost) + bluesky_post.did = "did:plc:resolved" + bluesky_post.handle = "alice.bsky.social" + bluesky_post.post_rkey = "rkey123" + + result = await scraper._request_post_data(bluesky_post) + mock_client.get_post.assert_awaited_once_with( + profile_identify="did:plc:resolved", post_rkey="rkey123" + ) + + @pytest.mark.asyncio + async def test_request_post_data_uses_handle_when_no_did(self): + """_request_post_data should fall back to handle when did is empty.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import 
BlueskyScraper + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls: + mock_client = AsyncMock() + mock_post_data = MagicMock() + mock_post_data.uri = "at://did:plc:resolved/app.bsky.feed.post/rkey123" + mock_client.get_post.return_value = mock_post_data + + mock_thread_response = MagicMock() + mock_thread_response.thread = _make_thread() + mock_client.get_post_thread.return_value = mock_thread_response + + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper() + + bluesky_post = MagicMock() + bluesky_post.did = "" # falsy + bluesky_post.handle = "alice.bsky.social" + bluesky_post.post_rkey = "rkey123" + + result = await scraper._request_post_data(bluesky_post) + mock_client.get_post.assert_awaited_once_with( + profile_identify="alice.bsky.social", post_rkey="rkey123" + ) + + @pytest.mark.asyncio + async def test_request_post_data_exception_handling(self): + """_request_post_data should log error and return None on exception.""" + from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper + + with patch( + "fastfetchbot_shared.services.scrapers.bluesky.scraper.AsyncClient" + ) as mock_client_cls: + mock_client = AsyncMock() + mock_client.get_post.side_effect = Exception("network error") + mock_client_cls.return_value = mock_client + + scraper = BlueskyScraper() + + bluesky_post = MagicMock() + bluesky_post.did = "did:plc:abc" + bluesky_post.handle = "alice" + bluesky_post.post_rkey = "rkey123" + + result = await scraper._request_post_data(bluesky_post) + assert result is None diff --git a/tests/unit/scrapers/test_common.py b/tests/unit/scrapers/test_common.py new file mode 100644 index 0000000..decfb26 --- /dev/null +++ b/tests/unit/scrapers/test_common.py @@ -0,0 +1,217 @@ +"""Tests for packages/shared/fastfetchbot_shared/services/scrapers/common.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from 
fastfetchbot_shared.models.url_metadata import UrlMetadata +from fastfetchbot_shared.services.scrapers.common import InfoExtractService + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + +class TestInfoExtractServiceInit: + def test_init_sets_all_fields(self, make_url_metadata): + url_metadata = make_url_metadata( + source="twitter", + url="https://twitter.com/user/status/123", + content_type="post", + ) + svc = InfoExtractService( + url_metadata=url_metadata, + data={"key": "val"}, + store_database=True, + store_telegraph=False, + store_document=True, + extra_kwarg="extra", + ) + assert svc.url == "https://twitter.com/user/status/123" + assert svc.content_type == "post" + assert svc.source == "twitter" + assert svc.data == {"key": "val"} + assert svc.store_database is True + assert svc.store_telegraph is False + assert svc.store_document is True + assert svc.kwargs == {"extra_kwarg": "extra"} + + def test_init_defaults(self, make_url_metadata): + url_metadata = make_url_metadata() + svc = InfoExtractService(url_metadata=url_metadata) + assert svc.data is None + assert svc.store_database is False + assert svc.store_telegraph is True + assert svc.store_document is False + assert svc.kwargs == {} + + +# --------------------------------------------------------------------------- +# category property +# --------------------------------------------------------------------------- + +class TestCategory: + def test_category_returns_source(self, make_url_metadata): + url_metadata = make_url_metadata(source="reddit") + svc = InfoExtractService(url_metadata=url_metadata) + assert svc.category == "reddit" + + +# --------------------------------------------------------------------------- +# get_item with pre-existing metadata_item (skips scraping) +# --------------------------------------------------------------------------- + +class 
TestGetItemWithExistingMetadata: + @pytest.mark.asyncio + async def test_get_item_with_metadata_skips_scraping( + self, make_url_metadata, sample_metadata_item_dict + ): + svc = InfoExtractService(url_metadata=make_url_metadata()) + result = await svc.get_item(metadata_item=sample_metadata_item_dict) + assert result["title"] == "Test Title" + + @pytest.mark.asyncio + async def test_get_item_with_metadata_strips_title(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata()) + item = {"title": " padded title ", "url": "https://example.com"} + result = await svc.get_item(metadata_item=item) + assert result["title"] == "padded title" + + +# --------------------------------------------------------------------------- +# get_item with category in service_classes (e.g. "twitter") +# --------------------------------------------------------------------------- + +class TestGetItemServiceClasses: + @pytest.mark.asyncio + async def test_get_item_twitter_category(self, make_url_metadata): + mock_scraper_instance = MagicMock() + mock_scraper_instance.get_item = AsyncMock( + return_value={"title": " Twitter Post ", "content": "hello"} + ) + mock_scraper_class = MagicMock(return_value=mock_scraper_instance) + + svc = InfoExtractService( + url_metadata=make_url_metadata(source="twitter", url="https://twitter.com/x/1"), + data={"some": "data"}, + ) + + with patch.dict(svc.service_classes, {"twitter": mock_scraper_class}): + result = await svc.get_item() + + mock_scraper_class.assert_called_once_with( + url="https://twitter.com/x/1", data={"some": "data"} + ) + mock_scraper_instance.get_item.assert_awaited_once() + assert result["title"] == "Twitter Post" + + @pytest.mark.asyncio + async def test_get_item_zhihu_category(self, make_url_metadata): + mock_scraper_instance = MagicMock() + mock_scraper_instance.get_item = AsyncMock( + return_value={"title": "Zhihu Answer", "content": "answer"} + ) + mock_scraper_class = 
MagicMock(return_value=mock_scraper_instance) + + svc = InfoExtractService( + url_metadata=make_url_metadata(source="zhihu"), + ) + + with patch.dict(svc.service_classes, {"zhihu": mock_scraper_class}): + result = await svc.get_item() + + assert result["title"] == "Zhihu Answer" + + +# --------------------------------------------------------------------------- +# get_item with ScraperManager categories +# --------------------------------------------------------------------------- + +class TestGetItemScraperManager: + @pytest.mark.asyncio + @pytest.mark.parametrize("category", ["bluesky", "weibo", "other", "unknown"]) + async def test_get_item_scraper_manager_categories( + self, make_url_metadata, category + ): + mock_processor = MagicMock() + mock_processor.get_item = AsyncMock( + return_value={"title": f" {category} item "} + ) + + mock_scraper = MagicMock() + mock_scraper.get_processor_by_url = AsyncMock(return_value=mock_processor) + + with patch( + "fastfetchbot_shared.services.scrapers.common.ScraperManager" + ) as MockSM: + MockSM.init_scraper = AsyncMock() + MockSM.scrapers = {category: mock_scraper} + + svc = InfoExtractService( + url_metadata=make_url_metadata( + source=category, url="https://example.com/post" + ), + ) + result = await svc.get_item() + + MockSM.init_scraper.assert_awaited_once_with(category) + mock_scraper.get_processor_by_url.assert_awaited_once_with( + url="https://example.com/post" + ) + mock_processor.get_item.assert_awaited_once() + assert result["title"] == f"{category} item" + + +# --------------------------------------------------------------------------- +# get_item exception re-raise +# --------------------------------------------------------------------------- + +class TestGetItemException: + @pytest.mark.asyncio + async def test_get_item_exception_reraises(self, make_url_metadata): + mock_scraper_instance = MagicMock() + mock_scraper_instance.get_item = AsyncMock( + side_effect=RuntimeError("scraper failed") + ) + 
mock_scraper_class = MagicMock(return_value=mock_scraper_instance) + + svc = InfoExtractService( + url_metadata=make_url_metadata(source="twitter"), + ) + + with patch.dict(svc.service_classes, {"twitter": mock_scraper_class}): + with pytest.raises(RuntimeError, match="scraper failed"): + await svc.get_item() + + @pytest.mark.asyncio + async def test_get_item_scraper_manager_exception_reraises(self, make_url_metadata): + with patch( + "fastfetchbot_shared.services.scrapers.common.ScraperManager" + ) as MockSM: + MockSM.init_scraper = AsyncMock( + side_effect=ValueError("init failed") + ) + + svc = InfoExtractService( + url_metadata=make_url_metadata(source="bluesky"), + ) + with pytest.raises(ValueError, match="init failed"): + await svc.get_item() + + +# --------------------------------------------------------------------------- +# process_item +# --------------------------------------------------------------------------- + +class TestProcessItem: + @pytest.mark.asyncio + async def test_process_item_strips_title(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata()) + result = await svc.process_item({"title": " hello world "}) + assert result["title"] == "hello world" + + @pytest.mark.asyncio + async def test_process_item_no_strip_needed(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata()) + result = await svc.process_item({"title": "clean"}) + assert result["title"] == "clean" diff --git a/tests/unit/scrapers/test_douban.py b/tests/unit/scrapers/test_douban.py new file mode 100644 index 0000000..0af01ec --- /dev/null +++ b/tests/unit/scrapers/test_douban.py @@ -0,0 +1,662 @@ +"""Unit tests for douban scraper: DoubanType enum, Douban class with all methods.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch, call +from lxml import etree + +from fastfetchbot_shared.models.metadata_item import MessageType + + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_selector_with_xpaths(xpath_map: dict): + """Create a mock lxml selector that responds to xpath() calls.""" + selector = MagicMock() + + def xpath_side_effect(expr): + for key, val in xpath_map.items(): + if key in expr: + return val + return "" + + selector.xpath = MagicMock(side_effect=xpath_side_effect) + return selector + + +def _make_html_element(html_str: str): + """Create a real lxml element from HTML string for tostring calls.""" + tree = etree.HTML(html_str) + return tree + + +@pytest.fixture(autouse=True) +def _patch_douban_templates(): + mock_tpl = MagicMock() + mock_tpl.render.return_value = "

rendered

" + with patch( + "fastfetchbot_shared.services.scrapers.douban.short_text_template", mock_tpl + ), patch( + "fastfetchbot_shared.services.scrapers.douban.content_template", mock_tpl + ): + yield mock_tpl + + +@pytest.fixture +def _patch_get_selector(): + with patch( + "fastfetchbot_shared.services.scrapers.douban.get_selector", + new_callable=AsyncMock, + ) as m: + yield m + + +# --------------------------------------------------------------------------- +# DoubanType enum tests +# --------------------------------------------------------------------------- + +class TestDoubanType: + + def test_enum_values(self): + from fastfetchbot_shared.services.scrapers.douban import DoubanType + + assert DoubanType.MOVIE_REVIEW == "movie_review" + assert DoubanType.BOOK_REVIEW == "book_review" + assert DoubanType.NOTE == "note" + assert DoubanType.STATUS == "status" + assert DoubanType.GROUP == "group" + assert DoubanType.UNKNOWN == "unknown" + + def test_enum_is_string(self): + from fastfetchbot_shared.services.scrapers.douban import DoubanType + + assert isinstance(DoubanType.MOVIE_REVIEW, str) + + +# --------------------------------------------------------------------------- +# Douban.__init__ tests +# --------------------------------------------------------------------------- + +class TestDoubanInit: + + def test_default_fields(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/note/12345/") + assert d.url == "https://www.douban.com/note/12345/" + assert d.title == "" + assert d.author == "" + assert d.author_url == "" + assert d.text == "" + assert d.content == "" + assert d.media_files == [] + assert d.category == "douban" + assert d.message_type == MessageType.SHORT + assert d.item_title is None + assert d.item_url is None + assert d.group_name is None + assert d.group_url is None + assert d.douban_type == DoubanType.UNKNOWN + assert d.text_group is None + assert d.raw_content is None + assert 
d.date is None + + def test_cookie_passed_to_headers(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/", cookie="session=abc") + assert d.headers["Cookie"] == "session=abc" + + def test_no_cookie(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/") + assert d.headers["Cookie"] == "" + + +# --------------------------------------------------------------------------- +# check_douban_type tests +# --------------------------------------------------------------------------- + +class TestCheckDoubanType: + + def test_note_type(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/note/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.NOTE + + def test_status_type_with_status_path(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/status/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.STATUS + + def test_status_type_with_people_status_path(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/people/12345/status/67890") + d.check_douban_type() + assert d.douban_type == DoubanType.STATUS + + def test_group_type(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/group/topic/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.GROUP + + def test_movie_review_direct(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://movie.douban.com/review/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.MOVIE_REVIEW + + def test_book_review_direct(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + 
+ d = Douban("https://book.douban.com/review/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.BOOK_REVIEW + + def test_m_douban_movie_review(self): + """m.douban.com with /movie/review path should map to MOVIE_REVIEW.""" + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://m.douban.com/movie/review/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.MOVIE_REVIEW + # URL should be rewritten to desktop domain + assert "movie.douban.com" in d.url + assert "/review/12345/" in d.url + + def test_m_douban_book_review(self): + """m.douban.com with /book/review path should map to BOOK_REVIEW.""" + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://m.douban.com/book/review/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.BOOK_REVIEW + assert "book.douban.com" in d.url + + def test_m_douban_note(self): + """m.douban.com with /note/ path should map to NOTE.""" + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://m.douban.com/note/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.NOTE + + def test_unknown_type(self): + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://www.douban.com/people/12345/") + d.check_douban_type() + assert d.douban_type == DoubanType.UNKNOWN + + def test_url_rewritten(self): + """URL should be rewritten to https://{host}{path} format.""" + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/?query=1") + d.check_douban_type() + assert d.url == "https://www.douban.com/note/12345/" + + def test_m_douban_non_review(self): + """m.douban.com with non-review path should still rewrite host.""" + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + d = Douban("https://m.douban.com/group/topic/12345/") 
+ d.check_douban_type() + assert d.douban_type == DoubanType.GROUP + assert "douban.com" in d.url + + +# --------------------------------------------------------------------------- +# get_douban_item tests +# --------------------------------------------------------------------------- + +class TestGetDoubanItem: + + @pytest.mark.asyncio + async def test_get_item_returns_dict(self, _patch_get_selector, _patch_douban_templates): + from fastfetchbot_shared.services.scrapers.douban import Douban + + # Build a real lxml tree for xpath calls + html = """ + +

Test Note

+
Author
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + _patch_douban_templates.render.return_value = "short" + + d = Douban("https://www.douban.com/note/12345/") + result = await d.get_item() + + assert isinstance(result, dict) + assert result["category"] == "douban" + + @pytest.mark.asyncio + async def test_get_douban_item_long_content(self, _patch_get_selector, _patch_douban_templates): + """When content exceeds SHORT_LIMIT, message_type should be LONG.""" + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

Test Note

+
Author
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + # Make wrap_text_into_html return long content + with patch( + "fastfetchbot_shared.services.scrapers.douban.wrap_text_into_html", + return_value="x" * 700, + ), patch( + "fastfetchbot_shared.services.scrapers.douban.get_html_text_length", + return_value=700, + ): + d = Douban("https://www.douban.com/note/12345/") + await d.get_douban() + assert d.message_type == MessageType.LONG + + @pytest.mark.asyncio + async def test_get_douban_item_short_content(self, _patch_get_selector, _patch_douban_templates): + """When content is within SHORT_LIMIT, message_type should be SHORT.""" + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

Test Note

+
Author
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + with patch( + "fastfetchbot_shared.services.scrapers.douban.wrap_text_into_html", + return_value="

Short

", + ), patch( + "fastfetchbot_shared.services.scrapers.douban.get_html_text_length", + return_value=5, + ): + d = Douban("https://www.douban.com/note/12345/") + await d.get_douban() + assert d.message_type == MessageType.SHORT + + @pytest.mark.asyncio + async def test_short_text_ending_with_newline_stripped(self, _patch_get_selector, _patch_douban_templates): + """If short_text ends with newline, it should be stripped.""" + from fastfetchbot_shared.services.scrapers.douban import Douban, DoubanType + + html = """ + +

Test Note

+
Author
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/note/12345/") + d.douban_type = DoubanType.NOTE + # Patch _douban_short_text_process to return text ending with \n + with patch.object(d, "_douban_short_text_process", return_value="text\n"): + await d.get_douban_item() + # The template should receive short_text without trailing newline + call_args = _patch_douban_templates.render.call_args_list + # Find the call where short_text was passed + found = False + for c in call_args: + if c.kwargs.get("data", {}).get("short_text") == "text": + found = True + break + if c.args and isinstance(c.args[0], dict) and c.args[0].get("short_text") == "text": + found = True + break + + +# --------------------------------------------------------------------------- +# _get_douban_movie_review tests +# --------------------------------------------------------------------------- + +class TestGetDoubanMovieReview: + + @pytest.mark.asyncio + async def test_movie_review_fields(self, _patch_get_selector): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

Movie Review Title

+
+ Author Link + ReviewAuthor + Movie Name +
+
Review body text
+ + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://movie.douban.com/review/12345/") + d.check_douban_type() + await d._get_douban_movie_review() + + assert d.title == "Movie Review Title" + assert d.raw_content is not None + + +# --------------------------------------------------------------------------- +# _get_douban_book_review tests +# --------------------------------------------------------------------------- + +class TestGetDoubanBookReview: + + @pytest.mark.asyncio + async def test_book_review_fields(self, _patch_get_selector): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

Book Review Title

+
+ Author + BookReviewAuthor + Book Name +
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://book.douban.com/review/12345/") + d.check_douban_type() + await d._get_douban_book_review() + + assert d.title == "Book Review Title" + assert d.raw_content is not None + + +# --------------------------------------------------------------------------- +# _get_douban_note tests +# --------------------------------------------------------------------------- + +class TestGetDoubanNote: + + @pytest.mark.asyncio + async def test_note_fields(self, _patch_get_selector): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

My Note Title

+
NoteAuthor
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/note/12345/") + d.check_douban_type() + await d._get_douban_note() + + assert d.title == "My Note Title" + assert d.author == "NoteAuthor" + assert d.raw_content is not None + + +# --------------------------------------------------------------------------- +# _get_douban_status tests +# --------------------------------------------------------------------------- + +class TestGetDoubanStatus: + + @pytest.mark.asyncio + async def test_status_fields(self, _patch_get_selector): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +
StatusAuthor
+
Status text here
+ + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/status/12345/") + d.check_douban_type() + await d._get_douban_status() + + assert d.author == "StatusAuthor" + assert d.title == "StatusAuthor\u7684\u5e7f\u64ad" # "StatusAuthor的广播" + assert "blockquote" not in d.raw_content + + @pytest.mark.asyncio + async def test_status_replaces_special_chars(self, _patch_get_selector): + """Status should replace blockquote tags, >+<, and .""" + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +
Author
+
Text More
+ + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/status/12345/") + d.check_douban_type() + await d._get_douban_status() + + assert "
" not in d.raw_content + assert "
" not in d.raw_content + + +# --------------------------------------------------------------------------- +# _get_douban_group_article tests +# --------------------------------------------------------------------------- + +class TestGetDoubanGroupArticle: + + @pytest.mark.asyncio + async def test_group_article_fields(self, _patch_get_selector): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

+ Group Article Title +

+ GroupAuthor +
+ + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/group/topic/12345/") + d.check_douban_type() + await d._get_douban_group_article() + + assert d.title == "Group Article Title" + assert d.author == "GroupAuthor" + assert d.group_name == "Test Group" + assert d.raw_content is not None + + +# --------------------------------------------------------------------------- +# _douban_short_text_process tests +# --------------------------------------------------------------------------- + +class TestDoubanShortTextProcess: + + def test_images_extracted_to_media_files(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/") + d.raw_content = '

Text

' + result = d._douban_short_text_process() + + assert len(d.media_files) == 1 + assert d.media_files[0].url == "https://img.douban.com/pic.jpg" + assert "img" not in result + + def test_p_span_div_unwrapped(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/") + d.raw_content = "

inner text

" + result = d._douban_short_text_process() + + assert "

" not in result + assert "" not in result + assert "

" not in result + + def test_link_and_script_decomposed(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + d = Douban("https://www.douban.com/note/12345/") + d.raw_content = '

Text

' + result = d._douban_short_text_process() + + assert "Hello world

" + + def test_multiple_paragraphs(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + result = Douban.raw_content_to_html("Para 1
\nPara 2
\nPara 3") + assert "

Para 1

" in result + assert "

Para 2

" in result + assert "

Para 3

" in result + + def test_strips_whitespace(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + result = Douban.raw_content_to_html(" Hello
\n World ") + assert "

Hello

" in result + assert "

World

" in result + + def test_empty_string(self): + from fastfetchbot_shared.services.scrapers.douban import Douban + + result = Douban.raw_content_to_html("") + assert result == "

" + + def test_no_br_newline_separator(self): + """Text without
\\n should be a single paragraph.""" + from fastfetchbot_shared.services.scrapers.douban import Douban + + result = Douban.raw_content_to_html("Just a single line") + assert result == "

Just a single line

" + + +# --------------------------------------------------------------------------- +# get_douban (integration of check_douban_type + get_douban_item) +# --------------------------------------------------------------------------- + +class TestGetDouban: + + @pytest.mark.asyncio + async def test_get_douban_note_full_flow(self, _patch_get_selector, _patch_douban_templates): + from fastfetchbot_shared.services.scrapers.douban import Douban + + html = """ + +

Full Flow Note

+ + + + """ + selector = etree.HTML(html) + _patch_get_selector.return_value = selector + + d = Douban("https://www.douban.com/note/12345/") + await d.get_douban() + + assert d.title == "Full Flow Note" + assert d.author == "Author" + assert d.text is not None + assert d.content is not None diff --git a/tests/unit/scrapers/test_general_base.py b/tests/unit/scrapers/test_general_base.py new file mode 100644 index 0000000..e7d824a --- /dev/null +++ b/tests/unit/scrapers/test_general_base.py @@ -0,0 +1,473 @@ +"""Tests for packages/shared/fastfetchbot_shared/services/scrapers/general/base.py""" + +import hashlib +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.models.metadata_item import MessageType + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + + +class TestConstants: + def test_general_text_limit(self): + from fastfetchbot_shared.services.scrapers.general.base import GENERAL_TEXT_LIMIT + assert GENERAL_TEXT_LIMIT == 800 + + def test_default_openai_model(self): + from fastfetchbot_shared.services.scrapers.general.base import DEFAULT_OPENAI_MODEL + assert DEFAULT_OPENAI_MODEL == "gpt-5-nano" + + +# --------------------------------------------------------------------------- +# BaseGeneralScraper (abstract – just verify it cannot be instantiated) +# --------------------------------------------------------------------------- + + +class TestBaseGeneralScraper: + def test_has_abstract_method(self): + """BaseGeneralScraper declares get_processor_by_url as abstract.""" + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralScraper + assert hasattr(BaseGeneralScraper, "get_processor_by_url") + assert getattr( + BaseGeneralScraper.get_processor_by_url, "__isabstractmethod__", False + ) + + @pytest.mark.asyncio + async def test_abstract_get_processor_by_url_pass(self): + 
"""Execute the abstract pass body for coverage.""" + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralScraper + + class ConcreteScraper(BaseGeneralScraper): + async def get_processor_by_url(self, url): + return await super().get_processor_by_url(url) + + s = ConcreteScraper() + result = await s.get_processor_by_url("https://example.com") + assert result is None + + +# --------------------------------------------------------------------------- +# BaseGeneralDataProcessor +# --------------------------------------------------------------------------- + + +class _ConcreteProcessor: + """Minimal concrete subclass for testing the base class logic.""" + _get_page_called = False + + async def _get_page_content(self): + self._get_page_called = True + + +def _make_processor(url="https://example.com/page"): + """Create a concrete processor that inherits BaseGeneralDataProcessor.""" + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + + class ConcreteProcessor(BaseGeneralDataProcessor): + _get_page_called = False + + async def _get_page_content(self): + self._get_page_called = True + + return ConcreteProcessor(url) + + +class TestBaseGeneralDataProcessorInit: + def test_init_sets_fields(self): + url = "https://example.com/page" + proc = _make_processor(url) + assert proc.url == url + assert proc._data == {} + assert proc.url_parser.netloc == "example.com" + expected_id = hashlib.md5(url.encode()).hexdigest()[:16] + assert proc.id == expected_id + assert proc.scraper_type == "base" + + +class TestBaseGeneralDataProcessorGetItem: + @pytest.mark.asyncio + async def test_get_item_calls_process_data(self): + proc = _make_processor("https://example.com/page") + # Populate _data so GeneralItem.from_dict works + proc._data = { + "id": "abc", + "category": "other", + "url": "https://example.com/page", + "title": "Title", + "author": "Author", + "author_url": "https://example.com", + "text": "hello", + "content": "

hello

", + "raw_content": "hello", + "media_files": [], + "message_type": "short", + "telegraph_url": "", + "scraper_type": "base", + } + # Override process_data to avoid real scraping + proc.process_data = AsyncMock() + result = await proc.get_item() + proc.process_data.assert_awaited_once() + assert isinstance(result, dict) + assert result["title"] == "Title" + + @pytest.mark.asyncio + async def test_process_data_calls_get_page_content(self): + proc = _make_processor("https://example.com/page") + await proc.process_data() + assert proc._get_page_called + + @pytest.mark.asyncio + async def test_abstract_get_page_content_pass(self): + """Execute the abstract _get_page_content pass body for coverage.""" + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + + class DirectProcessor(BaseGeneralDataProcessor): + async def _get_page_content(self): + await super()._get_page_content() + + proc = DirectProcessor("https://example.com/page") + await proc._get_page_content() # should just pass + + +# --------------------------------------------------------------------------- +# _build_item_data +# --------------------------------------------------------------------------- + + +class TestBuildItemData: + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.base.BaseGeneralDataProcessor.parsing_article_body_by_llm", + new_callable=AsyncMock, + return_value="

cleaned

", + ) + async def test_with_html_content_and_og_image(self, mock_llm): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="My Title", + author="Author", + description="desc", + markdown_content="md content", + html_content="

raw html

", + og_image="https://img.example.com/pic.jpg", + ) + data = proc._data + assert data["title"] == "My Title" + assert data["author"] == "Author" + assert data["author_url"] == "https://example.com" + assert data["text"] == "desc" + assert len(data["media_files"]) == 1 + assert data["media_files"][0]["url"] == "https://img.example.com/pic.jpg" + mock_llm.assert_awaited_once() + + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.base.BaseGeneralDataProcessor.parsing_article_body_by_llm", + new_callable=AsyncMock, + return_value="

cleaned

", + ) + async def test_without_og_image(self, mock_llm): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="Title", + author="A", + description="d", + markdown_content="md", + html_content="

html

", + og_image=None, + ) + assert proc._data["media_files"] == [] + + @pytest.mark.asyncio + async def test_empty_title_and_author_fallback(self): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="", + author="", + description="", + markdown_content="md", + html_content="", + og_image=None, + ) + data = proc._data + assert data["title"] == "https://example.com/page" + assert data["author"] == "example.com" + + @pytest.mark.asyncio + async def test_no_html_content_uses_markdown(self): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="T", + author="A", + description="", + markdown_content="some markdown", + html_content="", + og_image=None, + ) + data = proc._data + # wrap_text_into_html wraps non-html text into

tags + assert "

" in data["content"] + + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.base.BaseGeneralDataProcessor.parsing_article_body_by_llm", + new_callable=AsyncMock, + return_value="

c

", + ) + async def test_long_message_type(self, mock_llm): + proc = _make_processor("https://example.com/page") + long_html = "

" + "x" * 1000 + "

" + await proc._build_item_data( + title="T", + author="A", + description="d", + markdown_content="", + html_content=long_html, + og_image=None, + ) + # The LLM mock returns short content so message_type is SHORT + assert proc._data["message_type"] == MessageType.SHORT + + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.base.BaseGeneralDataProcessor.parsing_article_body_by_llm", + new_callable=AsyncMock, + ) + async def test_long_message_type_actual_long(self, mock_llm): + long_text = "x" * 1000 + mock_llm.return_value = f"

{long_text}

" + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="T", + author="A", + description="d", + markdown_content="", + html_content=f"

{long_text}

", + og_image=None, + ) + assert proc._data["message_type"] == MessageType.LONG + + @pytest.mark.asyncio + async def test_description_fallback_to_markdown_prefix(self): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="T", + author="A", + description="", + markdown_content="short md text", + html_content="", + og_image=None, + ) + assert proc._data["text"] == "short md text" + + @pytest.mark.asyncio + async def test_description_strips_html_tags(self): + proc = _make_processor("https://example.com/page") + await proc._build_item_data( + title="T", + author="A", + description="bold text", + markdown_content="", + html_content="", + og_image=None, + ) + assert proc._data["text"] == "bold text" + + +# --------------------------------------------------------------------------- +# sanitize_html +# --------------------------------------------------------------------------- + + +class TestSanitizeHtml: + def test_empty_string(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + assert BaseGeneralDataProcessor.sanitize_html("") == "" + + def test_none_returns_none(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + assert BaseGeneralDataProcessor.sanitize_html(None) is None + + def test_removes_doctype(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = "

Hello

" + result = BaseGeneralDataProcessor.sanitize_html(html) + assert "DOCTYPE" not in result + assert "

Hello

" in result + + def test_removes_script_style(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = "

text

" + result = BaseGeneralDataProcessor.sanitize_html(html) + assert "script" not in result + assert "style" not in result + assert "

text

" in result + + def test_removes_head_meta_link_noscript_iframe_svg_form_input_button(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = ( + "t" + "" + "" + "" + "" + "" + "
" + "

keep

" + ) + result = BaseGeneralDataProcessor.sanitize_html(html) + assert "

keep

" in result + for tag in ["head", "meta", "link", "noscript", "iframe", "svg", "form", "input", "button"]: + assert f"<{tag}" not in result + + def test_unwraps_structural_tags(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = "
text
" + result = BaseGeneralDataProcessor.sanitize_html(html) + assert "text" in result + for tag in ["html", "body", "div", "span"]: + assert f"<{tag}" not in result + + def test_unwraps_semantic_layout_tags(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = ( + "
h
" + "
f
m
" + "
fc
" + "
s
" + "
term
def
" + ) + result = BaseGeneralDataProcessor.sanitize_html(html) + for tag in ["section", "article", "nav", "header", "footer", "main", + "aside", "figure", "figcaption", "details", "summary", + "dl", "dt", "dd"]: + assert f"<{tag}" not in result + # Text content preserved + for text in ["n", "h", "f", "m", "a", "fc", "s", "term", "def"]: + assert text in result + + def test_preserves_content_tags(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + html = ( + "

para

h

linkbold" + "strongitalicemunderline" + "
  • item
  1. num
" + "
quote
code
" + "
" + "
h
d
" + ) + result = BaseGeneralDataProcessor.sanitize_html(html) + for tag in ["p", "h1", "a", "b", "strong", "i", "em", "u", + "ul", "ol", "li", "blockquote", "pre", "code", + "img", "br", "table", "thead", "tbody", "tr", "th", "td"]: + assert f"<{tag}" in result + + +# --------------------------------------------------------------------------- +# parsing_article_body_by_llm +# --------------------------------------------------------------------------- + + +class TestParsingArticleBodyByLlm: + @pytest.mark.asyncio + async def test_empty_input(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("") + assert result == "" + + @pytest.mark.asyncio + async def test_none_input(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm(None) + assert result is None + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", None) + async def test_no_api_key(self): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("

html

") + assert result == "

html

" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") + async def test_success(self, mock_openai_cls): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + mock_client = AsyncMock() + mock_openai_cls.return_value = mock_client + mock_choice = MagicMock() + mock_choice.message.content = "

extracted

" + mock_client.chat.completions.create.return_value = MagicMock(choices=[mock_choice]) + + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("

raw

") + assert result == "

extracted

" + mock_openai_cls.assert_called_once_with(api_key="sk-test") + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") + async def test_empty_response(self, mock_openai_cls): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + mock_client = AsyncMock() + mock_openai_cls.return_value = mock_client + mock_choice = MagicMock() + mock_choice.message.content = None + mock_client.chat.completions.create.return_value = MagicMock(choices=[mock_choice]) + + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("

raw

") + assert result == "

raw

" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") + async def test_exception(self, mock_openai_cls): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + mock_client = AsyncMock() + mock_openai_cls.return_value = mock_client + mock_client.chat.completions.create.side_effect = RuntimeError("boom") + + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("

raw

") + assert result == "

raw

" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") + async def test_truncates_long_content(self, mock_openai_cls): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + mock_client = AsyncMock() + mock_openai_cls.return_value = mock_client + mock_choice = MagicMock() + mock_choice.message.content = "

ok

" + mock_client.chat.completions.create.return_value = MagicMock(choices=[mock_choice]) + + long_html = "x" * 60000 + result = await BaseGeneralDataProcessor.parsing_article_body_by_llm(long_html) + assert result == "

ok

" + # Verify the content sent to OpenAI was truncated + call_args = mock_client.chat.completions.create.call_args + user_msg = call_args.kwargs["messages"][1]["content"] + # The user message includes the prefix + truncated content + assert len(user_msg) < 60000 + 200 + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") + async def test_short_content_not_truncated(self, mock_openai_cls): + from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor + mock_client = AsyncMock() + mock_openai_cls.return_value = mock_client + mock_choice = MagicMock() + mock_choice.message.content = "

ok

" + mock_client.chat.completions.create.return_value = MagicMock(choices=[mock_choice]) + + short_html = "

short

" + await BaseGeneralDataProcessor.parsing_article_body_by_llm(short_html) + call_args = mock_client.chat.completions.create.call_args + user_msg = call_args.kwargs["messages"][1]["content"] + assert short_html in user_msg diff --git a/tests/unit/scrapers/test_general_firecrawl.py b/tests/unit/scrapers/test_general_firecrawl.py new file mode 100644 index 0000000..a619bfa --- /dev/null +++ b/tests/unit/scrapers/test_general_firecrawl.py @@ -0,0 +1,500 @@ +"""Tests for firecrawl.py and firecrawl_client.py in general scrapers.""" + +import threading +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.services.scrapers.general.firecrawl_client import ( + FirecrawlClient, + FirecrawlSettings, +) + + +# --------------------------------------------------------------------------- +# FirecrawlSettings (frozen dataclass) +# --------------------------------------------------------------------------- + + +class TestFirecrawlSettings: + def test_create(self): + s = FirecrawlSettings(api_url="https://api.firecrawl.dev", api_key="key123") + assert s.api_url == "https://api.firecrawl.dev" + assert s.api_key == "key123" + + def test_frozen(self): + s = FirecrawlSettings(api_url="x", api_key="y") + with pytest.raises(AttributeError): + s.api_url = "z" + + +# --------------------------------------------------------------------------- +# FirecrawlClient singleton +# --------------------------------------------------------------------------- + + +class TestFirecrawlClientSingleton: + def setup_method(self): + FirecrawlClient.reset_instance() + + def teardown_method(self): + FirecrawlClient.reset_instance() + + @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "https://fc.example.com", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "test-key", + ) + 
def test_get_instance_creates_singleton(self, mock_fc_cls): + mock_fc_cls.return_value = MagicMock() + instance1 = FirecrawlClient.get_instance() + instance2 = FirecrawlClient.get_instance() + assert instance1 is instance2 + # AsyncFirecrawl called once (on first get_instance) + mock_fc_cls.assert_called_once() + + @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "https://fc.example.com", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "test-key", + ) + def test_reset_instance(self, mock_fc_cls): + mock_fc_cls.return_value = MagicMock() + inst1 = FirecrawlClient.get_instance() + FirecrawlClient.reset_instance() + inst2 = FirecrawlClient.get_instance() + assert inst1 is not inst2 + + @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "https://fc.example.com", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "test-key", + ) + def test_double_check_locking_inner_branch(self, mock_fc_cls): + """Cover the second `if cls._instance is not None` (line 48-49) inside the lock. + + We replace the lock with a wrapper that sets _instance after __enter__, + simulating another thread having created the instance while we waited. 
+ """ + mock_fc_cls.return_value = MagicMock() + sentinel = MagicMock() + + original_lock = FirecrawlClient._lock + + class SneakyLock: + def __enter__(self_lock): + original_lock.__enter__() + FirecrawlClient._instance = sentinel + return self_lock + + def __exit__(self_lock, *args): + original_lock.__exit__(*args) + + FirecrawlClient._lock = SneakyLock() + try: + inst = FirecrawlClient.get_instance() + assert inst is sentinel + finally: + FirecrawlClient._lock = original_lock + + +# --------------------------------------------------------------------------- +# FirecrawlClient.scrape_url +# --------------------------------------------------------------------------- + + +class TestFirecrawlClientScrapeUrl: + def setup_method(self): + FirecrawlClient.reset_instance() + + def teardown_method(self): + FirecrawlClient.reset_instance() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "https://fc.example.com", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "k", + ) + async def test_scrape_url_success(self, mock_fc_cls): + mock_app = AsyncMock() + mock_fc_cls.return_value = mock_app + mock_result = MagicMock() + mock_result.model_dump.return_value = {"markdown": "hello", "html": "

hello

"} + mock_app.scrape.return_value = mock_result + + client = FirecrawlClient.get_instance() + result = await client.scrape_url( + url="https://example.com", + formats=["markdown", "html"], + only_main_content=True, + exclude_tags=["nav"], + wait_for=3000, + ) + assert result == {"markdown": "hello", "html": "

hello

"} + mock_app.scrape.assert_awaited_once() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "https://fc.example.com", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "k", + ) + async def test_scrape_url_exception(self, mock_fc_cls): + mock_app = AsyncMock() + mock_fc_cls.return_value = mock_app + mock_app.scrape.side_effect = Exception("network error") + + client = FirecrawlClient.get_instance() + with pytest.raises(RuntimeError, match="Firecrawl scrape_url failed"): + await client.scrape_url(url="https://fail.com") + + +# --------------------------------------------------------------------------- +# _is_content_truncated +# --------------------------------------------------------------------------- + + +class TestIsContentTruncated: + def test_not_truncated(self): + from fastfetchbot_shared.services.scrapers.general.firecrawl import _is_content_truncated + assert _is_content_truncated("

abcdefghij

", "

abcdefghij

") is False + + def test_truncated(self): + from fastfetchbot_shared.services.scrapers.general.firecrawl import _is_content_truncated + short = "

ab

" + long = "

" + "x" * 100 + "

" + assert _is_content_truncated(short, long) is True + + def test_raw_zero_length(self): + from fastfetchbot_shared.services.scrapers.general.firecrawl import _is_content_truncated + assert _is_content_truncated("

abc

", "") is False + + def test_exact_threshold(self): + """Ratio exactly at threshold is not truncated.""" + from fastfetchbot_shared.services.scrapers.general.firecrawl import ( + _is_content_truncated, + _TRUNCATION_RATIO_THRESHOLD, + ) + # 40 chars extracted out of 100 raw = ratio 0.4 exactly + raw = "x" * 100 + extracted = "x" * 40 + assert _is_content_truncated(extracted, raw) is False + + +# --------------------------------------------------------------------------- +# FirecrawlDataProcessor +# --------------------------------------------------------------------------- + + +class TestFirecrawlDataProcessor: + def setup_method(self): + FirecrawlClient.reset_instance() + + def teardown_method(self): + FirecrawlClient.reset_instance() + + def _make_processor(self, url="https://example.com/article", use_json=None): + with patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FirecrawlClient.get_instance" + ) as mock_gi: + mock_client = MagicMock() + mock_gi.return_value = mock_client + from fastfetchbot_shared.services.scrapers.general.firecrawl import FirecrawlDataProcessor + proc = FirecrawlDataProcessor(url, use_json_extraction=use_json) + return proc, mock_client + + def test_init_default(self): + proc, _ = self._make_processor() + assert proc.scraper_type == "firecrawl" + assert proc.url == "https://example.com/article" + + def test_init_use_json_explicit(self): + proc, _ = self._make_processor(use_json=True) + assert proc._use_json_extraction is True + + @pytest.mark.asyncio + async def test_get_page_content_legacy(self): + proc, mock_client = self._make_processor(use_json=False) + mock_client.scrape_url = AsyncMock(return_value={ + "metadata": {"title": "T", "author": "A", "description": "D"}, + "markdown": "md", + "html": "

html

", + }) + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._get_page_content() + mock_build.assert_awaited_once() + call_kw = mock_build.call_args.kwargs + assert call_kw["title"] == "T" + + @pytest.mark.asyncio + async def test_get_page_content_json_with_data(self): + proc, mock_client = self._make_processor(use_json=True) + mock_client.scrape_url = AsyncMock(return_value={ + "json": { + "title": "JSON Title", + "author": "JSON Author", + "author_url": "https://example.com/author", + "text": "summary", + "content": "

" + "x" * 1000 + "

", + "media_files": [ + {"media_type": "image", "url": "https://img.com/1.jpg", "caption": "cap"}, + ], + }, + "metadata": {"title": "meta title"}, + "html": "

" + "x" * 1000 + "

", + "markdown": "md", + }) + await proc._get_page_content() + assert proc._data["title"] == "JSON Title" + assert proc._data["author"] == "JSON Author" + assert len(proc._data["media_files"]) == 1 + + @pytest.mark.asyncio + async def test_get_page_content_json_no_data_falls_back(self): + proc, mock_client = self._make_processor(use_json=True) + mock_client.scrape_url = AsyncMock(return_value={ + "json": None, + "metadata": {"title": "T", "ogSiteName": "Site"}, + "markdown": "md", + "html": "

html

", + }) + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._get_page_content() + mock_build.assert_awaited_once() + + @pytest.mark.asyncio + async def test_get_page_content_exception(self): + proc, mock_client = self._make_processor(use_json=False) + mock_client.scrape_url = AsyncMock(side_effect=RuntimeError("fail")) + with pytest.raises(RuntimeError): + await proc._get_page_content() + + @pytest.mark.asyncio + async def test_process_firecrawl_result_og_metadata_fallbacks(self): + proc, _ = self._make_processor() + result = { + "metadata": { + "ogTitle": "OG Title", + "ogSiteName": "OG Site", + "ogDescription": "OG Desc", + "ogImage": "https://img.com/og.jpg", + }, + "markdown": "md", + "html": "

h

", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_firecrawl_result(result) + kw = mock_build.call_args.kwargs + assert kw["title"] == "OG Title" + assert kw["author"] == "OG Site" + assert kw["description"] == "OG Desc" + assert kw["og_image"] == "https://img.com/og.jpg" + + @pytest.mark.asyncio + async def test_process_json_extraction_no_media_with_og_image(self): + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "T", + "author": "", + "text": "t", + "content": "

" + "a" * 500 + "

", + "media_files": [], + } + full_result = { + "metadata": {"ogImage": "https://og.com/img.png"}, + "html": "

" + "a" * 500 + "

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + # Should fall back to ogImage + assert len(proc._data["media_files"]) == 1 + assert proc._data["media_files"][0]["url"] == "https://og.com/img.png" + + @pytest.mark.asyncio + async def test_process_json_extraction_truncated_content_fallback(self): + """When JSON content appears truncated, falls back to raw HTML.""" + proc, _ = self._make_processor(use_json=True) + long_raw = "

" + "x" * 1000 + "

" + short_json_content = "

ab

" + json_data = { + "title": "T", + "author": "A", + "text": "t", + "content": short_json_content, + "media_files": [], + } + full_result = { + "metadata": {}, + "html": long_raw, + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + # Content should come from raw HTML since truncation was detected + assert proc._data["content"] # not empty + + @pytest.mark.asyncio + async def test_process_json_extraction_empty_content_fallback(self): + """When JSON content is empty, falls back to raw HTML.""" + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "T", + "author": "A", + "text": "t", + "content": "", + "media_files": [], + } + full_result = { + "metadata": {}, + "html": "

raw

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + assert proc._data["content"] + + @pytest.mark.asyncio + async def test_process_json_extraction_empty_content_no_raw_html(self): + """When both JSON content and raw HTML are empty.""" + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "", + "author": "", + "text": "", + "content": "", + "media_files": [], + } + full_result = { + "metadata": {}, + "html": "", + "markdown": "", + } + await proc._process_json_extraction(json_data, full_result) + assert proc._data["title"] == proc.url + + @pytest.mark.asyncio + async def test_process_json_extraction_author_url_fallback(self): + """When json_data has no author_url, falls back to url_parser.""" + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "T", + "author": "A", + "author_url": None, + "text": "t", + "content": "

" + "a" * 500 + "

", + "media_files": [], + } + full_result = { + "metadata": {}, + "html": "

" + "a" * 500 + "

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + assert proc._data["author_url"] == "https://example.com" + + @pytest.mark.asyncio + async def test_process_json_extraction_text_truncation(self): + proc, _ = self._make_processor(use_json=True) + long_text = "x" * 600 + json_data = { + "title": "T", + "author": "A", + "text": long_text, + "content": "

" + "a" * 500 + "

", + "media_files": [], + } + full_result = { + "metadata": {}, + "html": "

" + "a" * 500 + "

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + assert len(proc._data["text"]) == 500 + + @pytest.mark.asyncio + async def test_process_json_extraction_empty_text(self): + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "T", + "author": "A", + "text": "", + "content": "

" + "a" * 500 + "

", + "media_files": [], + } + full_result = { + "metadata": {}, + "html": "

" + "a" * 500 + "

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + assert proc._data["text"] == "" + + @pytest.mark.asyncio + async def test_process_json_extraction_og_image_key(self): + """Test og_image fallback via 'og_image' key (not 'ogImage').""" + proc, _ = self._make_processor(use_json=True) + json_data = { + "title": "T", + "author": "A", + "text": "t", + "content": "

" + "a" * 500 + "

", + "media_files": [], + } + full_result = { + "metadata": {"og_image": "https://og2.com/img.png"}, + "html": "

" + "a" * 500 + "

", + "markdown": "md", + } + await proc._process_json_extraction(json_data, full_result) + assert len(proc._data["media_files"]) == 1 + + @pytest.mark.asyncio + async def test_json_extraction_non_dict_falls_back(self): + """When json is not a dict, falls back to legacy processing.""" + proc, mock_client = self._make_processor(use_json=True) + mock_client.scrape_url = AsyncMock(return_value={ + "json": "not a dict", + "metadata": {"title": "T"}, + "markdown": "md", + "html": "

html

", + }) + with patch.object(proc, "_build_item_data", new_callable=AsyncMock): + await proc._get_page_content() + + +# --------------------------------------------------------------------------- +# FirecrawlScraper +# --------------------------------------------------------------------------- + + +class TestFirecrawlScraper: + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FirecrawlClient.get_instance" + ) + async def test_get_processor_by_url(self, mock_gi): + mock_gi.return_value = MagicMock() + from fastfetchbot_shared.services.scrapers.general.firecrawl import FirecrawlScraper, FirecrawlDataProcessor + scraper = FirecrawlScraper() + processor = await scraper.get_processor_by_url("https://example.com/page") + assert isinstance(processor, FirecrawlDataProcessor) + assert processor.url == "https://example.com/page" diff --git a/tests/unit/scrapers/test_general_firecrawl_schema.py b/tests/unit/scrapers/test_general_firecrawl_schema.py new file mode 100644 index 0000000..5d987f2 --- /dev/null +++ b/tests/unit/scrapers/test_general_firecrawl_schema.py @@ -0,0 +1,120 @@ +"""Tests for Firecrawl extraction Pydantic schema.""" + +import pytest + +from fastfetchbot_shared.services.scrapers.general.firecrawl_schema import ( + FIRECRAWL_EXTRACTION_PROMPT, + ExtractedArticle, + ExtractedMediaFile, +) + + +class TestExtractedMediaFile: + """Tests for ExtractedMediaFile Pydantic model.""" + + def test_required_fields(self): + media = ExtractedMediaFile(media_type="image", url="https://example.com/img.jpg") + assert media.media_type == "image" + assert media.url == "https://example.com/img.jpg" + + def test_optional_fields_default_to_none(self): + media = ExtractedMediaFile(media_type="video", url="https://example.com/vid.mp4") + assert media.original_url is None + assert media.caption is None + + def test_all_fields_set(self): + media = ExtractedMediaFile( + media_type="audio", + url="https://example.com/audio.mp3", + 
original_url="https://source.com/audio.mp3", + caption="A podcast episode", + ) + assert media.media_type == "audio" + assert media.url == "https://example.com/audio.mp3" + assert media.original_url == "https://source.com/audio.mp3" + assert media.caption == "A podcast episode" + + def test_serialization(self): + media = ExtractedMediaFile( + media_type="image", + url="https://example.com/img.jpg", + caption="A photo", + ) + data = media.model_dump() + assert data["media_type"] == "image" + assert data["url"] == "https://example.com/img.jpg" + assert data["caption"] == "A photo" + assert data["original_url"] is None + + +class TestExtractedArticle: + """Tests for ExtractedArticle Pydantic model.""" + + def test_defaults(self): + article = ExtractedArticle() + assert article.title == "" + assert article.author == "" + assert article.author_url is None + assert article.text == "" + assert article.content == "" + assert article.media_files == [] + + def test_all_fields_set(self): + media = ExtractedMediaFile(media_type="image", url="https://example.com/img.jpg") + article = ExtractedArticle( + title="Test Article", + author="John Doe", + author_url="https://example.com/john", + text="A brief summary of the article.", + content="

Full article content here.

", + media_files=[media], + ) + assert article.title == "Test Article" + assert article.author == "John Doe" + assert article.author_url == "https://example.com/john" + assert article.text == "A brief summary of the article." + assert article.content == "

Full article content here.

" + assert len(article.media_files) == 1 + assert article.media_files[0].media_type == "image" + + def test_media_files_default_is_empty_list(self): + article = ExtractedArticle(title="No media") + assert article.media_files == [] + # Ensure default_factory creates independent lists + article2 = ExtractedArticle(title="Also no media") + assert article.media_files is not article2.media_files + + def test_serialization(self): + article = ExtractedArticle( + title="Serialization Test", + author="Author", + content="

Content

", + media_files=[ + ExtractedMediaFile(media_type="image", url="https://example.com/1.jpg"), + ], + ) + data = article.model_dump() + assert data["title"] == "Serialization Test" + assert data["author"] == "Author" + assert len(data["media_files"]) == 1 + assert data["media_files"][0]["url"] == "https://example.com/1.jpg" + + def test_multiple_media_files(self): + files = [ + ExtractedMediaFile(media_type="image", url="https://example.com/1.jpg"), + ExtractedMediaFile(media_type="video", url="https://example.com/2.mp4"), + ExtractedMediaFile(media_type="audio", url="https://example.com/3.mp3"), + ] + article = ExtractedArticle(media_files=files) + assert len(article.media_files) == 3 + + +class TestFirecrawlExtractionPrompt: + """Tests for the FIRECRAWL_EXTRACTION_PROMPT constant.""" + + def test_prompt_is_non_empty_string(self): + assert isinstance(FIRECRAWL_EXTRACTION_PROMPT, str) + assert len(FIRECRAWL_EXTRACTION_PROMPT) > 0 + + def test_prompt_mentions_extraction(self): + assert "Extract" in FIRECRAWL_EXTRACTION_PROMPT diff --git a/tests/unit/scrapers/test_general_init.py b/tests/unit/scrapers/test_general_init.py new file mode 100644 index 0000000..1869b28 --- /dev/null +++ b/tests/unit/scrapers/test_general_init.py @@ -0,0 +1,176 @@ +"""Tests for GeneralItem dataclass.""" + +import pytest + +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from fastfetchbot_shared.services.scrapers.general import GeneralItem + + +class TestGeneralItemFromDict: + """Tests for GeneralItem.from_dict class method.""" + + def test_from_dict_with_all_fields(self): + data = { + "url": "https://example.com/article", + "title": "Test Article", + "author": "Author Name", + "author_url": "https://example.com/author", + "telegraph_url": "https://telegra.ph/test", + "text": "Summary text", + "content": "

Full content

", + "media_files": [ + {"media_type": "image", "url": "https://example.com/img.jpg", "caption": "Photo"}, + ], + "category": "general", + "message_type": "short", + "id": "abc123", + "raw_content": "raw", + "scraper_type": "firecrawl", + } + item = GeneralItem.from_dict(data) + assert item.url == "https://example.com/article" + assert item.title == "Test Article" + assert item.author == "Author Name" + assert item.author_url == "https://example.com/author" + assert item.telegraph_url == "https://telegra.ph/test" + assert item.text == "Summary text" + assert item.content == "

Full content

" + assert len(item.media_files) == 1 + assert item.media_files[0].media_type == "image" + assert item.category == "general" + assert item.message_type == MessageType.SHORT + assert item.id == "abc123" + assert item.raw_content == "raw" + assert item.scraper_type == "firecrawl" + + def test_from_dict_with_defaults_for_general_fields(self): + data = { + "url": "https://example.com", + "title": "", + "author": "", + "author_url": "", + "telegraph_url": "", + "text": "", + "content": "", + "media_files": [], + "category": "", + "message_type": "short", + } + item = GeneralItem.from_dict(data) + assert item.id == "" + assert item.raw_content == "" + assert item.scraper_type == "" + + def test_from_dict_preserves_metadata_fields(self): + data = { + "url": "https://test.com", + "title": "Title", + "author": "Author", + "author_url": "", + "telegraph_url": "", + "text": "text", + "content": "content", + "media_files": [], + "category": "news", + "message_type": "long", + "id": "x", + "raw_content": "raw", + "scraper_type": "zyte", + } + item = GeneralItem.from_dict(data) + assert item.url == "https://test.com" + assert item.message_type == MessageType.LONG + + +class TestGeneralItemToDict: + """Tests for GeneralItem.to_dict method.""" + + def test_to_dict_includes_general_fields(self): + item = GeneralItem( + url="https://example.com", + title="Title", + author="Author", + author_url="", + telegraph_url="", + text="text", + content="content", + media_files=[], + category="general", + message_type=MessageType.SHORT, + id="item-1", + raw_content="raw", + scraper_type="firecrawl", + ) + d = item.to_dict() + assert d["id"] == "item-1" + assert d["raw_content"] == "raw" + assert d["scraper_type"] == "firecrawl" + + def test_to_dict_includes_base_fields(self): + item = GeneralItem( + url="https://example.com", + title="My Title", + author="Jane", + author_url="https://example.com/jane", + telegraph_url="", + text="summary", + content="

body

", + media_files=[], + category="blog", + message_type=MessageType.LONG, + id="", + raw_content="", + scraper_type="", + ) + d = item.to_dict() + assert d["url"] == "https://example.com" + assert d["title"] == "My Title" + assert d["author"] == "Jane" + assert d["author_url"] == "https://example.com/jane" + assert d["text"] == "summary" + assert d["content"] == "

body

" + assert d["category"] == "blog" + assert d["message_type"] == "long" + + def test_to_dict_with_media_files(self): + media = MediaFile(media_type="image", url="https://example.com/img.jpg", caption="cap") + item = GeneralItem( + url="https://example.com", + title="", + author="", + author_url="", + telegraph_url="", + text="", + content="", + media_files=[media], + category="", + message_type=MessageType.SHORT, + id="", + raw_content="", + scraper_type="", + ) + d = item.to_dict() + assert len(d["media_files"]) == 1 + assert d["media_files"][0]["media_type"] == "image" + assert d["media_files"][0]["url"] == "https://example.com/img.jpg" + + +class TestGeneralItemDefaults: + """Tests for GeneralItem default field values.""" + + def test_general_specific_defaults(self): + item = GeneralItem( + url="https://example.com", + title="", + author="", + author_url="", + telegraph_url="", + text="", + content="", + media_files=[], + category="", + message_type=MessageType.SHORT, + ) + assert item.id == "" + assert item.raw_content == "" + assert item.scraper_type == "" diff --git a/tests/unit/scrapers/test_general_scraper.py b/tests/unit/scrapers/test_general_scraper.py new file mode 100644 index 0000000..dc082df --- /dev/null +++ b/tests/unit/scrapers/test_general_scraper.py @@ -0,0 +1,119 @@ +"""Tests for packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py""" + +from unittest.mock import AsyncMock, patch, MagicMock + +import pytest + +from fastfetchbot_shared.services.scrapers.general.scraper import GeneralScraper +from fastfetchbot_shared.services.scrapers.general.firecrawl import FirecrawlScraper +from fastfetchbot_shared.services.scrapers.general.zyte import ZyteScraper +from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralScraper + + +# --------------------------------------------------------------------------- +# SCRAPER_REGISTRY +# --------------------------------------------------------------------------- + + +class 
TestScraperRegistry: + def test_default_registry_has_firecrawl_and_zyte(self): + assert "FIRECRAWL" in GeneralScraper.SCRAPER_REGISTRY + assert "ZYTE" in GeneralScraper.SCRAPER_REGISTRY + assert GeneralScraper.SCRAPER_REGISTRY["FIRECRAWL"] is FirecrawlScraper + assert GeneralScraper.SCRAPER_REGISTRY["ZYTE"] is ZyteScraper + + +# --------------------------------------------------------------------------- +# __init__ / _init_scraper +# --------------------------------------------------------------------------- + + +class TestGeneralScraperInit: + @patch( + "fastfetchbot_shared.services.scrapers.general.scraper.GENERAL_SCRAPING_API", + "FIRECRAWL", + ) + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FirecrawlClient.get_instance" + ) + def test_default_type_from_config(self, mock_fc_instance): + """When no scraper_type is passed, uses GENERAL_SCRAPING_API env var.""" + mock_fc_instance.return_value = MagicMock() + gs = GeneralScraper() + assert gs.scraper_type == "FIRECRAWL" + assert isinstance(gs._scraper, FirecrawlScraper) + + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FirecrawlClient.get_instance" + ) + def test_custom_type_firecrawl(self, mock_fc_instance): + mock_fc_instance.return_value = MagicMock() + gs = GeneralScraper(scraper_type="firecrawl") + assert gs.scraper_type == "firecrawl" + assert isinstance(gs._scraper, FirecrawlScraper) + + def test_custom_type_zyte(self): + gs = GeneralScraper(scraper_type="ZYTE") + assert gs.scraper_type == "ZYTE" + assert isinstance(gs._scraper, ZyteScraper) + + @patch( + "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FirecrawlClient.get_instance" + ) + def test_unknown_type_falls_back_to_firecrawl(self, mock_fc_instance): + mock_fc_instance.return_value = MagicMock() + gs = GeneralScraper(scraper_type="UNKNOWN_SCRAPER") + # Should fall back to FirecrawlScraper + assert isinstance(gs._scraper, FirecrawlScraper) + + +# 
--------------------------------------------------------------------------- +# get_processor_by_url +# --------------------------------------------------------------------------- + + +class TestGetProcessorByUrl: + @pytest.mark.asyncio + async def test_delegates_to_underlying_scraper(self): + gs = GeneralScraper(scraper_type="ZYTE") + processor = await gs.get_processor_by_url("https://example.com") + from fastfetchbot_shared.services.scrapers.general.zyte import ZyteDataProcessor + assert isinstance(processor, ZyteDataProcessor) + + +# --------------------------------------------------------------------------- +# register_scraper / get_available_scrapers +# --------------------------------------------------------------------------- + + +class TestRegisterAndGetAvailable: + def test_register_scraper(self): + class FakeScraper(BaseGeneralScraper): + async def get_processor_by_url(self, url): + pass + + original_registry = dict(GeneralScraper.SCRAPER_REGISTRY) + try: + GeneralScraper.register_scraper("FAKE", FakeScraper) + assert "FAKE" in GeneralScraper.SCRAPER_REGISTRY + assert GeneralScraper.SCRAPER_REGISTRY["FAKE"] is FakeScraper + finally: + GeneralScraper.SCRAPER_REGISTRY = original_registry + + def test_register_scraper_uppercases_name(self): + class FakeScraper2(BaseGeneralScraper): + async def get_processor_by_url(self, url): + pass + + original_registry = dict(GeneralScraper.SCRAPER_REGISTRY) + try: + GeneralScraper.register_scraper("lowercase", FakeScraper2) + assert "LOWERCASE" in GeneralScraper.SCRAPER_REGISTRY + finally: + GeneralScraper.SCRAPER_REGISTRY = original_registry + + def test_get_available_scrapers(self): + scrapers = GeneralScraper.get_available_scrapers() + assert isinstance(scrapers, list) + assert "FIRECRAWL" in scrapers + assert "ZYTE" in scrapers diff --git a/tests/unit/scrapers/test_general_zyte.py b/tests/unit/scrapers/test_general_zyte.py new file mode 100644 index 0000000..8df0e95 --- /dev/null +++ 
b/tests/unit/scrapers/test_general_zyte.py @@ -0,0 +1,226 @@ +"""Tests for packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.services.scrapers.general.zyte import ( + ZyteDataProcessor, + ZyteScraper, +) + + +# --------------------------------------------------------------------------- +# ZyteDataProcessor.__init__ +# --------------------------------------------------------------------------- + + +class TestZyteDataProcessorInit: + def test_init(self): + proc = ZyteDataProcessor("https://example.com/page") + assert proc.url == "https://example.com/page" + assert proc.scraper_type == "zyte" + assert proc._data == {} + + +# --------------------------------------------------------------------------- +# ZyteDataProcessor._get_page_content +# --------------------------------------------------------------------------- + + +class TestZyteGetPageContent: + @pytest.mark.asyncio + @patch( + "fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", + None, + ) + async def test_no_api_key_raises(self): + proc = ZyteDataProcessor("https://example.com") + with pytest.raises(RuntimeError, match="ZYTE_API_KEY is not configured"): + await proc._get_page_content() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", "zyte-key") + @patch("fastfetchbot_shared.services.scrapers.general.zyte.AsyncZyteAPI") + async def test_success(self, mock_zyte_cls): + mock_client = AsyncMock() + mock_zyte_cls.return_value = mock_client + mock_client.get.return_value = { + "article": { + "headline": "Title", + "authors": [{"name": "Author"}], + "description": "Desc", + "articleBodyHtml": "

body

", + "articleBodyRaw": "raw body", + "mainImage": {"url": "https://img.com/pic.jpg"}, + }, + "browserHtml": "full", + } + proc = ZyteDataProcessor("https://example.com/article") + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._get_page_content() + mock_build.assert_awaited_once() + kw = mock_build.call_args.kwargs + assert kw["title"] == "Title" + assert kw["author"] == "Author" + assert kw["description"] == "Desc" + assert kw["html_content"] == "

body

" + assert kw["markdown_content"] == "raw body" + assert kw["og_image"] == "https://img.com/pic.jpg" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", "zyte-key") + @patch("fastfetchbot_shared.services.scrapers.general.zyte.AsyncZyteAPI") + async def test_exception_propagates(self, mock_zyte_cls): + mock_client = AsyncMock() + mock_zyte_cls.return_value = mock_client + mock_client.get.side_effect = RuntimeError("zyte failure") + proc = ZyteDataProcessor("https://example.com") + with pytest.raises(RuntimeError): + await proc._get_page_content() + + +# --------------------------------------------------------------------------- +# ZyteDataProcessor._process_zyte_result +# --------------------------------------------------------------------------- + + +class TestProcessZyteResult: + @pytest.mark.asyncio + async def test_full_article(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": { + "headline": "Headline", + "name": "Name", + "authors": [{"name": "Writer"}], + "description": "Short desc", + "articleBodyHtml": "

body html

", + "articleBodyRaw": "body raw", + "mainImage": {"url": "https://img.com/main.jpg"}, + }, + "browserHtml": "full", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + kw = mock_build.call_args.kwargs + assert kw["title"] == "Headline" + assert kw["author"] == "Writer" + assert kw["og_image"] == "https://img.com/main.jpg" + + @pytest.mark.asyncio + async def test_fallback_to_name_when_no_headline(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": { + "name": "Fallback Name", + "authors": [], + "articleBodyHtml": "", + "articleBodyRaw": "raw", + "description": "", + }, + "browserHtml": "browser", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + kw = mock_build.call_args.kwargs + assert kw["title"] == "Fallback Name" + assert kw["author"] == "" + # Falls back to browserHtml when articleBodyHtml is empty + assert kw["html_content"] == "browser" + + @pytest.mark.asyncio + async def test_no_authors(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": { + "headline": "T", + "authors": [], + "articleBodyHtml": "

b

", + "articleBodyRaw": "b", + }, + "browserHtml": "", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + assert mock_build.call_args.kwargs["author"] == "" + + @pytest.mark.asyncio + async def test_no_main_image(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": { + "headline": "T", + "authors": [], + "articleBodyHtml": "

b

", + "articleBodyRaw": "b", + }, + "browserHtml": "", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + assert mock_build.call_args.kwargs["og_image"] is None + + @pytest.mark.asyncio + async def test_empty_main_image_dict(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": { + "headline": "T", + "authors": [], + "articleBodyHtml": "

b

", + "articleBodyRaw": "b", + "mainImage": {}, + }, + "browserHtml": "", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + assert mock_build.call_args.kwargs["og_image"] is None + + @pytest.mark.asyncio + async def test_description_fallback_to_article_body_raw(self): + proc = ZyteDataProcessor("https://example.com/article") + long_raw = "x" * 600 + result = { + "article": { + "headline": "T", + "authors": [], + "description": "", + "articleBodyHtml": "

b

", + "articleBodyRaw": long_raw, + }, + "browserHtml": "", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + desc = mock_build.call_args.kwargs["description"] + assert len(desc) == 500 + + @pytest.mark.asyncio + async def test_empty_article(self): + proc = ZyteDataProcessor("https://example.com/article") + result = { + "article": {}, + "browserHtml": "page", + } + with patch.object(proc, "_build_item_data", new_callable=AsyncMock) as mock_build: + await proc._process_zyte_result(result) + kw = mock_build.call_args.kwargs + assert kw["title"] == "" + assert kw["author"] == "" + assert kw["html_content"] == "page" + + +# --------------------------------------------------------------------------- +# ZyteScraper +# --------------------------------------------------------------------------- + + +class TestZyteScraper: + @pytest.mark.asyncio + async def test_get_processor_by_url(self): + scraper = ZyteScraper() + processor = await scraper.get_processor_by_url("https://example.com/page") + assert isinstance(processor, ZyteDataProcessor) + assert processor.url == "https://example.com/page" diff --git a/tests/unit/scrapers/test_instagram.py b/tests/unit/scrapers/test_instagram.py new file mode 100644 index 0000000..0b63462 --- /dev/null +++ b/tests/unit/scrapers/test_instagram.py @@ -0,0 +1,544 @@ +"""Unit tests for Instagram scraper and config modules. 
+ +Covers: +- packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py +- packages/shared/fastfetchbot_shared/services/scrapers/instagram/config.py +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from fastfetchbot_shared.services.scrapers.instagram.config import ( + API_HEADERS_LIST, + ALL_SCRAPERS, +) +from fastfetchbot_shared.services.scrapers.instagram import Instagram +from fastfetchbot_shared.models.metadata_item import MessageType, MediaFile + +# Patch target for get_response at the Instagram module level (where it was imported via `from ... import`) +_PATCH_GET_RESPONSE = "fastfetchbot_shared.services.scrapers.instagram.get_response" + + +@pytest.fixture +def mock_ig_get_response(): + """Patch get_response at the Instagram module level.""" + with patch(_PATCH_GET_RESPONSE, new_callable=AsyncMock) as m: + yield m + + +# --------------------------------------------------------------------------- +# config.py tests +# --------------------------------------------------------------------------- + +class TestInstagramConfig: + """Tests for instagram/config.py constants.""" + + def test_all_scrapers_is_list(self): + assert isinstance(ALL_SCRAPERS, list) + assert len(ALL_SCRAPERS) > 0 + + def test_all_scrapers_contents(self): + assert ALL_SCRAPERS == ["ins28", "scraper2", "looter2", "ins191", "ins130"] + + def test_api_headers_list_is_dict(self): + assert isinstance(API_HEADERS_LIST, dict) + + def test_api_headers_list_keys(self): + expected = {"looter2", "ins28", "scraper2", "ins191", "ins130", "api2"} + assert set(API_HEADERS_LIST.keys()) == expected + + def test_each_scraper_has_required_keys(self): + for name, entry in API_HEADERS_LIST.items(): + assert "host" in entry, f"{name} missing 'host'" + assert "top_domain" in entry, f"{name} missing 'top_domain'" + assert "params" in entry, f"{name} missing 'params'" + + def test_looter2_params_value_is_url(self): + assert API_HEADERS_LIST["looter2"]["params"] == "url" 
+ + +# --------------------------------------------------------------------------- +# Instagram class tests +# --------------------------------------------------------------------------- + +class TestInstagramInit: + """Tests for Instagram.__init__.""" + + def test_init_post_url(self): + url = "https://www.instagram.com/p/ABC123/" + ig = Instagram(url) + assert ig.url == url + assert ig.category == "instagram" + assert ig.post_id == "ABC123" + assert ig.message_type == MessageType.SHORT + + def test_init_reel_url(self): + url = "https://www.instagram.com/reel/XYZ789/" + ig = Instagram(url) + assert ig.post_id == "XYZ789" + + def test_init_with_data_kwarg(self): + ig = Instagram("https://www.instagram.com/p/TEST/", data={"key": "val"}) + assert ig.post_id == "TEST" + + +class TestCheckInstagramUrl: + """Tests for Instagram._check_instagram_url.""" + + def test_post_url(self): + ig = Instagram("https://www.instagram.com/p/ABC123/") + ig._check_instagram_url() + assert ig.ins_type == "post" + + def test_reel_url(self): + ig = Instagram("https://www.instagram.com/reel/ABC123/") + ig._check_instagram_url() + assert ig.ins_type == "post" # "reel" path also contains no "stories" + + def test_story_url(self): + ig = Instagram("https://www.instagram.com/stories/user/12345/") + ig._check_instagram_url() + assert ig.ins_type == "story" + + def test_story_overrides_post(self): + """Path with both 'p' and 'stories' should end up as 'story'.""" + ig = Instagram("https://www.instagram.com/stories/p/12345/") + ig._check_instagram_url() + assert ig.ins_type == "story" + + +class TestGetStoryInfo: + """Tests for Instagram._get_story_info.""" + + @pytest.mark.asyncio + async def test_get_story_info_returns_none(self): + ig = Instagram("https://www.instagram.com/stories/user/1/") + result = await ig._get_story_info() + assert result is None + + +class TestGetInsPostLooter2: + """Tests for Instagram._get_ins_post_looter2 — static method.""" + + def _make_base_data(self, typename, 
**overrides): + data = { + "edge_media_to_caption": {"edges": [{"node": {"text": "caption text"}}]}, + "owner": {"username": "testuser", "full_name": "Test User"}, + "__typename": typename, + } + data.update(overrides) + return data + + def test_graph_video(self): + data = self._make_base_data("GraphVideo", video_url="https://vid.com/v.mp4") + result = Instagram._get_ins_post_looter2(data) + assert result["status"] is True + assert result["author"] == "testuser(Test User)" + assert result["text"] == "caption text" + assert len(result["media_files"]) == 1 + assert result["media_files"][0].media_type == "video" + assert "video" in result["content"] + + def test_graph_image(self): + data = self._make_base_data("GraphImage", display_url="https://img.com/i.jpg") + result = Instagram._get_ins_post_looter2(data) + assert len(result["media_files"]) == 1 + assert result["media_files"][0].media_type == "image" + assert "img" in result["content"] + + def test_graph_image_no_display_url(self): + data = self._make_base_data("GraphImage", display_url="") + result = Instagram._get_ins_post_looter2(data) + assert result["media_files"][0].url == "" + # content should not have img tag when display_url is empty + assert "alert(1)", + "media_files": [], + "content": "", + "status": True, + } + ) + assert "' + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = html_text + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is not None + assert result["note_id"] == "n1" + + @pytest.mark.asyncio + async def test_with_xsec_token_in_url(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 200 + state = {"note": {"noteDetailMap": {"n1": {"note": { + "type": "normal", "note_id": "n1", "title": "T", "desc": "", + "user": {}, "interact_info": {}, + }}}}} + mock_resp.text = f'window.__INITIAL_STATE__={json.dumps(state)}' 
+ adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + await adapter._fetch_note_by_html("n1", "tok_val", "src_val") + call_url = adapter._http.get.call_args[0][0] + assert "xsec_token=tok_val" in call_url + + @pytest.mark.asyncio + async def test_no_xsec_token(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 200 + state = {"note": {"noteDetailMap": {"n1": {"note": { + "type": "normal", "note_id": "n1", "title": "T", "desc": "", + "user": {}, "interact_info": {}, + }}}}} + mock_resp.text = f'window.__INITIAL_STATE__={json.dumps(state)}' + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + await adapter._fetch_note_by_html("n1", "", "") + call_url = adapter._http.get.call_args[0][0] + assert "xsec_token" not in call_url + + @pytest.mark.asyncio + async def test_non_200_returns_none(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 403 + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is None + + @pytest.mark.asyncio + async def test_no_initial_state_returns_none(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = "no state here" + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is None + + @pytest.mark.asyncio + async def test_invalid_json_returns_none(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = 'window.__INITIAL_STATE__={not valid json}' + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is None + + @pytest.mark.asyncio + async 
def test_note_not_found_in_map_returns_none(self): + adapter = self._make_adapter() + state = {"note": {"noteDetailMap": {"other_id": {"note": {}}}}} + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = f'window.__INITIAL_STATE__={json.dumps(state)}' + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is None + + @pytest.mark.asyncio + async def test_empty_text(self): + adapter = self._make_adapter() + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = "" + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "tok", "src") + assert result is None + + @pytest.mark.asyncio + async def test_undefined_replaced_with_null(self): + """The method replaces 'undefined' with 'null' in the JSON.""" + adapter = self._make_adapter() + state_str = '{"note":{"noteDetailMap":{"n1":{"note":{"type":"normal","note_id":"n1","title":"T","desc":"","user":{},"interact_info":{},"some_field":undefined}}}}}' + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = f'window.__INITIAL_STATE__={state_str}' + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._fetch_note_by_html("n1", "", "") + assert result is not None + + +class TestFetchPost: + """Tests for fetch_post.""" + + def _make_adapter(self): + return XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + + def _make_normalized_note(self, note_id="n1"): + return { + "note_id": note_id, + "type": "normal", + "title": "T", + "desc": "", + "video_urls": [], + "time": "", + "last_update_time": "", + "ip_location": "", + "image_list": [], + "tag_list": [], + "url": f"{XHS_WEB_URL}/explore/{note_id}?xsec_token=&xsec_source=pc_search", + "note_url": 
f"{XHS_WEB_URL}/explore/{note_id}?xsec_token=&xsec_source=pc_search", + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "user": {"user_id": "", "nickname": "", "avatar": ""}, + } + + @pytest.mark.asyncio + async def test_api_success(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(return_value=note) + adapter._fetch_note_by_html = AsyncMock() + + url = f"{XHS_WEB_URL}/explore/n1?xsec_token=tok&xsec_source=src" + result = await adapter.fetch_post(note_url=url) + assert result["note"]["note_id"] == "n1" + assert result["platform"] == "xhs" + assert result["comments"] == [] + adapter._fetch_note_by_html.assert_not_awaited() + + @pytest.mark.asyncio + async def test_api_fails_html_fallback(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(side_effect=Exception("API error")) + adapter._fetch_note_by_html = AsyncMock(return_value=note) + + url = f"{XHS_WEB_URL}/explore/n1?xsec_token=tok&xsec_source=src" + result = await adapter.fetch_post(note_url=url) + assert result["note"]["note_id"] == "n1" + + @pytest.mark.asyncio + async def test_api_returns_none_html_fallback(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(return_value=None) + adapter._fetch_note_by_html = AsyncMock(return_value=note) + + url = f"{XHS_WEB_URL}/explore/n1" + result = await adapter.fetch_post(note_url=url) + assert result["note"]["note_id"] == "n1" + + @pytest.mark.asyncio + async def test_both_fail_raises(self): + adapter = self._make_adapter() + adapter._fetch_note_by_api = AsyncMock(return_value=None) + adapter._fetch_note_by_html = AsyncMock(return_value=None) + + url = f"{XHS_WEB_URL}/explore/n1" + with pytest.raises(RuntimeError, match="Cannot fetch note"): + await adapter.fetch_post(note_url=url) + + @pytest.mark.asyncio + async def 
test_short_url_triggers_redirect(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._get_redirection_url = AsyncMock( + return_value=f"{XHS_WEB_URL}/explore/n1?xsec_token=tok&xsec_source=src" + ) + adapter._fetch_note_by_api = AsyncMock(return_value=note) + + result = await adapter.fetch_post(note_url="https://xhslink.com/abc") + adapter._get_redirection_url.assert_awaited_once() + assert result["note"]["note_id"] == "n1" + + @pytest.mark.asyncio + async def test_with_comments(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(return_value=note) + adapter._fetch_comments = AsyncMock(return_value=[{"comment_id": "c1"}]) + + url = f"{XHS_WEB_URL}/explore/n1" + result = await adapter.fetch_post(note_url=url, with_comments=True, max_comments=10) + assert len(result["comments"]) == 1 + + @pytest.mark.asyncio + async def test_with_comments_error_returns_empty(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(return_value=note) + adapter._fetch_comments = AsyncMock(side_effect=Exception("comment error")) + + url = f"{XHS_WEB_URL}/explore/n1" + result = await adapter.fetch_post(note_url=url, with_comments=True) + assert result["comments"] == [] + + @pytest.mark.asyncio + async def test_url_in_result_is_pure(self): + adapter = self._make_adapter() + note = self._make_normalized_note() + adapter._fetch_note_by_api = AsyncMock(return_value=note) + + url = f"{XHS_WEB_URL}/explore/n1?xsec_token=tok&xsec_source=src" + result = await adapter.fetch_post(note_url=url) + assert result["url"] == f"{XHS_WEB_URL}/explore/n1" + + +class TestFetchComments: + """Tests for _fetch_comments.""" + + def _make_adapter(self): + return XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + + @pytest.mark.asyncio + async def test_single_page(self): + adapter = self._make_adapter() + 
adapter._signed_get = AsyncMock(return_value={ + "comments": [ + {"id": "c1", "content": "hi", "user_info": {}, "target_comment": {}, "pictures": []}, + ], + "has_more": False, + "cursor": "", + }) + result = await adapter._fetch_comments("n1", "tok") + assert len(result) == 1 + assert result[0]["comment_id"] == "c1" + + @pytest.mark.asyncio + async def test_pagination(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(side_effect=[ + { + "comments": [{"id": "c1", "content": "a", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": True, + "cursor": "page2", + }, + { + "comments": [{"id": "c2", "content": "b", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": False, + "cursor": "", + }, + ]) + result = await adapter._fetch_comments("n1", "tok") + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_max_comments(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [ + {"id": f"c{i}", "content": str(i), "user_info": {}, "target_comment": {}, "pictures": []} + for i in range(5) + ], + "has_more": True, + "cursor": "next", + }) + result = await adapter._fetch_comments("n1", "tok", max_comments=3) + assert len(result) == 3 + + @pytest.mark.asyncio + async def test_no_cursor_breaks_loop(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [{"id": "c1", "content": "a", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": True, + "cursor": "", + }) + result = await adapter._fetch_comments("n1", "tok") + assert len(result) == 1 + + @pytest.mark.asyncio + async def test_with_xsec_token_in_params(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [], + "has_more": False, + "cursor": "", + }) + await adapter._fetch_comments("n1", "tok") + call_params = adapter._signed_get.call_args.kwargs.get("params") or 
adapter._signed_get.call_args[1].get("params") + assert call_params["xsec_token"] == "tok" + + @pytest.mark.asyncio + async def test_without_xsec_token(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [], + "has_more": False, + "cursor": "", + }) + await adapter._fetch_comments("n1", "") + call_params = adapter._signed_get.call_args.kwargs.get("params") or adapter._signed_get.call_args[1].get("params") + assert "xsec_token" not in call_params + + @pytest.mark.asyncio + async def test_none_comments_treated_as_empty(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": None, + "has_more": False, + "cursor": "", + }) + result = await adapter._fetch_comments("n1", "tok") + assert result == [] + + @pytest.mark.asyncio + async def test_include_sub_comments(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [ + { + "id": "c1", "content": "root", "user_info": {}, "target_comment": {}, + "pictures": [], "sub_comments": [ + {"id": "sc1", "content": "sub", "user_info": {}, "target_comment": {}, "pictures": []}, + ], + "sub_comment_has_more": False, + "sub_comment_cursor": "", + }, + ], + "has_more": False, + "cursor": "", + }) + adapter._fetch_sub_comments = AsyncMock(return_value=[ + {"comment_id": "sc1", "content": "sub"}, + ]) + result = await adapter._fetch_comments("n1", "tok", include_sub_comments=True) + assert len(result) == 2 + adapter._fetch_sub_comments.assert_awaited_once() + + +class TestFetchSubComments: + """Tests for _fetch_sub_comments.""" + + def _make_adapter(self): + return XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + + @pytest.mark.asyncio + async def test_inline_only(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock() # should not be called + root = { + "id": "c1", + "sub_comments": [ + {"id": "sc1", "content": "inline", "user_info": {}, 
"target_comment": {}, "pictures": []}, + ], + "sub_comment_has_more": False, + "sub_comment_cursor": "", + } + result = await adapter._fetch_sub_comments("n1", root, "tok") + assert len(result) == 1 + assert result[0]["comment_id"] == "sc1" + adapter._signed_get.assert_not_awaited() + + @pytest.mark.asyncio + async def test_pagination(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(side_effect=[ + { + "comments": [{"id": "sc2", "content": "p1", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": True, + "cursor": "next_cursor", + }, + { + "comments": [{"id": "sc3", "content": "p2", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": False, + "cursor": "", + }, + ]) + root = { + "id": "c1", + "sub_comments": [], + "sub_comment_has_more": True, + "sub_comment_cursor": "first_cursor", + } + result = await adapter._fetch_sub_comments("n1", root, "tok") + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_no_cursor_breaks(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [{"id": "sc1", "content": "x", "user_info": {}, "target_comment": {}, "pictures": []}], + "has_more": True, + "cursor": "", + }) + root = { + "id": "c1", + "sub_comments": [], + "sub_comment_has_more": True, + "sub_comment_cursor": "start", + } + result = await adapter._fetch_sub_comments("n1", root, "tok") + assert len(result) == 1 + + @pytest.mark.asyncio + async def test_none_sub_comments(self): + adapter = self._make_adapter() + root = { + "id": "c1", + "sub_comments": None, + "sub_comment_has_more": False, + } + result = await adapter._fetch_sub_comments("n1", root, "") + assert result == [] + + @pytest.mark.asyncio + async def test_xsec_token_in_params(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [], + "has_more": False, + "cursor": "", + }) + root = { + "id": "c1", + "sub_comments": [], + 
"sub_comment_has_more": True, + "sub_comment_cursor": "cur", + } + await adapter._fetch_sub_comments("n1", root, "my_tok") + call_params = adapter._signed_get.call_args.kwargs.get("params") or adapter._signed_get.call_args[1].get("params") + assert call_params["xsec_token"] == "my_tok" + + @pytest.mark.asyncio + async def test_no_xsec_token_in_params(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": [], + "has_more": False, + "cursor": "", + }) + root = { + "id": "c1", + "sub_comments": [], + "sub_comment_has_more": True, + "sub_comment_cursor": "cur", + } + await adapter._fetch_sub_comments("n1", root, "") + call_params = adapter._signed_get.call_args.kwargs.get("params") or adapter._signed_get.call_args[1].get("params") + assert "xsec_token" not in call_params + + @pytest.mark.asyncio + async def test_none_sub_comments_list_in_payload(self): + adapter = self._make_adapter() + adapter._signed_get = AsyncMock(return_value={ + "comments": None, + "has_more": False, + "cursor": "", + }) + root = { + "id": "c1", + "sub_comments": [], + "sub_comment_has_more": True, + "sub_comment_cursor": "cur", + } + result = await adapter._fetch_sub_comments("n1", root, "") + assert result == [] + + +class TestGetRedirectionUrl: + """Tests for _get_redirection_url.""" + + @pytest.mark.asyncio + async def test_success(self): + adapter = XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + mock_resp = MagicMock() + mock_resp.url = "https://www.xiaohongshu.com/explore/n1?xsec_token=tok" + + with patch("fastfetchbot_shared.services.scrapers.xiaohongshu.adaptar.httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_resp) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + MockClient.return_value = mock_client + + result = await adapter._get_redirection_url("https://xhslink.com/abc") + 
assert "xiaohongshu.com" in result + + @pytest.mark.asyncio + async def test_not_xhs_raises(self): + adapter = XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + mock_resp = MagicMock() + mock_resp.url = "https://www.google.com/" + + with patch("fastfetchbot_shared.services.scrapers.xiaohongshu.adaptar.httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_resp) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + MockClient.return_value = mock_client + + with pytest.raises(RuntimeError, match="did not redirect to xiaohongshu.com"): + await adapter._get_redirection_url("https://xhslink.com/abc") + + +class TestSignedPost: + """Tests for _signed_post.""" + + @pytest.mark.asyncio + async def test_signed_post(self): + adapter = XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + adapter._sign_headers = AsyncMock(return_value={"X-s": "val"}) + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.status_code = 200 + mock_resp.json.return_value = {"success": True, "data": {"result": "ok"}} + adapter._http = AsyncMock() + adapter._http.post = AsyncMock(return_value=mock_resp) + + result = await adapter._signed_post("/api/test", data={"key": "val"}) + assert result == {"result": "ok"} + adapter._sign_headers.assert_awaited_once() + + +class TestSignedGet: + """Tests for _signed_get.""" + + @pytest.mark.asyncio + async def test_signed_get_with_params(self): + adapter = XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + adapter._sign_headers = AsyncMock(return_value={"X-s": "val"}) + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.status_code = 200 + mock_resp.json.return_value = {"success": True, "data": {"items": []}} + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + result = await adapter._signed_get("/api/test", 
params={"a": "1"}) + assert result == {"items": []} + # Verify the sign_headers was called with URI including query string + sign_uri = adapter._sign_headers.call_args.kwargs.get("uri") or adapter._sign_headers.call_args[1].get("uri") + assert "a=1" in sign_uri + + @pytest.mark.asyncio + async def test_signed_get_no_params(self): + adapter = XhsSinglePostAdapter( + cookies="c=1", sign_server_endpoint="http://s:8989" + ) + adapter._sign_headers = AsyncMock(return_value={"X-s": "val"}) + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.status_code = 200 + mock_resp.json.return_value = {"success": True, "data": {"items": []}} + adapter._http = AsyncMock() + adapter._http.get = AsyncMock(return_value=mock_resp) + + await adapter._signed_get("/api/test") + sign_uri = adapter._sign_headers.call_args.kwargs.get("uri") or adapter._sign_headers.call_args[1].get("uri") + assert sign_uri == "/api/test" + + +# --------------------------------------------------------------------------- +# Xiaohongshu class tests (from __init__.py) +# --------------------------------------------------------------------------- + +class TestXiaohongshuInit: + """Tests for Xiaohongshu.__init__.""" + + @patch("fastfetchbot_shared.services.scrapers.xiaohongshu.JINJA2_ENV") + def test_init(self, mock_env): + mock_template = MagicMock() + mock_template.render.return_value = "

rendered

" + mock_env.get_template.return_value = mock_template + + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n1", data=None) + assert xhs.url == "https://www.xiaohongshu.com/explore/n1" + assert xhs.category == "xiaohongshu" + assert xhs.message_type == MessageType.SHORT + assert xhs.media_files == [] + assert xhs.id is None + + +class TestXiaohongshuGetItem: + """Tests for Xiaohongshu.get_item and _get_xiaohongshu.""" + + @pytest.mark.asyncio + async def test_get_item_returns_dict(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n1", data=None) + note = { + "note_id": "n1", + "title": "Test Note", + "desc": "Description", + "user": {"user_id": "u1", "nickname": "Nick", "avatar": ""}, + "time": 1700000000000, + "last_update_time": 1700001000000, + "liked_count": 10, + "collected_count": 5, + "comment_count": 3, + "share_count": 2, + "ip_location": "Beijing", + "image_list": ["https://img1.jpg"], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, + "url": "https://www.xiaohongshu.com/explore/n1", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + result = await xhs.get_item() + + assert isinstance(result, dict) + assert result["category"] == "xiaohongshu" + assert xhs.id == "n1" + + @pytest.mark.asyncio + async def test_get_item_with_video(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n2", data=None) + note = { + "note_id": "n2", + "title": "Video Note", + "desc": "Vid desc", + "user": 
{"user_id": "u1", "nickname": "Nick", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": ["https://video.mp4"], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, + "url": "https://www.xiaohongshu.com/explore/n2", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + result = await xhs.get_item() + + video_files = [mf for mf in xhs.media_files if mf.media_type == "video"] + assert len(video_files) == 1 + + @pytest.mark.asyncio + async def test_no_title_uses_author_fallback(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n3", data=None) + note = { + "note_id": "n3", + "title": "", + "desc": "desc", + "user": {"user_id": "u1", "nickname": "Author", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n3", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + await xhs.get_item() + + assert xhs.title == "Author\u7684\u5c0f\u7ea2\u4e66\u7b14\u8bb0" + + @pytest.mark.asyncio + async def test_no_title_no_author(self, mock_jinja2_env): + from 
fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n4", data=None) + note = { + "note_id": "n4", + "title": "", + "desc": "", + "user": {"user_id": "u1", "nickname": "", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n4", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + await xhs.get_item() + + # title stays empty/falsy, fallback condition is not met since author is also falsy + assert not xhs.title + + @pytest.mark.asyncio + async def test_long_text_switches_message_type(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + # Make the template render a long string + mock_jinja2_env.get_template.return_value.render.return_value = "a" * 600 + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n5", data=None) + note = { + "note_id": "n5", + "title": "Long", + "desc": "x" * 600, + "user": {"user_id": "u1", "nickname": "N", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n5", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + 
"fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + await xhs.get_item() + + assert xhs.message_type == MessageType.LONG + + @pytest.mark.asyncio + async def test_raw_content_tab_and_newline_stripping(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + mock_short_template = MagicMock() + mock_short_template.render.return_value = "

short

" + mock_content_template = MagicMock() + mock_content_template.render.return_value = "

content

" + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n6", data=None) + note = { + "note_id": "n6", + "title": "Tabs", + "desc": "line1\t\tline2\n", + "user": {"user_id": "u1", "nickname": "N", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n6", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ), patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.short_text_template", + mock_short_template, + ), patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.content_template", + mock_content_template, + ): + await xhs.get_item() + + # raw_content should have tabs stripped and trailing newline removed + render_calls = mock_short_template.render.call_args_list + first_call_data = render_calls[0].kwargs.get("data") or render_calls[0][1].get("data") + assert "\t" not in first_call_data["raw_content"] + + @pytest.mark.asyncio + async def test_none_user(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n7", data=None) + note = { + "note_id": "n7", + "title": "No User", + "desc": "", + "user": None, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": None, + "video_urls": None, + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n7", + }) + 
mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + result = await xhs.get_item() + + assert result is not None + + @pytest.mark.asyncio + async def test_none_raw_content(self, mock_jinja2_env): + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n8", data=None) + note = { + "note_id": "n8", + "title": "No Desc", + "desc": None, + "user": {"user_id": "u1", "nickname": "N", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": [], + "video_urls": [], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n8", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ): + await xhs.get_item() + + # raw_content should be empty string, not None + assert xhs.raw_content is None or xhs.raw_content == "" + + @pytest.mark.asyncio + async def test_content_template_includes_media(self, mock_jinja2_env): + """Verify that content template render is called after media files are appended.""" + from fastfetchbot_shared.services.scrapers.xiaohongshu import Xiaohongshu + + mock_short_template = MagicMock() + mock_short_template.render.return_value = "

short

" + mock_content_template = MagicMock() + mock_content_template.render.return_value = "

content

" + + xhs = Xiaohongshu(url="https://www.xiaohongshu.com/explore/n9", data=None) + note = { + "note_id": "n9", + "title": "Media", + "desc": "desc", + "user": {"user_id": "u1", "nickname": "N", "avatar": ""}, + "time": 0, + "last_update_time": 0, + "liked_count": 0, + "collected_count": 0, + "comment_count": 0, + "share_count": 0, + "ip_location": "", + "image_list": ["https://img.jpg"], + "video_urls": ["https://vid.mp4"], + } + + mock_adapter = AsyncMock() + mock_adapter.fetch_post = AsyncMock(return_value={ + "note": note, "url": "https://www.xiaohongshu.com/explore/n9", + }) + mock_adapter.__aenter__ = AsyncMock(return_value=mock_adapter) + mock_adapter.__aexit__ = AsyncMock(return_value=None) + + with patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.XhsSinglePostAdapter", + return_value=mock_adapter, + ), patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.short_text_template", + mock_short_template, + ), patch( + "fastfetchbot_shared.services.scrapers.xiaohongshu.content_template", + mock_content_template, + ): + await xhs.get_item() + + # content_template.render was called + render_calls = mock_content_template.render.call_args_list + assert len(render_calls) == 1 + call_data = render_calls[0].kwargs.get("data") or render_calls[0][1].get("data") + # raw_content should have img and video tags appended + assert "img" in call_data["raw_content"] + assert "video" in call_data["raw_content"] diff --git a/tests/unit/scrapers/test_zhihu.py b/tests/unit/scrapers/test_zhihu.py new file mode 100644 index 0000000..7e21eb1 --- /dev/null +++ b/tests/unit/scrapers/test_zhihu.py @@ -0,0 +1,2524 @@ +"""Unit tests for Zhihu scraper and config modules. 
"""
Unit tests for the Zhihu scraper.

Covers:
- packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py
- packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py
"""

import json
import pytest
from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock

from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType


# ---------------------------------------------------------------------------
# Module-level / config tests
# ---------------------------------------------------------------------------


class TestZhihuConfig:
    """Tests for zhihu/config.py cookie resolution logic.

    The config module imports ZHIHU_Z_C0 and ZHIHU_COOKIES_JSON from the
    parent scrapers.config at import time, so the tests patch those names
    on the parent module and reload the zhihu config so it re-evaluates
    its cookie constants.
    """

    @staticmethod
    def _reload_config(z_c0, cookies_json):
        """Reload zhihu.config under patched parent-config values.

        Returns the ``(ZHIHU_API_COOKIE, ZHIHU_COOKIES)`` pair computed
        during the patched reload.  Afterwards the module is reloaded once
        more *outside* the patches so it is restored to its real values —
        without this, the patched cookie constants would leak into any
        later test that imports the module.
        """
        import importlib
        import fastfetchbot_shared.services.scrapers.zhihu.config as cfg

        with patch(
            "fastfetchbot_shared.services.scrapers.config.ZHIHU_Z_C0", z_c0
        ), patch(
            "fastfetchbot_shared.services.scrapers.config.ZHIHU_COOKIES_JSON",
            cookies_json,
        ):
            importlib.reload(cfg)
            api_cookie, cookies = cfg.ZHIHU_API_COOKIE, cfg.ZHIHU_COOKIES
        # Restore: re-evaluate the module with the unpatched parent config.
        importlib.reload(cfg)
        return api_cookie, cookies

    def test_config_with_z_c0(self):
        """When ZHIHU_Z_C0 is set, ZHIHU_API_COOKIE uses it."""
        api_cookie, _ = self._reload_config("test_token", None)
        assert api_cookie == "z_c0=test_token"

    def test_config_with_cookies_json(self):
        """When ZHIHU_Z_C0 is empty but ZHIHU_COOKIES_JSON is set, use cookies JSON."""
        cookies = [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]
        api_cookie, full_cookies = self._reload_config("", cookies)
        assert api_cookie == "a=1;b=2"
        assert full_cookies == "a=1;b=2"

    def test_config_no_cookies(self):
        """When both ZHIHU_Z_C0 and ZHIHU_COOKIES_JSON are empty/None."""
        api_cookie, full_cookies = self._reload_config("", None)
        assert api_cookie is None
        assert full_cookies is None

    def test_config_z_c0_takes_precedence(self):
        """ZHIHU_Z_C0 takes priority over ZHIHU_COOKIES_JSON for the API cookie."""
        api_cookie, full_cookies = self._reload_config(
            "my_z_c0", [{"name": "a", "value": "1"}]
        )
        assert api_cookie == "z_c0=my_z_c0"
        # ZHIHU_COOKIES still uses the JSON cookies.
        assert full_cookies == "a=1"


# ---------------------------------------------------------------------------
# Helper function tests
# ---------------------------------------------------------------------------


class TestParseAnswerApiJsonData:
    """Tests for the _parse_answer_api_json_data module-level function."""

    def test_parses_fields(self):
        from fastfetchbot_shared.services.scrapers.zhihu import (
            _parse_answer_api_json_data,
        )

        data = {
            "question": {
                "id": 123,
                "title": "Test Q",
                "detail": "detail",
                "answer_count": 10,
                "follower_count": 20,
                "created": 1000,
                "updated_time": 2000,
            },
            "author": {"name": "TestAuthor", "url_token": "test_token"},
            "content": "answer content",
            "created_time": 3000,
            "updated_time": 4000,
            "comment_count": 5,
            "voteup_count": 50,
            "ipInfo": "Beijing",
        }
        result = _parse_answer_api_json_data(data)
        assert result["question_id"] == 123
        assert result["title"] == "Test Q"
        assert result["author"] == "TestAuthor"
        # Content is passed through unchanged by the parser.
        assert result["content"] == "answer content"
        assert result["voteup_count"] == 50
        assert result["ip_info"] == "Beijing"

    def test_missing_fields_returns_none(self):
        from fastfetchbot_shared.services.scrapers.zhihu import (
            _parse_answer_api_json_data,
        )

        result = _parse_answer_api_json_data({})
        assert result["question_id"] is None
        assert result["title"] is None


class TestFixJsonQuotes:
    """Tests for the _fix_json_quotes function."""

    def test_fixes_newlines(self):
        from fastfetchbot_shared.services.scrapers.zhihu import _fix_json_quotes

        result = _fix_json_quotes("hello\nworld\rtest")
        assert "\\n" in result
        assert "\\r" in result
        assert "\n" not in result

    def test_fixes_href_quotes(self):
        from fastfetchbot_shared.services.scrapers.zhihu import _fix_json_quotes

        raw = 'href="http://example.com"'
        result = _fix_json_quotes(raw)
        assert '\\"' in result

    def test_fixes_content_key_inner_quotes(self):
        from fastfetchbot_shared.services.scrapers.zhihu import _fix_json_quotes

        raw = '"content":"some \\"quoted\\" text","another_key":"value"'
        result = _fix_json_quotes(raw)
        # Should not raise and should produce a string.
        assert isinstance(result, str)

    def test_fixes_detail_key_inner_quotes(self):
        from fastfetchbot_shared.services.scrapers.zhihu import _fix_json_quotes

        raw = '"detail":"has a \\"quote\\" inside","next_key":"val"'
        result = _fix_json_quotes(raw)
        assert isinstance(result, str)

    def test_no_target_keys(self):
        from fastfetchbot_shared.services.scrapers.zhihu import _fix_json_quotes

        raw = '"title":"no issue"'
        result = _fix_json_quotes(raw)
        assert result == '"title":"no issue"'


# ---------------------------------------------------------------------------
# Zhihu class tests
# ---------------------------------------------------------------------------


@pytest.fixture
def _patch_zhihu_module():
    """Patch module-level template objects and httpx client for Zhihu import."""
    # NOTE(review): the original rendered-template strings were HTML-ish
    # markup; plain strings are equivalent for these mocks — confirm.
    mock_template = MagicMock()
    mock_template.render.return_value = "rendered text"
    mock_content_template = MagicMock()
    mock_content_template.render.return_value = "rendered content"
    with patch(
        "fastfetchbot_shared.services.scrapers.zhihu.short_text_template",
        mock_template,
    ), patch(
        "fastfetchbot_shared.services.scrapers.zhihu.content_template",
        mock_content_template,
    ), patch(
        "fastfetchbot_shared.services.scrapers.zhihu.zhihu_client",
        MagicMock(),
    ):
        yield {
            "short_text_template": mock_template,
            "content_template": mock_content_template,
        }
class TestZhihuInit:
    """Tests for Zhihu.__init__."""

    # Patch targets for the module-level cookie constants read at init time.
    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"

    def test_default_init(self, _patch_zhihu_module):
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, "api_cookie"), patch(self.COOKIES, "full_cookie"):
            z = Zhihu(url="https://www.zhihu.com/question/123/answer/456")
        assert z.url == "https://www.zhihu.com/question/123/answer/456"
        assert z.category == "zhihu"
        assert z.message_type == MessageType.SHORT
        assert z.method == "api"
        assert z.headers["Cookie"] == "full_cookie"

    def test_init_with_custom_cookie(self, _patch_zhihu_module):
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, "api_cookie"), patch(self.COOKIES, "full_cookie"):
            z = Zhihu(
                url="https://www.zhihu.com/question/123/answer/456",
                cookie="custom_cookie",
            )
        # An explicit cookie kwarg wins over the configured cookies.
        assert z.headers["Cookie"] == "custom_cookie"

    def test_init_no_api_cookie(self, _patch_zhihu_module):
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = Zhihu(url="https://www.zhihu.com/pin/123")
        assert "Cookie" not in z.headers

    def test_init_with_method_kwarg(self, _patch_zhihu_module):
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = Zhihu(
                url="https://www.zhihu.com/question/1/answer/2",
                method="fxzhihu",
            )
        assert z.method == "fxzhihu"

    def test_init_api_cookie_set_no_zhihu_cookies(self, _patch_zhihu_module):
        """API cookie is set but ZHIHU_COOKIES is None — no extra cookie header from ZHIHU_COOKIES."""
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, "api_c"), patch(self.COOKIES, None):
            z = Zhihu(url="https://www.zhihu.com/pin/1")
        # Cookie set from ZHIHU_API_COOKIE; kwargs.cookie is not provided and
        # ZHIHU_COOKIES is None, so the elif branch doesn't fire.
        assert z.headers["Cookie"] == "api_c"
class TestCheckZhihuType:
    """Tests for Zhihu._check_zhihu_type."""

    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"

    async def _classified(self, url):
        """Build a cookie-less Zhihu instance and run URL type detection."""
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = Zhihu(url=url)
            await z._check_zhihu_type()
        return z

    @pytest.mark.asyncio
    async def test_article_type(self, _patch_zhihu_module):
        z = await self._classified("https://zhuanlan.zhihu.com/p/35142635")
        assert z.zhihu_type == "article"
        assert z.article_id == "35142635"

    @pytest.mark.asyncio
    async def test_answer_type_with_question(self, _patch_zhihu_module):
        z = await self._classified(
            "https://www.zhihu.com/question/19998424/answer/603067076"
        )
        assert z.zhihu_type == "answer"
        assert z.answer_id == "603067076"
        assert z.question_id == "19998424"

    @pytest.mark.asyncio
    async def test_answer_type_without_question(self, _patch_zhihu_module):
        z = await self._classified("https://www.zhihu.com/answer/603067076")
        assert z.zhihu_type == "answer"
        assert z.answer_id == "603067076"

    @pytest.mark.asyncio
    async def test_status_type(self, _patch_zhihu_module):
        z = await self._classified("https://www.zhihu.com/pin/1667965059081945088")
        assert z.zhihu_type == "status"
        assert z.status_id == "1667965059081945088"

    @pytest.mark.asyncio
    async def test_unknown_type(self, _patch_zhihu_module):
        z = await self._classified("https://www.zhihu.com/people/someone")
        assert z.zhihu_type == "unknown"


class TestGetRequestUrl:
    """Tests for Zhihu._get_request_url."""

    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"
    FX_HOST = "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST"

    @staticmethod
    def _prepared(url, **attrs):
        """Build a Zhihu instance and set type/id/method attributes on it.

        Must be called while the cookie constants are patched.
        """
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        z = Zhihu(url=url)
        for name, value in attrs.items():
            setattr(z, name, value)
        return z

    @pytest.mark.asyncio
    async def test_fxzhihu_answer_with_question_id(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            self.FX_HOST, "fxzhihu.com"
        ):
            z = self._prepared(
                "https://www.zhihu.com/question/100/answer/200",
                zhihu_type="answer",
                answer_id="200",
                question_id="100",
                method="fxzhihu",
            )
            await z._get_request_url()
        assert z.request_url == "https://fxzhihu.com/question/100/answer/200"

    @pytest.mark.asyncio
    async def test_fxzhihu_answer_no_question_id(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            self.FX_HOST, "fxzhihu.com"
        ):
            z = self._prepared(
                "https://www.zhihu.com/answer/200",
                zhihu_type="answer",
                answer_id="200",
                question_id="",
                method="fxzhihu",
            )
            await z._get_request_url()
        assert z.request_url == "https://fxzhihu.com/answer/200"

    @pytest.mark.asyncio
    async def test_fxzhihu_article(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            self.FX_HOST, "fxzhihu.com"
        ):
            z = self._prepared(
                "https://zhuanlan.zhihu.com/p/12345",
                zhihu_type="article",
                article_id="12345",
                method="fxzhihu",
            )
            await z._get_request_url()
        assert z.request_url == "https://fxzhihu.com/p/12345"

    @pytest.mark.asyncio
    async def test_fxzhihu_status(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            self.FX_HOST, "fxzhihu.com"
        ):
            z = self._prepared(
                "https://www.zhihu.com/pin/999",
                zhihu_type="status",
                status_id="999",
                method="fxzhihu",
            )
            await z._get_request_url()
        assert z.request_url == "https://fxzhihu.com/pin/999"

    @pytest.mark.asyncio
    async def test_api_answer(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://www.zhihu.com/question/100/answer/200",
                zhihu_type="answer",
                answer_id="200",
                method="api",
            )
            await z._get_request_url()
        assert "answers/200" in z.request_url
        assert z.request_url.startswith("https://www.zhihu.com/api/v4")

    @pytest.mark.asyncio
    async def test_api_article(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://zhuanlan.zhihu.com/p/12345",
                zhihu_type="article",
                article_id="12345",
                method="api",
            )
            await z._get_request_url()
        assert z.request_url == "https://www.zhihu.com/api/v4/articles/12345"

    @pytest.mark.asyncio
    async def test_api_status(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://www.zhihu.com/pin/999",
                zhihu_type="status",
                status_id="999",
                method="api",
            )
            await z._get_request_url()
        assert z.request_url == "https://www.zhihu.com/api/v4/pins/999"

    @pytest.mark.asyncio
    async def test_non_api_answer_with_question_in_path(self, _patch_zhihu_module):
        """When method is not api/fxzhihu and path contains 'question'."""
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://www.zhihu.com/question/100/answer/200",
                zhihu_type="answer",
                answer_id="200",
                question_id="100",
                method="html",
            )
            await z._get_request_url()
        assert "/aria/question/100/answer/200" in z.request_url

    @pytest.mark.asyncio
    async def test_non_api_answer_without_question_in_path(self, _patch_zhihu_module):
        """When method is html and path doesn't contain 'question', _get_question_id is called."""
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://www.zhihu.com/answer/200",
                zhihu_type="answer",
                answer_id="200",
                question_id="",
                method="html",
            )
            with patch(
                "fastfetchbot_shared.services.scrapers.zhihu.get_redirect_url",
                new_callable=AsyncMock,
                return_value="https://www.zhihu.com/question/555/answer/200",
            ):
                await z._get_request_url()
        # Question id is recovered from the redirect target.
        assert z.question_id == "555"

    @pytest.mark.asyncio
    async def test_non_api_non_fxzhihu_article_falls_through(self, _patch_zhihu_module):
        """Article with method='html' falls through to default URL construction."""
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://zhuanlan.zhihu.com/p/12345",
                zhihu_type="article",
                article_id="12345",
                method="html",
            )
            await z._get_request_url()
        assert z.request_url == "https://zhuanlan.zhihu.com/p/12345"

    @pytest.mark.asyncio
    async def test_non_api_non_fxzhihu_status_falls_through(self, _patch_zhihu_module):
        """Status with method='html' falls through to default URL construction."""
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None):
            z = self._prepared(
                "https://www.zhihu.com/pin/999",
                zhihu_type="status",
                status_id="999",
                method="html",
            )
            await z._get_request_url()
        assert z.request_url == "https://www.zhihu.com/pin/999"
class TestGetZhihuAnswer:
    """Tests for Zhihu._get_zhihu_answer."""

    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"
    MOD = "fastfetchbot_shared.services.scrapers.zhihu"

    @staticmethod
    def _answer(method, request_url):
        """Build a Zhihu answer instance primed for _get_zhihu_answer.

        Must be called while the cookie constants are patched.
        """
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        z = Zhihu(url="https://www.zhihu.com/question/100/answer/200")
        z.zhihu_type = "answer"
        z.answer_id = "200"
        z.method = method
        z.request_url = request_url
        return z

    @staticmethod
    def _api_payload(question_overrides=None, **overrides):
        """Minimal answer payload in the shape of the Zhihu v4 API."""
        payload = {
            "question": {
                "id": 100,
                "title": "Test Question",
                "detail": "detail",
                "answer_count": 5,
                "follower_count": 10,
                "created": 1000,
                "updated_time": 2000,
            },
            "author": {"name": "Author", "url_token": "author_token"},
            "content": "answer",
            "created_time": 3000,
            "updated_time": 4000,
            "comment_count": 2,
            "voteup_count": 30,
            "ipInfo": "",
        }
        if question_overrides:
            payload["question"].update(question_overrides)
        payload.update(overrides)
        return payload

    @pytest.mark.asyncio
    async def test_api_method_success(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=self._api_payload(),
        ), patch(
            f"{self.MOD}.fix_images_and_links", side_effect=lambda x: x
        ), patch(
            f"{self.MOD}.unmask_zhihu_links", side_effect=lambda x: x
        ):
            z = self._answer("api", "https://www.zhihu.com/api/v4/answers/200")
            await z._get_zhihu_answer()
        assert z.title == "Test Question"
        assert z.author == "Author"

    @pytest.mark.asyncio
    async def test_api_method_failure_raises(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            side_effect=Exception("network error"),
        ):
            z = self._answer("api", "https://www.zhihu.com/api/v4/answers/200")
            with pytest.raises(Exception, match="Cannot get the answer by API"):
                await z._get_zhihu_answer()

    @pytest.mark.asyncio
    async def test_fxzhihu_method_success(self, _patch_zhihu_module):
        payload = self._api_payload(
            question_overrides={
                "title": "FxQ",
                "detail": "",
                "answer_count": 1,
                "follower_count": 1,
            },
            author={"name": "FxAuthor", "url_token": "fx_token"},
            content="fx answer",
            comment_count=0,
            voteup_count=0,
        )
        mock_resp = MagicMock()
        mock_resp.text = json.dumps(payload)
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response",
            new_callable=AsyncMock,
            return_value=mock_resp,
        ):
            z = self._answer("fxzhihu", "https://fxzhihu.com/question/100/answer/200")
            await z._get_zhihu_answer()
        assert z.title == "FxQ"

    @pytest.mark.asyncio
    async def test_fxzhihu_method_failure_raises(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response",
            new_callable=AsyncMock,
            side_effect=Exception("fail"),
        ):
            z = self._answer("fxzhihu", "https://fxzhihu.com/question/100/answer/200")
            with pytest.raises(Exception, match="Cannot get the answer by fxzhihu"):
                await z._get_zhihu_answer()

    @pytest.mark.asyncio
    async def test_json_method_success(self, _patch_zhihu_module):
        # Embedded page state in the shape found in the aria HTML payload.
        json_data = {
            "initialState": {
                "entities": {
                    "answers": {
                        "200": {
                            "question": {"id": 100},
                            "author": {"name": "JsonAuthor", "urlToken": "jt"},
                            "content": "json content",
                            "createdTime": 1000,
                            "updatedTime": 2000,
                            "commentCount": 1,
                            "voteupCount": 5,
                            "ipInfo": "",
                        }
                    },
                    "questions": {
                        "100": {
                            "title": "JsonQ",
                            "detail": "",
                            "answerCount": 3,
                            "followerCount": 7,
                            "created": 500,
                            "updatedTime": 1500,
                        }
                    },
                }
            }
        }
        mock_selector = MagicMock()
        mock_selector.xpath.return_value = json.dumps(json_data)
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            return_value=mock_selector,
        ):
            z = self._answer(
                "json", "https://www.zhihu.com/aria/question/100/answer/200"
            )
            await z._get_zhihu_answer()
        assert z.title == "JsonQ"

    @pytest.mark.asyncio
    async def test_json_method_failure_raises(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            side_effect=Exception("fail"),
        ):
            z = self._answer(
                "json", "https://www.zhihu.com/aria/question/100/answer/200"
            )
            with pytest.raises(Exception, match="Cannot get the selector"):
                await z._get_zhihu_answer()

    @staticmethod
    def _html_selector(author_url):
        """Mock selector answering the xpath queries the html path issues."""
        selector = MagicMock()

        def fake_xpath(expr):
            if "VoteButton" in expr:
                return "100"
            if "RichContent-inner" in expr:
                return [MagicMock()]
            if "string(//h1)" == expr:
                return "HTML Title"
            if 'itemprop="name"' in expr:
                return "HTML Author"
            if 'itemprop="url"' in expr:
                return author_url
            return ""

        selector.xpath.side_effect = fake_xpath
        return selector

    @pytest.mark.asyncio
    async def test_html_method_success(self, _patch_zhihu_module):
        selector = self._html_selector("https://www.zhihu.com/people/someone")
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            return_value=selector,
        ), patch(f"{self.MOD}.etree") as mock_etree:
            mock_etree.tostring.return_value = b"content"
            z = self._answer("html", "https://www.zhihu.com/question/100/answer/200")
            await z._get_zhihu_answer()
        assert z.title == "HTML Title"
        assert z.author == "HTML Author"

    @pytest.mark.asyncio
    async def test_html_method_empty_author_url(self, _patch_zhihu_module):
        """When author_url equals the bare /people/ URL, it should be cleared."""
        selector = self._html_selector("https://www.zhihu.com/people/")
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            return_value=selector,
        ), patch(f"{self.MOD}.etree") as mock_etree:
            mock_etree.tostring.return_value = b"text"
            z = self._answer("html", "https://www.zhihu.com/question/100/answer/200")
            await z._get_zhihu_answer()
        assert z.author_url == ""

    @pytest.mark.asyncio
    async def test_html_method_failure_raises(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            side_effect=Exception("fail"),
        ):
            z = self._answer("html", "https://www.zhihu.com/question/100/answer/200")
            with pytest.raises(Exception, match="Cannot get the answer"):
                await z._get_zhihu_answer()

    @pytest.mark.asyncio
    async def test_empty_answer_data_raises(self, _patch_zhihu_module):
        """When API returns empty data, _resolve_answer_json_data raises TypeError
        due to None concatenation, which propagates as an exception."""
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value={},
        ):
            z = self._answer("api", "https://www.zhihu.com/api/v4/answers/200")
            with pytest.raises(TypeError):
                await z._get_zhihu_answer()

    @pytest.mark.asyncio
    async def test_title_empty_after_resolve_raises(self, _patch_zhihu_module):
        """When answer_data resolves but title is empty, should raise."""
        payload = self._api_payload(
            question_overrides={
                "title": None,  # Will result in empty title after resolve
                "detail": "",
                "answer_count": 0,
                "follower_count": 0,
                "created": 0,
                "updated_time": 0,
            },
            author={"name": "A", "url_token": "t"},
            content="c",
            created_time=0,
            updated_time=0,
            comment_count=0,
            voteup_count=0,
        )
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=payload,
        ), patch(
            f"{self.MOD}.fix_images_and_links", side_effect=lambda x: x
        ), patch(
            f"{self.MOD}.unmask_zhihu_links", side_effect=lambda x: x
        ):
            z = self._answer("api", "https://www.zhihu.com/api/v4/answers/200")
            with pytest.raises(Exception, match="Cannot get the answer"):
                await z._get_zhihu_answer()
class TestGetZhihuArticle:
    """Tests for Zhihu._get_zhihu_article."""

    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"
    MOD = "fastfetchbot_shared.services.scrapers.zhihu"

    @staticmethod
    def _article(method, request_url):
        """Build a Zhihu article instance primed for _get_zhihu_article.

        Must be called while the cookie constants are patched.
        """
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        z = Zhihu(url="https://zhuanlan.zhihu.com/p/12345")
        z.zhihu_type = "article"
        z.article_id = "12345"
        z.method = method
        z.request_url = request_url
        return z

    @pytest.mark.asyncio
    async def test_api_method_success(self, _patch_zhihu_module):
        json_data = {
            "title": "Article Title",
            "content": "article",
            "author": {"name": "ArtAuthor", "url": "https://zhihu.com/people/art"},
            "voteup_count": 100,
            "comment_count": 5,
            "created": 1000,
            "updated": 2000,
            "column": {"title": "Col", "url": "http://col", "intro": "intro"},
        }
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=json_data,
        ), patch(
            f"{self.MOD}.fix_images_and_links", side_effect=lambda x: x
        ), patch(
            f"{self.MOD}.unmask_zhihu_links", side_effect=lambda x: x
        ):
            z = self._article("api", "https://www.zhihu.com/api/v4/articles/12345")
            await z._get_zhihu_article()
        assert z.title == "Article Title"
        assert z.column == "Col"

    @pytest.mark.asyncio
    async def test_api_method_no_column(self, _patch_zhihu_module):
        json_data = {
            "title": "No Col",
            "content": "c",
            "author": {"name": "A", "url": "u"},
            "voteup_count": 0,
        }
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=json_data,
        ), patch(
            f"{self.MOD}.fix_images_and_links", side_effect=lambda x: x
        ), patch(
            f"{self.MOD}.unmask_zhihu_links", side_effect=lambda x: x
        ):
            z = self._article("api", "https://www.zhihu.com/api/v4/articles/12345")
            await z._get_zhihu_article()
        assert z.title == "No Col"
        # No column key in the payload: the attribute is never set.
        assert not hasattr(z, "column")

    @pytest.mark.asyncio
    async def test_api_method_failure_raises(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            side_effect=Exception("fail"),
        ):
            z = self._article("api", "https://www.zhihu.com/api/v4/articles/12345")
            with pytest.raises(Exception, match="zhihu request failed"):
                await z._get_zhihu_article()

    @pytest.mark.asyncio
    async def test_fxzhihu_method_success(self, _patch_zhihu_module):
        json_data = {
            "title": "Fx Article",
            "content": "fx",
            "author": {"name": "FxA", "url": "u"},
            "voteup_count": 0,
        }
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=json_data,
        ):
            z = self._article("fxzhihu", "https://fxzhihu.com/p/12345")
            await z._get_zhihu_article()
        assert z.title == "Fx Article"

    @pytest.mark.asyncio
    async def test_json_method_success(self, _patch_zhihu_module):
        page_data = {
            "initialState": {
                "entities": {
                    "articles": {
                        "12345": {
                            "title": "Json Article",
                            "content": "jc",
                            "author": {"name": "JA", "urlToken": "ja_token"},
                            "voteupCount": 10,
                            "commentCount": 2,
                            "created": 1000,
                            "updated": 2000,
                            "column": {
                                "title": "JCol",
                                "url": "http://jcol",
                                "intro": "jintro",
                            },
                        }
                    }
                }
            }
        }
        mock_selector = MagicMock()
        mock_selector.xpath.return_value = json.dumps(page_data)
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            return_value=mock_selector,
        ):
            z = self._article("json", "https://zhuanlan.zhihu.com/p/12345")
            await z._get_zhihu_article()
        assert z.title == "Json Article"
        assert z.column == "JCol"

    @pytest.mark.asyncio
    async def test_html_method_success(self, _patch_zhihu_module):
        mock_selector = MagicMock()

        def fake_xpath(expr):
            if "string(//h1)" == expr:
                return "HTML Article"
            if "VoteButton" in expr:
                return "50"
            if "RichText" in expr and "ztext" in expr:
                return [MagicMock()]
            if "AuthorInfo-head" in expr:
                return "HtmlAuthor"
            if "UserLink-link" in expr:
                return "//www.zhihu.com/people/ha"
            return ""

        mock_selector.xpath.side_effect = fake_xpath
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            return_value=mock_selector,
        ), patch(f"{self.MOD}.etree") as mock_etree:
            mock_etree.tostring.return_value = b"content"
            z = self._article("html", "https://zhuanlan.zhihu.com/p/12345")
            await z._get_zhihu_article()
        assert z.title == "HTML Article"
        # Scheme-relative author link is normalized to https.
        assert z.author_url == "https://www.zhihu.com/people/ha"

    @pytest.mark.asyncio
    async def test_get_selector_failure(self, _patch_zhihu_module):
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_selector",
            new_callable=AsyncMock,
            side_effect=Exception("network"),
        ):
            z = self._article("html", "https://zhuanlan.zhihu.com/p/12345")
            with pytest.raises(Exception, match="zhihu request failed"):
                await z._get_zhihu_article()


class TestGetZhihuStatus:
    """Tests for Zhihu._get_zhihu_status."""

    API_COOKIE = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE"
    COOKIES = "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES"
    MOD = "fastfetchbot_shared.services.scrapers.zhihu"

    @pytest.mark.asyncio
    async def test_api_method_no_retweet(self, _patch_zhihu_module):
        from fastfetchbot_shared.services.scrapers.zhihu import Zhihu

        json_data = {
            "author": {"name": "StatusAuthor", "url_token": "sa"},
            "created": 1000,
            "updated": 2000,
            "content_html": "status",
            "reaction": {
                "statistics": {"up_vote_count": 10, "comment_count": 3}
            },
            "content": [
                {"type": "text", "content": "hello"},
                {"type": "image", "original_url": "http://img.jpg"},
            ],
        }
        with patch(self.API_COOKIE, None), patch(self.COOKIES, None), patch(
            f"{self.MOD}.get_response_json",
            new_callable=AsyncMock,
            return_value=json_data,
        ), patch(
            f"{self.MOD}.fix_images_and_links", side_effect=lambda x: x
        ), patch(
            f"{self.MOD}.unmask_zhihu_links", side_effect=lambda x: x
        ):
            z = Zhihu(url="https://www.zhihu.com/pin/999")
            z.zhihu_type = "status"
            z.status_id = "999"
            z.method = "api"
            z.request_url = "https://www.zhihu.com/api/v4/pins/999"
            await z._get_zhihu_status()
        assert z.title == "StatusAuthor的想法"
        assert z.upvote == 10
        # Only the image item becomes a media file.
        assert len(z.media_files) == 1
        assert z.retweeted is False

main

", + "reaction": { + "statistics": {"up_vote_count": 5, "comment_count": 1} + }, + "content": [], + "origin_pin": { + "id": 888, + "author": {"name": "Origin", "url_token": "origin"}, + "created": 500, + "updated": 600, + "content_html": "

origin

", + "content": [], + "like_count": 2, + "comment_count": 0, + }, + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert z.retweeted is True + assert z.origin_pin_author == "Origin" + + @pytest.mark.asyncio + async def test_api_method_without_reaction_field(self, _patch_zhihu_module): + """When response uses like_count/comment_count instead of reaction.statistics.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "Author2", "url_token": "a2"}, + "created": 1000, + "updated": 2000, + "content_html": "

status2

", + "like_count": 7, + "comment_count": 4, + "content": [], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert z.upvote == 7 + + @pytest.mark.asyncio + async def test_api_video_content_types(self, _patch_zhihu_module): + """Test video content parsing in _resolve_status_api_data.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + # Test with video_info.playlist.hd + json_data = { + "author": {"name": "VA", "url_token": "va"}, + "created": 1000, + "updated": 2000, + "content_html": "", + "like_count": 0, + "comment_count": 0, + "content": [ + { + "type": "video", + "video_info": { + "playlist": { + "hd": {"play_url": "http://hd.mp4"}, + "sd": {"play_url": "http://sd.mp4"}, + } + }, + } + ], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + 
z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert len(z.media_files) == 1 + assert z.media_files[0].url == "http://hd.mp4" + + @pytest.mark.asyncio + async def test_api_video_no_hd_fallback(self, _patch_zhihu_module): + """Test video fallback when no hd quality.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "VA", "url_token": "va"}, + "created": 1000, + "updated": 2000, + "content_html": "", + "like_count": 0, + "comment_count": 0, + "content": [ + { + "type": "video", + "video_info": { + "playlist": { + "sd": {"play_url": "http://sd.mp4"}, + } + }, + } + ], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert z.media_files[0].url == "http://sd.mp4" + + @pytest.mark.asyncio + async def test_api_video_playlist_format(self, _patch_zhihu_module): + """Test video with playlist list format instead of video_info.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "VA", "url_token": "va"}, + "created": 1000, + "updated": 2000, + "content_html": "", + "like_count": 0, + "comment_count": 0, + "content": [ + { + "type": "video", + "playlist": [ + {"quality": "sd", "url": "http://sd2.mp4"}, + {"quality": "hd", 
"url": "http://hd2.mp4"}, + ], + } + ], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert z.media_files[0].url == "http://hd2.mp4" + + @pytest.mark.asyncio + async def test_api_video_playlist_no_hd_fallback(self, _patch_zhihu_module): + """Test video playlist format without hd quality falls back to first entry.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "VA", "url_token": "va"}, + "created": 1000, + "updated": 2000, + "content_html": "", + "like_count": 0, + "comment_count": 0, + "content": [ + { + "type": "video", + "playlist": [ + {"quality": "sd", "url": "http://sd3.mp4"}, + ], + } + ], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = 
"https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert z.media_files[0].url == "http://sd3.mp4" + + @pytest.mark.asyncio + async def test_api_video_no_url_found(self, _patch_zhihu_module): + """Video content with empty playlist yields no media files.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "VA", "url_token": "va"}, + "created": 1000, + "updated": 2000, + "content_html": "", + "like_count": 0, + "comment_count": 0, + "content": [ + {"type": "video"}, + ], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/pins/999" + await z._get_zhihu_status() + assert len(z.media_files) == 0 + + @pytest.mark.asyncio + async def test_html_method_selector_failure(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + side_effect=Exception("fail"), + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "html" + z.request_url = "https://www.zhihu.com/pin/999" + with pytest.raises(Exception, match="zhihu request 
failed"): + await z._get_zhihu_status() + + @pytest.mark.asyncio + async def test_fxzhihu_method_status(self, _patch_zhihu_module): + """fxzhihu method for status uses get_response_json same as api.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + json_data = { + "author": {"name": "FxStatus", "url_token": "fs"}, + "created": 1000, + "updated": 2000, + "content_html": "

fx status

", + "like_count": 3, + "comment_count": 1, + "content": [], + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=json_data, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "fxzhihu" + z.request_url = "https://fxzhihu.com/pin/999" + await z._get_zhihu_status() + assert z.title == "FxStatus的想法" + # fxzhihu should NOT call fix_images_and_links (only api does) + + +class TestGetZhihuStatusJsonMethod: + """Tests for Zhihu._get_zhihu_status with method='json'.""" + + @pytest.mark.asyncio + async def test_json_method_no_retweet(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + page_data = { + "initialState": { + "entities": { + "pins": { + "999": { + "author": "author_token", + "created": 1000, + "updated": 2000, + "content": [ + {"content": "hello status"}, + {"type": "image", "isGif": False, "originalUrl": "http://img.jpg"}, + ], + "likeCount": 5, + "commentCount": 2, + "originPin": {"url": None}, + } + }, + "users": { + "author_token": {"name": "StatusAuthor"} + }, + } + } + } + mock_selector = MagicMock() + mock_selector.xpath.return_value = json.dumps(page_data) + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "json" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert z.title == "StatusAuthor的想法" + assert z.author == "StatusAuthor" + 
assert z.upvote == 5 + assert len(z.media_files) == 1 + assert z.media_files[0].media_type == "image" + + @pytest.mark.asyncio + async def test_json_method_with_retweet(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + page_data = { + "initialState": { + "entities": { + "pins": { + "999": { + "author": "author_token", + "created": 1000, + "updated": 2000, + "content": [{"content": "main status"}], + "likeCount": 5, + "commentCount": 2, + "originPin": { + "url": "https://www.zhihu.com/pin/888", + "author": { + "name": "OriginAuthor", + "urlToken": "origin_token", + }, + "created": 500, + "updated": 600, + "content": [ + {"content": "origin text"}, + {"type": "video", "isGif": False, "originalUrl": "http://vid.mp4"}, + ], + "likeCount": 1, + "commentCount": 0, + }, + } + }, + "users": { + "author_token": {"name": "MainAuthor"} + }, + } + } + } + mock_selector = MagicMock() + mock_selector.xpath.return_value = json.dumps(page_data) + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "json" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert z.retweeted is True + assert z.origin_pin_author == "OriginAuthor" + assert len(z.media_files) == 1 + assert z.media_files[0].media_type == "video" + + @pytest.mark.asyncio + async def test_json_method_gif_image(self, _patch_zhihu_module): + """Test _process_picture with isGif=True.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + page_data = { + "initialState": { + "entities": { + "pins": { + "999": { + "author": "at", + "created": 1000, + "updated": 2000, + "content": [ + 
{"content": "text"}, + {"type": "image", "isGif": True, "originalUrl": "http://gif.gif"}, + ], + "likeCount": 0, + "commentCount": 0, + "originPin": {"url": None}, + } + }, + "users": {"at": {"name": "A"}}, + } + } + } + mock_selector = MagicMock() + mock_selector.xpath.return_value = json.dumps(page_data) + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "json" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert len(z.media_files) == 1 + assert z.media_files[0].media_type == "gif" + + +class TestGetZhihuStatusHtmlMethod: + """Tests for Zhihu._get_zhihu_status with method='html'.""" + + @pytest.mark.asyncio + async def test_html_method_no_retweet(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + mock_selector = MagicMock() + + def xpath_side_effect(expr): + if "RichText" in expr and "itemprop" in expr: + return [MagicMock()] + if "VoteButton" in expr: + return "10" + if "ContentItem-time" in expr: + return "2024-01-01" + if "RichContent" in expr and "@class" in expr: + return "some-other-class" # No PinItem-content-originpin + if 'itemprop="name"' in expr: + return "HtmlAuthor" + if 'itemprop="url"' in expr: + return "https://www.zhihu.com/people/ha" + return "" + + mock_selector.xpath.side_effect = xpath_side_effect + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ), patch( + 
"fastfetchbot_shared.services.scrapers.zhihu.etree" + ) as mock_etree: + mock_etree.tostring.return_value = b"status content" + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "html" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert z.title == "HtmlAuthor的想法" + assert z.author == "HtmlAuthor" + + @pytest.mark.asyncio + async def test_html_method_with_retweet_with_pics(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + mock_selector = MagicMock() + + def xpath_side_effect(expr): + if "RichText" in expr and "itemprop" in expr: + return [MagicMock()] + if "VoteButton" in expr: + return "10" + if "ContentItem-time" in expr: + return "2024-01-01" + if "RichContent" in expr and "@class" in expr: + return "PinItem-content-originpin" # Has retweet + if "PinItem-content-originpin" in expr and "div[3]" in expr: + return [MagicMock()] + if "PinItem-content-originpin" in expr: + return [MagicMock()] + if 'itemprop="name"' in expr: + return "Author" + if 'itemprop="url"' in expr: + return "https://www.zhihu.com/people/author" + return "" + + mock_selector.xpath.side_effect = xpath_side_effect + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.etree" + ) as mock_etree, patch( + "fastfetchbot_shared.services.scrapers.zhihu.html" + ) as mock_html: + # Non-empty retweet content (not the empty marker div) + mock_etree.tostring.return_value = b"
retweet content
" + mock_html.fromstring.return_value = MagicMock() + mock_html.tostring.return_value = b"
pretty retweet
" + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "html" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert z.title == "Author的想法" + assert z.retweet_html != "" + + @pytest.mark.asyncio + async def test_html_method_with_retweet_no_pics(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + mock_selector = MagicMock() + call_count = {"originpin_div3": 0} + + def xpath_side_effect(expr): + if "RichText" in expr and "itemprop" in expr: + return [MagicMock()] + if "VoteButton" in expr: + return "10" + if "ContentItem-time" in expr: + return "2024-01-01" + if "RichContent" in expr and "@class" in expr: + return "PinItem-content-originpin" + if "PinItem-content-originpin" in expr and "div[3]" in expr: + return [MagicMock()] + if "PinItem-content-originpin" in expr: + return [MagicMock()] + if 'itemprop="name"' in expr: + return "Author" + if 'itemprop="url"' in expr: + return "https://www.zhihu.com/people/author" + return "" + + mock_selector.xpath.side_effect = xpath_side_effect + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_selector", + new_callable=AsyncMock, + return_value=mock_selector, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.etree" + ) as mock_etree: + # Return the empty marker div for retweet check + mock_etree.tostring.side_effect = [ + b'content', # main content + b'
', # originpin/div[3] + b'
originpin content
', # PinItem-content-originpin + ] + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.status_id = "999" + z.method = "html" + z.request_url = "https://www.zhihu.com/pin/999" + await z._get_zhihu_status() + assert z.title == "Author的想法" + + +class TestParseStatusJsonData: + """Tests for Zhihu._parse_status_json_data.""" + + def test_parses_status_data(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.status_id = "999" + data = { + "pins": { + "999": { + "author": "author_token", + "created": 1000, + "updated": 2000, + "content": [ + {"content": "status text"}, + ], + "likeCount": 5, + "commentCount": 2, + "originPin": { + "url": None, + "author": {"name": "X", "urlToken": "xt"}, + "created": 0, + "updated": 0, + "content": [{"content": ""}], + "likeCount": 0, + "commentCount": 0, + }, + } + }, + "users": { + "author_token": {"name": "Author"} + }, + } + result = z._parse_status_json_data(data) + assert result["author"] == "Author" + assert result["content"] == "status text" + assert result["like_count"] == 5 + + +class TestGetZhihuItem: + """Tests for Zhihu._get_zhihu_item (the main fallback logic).""" + + @pytest.mark.asyncio + async def test_first_method_succeeds(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + api_response = { + "question": { + "id": 100, + "title": "Q", + "detail": "", + "answer_count": 1, + "follower_count": 1, + "created": 1000, + "updated_time": 2000, + }, + "author": {"name": "A", "url_token": "at"}, + "content": "

c

", + "created_time": 1000, + "updated_time": 2000, + "comment_count": 0, + "voteup_count": 0, + "ipInfo": "", + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=api_response, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + await z._get_zhihu_item() + assert z.title == "Q" + + @pytest.mark.asyncio + async def test_first_method_fails_second_succeeds(self, _patch_zhihu_module): + """First method (api) fails, second method (fxzhihu) succeeds.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + call_count = 0 + + async def mock_get_response_json(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise Exception("api failed") + return { + "question": { + "id": 100, + "title": "FallbackQ", + "detail": "", + "answer_count": 1, + "follower_count": 1, + "created": 1000, + "updated_time": 2000, + }, + "author": {"name": "A", "url_token": "at"}, + "content": "

c

", + "created_time": 1000, + "updated_time": 2000, + "comment_count": 0, + "voteup_count": 0, + "ipInfo": "", + } + + # fxzhihu for answer uses get_response, not get_response_json + response_data = { + "question": { + "id": 100, + "title": "FallbackQ", + "detail": "", + "answer_count": 1, + "follower_count": 1, + "created": 1000, + "updated_time": 2000, + }, + "author": {"name": "FA", "url_token": "fat"}, + "content": "

fx

", + "created_time": 1000, + "updated_time": 2000, + "comment_count": 0, + "voteup_count": 0, + "ipInfo": "", + } + mock_resp = MagicMock() + mock_resp.text = json.dumps(response_data) + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + side_effect=mock_get_response_json, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response", + new_callable=AsyncMock, + return_value=mock_resp, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com" + ): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + await z._get_zhihu_item() + assert z.title == "FallbackQ" + + @pytest.mark.asyncio + async def test_all_methods_fail(self, _patch_zhihu_module): + """When all methods fail, raises the last exception.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + side_effect=Exception("api fail"), + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response", + new_callable=AsyncMock, + side_effect=Exception("fx fail"), + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com" + ): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + with pytest.raises(Exception): + await z._get_zhihu_item() + + @pytest.mark.asyncio + async def test_invalid_method_defaults_to_api(self, _patch_zhihu_module): + """When self.method is not in ALL_METHODS, it's reset to 'api'.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + api_response = { + "question": { + "id": 100, + 
"title": "Q", + "detail": "", + "answer_count": 1, + "follower_count": 1, + "created": 1000, + "updated_time": 2000, + }, + "author": {"name": "A", "url_token": "at"}, + "content": "

c

", + "created_time": 1000, + "updated_time": 2000, + "comment_count": 0, + "voteup_count": 0, + "ipInfo": "", + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=api_response, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu( + url="https://www.zhihu.com/question/100/answer/200", + method="invalid_method", + ) + await z._get_zhihu_item() + assert z.title == "Q" + + +class TestZhihuShortTextProcess: + """Tests for Zhihu._zhihu_short_text_process.""" + + def test_basic_processing(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + z.zhihu_type = "answer" + z.raw_content = "

Hello

" + z._zhihu_short_text_process() + # Template was called + assert isinstance(z.text, str) + + def test_status_with_retweet(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.retweeted = True + z.raw_content = "

main

" + z.origin_pin_raw_content = "

origin

" + z._zhihu_short_text_process() + assert isinstance(z.text, str) + + def test_img_with_data_image_skipped(self, _patch_zhihu_module): + """Images with data:image src should be skipped (no media_files added).""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = '' + z._zhihu_short_text_process() + # No media files should be added for data:image src + assert len(z.media_files) == 0 + + def test_img_with_actual_src(self, _patch_zhihu_module): + """Images with real src are added to media_files for non-status types.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = '' + z._zhihu_short_text_process() + assert len(z.media_files) == 1 + + def test_img_status_type_not_added(self, _patch_zhihu_module): + """For status type, images are not added to media_files in short text processing.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/pin/999") + z.zhihu_type = "status" + z.retweeted = False + z.raw_content = '' + z._zhihu_short_text_process() + assert len(z.media_files) == 0 + + def test_a_tag_without_href(self, _patch_zhihu_module): + """ tags without href should be unwrapped.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + 
"fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = 'no hrefwith href' + z._zhihu_short_text_process() + assert isinstance(z.text, str) + + def test_text_ends_with_newline_stripped(self, _patch_zhihu_module): + """Text ending with a single newline should have it stripped.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + templates = _patch_zhihu_module + # Return plain text ending with \n (no html tags that would get processed) + templates["short_text_template"].render.return_value = "simple text\n" + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "

text

" + z._zhihu_short_text_process() + assert not z.text.endswith("\n") + + def test_h_tags_and_p_tags_processing(self, _patch_zhihu_module): + """h tags and p tags should be unwrapped with br appended.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + templates = _patch_zhihu_module + templates["short_text_template"].render.return_value = ( + "

Header

Paragraph

" + ) + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "

text

" + z._zhihu_short_text_process() + # h and p tags with text get
appended; empty ones still get unwrapped + assert isinstance(z.text, str) + + +class TestZhihuShortTextProcessExtra: + """Additional tests for inner _html_process function in _zhihu_short_text_process.""" + + def test_figure_tags_decomposed(self, _patch_zhihu_module): + """Figure tags should be decomposed.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = '
' + z._zhihu_short_text_process() + assert isinstance(z.text, str) + assert len(z.media_files) == 1 + + def test_br_tags_replaced_with_newline(self, _patch_zhihu_module): + """br tags should be replaced with newlines in the processed content.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "line1
line2" + z._zhihu_short_text_process() + assert isinstance(z.text, str) + + def test_content_with_br_replacement(self, _patch_zhihu_module): + """Raw content with

should be replaced with newlines.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "paragraph1

paragraph2" + z._zhihu_short_text_process() + assert isinstance(z.text, str) + + +class TestZhihuContentProcess: + """Tests for Zhihu._zhihu_content_process.""" + + def test_content_rendering(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.raw_content = "

test\ncontent

" + z._zhihu_content_process() + assert z.content == "
rendered content
" + + +class TestResolveAnswerJsonData: + """Tests for Zhihu._resolve_answer_json_data.""" + + def test_resolve_with_full_data(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + answer_data = { + "question_detail": "

detail

", + "question_created": 1000, + "question_updated": 2000, + "follower_count": 10, + "answer_count": 5, + "title": "Title", + "author": "Author", + "author_url_token": "token", + "content": "

content

", + "created": 3000, + "updated": 4000, + "comment_count": 2, + "voteup_count": 50, + "ip_info": "Beijing", + } + z._resolve_answer_json_data(answer_data) + assert z.title == "Title" + assert z.author == "Author" + assert z.upvote == 50 + + def test_resolve_with_none_author_url_token_raises(self, _patch_zhihu_module): + """When author_url_token is None, concatenation with ZHIHU_HOST raises TypeError.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + answer_data = { + "question_detail": None, + "question_created": None, + "question_updated": None, + "follower_count": None, + "answer_count": None, + "title": None, + "author": None, + "author_url_token": None, + "content": None, + "created": None, + "updated": None, + "comment_count": None, + "voteup_count": None, + "ip_info": None, + } + with pytest.raises(TypeError): + z._resolve_answer_json_data(answer_data) + + def test_resolve_with_empty_string_values(self, _patch_zhihu_module): + """When values are empty strings instead of None, resolution works correctly.""" + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + answer_data = { + "question_detail": "", + "question_created": "", + "question_updated": "", + "follower_count": 0, + "answer_count": 0, + "title": "", + "author": "", + "author_url_token": "", + "content": "", + "created": "", + "updated": "", + "comment_count": 0, + "voteup_count": 0, + "ip_info": "", + } + z._resolve_answer_json_data(answer_data) + assert z.title == "" + assert z.question == "" + assert 
z.question_follower_count == 0 + + +class TestGetItem: + """Test the get_item and get_zhihu methods.""" + + @pytest.mark.asyncio + async def test_get_item_returns_dict(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + api_response = { + "question": { + "id": 100, + "title": "Q", + "detail": "", + "answer_count": 1, + "follower_count": 1, + "created": 1000, + "updated_time": 2000, + }, + "author": {"name": "A", "url_token": "at"}, + "content": "

c

", + "created_time": 1000, + "updated_time": 2000, + "comment_count": 0, + "voteup_count": 0, + "ipInfo": "", + } + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value=api_response, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.fix_images_and_links", + side_effect=lambda x: x, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.unmask_zhihu_links", + side_effect=lambda x: x, + ): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + result = await z.get_item() + assert isinstance(result, dict) + assert "url" in result + assert "title" in result + + +class TestGetQuestionId: + """Test Zhihu._get_question_id.""" + + @pytest.mark.asyncio + async def test_gets_question_id_from_redirect(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_redirect_url", + new_callable=AsyncMock, + return_value="https://www.zhihu.com/question/777/answer/200", + ): + z = Zhihu(url="https://www.zhihu.com/answer/200") + await z._get_question_id() + assert z.question_id == "777" + + +class TestGenerateZhihuCookie: + """Test Zhihu._generate_zhihu_cookie (currently a pass/no-op).""" + + def test_no_op(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch("fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None): + z = Zhihu(url="https://www.zhihu.com/pin/1") + result = z._generate_zhihu_cookie() + assert result is None + + 
+class TestAnswerDataEmptyDict: + """Cover line 322: answer_data == {} raises.""" + + @pytest.mark.asyncio + async def test_api_returns_empty_dict_raises(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.get_response_json", + new_callable=AsyncMock, + return_value={}, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu._parse_answer_api_json_data", + return_value={}, + ): + z = Zhihu(url="https://www.zhihu.com/question/100/answer/200") + z.zhihu_type = "answer" + z.answer_id = "200" + z.method = "api" + z.request_url = "https://www.zhihu.com/api/v4/answers/200" + with pytest.raises(Exception, match="Cannot get the answer"): + await z._get_zhihu_answer() + + +class TestShortTextProcessPTags: + """Cover lines 652-654: p tag processing after format_telegram_short_text.""" + + def test_p_tags_survive_format_telegram(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + mock_template = MagicMock() + mock_template.render.return_value = "

paragraph one

paragraph two

" + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.short_text_template", + mock_template, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.format_telegram_short_text", + side_effect=lambda soup: soup, # Don't unwrap p tags + ): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "

content

" + z.retweeted = False + z._zhihu_short_text_process() + assert isinstance(z.text, str) + assert "paragraph one" in z.text + + def test_empty_p_tags_no_br_appended(self, _patch_zhihu_module): + from fastfetchbot_shared.services.scrapers.zhihu import Zhihu + + mock_template = MagicMock() + mock_template.render.return_value = "

text

" + + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_API_COOKIE", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.short_text_template", + mock_template, + ), patch( + "fastfetchbot_shared.services.scrapers.zhihu.format_telegram_short_text", + side_effect=lambda soup: soup, + ): + z = Zhihu(url="https://www.zhihu.com/question/1/answer/2") + z.zhihu_type = "answer" + z.raw_content = "

content

" + z.retweeted = False + z._zhihu_short_text_process() + assert isinstance(z.text, str) diff --git a/tests/unit/scrapers/test_zhihu_content_processing.py b/tests/unit/scrapers/test_zhihu_content_processing.py new file mode 100644 index 0000000..6222c3d --- /dev/null +++ b/tests/unit/scrapers/test_zhihu_content_processing.py @@ -0,0 +1,187 @@ +"""Tests for Zhihu content processing functions.""" + +import pytest + +from fastfetchbot_shared.services.scrapers.zhihu.content_processing import ( + extract_references, + fix_images_and_links, + unmask_zhihu_links, +) + + +class TestFixImagesAndLinks: + """Tests for fix_images_and_links function.""" + + def test_replaces_data_actualsrc_with_src(self): + html = '' + result = fix_images_and_links(html) + assert 'src="https://pic.zhimg.com/real.jpg"' in result + assert "data-actualsrc" not in result + + def test_img_without_data_actualsrc_unchanged(self): + html = '' + result = fix_images_and_links(html) + assert 'src="https://example.com/image.jpg"' in result + + def test_removes_u_tags_preserving_content(self): + html = "

Some underlined text

" + result = fix_images_and_links(html) + assert "" not in result + assert "underlined" in result + assert "

" in result + + def test_multiple_img_tags(self): + html = ( + '' + '' + ) + result = fix_images_and_links(html) + assert 'src="https://a.com/1.jpg"' in result + assert 'src="https://a.com/2.jpg"' in result + assert "data-actualsrc" not in result + + def test_multiple_u_tags(self): + html = "

first and second

" + result = fix_images_and_links(html) + assert "" not in result + assert "first" in result + assert "second" in result + + def test_empty_string(self): + result = fix_images_and_links("") + assert result == "" + + def test_no_matching_elements(self): + html = "

Just plain text

" + result = fix_images_and_links(html) + assert "Just plain text" in result + + def test_img_with_only_data_actualsrc_no_existing_src(self): + html = '' + result = fix_images_and_links(html) + assert 'src="https://pic.zhimg.com/real.jpg"' in result + assert "data-actualsrc" not in result + + +class TestExtractReferences: + """Tests for extract_references function.""" + + def test_extracts_single_reference(self): + html = '

Some text

' + result = extract_references(html) + assert "

参考

" in result + assert "Ref text" in result + assert "https://example.com" in result + assert "
    " in result + + def test_extracts_multiple_references_sorted(self): + html = ( + '' + '' + ) + result = extract_references(html) + first_pos = result.index("First") + second_pos = result.index("Second") + assert first_pos < second_pos + + def test_reference_without_url(self): + html = '' + result = extract_references(html) + assert "No URL ref" in result + assert "' + result = extract_references(html) + assert "Empty URL" in result + assert "No references here

    " + result = extract_references(html) + assert result == "" + + def test_sup_without_data_text_ignored(self): + html = '' + result = extract_references(html) + assert result == "" + + def test_sup_without_data_numero_ignored(self): + html = '' + result = extract_references(html) + assert result == "" + + def test_empty_string(self): + result = extract_references("") + assert result == "" + + def test_reference_output_format(self): + html = '' + result = extract_references(html) + assert result.startswith("
    ") + assert "
    " in result + assert "
  1. " in result + assert "
" in result + + +class TestUnmaskZhihuLinks: + """Tests for unmask_zhihu_links function.""" + + def test_unmasks_zhihu_redirect_link(self): + html = 'link' + result = unmask_zhihu_links(html) + assert 'href="https://example.com/page"' in result + + def test_non_zhihu_link_unchanged(self): + html = 'link' + result = unmask_zhihu_links(html) + assert 'href="https://example.com/page"' in result + + def test_multiple_links_mixed(self): + html = ( + 'A' + 'B' + 'C' + ) + result = unmask_zhihu_links(html) + assert 'href="https://a.com"' in result + assert 'href="https://b.com"' in result + assert 'href="https://c.com"' in result + + def test_zhihu_link_without_target_param(self): + html = 'link' + result = unmask_zhihu_links(html) + # No target param, so href should remain unchanged + assert 'href="https://link.zhihu.com/?other=value"' in result + + def test_empty_string(self): + result = unmask_zhihu_links("") + assert result == "" + + def test_no_links(self): + html = "

No links here

" + result = unmask_zhihu_links(html) + assert "No links here" in result + + def test_encoded_target_decoded(self): + html = 'link' + result = unmask_zhihu_links(html) + assert "https://example.com/path?q=hello&" in result + + def test_a_tag_without_href_skipped(self): + html = "no href" + result = unmask_zhihu_links(html) + assert "no href" in result + + def test_malformed_zhihu_link_handled_gracefully(self): + """A zhihu link that causes a parsing error is left unchanged.""" + from unittest.mock import patch + + html = 'link' + with patch( + "fastfetchbot_shared.services.scrapers.zhihu.content_processing.parse_qs", + side_effect=Exception("parse error"), + ): + result = unmask_zhihu_links(html) + # The link should remain unchanged since the exception was caught + assert "link.zhihu.com" in result diff --git a/tests/unit/test_telegraph.py b/tests/unit/test_telegraph.py new file mode 100644 index 0000000..5992608 --- /dev/null +++ b/tests/unit/test_telegraph.py @@ -0,0 +1,204 @@ +"""Tests for packages/shared/fastfetchbot_shared/services/telegraph/__init__.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.services.telegraph import Telegraph + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +class TestTelegraphInit: + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + def test_init_sets_all_fields(self, mock_poster_cls): + mock_poster_cls.return_value = MagicMock() + t = Telegraph( + title="My Title", + url="https://example.com/post", + author="Author Name", + author_url="https://example.com/author", + category="tech", + content="

Hello

", + ) + assert t.title == "My Title" + assert t.url == "https://example.com/post" + assert t.author == "Author Name" + assert t.author_url == "https://example.com/author" + assert t.category == "tech" + assert t.content == "

Hello

" + mock_poster_cls.assert_called_once_with(use_api=True) + assert t.telegraph is mock_poster_cls.return_value + + +# --------------------------------------------------------------------------- +# from_dict +# --------------------------------------------------------------------------- + + +class TestFromDict: + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + def test_from_dict(self, mock_poster_cls): + mock_poster_cls.return_value = MagicMock() + obj = { + "title": "Title", + "url": "https://example.com", + "author": "Auth", + "author_url": "https://example.com/auth", + "category": "cat", + "content": "

content

", + } + t = Telegraph.from_dict(obj) + assert isinstance(t, Telegraph) + assert t.title == "Title" + assert t.url == "https://example.com" + assert t.author == "Auth" + assert t.author_url == "https://example.com/auth" + assert t.category == "cat" + assert t.content == "

content

" + + def test_from_dict_non_dict_raises(self): + with pytest.raises(AssertionError): + Telegraph.from_dict("not a dict") + + +# --------------------------------------------------------------------------- +# get_telegraph +# --------------------------------------------------------------------------- + + +class TestGetTelegraph: + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok1", "tok2"]) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + @patch("fastfetchbot_shared.services.telegraph.DocumentPreprocessor") + async def test_upload_images_true_with_token_list( + self, mock_doc_pre_cls, mock_poster_cls + ): + # Setup mock poster + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.return_value = {"url": "https://telegra.ph/test-page"} + + # Setup mock DocumentPreprocessor + mock_doc_pre = MagicMock() + mock_doc_pre.upload_all_images = AsyncMock() + mock_doc_pre.get_processed_html.return_value = "

processed

" + mock_doc_pre_cls.return_value = mock_doc_pre + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

c

") + + result = await t.get_telegraph(upload_images=True) + + assert result == "https://telegra.ph/test-page" + mock_doc_pre_cls.assert_called_once_with("

c

", url="https://ex.com") + mock_doc_pre.upload_all_images.assert_awaited_once() + mock_poster.set_token.assert_awaited_once() + mock_poster.post.assert_awaited_once() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok1"]) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + async def test_upload_images_false(self, mock_poster_cls): + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.return_value = {"url": "https://telegra.ph/page"} + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

c

") + result = await t.get_telegraph(upload_images=False) + + assert result == "https://telegra.ph/page" + # DocumentPreprocessor should NOT have been called + mock_poster.post.assert_awaited_once() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", None) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + async def test_no_token_list_creates_token(self, mock_poster_cls): + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.return_value = {"url": "https://telegra.ph/page2"} + + t = Telegraph("T", "https://ex.com", "LongAuthorName12345", "https://ex.com/a", "cat", "

c

") + result = await t.get_telegraph(upload_images=False) + + assert result == "https://telegra.ph/page2" + mock_poster.create_api_token.assert_awaited_once_with( + short_name="LongAuthorName", author_name="LongAuthorName12345" + ) + mock_poster.set_token.assert_not_awaited() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", []) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + async def test_empty_token_list_creates_token(self, mock_poster_cls): + """Empty list is falsy, so it should create a token.""" + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.return_value = {"url": "https://telegra.ph/page3"} + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

c

") + result = await t.get_telegraph(upload_images=False) + + assert result == "https://telegra.ph/page3" + mock_poster.create_api_token.assert_awaited_once() + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + async def test_exception_returns_empty_string(self, mock_poster_cls): + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.side_effect = RuntimeError("upload failed") + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

c

") + result = await t.get_telegraph(upload_images=False) + + assert result == "" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + @patch("fastfetchbot_shared.services.telegraph.DocumentPreprocessor") + async def test_exception_during_image_upload_returns_empty( + self, mock_doc_pre_cls, mock_poster_cls + ): + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + + mock_doc_pre = MagicMock() + mock_doc_pre.upload_all_images = AsyncMock(side_effect=RuntimeError("img fail")) + mock_doc_pre_cls.return_value = mock_doc_pre + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

c

") + result = await t.get_telegraph(upload_images=True) + + assert result == "" + + @pytest.mark.asyncio + @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") + @patch("fastfetchbot_shared.services.telegraph.DocumentPreprocessor") + async def test_content_updated_after_image_processing( + self, mock_doc_pre_cls, mock_poster_cls + ): + """Verify self.content is updated with processed HTML before posting.""" + mock_poster = AsyncMock() + mock_poster_cls.return_value = mock_poster + mock_poster.post.return_value = {"url": "https://telegra.ph/ok"} + + mock_doc_pre = MagicMock() + mock_doc_pre.upload_all_images = AsyncMock() + mock_doc_pre.get_processed_html.return_value = "

images-uploaded

" + mock_doc_pre_cls.return_value = mock_doc_pre + + t = Telegraph("T", "https://ex.com", "Auth", "https://ex.com/a", "cat", "

original

") + await t.get_telegraph(upload_images=True) + + # The content passed to post() should be the processed one + post_call = mock_poster.post.call_args + assert post_call.kwargs["text"] == "

images-uploaded

" + assert t.content == "

images-uploaded

" diff --git a/uv.lock b/uv.lock index af7ff28..9d88f05 100644 --- a/uv.lock +++ b/uv.lock @@ -575,6 +575,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coverage" +version = "7.13.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" }, + { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" }, + { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159, upload-time = "2026-03-17T10:30:47.204Z" }, + { url = "https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270, upload-time = "2026-03-17T10:30:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/93/89/7ffc4ba0f5d0a55c1e84ea7cee39c9fc06af7b170513d83fbf3bbefce280/coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256", size = 257538, upload-time = "2026-03-17T10:30:50.77Z" }, + { url = "https://files.pythonhosted.org/packages/81/bd/73ddf85f93f7e6fa83e77ccecb6162d9415c79007b4bc124008a4995e4a7/coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c", size = 251821, upload-time = "2026-03-17T10:30:52.5Z" }, + { url = "https://files.pythonhosted.org/packages/a0/81/278aff4e8dec4926a0bcb9486320752811f543a3ce5b602cc7a29978d073/coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5", size = 253191, upload-time = "2026-03-17T10:30:54.543Z" }, + { url = "https://files.pythonhosted.org/packages/70/ee/fe1621488e2e0a58d7e94c4800f0d96f79671553488d401a612bebae324b/coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09", size = 251337, upload-time = 
"2026-03-17T10:30:56.663Z" }, + { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404, upload-time = "2026-03-17T10:30:58.427Z" }, + { url = "https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903, upload-time = "2026-03-17T10:31:00.093Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780, upload-time = "2026-03-17T10:31:01.916Z" }, + { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093, upload-time = "2026-03-17T10:31:03.642Z" }, + { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900, upload-time = "2026-03-17T10:31:05.651Z" }, + { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515, upload-time = "2026-03-17T10:31:07.293Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" }, +] + [[package]] name = "cryptography" version = "45.0.7" @@ -715,6 +739,7 @@ dev = [ { name = "celery-types" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "pytest-cov" }, ] [package.metadata] @@ -762,6 +787,7 @@ dev = [ { name = "celery-types", specifier = ">=0.24.0" }, { name = "pytest", specifier = ">=8.3.5,<9.0.0" }, { name = "pytest-asyncio", specifier = ">=0.26.0,<0.27.0" }, + { name = "pytest-cov", specifier = ">=7.1.0" }, ] [[package]] @@ -1761,6 +1787,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/7f/338843f449ace853647ace35870874f69a764d251872ed1b4de9f234822c/pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0", size = 19694, upload-time = "2025-03-25T06:22:27.807Z" }, ] +[[package]] +name = "pytest-cov" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage" }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" }, +] + [[package]] name = "python-bidi" version = "0.6.7"