From d86e9268ff64d64e0d22f14a175559d3fa94ffc6 Mon Sep 17 00:00:00 2001 From: aturret Date: Thu, 19 Mar 2026 22:19:21 -0500 Subject: [PATCH] feat: integrate FxZhihu implementation --- apps/api/src/config.py | 1 + .../src/services/scrapers/zhihu/__init__.py | 111 ++++++++++++------ .../api/src/services/scrapers/zhihu/config.py | 24 ++-- .../scrapers/zhihu/content_processing.py | 64 ++++++++++ template.env | 4 + tests/test_zhihu_content_processing.py | 58 +++++++++ 6 files changed, 218 insertions(+), 44 deletions(-) create mode 100644 apps/api/src/services/scrapers/zhihu/content_processing.py create mode 100644 tests/test_zhihu_content_processing.py diff --git a/apps/api/src/config.py b/apps/api/src/config.py index 1c3086d..12a339b 100644 --- a/apps/api/src/config.py +++ b/apps/api/src/config.py @@ -123,6 +123,7 @@ # Zhihu FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") +ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None) zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") if os.path.exists(zhihu_cookie_path): diff --git a/apps/api/src/services/scrapers/zhihu/__init__.py b/apps/api/src/services/scrapers/zhihu/__init__.py index 33e5284..b8c8a1a 100644 --- a/apps/api/src/services/scrapers/zhihu/__init__.py +++ b/apps/api/src/services/scrapers/zhihu/__init__.py @@ -26,8 +26,10 @@ ZHIHU_HOST, ALL_METHODS, ZHIHU_COOKIES, - ZHIHU_API_ANSWER_PARAMS + ZHIHU_API_COOKIE, + ZHIHU_API_ANSWER_PARAMS, ) +from .content_processing import fix_images_and_links, extract_references, unmask_zhihu_links from fastfetchbot_shared.utils.logger import logger environment = JINJA2_ENV @@ -115,16 +117,19 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.retweeted: bool = False # reqeust fields self.httpx_client = zhihu_client - self.headers = {"User-Agent": get_random_user_agent(), - "Accept": "*/*", - "Referer": self.url, - "Connection": "keep-alive", - } + self.headers = { + "User-Agent": "node", + "Accept": "*/*", + "Referer": self.url, + "Connection": 
"keep-alive", + } + if ZHIHU_API_COOKIE: + self.headers["Cookie"] = ZHIHU_API_COOKIE if kwargs.get("cookie"): self.headers["Cookie"] = kwargs.get("cookie") - if ZHIHU_COOKIES: + elif ZHIHU_COOKIES: self.headers["Cookie"] = ZHIHU_COOKIES - self.method = kwargs.get("method", "fxzhihu") + self.method = kwargs.get("method", "api") self.urlparser = urlparse(self.url) self.api_url = "" self.status_id = "" @@ -163,7 +168,7 @@ async def _get_zhihu_item(self) -> None: for method in ALL_METHODS: try: if self.method not in ALL_METHODS: - self.method = "json" + self.method = "api" else: self.method = method await self._get_request_url() @@ -265,17 +270,11 @@ async def _get_request_url(self) -> None: elif self.zhihu_type == "article": if self.method == "api": self.request_url = ( - ZHIHU_COLUMNS_API_HOST + ZHIHU_API_HOST + "/articles/" + self.article_id - + "?" - + ZHIHU_API_ANSWER_PARAMS ) return - # TODO: There are two api url to get a single article. The first one may fail in the future. - # Therefore, I remain the second one. - # self.request_url = ( - # ZHIHU_COLUMNS_API_HOST_V2 + self.article_id + "?" 
+ ZHIHU_API_ANSWER_PARAMS) elif self.zhihu_type == "status": if self.method == "api": self.request_url = ( @@ -322,6 +321,10 @@ async def _get_zhihu_answer(self) -> None: if answer_data == {}: raise Exception("Cannot get the answer") self._resolve_answer_json_data(answer_data) + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) else: try: selector = await get_selector(self.request_url, headers=self.headers) @@ -360,11 +363,15 @@ async def _get_zhihu_status(self): """ if self.method in ["api", "fxzhihu"]: json_data = await get_response_json(self.request_url, headers=self.headers, client=self.httpx_client) - data = self._resolve_status_api_data(json_data) # TODO: separate the function to resolve the api data + data = self._resolve_status_api_data(json_data) self.author = data["author"] self.author_url = data["author_url"] self.title = data["author"] + "的想法" - self.raw_content = json_data["content_html"] + self.raw_content = json_data.get("content_html", "") + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) self.media_files.extend(data["media_files"]) self.date = unix_timestamp_to_utc(data["created"]) self.updated = unix_timestamp_to_utc(data["updated"]) @@ -540,6 +547,17 @@ async def _get_zhihu_article(self): self.author = json_data["author"]["name"] self.author_url = json_data["author"]["url"] self.upvote = json_data["voteup_count"] + self.comment_count = json_data.get("comment_count", 0) + self.date = unix_timestamp_to_utc(json_data.get("created", 0)) + self.updated = unix_timestamp_to_utc(json_data.get("updated", 0)) + if json_data.get("column"): + self.column = json_data["column"].get("title", "") + self.column_url = json_data["column"].get("url", "") + 
self.column_intro = json_data["column"].get("intro", "") + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) except Exception as e: raise Exception("zhihu request failed") else: @@ -717,21 +735,29 @@ def _parse_article_json_data(self, data: Dict) -> Dict: @staticmethod def _resolve_status_api_data(data: Dict) -> Dict: + # Handle both API response formats (reaction.statistics vs direct fields) + if "reaction" in data: + like_count = data["reaction"]["statistics"]["up_vote_count"] + comment_count = data["reaction"]["statistics"]["comment_count"] + else: + like_count = data.get("like_count", 0) + comment_count = data.get("comment_count", 0) + result = { "author": data["author"]["name"], - "author_url": ZHIHU_HOST + "/people/" + data["author"]["url_token"], + "author_url": ZHIHU_HOST + "/people/" + data["author"].get("url_token", ""), "created": data["created"], "updated": data["updated"], "text": None, - "raw_content": data["content_html"], - "like_count": data["like_count"], - "comment_count": data["comment_count"], + "raw_content": data.get("content_html", ""), + "like_count": like_count, + "comment_count": comment_count, "media_files": [], "origin_pin_id": None, } - for content in data["content"]: + for content in data.get("content", []): if content["type"] == "text": - result["text"] = content["content"] + result["text"] = content.get("content", "") elif content["type"] == "image": media_item = MediaFile.from_dict( { @@ -742,16 +768,33 @@ def _resolve_status_api_data(data: Dict) -> Dict: ) result["media_files"].append(media_item) elif content["type"] == "video": - media_item = MediaFile.from_dict( - { - "media_type": "video", - "url": content["video_info"]["playlist"]["hd"]["play_url"], - "caption": "", - } - ) - result["media_files"].append(media_item) - if "origin_pin" in data: - result["origin_pin_id"] = 
data["origin_pin"]["id"] + # Try HD quality first, fallback to any available + video_url = None + if "video_info" in content: + playlist = content["video_info"].get("playlist", {}) + if "hd" in playlist: + video_url = playlist["hd"].get("play_url") + elif playlist: + first_quality = next(iter(playlist.values()), {}) + video_url = first_quality.get("play_url") + elif "playlist" in content: + for item in content["playlist"]: + if item.get("quality") == "hd": + video_url = item.get("url") + break + if not video_url and content["playlist"]: + video_url = content["playlist"][0].get("url") + if video_url: + media_item = MediaFile.from_dict( + { + "media_type": "video", + "url": video_url, + "caption": "", + } + ) + result["media_files"].append(media_item) + if "origin_pin" in data and data["origin_pin"]: + result["origin_pin_id"] = str(data["origin_pin"]["id"]) result["origin_pin_data"] = Zhihu._resolve_status_api_data(data["origin_pin"]) return result diff --git a/apps/api/src/services/scrapers/zhihu/config.py b/apps/api/src/services/scrapers/zhihu/config.py index 117df27..4d44fc8 100644 --- a/apps/api/src/services/scrapers/zhihu/config.py +++ b/apps/api/src/services/scrapers/zhihu/config.py @@ -1,22 +1,26 @@ -from src.config import ZHIHU_COOKIES_JSON +from src.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0 SHORT_LIMIT = 600 ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api" ZHIHU_COLUMNS_API_HOST_V2 = "https://api.zhihu.com/article/" ZHIHU_API_HOST = "https://www.zhihu.com/api/v4" -ZHIHU_API_ANSWER_PARAMS = ("include=content%2Cexcerpt%2Cauthor%2Cvoteup_count%2Ccomment_count%2Cquestion%2Ccreated_time" - "%2Cquestion.detail") +ZHIHU_API_ANSWER_PARAMS = "include=content,excerpt,voteup_count,comment_count,question.detail" ZHIHU_HOST = "https://www.zhihu.com" -ALL_METHODS = ["fxzhihu"] +ALL_METHODS = ["api", "fxzhihu"] """ -There are three methods to get zhihu item: from zhihu v4 api(api), a json object in the html script(json), - or parsing the html page content 
directly. - For most occasions, the api method is the best choice. But Zhihu official api only opens for status and article. - Therefore, we must use the json method to get the answer. And if one of the above two methods fails, the get_item method - would try to parse the html page content directly. - You can also pass the method as a parameter when initializing the Zhihu object. If not, the default method is api. +Methods: "api" calls Zhihu API v4 directly (ported from FxZhihu), "fxzhihu" calls external FxZhihu server as fallback. +The "json" method parses HTML script tags, "html" parses page content directly. """ +# Cookie for direct API calls: prefer ZHIHU_Z_C0 env var, fall back to cookies JSON +if ZHIHU_Z_C0: + ZHIHU_API_COOKIE = f"z_c0={ZHIHU_Z_C0}" +elif ZHIHU_COOKIES_JSON: + ZHIHU_API_COOKIE = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON) +else: + ZHIHU_API_COOKIE = None + +# Full cookie string for HTML/JSON methods and fxzhihu fallback if ZHIHU_COOKIES_JSON: ZHIHU_COOKIES = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON) else: diff --git a/apps/api/src/services/scrapers/zhihu/content_processing.py b/apps/api/src/services/scrapers/zhihu/content_processing.py new file mode 100644 index 0000000..87dbe5a --- /dev/null +++ b/apps/api/src/services/scrapers/zhihu/content_processing.py @@ -0,0 +1,64 @@ +from urllib.parse import urlparse, parse_qs, unquote + +from bs4 import BeautifulSoup + + +def fix_images_and_links(html: str) -> str: + """ + Port of FxZhihu's fixImagesAndLinks: + - Replace data-actualsrc with src on img tags + - Remove tags preserving text content + """ + soup = BeautifulSoup(html, "html.parser") + for img in soup.find_all("img"): + actualsrc = img.get("data-actualsrc") + if actualsrc: + img["src"] = actualsrc + del img["data-actualsrc"] + for u_tag in soup.find_all("u"): + u_tag.unwrap() + return str(soup) + + +def extract_references(html: str) -> str: + """ + Port of FxZhihu's 
extractReference: + - Find tags with data-text, data-url, data-numero + - Return formatted reference list HTML + """ + soup = BeautifulSoup(html, "html.parser") + references = {} + for sup in soup.find_all("sup"): + text = sup.get("data-text") + url = sup.get("data-url", "") + numero = sup.get("data-numero") + if text and numero: + references[numero] = {"text": text, "url": url} + if not references: + return "" + sorted_refs = sorted(references.items(), key=lambda x: int(x[0])) + items = [] + for index, ref in sorted_refs: + url_html = f'{ref["url"]}' if ref["url"] else "" + items.append(f"
<li>{ref['text']}{url_html}</li>")

    <h2>参考</h2>
    <ol>
      {"".join(items)}
    </ol>
    ' + + +def unmask_zhihu_links(html: str) -> str: + """ + Port of FxZhihu's link unmasking: + - Decode https://link.zhihu.com/?target=... to actual URLs + """ + soup = BeautifulSoup(html, "html.parser") + for a_tag in soup.find_all("a", href=True): + href = a_tag["href"] + if href.startswith("https://link.zhihu.com/"): + try: + parsed = urlparse(href) + qs = parse_qs(parsed.query) + target = qs.get("target", [None])[0] + if target: + a_tag["href"] = unquote(target) + except Exception: + pass + return str(soup) diff --git a/template.env b/template.env index 0eb1ac7..84ac84c 100644 --- a/template.env +++ b/template.env @@ -124,6 +124,10 @@ REDDIT_PASSWORD= REDDIT_USERNAME= FXZHIHU_HOST= +# Zhihu z_c0 cookie for direct API authentication. Extract from browser cookies. +# Takes priority over conf/zhihu_cookies.json when set. +ZHIHU_Z_C0= + # General Webpage Scraping # Enable general webpage scraping for unrecognized URLs. Default: `false` GENERAL_SCRAPING_ON=false diff --git a/tests/test_zhihu_content_processing.py b/tests/test_zhihu_content_processing.py new file mode 100644 index 0000000..4e773c1 --- /dev/null +++ b/tests/test_zhihu_content_processing.py @@ -0,0 +1,58 @@ +import sys +import os + +# Import content_processing directly to avoid pulling in the full zhihu scraper +# which has heavy dependencies (fastfetchbot_shared, httpx, etc.) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "apps", "api", "src", "services", "scrapers", "zhihu")) +from content_processing import ( + fix_images_and_links, + extract_references, + unmask_zhihu_links, +) + + +def test_fix_images_replaces_data_actualsrc(): + html = '' + result = fix_images_and_links(html) + assert 'src="https://real.jpg"' in result + assert "data-actualsrc" not in result + + +def test_fix_images_preserves_normal_src(): + html = '' + result = fix_images_and_links(html) + assert 'src="https://normal.jpg"' in result + + +def test_fix_images_removes_u_tags(): + html = "

    <p>Hello <u>world</u></p>
    " + result = fix_images_and_links(html) + assert "<u>" not in result + assert "world" in result + + +def test_extract_references_with_refs(): + html = '

    Text<sup data-text="Ref 1" data-url="https://example.com" data-numero="1">[1]</sup>
    ' + result = extract_references(html) + assert "参考" in result + assert "Ref 1" in result + assert "https://example.com" in result

    <p>No references here</p>
    " + result = extract_references(html) + assert result == "" + + +def test_unmask_zhihu_links(): + html = '<a href="https://link.zhihu.com/?target=https%3A%2F%2Fexample.com">link</a>' + result = unmask_zhihu_links(html) + assert "https://example.com" in result + assert "link.zhihu.com" not in result + + +def test_unmask_preserves_normal_links(): + html = '<a href="https://example.com">link</a>' + result = unmask_zhihu_links(html) + assert 'href="https://example.com"' in result