From d86e9268ff64d64e0d22f14a175559d3fa94ffc6 Mon Sep 17 00:00:00 2001 From: aturret Date: Thu, 19 Mar 2026 22:19:21 -0500 Subject: [PATCH] feat: integrate FxZhihu implementation --- apps/api/src/config.py | 1 + .../src/services/scrapers/zhihu/__init__.py | 111 ++++++++++++------ .../api/src/services/scrapers/zhihu/config.py | 24 ++-- .../scrapers/zhihu/content_processing.py | 64 ++++++++++ template.env | 4 + tests/test_zhihu_content_processing.py | 58 +++++++++ 6 files changed, 218 insertions(+), 44 deletions(-) create mode 100644 apps/api/src/services/scrapers/zhihu/content_processing.py create mode 100644 tests/test_zhihu_content_processing.py diff --git a/apps/api/src/config.py b/apps/api/src/config.py index 1c3086d..12a339b 100644 --- a/apps/api/src/config.py +++ b/apps/api/src/config.py @@ -123,6 +123,7 @@ # Zhihu FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") +ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None) zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") if os.path.exists(zhihu_cookie_path): diff --git a/apps/api/src/services/scrapers/zhihu/__init__.py b/apps/api/src/services/scrapers/zhihu/__init__.py index 33e5284..b8c8a1a 100644 --- a/apps/api/src/services/scrapers/zhihu/__init__.py +++ b/apps/api/src/services/scrapers/zhihu/__init__.py @@ -26,8 +26,10 @@ ZHIHU_HOST, ALL_METHODS, ZHIHU_COOKIES, - ZHIHU_API_ANSWER_PARAMS + ZHIHU_API_COOKIE, + ZHIHU_API_ANSWER_PARAMS, ) +from .content_processing import fix_images_and_links, extract_references, unmask_zhihu_links from fastfetchbot_shared.utils.logger import logger environment = JINJA2_ENV @@ -115,16 +117,19 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.retweeted: bool = False # reqeust fields self.httpx_client = zhihu_client - self.headers = {"User-Agent": get_random_user_agent(), - "Accept": "*/*", - "Referer": self.url, - "Connection": "keep-alive", - } + self.headers = { + "User-Agent": "node", + "Accept": "*/*", + "Referer": self.url, + "Connection": 
"keep-alive", + } + if ZHIHU_API_COOKIE: + self.headers["Cookie"] = ZHIHU_API_COOKIE if kwargs.get("cookie"): self.headers["Cookie"] = kwargs.get("cookie") - if ZHIHU_COOKIES: + elif ZHIHU_COOKIES: self.headers["Cookie"] = ZHIHU_COOKIES - self.method = kwargs.get("method", "fxzhihu") + self.method = kwargs.get("method", "api") self.urlparser = urlparse(self.url) self.api_url = "" self.status_id = "" @@ -163,7 +168,7 @@ async def _get_zhihu_item(self) -> None: for method in ALL_METHODS: try: if self.method not in ALL_METHODS: - self.method = "json" + self.method = "api" else: self.method = method await self._get_request_url() @@ -265,17 +270,11 @@ async def _get_request_url(self) -> None: elif self.zhihu_type == "article": if self.method == "api": self.request_url = ( - ZHIHU_COLUMNS_API_HOST + ZHIHU_API_HOST + "/articles/" + self.article_id - + "?" - + ZHIHU_API_ANSWER_PARAMS ) return - # TODO: There are two api url to get a single article. The first one may fail in the future. - # Therefore, I remain the second one. - # self.request_url = ( - # ZHIHU_COLUMNS_API_HOST_V2 + self.article_id + "?" 
+ ZHIHU_API_ANSWER_PARAMS) elif self.zhihu_type == "status": if self.method == "api": self.request_url = ( @@ -322,6 +321,10 @@ async def _get_zhihu_answer(self) -> None: if answer_data == {}: raise Exception("Cannot get the answer") self._resolve_answer_json_data(answer_data) + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) else: try: selector = await get_selector(self.request_url, headers=self.headers) @@ -360,11 +363,15 @@ async def _get_zhihu_status(self): """ if self.method in ["api", "fxzhihu"]: json_data = await get_response_json(self.request_url, headers=self.headers, client=self.httpx_client) - data = self._resolve_status_api_data(json_data) # TODO: separate the function to resolve the api data + data = self._resolve_status_api_data(json_data) self.author = data["author"] self.author_url = data["author_url"] self.title = data["author"] + "的想法" - self.raw_content = json_data["content_html"] + self.raw_content = json_data.get("content_html", "") + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) self.media_files.extend(data["media_files"]) self.date = unix_timestamp_to_utc(data["created"]) self.updated = unix_timestamp_to_utc(data["updated"]) @@ -540,6 +547,17 @@ async def _get_zhihu_article(self): self.author = json_data["author"]["name"] self.author_url = json_data["author"]["url"] self.upvote = json_data["voteup_count"] + self.comment_count = json_data.get("comment_count", 0) + self.date = unix_timestamp_to_utc(json_data.get("created", 0)) + self.updated = unix_timestamp_to_utc(json_data.get("updated", 0)) + if json_data.get("column"): + self.column = json_data["column"].get("title", "") + self.column_url = json_data["column"].get("url", "") + 
self.column_intro = json_data["column"].get("intro", "") + # Apply FxZhihu-style content processing for api method + if self.method == "api": + self.raw_content = fix_images_and_links(self.raw_content) + self.raw_content = unmask_zhihu_links(self.raw_content) except Exception as e: raise Exception("zhihu request failed") else: @@ -717,21 +735,29 @@ def _parse_article_json_data(self, data: Dict) -> Dict: @staticmethod def _resolve_status_api_data(data: Dict) -> Dict: + # Handle both API response formats (reaction.statistics vs direct fields) + if "reaction" in data: + like_count = data["reaction"]["statistics"]["up_vote_count"] + comment_count = data["reaction"]["statistics"]["comment_count"] + else: + like_count = data.get("like_count", 0) + comment_count = data.get("comment_count", 0) + result = { "author": data["author"]["name"], - "author_url": ZHIHU_HOST + "/people/" + data["author"]["url_token"], + "author_url": ZHIHU_HOST + "/people/" + data["author"].get("url_token", ""), "created": data["created"], "updated": data["updated"], "text": None, - "raw_content": data["content_html"], - "like_count": data["like_count"], - "comment_count": data["comment_count"], + "raw_content": data.get("content_html", ""), + "like_count": like_count, + "comment_count": comment_count, "media_files": [], "origin_pin_id": None, } - for content in data["content"]: + for content in data.get("content", []): if content["type"] == "text": - result["text"] = content["content"] + result["text"] = content.get("content", "") elif content["type"] == "image": media_item = MediaFile.from_dict( { @@ -742,16 +768,33 @@ def _resolve_status_api_data(data: Dict) -> Dict: ) result["media_files"].append(media_item) elif content["type"] == "video": - media_item = MediaFile.from_dict( - { - "media_type": "video", - "url": content["video_info"]["playlist"]["hd"]["play_url"], - "caption": "", - } - ) - result["media_files"].append(media_item) - if "origin_pin" in data: - result["origin_pin_id"] = 
data["origin_pin"]["id"] + # Try HD quality first, fallback to any available + video_url = None + if "video_info" in content: + playlist = content["video_info"].get("playlist", {}) + if "hd" in playlist: + video_url = playlist["hd"].get("play_url") + elif playlist: + first_quality = next(iter(playlist.values()), {}) + video_url = first_quality.get("play_url") + elif "playlist" in content: + for item in content["playlist"]: + if item.get("quality") == "hd": + video_url = item.get("url") + break + if not video_url and content["playlist"]: + video_url = content["playlist"][0].get("url") + if video_url: + media_item = MediaFile.from_dict( + { + "media_type": "video", + "url": video_url, + "caption": "", + } + ) + result["media_files"].append(media_item) + if "origin_pin" in data and data["origin_pin"]: + result["origin_pin_id"] = str(data["origin_pin"]["id"]) result["origin_pin_data"] = Zhihu._resolve_status_api_data(data["origin_pin"]) return result diff --git a/apps/api/src/services/scrapers/zhihu/config.py b/apps/api/src/services/scrapers/zhihu/config.py index 117df27..4d44fc8 100644 --- a/apps/api/src/services/scrapers/zhihu/config.py +++ b/apps/api/src/services/scrapers/zhihu/config.py @@ -1,22 +1,26 @@ -from src.config import ZHIHU_COOKIES_JSON +from src.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0 SHORT_LIMIT = 600 ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api" ZHIHU_COLUMNS_API_HOST_V2 = "https://api.zhihu.com/article/" ZHIHU_API_HOST = "https://www.zhihu.com/api/v4" -ZHIHU_API_ANSWER_PARAMS = ("include=content%2Cexcerpt%2Cauthor%2Cvoteup_count%2Ccomment_count%2Cquestion%2Ccreated_time" - "%2Cquestion.detail") +ZHIHU_API_ANSWER_PARAMS = "include=content,excerpt,voteup_count,comment_count,question.detail" ZHIHU_HOST = "https://www.zhihu.com" -ALL_METHODS = ["fxzhihu"] +ALL_METHODS = ["api", "fxzhihu"] """ -There are three methods to get zhihu item: from zhihu v4 api(api), a json object in the html script(json), - or parsing the html page content 
directly. - For most occasions, the api method is the best choice. But Zhihu official api only opens for status and article. - Therefore, we must use the json method to get the answer. And if one of the above two methods fails, the get_item method - would try to parse the html page content directly. - You can also pass the method as a parameter when initializing the Zhihu object. If not, the default method is api. +Methods: "api" calls Zhihu API v4 directly (ported from FxZhihu), "fxzhihu" calls external FxZhihu server as fallback. +The "json" method parses HTML script tags, "html" parses page content directly. """ +# Cookie for direct API calls: prefer ZHIHU_Z_C0 env var, fall back to cookies JSON +if ZHIHU_Z_C0: + ZHIHU_API_COOKIE = f"z_c0={ZHIHU_Z_C0}" +elif ZHIHU_COOKIES_JSON: + ZHIHU_API_COOKIE = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON) +else: + ZHIHU_API_COOKIE = None + +# Full cookie string for HTML/JSON methods and fxzhihu fallback if ZHIHU_COOKIES_JSON: ZHIHU_COOKIES = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON) else: diff --git a/apps/api/src/services/scrapers/zhihu/content_processing.py b/apps/api/src/services/scrapers/zhihu/content_processing.py new file mode 100644 index 0000000..87dbe5a --- /dev/null +++ b/apps/api/src/services/scrapers/zhihu/content_processing.py @@ -0,0 +1,64 @@ +from urllib.parse import urlparse, parse_qs, unquote + +from bs4 import BeautifulSoup + + +def fix_images_and_links(html: str) -> str: + """ + Port of FxZhihu's fixImagesAndLinks: + - Replace data-actualsrc with src on img tags + - Remove tags preserving text content + """ + soup = BeautifulSoup(html, "html.parser") + for img in soup.find_all("img"): + actualsrc = img.get("data-actualsrc") + if actualsrc: + img["src"] = actualsrc + del img["data-actualsrc"] + for u_tag in soup.find_all("u"): + u_tag.unwrap() + return str(soup) + + +def extract_references(html: str) -> str: + """ + Port of FxZhihu's 
extractReference: + - Find tags with data-text, data-url, data-numero + - Return formatted reference list HTML + """ + soup = BeautifulSoup(html, "html.parser") + references = {} + for sup in soup.find_all("sup"): + text = sup.get("data-text") + url = sup.get("data-url", "") + numero = sup.get("data-numero") + if text and numero: + references[numero] = {"text": text, "url": url} + if not references: + return "" + sorted_refs = sorted(references.items(), key=lambda x: int(x[0])) + items = [] + for index, ref in sorted_refs: + url_html = f'{ref["url"]}' if ref["url"] else "" + items.append(f"
<li>{ref['text']}{url_html}</li>")

    <h2>参考</h2>
    <ol>
      {"".join(items)}
    </ol>
    ' + + +def unmask_zhihu_links(html: str) -> str: + """ + Port of FxZhihu's link unmasking: + - Decode https://link.zhihu.com/?target=... to actual URLs + """ + soup = BeautifulSoup(html, "html.parser") + for a_tag in soup.find_all("a", href=True): + href = a_tag["href"] + if href.startswith("https://link.zhihu.com/"): + try: + parsed = urlparse(href) + qs = parse_qs(parsed.query) + target = qs.get("target", [None])[0] + if target: + a_tag["href"] = unquote(target) + except Exception: + pass + return str(soup) diff --git a/template.env b/template.env index 0eb1ac7..84ac84c 100644 --- a/template.env +++ b/template.env @@ -124,6 +124,10 @@ REDDIT_PASSWORD= REDDIT_USERNAME= FXZHIHU_HOST= +# Zhihu z_c0 cookie for direct API authentication. Extract from browser cookies. +# Takes priority over conf/zhihu_cookies.json when set. +ZHIHU_Z_C0= + # General Webpage Scraping # Enable general webpage scraping for unrecognized URLs. Default: `false` GENERAL_SCRAPING_ON=false diff --git a/tests/test_zhihu_content_processing.py b/tests/test_zhihu_content_processing.py new file mode 100644 index 0000000..4e773c1 --- /dev/null +++ b/tests/test_zhihu_content_processing.py @@ -0,0 +1,58 @@ +import sys +import os + +# Import content_processing directly to avoid pulling in the full zhihu scraper +# which has heavy dependencies (fastfetchbot_shared, httpx, etc.) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "apps", "api", "src", "services", "scrapers", "zhihu")) +from content_processing import ( + fix_images_and_links, + extract_references, + unmask_zhihu_links, +) + + +def test_fix_images_replaces_data_actualsrc(): + html = '' + result = fix_images_and_links(html) + assert 'src="https://real.jpg"' in result + assert "data-actualsrc" not in result + + +def test_fix_images_preserves_normal_src(): + html = '' + result = fix_images_and_links(html) + assert 'src="https://normal.jpg"' in result + + +def test_fix_images_removes_u_tags(): + html = "

    <p>Hello <u>world</u></p>
    " + result = fix_images_and_links(html) + assert "<u>" not in result + assert "world" in result + + +def test_extract_references_with_refs(): + html = '

    Text<sup data-text="Ref 1" data-url="https://example.com" data-numero="1">[1]</sup>
    ' + result = extract_references(html) + assert "参考" in result + assert "Ref 1" in result + assert "https://example.com" in result

    <p>No references here</p>
    " + result = extract_references(html) + assert result == "" + + +def test_unmask_zhihu_links(): + html = '<a href="https://link.zhihu.com/?target=https%3A%2F%2Fexample.com">link</a>' + result = unmask_zhihu_links(html) + assert "https://example.com" in result + assert "link.zhihu.com" not in result + + +def test_unmask_preserves_normal_links(): + html = '<a href="https://example.com">link</a>' + result = unmask_zhihu_links(html) + assert 'href="https://example.com"' in result