Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/api/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@

# Zhihu
FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com")
ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None)

zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json")
if os.path.exists(zhihu_cookie_path):
Expand Down
111 changes: 77 additions & 34 deletions apps/api/src/services/scrapers/zhihu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
ZHIHU_HOST,
ALL_METHODS,
ZHIHU_COOKIES,
ZHIHU_API_ANSWER_PARAMS
ZHIHU_API_COOKIE,
ZHIHU_API_ANSWER_PARAMS,
)
from .content_processing import fix_images_and_links, extract_references, unmask_zhihu_links
from fastfetchbot_shared.utils.logger import logger

environment = JINJA2_ENV
Expand Down Expand Up @@ -115,16 +117,19 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
self.retweeted: bool = False
# reqeust fields
self.httpx_client = zhihu_client
self.headers = {"User-Agent": get_random_user_agent(),
"Accept": "*/*",
"Referer": self.url,
"Connection": "keep-alive",
}
self.headers = {
"User-Agent": "node",
"Accept": "*/*",
"Referer": self.url,
"Connection": "keep-alive",
}
if ZHIHU_API_COOKIE:
self.headers["Cookie"] = ZHIHU_API_COOKIE
if kwargs.get("cookie"):
self.headers["Cookie"] = kwargs.get("cookie")
if ZHIHU_COOKIES:
elif ZHIHU_COOKIES:
self.headers["Cookie"] = ZHIHU_COOKIES
self.method = kwargs.get("method", "fxzhihu")
self.method = kwargs.get("method", "api")
self.urlparser = urlparse(self.url)
self.api_url = ""
self.status_id = ""
Expand Down Expand Up @@ -163,7 +168,7 @@ async def _get_zhihu_item(self) -> None:
for method in ALL_METHODS:
try:
if self.method not in ALL_METHODS:
self.method = "json"
self.method = "api"
else:
self.method = method
await self._get_request_url()
Expand Down Expand Up @@ -265,17 +270,11 @@ async def _get_request_url(self) -> None:
elif self.zhihu_type == "article":
if self.method == "api":
self.request_url = (
ZHIHU_COLUMNS_API_HOST
ZHIHU_API_HOST
+ "/articles/"
+ self.article_id
+ "?"
+ ZHIHU_API_ANSWER_PARAMS
)
return
# TODO: There are two api url to get a single article. The first one may fail in the future.
# Therefore, I remain the second one.
# self.request_url = (
# ZHIHU_COLUMNS_API_HOST_V2 + self.article_id + "?" + ZHIHU_API_ANSWER_PARAMS)
elif self.zhihu_type == "status":
if self.method == "api":
self.request_url = (
Expand Down Expand Up @@ -322,6 +321,10 @@ async def _get_zhihu_answer(self) -> None:
if answer_data == {}:
raise Exception("Cannot get the answer")
self._resolve_answer_json_data(answer_data)
# Apply FxZhihu-style content processing for api method
if self.method == "api":
self.raw_content = fix_images_and_links(self.raw_content)
self.raw_content = unmask_zhihu_links(self.raw_content)
else:
try:
selector = await get_selector(self.request_url, headers=self.headers)
Expand Down Expand Up @@ -360,11 +363,15 @@ async def _get_zhihu_status(self):
"""
if self.method in ["api", "fxzhihu"]:
json_data = await get_response_json(self.request_url, headers=self.headers, client=self.httpx_client)
data = self._resolve_status_api_data(json_data) # TODO: separate the function to resolve the api data
data = self._resolve_status_api_data(json_data)
self.author = data["author"]
self.author_url = data["author_url"]
self.title = data["author"] + "的想法"
self.raw_content = json_data["content_html"]
self.raw_content = json_data.get("content_html", "")
# Apply FxZhihu-style content processing for api method
if self.method == "api":
self.raw_content = fix_images_and_links(self.raw_content)
self.raw_content = unmask_zhihu_links(self.raw_content)
self.media_files.extend(data["media_files"])
self.date = unix_timestamp_to_utc(data["created"])
self.updated = unix_timestamp_to_utc(data["updated"])
Expand Down Expand Up @@ -540,6 +547,17 @@ async def _get_zhihu_article(self):
self.author = json_data["author"]["name"]
self.author_url = json_data["author"]["url"]
self.upvote = json_data["voteup_count"]
self.comment_count = json_data.get("comment_count", 0)
self.date = unix_timestamp_to_utc(json_data.get("created", 0))
self.updated = unix_timestamp_to_utc(json_data.get("updated", 0))
if json_data.get("column"):
self.column = json_data["column"].get("title", "")
self.column_url = json_data["column"].get("url", "")
self.column_intro = json_data["column"].get("intro", "")
# Apply FxZhihu-style content processing for api method
if self.method == "api":
self.raw_content = fix_images_and_links(self.raw_content)
self.raw_content = unmask_zhihu_links(self.raw_content)
except Exception as e:
raise Exception("zhihu request failed")
else:
Expand Down Expand Up @@ -717,21 +735,29 @@ def _parse_article_json_data(self, data: Dict) -> Dict:

@staticmethod
def _resolve_status_api_data(data: Dict) -> Dict:
# Handle both API response formats (reaction.statistics vs direct fields)
if "reaction" in data:
like_count = data["reaction"]["statistics"]["up_vote_count"]
comment_count = data["reaction"]["statistics"]["comment_count"]
else:
like_count = data.get("like_count", 0)
comment_count = data.get("comment_count", 0)

result = {
"author": data["author"]["name"],
"author_url": ZHIHU_HOST + "/people/" + data["author"]["url_token"],
"author_url": ZHIHU_HOST + "/people/" + data["author"].get("url_token", ""),
"created": data["created"],
"updated": data["updated"],
"text": None,
"raw_content": data["content_html"],
"like_count": data["like_count"],
"comment_count": data["comment_count"],
"raw_content": data.get("content_html", ""),
"like_count": like_count,
"comment_count": comment_count,
"media_files": [],
"origin_pin_id": None,
}
for content in data["content"]:
for content in data.get("content", []):
if content["type"] == "text":
result["text"] = content["content"]
result["text"] = content.get("content", "")
elif content["type"] == "image":
media_item = MediaFile.from_dict(
{
Expand All @@ -742,16 +768,33 @@ def _resolve_status_api_data(data: Dict) -> Dict:
)
result["media_files"].append(media_item)
elif content["type"] == "video":
media_item = MediaFile.from_dict(
{
"media_type": "video",
"url": content["video_info"]["playlist"]["hd"]["play_url"],
"caption": "",
}
)
result["media_files"].append(media_item)
if "origin_pin" in data:
result["origin_pin_id"] = data["origin_pin"]["id"]
# Try HD quality first, fallback to any available
video_url = None
if "video_info" in content:
playlist = content["video_info"].get("playlist", {})
if "hd" in playlist:
video_url = playlist["hd"].get("play_url")
elif playlist:
first_quality = next(iter(playlist.values()), {})
video_url = first_quality.get("play_url")
elif "playlist" in content:
for item in content["playlist"]:
if item.get("quality") == "hd":
video_url = item.get("url")
break
if not video_url and content["playlist"]:
video_url = content["playlist"][0].get("url")
if video_url:
media_item = MediaFile.from_dict(
{
"media_type": "video",
"url": video_url,
"caption": "",
}
)
result["media_files"].append(media_item)
if "origin_pin" in data and data["origin_pin"]:
result["origin_pin_id"] = str(data["origin_pin"]["id"])
result["origin_pin_data"] = Zhihu._resolve_status_api_data(data["origin_pin"])
return result

Expand Down
24 changes: 14 additions & 10 deletions apps/api/src/services/scrapers/zhihu/config.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
from src.config import ZHIHU_COOKIES_JSON
from src.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0

SHORT_LIMIT = 600
ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api"
ZHIHU_COLUMNS_API_HOST_V2 = "https://api.zhihu.com/article/"
ZHIHU_API_HOST = "https://www.zhihu.com/api/v4"
ZHIHU_API_ANSWER_PARAMS = ("include=content%2Cexcerpt%2Cauthor%2Cvoteup_count%2Ccomment_count%2Cquestion%2Ccreated_time"
"%2Cquestion.detail")
ZHIHU_API_ANSWER_PARAMS = "include=content,excerpt,voteup_count,comment_count,question.detail"
ZHIHU_HOST = "https://www.zhihu.com"
ALL_METHODS = ["fxzhihu"]
ALL_METHODS = ["api", "fxzhihu"]
"""
There are three methods to get zhihu item: from zhihu v4 api(api), a json object in the html script(json),
or parsing the html page content directly.
For most occasions, the api method is the best choice. But Zhihu official api only opens for status and article.
Therefore, we must use the json method to get the answer. And if one of the above two methods fails, the get_item method
would try to parse the html page content directly.
You can also pass the method as a parameter when initializing the Zhihu object. If not, the default method is api.
Methods: "api" calls Zhihu API v4 directly (ported from FxZhihu), "fxzhihu" calls external FxZhihu server as fallback.
The "json" method parses HTML script tags, "html" parses page content directly.
"""

# Cookie for direct API calls: prefer ZHIHU_Z_C0 env var, fall back to cookies JSON
if ZHIHU_Z_C0:
ZHIHU_API_COOKIE = f"z_c0={ZHIHU_Z_C0}"
elif ZHIHU_COOKIES_JSON:
ZHIHU_API_COOKIE = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON)
else:
ZHIHU_API_COOKIE = None

# Full cookie string for HTML/JSON methods and fxzhihu fallback
if ZHIHU_COOKIES_JSON:
ZHIHU_COOKIES = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON)
else:
Expand Down
64 changes: 64 additions & 0 deletions apps/api/src/services/scrapers/zhihu/content_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup


def fix_images_and_links(html: str) -> str:
"""
Port of FxZhihu's fixImagesAndLinks:
- Replace data-actualsrc with src on img tags
- Remove <u> tags preserving text content
"""
soup = BeautifulSoup(html, "html.parser")
for img in soup.find_all("img"):
actualsrc = img.get("data-actualsrc")
if actualsrc:
img["src"] = actualsrc
del img["data-actualsrc"]
for u_tag in soup.find_all("u"):
u_tag.unwrap()
return str(soup)


def extract_references(html: str) -> str:
"""
Port of FxZhihu's extractReference:
- Find <sup> tags with data-text, data-url, data-numero
- Return formatted reference list HTML
"""
soup = BeautifulSoup(html, "html.parser")
references = {}
for sup in soup.find_all("sup"):
text = sup.get("data-text")
url = sup.get("data-url", "")
numero = sup.get("data-numero")
if text and numero:
references[numero] = {"text": text, "url": url}
if not references:
return ""
sorted_refs = sorted(references.items(), key=lambda x: int(x[0]))
items = []
for index, ref in sorted_refs:
url_html = f'<a href="{ref["url"]}">{ref["url"]}</a>' if ref["url"] else ""
items.append(f"<li>{ref['text']}{url_html}</li>")
return f'<hr><section><h2>参考</h2><ol>{"".join(items)}</ol></section>'


def unmask_zhihu_links(html: str) -> str:
"""
Port of FxZhihu's link unmasking:
- Decode https://link.zhihu.com/?target=... to actual URLs
"""
soup = BeautifulSoup(html, "html.parser")
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
if href.startswith("https://link.zhihu.com/"):
try:
parsed = urlparse(href)
qs = parse_qs(parsed.query)
target = qs.get("target", [None])[0]
if target:
a_tag["href"] = unquote(target)
except Exception:
pass
Comment on lines +56 to +63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Consider logging failed URL unmasking attempts.

The silent except: pass makes debugging difficult when URL parsing fails unexpectedly. Per coding guidelines, Loguru should be used for logging. Consider logging at debug level to aid troubleshooting without cluttering normal output.

🛠️ Proposed fix
+from fastfetchbot_shared.utils.logger import logger
 from urllib.parse import urlparse, parse_qs, unquote
             try:
                 parsed = urlparse(href)
                 qs = parse_qs(parsed.query)
                 target = qs.get("target", [None])[0]
                 if target:
                     a_tag["href"] = unquote(target)
-            except Exception:
-                pass
+            except Exception as e:
+                logger.debug(f"Failed to unmask Zhihu link {href}: {e}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try:
parsed = urlparse(href)
qs = parse_qs(parsed.query)
target = qs.get("target", [None])[0]
if target:
a_tag["href"] = unquote(target)
except Exception:
pass
try:
parsed = urlparse(href)
qs = parse_qs(parsed.query)
target = qs.get("target", [None])[0]
if target:
a_tag["href"] = unquote(target)
except Exception as e:
logger.debug(f"Failed to unmask Zhihu link {href}: {e}")
🧰 Tools
🪛 Ruff (0.15.6)

[error] 62-63: try-except-pass detected, consider logging the exception

(S110)


[warning] 62-62: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/api/src/services/scrapers/zhihu/content_processing.py` around lines 56 -
63, The try/except in the URL unmasking block silently swallows errors; update
the block in content_processing.py that parses href (uses urlparse, parse_qs,
and unquote to set a_tag["href"]) to catch Exception as e and log a debug-level
message with the original href and exception details using Loguru (ensure logger
is imported/available), then proceed without raising; include contextual text
like "failed to unmask href" in the log so failures are discoverable during
debugging.

return str(soup)
4 changes: 4 additions & 0 deletions template.env
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ REDDIT_PASSWORD=
REDDIT_USERNAME=
FXZHIHU_HOST=

# Zhihu z_c0 cookie for direct API authentication. Extract from browser cookies.
# Takes priority over conf/zhihu_cookies.json when set.
ZHIHU_Z_C0=

# General Webpage Scraping
# Enable general webpage scraping for unrecognized URLs. Default: `false`
GENERAL_SCRAPING_ON=false
Expand Down
58 changes: 58 additions & 0 deletions tests/test_zhihu_content_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import sys
import os

# Import content_processing directly to avoid pulling in the full zhihu scraper
# which has heavy dependencies (fastfetchbot_shared, httpx, etc.)
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "apps", "api", "src", "services", "scrapers", "zhihu"))
from content_processing import (
fix_images_and_links,
extract_references,
unmask_zhihu_links,
)


def test_fix_images_replaces_data_actualsrc():
html = '<img src="placeholder.jpg" data-actualsrc="https://real.jpg">'
result = fix_images_and_links(html)
assert 'src="https://real.jpg"' in result
assert "data-actualsrc" not in result


def test_fix_images_preserves_normal_src():
html = '<img src="https://normal.jpg">'
result = fix_images_and_links(html)
assert 'src="https://normal.jpg"' in result


def test_fix_images_removes_u_tags():
html = "<p>Hello <u>world</u></p>"
result = fix_images_and_links(html)
assert "<u>" not in result
assert "world" in result


def test_extract_references_with_refs():
html = '<p>Text<sup data-text="Ref 1" data-url="https://example.com" data-numero="1">[1]</sup></p>'
result = extract_references(html)
assert "参考" in result
assert "Ref 1" in result
assert "https://example.com" in result


def test_extract_references_empty():
html = "<p>No references here</p>"
result = extract_references(html)
assert result == ""


def test_unmask_zhihu_links():
html = '<a href="https://link.zhihu.com/?target=https%3A%2F%2Fexample.com">link</a>'
result = unmask_zhihu_links(html)
assert "https://example.com" in result
assert "link.zhihu.com" not in result


def test_unmask_preserves_normal_links():
html = '<a href="https://example.com">link</a>'
result = unmask_zhihu_links(html)
assert 'href="https://example.com"' in result