-
Notifications
You must be signed in to change notification settings - Fork 4
feat: revive xiaohongshu scraping feature #60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a100dac
ee2d78b
9d2d1e3
e43dfad
0d8b5c3
6ae16ee
772adab
5e248bc
6bb5526
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -272,3 +272,4 @@ conf/* | |
| .DS_Store | ||
| /.claude/ | ||
| /apps/worker/conf/ | ||
| apps/worker/celerybeat-schedule.db | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,14 @@ | ||
| import asyncio | ||
| from typing import Any | ||
| from urllib.parse import urlparse | ||
|
|
||
| import httpx | ||
| import jmespath | ||
|
|
||
| from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType | ||
| from fastfetchbot_shared.utils.network import HEADERS | ||
| from src.config import JINJA2_ENV, HTTP_REQUEST_TIMEOUT | ||
| from .xhs.core import XiaoHongShuCrawler | ||
| from .xhs.client import XHSClient | ||
| from .xhs import proxy_account_pool | ||
|
|
||
| from fastfetchbot_shared.utils.logger import logger | ||
| from fastfetchbot_shared.utils.parse import ( | ||
| unix_timestamp_to_utc, | ||
| get_html_text_length, | ||
| wrap_text_into_html, | ||
| ) | ||
| from src.config import JINJA2_ENV, XHS_COOKIE_STRING, XHS_SIGN_SERVER_URL | ||
| from .adaptar import XhsSinglePostAdapter | ||
|
|
||
| environment = JINJA2_ENV | ||
| short_text_template = environment.get_template("xiaohongshu_short_text.jinja2") | ||
|
|
@@ -42,78 +33,51 @@ def __init__(self, url: str, data: Any, **kwargs): | |
| self.raw_content = None | ||
|
|
||
| async def get_item(self) -> dict: | ||
| await self.get_xiaohongshu() | ||
| await self._get_xiaohongshu() | ||
| return self.to_dict() | ||
|
|
||
| async def get_xiaohongshu(self) -> None: | ||
| if self.url.find("xiaohongshu.com") == -1: | ||
| async with httpx.AsyncClient() as client: | ||
| resp = await client.get( | ||
| self.url, | ||
| headers=HEADERS, | ||
| follow_redirects=True, | ||
| timeout=HTTP_REQUEST_TIMEOUT, | ||
| ) | ||
| if ( | ||
| resp.history | ||
| ): # if there is a redirect, the request will have a response chain | ||
| for h in resp.history: | ||
| print(h.status_code, h.url) | ||
| self.url = str(resp.url) | ||
| urlparser = urlparse(self.url) | ||
| self.id = urlparser.path.split("/")[-1] | ||
| crawler = XiaoHongShuCrawler() | ||
| account_pool = proxy_account_pool.create_account_pool() | ||
| crawler.init_config("xhs", "cookie", account_pool) | ||
| note_detail = None | ||
| for _ in range(5): | ||
| try: | ||
| note_detail = await crawler.start(id=self.id) | ||
| break | ||
| except Exception as e: | ||
| await asyncio.sleep(3) | ||
| logger.error(f"error: {e}") | ||
| logger.error(f"retrying...") | ||
| if not note_detail: | ||
| raise Exception("重试了这么多次还是无法签名成功,寄寄寄") | ||
| # logger.debug(f"json_data: {json.dumps(note_detail, ensure_ascii=False, indent=4)}") | ||
| parsed_data = self.process_note_json(note_detail) | ||
| await self.process_xiaohongshu_note(parsed_data) | ||
| async def _get_xiaohongshu(self) -> None: | ||
| async with XhsSinglePostAdapter( | ||
| cookies=XHS_COOKIE_STRING, | ||
| sign_server_endpoint=XHS_SIGN_SERVER_URL, | ||
| ) as adapter: | ||
| result = await adapter.fetch_post(note_url=self.url) | ||
|
Comment on lines
+40
to
+44
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The new flow performs a single Useful? React with 👍 / 👎. |
||
| note = result["note"] | ||
| self.id = note.get("note_id") | ||
| self.url = result["url"] | ||
| await self._process_xiaohongshu_note(note) | ||
|
|
||
| async def process_xiaohongshu_note(self, json_data: dict): | ||
| async def _process_xiaohongshu_note(self, json_data: dict): | ||
| user = json_data.get("user", {}) or {} | ||
| self.title = json_data.get("title") | ||
| self.author = json_data.get("author") | ||
| self.author = user.get("nickname") | ||
| if not self.title and self.author: | ||
| self.title = f"{self.author}的小红书笔记" | ||
| self.author_url = "https://www.xiaohongshu.com/user/profile/" + json_data.get( | ||
| "user_id" | ||
| self.author_url = ( | ||
| "https://www.xiaohongshu.com/user/profile/" + user.get("user_id", "") | ||
| ) | ||
| self.raw_content = json_data.get("raw_content") | ||
| logger.debug(f"{json_data.get('created')}") | ||
| self.raw_content = json_data.get("desc", "") | ||
| raw_time = json_data.get("time", 0) | ||
| raw_updated = json_data.get("last_update_time", 0) | ||
| self.created = ( | ||
| unix_timestamp_to_utc(json_data.get("created") / 1000) | ||
| if json_data.get("created") | ||
| else None | ||
| unix_timestamp_to_utc(int(raw_time) / 1000) if raw_time else None | ||
| ) | ||
| self.updated = ( | ||
| unix_timestamp_to_utc(json_data.get("updated") / 1000) | ||
| if json_data.get("updated") | ||
| else None | ||
| unix_timestamp_to_utc(int(raw_updated) / 1000) if raw_updated else None | ||
| ) | ||
| self.like_count = json_data.get("like_count") | ||
| self.like_count = json_data.get("liked_count") | ||
| self.collected_count = json_data.get("collected_count") | ||
| self.comment_count = json_data.get("comment_count") | ||
| self.share_count = json_data.get("share_count") | ||
| self.ip_location = json_data.get("ip_location") | ||
| if json_data.get("image_list"): | ||
| for image_url in json_data.get("image_list"): | ||
| self.media_files.append(MediaFile(url=image_url, media_type="image")) | ||
| if json_data.get("video"): | ||
| self.media_files.append( | ||
| MediaFile(url=json_data.get("video"), media_type="video") | ||
| ) | ||
| for image_url in json_data.get("image_list", []) or []: | ||
| self.media_files.append(MediaFile(url=image_url, media_type="image")) | ||
| video_urls = json_data.get("video_urls", []) or [] | ||
| if video_urls: | ||
| self.media_files.append(MediaFile(url=video_urls[0], media_type="video")) | ||
| data = self.__dict__ | ||
| data["raw_content"] = data["raw_content"].replace("\t", "") | ||
| raw_content = self.raw_content or "" | ||
| data["raw_content"] = raw_content.replace("\t", "") | ||
| if data["raw_content"].endswith("\n"): | ||
| data["raw_content"] = data["raw_content"][:-1] | ||
| self.text = short_text_template.render(data=data) | ||
|
|
@@ -124,30 +88,7 @@ async def process_xiaohongshu_note(self, json_data: dict): | |
| if media_file.media_type == "image": | ||
| data["raw_content"] += f'<p><img src="{media_file.url}" alt=""/></p>' | ||
| elif media_file.media_type == "video": | ||
| data[ | ||
| "raw_content" | ||
| ] += ( | ||
| data["raw_content"] += ( | ||
| f'<p><video src="{media_file.url}" controls="controls"></video></p>' | ||
| ) | ||
| self.content = content_template.render(data=data) | ||
|
|
||
| @staticmethod | ||
| def process_note_json(json_data: dict): | ||
| expression = """ | ||
| { | ||
| title: title, | ||
| raw_content: desc, | ||
| author: user.nickname, | ||
| user_id: user.user_id, | ||
| image_list: image_list[*].url, | ||
| video: video.media.stream.h264[0].master_url, | ||
| like_count: interact_info.liked_count, | ||
| collected_count: interact_info.collected_count, | ||
| comment_count: interact_info.comment_count, | ||
| share_count: interact_info.share_count, | ||
| ip_location: ip_location, | ||
| created: time, | ||
| updated: last_update_time | ||
| } | ||
| """ | ||
| return jmespath.search(expression, json_data) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a language identifier to the fenced code block.
The cookie-file example block has no language specifier, which triggers markdownlint
MD040. Since it's a plain text example, use `text` (or `sh`).
🔧 Proposed fix
🧰 Tools
🪛 markdownlint-cli2 (0.21.0)
[warning] 182-182: Fenced code blocks should have a language specified
(MD040, fenced-code-language)
🤖 Prompt for AI Agents