Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion mission_blue.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from alive_progress import alive_bar
from alive_progress.animations.bars import bar_factory
import click
import scraper

# pylint: disable=C0301

Expand Down Expand Up @@ -276,6 +277,7 @@ def generate_query_params(
}



def search_posts(params, token):
# pylint: disable=E1102
# pylint: disable=C0301
Expand Down Expand Up @@ -543,7 +545,7 @@ def main(query="", sort="", since="", until="", mentions="", author="", lang="",

# Fetch posts
print("Fetching posts...")
raw_posts = search_posts(query_param, access_token)
raw_posts = scraper.search_posts(query_param, access_token)

# Extract post data
print("Extracting post data...")
Expand Down
84 changes: 84 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
This module contains a function to search for posts using the BlueSky API.
"""
import requests
from alive_progress import alive_bar

def search_posts(params, token):
# pylint: disable=E1102
# pylint: disable=C0301

"""
Search for posts using the BlueSky API.

Args:
params (dict): The query parameters for the API request.
- query (str, required): The search term for the BlueSky posts.
- sort (str, optional): The sorting criteria for results.
Options include "top" for top posts or "latest" for the latest posts.
- since (str, optional): The start date for posts (ISO 8601 format).
- until (str, optional): The end date for posts (ISO 8601 format).
- mentions (str, optional): Mentions to filter posts by.
- Handles will be resolved to DIDs using the provided token.
- author (str, optional): The author of the posts (handle or DID).
- lang (str, optional): The language of the posts.
- domain (str, optional): A domain URL included in the posts.
- url (str, optional): A specific URL included in the posts.
- tags (list, optional): Tags to filter posts by (each tag <= 640 characters).
- limit (int, optional): The maximum number of posts to retrieve in a single response.
Defaults to 25.
- cursor (str, optional): Pagination token for continuing from a previous request.
- posts_limit (int, optional): The maximum number of posts to retrieve across all responses.
Defaults to 500.

Returns:
list: A list of posts matching the search criteria.

Notes:
- Progress is displayed using a progress bar indicating the number of posts fetched.
- Handles pagination automatically until `posts_limit` is reached or no further results are available.
- Logs and returns partial results if an error occurs during fetching.
"""
posts = []
url = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}

total_fetched = 0
posts_limit = params.get("posts_limit", 1000)

with alive_bar(posts_limit) as progress:
while True:
try:
response = requests.get(url, headers=headers, params=params, timeout=10)
# print(response)
response.raise_for_status()
data = response.json()

#Check if we have reached our overall posts limit
new_posts = data.get("posts", [])
posts.extend(new_posts)
total_fetched += len(new_posts)

#Update progress bar
progress(len(new_posts))

if posts_limit and total_fetched >= posts_limit:
print(f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}")
return posts[:posts_limit]

#Move to the enxt page if available
next_cursor = data.get("cursor")
if not next_cursor:
print(f"All posts fetched. Total: {total_fetched}")
return posts

params["cursor"] = next_cursor
except requests.exceptions.RequestException as err:
print(f"Error fetching posts: {err}")
print(
"Response:", response.text if "response" in locals() else "No response"
)
return posts