diff --git a/mission_blue.py b/mission_blue.py index 99c0388..32c7d0c 100644 --- a/mission_blue.py +++ b/mission_blue.py @@ -10,7 +10,6 @@ import click import file import auth -import scraper # pylint: disable=C0301 lang_dict = { @@ -218,7 +217,6 @@ def generate_query_params( } - def search_posts(params, token): # pylint: disable=E1102 # pylint: disable=C0301 @@ -448,7 +446,7 @@ def main( # Fetch posts print("Fetching posts...") - raw_posts = scraper.search_posts(query_param, access_token) + raw_posts = search_posts(query_param, access_token) # Extract post data print("Extracting post data...") diff --git a/scraper.py b/scraper.py deleted file mode 100644 index 7dcfe1e..0000000 --- a/scraper.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -This module contains a function to search for posts using the BlueSky API. -""" -import requests -from alive_progress import alive_bar - -def search_posts(params, token): - # pylint: disable=E1102 - # pylint: disable=C0301 - - """ - Search for posts using the BlueSky API. - - Args: - params (dict): The query parameters for the API request. - - query (str, required): The search term for the BlueSky posts. - - sort (str, optional): The sorting criteria for results. - Options include "top" for top posts or "latest" for the latest posts. - - since (str, optional): The start date for posts (ISO 8601 format). - - until (str, optional): The end date for posts (ISO 8601 format). - - mentions (str, optional): Mentions to filter posts by. - - Handles will be resolved to DIDs using the provided token. - - author (str, optional): The author of the posts (handle or DID). - - lang (str, optional): The language of the posts. - - domain (str, optional): A domain URL included in the posts. - - url (str, optional): A specific URL included in the posts. - - tags (list, optional): Tags to filter posts by (each tag <= 640 characters). - - limit (int, optional): The maximum number of posts to retrieve in a single response. - Defaults to 25. - - cursor (str, optional): Pagination token for continuing from a previous request. - - posts_limit (int, optional): The maximum number of posts to retrieve across all responses. - Defaults to 500. - - Returns: - list: A list of posts matching the search criteria. - - Notes: - - Progress is displayed using a progress bar indicating the number of posts fetched. - - Handles pagination automatically until `posts_limit` is reached or no further results are available. - - Logs and returns partial results if an error occurs during fetching. - """ - posts = [] - url = "https://bsky.social/xrpc/app.bsky.feed.searchPosts" - headers = { - "Authorization": f"Bearer {token}", - "Content-Type": "application/json", - } - - total_fetched = 0 - posts_limit = params.get("posts_limit", 1000) - - with alive_bar(posts_limit) as progress: - while True: - try: - response = requests.get(url, headers=headers, params=params, timeout=10) - # print(response) - response.raise_for_status() - data = response.json() - - #Check if we have reached our overall posts limit - new_posts = data.get("posts", []) - posts.extend(new_posts) - total_fetched += len(new_posts) - - #Update progress bar - progress(len(new_posts)) - - if posts_limit and total_fetched >= posts_limit: - print(f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}") - return posts[:posts_limit] - - #Move to the enxt page if available - next_cursor = data.get("cursor") - if not next_cursor: - print(f"All posts fetched. Total: {total_fetched}") - return posts - - params["cursor"] = next_cursor - except requests.exceptions.RequestException as err: - print(f"Error fetching posts: {err}") - print( - "Response:", response.text if "response" in locals() else "No response" - ) - return posts \ No newline at end of file