diff --git a/.gitignore b/.gitignore index 01a5eb9..b36d575 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__/ dist/ .env .vscode/ -.DS_Store \ No newline at end of file +.DS_Store +*.json \ No newline at end of file diff --git a/README.md b/README.md index 66e3c9f..84eac8d 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ This library provides Python interfaces for interacting with Substack's unoffici - Get user profile information and subscriptions - Fetch post content and metadata - Search for posts within newsletters +- Access paywalled content **that you have written or paid for** with user-provided authentication ## Installation @@ -65,6 +66,55 @@ metadata = post.get_metadata() content = post.get_content() ``` +### Accessing Paywalled Content with Authentication + +To access paywalled content, you need to provide your own session cookies from a logged-in Substack session: + +```python +from substack_api import Newsletter, Post, SubstackAuth + +# Set up authentication with your cookies +auth = SubstackAuth(cookies_path="path/to/your/cookies.json") + +# Use authentication with newsletters +newsletter = Newsletter("https://example.substack.com", auth=auth) +posts = newsletter.get_posts(limit=5) # Can now access paywalled posts + +# Use authentication with individual posts +post = Post("https://example.substack.com/p/paywalled-post", auth=auth) +content = post.get_content() # Can now access paywalled content + +# Check if a post is paywalled +if post.is_paywalled(): + print("This post requires a subscription") +``` + +#### Getting Your Cookies + +To access paywalled content, you need to export your browser cookies from a logged-in Substack session. 
The cookies should be in JSON format with the following structure: + +```json +[ + { + "name": "substack.sid", + "value": "your_session_id", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + { + "name": "substack.lli", + "value": "your_lli_value", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + ... +] +``` + +**Important**: Only use your own cookies from your own authenticated session. **This feature is intended for users to access their own subscribed or authored content programmatically.** + ### Working with Users ```python @@ -88,8 +138,9 @@ subscriptions = user.get_subscriptions() - This is an unofficial library and not endorsed by Substack - APIs may change without notice, potentially breaking functionality -- Some features may only work for public content - Rate limiting may be enforced by Substack +- **Authentication requires users to provide their own session cookies** +- **Users are responsible for complying with Substack's terms of service when using authentication features** ## Development diff --git a/docs/api-reference/auth.md b/docs/api-reference/auth.md new file mode 100644 index 0000000..099b2ec --- /dev/null +++ b/docs/api-reference/auth.md @@ -0,0 +1,163 @@ +# SubstackAuth + +The `SubstackAuth` class handles authentication for accessing paywalled Substack content. + +## Class Definition + +```python +SubstackAuth(cookies_path: str) +``` + +### Parameters + +- `cookies_path` (str): Path to the JSON file containing session cookies + +## Properties + +### `authenticated` (bool) +Whether the authentication was successful and cookies were loaded. + +### `cookies_path` (str) +Path to the cookies file. + +### `session` (requests.Session) +The authenticated requests session object. + +## Methods + +### `load_cookies() -> bool` + +Load cookies from the specified file. 
+ +#### Returns + +- `bool`: True if cookies were loaded successfully, False otherwise + +### `get(url: str, **kwargs) -> requests.Response` + +Make an authenticated GET request. + +#### Parameters + +- `url` (str): The URL to request +- `**kwargs`: Additional arguments passed to requests.get + +#### Returns + +- `requests.Response`: The response object + +### `post(url: str, **kwargs) -> requests.Response` + +Make an authenticated POST request. + +#### Parameters + +- `url` (str): The URL to request +- `**kwargs`: Additional arguments passed to requests.post + +#### Returns + +- `requests.Response`: The response object + +## Example Usage + +### Basic Authentication Setup + +```python +from substack_api import SubstackAuth + +# Initialize with cookies file +auth = SubstackAuth(cookies_path="my_cookies.json") + +# Check if authentication succeeded +if auth.authenticated: + print("Successfully authenticated!") +else: + print("Authentication failed") +``` + +### Using with Newsletter and Post Classes + +```python +from substack_api import Newsletter, Post, SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Use with Newsletter +newsletter = Newsletter("https://example.substack.com", auth=auth) +posts = newsletter.get_posts(limit=5) + +# Use with Post +post = Post("https://example.substack.com/p/paywalled-post", auth=auth) +content = post.get_content() +``` + +### Manual Authenticated Requests + +```python +from substack_api import SubstackAuth + +auth = SubstackAuth(cookies_path="cookies.json") + +# Make authenticated GET request +response = auth.get("https://example.substack.com/api/v1/posts/123") +data = response.json() + +# Make authenticated POST request +response = auth.post( + "https://example.substack.com/api/v1/some-endpoint", + json={"key": "value"} +) +``` + +## Cookie File Format + +The cookies file should be in JSON format with the following structure: + +```json +[ + { + "name": "substack.sid", + "value": 
"your_session_id", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + { + "name": "substack.lli", + "value": "your_lli_value", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + ... +] +``` + +## Error Handling + +The `SubstackAuth` class handles several error conditions: + +- **File not found**: If the cookies file doesn't exist, `authenticated` will be `False` +- **Invalid JSON**: If the cookies file contains invalid JSON, `load_cookies()` returns `False` +- **Missing cookies**: If required cookies are missing, authentication may fail silently + +```python +from substack_api import SubstackAuth + +try: + auth = SubstackAuth(cookies_path="cookies.json") + if not auth.authenticated: + print("Authentication failed - check your cookies file") +except Exception as e: + print(f"Error setting up authentication: {e}") +``` + +## Security Notes + +- Keep your cookies file secure and private +- Don't commit cookies files to version control +- Only use your own session cookies +- Cookies may expire and need to be refreshed periodically +- Respect Substack's Terms of Service when using authentication diff --git a/docs/api-reference/index.md b/docs/api-reference/index.md index 8b86713..9d4bb43 100644 --- a/docs/api-reference/index.md +++ b/docs/api-reference/index.md @@ -8,6 +8,7 @@ This section provides detailed documentation for all modules and classes in the - [Newsletter](newsletter.md): Access to Substack publications, posts, and podcasts - [Post](post.md): Access to individual Substack post content and metadata - [Category](category.md): Discovery of newsletters by category +- [SubstackAuth](auth.md): Authentication for accessing paywalled content Each module documentation includes: diff --git a/docs/api-reference/newsletter.md b/docs/api-reference/newsletter.md index f93fd1c..7b456be 100644 --- a/docs/api-reference/newsletter.md +++ b/docs/api-reference/newsletter.md @@ -5,12 +5,13 @@ The `Newsletter` class provides access to Substack 
publications. ## Class Definition ```python -Newsletter(url: str) +Newsletter(url: str, auth: Optional[SubstackAuth] = None) ``` ### Parameters - `url` (str): The URL of the Substack newsletter +- `auth` (Optional[SubstackAuth]): Authentication handler for accessing paywalled content ## Methods @@ -85,7 +86,7 @@ Get authors of the newsletter. ## Example Usage ```python -from substack_api import Newsletter +from substack_api import Newsletter, SubstackAuth # Create a newsletter object newsletter = Newsletter("https://example.substack.com") @@ -117,4 +118,13 @@ for author in authors: recommendations = newsletter.get_recommendations() for rec in recommendations: print(f"Recommended: {rec.url}") + +# Use with authentication for paywalled content +auth = SubstackAuth(cookies_path="cookies.json") +authenticated_newsletter = Newsletter("https://example.substack.com", auth=auth) +paywalled_posts = authenticated_newsletter.get_posts(limit=5) +for post in paywalled_posts: + if post.is_paywalled(): + content = post.get_content() # Now accessible with auth + print(f"Paywalled content: {content[:100]}...") ``` diff --git a/docs/api-reference/post.md b/docs/api-reference/post.md index 0efcc38..2c3898e 100644 --- a/docs/api-reference/post.md +++ b/docs/api-reference/post.md @@ -5,12 +5,13 @@ The `Post` class provides access to individual Substack posts. ## Class Definition ```python -Post(url: str) +Post(url: str, auth: Optional[SubstackAuth] = None) ``` ### Parameters - `url` (str): The URL of the Substack post +- `auth` (Optional[SubstackAuth]): Authentication handler for accessing paywalled content ## Methods @@ -48,12 +49,20 @@ Get the HTML content of the post. #### Returns -- `Optional[str]`: HTML content of the post, or None if not available +- `Optional[str]`: HTML content of the post, or None if not available (e.g., for paywalled content without authentication) + +### `is_paywalled() -> bool` + +Check if the post is paywalled. 
+ +#### Returns + +- `bool`: True if the post requires a subscription to access full content ## Example Usage ```python -from substack_api import Post +from substack_api import Post, SubstackAuth # Create a post object post = Post("https://example.substack.com/p/post-slug") @@ -63,11 +72,22 @@ metadata = post.get_metadata() print(f"Title: {metadata['title']}") print(f"Published: {metadata['post_date']}") -# Get post content -content = post.get_content() +# Check if the post is paywalled +if post.is_paywalled(): + print("This post is paywalled") + + # Set up authentication to access paywalled content + auth = SubstackAuth(cookies_path="cookies.json") + authenticated_post = Post("https://example.substack.com/p/post-slug", auth=auth) + content = authenticated_post.get_content() +else: + # Public content - no authentication needed + content = post.get_content() + print(f"Content length: {len(content) if content else 0}") -# Check if the post is paywalled -is_paywalled = metadata.get("audience") == "only_paid" -print(f"Paywalled: {is_paywalled}") +# Alternative: Create post with authentication from the start +auth = SubstackAuth(cookies_path="cookies.json") +authenticated_post = Post("https://example.substack.com/p/paywalled-post", auth=auth) +content = authenticated_post.get_content() # Works for both public and paywalled content ``` diff --git a/docs/authentication.md b/docs/authentication.md new file mode 100644 index 0000000..d3d7051 --- /dev/null +++ b/docs/authentication.md @@ -0,0 +1,172 @@ +# Authentication + +The Substack API library supports authentication to access paywalled content. This feature allows users to access their own subscribed content programmatically by providing their session cookies from a logged-in Substack session. + +## Overview + +Authentication in the Substack API library works by: + +1. Loading session cookies from a JSON file +2. Using those cookies to make authenticated requests +3. 
Automatically handling authentication for both `Newsletter` and `Post` objects + +## Setting Up Authentication + +### 1. Export Your Cookies + +To use authentication, you need to export your browser cookies from a logged-in Substack session. The cookies should be saved in JSON format with this structure: + +```json +[ + { + "name": "substack.sid", + "value": "your_session_id_here", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + { + "name": "substack.lli", + "value": "your_lli_value_here", + "domain": ".substack.com", + "path": "/", + "secure": true + }, + ... +] +``` + +### 2. Create Authentication Object + +```python +from substack_api import SubstackAuth + +# Initialize with path to your cookies file +auth = SubstackAuth(cookies_path="path/to/your/cookies.json") + +# Check if authentication was successful +if auth.authenticated: + print("Authentication successful!") +else: + print("Authentication failed - check your cookies file") +``` + +## Using Authentication + +### With Newsletter Objects + +```python +from substack_api import Newsletter, SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Create authenticated newsletter +newsletter = Newsletter("https://example.substack.com", auth=auth) + +# All posts retrieved will use authentication +posts = newsletter.get_posts(limit=10) + +# Access paywalled content +for post in posts: + if post.is_paywalled(): + content = post.get_content() # Now accessible with auth + print(f"Paywalled content: {content[:100]}...") +``` + +### With Post Objects + +```python +from substack_api import Post, SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Create authenticated post +post = Post("https://example.substack.com/p/paywalled-post", auth=auth) + +# Check if post is paywalled +if post.is_paywalled(): + print("This post is paywalled") + +# Get content (will use authentication if needed) +content = post.get_content() +``` + 
+### Checking Paywall Status + +```python +from substack_api import Post + +# Create post object (no auth needed to check paywall status) +post = Post("https://example.substack.com/p/some-post") + +# Check if authentication is required +if post.is_paywalled(): + print("This post requires authentication to access full content") + + # Now add authentication to access content + from substack_api import SubstackAuth + auth = SubstackAuth(cookies_path="cookies.json") + authenticated_post = Post(post.url, auth=auth) + content = authenticated_post.get_content() +else: + # Public content - no authentication needed + content = post.get_content() +``` + +## How to Get Your Cookies + +### Method 1: Browser Developer Tools + +1. Log in to Substack in your browser +2. Open Developer Tools (F12 or right-click → Inspect) +3. Go to the Application/Storage tab +4. Find "Cookies" → "https://substack.com" +5. Export their values to a JSON file in the required format + +### Method 2: Browser Extensions + +You can use browser extensions that export cookies to JSON format. 
Make sure to: +- Only export cookies for the `.substack.com` domain +- Save in the JSON format shown above + +## Security Considerations + +**Important Security Notes:** + +- **Only use your own cookies** from your own authenticated session +- **Keep your cookies file secure** - treat it like a password +- **Don't share your cookies** with others or commit them to version control +- **Cookies may expire** - you may need to refresh them periodically +- **Respect Substack's Terms of Service** when using authentication features + +## Error Handling + +```python +from substack_api import SubstackAuth, Post + +try: + # Attempt to load authentication + auth = SubstackAuth(cookies_path="cookies.json") + + if not auth.authenticated: + print("Warning: Authentication failed, using public access only") + auth = None + + # Use authentication if available + post = Post("https://example.substack.com/p/some-post", auth=auth) + content = post.get_content() + + if content is None and post.is_paywalled(): + print("This content is paywalled and requires authentication") + +except FileNotFoundError: + print("Cookies file not found - using public access only") + post = Post("https://example.substack.com/p/some-post") + content = post.get_content() +``` + +## API Reference + +See the [SubstackAuth API documentation](api-reference/auth.md) for detailed information about the authentication class and its methods. 
diff --git a/docs/index.md b/docs/index.md index 5dc29c0..396d9be 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,11 +10,12 @@ The Substack API library provides a simple interface to interact with Substack n - Retrieve user profiles and subscriptions - Access post content and metadata - Discover newsletters by category +- Access paywalled content **that you have access to** with user-provided authentication ## Quick Start ```python -from substack_api import Newsletter, User, Post, Category +from substack_api import Newsletter, User, Post, Category, SubstackAuth # Get information about a newsletter newsletter = Newsletter("https://example.substack.com") @@ -31,6 +32,11 @@ content = post.get_content() # Browse newsletters by category tech_category = Category(name="Technology") tech_newsletters = tech_category.get_newsletters() + +# Access paywalled content with authentication +auth = SubstackAuth(cookies_path="cookies.json") +authenticated_post = Post("https://example.substack.com/p/paywalled-post", auth=auth) +paywalled_content = authenticated_post.get_content() ``` ## Features @@ -39,6 +45,7 @@ tech_newsletters = tech_category.get_newsletters() - Comprehensive access to Substack data - Pagination support for large collections - Automatic caching to minimize API calls +- Authentication support for accessing paywalled content ## Important Note diff --git a/docs/user-guide.md b/docs/user-guide.md index 29e1b71..9a4212d 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -2,12 +2,13 @@ ## Basic Concepts -The Substack API library is organized around four main classes: +The Substack API library is organized around five main classes: - `User` - Represents a Substack user profile - `Newsletter` - Represents a Substack publication - `Post` - Represents an individual post on Substack - `Category` - Represents a Substack category of newsletters +- `SubstackAuth` - Handles authentication for accessing paywalled content Each class provides methods to access 
different aspects of the Substack ecosystem. @@ -37,6 +38,29 @@ authors = newsletter.get_authors() recommendations = newsletter.get_recommendations() ``` +### Accessing Paywalled Newsletter Content + +To access paywalled posts from a newsletter, provide authentication: + +```python +from substack_api import Newsletter, SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Create authenticated newsletter +newsletter = Newsletter("https://example.substack.com", auth=auth) + +# All retrieved posts will use authentication +posts = newsletter.get_posts(limit=10) + +# Access content from paywalled posts +for post in posts: + if post.is_paywalled(): + content = post.get_content() # Now accessible with auth + print(f"Paywalled content: {content[:100]}...") +``` + ## Working with Users The `User` class allows you to access information about Substack users: @@ -73,6 +97,27 @@ content = post.get_content() # Get post metadata metadata = post.get_metadata() + +# Check if post is paywalled +if post.is_paywalled(): + print("This post requires a subscription") +``` + +### Accessing Paywalled Content + +To access paywalled content, you need to provide authentication: + +```python +from substack_api import Post, SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Create authenticated post +post = Post("https://example.substack.com/p/paywalled-post", auth=auth) + +# Now you can access paywalled content +content = post.get_content() ``` ## Working with Categories @@ -98,6 +143,21 @@ newsletters = category.get_newsletters() newsletter_metadata = category.get_newsletter_metadata() ``` +## Authentication + +The library supports authentication to access paywalled content. See the [Authentication Guide](authentication.md) for detailed information on setting up and using authentication. 
+ +```python +from substack_api import SubstackAuth + +# Set up authentication +auth = SubstackAuth(cookies_path="cookies.json") + +# Use with any class that supports authentication +newsletter = Newsletter("https://example.substack.com", auth=auth) +post = Post("https://example.substack.com/p/paywalled-post", auth=auth) +``` + ## Caching Behavior By default, the library caches API responses to minimize the number of requests. You can force a refresh of the data by passing `force_refresh=True` to relevant methods: diff --git a/examples/usage_walkthrough.ipynb b/examples/usage_walkthrough.ipynb index 71c7332..17a2271 100644 --- a/examples/usage_walkthrough.ipynb +++ b/examples/usage_walkthrough.ipynb @@ -1,5 +1,84 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Setting up authentication (uncomment and provide your own cookies file)\n", + "# from substack_api import SubstackAuth\n", + "# auth = SubstackAuth(cookies_path=\"cookies.json\")\n", + "# print(f\"Authentication successful: {auth.authenticated}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Using authentication with newsletter to access paywalled posts\n", + "# from substack_api import Newsletter\n", + "# authenticated_newsletter = Newsletter(\"https://example.substack.com\", auth=auth)\n", + "# paywalled_posts = authenticated_newsletter.get_posts(limit=5)\n", + "# for post in paywalled_posts:\n", + "# if post.is_paywalled():\n", + "# content = post.get_content() # Now accessible with auth\n", + "# print(f\"Paywalled content length: {len(content) if content else 0}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Using authentication with individual posts\n", + "# from substack_api import Post\n", + "# post_url = 
\"https://example.substack.com/p/some-paywalled-post\"\n", + "# \n", + "# # Check if post is paywalled without auth\n", + "# post = Post(post_url)\n", + "# print(f\"Is paywalled: {post.is_paywalled()}\")\n", + "# \n", + "# # Access with authentication\n", + "# authenticated_post = Post(post_url, auth=auth)\n", + "# content = authenticated_post.get_content()\n", + "# print(f\"Content accessible: {content is not None}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cookie File Format\n", + "\n", + "Your cookies.json file should contain your session cookies in this format:\n", + "\n", + "```json\n", + "[\n", + " {\n", + " \"name\": \"substack.sid\",\n", + " \"value\": \"your_session_id\",\n", + " \"domain\": \".substack.com\",\n", + " \"path\": \"/\",\n", + " \"secure\": true\n", + " },\n", + " {\n", + " \"name\": \"substack.lli\",\n", + " \"value\": \"your_lli_value\", \n", + " \"domain\": \".substack.com\",\n", + " \"path\": \"/\",\n", + " \"secure\": true\n", + " }\n", + " ...\n", + "]\n", + "```\n", + "\n", + "**Security Note**: Only use your own cookies from your own authenticated session. Keep the cookies file secure and don't share it." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -4333,6 +4412,15 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Authentication for Paywalled Content\n", + "\n", + "The library supports authentication to access paywalled content using your own session cookies." 
class SubstackAuth:
    """Cookie-based authentication handler for Substack API requests.

    Wraps a ``requests.Session`` that carries the user's own Substack
    session cookies, loaded from a JSON file. ``authenticated`` is True
    only when that file existed and its cookies were loaded successfully.
    """

    def __init__(
        self,
        cookies_path: str,
    ) -> None:
        """
        Initialize authentication handler.

        Parameters
        ----------
        cookies_path : str
            Path to the JSON file to read session cookies from
        """
        self.cookies_path = cookies_path
        self.session = requests.Session()
        self.authenticated = False

        # Set default headers
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
                "Accept": "application/json",
                "Content-Type": "application/json",
            }
        )

        # Try to load existing cookies. The flag mirrors the actual outcome
        # of the load, so an empty or malformed file is not reported as an
        # authenticated session.
        if os.path.exists(self.cookies_path):
            self.authenticated = self.load_cookies()
        else:
            print(f"Cookies file not found at {self.cookies_path}. Please log in.")
            self.session.cookies.clear()

    def load_cookies(self) -> bool:
        """
        Load cookies from ``self.cookies_path`` into the session.

        The file must contain a JSON list of objects, each with at least
        ``name`` and ``value`` keys; ``domain``, ``path`` and ``secure`` are
        optional. Every entry is validated before any cookie is set, so a
        malformed entry never leaves the session partially populated.

        Returns
        -------
        bool
            True if cookies loaded successfully, False if the file could not
            be read, was not valid JSON, or had an unexpected structure
        """
        try:
            with open(self.cookies_path, "r", encoding="utf-8") as f:
                raw_cookies = json.load(f)

            if not isinstance(raw_cookies, list):
                raise TypeError("expected a JSON list of cookie objects")

            # Validate first, apply afterwards (all-or-nothing).
            parsed = [
                (
                    cookie["name"],
                    cookie["value"],
                    {
                        # requests' cookie factory needs a string domain;
                        # passing None for a missing key makes it fail.
                        "domain": cookie.get("domain") or "",
                        "path": cookie.get("path", "/"),
                        "secure": cookie.get("secure", False),
                    },
                )
                for cookie in raw_cookies
            ]

            for name, value, attrs in parsed:
                self.session.cookies.set(name, value, **attrs)

        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Failed to load cookies: {str(e)}")
            return False

        return True

    def get(self, url: str, **kwargs) -> requests.Response:
        """
        Make authenticated GET request.

        Parameters
        ----------
        url : str
            URL to request
        **kwargs
            Additional arguments passed through to ``requests.Session.get``

        Returns
        -------
        requests.Response
            Response object
        """
        return self.session.get(url, **kwargs)

    def post(self, url: str, **kwargs) -> requests.Response:
        """
        Make authenticated POST request.

        Parameters
        ----------
        url : str
            URL to request
        **kwargs
            Additional arguments passed through to ``requests.Session.post``

        Returns
        -------
        requests.Response
            Response object
        """
        return self.session.post(url, **kwargs)
+ + Parameters + ---------- + endpoint : str + The API endpoint to request + **kwargs : Any + Additional parameters for the request + + Returns + ------- + requests.Response + The response object from the request + """ + if self.auth and self.auth.authenticated: + return self.auth.get(endpoint, **kwargs) + else: + return requests.get(endpoint, headers=HEADERS, **kwargs) + def _fetch_paginated_posts( self, params: Dict[str, str], limit: Optional[int] = None, page_size: int = 15 ) -> List[Dict[str, Any]]: @@ -65,7 +91,7 @@ def _fetch_paginated_posts( endpoint = f"{self.url}/api/v1/archive?{query_string}" # Make the request - response = requests.get(endpoint, headers=HEADERS, timeout=30) + response = self._make_request(endpoint, timeout=30) if response.status_code != 200: break @@ -115,7 +141,7 @@ def get_posts(self, sorting: str = "new", limit: Optional[int] = None) -> List: params = {"sort": sorting} post_data = self._fetch_paginated_posts(params, limit) - return [Post(item["canonical_url"]) for item in post_data] + return [Post(item["canonical_url"], auth=self.auth) for item in post_data] def search_posts(self, query: str, limit: Optional[int] = None) -> List: """ @@ -137,7 +163,7 @@ def search_posts(self, query: str, limit: Optional[int] = None) -> List: params = {"sort": "new", "search": query} post_data = self._fetch_paginated_posts(params, limit) - return [Post(item["canonical_url"]) for item in post_data] + return [Post(item["canonical_url"], auth=self.auth) for item in post_data] def get_podcasts(self, limit: Optional[int] = None) -> List: """ @@ -157,7 +183,7 @@ def get_podcasts(self, limit: Optional[int] = None) -> List: params = {"sort": "new", "type": "podcast"} post_data = self._fetch_paginated_posts(params, limit) - return [Post(item["canonical_url"]) for item in post_data] + return [Post(item["canonical_url"], auth=self.auth) for item in post_data] def get_recommendations(self) -> List["Newsletter"]: """ @@ -177,7 +203,7 @@ def 
get_recommendations(self) -> List["Newsletter"]: # Now get the recommendations endpoint = f"{self.url}/api/v1/recommendations/from/{publication_id}" - response = requests.get(endpoint, headers=HEADERS, timeout=30) + response = self._make_request(endpoint, timeout=30) if response.status_code != 200: return [] @@ -199,7 +225,9 @@ def get_recommendations(self) -> List["Newsletter"]: # Avoid circular import from .newsletter import Newsletter - result = [Newsletter(url) for url in recommended_newsletter_urls] + result = [ + Newsletter(url, auth=self.auth) for url in recommended_newsletter_urls + ] return result @@ -214,11 +242,8 @@ def get_authors(self) -> List: """ from .user import User # Import here to avoid circular import - r = requests.get( - f"{self.url}/api/v1/publication/users/ranked?public=true", - headers=HEADERS, - timeout=30, - ) + endpoint = f"{self.url}/api/v1/publication/users/ranked?public=true" + r = self._make_request(endpoint, timeout=30) r.raise_for_status() authors = r.json() return [User(author["handle"]) for author in authors] diff --git a/substack_api/post.py b/substack_api/post.py index 8758585..ee931bc 100644 --- a/substack_api/post.py +++ b/substack_api/post.py @@ -3,6 +3,8 @@ import requests +from substack_api.auth import SubstackAuth + HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36" } @@ -13,7 +15,7 @@ class Post: A class to represent a Substack post. """ - def __init__(self, url: str) -> None: + def __init__(self, url: str, auth: Optional[SubstackAuth] = None) -> None: """ Initialize a Post object. 
@@ -21,8 +23,11 @@ def __init__(self, url: str) -> None: ---------- url : str The URL of the Substack post + auth : Optional[SubstackAuth] + Authentication handler for accessing paywalled content """ self.url = url + self.auth = auth parsed_url = urlparse(url) self.base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" path_parts = parsed_url.path.strip("/").split("/") @@ -55,7 +60,11 @@ def _fetch_post_data(self, force_refresh: bool = False) -> Dict[str, Any]: if self._post_data is not None and not force_refresh: return self._post_data - r = requests.get(self.endpoint, headers=HEADERS, timeout=30) + # Use authenticated session if available + if self.auth and self.auth.authenticated: + r = self.auth.get(self.endpoint, timeout=30) + else: + r = requests.get(self.endpoint, headers=HEADERS, timeout=30) r.raise_for_status() self._post_data = r.json() @@ -92,4 +101,24 @@ def get_content(self, force_refresh: bool = False) -> Optional[str]: HTML content of the post, or None if not available """ data = self._fetch_post_data(force_refresh=force_refresh) - return data.get("body_html") + content = data.get("body_html") + + # Check if content is paywalled and we don't have auth + if not content and data.get("audience") == "only_paid" and not self.auth: + print( + "Warning: This post is paywalled. Provide authentication to access full content." + ) + + return content + + def is_paywalled(self) -> bool: + """ + Check if the post is paywalled. 
+ + Returns + ------- + bool + True if post is paywalled + """ + data = self._fetch_post_data() + return data.get("audience") == "only_paid" diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 0000000..fde9dbd --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,237 @@ +import json +import os +import tempfile +from unittest.mock import Mock, patch + +import pytest +import requests + +from substack_api.auth import SubstackAuth + + +@pytest.fixture +def temp_cookies_file(): + """Create a temporary file for cookies storage.""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_path = f.name + yield temp_path + # Cleanup + if os.path.exists(temp_path): + os.remove(temp_path) + + +@pytest.fixture +def mock_cookies(): + """Mock cookies data.""" + return { + "substack.sid": { + "value": "test_session_id", + "domain": ".substack.com", + "path": "/", + "secure": True, + "expires": None, + }, + "substack.lli": { + "value": "test_lli_value", + "domain": ".substack.com", + "path": "/", + "secure": True, + "expires": None, + }, + } + + +@pytest.fixture +def mock_selenium_cookies(): + """Mock cookies returned by Selenium.""" + return [ + { + "name": "substack.sid", + "value": "test_session_id", + "domain": ".substack.com", + "path": "/", + "secure": True, + }, + { + "name": "substack.lli", + "value": "test_lli_value", + "domain": ".substack.com", + "path": "/", + "secure": True, + }, + ] + + +class TestSubstackAuth: + """Test cases for SubstackAuth class.""" + + def test_init_without_credentials(self, temp_cookies_file): + """Test initialization without credentials.""" + auth = SubstackAuth(cookies_path=temp_cookies_file) + + assert auth.cookies_path == temp_cookies_file + assert auth.authenticated + assert isinstance(auth.session, requests.Session) + + def test_init_with_existing_cookies(self, temp_cookies_file, mock_cookies): + """Test initialization with existing cookies file.""" + # Write cookies to file + with 
open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + + with patch.object(SubstackAuth, "load_cookies") as mock_load: + _ = SubstackAuth(cookies_path=temp_cookies_file) + mock_load.assert_called_once() + + def test_load_cookies_file_not_found(self, temp_cookies_file): + """Test loading cookies when file doesn't exist.""" + auth = SubstackAuth(cookies_path=temp_cookies_file + ".nonexistent") + result = auth.load_cookies() + + assert result is False + assert not auth.authenticated + + def test_get_request(self, temp_cookies_file, mock_cookies): + """Test authenticated GET request.""" + # Write cookies to file + with open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + auth = SubstackAuth(cookies_path=temp_cookies_file) + auth.authenticated = True + + mock_response = Mock() + + with patch.object(auth.session, "get", return_value=mock_response) as mock_get: + result = auth.get("https://example.com/api", timeout=30) + + assert result == mock_response + mock_get.assert_called_once_with("https://example.com/api", timeout=30) + + def test_post_request(self, temp_cookies_file, mock_cookies): + """Test authenticated POST request.""" + # Write cookies to file + with open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + auth = SubstackAuth(cookies_path=temp_cookies_file) + auth.authenticated = True + + mock_response = Mock() + data = {"key": "value"} + + with patch.object( + auth.session, "post", return_value=mock_response + ) as mock_post: + result = auth.post("https://example.com/api", json=data) + + assert result == mock_response + mock_post.assert_called_once_with("https://example.com/api", json=data) + + def test_session_headers(self, temp_cookies_file, mock_cookies): + """Test that session has proper default headers.""" + # Write cookies to file + with open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + auth = SubstackAuth(cookies_path=temp_cookies_file) + + assert "User-Agent" in auth.session.headers + assert 
auth.session.headers["Accept"] == "application/json" + assert auth.session.headers["Content-Type"] == "application/json" + + +# Integration tests with Post and Newsletter classes +class TestAuthIntegration: + """Test authentication integration with Post and Newsletter classes.""" + + @patch("substack_api.post.requests.get") + def test_post_without_auth(self, mock_get): + """Test Post class without authentication uses regular requests.""" + from substack_api.post import Post + + mock_response = Mock() + mock_response.json.return_value = { + "id": 123, + "body_html": None, + "audience": "only_paid", + } + mock_get.return_value = mock_response + + post = Post("https://test.substack.com/p/test-post") + content = post.get_content() + + # Should use regular requests.get + mock_get.assert_called_once() + assert content is None + + def test_post_with_auth(self, temp_cookies_file, mock_cookies): + """Test Post class with authentication uses auth session.""" + # Write cookies to file + with open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + auth = SubstackAuth(cookies_path=temp_cookies_file) + + from substack_api.post import Post + + auth.authenticated = True + + mock_response = Mock() + mock_response.json.return_value = { + "id": 123, + "body_html": "

Paywalled content

", + "audience": "only_paid", + } + + with patch.object(auth, "get", return_value=mock_response) as mock_auth_get: + post = Post("https://test.substack.com/p/test-post", auth=auth) + content = post.get_content() + + # Should use auth.get instead of requests.get + mock_auth_get.assert_called_once() + assert content == "

Paywalled content

" + + def test_post_is_paywalled(self): + """Test is_paywalled method.""" + from substack_api.post import Post + + post = Post("https://test.substack.com/p/test-post") + + # Mock paywalled post + with patch.object( + post, "_fetch_post_data", return_value={"audience": "only_paid"} + ): + assert post.is_paywalled() is True + + # Mock public post + with patch.object( + post, "_fetch_post_data", return_value={"audience": "everyone"} + ): + assert post.is_paywalled() is False + + def test_newsletter_with_auth_passes_to_posts( + self, temp_cookies_file, mock_cookies + ): + """Test Newsletter passes auth to Post objects.""" + from substack_api.newsletter import Newsletter + from substack_api.post import Post + + # Write cookies to file + with open(temp_cookies_file, "w") as f: + json.dump(mock_cookies, f) + auth = SubstackAuth(cookies_path=temp_cookies_file) + auth.authenticated = True + + newsletter = Newsletter("https://test.substack.com", auth=auth) + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + {"canonical_url": "https://test.substack.com/p/post1"}, + {"canonical_url": "https://test.substack.com/p/post2"}, + ] + + with patch.object(newsletter, "_make_request", return_value=mock_response): + posts = newsletter.get_posts(limit=2) + + # Verify auth was passed to Post objects + assert len(posts) == 2 + assert all(isinstance(p, Post) for p in posts) + assert all(p.auth == auth for p in posts) diff --git a/uv.lock b/uv.lock index e57c970..d16a01e 100644 --- a/uv.lock +++ b/uv.lock @@ -43,11 +43,11 @@ wheels = [ [[package]] name = "certifi" -version = "2025.1.31" +version = "2025.4.26" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577 } +sdist = { url = 
"https://files.pythonhosted.org/packages/e8/9e/c05b3920a3b7d20d3d3310465f50348e5b3694f4f88c6daf736eef3024c4/certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6", size = 160705 } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393 }, + { url = "https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618 }, ] [[package]] @@ -909,7 +909,7 @@ wheels = [ [[package]] name = "substack-api" -version = "0.1.0" +version = "1.0.2" source = { virtual = "." } dependencies = [ { name = "requests" }, @@ -969,11 +969,11 @@ wheels = [ [[package]] name = "urllib3" -version = "2.3.0" +version = "2.4.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } +sdist = { url = "https://files.pythonhosted.org/packages/8a/78/16493d9c386d8e60e442a35feac5e00f0913c0f4b7c217c11e8ec2ff53e0/urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466", size = 390672 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, + { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = 
"sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 }, ] [[package]]