66 changes: 66 additions & 0 deletions .cursorrules
@@ -0,0 +1,66 @@
// PhaseFeed AI Assistant Rules
// These rules guide Cursor AI's behavior when working with this podcast monitoring system

// Project Context
This is a local podcast monitoring and transcription system that handles RSS feed monitoring,
audio downloads, transcription (via OpenAI Whisper or mlx-whisper), and content summarization
(via OpenAI GPT-4 or Ollama). The system uses SQLite for storage and provides a web interface.

// Core Technologies
- Python 3.x for backend processing
- Flask/FastAPI for web server
- SQLite for data storage
- OpenAI APIs and local ML models
- FFmpeg for audio processing

// Code Generation Rules
1. Always include type hints in Python functions
2. Use async/await patterns for I/O operations
3. Implement proper error handling for network and API calls
4. Follow SQLAlchemy best practices for database operations
5. Include comprehensive docstrings for all functions and classes
6. Use dependency injection patterns for better testability
7. Implement proper logging for monitoring and debugging

// Architecture Guidelines
1. Maintain separation between feed monitoring, transcription, and summarization services
2. Use repository pattern for database operations
3. Implement proper background task handling
4. Follow REST API best practices for web endpoints
5. Use environment variables for configuration
6. Implement proper caching strategies

// Security Considerations
1. Never hardcode API keys or credentials
2. Sanitize all user inputs
3. Implement proper file handling security
4. Use secure methods for storing sensitive data
5. Validate RSS feed sources

// Testing Requirements
1. Write unit tests for core business logic
2. Include integration tests for API endpoints
3. Mock external services in tests
4. Test error handling scenarios
5. Include performance testing for long-running operations

// Documentation
1. Include clear function and method documentation
2. Document API endpoints with OpenAPI/Swagger
3. Provide clear setup instructions
4. Document configuration options
5. Include troubleshooting guides

// Performance Guidelines
1. Implement proper database indexing
2. Use connection pooling
3. Implement caching where appropriate
4. Handle large audio files efficiently
5. Optimize database queries

// Maintenance
1. Include proper cleanup of old files
2. Implement monitoring and health checks
3. Handle database migrations properly
4. Include proper logging for debugging
5. Implement proper error reporting
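
Taken together, these rules describe a fairly specific house style: typed, async, defensively error-handled, documented, and logged. A minimal sketch of a function written to that style follows — the `fetch_feed_xml` name and the use of `aiohttp` are illustrative assumptions, not code from this PR:

```python
import asyncio
import logging
from typing import Optional

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_feed_xml(feed_url: str, timeout_seconds: int = 30) -> Optional[str]:
    """Fetch the raw XML for a podcast RSS feed.

    Args:
        feed_url: The RSS feed URL to fetch.
        timeout_seconds: How long to wait before giving up.

    Returns:
        The response body as text, or None if the request failed.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=timeout_seconds)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(feed_url) as response:
                response.raise_for_status()
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Log and return None so one bad feed does not stop the monitoring loop.
        logger.error("Failed to fetch feed %s: %s", feed_url, exc)
        return None
```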
16 changes: 11 additions & 5 deletions README.md
@@ -1,8 +1,8 @@
# PhaseFeed

A local podcast monitoring and transcription system that:
- Monitors RSS feeds for new podcast episodes
- Downloads new episodes automatically
A local podcast and YouTube monitoring system that:
- Monitors RSS feeds for new podcast episodes and YouTube channels for new videos
- Downloads new episodes and YouTube videos automatically
- Transcribes audio using either OpenAI Whisper API or local mlx-whisper
- Summarizes content using either OpenAI GPT-4 or local LLMs via Ollama
- Stores metadata in SQLite
@@ -49,12 +49,18 @@ cp .env.example .env
The application can be configured through `config.py`. Key settings include:

### Storage Configuration
- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts (default: `~/Podcasts`)
- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts and YouTube audio (default: `~/Podcasts`)
- `TRANSCRIPT_STORAGE_PATH`: Where to store transcripts (default: `~/Podcasts/Transcripts`)

### Feed Configuration
- `PODCAST_FEEDS`: List of RSS feed URLs to monitor
- `MAX_EPISODES_PER_FEED`: Maximum number of episodes to pull from each feed (default: 5)
- `YOUTUBE_CHANNELS`: List of YouTube channel URLs to monitor
- `MAX_EPISODES_PER_FEED`: Maximum number of episodes/videos to pull from each feed (default: 5)

### YouTube Configuration
- `YOUTUBE_AUDIO_QUALITY`: Audio quality for downloaded videos in kbps (default: 192)
- `YOUTUBE_MAX_RETRIES`: Number of retry attempts for failed downloads
- `YOUTUBE_TIMEOUT`: Download timeout in seconds

### Transcription Configuration
- `TRANSCRIPTION_MODE`: Choose between "local" or "openai"
12 changes: 11 additions & 1 deletion config.py
@@ -13,6 +13,11 @@
"https://lexfridman.com/feed/podcast/" # Lex Fridman Podcast
]

# Example YouTube channels (add your own)
YOUTUBE_CHANNELS = [
"https://www.youtube.com/@matthew_berman"
]

# Maximum number of episodes to pull from each feed
MAX_EPISODES_PER_FEED = 5

@@ -30,11 +35,16 @@
OLLAMA_URL = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:3b"

# YouTube configuration
YOUTUBE_AUDIO_QUALITY = "192" # Audio quality in kbps
YOUTUBE_MAX_RETRIES = 3
YOUTUBE_TIMEOUT = 300 # Timeout in seconds

# Transcript processing configuration
TRANSCRIPT_CHUNK_TOKENS = 50000 # Tokens per chunk (suitable for most LLM context windows)
TRANSCRIPT_CHUNK_OVERLAP_TOKENS = 500 # Tokens of overlap between chunks

# Scheduling configuration
CHECK_INTERVAL_MINUTES = 60 # How often to check feeds
RETAIN_DAYS = 30 # How many days of history to keep
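
The three YouTube settings above are defined here, but the downloader that consumes them is outside this excerpt. Assuming the download step is backed by yt-dlp (an assumption, not something the diff confirms), the settings could map onto its options roughly as follows; `download_youtube_audio` is a hypothetical helper:

```python
import yt_dlp  # assumed backend; the actual downloader is not shown in this diff

import config


def download_youtube_audio(video_url: str, output_dir: str) -> None:
    """Download a video's audio track using the new YouTube settings."""
    options = {
        "format": "bestaudio/best",
        "outtmpl": f"{output_dir}/%(title)s.%(ext)s",
        "retries": config.YOUTUBE_MAX_RETRIES,     # 3
        "socket_timeout": config.YOUTUBE_TIMEOUT,  # 300 seconds
        "postprocessors": [{
            "key": "FFmpegExtractAudio",           # relies on FFmpeg, already a project dependency
            "preferredcodec": "mp3",
            "preferredquality": config.YOUTUBE_AUDIO_QUALITY,  # "192" kbps
        }],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([video_url])
```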

31 changes: 25 additions & 6 deletions database.py
@@ -1,26 +1,33 @@
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text, Enum
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
import datetime
import config
import os
import enum

engine = create_engine(f"sqlite:///{config.DB_PATH}", echo=False)
SessionLocal = sessionmaker(bind=engine)
Base = declarative_base()

class ContentType(enum.Enum):
PODCAST = "podcast"
YOUTUBE = "youtube"

class Show(Base):
__tablename__ = "shows"

id = Column(Integer, primary_key=True, index=True)
feed_url = Column(String, unique=True, index=True)
title = Column(String)
content_type = Column(Enum(ContentType), default=ContentType.PODCAST)
channel_id = Column(String, index=True) # For YouTube channels
created_at = Column(DateTime, default=datetime.datetime.utcnow)

# Relationship to episodes
episodes = relationship("PodcastEpisode", back_populates="show", cascade="all, delete-orphan")
episodes = relationship("Episode", back_populates="show", cascade="all, delete-orphan")

class PodcastEpisode(Base):
class Episode(Base):
__tablename__ = "episodes"

id = Column(Integer, primary_key=True, index=True)
@@ -36,6 +43,9 @@ class PodcastEpisode(Base):
created_at = Column(DateTime, default=datetime.datetime.utcnow)
file_size = Column(Integer) # Size in bytes
duration = Column(Integer) # Duration in seconds
video_id = Column(String, index=True) # For YouTube videos
thumbnail_url = Column(String) # For YouTube videos
original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts)

# Relationship to show
show = relationship("Show", back_populates="episodes")
@@ -50,10 +60,12 @@ class EpisodeContent(Base):
size_formatted = Column(String)
summary = Column(Text)
audio_url = Column(String)
thumbnail_url = Column(String) # For YouTube video thumbnails
original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts)
last_updated = Column(DateTime, default=datetime.datetime.utcnow)

# Relationship to parent episode
episode = relationship("PodcastEpisode", backref="content")
episode = relationship("Episode", backref="content")

def init_db():
"""Initialize the database, creating tables if they don't exist."""
@@ -72,8 +84,8 @@ def cleanup_old_episodes(days=None):
cutoff_date = datetime.datetime.utcnow() - datetime.timedelta(days=days)

old_episodes = (
session.query(PodcastEpisode)
.filter(PodcastEpisode.created_at < cutoff_date)
session.query(Episode)
.filter(Episode.created_at < cutoff_date)
.all()
)

@@ -132,6 +144,13 @@ def update_episode_content(session, episode):
content.audio_url = f"/audio/{os.path.basename(episode.audio_path)}"
else:
content.audio_url = None

# Add YouTube-specific information if applicable
if episode.show.content_type == ContentType.YOUTUBE:
if episode.video_id:
content.original_url = f"https://www.youtube.com/watch?v={episode.video_id}"
if episode.thumbnail_url:
content.thumbnail_url = episode.thumbnail_url

content.last_updated = datetime.datetime.utcnow()
session.commit()
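
From the calling side, the new `content_type` column on `Show` and `video_id` on `Episode` combine roughly like this — an illustrative query, not part of the diff, relying on the existing `downloaded` flag that `feed_monitor.py` already filters on:

```python
from database import ContentType, Episode, Show, get_db_session

session = get_db_session()

# Pending YouTube downloads: join through the show to check its content type,
# then rebuild the watch URL from the new video_id column.
pending_videos = (
    session.query(Episode)
    .join(Show)
    .filter(Show.content_type == ContentType.YOUTUBE)
    .filter(Episode.downloaded == False)  # noqa: E712 -- SQLAlchemy needs the comparison expression
    .all()
)

for episode in pending_videos:
    watch_url = f"https://www.youtube.com/watch?v={episode.video_id}"
    print(episode.episode_title, watch_url)
```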
8 changes: 4 additions & 4 deletions feed_monitor.py
@@ -2,7 +2,7 @@
import requests
import os
import datetime
from database import PodcastEpisode, get_db_session, Show
from database import Episode, get_db_session, Show
import config
import logging
from urllib.parse import urlparse
@@ -64,7 +64,7 @@ def check_feeds():
for entry in sorted_entries[:config.MAX_EPISODES_PER_FEED]:
# Skip if episode already exists
existing = (
session.query(PodcastEpisode)
session.query(Episode)
.filter_by(show_id=show.id, episode_title=entry.title)
.first()
)
@@ -75,7 +75,7 @@
if hasattr(entry, "published_parsed"):
pub_date = datetime.datetime(*entry.published_parsed[:6])

new_episode = PodcastEpisode(
new_episode = Episode(
show_id=show.id,
episode_title=entry.title,
pub_date=pub_date
@@ -94,7 +94,7 @@ def download_new_episodes():
"""Download audio files for episodes that haven't been downloaded yet."""
session = get_db_session()
episodes_to_download = (
session.query(PodcastEpisode)
session.query(Episode)
.filter_by(downloaded=False)
.all()
)