From b8e50559bacf9806f20e20749a816314a82181be Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 17:40:53 -0600 Subject: [PATCH 1/3] add cursorrules --- .cursorrules | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .cursorrules diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 0000000..941f0b5 --- /dev/null +++ b/.cursorrules @@ -0,0 +1,66 @@ +// PhaseFeed AI Assistant Rules +// These rules guide Cursor AI's behavior when working with this podcast monitoring system + +// Project Context +This is a local podcast monitoring and transcription system that handles RSS feed monitoring, +audio downloads, transcription (via OpenAI Whisper or mlx-whisper), and content summarization +(via OpenAI GPT-4 or Ollama). The system uses SQLite for storage and provides a web interface. + +// Core Technologies +- Python 3.x for backend processing +- Flask/FastAPI for web server +- SQLite for data storage +- OpenAI APIs and local ML models +- FFmpeg for audio processing + +// Code Generation Rules +1. Always include type hints in Python functions +2. Use async/await patterns for I/O operations +3. Implement proper error handling for network and API calls +4. Follow SQLAlchemy best practices for database operations +5. Include comprehensive docstrings for all functions and classes +6. Use dependency injection patterns for better testability +7. Implement proper logging for monitoring and debugging + +// Architecture Guidelines +1. Maintain separation between feed monitoring, transcription, and summarization services +2. Use repository pattern for database operations +3. Implement proper background task handling +4. Follow REST API best practices for web endpoints +5. Use environment variables for configuration +6. Implement proper caching strategies + +// Security Considerations +1. Never hardcode API keys or credentials +2. Sanitize all user inputs +3. Implement proper file handling security +4. Use secure methods for storing sensitive data +5. Validate RSS feed sources + +// Testing Requirements +1. Write unit tests for core business logic +2. Include integration tests for API endpoints +3. Mock external services in tests +4. Test error handling scenarios +5. Include performance testing for long-running operations + +// Documentation +1. Include clear function and method documentation +2. Document API endpoints with OpenAPI/Swagger +3. Provide clear setup instructions +4. Document configuration options +5. Include troubleshooting guides + +// Performance Guidelines +1. Implement proper database indexing +2. Use connection pooling +3. Implement caching where appropriate +4. Handle large audio files efficiently +5. Optimize database queries + +// Maintenance +1. Include proper cleanup of old files +2. Implement monitoring and health checks +3. Handle database migrations properly +4. Include proper logging for debugging +5. 
Implement proper error reporting \ No newline at end of file From 7c4f2f9eefbc00d748032faf232f4a6e5d83c3b3 Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 17:41:40 -0600 Subject: [PATCH 2/3] add youtube support --- config.py | 12 +- database.py | 31 +++- feed_monitor.py | 8 +- main.py | 194 ++++++++++------------- progress_handler.py | 32 ++-- requirements.txt | 4 +- scripts/reset_summaries.py | 4 +- summarizer.py | 6 +- templates/index.html | 50 ++++++ transcriber.py | 100 +++++++----- web_server.py | 12 +- youtube_handler.py | 304 +++++++++++++++++++++++++++++++++++++ 12 files changed, 570 insertions(+), 187 deletions(-) create mode 100644 youtube_handler.py diff --git a/config.py b/config.py index af486c9..c4f8ef9 100644 --- a/config.py +++ b/config.py @@ -13,6 +13,11 @@ "https://lexfridman.com/feed/podcast/" # Lex Fridman Podcast ] +# Example YouTube channels (add your own) +YOUTUBE_CHANNELS = [ + "https://www.youtube.com/@matthew_berman" +] + # Maximum number of episodes to pull from each feed MAX_EPISODES_PER_FEED = 5 @@ -30,11 +35,16 @@ OLLAMA_URL = "http://localhost:11434" OLLAMA_MODEL = "qwen2.5:3b" +# YouTube configuration +YOUTUBE_AUDIO_QUALITY = "192" # Audio quality in kbps +YOUTUBE_MAX_RETRIES = 3 +YOUTUBE_TIMEOUT = 300 # Timeout in seconds + # Transcript processing configuration TRANSCRIPT_CHUNK_TOKENS = 50000 # Tokens per chunk (suitable for most LLM context windows) TRANSCRIPT_CHUNK_OVERLAP_TOKENS = 500 # Tokens of overlap between chunks # Scheduling configuration CHECK_INTERVAL_MINUTES = 60 # How often to check feeds -RETAIN_DAYS = 30 # How many days of history to keep +RETAIN_DAYS = 30 # How many days of history to keep diff --git a/database.py b/database.py index 578930b..7e1f264 100644 --- a/database.py +++ b/database.py @@ -1,26 +1,33 @@ -from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text +from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text, Enum from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker, relationship import datetime import config import os +import enum engine = create_engine(f"sqlite:///{config.DB_PATH}", echo=False) SessionLocal = sessionmaker(bind=engine) Base = declarative_base() +class ContentType(enum.Enum): + PODCAST = "podcast" + YOUTUBE = "youtube" + class Show(Base): __tablename__ = "shows" id = Column(Integer, primary_key=True, index=True) feed_url = Column(String, unique=True, index=True) title = Column(String) + content_type = Column(Enum(ContentType), default=ContentType.PODCAST) + channel_id = Column(String, index=True) # For YouTube channels created_at = Column(DateTime, default=datetime.datetime.utcnow) # Relationship to episodes - episodes = relationship("PodcastEpisode", back_populates="show", cascade="all, delete-orphan") + episodes = relationship("Episode", back_populates="show", cascade="all, delete-orphan") -class PodcastEpisode(Base): +class Episode(Base): __tablename__ = "episodes" id = Column(Integer, primary_key=True, index=True) @@ -36,6 +43,9 @@ class PodcastEpisode(Base): created_at = Column(DateTime, default=datetime.datetime.utcnow) file_size = Column(Integer) # Size in bytes duration = Column(Integer) # Duration in seconds + video_id = Column(String, index=True) # For YouTube videos + thumbnail_url = Column(String) # For YouTube videos + original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts) # Relationship to show show = 
relationship("Show", back_populates="episodes") @@ -50,10 +60,12 @@ class EpisodeContent(Base): size_formatted = Column(String) summary = Column(Text) audio_url = Column(String) + thumbnail_url = Column(String) # For YouTube video thumbnails + original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts) last_updated = Column(DateTime, default=datetime.datetime.utcnow) # Relationship to parent episode - episode = relationship("PodcastEpisode", backref="content") + episode = relationship("Episode", backref="content") def init_db(): """Initialize the database, creating tables if they don't exist.""" @@ -72,8 +84,8 @@ def cleanup_old_episodes(days=None): cutoff_date = datetime.datetime.utcnow() - datetime.timedelta(days=days) old_episodes = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.created_at < cutoff_date) + session.query(Episode) + .filter(Episode.created_at < cutoff_date) .all() ) @@ -132,6 +144,13 @@ def update_episode_content(session, episode): content.audio_url = f"/audio/{os.path.basename(episode.audio_path)}" else: content.audio_url = None + + # Add YouTube-specific information if applicable + if episode.show.content_type == ContentType.YOUTUBE: + if episode.video_id: + content.original_url = f"https://www.youtube.com/watch?v={episode.video_id}" + if episode.thumbnail_url: + content.thumbnail_url = episode.thumbnail_url content.last_updated = datetime.datetime.utcnow() session.commit() \ No newline at end of file diff --git a/feed_monitor.py b/feed_monitor.py index 4e39369..ffb3253 100644 --- a/feed_monitor.py +++ b/feed_monitor.py @@ -2,7 +2,7 @@ import requests import os import datetime -from database import PodcastEpisode, get_db_session, Show +from database import Episode, get_db_session, Show import config import logging from urllib.parse import urlparse @@ -64,7 +64,7 @@ def check_feeds(): for entry in sorted_entries[:config.MAX_EPISODES_PER_FEED]: # Skip if episode already exists existing = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(show_id=show.id, episode_title=entry.title) .first() ) @@ -75,7 +75,7 @@ def check_feeds(): if hasattr(entry, "published_parsed"): pub_date = datetime.datetime(*entry.published_parsed[:6]) - new_episode = PodcastEpisode( + new_episode = Episode( show_id=show.id, episode_title=entry.title, pub_date=pub_date @@ -94,7 +94,7 @@ def download_new_episodes(): """Download audio files for episodes that haven't been downloaded yet.""" session = get_db_session() episodes_to_download = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(downloaded=False) .all() ) diff --git a/main.py b/main.py index 8ecbaf0..5755ec5 100644 --- a/main.py +++ b/main.py @@ -1,26 +1,21 @@ +import asyncio +import schedule +import time import os -import json -import logging -from datetime import datetime, timedelta -from dotenv import load_dotenv -from apscheduler.schedulers.background import BackgroundScheduler -from database import init_db, cleanup_old_episodes from feed_monitor import check_feeds, download_new_episodes -from transcriber import ( - TranscriptionService, - LocalWhisperTranscriber, - OpenAIWhisperTranscriber -) +from youtube_handler import check_youtube_feeds, download_youtube_videos +from transcriber import TranscriptionService, get_transcriber from summarizer import summarize_episodes +from database import init_db, cleanup_old_episodes +import logging import config import openlit +from dotenv import load_dotenv +from datetime import datetime, timedelta load_dotenv() 
-logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Initialize OpenLIT if OTLP endpoint is configured @@ -31,119 +26,98 @@ else: logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT not set, OpenLIT initialization skipped") -def get_transcriber(): - """Initialize and return the appropriate transcriber based on configuration.""" - if config.TRANSCRIPTION_MODE == "openai": - return OpenAIWhisperTranscriber() - else: # default to local - return LocalWhisperTranscriber(model_path=config.WHISPER_MODEL) - -def generate_daily_feed(): - """Generate a JSON feed of recent episodes with transcripts and summaries.""" - from database import get_db_session, PodcastEpisode - - session = get_db_session() - yesterday = datetime.utcnow() - timedelta(days=1) - - recent_episodes = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.created_at >= yesterday) - .all() - ) - - feed_entries = [] - for ep in recent_episodes: - entry = { - "podcast_title": ep.show.title, - "episode_title": ep.episode_title, - "publication_date": ep.pub_date.isoformat() if ep.pub_date else None, - "duration_seconds": ep.duration, - "file_size_bytes": ep.file_size, - "audio_path": ep.audio_path, - "transcript_path": ep.transcript_path if ep.transcribed else None, - "summary_path": ep.summary_path if ep.summarized else None - } - feed_entries.append(entry) - - feed_file = os.path.join(config.AUDIO_STORAGE_PATH, "daily_feed.json") - os.makedirs(os.path.dirname(feed_file), exist_ok=True) - - with open(feed_file, "w", encoding="utf-8") as f: - json.dump( - { - "generated_at": datetime.utcnow().isoformat(), - "episodes": feed_entries - }, - f, - indent=2 - ) - - logger.info(f"Generated daily feed with {len(feed_entries)} episodes") - session.close() +def setup_directories(): + """Create necessary directories if they don't exist.""" + os.makedirs(config.AUDIO_STORAGE_PATH, exist_ok=True) + os.makedirs(config.TRANSCRIPT_STORAGE_PATH, exist_ok=True) + logger.info("Storage directories initialized") -def process_episodes(): - """Main processing function that runs all steps in sequence.""" +async def run_feed_checks(): + """Run feed checks for both podcasts and YouTube channels.""" try: - logger.info("Starting episode processing...") - - # Check feeds for new episodes check_feeds() - - # Download new episodes + check_youtube_feeds() + except Exception as e: + logger.error(f"Error in feed checks: {str(e)}") + +async def run_downloads(): + """Run downloads for both podcasts and YouTube videos.""" + try: download_new_episodes() - - # Generate transcripts + download_youtube_videos() + except Exception as e: + logger.error(f"Error in downloads: {str(e)}") + +async def run_transcriptions(): + """Process transcription queue.""" + try: transcriber = get_transcriber() - transcription_service = TranscriptionService(transcriber) - transcription_service.transcribe_episodes() - - # Generate summaries (if Ollama is configured) + service = TranscriptionService(transcriber) + service.transcribe_episodes() + except Exception as e: + logger.error(f"Error in transcriptions: {str(e)}") + +async def run_summaries(): + """Process summary queue.""" + try: summarize_episodes() - - # Generate daily feed - generate_daily_feed() - - # Cleanup old episodes + except Exception as e: + logger.error(f"Error in summaries: {str(e)}") + +async def cleanup(): + """Run cleanup tasks.""" + try: cleanup_old_episodes() - - logger.info("Episode 
processing complete") - except Exception as e: - logger.error(f"Error in process_episodes: {e}") + logger.error(f"Error in cleanup: {str(e)}") -def setup_directories(): - """Create necessary directories if they don't exist.""" - os.makedirs(config.AUDIO_STORAGE_PATH, exist_ok=True) - os.makedirs(config.TRANSCRIPT_STORAGE_PATH, exist_ok=True) +def log_next_run(job_name: str, minutes: int): + """Log when a job will next run based on its schedule.""" + now = datetime.now() + next_run = now + timedelta(minutes=minutes) + logger.info(f"Next {job_name} scheduled for: {next_run.strftime('%Y-%m-%d %H:%M:%S')}") -def main(): - """Main entry point.""" +async def main(): + """Main application loop.""" # Create directories setup_directories() # Initialize database init_db() + logger.info("Database initialized") - # Set up scheduler - scheduler = BackgroundScheduler() + # Initialize transcription service + transcriber = get_transcriber() + transcription_service = TranscriptionService(transcriber) - # Schedule regular processing - scheduler.add_job( - process_episodes, - 'interval', - minutes=config.CHECK_INTERVAL_MINUTES, - next_run_time=datetime.now() - ) + # Schedule all jobs + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(check_feeds) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(check_youtube_feeds) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(download_youtube_videos) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(lambda: transcription_service.transcribe_episodes()) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(summarize_episodes) - # Start the scheduler - scheduler.start() + # Run all jobs immediately on startup + logger.info("Starting initial run of all jobs...") + check_feeds() + check_youtube_feeds() + download_youtube_videos() + transcription_service.transcribe_episodes() + summarize_episodes() - try: - # Keep the main thread alive - while True: - pass - except (KeyboardInterrupt, SystemExit): - scheduler.shutdown() + logger.info("Initial run complete. 
Starting scheduled execution...") + + while True: + schedule.run_pending() + + # Log next run times for all jobs + log_next_run("feed check", config.CHECK_INTERVAL_MINUTES) + log_next_run("YouTube feed check", config.CHECK_INTERVAL_MINUTES) + log_next_run("YouTube download", config.CHECK_INTERVAL_MINUTES) + log_next_run("transcription", config.CHECK_INTERVAL_MINUTES) + log_next_run("summarization", config.CHECK_INTERVAL_MINUTES) + + await asyncio.sleep(60) # Sleep for 1 minute if __name__ == "__main__": - main() \ No newline at end of file + asyncio.run(main()) \ No newline at end of file diff --git a/progress_handler.py b/progress_handler.py index 7a2d3d4..e06f33d 100644 --- a/progress_handler.py +++ b/progress_handler.py @@ -2,6 +2,9 @@ import threading from typing import Union import tqdm +import logging + +logger = logging.getLogger(__name__) class ProgressListener: def on_progress(self, current: Union[int, float], total: Union[int, float]): @@ -69,23 +72,20 @@ def create_progress_listener_handle(progress_listener: ProgressListener): class DownloadProgressBar: def __init__(self, episode_title): - self.pbar = None self.episode_title = episode_title - - def __call__(self, block_num, block_size, total_size): - if not self.pbar: - self.pbar = tqdm.tqdm( - total=total_size, - desc=f"Downloading {self.episode_title}", - unit='iB', - unit_scale=True, - unit_divisor=1024, - ) + self.started = False - downloaded = block_num * block_size - if downloaded <= total_size: - self.pbar.update(block_size) + def yt_dlp_hook(self, d): + if d['status'] == 'downloading': + if not self.started: + self.started = True + logger.info(f"Starting download of: {self.episode_title}") + + if 'total_bytes' in d and 'downloaded_bytes' in d: + percentage = (d['downloaded_bytes'] / d['total_bytes']) * 100 + logger.info(f"Download progress for {self.episode_title}: {percentage:.1f}%") + elif d['status'] == 'finished': + logger.info(f"Download completed for: {self.episode_title}") def close(self): - if self.pbar: - self.pbar.close() \ No newline at end of file + pass \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 91a3b7e..fc1996f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,6 @@ openlit ollama markdown2 tiktoken<0.7.0,>=0.6.0 -langchain-text-splitters>=0.0.1 \ No newline at end of file +langchain-text-splitters>=0.0.1 +yt-dlp>=2023.12.30 +ffmpeg-python>=0.2.0 \ No newline at end of file diff --git a/scripts/reset_summaries.py b/scripts/reset_summaries.py index f1736e0..253033b 100644 --- a/scripts/reset_summaries.py +++ b/scripts/reset_summaries.py @@ -13,7 +13,7 @@ # Add parent directory to Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from database import get_db_session, PodcastEpisode, EpisodeContent +from database import get_db_session, Episode, EpisodeContent import logging logging.basicConfig(level=logging.INFO) @@ -24,7 +24,7 @@ def reset_summaries(): session = get_db_session() try: # Get all episodes that have been summarized - episodes = session.query(PodcastEpisode).filter_by(summarized=True).all() + episodes = session.query(Episode).filter_by(summarized=True).all() for ep in episodes: # Delete the summary file if it exists diff --git a/summarizer.py b/summarizer.py index 14fb758..299dd1c 100644 --- a/summarizer.py +++ b/summarizer.py @@ -1,7 +1,7 @@ import os import logging from ollama import Client -from database import PodcastEpisode, get_db_session, update_episode_content +from database import Episode, 
get_db_session, update_episode_content import config import openai from abc import ABC, abstractmethod @@ -267,7 +267,7 @@ def summarize_episodes(): """Find all transcribed but not summarized episodes and generate summaries.""" session = get_db_session() episodes = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(transcribed=True, summarized=False) .all() ) @@ -357,7 +357,7 @@ def summarize_episodes(): def get_summary(episode_id): """Retrieve summary for a specific episode.""" session = get_db_session() - episode = session.query(PodcastEpisode).filter_by(id=episode_id).first() + episode = session.query(Episode).filter_by(id=episode_id).first() if not episode or not episode.summary_path: return None diff --git a/templates/index.html b/templates/index.html index ae966af..9a1a990 100644 --- a/templates/index.html +++ b/templates/index.html @@ -362,6 +362,43 @@ font-size: 0.875rem; text-align: center; } + + .content-type { + margin-bottom: 0.5rem; + } + + .badge { + display: inline-block; + padding: 0.25rem 0.75rem; + border-radius: 9999px; + font-size: 0.75rem; + font-weight: 500; + text-transform: uppercase; + letter-spacing: 0.05em; + } + + .badge.youtube { + background-color: #ff0000; + color: white; + } + + .badge.podcast { + background-color: #8b5cf6; + color: white; + } + + .thumbnail { + margin: 1rem 0; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + } + + .thumbnail img { + width: 100%; + height: auto; + display: block; + } @@ -414,11 +451,24 @@

PhaseFeed

+                    <div class="content-type">
+                        {% if episode.content_type == 'YOUTUBE' %}
+                        <span class="badge youtube">YouTube</span>
+                        {% else %}
+                        <span class="badge podcast">Podcast</span>
+                        {% endif %}
+                    </div>

{{ episode.podcast_title }}

{{ episode.episode_title }}

+                    {% if episode.content_type == 'YOUTUBE' and episode.thumbnail_url %}
+                    <div class="thumbnail">
+                        <img src="{{ episode.thumbnail_url }}" alt="Video thumbnail">
+                    </div>
+                    {% endif %}
{{ episode.formatted_date }} {{ episode.duration_formatted }} diff --git a/transcriber.py b/transcriber.py index 4d3529f..853e29a 100644 --- a/transcriber.py +++ b/transcriber.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from typing import Optional import openai -from database import PodcastEpisode, get_db_session +from database import Episode, Show, get_db_session import config from tqdm import tqdm from progress_handler import ProgressListener, create_progress_listener_handle @@ -197,28 +197,40 @@ def transcribe_episodes(self): """Find all downloaded but not transcribed episodes and generate transcripts.""" session = get_db_session() episodes = ( - session.query(PodcastEpisode) - .filter_by(downloaded=True, transcribed=False) + session.query(Episode) + .join(Show) + .filter( + Episode.downloaded == True, + Episode.transcribed == False + ) .all() ) - - for ep in tqdm(episodes, desc="Processing episodes", unit="episode"): - if not ep.audio_path or not os.path.exists(ep.audio_path): - logger.error(f"Audio file not found for {ep.episode_title}") - continue - + + if not episodes: + logger.info("No new episodes to transcribe") + return + + logger.info(f"Found {len(episodes)} episodes to transcribe") + for ep in episodes: try: - logger.info(f"Starting transcription of {ep.episode_title}...") - - # Ensure transcript directory exists - self.ensure_transcript_dir() + logger.info(f"Processing episode: {ep.episode_title}") - # Generate transcript with progress tracking - progress_listener = TranscriptionProgressListener(ep.episode_title) - transcript = self.transcriber.transcribe_audio(ep.audio_path, progress_listener) - - # Format transcript with metadata - transcript_text = f"""Title: {ep.episode_title} + if not ep.audio_path or not os.path.exists(ep.audio_path): + logger.error(f"Audio file not found for {ep.episode_title}") + continue + + try: + logger.info(f"Starting transcription of {ep.episode_title}...") + + # Ensure transcript directory exists + self.ensure_transcript_dir() + + # Generate transcript with progress tracking + progress_listener = TranscriptionProgressListener(ep.episode_title) + transcript = self.transcriber.transcribe_audio(ep.audio_path, progress_listener) + + # Format transcript with metadata + transcript_text = f"""Title: {ep.episode_title} Podcast: {ep.show.title} Date: {ep.pub_date} Duration: {ep.duration} seconds @@ -226,26 +238,29 @@ def transcribe_episodes(self): Transcript: {transcript} """ - - # Save transcript - safe_filename = "".join([c for c in ep.episode_title if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() - transcript_path = os.path.join( - config.TRANSCRIPT_STORAGE_PATH, - f"{ep.show.title}_{safe_filename}.txt" - ) - - with open(transcript_path, "w", encoding="utf-8") as f: - f.write(transcript_text) - - # Update database - ep.transcript_path = transcript_path - ep.transcribed = True - session.commit() - - logger.info(f"Successfully transcribed: {ep.episode_title}") - + + # Save transcript + safe_filename = "".join([c for c in ep.episode_title if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() + transcript_path = os.path.join( + config.TRANSCRIPT_STORAGE_PATH, + f"{ep.show.title}_{safe_filename}.txt" + ) + + with open(transcript_path, "w", encoding="utf-8") as f: + f.write(transcript_text) + + # Update database + ep.transcript_path = transcript_path + ep.transcribed = True + session.commit() + + logger.info(f"Successfully transcribed: {ep.episode_title}") + + except Exception as e: + logger.error(f"Failed to transcribe 
{ep.episode_title}: {e}") + continue except Exception as e: - logger.error(f"Failed to transcribe {ep.episode_title}: {e}") + logger.error(f"Failed to process episode {ep.episode_title}: {e}") continue session.close() @@ -253,7 +268,7 @@ def transcribe_episodes(self): def get_transcript(self, episode_id): """Retrieve transcript for a specific episode.""" session = get_db_session() - episode = session.query(PodcastEpisode).filter_by(id=episode_id).first() + episode = session.query(Episode).filter_by(id=episode_id).first() if not episode or not episode.transcript_path: return None @@ -267,6 +282,13 @@ def get_transcript(self, episode_id): finally: session.close() +def get_transcriber() -> BaseTranscriber: + """Initialize and return the appropriate transcriber based on configuration.""" + if config.TRANSCRIPTION_MODE == "openai": + return OpenAIWhisperTranscriber() + else: # default to local + return LocalWhisperTranscriber(model_path=config.WHISPER_MODEL) + # Context manager for null progress listener from contextlib import contextmanager diff --git a/web_server.py b/web_server.py index 850e8b3..f2f8ec8 100644 --- a/web_server.py +++ b/web_server.py @@ -6,7 +6,7 @@ import os import config from datetime import datetime -from database import get_db_session, EpisodeContent, PodcastEpisode +from database import get_db_session, EpisodeContent, Episode from urllib.parse import unquote import markdown2 import logging @@ -38,7 +38,7 @@ def get_episodes(): query = ( session.query(EpisodeContent) .join(EpisodeContent.episode) - .order_by(PodcastEpisode.pub_date.desc()) + .order_by(Episode.pub_date.desc()) ) logger.debug(f"Executing query: {query}") @@ -59,7 +59,9 @@ def get_episodes(): 'formatted_date': content.formatted_date, 'duration_formatted': content.duration_formatted, 'size_formatted': content.size_formatted, - 'summary': summary_html + 'summary': summary_html, + 'content_type': content.episode.show.content_type.name, + 'thumbnail_url': content.episode.thumbnail_url } episodes_data.append(episode_data) except Exception as e: @@ -105,8 +107,8 @@ async def get_audio(episode_id: int): session = get_db_session() try: episode = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.id == episode_id) + session.query(Episode) + .filter(Episode.id == episode_id) .first() ) diff --git a/youtube_handler.py b/youtube_handler.py new file mode 100644 index 0000000..c102c1c --- /dev/null +++ b/youtube_handler.py @@ -0,0 +1,304 @@ +import feedparser +import yt_dlp +import os +import datetime +import requests +import re +from database import Show, Episode, ContentType, get_db_session +import config +import logging +from urllib.parse import urlparse, parse_qs +import mimetypes +from progress_handler import DownloadProgressBar +from bs4 import BeautifulSoup + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def sanitize_filename(filename): + """Create a safe filename from potentially unsafe string.""" + if not filename: + return "" + return "".join([c for c in filename if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() + +def extract_channel_id(url: str) -> str | None: + """Extract channel ID from various YouTube URL formats.""" + parsed = urlparse(url) + if parsed.hostname not in ('www.youtube.com', 'youtube.com'): + return None + + # Direct channel ID format: /channel/UCxxxxxx + if parsed.path.startswith('/channel/'): + return parsed.path.split('/')[2] + + # Handle /@username and /c/ formats by fetching the page + if parsed.path.startswith('/@') or 
parsed.path.startswith('/c/'): + try: + response = requests.get(url, timeout=config.YOUTUBE_TIMEOUT) + response.raise_for_status() + + # Extract channel ID from meta tags or canonical URL + soup = BeautifulSoup(response.text, 'html.parser') + + # Try meta tag first + meta_tag = soup.find('meta', {'itemprop': 'channelId'}) + if meta_tag and meta_tag.get('content'): + return meta_tag['content'] + + # Try canonical URL + canonical = soup.find('link', {'rel': 'canonical'}) + if canonical and 'href' in canonical.attrs: + channel_match = re.search(r'channel/([^/]+)', canonical['href']) + if channel_match: + return channel_match.group(1) + + except Exception as e: + logger.error(f"Error extracting channel ID from {url}: {str(e)}") + + return None + +def get_feed_url(channel_id: str) -> str: + """Get RSS feed URL for a YouTube channel.""" + return f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}" + +def check_youtube_feeds(): + """Check YouTube channel feeds for new videos.""" + session = get_db_session() + + try: + # First, ensure all channels have proper channel IDs + for url in config.YOUTUBE_CHANNELS: + existing = session.query(Show).filter_by(feed_url=url).first() + if not existing: + channel_id = extract_channel_id(url) + if channel_id: + # Use yt-dlp to get channel info + ydl_opts = { + 'quiet': True, + 'no_warnings': True, + 'extract_flat': True, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + channel_info = ydl.extract_info(url, download=False) + channel_title = channel_info.get('channel') or channel_info.get('uploader') + + show = Show( + feed_url=url, + channel_id=channel_id, + content_type=ContentType.YOUTUBE, + title=channel_title or f"channel_{channel_id}" # Fallback if title not found + ) + session.add(show) + logger.info(f"Added new YouTube channel: {channel_title or url}") + else: + logger.error(f"Could not extract channel ID from: {url}") + + session.commit() + + # Now check feeds for all channels + youtube_shows = session.query(Show).filter_by(content_type=ContentType.YOUTUBE).all() + + for show in youtube_shows: + try: + feed_url = get_feed_url(show.channel_id) + logger.info(f"Checking YouTube feed: {feed_url}") + + feed = feedparser.parse(feed_url) + if feed.bozo: + logger.error(f"Error parsing feed: {feed_url} - {feed.bozo_exception}") + continue + + for entry in feed.entries[:config.MAX_EPISODES_PER_FEED]: + video_id = entry.yt_videoid + + # Check if we already have this video + existing = session.query(Episode).filter_by( + show_id=show.id, + video_id=video_id + ).first() + + if not existing: + # Create new episode + episode = Episode( + show_id=show.id, + episode_title=entry.title, + pub_date=datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%S%z"), + video_id=video_id, + thumbnail_url=entry.media_thumbnail[0]['url'] if entry.get('media_thumbnail') else None, + original_url=entry.link + ) + session.add(episode) + logger.info(f"Added new video: {entry.title}") + + session.commit() + + except Exception as e: + logger.error(f"Error processing YouTube feed for {show.feed_url}: {str(e)}") + session.rollback() + continue + + except Exception as e: + logger.error(f"Error in check_youtube_feeds: {str(e)}") + session.rollback() + finally: + session.close() + +def download_youtube_videos(): + """Download new YouTube videos and extract audio.""" + session = get_db_session() + + try: + new_episodes = ( + session.query(Episode) + .join(Show) + .filter( + Show.content_type == ContentType.YOUTUBE, + Episode.downloaded == False + ) + .all() + ) + + if not 
new_episodes: + logger.info("No new YouTube episodes to download") + return + + for episode in new_episodes: + # Create channel-specific directory + channel_dir = os.path.join(config.AUDIO_STORAGE_PATH, sanitize_filename(episode.show.title or f"channel_{episode.show.id}")) + os.makedirs(channel_dir, exist_ok=True) + + # Setup output path in channel directory + output_path = os.path.join( + channel_dir, + f"{sanitize_filename(episode.episode_title)}_{episode.video_id}" # Removed .mp3 extension + ) + + success = False + error_msg = None + + try: + # Setup progress tracking + progress_bar = DownloadProgressBar(episode.episode_title) + + # Setup yt-dlp options + ydl_opts = { + # Format selection + 'format': 'bestaudio/best', + 'format_sort': ['abr', 'asr', 'res', 'br'], # Prefer better audio quality + + # Audio extraction + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': config.YOUTUBE_AUDIO_QUALITY, + }], + + # Output settings + 'outtmpl': output_path, + 'writethumbnail': False, + + # Download settings + 'progress_hooks': [progress_bar.yt_dlp_hook], + 'retries': config.YOUTUBE_MAX_RETRIES, + 'fragment_retries': config.YOUTUBE_MAX_RETRIES, + 'socket_timeout': config.YOUTUBE_TIMEOUT, + 'extractor_retries': config.YOUTUBE_MAX_RETRIES, + + # Network settings + 'socket_timeout': config.YOUTUBE_TIMEOUT, + 'nocheckcertificate': False, + + # Error handling + 'ignoreerrors': False, + 'no_warnings': False, + 'verbose': False, + + # Geo-restriction handling + 'geo_bypass': True, + 'geo_bypass_country': 'US', + + # System settings + 'quiet': False, + 'no_color': False, + + # Sponsorblock settings (optional) + # 'sponsorblock_remove': ['sponsor', 'intro', 'outro', 'selfpromo'], + + # Age-gate bypass + 'cookiesfrombrowser': None, # Can be set to ('chrome', 'firefox', etc) + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + logger.info(f"Downloading video: {episode.episode_title}") + ydl.download([episode.original_url]) + success = True + + except yt_dlp.utils.DownloadError as e: + error_msg = str(e) + if "Video unavailable" in error_msg: + logger.error(f"Video {episode.episode_title} is no longer available") + elif "Sign in to confirm your age" in error_msg: + logger.error(f"Video {episode.episode_title} is age restricted. 
Consider setting cookiesfrombrowser") + elif "The uploader has not made this video available in your country" in error_msg: + logger.error(f"Video {episode.episode_title} is geo-restricted") + elif "This video is only available to users with special access" in error_msg: + logger.error(f"Video {episode.episode_title} requires special access (members only, etc)") + else: + logger.error(f"Error downloading video {episode.episode_title}: {error_msg}") + # Cleanup partial download if it exists + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + except Exception as e: + error_msg = str(e) + logger.error(f"Unexpected error downloading video {episode.episode_title}: {error_msg}") + # Cleanup partial download if it exists + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + if success: + try: + # Get final output path with .mp3 extension + final_output_path = f"{output_path}.mp3" + + # Update episode record + episode.audio_path = final_output_path + episode.downloaded = True + episode.file_size = os.path.getsize(final_output_path) + + # Get duration using existing function from feed_monitor + from feed_monitor import get_audio_duration + episode.duration = get_audio_duration(final_output_path) + + # Log the values before committing + logger.info(f"Updating database for {episode.episode_title}:") + logger.info(f" - audio_path: {final_output_path}") + logger.info(f" - downloaded: True") + logger.info(f" - file_size: {episode.file_size} bytes") + logger.info(f" - duration: {episode.duration} seconds") + + session.commit() + logger.info(f"Successfully downloaded and processed: {episode.episode_title}") + + except Exception as e: + logger.error(f"Error updating episode record for {episode.episode_title}: {str(e)}") + session.rollback() + # Cleanup downloaded file if we couldn't update the database + if os.path.exists(final_output_path): + try: + os.remove(final_output_path) + logger.info(f"Cleaned up failed download: {final_output_path}") + except OSError as ose: + logger.error(f"Failed to clean up file {final_output_path}: {str(ose)}") + + except Exception as e: + logger.error(f"Error in download_youtube_videos: {str(e)}") + session.rollback() + finally: + session.close() \ No newline at end of file From 19d89b9e0bd8cc0b80b59eed5acca68a8a3610ac Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 18:08:56 -0600 Subject: [PATCH 3/3] Update readme --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6d21ccf..1e620a6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # PhaseFeed -A local podcast monitoring and transcription system that: -- Monitors RSS feeds for new podcast episodes -- Downloads new episodes automatically +A local podcast and YouTube monitoring system that: +- Monitors RSS feeds for new podcast episodes and YouTube channels for new videos +- Downloads new episodes and YouTube videos automatically - Transcribes audio using either OpenAI Whisper API or local mlx-whisper - Summarizes content using either OpenAI GPT-4 or local LLMs via Ollama - Stores metadata in SQLite @@ -49,12 +49,18 @@ cp .env.example .env The application can be configured through `config.py`. 
Key settings include: ### Storage Configuration -- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts (default: `~/Podcasts`) +- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts and YouTube audio (default: `~/Podcasts`) - `TRANSCRIPT_STORAGE_PATH`: Where to store transcripts (default: `~/Podcasts/Transcripts`) ### Feed Configuration - `PODCAST_FEEDS`: List of RSS feed URLs to monitor -- `MAX_EPISODES_PER_FEED`: Maximum number of episodes to pull from each feed (default: 5) +- `YOUTUBE_CHANNELS`: List of YouTube channel URLs to monitor +- `MAX_EPISODES_PER_FEED`: Maximum number of episodes/videos to pull from each feed (default: 5) + +### YouTube Configuration +- `YOUTUBE_AUDIO_QUALITY`: Audio quality for downloaded videos in kbps (default: 192) +- `YOUTUBE_MAX_RETRIES`: Number of retry attempts for failed downloads (default: 3) +- `YOUTUBE_TIMEOUT`: Download timeout in seconds (default: 300) ### Transcription Configuration - `TRANSCRIPTION_MODE`: Choose between "local" or "openai"