From b8e50559bacf9806f20e20749a816314a82181be Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 17:40:53 -0600 Subject: [PATCH 1/3] add cursorrules --- .cursorrules | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .cursorrules diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 0000000..941f0b5 --- /dev/null +++ b/.cursorrules @@ -0,0 +1,66 @@ +// PhaseFeed AI Assistant Rules +// These rules guide Cursor AI's behavior when working with this podcast monitoring system + +// Project Context +This is a local podcast monitoring and transcription system that handles RSS feed monitoring, +audio downloads, transcription (via OpenAI Whisper or mlx-whisper), and content summarization +(via OpenAI GPT-4 or Ollama). The system uses SQLite for storage and provides a web interface. + +// Core Technologies +- Python 3.x for backend processing +- Flask/FastAPI for web server +- SQLite for data storage +- OpenAI APIs and local ML models +- FFmpeg for audio processing + +// Code Generation Rules +1. Always include type hints in Python functions +2. Use async/await patterns for I/O operations +3. Implement proper error handling for network and API calls +4. Follow SQLAlchemy best practices for database operations +5. Include comprehensive docstrings for all functions and classes +6. Use dependency injection patterns for better testability +7. Implement proper logging for monitoring and debugging + +// Architecture Guidelines +1. Maintain separation between feed monitoring, transcription, and summarization services +2. Use repository pattern for database operations +3. Implement proper background task handling +4. Follow REST API best practices for web endpoints +5. Use environment variables for configuration +6. Implement proper caching strategies + +// Security Considerations +1. Never hardcode API keys or credentials +2. Sanitize all user inputs +3. Implement proper file handling security +4. Use secure methods for storing sensitive data +5. Validate RSS feed sources + +// Testing Requirements +1. Write unit tests for core business logic +2. Include integration tests for API endpoints +3. Mock external services in tests +4. Test error handling scenarios +5. Include performance testing for long-running operations + +// Documentation +1. Include clear function and method documentation +2. Document API endpoints with OpenAPI/Swagger +3. Provide clear setup instructions +4. Document configuration options +5. Include troubleshooting guides + +// Performance Guidelines +1. Implement proper database indexing +2. Use connection pooling +3. Implement caching where appropriate +4. Handle large audio files efficiently +5. Optimize database queries + +// Maintenance +1. Include proper cleanup of old files +2. Implement monitoring and health checks +3. Handle database migrations properly +4. Include proper logging for debugging +5. 
Implement proper error reporting \ No newline at end of file From 7c4f2f9eefbc00d748032faf232f4a6e5d83c3b3 Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 17:41:40 -0600 Subject: [PATCH 2/3] add youtube support --- config.py | 12 +- database.py | 31 +++- feed_monitor.py | 8 +- main.py | 194 ++++++++++------------- progress_handler.py | 32 ++-- requirements.txt | 4 +- scripts/reset_summaries.py | 4 +- summarizer.py | 6 +- templates/index.html | 50 ++++++ transcriber.py | 100 +++++++----- web_server.py | 12 +- youtube_handler.py | 304 +++++++++++++++++++++++++++++++++++++ 12 files changed, 570 insertions(+), 187 deletions(-) create mode 100644 youtube_handler.py diff --git a/config.py b/config.py index af486c9..c4f8ef9 100644 --- a/config.py +++ b/config.py @@ -13,6 +13,11 @@ "https://lexfridman.com/feed/podcast/" # Lex Fridman Podcast ] +# Example YouTube channels (add your own) +YOUTUBE_CHANNELS = [ + "https://www.youtube.com/@matthew_berman" +] + # Maximum number of episodes to pull from each feed MAX_EPISODES_PER_FEED = 5 @@ -30,11 +35,16 @@ OLLAMA_URL = "http://localhost:11434" OLLAMA_MODEL = "qwen2.5:3b" +# YouTube configuration +YOUTUBE_AUDIO_QUALITY = "192" # Audio quality in kbps +YOUTUBE_MAX_RETRIES = 3 +YOUTUBE_TIMEOUT = 300 # Timeout in seconds + # Transcript processing configuration TRANSCRIPT_CHUNK_TOKENS = 50000 # Tokens per chunk (suitable for most LLM context windows) TRANSCRIPT_CHUNK_OVERLAP_TOKENS = 500 # Tokens of overlap between chunks # Scheduling configuration CHECK_INTERVAL_MINUTES = 60 # How often to check feeds -RETAIN_DAYS = 30 # How many days of history to keep +RETAIN_DAYS = 30 # How many days of history to keep diff --git a/database.py b/database.py index 578930b..7e1f264 100644 --- a/database.py +++ b/database.py @@ -1,26 +1,33 @@ -from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text +from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey, Text, Enum from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker, relationship import datetime import config import os +import enum engine = create_engine(f"sqlite:///{config.DB_PATH}", echo=False) SessionLocal = sessionmaker(bind=engine) Base = declarative_base() +class ContentType(enum.Enum): + PODCAST = "podcast" + YOUTUBE = "youtube" + class Show(Base): __tablename__ = "shows" id = Column(Integer, primary_key=True, index=True) feed_url = Column(String, unique=True, index=True) title = Column(String) + content_type = Column(Enum(ContentType), default=ContentType.PODCAST) + channel_id = Column(String, index=True) # For YouTube channels created_at = Column(DateTime, default=datetime.datetime.utcnow) # Relationship to episodes - episodes = relationship("PodcastEpisode", back_populates="show", cascade="all, delete-orphan") + episodes = relationship("Episode", back_populates="show", cascade="all, delete-orphan") -class PodcastEpisode(Base): +class Episode(Base): __tablename__ = "episodes" id = Column(Integer, primary_key=True, index=True) @@ -36,6 +43,9 @@ class PodcastEpisode(Base): created_at = Column(DateTime, default=datetime.datetime.utcnow) file_size = Column(Integer) # Size in bytes duration = Column(Integer) # Duration in seconds + video_id = Column(String, index=True) # For YouTube videos + thumbnail_url = Column(String) # For YouTube videos + original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts) # Relationship to show show = 
relationship("Show", back_populates="episodes") @@ -50,10 +60,12 @@ class EpisodeContent(Base): size_formatted = Column(String) summary = Column(Text) audio_url = Column(String) + thumbnail_url = Column(String) # For YouTube video thumbnails + original_url = Column(String) # Original URL (video URL for YouTube, audio URL for podcasts) last_updated = Column(DateTime, default=datetime.datetime.utcnow) # Relationship to parent episode - episode = relationship("PodcastEpisode", backref="content") + episode = relationship("Episode", backref="content") def init_db(): """Initialize the database, creating tables if they don't exist.""" @@ -72,8 +84,8 @@ def cleanup_old_episodes(days=None): cutoff_date = datetime.datetime.utcnow() - datetime.timedelta(days=days) old_episodes = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.created_at < cutoff_date) + session.query(Episode) + .filter(Episode.created_at < cutoff_date) .all() ) @@ -132,6 +144,13 @@ def update_episode_content(session, episode): content.audio_url = f"/audio/{os.path.basename(episode.audio_path)}" else: content.audio_url = None + + # Add YouTube-specific information if applicable + if episode.show.content_type == ContentType.YOUTUBE: + if episode.video_id: + content.original_url = f"https://www.youtube.com/watch?v={episode.video_id}" + if episode.thumbnail_url: + content.thumbnail_url = episode.thumbnail_url content.last_updated = datetime.datetime.utcnow() session.commit() \ No newline at end of file diff --git a/feed_monitor.py b/feed_monitor.py index 4e39369..ffb3253 100644 --- a/feed_monitor.py +++ b/feed_monitor.py @@ -2,7 +2,7 @@ import requests import os import datetime -from database import PodcastEpisode, get_db_session, Show +from database import Episode, get_db_session, Show import config import logging from urllib.parse import urlparse @@ -64,7 +64,7 @@ def check_feeds(): for entry in sorted_entries[:config.MAX_EPISODES_PER_FEED]: # Skip if episode already exists existing = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(show_id=show.id, episode_title=entry.title) .first() ) @@ -75,7 +75,7 @@ def check_feeds(): if hasattr(entry, "published_parsed"): pub_date = datetime.datetime(*entry.published_parsed[:6]) - new_episode = PodcastEpisode( + new_episode = Episode( show_id=show.id, episode_title=entry.title, pub_date=pub_date @@ -94,7 +94,7 @@ def download_new_episodes(): """Download audio files for episodes that haven't been downloaded yet.""" session = get_db_session() episodes_to_download = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(downloaded=False) .all() ) diff --git a/main.py b/main.py index 8ecbaf0..5755ec5 100644 --- a/main.py +++ b/main.py @@ -1,26 +1,21 @@ +import asyncio +import schedule +import time import os -import json -import logging -from datetime import datetime, timedelta -from dotenv import load_dotenv -from apscheduler.schedulers.background import BackgroundScheduler -from database import init_db, cleanup_old_episodes from feed_monitor import check_feeds, download_new_episodes -from transcriber import ( - TranscriptionService, - LocalWhisperTranscriber, - OpenAIWhisperTranscriber -) +from youtube_handler import check_youtube_feeds, download_youtube_videos +from transcriber import TranscriptionService, get_transcriber from summarizer import summarize_episodes +from database import init_db, cleanup_old_episodes +import logging import config import openlit +from dotenv import load_dotenv +from datetime import datetime, timedelta load_dotenv() 
-logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Initialize OpenLIT if OTLP endpoint is configured @@ -31,119 +26,98 @@ else: logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT not set, OpenLIT initialization skipped") -def get_transcriber(): - """Initialize and return the appropriate transcriber based on configuration.""" - if config.TRANSCRIPTION_MODE == "openai": - return OpenAIWhisperTranscriber() - else: # default to local - return LocalWhisperTranscriber(model_path=config.WHISPER_MODEL) - -def generate_daily_feed(): - """Generate a JSON feed of recent episodes with transcripts and summaries.""" - from database import get_db_session, PodcastEpisode - - session = get_db_session() - yesterday = datetime.utcnow() - timedelta(days=1) - - recent_episodes = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.created_at >= yesterday) - .all() - ) - - feed_entries = [] - for ep in recent_episodes: - entry = { - "podcast_title": ep.show.title, - "episode_title": ep.episode_title, - "publication_date": ep.pub_date.isoformat() if ep.pub_date else None, - "duration_seconds": ep.duration, - "file_size_bytes": ep.file_size, - "audio_path": ep.audio_path, - "transcript_path": ep.transcript_path if ep.transcribed else None, - "summary_path": ep.summary_path if ep.summarized else None - } - feed_entries.append(entry) - - feed_file = os.path.join(config.AUDIO_STORAGE_PATH, "daily_feed.json") - os.makedirs(os.path.dirname(feed_file), exist_ok=True) - - with open(feed_file, "w", encoding="utf-8") as f: - json.dump( - { - "generated_at": datetime.utcnow().isoformat(), - "episodes": feed_entries - }, - f, - indent=2 - ) - - logger.info(f"Generated daily feed with {len(feed_entries)} episodes") - session.close() +def setup_directories(): + """Create necessary directories if they don't exist.""" + os.makedirs(config.AUDIO_STORAGE_PATH, exist_ok=True) + os.makedirs(config.TRANSCRIPT_STORAGE_PATH, exist_ok=True) + logger.info("Storage directories initialized") -def process_episodes(): - """Main processing function that runs all steps in sequence.""" +async def run_feed_checks(): + """Run feed checks for both podcasts and YouTube channels.""" try: - logger.info("Starting episode processing...") - - # Check feeds for new episodes check_feeds() - - # Download new episodes + check_youtube_feeds() + except Exception as e: + logger.error(f"Error in feed checks: {str(e)}") + +async def run_downloads(): + """Run downloads for both podcasts and YouTube videos.""" + try: download_new_episodes() - - # Generate transcripts + download_youtube_videos() + except Exception as e: + logger.error(f"Error in downloads: {str(e)}") + +async def run_transcriptions(): + """Process transcription queue.""" + try: transcriber = get_transcriber() - transcription_service = TranscriptionService(transcriber) - transcription_service.transcribe_episodes() - - # Generate summaries (if Ollama is configured) + service = TranscriptionService(transcriber) + service.transcribe_episodes() + except Exception as e: + logger.error(f"Error in transcriptions: {str(e)}") + +async def run_summaries(): + """Process summary queue.""" + try: summarize_episodes() - - # Generate daily feed - generate_daily_feed() - - # Cleanup old episodes + except Exception as e: + logger.error(f"Error in summaries: {str(e)}") + +async def cleanup(): + """Run cleanup tasks.""" + try: cleanup_old_episodes() - - logger.info("Episode 
processing complete") - except Exception as e: - logger.error(f"Error in process_episodes: {e}") + logger.error(f"Error in cleanup: {str(e)}") -def setup_directories(): - """Create necessary directories if they don't exist.""" - os.makedirs(config.AUDIO_STORAGE_PATH, exist_ok=True) - os.makedirs(config.TRANSCRIPT_STORAGE_PATH, exist_ok=True) +def log_next_run(job_name: str, minutes: int): + """Log when a job will next run based on its schedule.""" + now = datetime.now() + next_run = now + timedelta(minutes=minutes) + logger.info(f"Next {job_name} scheduled for: {next_run.strftime('%Y-%m-%d %H:%M:%S')}") -def main(): - """Main entry point.""" +async def main(): + """Main application loop.""" # Create directories setup_directories() # Initialize database init_db() + logger.info("Database initialized") - # Set up scheduler - scheduler = BackgroundScheduler() + # Initialize transcription service + transcriber = get_transcriber() + transcription_service = TranscriptionService(transcriber) - # Schedule regular processing - scheduler.add_job( - process_episodes, - 'interval', - minutes=config.CHECK_INTERVAL_MINUTES, - next_run_time=datetime.now() - ) + # Schedule all jobs + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(check_feeds) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(check_youtube_feeds) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(download_youtube_videos) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(lambda: transcription_service.transcribe_episodes()) + schedule.every(config.CHECK_INTERVAL_MINUTES).minutes.do(summarize_episodes) - # Start the scheduler - scheduler.start() + # Run all jobs immediately on startup + logger.info("Starting initial run of all jobs...") + check_feeds() + check_youtube_feeds() + download_youtube_videos() + transcription_service.transcribe_episodes() + summarize_episodes() - try: - # Keep the main thread alive - while True: - pass - except (KeyboardInterrupt, SystemExit): - scheduler.shutdown() + logger.info("Initial run complete. 
Starting scheduled execution...") + + while True: + schedule.run_pending() + + # Log next run times for all jobs + log_next_run("feed check", config.CHECK_INTERVAL_MINUTES) + log_next_run("YouTube feed check", config.CHECK_INTERVAL_MINUTES) + log_next_run("YouTube download", config.CHECK_INTERVAL_MINUTES) + log_next_run("transcription", config.CHECK_INTERVAL_MINUTES) + log_next_run("summarization", config.CHECK_INTERVAL_MINUTES) + + await asyncio.sleep(60) # Sleep for 1 minute if __name__ == "__main__": - main() \ No newline at end of file + asyncio.run(main()) \ No newline at end of file diff --git a/progress_handler.py b/progress_handler.py index 7a2d3d4..e06f33d 100644 --- a/progress_handler.py +++ b/progress_handler.py @@ -2,6 +2,9 @@ import threading from typing import Union import tqdm +import logging + +logger = logging.getLogger(__name__) class ProgressListener: def on_progress(self, current: Union[int, float], total: Union[int, float]): @@ -69,23 +72,20 @@ def create_progress_listener_handle(progress_listener: ProgressListener): class DownloadProgressBar: def __init__(self, episode_title): - self.pbar = None self.episode_title = episode_title - - def __call__(self, block_num, block_size, total_size): - if not self.pbar: - self.pbar = tqdm.tqdm( - total=total_size, - desc=f"Downloading {self.episode_title}", - unit='iB', - unit_scale=True, - unit_divisor=1024, - ) + self.started = False - downloaded = block_num * block_size - if downloaded <= total_size: - self.pbar.update(block_size) + def yt_dlp_hook(self, d): + if d['status'] == 'downloading': + if not self.started: + self.started = True + logger.info(f"Starting download of: {self.episode_title}") + + if 'total_bytes' in d and 'downloaded_bytes' in d: + percentage = (d['downloaded_bytes'] / d['total_bytes']) * 100 + logger.info(f"Download progress for {self.episode_title}: {percentage:.1f}%") + elif d['status'] == 'finished': + logger.info(f"Download completed for: {self.episode_title}") def close(self): - if self.pbar: - self.pbar.close() \ No newline at end of file + pass \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 91a3b7e..fc1996f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,6 @@ openlit ollama markdown2 tiktoken<0.7.0,>=0.6.0 -langchain-text-splitters>=0.0.1 \ No newline at end of file +langchain-text-splitters>=0.0.1 +yt-dlp>=2023.12.30 +ffmpeg-python>=0.2.0 \ No newline at end of file diff --git a/scripts/reset_summaries.py b/scripts/reset_summaries.py index f1736e0..253033b 100644 --- a/scripts/reset_summaries.py +++ b/scripts/reset_summaries.py @@ -13,7 +13,7 @@ # Add parent directory to Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from database import get_db_session, PodcastEpisode, EpisodeContent +from database import get_db_session, Episode, EpisodeContent import logging logging.basicConfig(level=logging.INFO) @@ -24,7 +24,7 @@ def reset_summaries(): session = get_db_session() try: # Get all episodes that have been summarized - episodes = session.query(PodcastEpisode).filter_by(summarized=True).all() + episodes = session.query(Episode).filter_by(summarized=True).all() for ep in episodes: # Delete the summary file if it exists diff --git a/summarizer.py b/summarizer.py index 14fb758..299dd1c 100644 --- a/summarizer.py +++ b/summarizer.py @@ -1,7 +1,7 @@ import os import logging from ollama import Client -from database import PodcastEpisode, get_db_session, update_episode_content +from database import Episode, 
get_db_session, update_episode_content import config import openai from abc import ABC, abstractmethod @@ -267,7 +267,7 @@ def summarize_episodes(): """Find all transcribed but not summarized episodes and generate summaries.""" session = get_db_session() episodes = ( - session.query(PodcastEpisode) + session.query(Episode) .filter_by(transcribed=True, summarized=False) .all() ) @@ -357,7 +357,7 @@ def summarize_episodes(): def get_summary(episode_id): """Retrieve summary for a specific episode.""" session = get_db_session() - episode = session.query(PodcastEpisode).filter_by(id=episode_id).first() + episode = session.query(Episode).filter_by(id=episode_id).first() if not episode or not episode.summary_path: return None diff --git a/templates/index.html b/templates/index.html index ae966af..9a1a990 100644 --- a/templates/index.html +++ b/templates/index.html @@ -362,6 +362,43 @@ font-size: 0.875rem; text-align: center; } + + .content-type { + margin-bottom: 0.5rem; + } + + .badge { + display: inline-block; + padding: 0.25rem 0.75rem; + border-radius: 9999px; + font-size: 0.75rem; + font-weight: 500; + text-transform: uppercase; + letter-spacing: 0.05em; + } + + .badge.youtube { + background-color: #ff0000; + color: white; + } + + .badge.podcast { + background-color: #8b5cf6; + color: white; + } + + .thumbnail { + margin: 1rem 0; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + } + + .thumbnail img { + width: 100%; + height: auto; + display: block; + } @@ -414,11 +451,24 @@

PhaseFeed

+                    <div class="content-type">
+                        {% if episode.content_type == 'YOUTUBE' %}
+                        <span class="badge youtube">YouTube</span>
+                        {% else %}
+                        <span class="badge podcast">Podcast</span>
+                        {% endif %}
+                    </div>

{{ episode.podcast_title }}

{{ episode.episode_title }}

+                    {% if episode.content_type == 'YOUTUBE' and episode.thumbnail_url %}
+                    <div class="thumbnail">
+                        <img src="{{ episode.thumbnail_url }}" alt="Video thumbnail">
+                    </div>
+                    {% endif %}
{{ episode.formatted_date }} {{ episode.duration_formatted }} diff --git a/transcriber.py b/transcriber.py index 4d3529f..853e29a 100644 --- a/transcriber.py +++ b/transcriber.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from typing import Optional import openai -from database import PodcastEpisode, get_db_session +from database import Episode, Show, get_db_session import config from tqdm import tqdm from progress_handler import ProgressListener, create_progress_listener_handle @@ -197,28 +197,40 @@ def transcribe_episodes(self): """Find all downloaded but not transcribed episodes and generate transcripts.""" session = get_db_session() episodes = ( - session.query(PodcastEpisode) - .filter_by(downloaded=True, transcribed=False) + session.query(Episode) + .join(Show) + .filter( + Episode.downloaded == True, + Episode.transcribed == False + ) .all() ) - - for ep in tqdm(episodes, desc="Processing episodes", unit="episode"): - if not ep.audio_path or not os.path.exists(ep.audio_path): - logger.error(f"Audio file not found for {ep.episode_title}") - continue - + + if not episodes: + logger.info("No new episodes to transcribe") + return + + logger.info(f"Found {len(episodes)} episodes to transcribe") + for ep in episodes: try: - logger.info(f"Starting transcription of {ep.episode_title}...") - - # Ensure transcript directory exists - self.ensure_transcript_dir() + logger.info(f"Processing episode: {ep.episode_title}") - # Generate transcript with progress tracking - progress_listener = TranscriptionProgressListener(ep.episode_title) - transcript = self.transcriber.transcribe_audio(ep.audio_path, progress_listener) - - # Format transcript with metadata - transcript_text = f"""Title: {ep.episode_title} + if not ep.audio_path or not os.path.exists(ep.audio_path): + logger.error(f"Audio file not found for {ep.episode_title}") + continue + + try: + logger.info(f"Starting transcription of {ep.episode_title}...") + + # Ensure transcript directory exists + self.ensure_transcript_dir() + + # Generate transcript with progress tracking + progress_listener = TranscriptionProgressListener(ep.episode_title) + transcript = self.transcriber.transcribe_audio(ep.audio_path, progress_listener) + + # Format transcript with metadata + transcript_text = f"""Title: {ep.episode_title} Podcast: {ep.show.title} Date: {ep.pub_date} Duration: {ep.duration} seconds @@ -226,26 +238,29 @@ def transcribe_episodes(self): Transcript: {transcript} """ - - # Save transcript - safe_filename = "".join([c for c in ep.episode_title if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() - transcript_path = os.path.join( - config.TRANSCRIPT_STORAGE_PATH, - f"{ep.show.title}_{safe_filename}.txt" - ) - - with open(transcript_path, "w", encoding="utf-8") as f: - f.write(transcript_text) - - # Update database - ep.transcript_path = transcript_path - ep.transcribed = True - session.commit() - - logger.info(f"Successfully transcribed: {ep.episode_title}") - + + # Save transcript + safe_filename = "".join([c for c in ep.episode_title if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() + transcript_path = os.path.join( + config.TRANSCRIPT_STORAGE_PATH, + f"{ep.show.title}_{safe_filename}.txt" + ) + + with open(transcript_path, "w", encoding="utf-8") as f: + f.write(transcript_text) + + # Update database + ep.transcript_path = transcript_path + ep.transcribed = True + session.commit() + + logger.info(f"Successfully transcribed: {ep.episode_title}") + + except Exception as e: + logger.error(f"Failed to transcribe 
{ep.episode_title}: {e}") + continue except Exception as e: - logger.error(f"Failed to transcribe {ep.episode_title}: {e}") + logger.error(f"Failed to process episode {ep.episode_title}: {e}") continue session.close() @@ -253,7 +268,7 @@ def transcribe_episodes(self): def get_transcript(self, episode_id): """Retrieve transcript for a specific episode.""" session = get_db_session() - episode = session.query(PodcastEpisode).filter_by(id=episode_id).first() + episode = session.query(Episode).filter_by(id=episode_id).first() if not episode or not episode.transcript_path: return None @@ -267,6 +282,13 @@ def get_transcript(self, episode_id): finally: session.close() +def get_transcriber() -> BaseTranscriber: + """Initialize and return the appropriate transcriber based on configuration.""" + if config.TRANSCRIPTION_MODE == "openai": + return OpenAIWhisperTranscriber() + else: # default to local + return LocalWhisperTranscriber(model_path=config.WHISPER_MODEL) + # Context manager for null progress listener from contextlib import contextmanager diff --git a/web_server.py b/web_server.py index 850e8b3..f2f8ec8 100644 --- a/web_server.py +++ b/web_server.py @@ -6,7 +6,7 @@ import os import config from datetime import datetime -from database import get_db_session, EpisodeContent, PodcastEpisode +from database import get_db_session, EpisodeContent, Episode from urllib.parse import unquote import markdown2 import logging @@ -38,7 +38,7 @@ def get_episodes(): query = ( session.query(EpisodeContent) .join(EpisodeContent.episode) - .order_by(PodcastEpisode.pub_date.desc()) + .order_by(Episode.pub_date.desc()) ) logger.debug(f"Executing query: {query}") @@ -59,7 +59,9 @@ def get_episodes(): 'formatted_date': content.formatted_date, 'duration_formatted': content.duration_formatted, 'size_formatted': content.size_formatted, - 'summary': summary_html + 'summary': summary_html, + 'content_type': content.episode.show.content_type.name, + 'thumbnail_url': content.episode.thumbnail_url } episodes_data.append(episode_data) except Exception as e: @@ -105,8 +107,8 @@ async def get_audio(episode_id: int): session = get_db_session() try: episode = ( - session.query(PodcastEpisode) - .filter(PodcastEpisode.id == episode_id) + session.query(Episode) + .filter(Episode.id == episode_id) .first() ) diff --git a/youtube_handler.py b/youtube_handler.py new file mode 100644 index 0000000..c102c1c --- /dev/null +++ b/youtube_handler.py @@ -0,0 +1,304 @@ +import feedparser +import yt_dlp +import os +import datetime +import requests +import re +from database import Show, Episode, ContentType, get_db_session +import config +import logging +from urllib.parse import urlparse, parse_qs +import mimetypes +from progress_handler import DownloadProgressBar +from bs4 import BeautifulSoup + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def sanitize_filename(filename): + """Create a safe filename from potentially unsafe string.""" + if not filename: + return "" + return "".join([c for c in filename if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip() + +def extract_channel_id(url: str) -> str | None: + """Extract channel ID from various YouTube URL formats.""" + parsed = urlparse(url) + if parsed.hostname not in ('www.youtube.com', 'youtube.com'): + return None + + # Direct channel ID format: /channel/UCxxxxxx + if parsed.path.startswith('/channel/'): + return parsed.path.split('/')[2] + + # Handle /@username and /c/ formats by fetching the page + if parsed.path.startswith('/@') or 
parsed.path.startswith('/c/'): + try: + response = requests.get(url, timeout=config.YOUTUBE_TIMEOUT) + response.raise_for_status() + + # Extract channel ID from meta tags or canonical URL + soup = BeautifulSoup(response.text, 'html.parser') + + # Try meta tag first + meta_tag = soup.find('meta', {'itemprop': 'channelId'}) + if meta_tag and meta_tag.get('content'): + return meta_tag['content'] + + # Try canonical URL + canonical = soup.find('link', {'rel': 'canonical'}) + if canonical and 'href' in canonical.attrs: + channel_match = re.search(r'channel/([^/]+)', canonical['href']) + if channel_match: + return channel_match.group(1) + + except Exception as e: + logger.error(f"Error extracting channel ID from {url}: {str(e)}") + + return None + +def get_feed_url(channel_id: str) -> str: + """Get RSS feed URL for a YouTube channel.""" + return f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}" + +def check_youtube_feeds(): + """Check YouTube channel feeds for new videos.""" + session = get_db_session() + + try: + # First, ensure all channels have proper channel IDs + for url in config.YOUTUBE_CHANNELS: + existing = session.query(Show).filter_by(feed_url=url).first() + if not existing: + channel_id = extract_channel_id(url) + if channel_id: + # Use yt-dlp to get channel info + ydl_opts = { + 'quiet': True, + 'no_warnings': True, + 'extract_flat': True, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + channel_info = ydl.extract_info(url, download=False) + channel_title = channel_info.get('channel') or channel_info.get('uploader') + + show = Show( + feed_url=url, + channel_id=channel_id, + content_type=ContentType.YOUTUBE, + title=channel_title or f"channel_{channel_id}" # Fallback if title not found + ) + session.add(show) + logger.info(f"Added new YouTube channel: {channel_title or url}") + else: + logger.error(f"Could not extract channel ID from: {url}") + + session.commit() + + # Now check feeds for all channels + youtube_shows = session.query(Show).filter_by(content_type=ContentType.YOUTUBE).all() + + for show in youtube_shows: + try: + feed_url = get_feed_url(show.channel_id) + logger.info(f"Checking YouTube feed: {feed_url}") + + feed = feedparser.parse(feed_url) + if feed.bozo: + logger.error(f"Error parsing feed: {feed_url} - {feed.bozo_exception}") + continue + + for entry in feed.entries[:config.MAX_EPISODES_PER_FEED]: + video_id = entry.yt_videoid + + # Check if we already have this video + existing = session.query(Episode).filter_by( + show_id=show.id, + video_id=video_id + ).first() + + if not existing: + # Create new episode + episode = Episode( + show_id=show.id, + episode_title=entry.title, + pub_date=datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%S%z"), + video_id=video_id, + thumbnail_url=entry.media_thumbnail[0]['url'] if entry.get('media_thumbnail') else None, + original_url=entry.link + ) + session.add(episode) + logger.info(f"Added new video: {entry.title}") + + session.commit() + + except Exception as e: + logger.error(f"Error processing YouTube feed for {show.feed_url}: {str(e)}") + session.rollback() + continue + + except Exception as e: + logger.error(f"Error in check_youtube_feeds: {str(e)}") + session.rollback() + finally: + session.close() + +def download_youtube_videos(): + """Download new YouTube videos and extract audio.""" + session = get_db_session() + + try: + new_episodes = ( + session.query(Episode) + .join(Show) + .filter( + Show.content_type == ContentType.YOUTUBE, + Episode.downloaded == False + ) + .all() + ) + + if not 
new_episodes: + logger.info("No new YouTube episodes to download") + return + + for episode in new_episodes: + # Create channel-specific directory + channel_dir = os.path.join(config.AUDIO_STORAGE_PATH, sanitize_filename(episode.show.title or f"channel_{episode.show.id}")) + os.makedirs(channel_dir, exist_ok=True) + + # Setup output path in channel directory + output_path = os.path.join( + channel_dir, + f"{sanitize_filename(episode.episode_title)}_{episode.video_id}" # Removed .mp3 extension + ) + + success = False + error_msg = None + + try: + # Setup progress tracking + progress_bar = DownloadProgressBar(episode.episode_title) + + # Setup yt-dlp options + ydl_opts = { + # Format selection + 'format': 'bestaudio/best', + 'format_sort': ['abr', 'asr', 'res', 'br'], # Prefer better audio quality + + # Audio extraction + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': config.YOUTUBE_AUDIO_QUALITY, + }], + + # Output settings + 'outtmpl': output_path, + 'writethumbnail': False, + + # Download settings + 'progress_hooks': [progress_bar.yt_dlp_hook], + 'retries': config.YOUTUBE_MAX_RETRIES, + 'fragment_retries': config.YOUTUBE_MAX_RETRIES, + 'socket_timeout': config.YOUTUBE_TIMEOUT, + 'extractor_retries': config.YOUTUBE_MAX_RETRIES, + + # Network settings + 'socket_timeout': config.YOUTUBE_TIMEOUT, + 'nocheckcertificate': False, + + # Error handling + 'ignoreerrors': False, + 'no_warnings': False, + 'verbose': False, + + # Geo-restriction handling + 'geo_bypass': True, + 'geo_bypass_country': 'US', + + # System settings + 'quiet': False, + 'no_color': False, + + # Sponsorblock settings (optional) + # 'sponsorblock_remove': ['sponsor', 'intro', 'outro', 'selfpromo'], + + # Age-gate bypass + 'cookiesfrombrowser': None, # Can be set to ('chrome', 'firefox', etc) + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + logger.info(f"Downloading video: {episode.episode_title}") + ydl.download([episode.original_url]) + success = True + + except yt_dlp.utils.DownloadError as e: + error_msg = str(e) + if "Video unavailable" in error_msg: + logger.error(f"Video {episode.episode_title} is no longer available") + elif "Sign in to confirm your age" in error_msg: + logger.error(f"Video {episode.episode_title} is age restricted. 
Consider setting cookiesfrombrowser") + elif "The uploader has not made this video available in your country" in error_msg: + logger.error(f"Video {episode.episode_title} is geo-restricted") + elif "This video is only available to users with special access" in error_msg: + logger.error(f"Video {episode.episode_title} requires special access (members only, etc)") + else: + logger.error(f"Error downloading video {episode.episode_title}: {error_msg}") + # Cleanup partial download if it exists + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + except Exception as e: + error_msg = str(e) + logger.error(f"Unexpected error downloading video {episode.episode_title}: {error_msg}") + # Cleanup partial download if it exists + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + if success: + try: + # Get final output path with .mp3 extension + final_output_path = f"{output_path}.mp3" + + # Update episode record + episode.audio_path = final_output_path + episode.downloaded = True + episode.file_size = os.path.getsize(final_output_path) + + # Get duration using existing function from feed_monitor + from feed_monitor import get_audio_duration + episode.duration = get_audio_duration(final_output_path) + + # Log the values before committing + logger.info(f"Updating database for {episode.episode_title}:") + logger.info(f" - audio_path: {final_output_path}") + logger.info(f" - downloaded: True") + logger.info(f" - file_size: {episode.file_size} bytes") + logger.info(f" - duration: {episode.duration} seconds") + + session.commit() + logger.info(f"Successfully downloaded and processed: {episode.episode_title}") + + except Exception as e: + logger.error(f"Error updating episode record for {episode.episode_title}: {str(e)}") + session.rollback() + # Cleanup downloaded file if we couldn't update the database + if os.path.exists(final_output_path): + try: + os.remove(final_output_path) + logger.info(f"Cleaned up failed download: {final_output_path}") + except OSError as ose: + logger.error(f"Failed to clean up file {final_output_path}: {str(ose)}") + + except Exception as e: + logger.error(f"Error in download_youtube_videos: {str(e)}") + session.rollback() + finally: + session.close() \ No newline at end of file From 19d89b9e0bd8cc0b80b59eed5acca68a8a3610ac Mon Sep 17 00:00:00 2001 From: Jeremy Waller Date: Sun, 19 Jan 2025 18:08:56 -0600 Subject: [PATCH 3/3] Update readme --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6d21ccf..1e620a6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # PhaseFeed -A local podcast monitoring and transcription system that: -- Monitors RSS feeds for new podcast episodes -- Downloads new episodes automatically +A local podcast and YouTube monitoring system that: +- Monitors RSS feeds for new podcast episodes and YouTube channels for new videos +- Downloads new episodes and YouTube videos automatically - Transcribes audio using either OpenAI Whisper API or local mlx-whisper - Summarizes content using either OpenAI GPT-4 or local LLMs via Ollama - Stores metadata in SQLite @@ -49,12 +49,18 @@ cp .env.example .env The application can be configured through `config.py`. 
Key settings include: ### Storage Configuration -- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts (default: `~/Podcasts`) +- `AUDIO_STORAGE_PATH`: Where to store downloaded podcasts and YouTube audio (default: `~/Podcasts`) - `TRANSCRIPT_STORAGE_PATH`: Where to store transcripts (default: `~/Podcasts/Transcripts`) ### Feed Configuration - `PODCAST_FEEDS`: List of RSS feed URLs to monitor -- `MAX_EPISODES_PER_FEED`: Maximum number of episodes to pull from each feed (default: 5) +- `YOUTUBE_CHANNELS`: List of YouTube channel URLs to monitor +- `MAX_EPISODES_PER_FEED`: Maximum number of episodes/videos to pull from each feed (default: 5) + +### YouTube Configuration +- `YOUTUBE_AUDIO_QUALITY`: Audio quality for downloaded videos in kbps (default: 192) +- `YOUTUBE_MAX_RETRIES`: Number of retry attempts for failed downloads (default: 3) +- `YOUTUBE_TIMEOUT`: Download timeout in seconds (default: 300) ### Transcription Configuration - `TRANSCRIPTION_MODE`: Choose between "local" or "openai"