9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.venv/
__pycache__/
*.py[cod]
.pytest_cache/

outputs/events.*
samples/*.wav
*.tmp.wav
test.mp4
121 changes: 121 additions & 0 deletions README.md
@@ -0,0 +1,121 @@
# Intelligent CC Generation - Module 1 MVP

This demo implements the first module from the PlanetRead Intelligent Closed Caption
Suggestion Tool pipeline:

```text
video input -> audio extraction -> sound event detection -> JSON/CSV output
```

The current MVP does not decide whether a caption should be shown, nor does it check
for speaker presence or on-screen reaction. It only detects candidate non-speech
sound events and reports their timestamps and confidence scores.

## What It Uses

- Python 3.12
- YAMNet from TensorFlow Hub for sound event classification
- `imageio-ffmpeg` for a pip-provided FFmpeg binary
- `soundfile` for loading WAV audio
- built-in `json` and `csv` for outputs

## Setup

```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

No system-wide FFmpeg installation is required; the project uses the FFmpeg binary
bundled with the `imageio-ffmpeg` Python package.

## Run On A Video

```bash
python detect_sound_events.py \
--input samples/sample_video.mp4 \
--json outputs/sample_events.json \
--csv outputs/sample_events.csv
```

The first run may take extra time because TensorFlow Hub downloads and caches the
YAMNet model.
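
YAMNet scores audio in 0.96 s windows hopped every 0.48 s, which is why event
boundaries in the outputs land on multiples of 0.48 s. A minimal sketch of the
frame-index-to-time conversion (the helper name is illustrative, not part of this
project's API):

```python
YAMNET_WINDOW_SECONDS = 0.96  # each score covers a 0.96 s patch of audio
YAMNET_HOP_SECONDS = 0.48     # a new patch starts every 0.48 s


def frame_to_interval(frame_index: int) -> tuple[float, float]:
    """Map a YAMNet score-frame index to its (start, end) time in seconds."""
    start = frame_index * YAMNET_HOP_SECONDS
    return start, start + YAMNET_WINDOW_SECONDS
```

For example, frame 2 covers roughly 0.96–1.92 s, matching the 0.48 s grid visible
in `outputs/test_video_events.json`.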

## Test Video Demo

This PR also includes a short test video:

```text
samples/test_video.mp4
```

Run the detector on it with a slightly higher confidence threshold for cleaner
demo output:

```bash
python detect_sound_events.py \
--input samples/test_video.mp4 \
--json outputs/test_video_events.json \
--csv outputs/test_video_events.csv \
--min-confidence 0.5 \
--block-label Animal,Bird
```

Example detected events from this video include `Explosion`, `Gunshot, gunfire`,
and `Machine gun`.

## Create A Small Sample Video

```bash
python scripts/create_sample_video.py
```

This creates:

```text
samples/sample_audio.wav
samples/sample_video.mp4
```

Then run the detector command above.

## Output Format

JSON output:

```json
[
{
"label": "Busy signal",
"caption_label": "[busy signal]",
"start_time": 0.48,
"end_time": 1.44,
"confidence": 0.8768,
"start_timestamp": "00:00:00.480",
"end_timestamp": "00:00:01.440",
"duration": 0.96
}
]
```

CSV output:

```csv
label,caption_label,start_time,end_time,start_timestamp,end_timestamp,duration,confidence
Busy signal,[busy signal],0.48,1.44,00:00:00.480,00:00:01.440,0.96,0.8768
```
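
The `*_timestamp` columns are the `*_time` seconds rendered as `HH:MM:SS.mmm`. A
sketch of that conversion (assumed to mirror what the tool does internally; the
function name is illustrative):

```python
def to_timestamp(seconds: float) -> str:
    """Format seconds as HH:MM:SS.mmm, e.g. 1.44 -> '00:00:01.440'."""
    total_ms = round(seconds * 1000)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
```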

## Useful Options

- `--min-confidence 0.5` keeps only stronger detections.
- `--top-k 3` keeps more candidate labels per YAMNet frame.
- `--block-label Animal,Bird` suppresses noisy labels for a specific demo clip.
- `--keep-audio outputs/audio.wav` saves the extracted 16 kHz mono WAV.
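
If you prefer to keep raw output and filter afterwards, the confidence and
label-blocking options above can be approximated in post-processing. A hedged
sketch (hypothetical helper, not the tool's internal code; the default threshold
here is illustrative):

```python
def filter_events(
    events: list[dict],
    min_confidence: float = 0.25,  # illustrative default, not the tool's
    blocked_labels: frozenset[str] = frozenset(),
) -> list[dict]:
    """Drop weak detections and any event whose label is explicitly blocked."""
    return [
        e
        for e in events
        if e["confidence"] >= min_confidence and e["label"] not in blocked_labels
    ]
```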

## Current Limitations

- This is only Module 1, so it does not check visual reaction yet.
- YAMNet labels are generic AudioSet labels and may need mapping to cleaner CC text.
- The default threshold is conservative but not tuned on PlanetRead content yet.
- First run requires internet access to download the YAMNet model from TensorFlow Hub.
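
One way to address the label-mapping limitation is a small lookup table with a
bracketed fallback. The entries below are taken from the example outputs in this
PR; the table itself is an illustrative sketch, not the tool's shipped mapping:

```python
CAPTION_MAP = {
    "Gunshot, gunfire": "[gunshot]",
    "Fusillade": "[rapid gunfire]",
    "Machine gun": "[machine gun]",
}


def caption_for(label: str) -> str:
    """Return a CC-friendly caption, falling back to '[<lowercased label>]'."""
    return CAPTION_MAP.get(label, f"[{label.lower()}]")
```

The fallback also reproduces the simpler cases seen in the outputs, such as
`Explosion` becoming `[explosion]`.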
12 changes: 12 additions & 0 deletions detect_sound_events.py
@@ -0,0 +1,12 @@
from __future__ import annotations

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))

from cc_event_detector.cli import main


if __name__ == "__main__":
raise SystemExit(main())
5 changes: 5 additions & 0 deletions outputs/test_video_events.csv
@@ -0,0 +1,5 @@
label,caption_label,start_time,end_time,start_timestamp,end_timestamp,duration,confidence
Explosion,[explosion],0.96,2.4,00:00:00.960,00:00:02.400,1.44,0.6279
Fusillade,[rapid gunfire],1.92,2.88,00:00:01.920,00:00:02.880,0.96,0.7086
"Gunshot, gunfire",[gunshot],9.6,12.48,00:00:09.600,00:00:12.480,2.88,0.9488
Machine gun,[machine gun],12.48,13.44,00:00:12.480,00:00:13.440,0.96,0.8211
42 changes: 42 additions & 0 deletions outputs/test_video_events.json
@@ -0,0 +1,42 @@
[
{
"label": "Explosion",
"caption_label": "[explosion]",
"start_time": 0.96,
"end_time": 2.4,
"confidence": 0.6279,
"start_timestamp": "00:00:00.960",
"end_timestamp": "00:00:02.400",
"duration": 1.44
},
{
"label": "Fusillade",
"caption_label": "[rapid gunfire]",
"start_time": 1.92,
"end_time": 2.88,
"confidence": 0.7086,
"start_timestamp": "00:00:01.920",
"end_timestamp": "00:00:02.880",
"duration": 0.96
},
{
"label": "Gunshot, gunfire",
"caption_label": "[gunshot]",
"start_time": 9.6,
"end_time": 12.48,
"confidence": 0.9488,
"start_timestamp": "00:00:09.600",
"end_timestamp": "00:00:12.480",
"duration": 2.88
},
{
"label": "Machine gun",
"caption_label": "[machine gun]",
"start_time": 12.48,
"end_time": 13.44,
"confidence": 0.8211,
"start_timestamp": "00:00:12.480",
"end_timestamp": "00:00:13.440",
"duration": 0.96
}
]
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
setuptools<81
tensorflow==2.21.0
tensorflow-hub==0.16.1
numpy>=1.26,<3
soundfile>=0.13.1
imageio-ffmpeg>=0.6.0
Binary file added samples/test_video.mp4
Binary file not shown.
79 changes: 79 additions & 0 deletions scripts/create_sample_video.py
@@ -0,0 +1,79 @@
from __future__ import annotations

import math
import subprocess
import wave
from pathlib import Path

import imageio_ffmpeg


SAMPLE_RATE = 16_000
DURATION_SECONDS = 6


def tone(sample_index: int, frequency: float, amplitude: float) -> int:
value = amplitude * math.sin(2 * math.pi * frequency * sample_index / SAMPLE_RATE)
return int(max(-1.0, min(1.0, value)) * 32767)


def make_sample_wav(path: Path) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
total_samples = SAMPLE_RATE * DURATION_SECONDS
frames = bytearray()

for index in range(total_samples):
second = index / SAMPLE_RATE
if 1.0 <= second < 1.8:
sample = tone(index, 880.0, 0.55)
elif 3.2 <= second < 4.1:
sample = tone(index, 440.0, 0.45) + tone(index, 660.0, 0.25)
else:
sample = 0
frames.extend(int(sample).to_bytes(2, byteorder="little", signed=True))

with wave.open(str(path), "wb") as handle:
handle.setnchannels(1)
handle.setsampwidth(2)
handle.setframerate(SAMPLE_RATE)
handle.writeframes(frames)


def make_sample_video(audio_path: Path, video_path: Path) -> None:
video_path.parent.mkdir(parents=True, exist_ok=True)
ffmpeg = imageio_ffmpeg.get_ffmpeg_exe()
command = [
ffmpeg,
"-y",
"-f",
"lavfi",
"-i",
f"color=c=black:s=640x360:d={DURATION_SECONDS}",
"-i",
str(audio_path),
"-shortest",
"-c:v",
"libx264",
"-pix_fmt",
"yuv420p",
"-c:a",
"aac",
str(video_path),
]
completed = subprocess.run(command, capture_output=True, text=True, check=False)
if completed.returncode != 0:
raise RuntimeError(completed.stderr.strip() or "Could not create sample video.")


def main() -> int:
audio_path = Path("samples/sample_audio.wav")
video_path = Path("samples/sample_video.mp4")
make_sample_wav(audio_path)
make_sample_video(audio_path, video_path)
print(f"Wrote {audio_path}")
print(f"Wrote {video_path}")
return 0


if __name__ == "__main__":
raise SystemExit(main())
3 changes: 3 additions & 0 deletions src/cc_event_detector/__init__.py
@@ -0,0 +1,3 @@
"""Module 1 MVP for non-speech sound event detection."""

__version__ = "0.1.0"
7 changes: 7 additions & 0 deletions src/cc_event_detector/__main__.py
@@ -0,0 +1,7 @@
from __future__ import annotations

from .cli import main


if __name__ == "__main__":
raise SystemExit(main())
57 changes: 57 additions & 0 deletions src/cc_event_detector/audio.py
@@ -0,0 +1,57 @@
from __future__ import annotations

import subprocess
from pathlib import Path

import imageio_ffmpeg
import soundfile as sf


SUPPORTED_VIDEO_EXTENSIONS = {".mp4", ".mkv", ".mov", ".avi", ".webm"}
SUPPORTED_AUDIO_EXTENSIONS = {".wav", ".mp3", ".m4a", ".aac", ".flac", ".ogg"}
TARGET_SAMPLE_RATE = 16_000


class AudioExtractionError(RuntimeError):
"""Raised when audio extraction from a media file fails."""


def is_video_file(path: Path) -> bool:
return path.suffix.lower() in SUPPORTED_VIDEO_EXTENSIONS


def is_audio_file(path: Path) -> bool:
return path.suffix.lower() in SUPPORTED_AUDIO_EXTENSIONS


def extract_audio_to_wav(input_path: Path, output_path: Path) -> Path:
"""Extract 16 kHz mono WAV audio using the FFmpeg binary bundled by pip."""
output_path.parent.mkdir(parents=True, exist_ok=True)
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
command = [
ffmpeg_path,
"-y",
"-i",
str(input_path),
"-vn",
"-ac",
"1",
"-ar",
str(TARGET_SAMPLE_RATE),
"-f",
"wav",
str(output_path),
]
completed = subprocess.run(command, capture_output=True, text=True, check=False)
if completed.returncode != 0:
message = completed.stderr.strip() or "FFmpeg failed while extracting audio."
raise AudioExtractionError(message)
return output_path


def load_wav_mono(path: Path) -> tuple[list[float], int]:
"""Load WAV audio as mono float samples."""
audio, sample_rate = sf.read(str(path), dtype="float32")
if getattr(audio, "ndim", 1) > 1:
audio = audio.mean(axis=1)
return audio.tolist(), int(sample_rate)