From 282734ba38f77e02c54b3843a57e7e557cc63164 Mon Sep 17 00:00:00 2001 From: Ashutoshx7 Date: Mon, 4 May 2026 20:38:57 +0530 Subject: [PATCH 01/28] feat: implement intelligent CC suggestion pipeline (DMP 2026 PlanetRead) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Goals 1, 2 & 3 — full end-to-end pipeline: Goal 1 — Sound Event Detection: - YAMNet-based audio classification (521 AudioSet classes) - WebRTC VAD / energy-based speech filtering - Consecutive event merging with peak confidence Goal 2 — Speaker Reaction Detection: - Temporal reaction windows (300ms-1500ms after sound onset) - Scene cut detection (histogram comparison) to prevent false positives - MediaPipe PoseLandmarker (flinch/head turn via shoulder/ear/nose landmarks) - MediaPipe FaceLandmarker (surprise via mouth openness) - Multi-person scoring (max reaction across all detected people) Goal 3 — CC Decision Engine + SRT Output: - Category-aware fusion weights (high_impact, interactive, social, ambient) - Speech-pause bonus for interrupted dialogue - Scene-cut fallback to audio-only scoring - Standard SRT output with human-readable summary - IoU-based evaluation framework (P/R/F1/overcaption rate) 19/19 tests passing. Full pipeline tested end-to-end. 
--- .gitignore | 37 ++++ README.md | 139 ++++++++++++++ config/default.yaml | 35 ++++ config/sound_categories.yaml | 70 +++++++ eval/evaluator.py | 104 +++++++++++ main.py | 168 +++++++++++++++++ requirements.txt | 8 + samples/test_clip_cc.srt | 20 ++ samples/test_clip_cc_summary.txt | 27 +++ setup.sh | 44 +++++ src/__init__.py | 0 src/audio/__init__.py | 0 src/audio/extractor.py | 106 +++++++++++ src/audio/speech_filter.py | 187 +++++++++++++++++++ src/audio/yamnet_detector.py | 125 +++++++++++++ src/config_loader.py | 42 +++++ src/fusion/__init__.py | 0 src/fusion/category_mapper.py | 71 +++++++ src/fusion/decision_engine.py | 116 ++++++++++++ src/output/__init__.py | 0 src/output/label_mapper.py | 75 ++++++++ src/output/srt_writer.py | 104 +++++++++++ src/pipeline.py | 196 ++++++++++++++++++++ src/visual/__init__.py | 0 src/visual/face_analyzer.py | 111 +++++++++++ src/visual/frame_extractor.py | 96 ++++++++++ src/visual/pose_analyzer.py | 142 ++++++++++++++ src/visual/scene_cut.py | 94 ++++++++++ tests/generate_test_data.py | 153 ++++++++++++++++ tests/test_all.py | 306 +++++++++++++++++++++++++++++++ 30 files changed, 2576 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config/default.yaml create mode 100644 config/sound_categories.yaml create mode 100644 eval/evaluator.py create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 samples/test_clip_cc.srt create mode 100644 samples/test_clip_cc_summary.txt create mode 100755 setup.sh create mode 100644 src/__init__.py create mode 100644 src/audio/__init__.py create mode 100644 src/audio/extractor.py create mode 100644 src/audio/speech_filter.py create mode 100644 src/audio/yamnet_detector.py create mode 100644 src/config_loader.py create mode 100644 src/fusion/__init__.py create mode 100644 src/fusion/category_mapper.py create mode 100644 src/fusion/decision_engine.py create mode 100644 src/output/__init__.py create mode 100644 
src/output/label_mapper.py create mode 100644 src/output/srt_writer.py create mode 100644 src/pipeline.py create mode 100644 src/visual/__init__.py create mode 100644 src/visual/face_analyzer.py create mode 100644 src/visual/frame_extractor.py create mode 100644 src/visual/pose_analyzer.py create mode 100644 src/visual/scene_cut.py create mode 100644 tests/generate_test_data.py create mode 100644 tests/test_all.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..460540b --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +.venv/ +venv/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Test/build artifacts +.pytest_cache/ +*.wav +*.avi +*.mp4 +*.mkv + +# Models (large files — download via setup script) +models/*.task + +# Temp +get-pip.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..0f66f8b --- /dev/null +++ b/README.md @@ -0,0 +1,139 @@ +# Intelligent CC Suggestion Tool + +> **DMP 2026 · PlanetRead · C4GT** + +An AI-powered tool that intelligently identifies moments in a video where a Closed Caption (CC) annotation is genuinely necessary — such as when a non-speech audio event meaningfully affects the speakers or the scene — and suggests contextually relevant CC text, without over-captioning routine or low-impact sounds. + +## Architecture + +``` +Video → Audio Extraction → YAMNet Detection → Speech Filtering + → Scene Cut Detection → Reaction Window Frame Extraction + → Pose Analysis (flinch, head turn) + Face Analysis (surprise) + → Category-Aware Fusion Engine → SRT Output +``` + +### Key Innovations + +1. **Temporal Reaction Windows** — Extracts frames 300ms–1500ms *after* the sound onset (when reactions actually happen), not at the midpoint of the sound event +2. **Category-Aware Fusion** — Different sound types use different weights (explosions don't need visual confirmation; doorbells do) +3. 
**Scene Cut Detection** — Skips visual analysis at edit points to prevent false positive reactions +4. **Overcaption Prevention** — Primary design goal is to filter ambient/insignificant sounds, not just detect everything + +## Quick Start + +```bash +# Install dependencies +pip install -r requirements.txt +sudo apt install ffmpeg + +# Run on a video +python main.py video.mp4 + +# Run with options +python main.py video.mp4 -o captions.srt --verbose + +# Run with custom threshold +python main.py video.mp4 --threshold 0.35 + +# Run evaluation +python main.py video.mp4 --evaluate --ground-truth eval/ground_truth/clip.json +``` + +## Output + +The tool produces: +- `