PlanetRead · anmol457 · May 6, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+.venv/
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+# Generated pipeline outputs and logs; only the samples committed under sls/ are re-included below.
+*.srt
+*.sls
+*.wav
+*.log
+!/sls/*.sls
+models/
+.cache/
diff --git a/README.md b/README.md
@@ -0,0 +1,64 @@
+# Intelligent Closed Caption Suggestion Tool
+
+AI-assisted backend pipeline for finding meaningful non-speech moments in a video and exporting closed-caption suggestions as SRT or SLS. The pipeline combines:
+
+- YAMNet sound event detection for non-speech audio events.
+- OpenCV frame sampling and optical-flow motion analysis.
+- Face-position shift detection using MediaPipe when installed, with an OpenCV Haar-cascade fallback in the default setup.
+- A decision engine that avoids captioning low-impact ambient sounds unless the audio event and scene reaction justify it.
+
+## Python And Dependencies
+
+Use Python `3.10.x`. The project pins `>=3.10,<3.11` because it was developed on Python 3.10 and the pinned TensorFlow `2.10.x` release provides compatible native Windows wheels for that runtime.
+
+The app uses `imageio-ffmpeg` to provide an FFmpeg executable, so you do not need a separate system FFmpeg install for normal CLI use.
+
+```powershell
+py -3.10 -m venv .venv
+.\.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+pip install -e .
+```
+
+YAMNet is loaded from TensorFlow Hub on first use, so the first run needs internet access to download the model cache.
+
+## Usage
+
+```powershell
+intelligent-cc video.mp4 -o output.srt
+```
+
+For structured JSON-style output:
+
+```powershell
+intelligent-cc video.mp4 --format sls -o output.sls
+```
+
+Useful tuning flags:
+
+```powershell
+intelligent-cc video.mp4 --audio-threshold 0.30 --decision-threshold 0.55 --max-events 20
+```
+
+## Pipeline
+
+1. Extract mono 16 kHz audio from the input video with FFmpeg.
+2. Run YAMNet and keep captionable non-speech classes such as honking, glass breaking, alarms, applause, explosions, sirens, laughter, and music.
+3. Merge adjacent detections into timestamped audio events with confidence scores.
+4. Sample video frames around each event timestamp.
+5. Score visible reaction using optical flow and face-center movement (MediaPipe when installed, otherwise the OpenCV Haar-cascade fallback).
+6. Combine audio confidence, visual reaction confidence, and high-impact label rules.
+7. Export accepted suggestions as SRT captions like `[honking]`, or as SLS records when `--format sls` is used.
+
+## Development
+
+```powershell
+pip install -e ".[dev]"
+pytest
+```
+
+## Notes
+
+- The included `video.mp4` can be used for a smoke test after dependencies are installed.
+- The reaction detector is intentionally conservative: routine background sounds are rejected unless paired with visible motion/reaction or a high-impact audio label.
+- For production review workflows, keep rejected events as diagnostic metadata by using the Python API and inspecting `PipelineResult.audio_events`.
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,40 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "intelligent-cc"
+version = "0.1.0"
+description = "AI-assisted closed-caption suggestion pipeline for meaningful non-speech events in video."
+readme = "README.md"
+requires-python = ">=3.10,<3.11"  # README: pinned to 3.10.x to match the TensorFlow 2.10.x wheels
+dependencies = [  # exact pins; requirements.txt lists the same set
+  "click==8.1.7",
+  "ffmpeg-python==0.2.0",
+  "imageio-ffmpeg==0.4.9",  # supplies the FFmpeg executable, so no system FFmpeg install is needed (per README)
+  "librosa==0.10.1",
+  "numpy==1.26.4",
+  "opencv-python-headless==4.10.0.84",
+  "protobuf==3.19.6",  # NOTE(review): presumably pinned for TensorFlow 2.10.x compatibility; confirm
+  "scipy==1.10.1",
+  "tensorflow==2.10.1",
+  "tensorflow-hub==0.12.0",  # README: YAMNet is loaded from TensorFlow Hub on first use
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest==8.2.2",  # test runner for the README "Development" steps
+]
+
+[project.scripts]
+intelligent-cc = "intelligent_cc.cli:main"  # installs the `intelligent-cc` console command
+
+[tool.setuptools.package-dir]
+"" = "src"  # src layout: the package root maps to src/
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["src"]  # pytest adds src/ to sys.path for test imports
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+click==8.1.7  # this list mirrors [project].dependencies in pyproject.toml; keep both in sync
+ffmpeg-python==0.2.0
+imageio-ffmpeg==0.4.9
+librosa==0.10.1
+numpy==1.26.4
+opencv-python-headless==4.10.0.84
+protobuf==3.19.6
+scipy==1.10.1
+tensorflow==2.10.1
+tensorflow-hub==0.12.0
diff --git a/sls/applause_output.sls b/sls/applause_output.sls
@@ -0,0 +1,13 @@
+[
+  {
+    "index": 1,
+    "label": "applause",
+    "text": "[applause]",
+    "start": 0.48,
+    "end": 24.96,
+    "audio_confidence": 0.9995602965354919,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8422581630945207,
+    "reason": "audio+visual"
+  }
+]
diff --git a/sls/glass_output.sls b/sls/glass_output.sls
@@ -0,0 +1,156 @@
+[
+  {
+    "index": 1,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 7.199999999999999,
+    "end": 10.56,
+    "audio_confidence": 0.9861820936203003,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9549001514911651,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 7.68,
+    "end": 10.08,
+    "audio_confidence": 0.9511262774467468,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8156194525957108,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 8.64,
+    "end": 10.08,
+    "audio_confidence": 0.9510408043861389,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8155724424123765,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 12.0,
+    "end": 17.28,
+    "audio_confidence": 0.9526554942131042,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9364605218172074,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 12.0,
+    "end": 12.96,
+    "audio_confidence": 0.8753526210784912,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7739439415931703,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 12.0,
+    "end": 12.96,
+    "audio_confidence": 0.6574118137359619,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.6540764975547791,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 13.92,
+    "end": 16.32,
+    "audio_confidence": 0.9251334071159363,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8013233739137651,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 8,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 13.92,
+    "end": 14.4,
+    "audio_confidence": 0.6074072122573853,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.626573966741562,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 9,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 15.36,
+    "end": 16.32,
+    "audio_confidence": 0.8594706654548645,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7652088660001756,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 10,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8928834795951843,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9035859137773515,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 11,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8732807040214539,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7728043872117997,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 12,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8596591949462891,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7653125572204591,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 13,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 21.599999999999998,
+    "end": 24.96,
+    "audio_confidence": 0.9833531975746155,
+    "reaction_confidence": 0.46931650208883047,
+    "decision_score": 0.8720366846060122,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 14,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 21.599999999999998,
+    "end": 24.48,
+    "audio_confidence": 0.9815574884414673,
+    "reaction_confidence": 0.4739797139495354,
+    "decision_score": 0.7531474899200981,
+    "reason": "audio+visual"
+  }
+]
diff --git a/sls/gun_output.sls b/sls/gun_output.sls
@@ -0,0 +1,79 @@
+[
+  {
+    "index": 1,
+    "label": "explosion",
+    "text": "[explosion]",
+    "start": 5.76,
+    "end": 9.120000000000001,
+    "audio_confidence": 0.7828652858734131,
+    "reaction_confidence": 0.24199659019974723,
+    "decision_score": 0.6594743728202636,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 5.76,
+    "end": 6.720000000000001,
+    "audio_confidence": 0.5234716534614563,
+    "reaction_confidence": 0.38832228271901104,
+    "decision_score": 0.5826544366273559,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "explosion",
+    "text": "[explosion]",
+    "start": 10.08,
+    "end": 23.52,
+    "audio_confidence": 0.9754327535629272,
+    "reaction_confidence": 0.45062498595623396,
+    "decision_score": 0.8592692581399153,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 12.0,
+    "end": 18.72,
+    "audio_confidence": 0.9698405265808105,
+    "reaction_confidence": 0.4639640204182693,
+    "decision_score": 0.862196098807667,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "fireworks",
+    "text": "[fireworks]",
+    "start": 18.24,
+    "end": 21.599999999999998,
+    "audio_confidence": 0.8429054021835327,
+    "reaction_confidence": 0.19450016282831448,
+    "decision_score": 0.5511230444736845,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 21.119999999999997,
+    "end": 23.04,
+    "audio_confidence": 0.8048539757728577,
+    "reaction_confidence": 0.6408123896081255,
+    "decision_score": 0.8510352619987281,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "artillery fire",
+    "text": "[artillery fire]",
+    "start": 22.56,
+    "end": 23.04,
+    "audio_confidence": 0.6131650805473328,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.6297407943010331,
+    "reason": "audio+visual"
+  }
+]