diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8b11fd1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+# Python environments and tool caches
+.venv/
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+
+# Packaging artifacts produced by `pip install -e .` and builds
+*.egg-info/
+build/
+dist/
+
+# Caption exports, audio files and logs
+*.srt
+*.sls
+*.wav
+*.log
+
+# Local model files and caches
+models/
+.cache/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6ab4371
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+# Intelligent Closed Caption Suggestion Tool
+
+AI-assisted backend pipeline for finding meaningful non-speech moments in a video and exporting closed-caption suggestions as SRT or SLS. The pipeline combines:
+
+- YAMNet sound event detection for non-speech audio events.
+- OpenCV frame sampling and optical-flow motion analysis.
+- Face-position shift detection using MediaPipe when installed, with an OpenCV Haar-cascade fallback in the default setup.
+- A decision engine that avoids captioning low-impact ambient sounds unless the audio event and scene reaction justify it.
+
+## Python And Dependencies
+
+Use Python `3.10.x`. The project pins `>=3.10,<3.11` because development targets Python 3.10 and the pinned TensorFlow `2.10.x` release provides compatible native Windows wheels for that runtime.
+
+The app uses `imageio-ffmpeg` to provide an FFmpeg executable, so you do not need a separate system FFmpeg install for normal CLI use.
+
+```powershell
+py -3.10 -m venv .venv
+.\.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+pip install -e .
+```
+
+YAMNet is loaded from TensorFlow Hub on first use, so the first run needs internet access to download the model cache.
+
+## Usage
+
+```powershell
+intelligent-cc video.mp4 -o output.srt
+```
+
+For structured JSON-style output:
+
+```powershell
+intelligent-cc video.mp4 --format sls -o output.sls
+```
+
+Useful tuning flags:
+
+```powershell
+intelligent-cc video.mp4 --audio-threshold 0.30 --decision-threshold 0.55 --max-events 20
+```
+
+## Pipeline
+
+1. Extract mono 16 kHz audio from the input video with FFmpeg.
+2. Run YAMNet and keep captionable non-speech classes such as honking, glass breaking, alarms, applause, explosions, sirens, laughter, and music.
+3. Merge adjacent detections into timestamped audio events with confidence scores.
+4. Sample video frames around each event timestamp.
+5. Score visible reaction using optical flow and face-center movement (MediaPipe when installed, otherwise the OpenCV Haar-cascade fallback).
+6. Combine audio confidence, visual reaction confidence, and high-impact label rules.
+7. Export accepted suggestions as SRT or SLS captions like `[honking]`.
+
+## Development
+
+```powershell
+pip install -e ".[dev]"
+pytest
+```
+
+## Notes
+
+- The included `video.mp4` can be used for a smoke test after dependencies are installed.
+- The reaction detector is intentionally conservative: routine background sounds are rejected unless paired with visible motion/reaction or a high-impact audio label.
+- For production review workflows, keep rejected events as diagnostic metadata by using the Python API and inspecting `PipelineResult.audio_events`.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..74fb913
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,40 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "intelligent-cc"
+version = "0.1.0"
+description = "AI-assisted closed-caption suggestion pipeline for meaningful non-speech events in video."
+readme = "README.md"
+requires-python = ">=3.10,<3.11"
+dependencies = [
+  "click==8.1.7",
+  "ffmpeg-python==0.2.0",
+  "imageio-ffmpeg==0.4.9",
+  "librosa==0.10.1",
+  "numpy==1.26.4",
+  "opencv-python-headless==4.10.0.84",
+  "protobuf==3.19.6",
+  "scipy==1.10.1",
+  "tensorflow==2.10.1",
+  "tensorflow-hub==0.12.0",
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest==8.2.2",
+]
+
+[project.scripts]
+intelligent-cc = "intelligent_cc.cli:main"
+
+[tool.setuptools.package-dir]
+"" = "src"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["src"]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..917e846
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+click==8.1.7
+ffmpeg-python==0.2.0
+imageio-ffmpeg==0.4.9
+librosa==0.10.1
+numpy==1.26.4
+opencv-python-headless==4.10.0.84
+protobuf==3.19.6
+scipy==1.10.1
+tensorflow==2.10.1
+tensorflow-hub==0.12.0
diff --git a/sls/applause_output.sls b/sls/applause_output.sls
new file mode 100644
index 0000000..42045ca
--- /dev/null
+++ b/sls/applause_output.sls
@@ -0,0 +1,13 @@
+[
+  {
+    "index": 1,
+    "label": "applause",
+    "text": "[applause]",
+    "start": 0.48,
+    "end": 24.96,
+    "audio_confidence": 0.9995602965354919,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8422581630945207,
+    "reason": "audio+visual"
+  }
+]
\ No newline at end of file
diff --git a/sls/glass_output.sls b/sls/glass_output.sls
new file mode 100644
index 0000000..c18cbaa
--- /dev/null
+++ b/sls/glass_output.sls
@@ -0,0 +1,156 @@
+[
+  {
+    "index": 1,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 7.199999999999999,
+    "end": 10.56,
+    "audio_confidence": 0.9861820936203003,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9549001514911651,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 7.68,
+    "end": 10.08,
+    "audio_confidence": 0.9511262774467468,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8156194525957108,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 8.64,
+    "end": 10.08,
+    "audio_confidence": 0.9510408043861389,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8155724424123765,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 12.0,
+    "end": 17.28,
+    "audio_confidence": 0.9526554942131042,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9364605218172074,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 12.0,
+    "end": 12.96,
+    "audio_confidence": 0.8753526210784912,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7739439415931703,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 12.0,
+    "end": 12.96,
+    "audio_confidence": 0.6574118137359619,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.6540764975547791,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 13.92,
+    "end": 16.32,
+    "audio_confidence": 0.9251334071159363,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8013233739137651,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 8,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 13.92,
+    "end": 14.4,
+    "audio_confidence": 0.6074072122573853,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.626573966741562,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 9,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 15.36,
+    "end": 16.32,
+    "audio_confidence": 0.8594706654548645,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7652088660001756,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 10,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8928834795951843,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9035859137773515,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 11,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8732807040214539,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7728043872117997,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 12,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 18.72,
+    "end": 19.68,
+    "audio_confidence": 0.8596591949462891,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7653125572204591,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 13,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 21.599999999999998,
+    "end": 24.96,
+    "audio_confidence": 0.9833531975746155,
+    "reaction_confidence": 0.46931650208883047,
+    "decision_score": 0.8720366846060122,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 14,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 21.599999999999998,
+    "end": 24.48,
+    "audio_confidence": 0.9815574884414673,
+    "reaction_confidence": 0.4739797139495354,
+    "decision_score": 0.7531474899200981,
+    "reason": "audio+visual"
+  }
+]
\ No newline at end of file
diff --git a/sls/gun_output.sls b/sls/gun_output.sls
new file mode 100644
index 0000000..f805f92
--- /dev/null
+++ b/sls/gun_output.sls
@@ -0,0 +1,79 @@
+[
+  {
+    "index": 1,
+    "label": "explosion",
+    "text": "[explosion]",
+    "start": 5.76,
+    "end": 9.120000000000001,
+    "audio_confidence": 0.7828652858734131,
+    "reaction_confidence": 0.24199659019974723,
+    "decision_score": 0.6594743728202636,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 5.76,
+    "end": 6.720000000000001,
+    "audio_confidence": 0.5234716534614563,
+    "reaction_confidence": 0.38832228271901104,
+    "decision_score": 0.5826544366273559,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "explosion",
+    "text": "[explosion]",
+    "start": 10.08,
+    "end": 23.52,
+    "audio_confidence": 0.9754327535629272,
+    "reaction_confidence": 0.45062498595623396,
+    "decision_score": 0.8592692581399153,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 12.0,
+    "end": 18.72,
+    "audio_confidence": 0.9698405265808105,
+    "reaction_confidence": 0.4639640204182693,
+    "decision_score": 0.862196098807667,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "fireworks",
+    "text": "[fireworks]",
+    "start": 18.24,
+    "end": 21.599999999999998,
+    "audio_confidence": 0.8429054021835327,
+    "reaction_confidence": 0.19450016282831448,
+    "decision_score": 0.5511230444736845,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 21.119999999999997,
+    "end": 23.04,
+    "audio_confidence": 0.8048539757728577,
+    "reaction_confidence": 0.6408123896081255,
+    "decision_score": 0.8510352619987281,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "artillery fire",
+    "text": "[artillery fire]",
+    "start": 22.56,
+    "end": 23.04,
+    "audio_confidence": 0.6131650805473328,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.6297407943010331,
+    "reason": "audio+visual"
+  }
+]
\ No newline at end of file
diff --git a/sls/honking_output.sls b/sls/honking_output.sls
new file mode 100644
index 0000000..83f969f
--- /dev/null
+++ b/sls/honking_output.sls
@@ -0,0 +1,79 @@
+[
+  {
+    "index": 1,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 2.4,
+    "end": 12.48,
+    "audio_confidence": 0.9964298009872437,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9605363905429841,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 10.08,
+    "end": 10.56,
+    "audio_confidence": 0.3690028488636017,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.6154515668749809,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 15.36,
+    "end": 18.24,
+    "audio_confidence": 0.531771719455719,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7049744457006455,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 24.0,
+    "end": 25.919999999999998,
+    "audio_confidence": 0.9008418917655945,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9079630404710771,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 28.799999999999997,
+    "end": 29.279999999999998,
+    "audio_confidence": 0.7434176802635193,
+    "reaction_confidence": 0.6412603682918208,
+    "decision_score": 0.8174468898762549,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 31.2,
+    "end": 41.279999999999994,
+    "audio_confidence": 0.9966819882392883,
+    "reaction_confidence": 0.5461075806537908,
+    "decision_score": 0.9139235048258145,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 40.32,
+    "end": 40.8,
+    "audio_confidence": 0.46439415216445923,
+    "reaction_confidence": 0.0,
+    "decision_score": 0.3754167836904526,
+    "reason": "high-impact-audio"
+  }
+]
\ No newline at end of file
diff --git a/sls/siren_output.sls b/sls/siren_output.sls
new file mode 100644
index 0000000..3ddb772
--- /dev/null
+++ b/sls/siren_output.sls
@@ -0,0 +1,134 @@
+[
+  {
+    "index": 1,
+    "label": "siren",
+    "text": "[siren]",
+    "start": 0.96,
+    "end": 1.44,
+    "audio_confidence": 0.8404152989387512,
+    "reaction_confidence": 0.011924344025995747,
+    "decision_score": 0.5875943692280112,
+    "reason": "high-impact-audio"
+  },
+  {
+    "index": 2,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 0.96,
+    "end": 1.44,
+    "audio_confidence": 0.45202356576919556,
+    "reaction_confidence": 0.011924344025995747,
+    "decision_score": 0.3739789159847557,
+    "reason": "high-impact-audio"
+  },
+  {
+    "index": 3,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 2.88,
+    "end": 7.68,
+    "audio_confidence": 0.9621760845184326,
+    "reaction_confidence": 0.5794830769953337,
+    "decision_score": 0.9099642311330381,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "siren",
+    "text": "[siren]",
+    "start": 2.88,
+    "end": 7.68,
+    "audio_confidence": 0.9531972408294678,
+    "reaction_confidence": 0.5794830769953337,
+    "decision_score": 0.9050258671041075,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "siren",
+    "text": "[siren]",
+    "start": 8.64,
+    "end": 10.56,
+    "audio_confidence": 0.9823512434959412,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9527931839227678,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 8.64,
+    "end": 10.56,
+    "audio_confidence": 0.9796193838119507,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9512906610965729,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "music",
+    "text": "[music]",
+    "start": 10.559999999999999,
+    "end": 12.0,
+    "audio_confidence": 0.9774209856987,
+    "reaction_confidence": 0.021797618726838597,
+    "decision_score": 0.5473904705613624,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 8,
+    "label": "siren",
+    "text": "[siren]",
+    "start": 12.0,
+    "end": 16.32,
+    "audio_confidence": 0.9961757063865662,
+    "reaction_confidence": 0.02027477735210032,
+    "decision_score": 0.6770202883210565,
+    "reason": "high-impact-audio"
+  },
+  {
+    "index": 9,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 12.0,
+    "end": 16.32,
+    "audio_confidence": 0.9948476552963257,
+    "reaction_confidence": 0.02027477735210032,
+    "decision_score": 0.6762898602214243,
+    "reason": "high-impact-audio"
+  },
+  {
+    "index": 10,
+    "label": "music",
+    "text": "[music]",
+    "start": 16.32,
+    "end": 17.28,
+    "audio_confidence": 0.9585652947425842,
+    "reaction_confidence": 0.018021383470899995,
+    "decision_score": 0.5353205346703264,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 11,
+    "label": "alarm",
+    "text": "[alarm]",
+    "start": 17.28,
+    "end": 28.8,
+    "audio_confidence": 0.9847885370254517,
+    "reaction_confidence": 0.086813772000233,
+    "decision_score": 0.7006998927641033,
+    "reason": "high-impact-audio"
+  },
+  {
+    "index": 12,
+    "label": "siren",
+    "text": "[siren]",
+    "start": 17.28,
+    "end": 28.8,
+    "audio_confidence": 0.9837005138397217,
+    "reaction_confidence": 0.086813772000233,
+    "decision_score": 0.7001014800119518,
+    "reason": "high-impact-audio"
+  }
+]
\ No newline at end of file
diff --git a/sls/smoke_output.sls b/sls/smoke_output.sls
new file mode 100644
index 0000000..4917ba5
--- /dev/null
+++ b/sls/smoke_output.sls
@@ -0,0 +1,35 @@
+[
+  {
+    "index": 1,
+    "label": "music",
+    "text": "[music]",
+    "start": 0.0,
+    "end": 0.48,
+    "audio_confidence": 0.9944783449172974,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8394630897045137,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "music",
+    "text": "[music]",
+    "start": 1.44,
+    "end": 35.04,
+    "audio_confidence": 0.9990421533584595,
+    "reaction_confidence": 0.9930364583333335,
+    "decision_score": 0.9963395905971528,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "music",
+    "text": "[music]",
+    "start": 49.44,
+    "end": 50.879999999999995,
+    "audio_confidence": 0.7439404129981995,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7016672271490098,
+    "reason": "audio+visual"
+  }
+]
\ No newline at end of file
diff --git a/sls/test_sound_output.sls b/sls/test_sound_output.sls
new file mode 100644
index 0000000..ff83c15
--- /dev/null
+++ b/sls/test_sound_output.sls
@@ -0,0 +1,156 @@
+[
+  {
+    "index": 1,
+    "label": "applause",
+    "text": "[applause]",
+    "start": 0.0,
+    "end": 10.56,
+    "audio_confidence": 0.999273419380188,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8421003806591034,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 2,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 12.959999999999999,
+    "end": 21.12,
+    "audio_confidence": 0.9998518824577332,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9624185353517533,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 3,
+    "label": "honking",
+    "text": "[honking]",
+    "start": 22.08,
+    "end": 23.52,
+    "audio_confidence": 0.9345638751983643,
+    "reaction_confidence": 0.4904459294825792,
+    "decision_score": 0.854710799626261,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 4,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 24.96,
+    "end": 29.279999999999998,
+    "audio_confidence": 0.9425867199897766,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.9309226959943772,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 5,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 24.96,
+    "end": 25.44,
+    "audio_confidence": 0.7915558218955994,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7278557020425798,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 6,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 24.96,
+    "end": 25.44,
+    "audio_confidence": 0.4363442063331604,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.5324893134832382,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 7,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 26.88,
+    "end": 29.279999999999998,
+    "audio_confidence": 0.9276607036590576,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8027133870124818,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 8,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 26.88,
+    "end": 27.36,
+    "audio_confidence": 0.4864293038845062,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.5600361171364785,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 9,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 28.32,
+    "end": 29.279999999999998,
+    "audio_confidence": 0.8208039402961731,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7439421671628953,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 10,
+    "label": "glass breaking",
+    "text": "[glass breaking]",
+    "start": 31.2,
+    "end": 32.63999999999999,
+    "audio_confidence": 0.8777748942375183,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.8952761918306352,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 11,
+    "label": "breaking",
+    "text": "[breaking]",
+    "start": 31.68,
+    "end": 32.63999999999999,
+    "audio_confidence": 0.8007316589355469,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7329024124145509,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 12,
+    "label": "shatter",
+    "text": "[shatter]",
+    "start": 31.68,
+    "end": 32.63999999999999,
+    "audio_confidence": 0.7463002800941467,
+    "reaction_confidence": 0.65,
+    "decision_score": 0.7029651540517807,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 13,
+    "label": "explosion",
+    "text": "[explosion]",
+    "start": 33.6,
+    "end": 43.199999999999996,
+    "audio_confidence": 0.9554585814476013,
+    "reaction_confidence": 0.4912832531764888,
+    "decision_score": 0.8665796837256007,
+    "reason": "audio+visual"
+  },
+  {
+    "index": 14,
+    "label": "gunshot",
+    "text": "[gunshot]",
+    "start": 33.6,
+    "end": 38.4,
+    "audio_confidence": 0.9355015754699707,
+    "reaction_confidence": 0.41342752319574355,
+    "decision_score": 0.8205682519465686,
+    "reason": "audio+visual"
+  }
+]
\ No newline at end of file
diff --git a/src/intelligent_cc.egg-info/PKG-INFO b/src/intelligent_cc.egg-info/PKG-INFO
new file mode 100644
index 0000000..d20229e
--- /dev/null
+++ b/src/intelligent_cc.egg-info/PKG-INFO
@@ -0,0 +1,83 @@
+Metadata-Version: 2.4
+Name: intelligent-cc
+Version: 0.1.0
+Summary: AI-assisted closed-caption suggestion pipeline for meaningful non-speech events in video.
+Requires-Python: <3.11,>=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: click==8.1.7
+Requires-Dist: ffmpeg-python==0.2.0
+Requires-Dist: imageio-ffmpeg==0.4.9
+Requires-Dist: librosa==0.10.1
+Requires-Dist: numpy==1.26.4
+Requires-Dist: opencv-python-headless==4.10.0.84
+Requires-Dist: protobuf==3.19.6
+Requires-Dist: scipy==1.10.1
+Requires-Dist: tensorflow==2.10.1
+Requires-Dist: tensorflow-hub==0.12.0
+Provides-Extra: dev
+Requires-Dist: pytest==8.2.2; extra == "dev"
+
+# Intelligent Closed Caption Suggestion Tool
+
+AI-assisted backend pipeline for finding meaningful non-speech moments in a video and exporting closed-caption suggestions as SRT or SLS. The pipeline combines:
+
+- YAMNet sound event detection for non-speech audio events.
+- OpenCV frame sampling and optical-flow motion analysis.
+- MediaPipe face detection for visible head/face-position shifts.
+- A decision engine that avoids captioning low-impact ambient sounds unless the audio event and scene reaction justify it.
+
+## Python And Dependencies
+
+Use Python `3.10.x`. The project pins `>=3.10,<3.11` because this machine has Python 3.10 installed and TensorFlow `2.10.x` provides compatible native Windows wheels for that runtime.
+
+The app uses `imageio-ffmpeg` to provide an FFmpeg executable, so you do not need a separate system FFmpeg install for normal CLI use.
+
+```powershell
+py -3.10 -m venv .venv
+.\.venv\Scripts\Activate.ps1
+python -m pip install --upgrade pip
+pip install -e .
+```
+
+YAMNet is loaded from TensorFlow Hub on first use, so the first run needs internet access to download the model cache.
+
+## Usage
+
+```powershell
+intelligent-cc video.mp4 -o output.srt
+```
+
+For structured JSON-style output:
+
+```powershell
+intelligent-cc video.mp4 --format sls -o output.sls
+```
+
+Useful tuning flags:
+
+```powershell
+intelligent-cc video.mp4 --audio-threshold 0.30 --decision-threshold 0.55 --max-events 20
+```
+
+## Pipeline
+
+1. Extract mono 16 kHz audio from the input video with FFmpeg.
+2. Run YAMNet and keep captionable non-speech classes such as honking, glass breaking, alarms, applause, explosions, sirens, laughter, and music.
+3. Merge adjacent detections into timestamped audio events with confidence scores.
+4. Sample video frames around each event timestamp.
+5. Score visible reaction using optical flow and MediaPipe face-center movement.
+6. Combine audio confidence, visual reaction confidence, and high-impact label rules.
+7. Export accepted suggestions as SRT captions like `[honking]`.
+
+## Development
+
+```powershell
+pip install -e ".[dev]"
+pytest
+```
+
+## Notes
+
+- The included `video.mp4` can be used for a smoke test after dependencies are installed.
+- The reaction detector is intentionally conservative: routine background sounds are rejected unless paired with visible motion/reaction or a high-impact audio label.
+- For production review workflows, keep rejected events as diagnostic metadata by using the Python API and inspecting `PipelineResult.audio_events`.
diff --git a/src/intelligent_cc.egg-info/SOURCES.txt b/src/intelligent_cc.egg-info/SOURCES.txt
new file mode 100644
index 0000000..02fdf89
--- /dev/null
+++ b/src/intelligent_cc.egg-info/SOURCES.txt
@@ -0,0 +1,18 @@
+README.md
+pyproject.toml
+src/intelligent_cc/__init__.py
+src/intelligent_cc/audio.py
+src/intelligent_cc/cli.py
+src/intelligent_cc/decision.py
+src/intelligent_cc/models.py
+src/intelligent_cc/output.py
+src/intelligent_cc/pipeline.py
+src/intelligent_cc/vision.py
+src/intelligent_cc.egg-info/PKG-INFO
+src/intelligent_cc.egg-info/SOURCES.txt
+src/intelligent_cc.egg-info/dependency_links.txt
+src/intelligent_cc.egg-info/entry_points.txt
+src/intelligent_cc.egg-info/requires.txt
+src/intelligent_cc.egg-info/top_level.txt
+tests/test_decision.py
+tests/test_output.py
\ No newline at end of file
diff --git a/src/intelligent_cc.egg-info/dependency_links.txt b/src/intelligent_cc.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/intelligent_cc.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/intelligent_cc.egg-info/entry_points.txt b/src/intelligent_cc.egg-info/entry_points.txt
new file mode 100644
index 0000000..359c3d7
--- /dev/null
+++ b/src/intelligent_cc.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+intelligent-cc = intelligent_cc.cli:main
diff --git a/src/intelligent_cc.egg-info/requires.txt b/src/intelligent_cc.egg-info/requires.txt
new file mode 100644
index 0000000..ff0f514
--- /dev/null
+++ b/src/intelligent_cc.egg-info/requires.txt
@@ -0,0 +1,13 @@
+click==8.1.7
+ffmpeg-python==0.2.0
+imageio-ffmpeg==0.4.9
+librosa==0.10.1
+numpy==1.26.4
+opencv-python-headless==4.10.0.84
+protobuf==3.19.6
+scipy==1.10.1
+tensorflow==2.10.1
+tensorflow-hub==0.12.0
+
+[dev]
+pytest==8.2.2
diff --git a/src/intelligent_cc.egg-info/top_level.txt b/src/intelligent_cc.egg-info/top_level.txt
new file mode 100644
index 0000000..27ffce2
--- /dev/null
+++ b/src/intelligent_cc.egg-info/top_level.txt
@@ -0,0 +1 @@
+intelligent_cc
diff --git a/src/intelligent_cc/__init__.py b/src/intelligent_cc/__init__.py
new file mode 100644
index 0000000..597b904
--- /dev/null
+++ b/src/intelligent_cc/__init__.py
@@ -0,0 +1,5 @@
+"""Intelligent Closed Caption suggestion pipeline."""
+
+__all__ = ["__version__"]
+# Keep in sync with ``version`` in pyproject.toml.
+__version__ = "0.1.0"
diff --git a/src/intelligent_cc/audio.py b/src/intelligent_cc/audio.py
new file mode 100644
index 0000000..c5e72f3
--- /dev/null
+++ b/src/intelligent_cc/audio.py
@@ -0,0 +1,261 @@
+"""Detect captionable non-speech audio events in a video's soundtrack.
+
+Audio is extracted with FFmpeg, scored frame by frame with YAMNet, filtered to
+captionable labels, and merged into timestamped ``AudioEvent`` objects.
+"""
+
+from __future__ import annotations
+
+import csv
+import tempfile
+from pathlib import Path
+
+import ffmpeg
+import imageio_ffmpeg
+import librosa
+import numpy as np
+
+from .models import AudioEvent
+
+
+YAMNET_URL = "https://tfhub.dev/google/yamnet/1"
+# Sample rate (Hz) requested from FFmpeg and librosa before scoring.
+SAMPLE_RATE = 16000
+# Length and spacing, in seconds, of the time slot given to each score frame.
+# NOTE(review): the published YAMNet model scores 0.96 s windows every 0.48 s,
+# so a slot spans only the first half of its analysis window -- confirm that
+# this is intentional before relying on event end times.
+YAMNET_PATCH_SECONDS = 0.48
+YAMNET_HOP_SECONDS = 0.48
+# Number of highest-scoring classes inspected per score frame.
+TOP_CLASSES_PER_FRAME = 8
+
+# Lower-case substrings that mark a class display name as captionable.
+NOISE_LABEL_HINTS = {
+    "alarm",
+    "applause",
+    "bang",
+    "bell",
+    "breaking",
+    "cheering",
+    "clap",
+    "crash",
+    "cry",
+    "door",
+    "explosion",
+    "fire",
+    "glass",
+    "gunshot",
+    "horn",
+    "laughter",
+    "music",
+    "scream",
+    "shatter",
+    "siren",
+    "thunder",
+    "vehicle",
+}
+
+# Substring -> caption label. Entries are tried in insertion order and the first
+# match wins, so a specific needle must come before any generic needle it contains.
+LABEL_NORMALIZATION = {
+    "air horn, truck horn": "honking",
+    "car horn, truck horn": "honking",
+    # Non-vehicle horns must not fall through to the generic "horn" rule below.
+    "french horn": "french horn",
+    "foghorn": "foghorn",
+    "horn": "honking",
+    "glass": "glass breaking",
+    "glass shatter": "glass breaking",
+    "gunshot, gunfire": "gunshot",
+    "music": "music",
+    "applause": "applause",
+    "cheering": "crowd cheering",
+    "siren": "siren",
+    "alarm": "alarm",
+}
+
+
+class AudioEventDetector:
+    """YAMNet-based non-speech event detector.
+
+    Args:
+        confidence_threshold: Minimum class score for a frame detection to be kept.
+        min_duration: Minimum length, in seconds, of a merged event.
+        merge_gap: Largest gap, in seconds, across which detections with the
+            same label are merged into one event.
+        max_events: Optional cap on the number of events ``detect`` returns
+            (earliest events first).
+
+    Raises:
+        ValueError: If ``max_events`` is negative.
+    """
+
+    def __init__(
+        self,
+        confidence_threshold: float = 0.25,
+        min_duration: float = 0.35,
+        merge_gap: float = 0.6,
+        max_events: int | None = None,
+    ) -> None:
+        # A negative cap would silently drop events from the end of the list.
+        if max_events is not None and max_events < 0:
+            raise ValueError("max_events must be non-negative")
+        self.confidence_threshold = confidence_threshold
+        self.min_duration = min_duration
+        self.merge_gap = merge_gap
+        self.max_events = max_events
+        self._yamnet = None
+        self._class_names: list[str] | None = None
+
+    def detect(self, video_path: Path) -> list[AudioEvent]:
+        """Return the captionable audio events detected in ``video_path``.
+
+        Events are sorted by start time (ties: highest confidence first) and
+        truncated to ``max_events`` when a cap is set.
+
+        Raises:
+            RuntimeError: If FFmpeg cannot extract the audio track.
+        """
+        with tempfile.TemporaryDirectory(prefix="intelligent_cc_") as tmpdir:
+            wav_path = Path(tmpdir) / "audio.wav"
+            self._extract_audio(video_path, wav_path)
+            # Load inside the ``with`` block: the WAV file is deleted on exit.
+            waveform, _ = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
+
+        if waveform.size == 0:
+            return []
+
+        scores = self._score_waveform(waveform)
+        raw_events = self._scores_to_events(scores)
+        events = self._merge_events(raw_events)
+        events.sort(key=lambda event: (event.start, -event.confidence))
+        if self.max_events is not None:
+            events = events[: self.max_events]
+        return events
+
+    def _extract_audio(self, video_path: Path, wav_path: Path) -> None:
+        """Write the video's audio to ``wav_path`` as mono WAV at ``SAMPLE_RATE``.
+
+        Raises:
+            RuntimeError: If FFmpeg fails; the message carries FFmpeg's stderr.
+        """
+        try:
+            (
+                ffmpeg.input(str(video_path))
+                .output(str(wav_path), ac=1, ar=SAMPLE_RATE, format="wav", loglevel="error")
+                .overwrite_output()
+                .run(
+                    cmd=imageio_ffmpeg.get_ffmpeg_exe(),
+                    capture_stdout=True,
+                    capture_stderr=True,
+                )
+            )
+        except ffmpeg.Error as exc:
+            detail = exc.stderr.decode("utf-8", errors="ignore") if exc.stderr else str(exc)
+            raise RuntimeError(f"Unable to extract audio with ffmpeg: {detail}") from exc
+
+    def _load_yamnet(self):
+        """Return ``(model, class_names)``, loading and caching them on first use."""
+        if self._yamnet is None:
+            # Imported lazily so importing this module does not import TensorFlow Hub.
+            import tensorflow_hub as hub
+
+            self._yamnet = hub.load(YAMNET_URL)
+            class_map_path = self._yamnet.class_map_path().numpy().decode("utf-8")
+            with open(class_map_path, newline="", encoding="utf-8") as handle:
+                reader = csv.DictReader(handle)
+                self._class_names = [row["display_name"] for row in reader]
+        return self._yamnet, self._class_names or []
+
+    def _score_waveform(self, waveform: np.ndarray) -> np.ndarray:
+        """Run YAMNet on ``waveform`` and return its class scores, one row per frame."""
+        yamnet, _ = self._load_yamnet()
+        scores, _, _ = yamnet(waveform.astype(np.float32))
+        return scores.numpy()
+
+    def _scores_to_events(self, scores: np.ndarray) -> list[AudioEvent]:
+        """Turn per-frame scores into one raw event per captionable class hit.
+
+        Only the ``TOP_CLASSES_PER_FRAME`` best classes of each frame are
+        considered, and those scoring below ``confidence_threshold`` are skipped.
+        """
+        _, class_names = self._load_yamnet()
+        events: list[AudioEvent] = []
+        for frame_index, frame_scores in enumerate(scores):
+            top_indices = np.argsort(frame_scores)[-TOP_CLASSES_PER_FRAME:][::-1]
+            start = frame_index * YAMNET_HOP_SECONDS
+            end = start + YAMNET_PATCH_SECONDS
+            for class_index in top_indices:
+                label = class_names[int(class_index)]
+                confidence = float(frame_scores[class_index])
+                if confidence < self.confidence_threshold:
+                    continue
+                if not self._is_captionable_audio(label):
+                    continue
+                events.append(
+                    AudioEvent(
+                        label=self._normalize_label(label),
+                        confidence=confidence,
+                        start=start,
+                        end=end,
+                    )
+                )
+        return events
+
+    def _merge_events(self, events: list[AudioEvent]) -> list[AudioEvent]:
+        """Merge same-label detections separated by at most ``merge_gap`` seconds.
+
+        The merged confidence is the maximum of both confidences and their
+        duration-weighted mean. Events that end up shorter than
+        ``min_duration`` are dropped.
+        """
+        grouped: dict[str, list[AudioEvent]] = {}
+        for event in events:
+            grouped.setdefault(event.label, []).append(event)
+
+        merged: list[AudioEvent] = []
+        for label, label_events in grouped.items():
+            label_events.sort(key=lambda event: event.start)
+            # Each group holds at least one event, so the first one always exists.
+            current = label_events[0]
+            for event in label_events[1:]:
+                if event.start <= current.end + self.merge_gap:
+                    duration_a = max(current.end - current.start, 0.01)
+                    duration_b = max(event.end - event.start, 0.01)
+                    weighted_conf = (
+                        current.confidence * duration_a + event.confidence * duration_b
+                    ) / (duration_a + duration_b)
+                    current = AudioEvent(
+                        label=label,
+                        confidence=float(max(current.confidence, weighted_conf, event.confidence)),
+                        start=current.start,
+                        end=max(current.end, event.end),
+                    )
+                else:
+                    if current.end - current.start >= self.min_duration:
+                        merged.append(current)
+                    current = event
+            if current.end - current.start >= self.min_duration:
+                merged.append(current)
+        return merged
+
+    def _is_captionable_audio(self, label: str) -> bool:
+        """Return True for non-speech labels containing a ``NOISE_LABEL_HINTS`` substring."""
+        lowered = label.lower()
+        if "speech" in lowered or "conversation" in lowered or "narration" in lowered:
+            return False
+        return any(hint in lowered for hint in NOISE_LABEL_HINTS)
+
+    def _normalize_label(self, label: str) -> str:
+        """Map a class display name to its caption label.
+
+        The first matching ``LABEL_NORMALIZATION`` needle wins; otherwise the
+        lower-cased name is returned with ``_`` and ``/`` turned into spaces.
+        """
+        lowered = label.lower()
+        for needle, replacement in LABEL_NORMALIZATION.items():
+            if needle in lowered:
+                return replacement
+        cleaned = lowered.replace("_", " ").replace("/", " ")
+        return " ".join(cleaned.split())
diff --git a/src/intelligent_cc/cli.py b/src/intelligent_cc/cli.py
new file mode 100644
index 0000000..9e8e91b
--- /dev/null
+++ b/src/intelligent_cc/cli.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import click
+
+from .audio import AudioEventDetector
+from .decision import CaptionDecisionEngine
+from .pipeline import IntelligentCCPipeline
+from .vision import VisualReactionDetector
+
+
+@click.command(context_settings={"help_option_names": ["-h", "--help"]})
+@click.argument("video", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(dir_okay=False, path_type=Path),
+    default=None,
+    help="Output caption path. Defaults to the video path with the --format suffix.",
+)
+@click.option(
+    "--format",
+    "output_format",
+    type=click.Choice(["srt", "sls"], case_sensitive=False),
+    default="srt",
+    show_default=True,
+)
+@click.option("--audio-threshold", default=0.25, show_default=True, help="YAMNet confidence cutoff.")
+@click.option("--decision-threshold", default=0.5, show_default=True, help="CC acceptance cutoff.")
+@click.option("--max-events", type=int, default=None, help="Optional cap for quick test runs.")
+def main(
+    video: Path,
+    output: Path | None,
+    output_format: str,
+    audio_threshold: float,
+    decision_threshold: float,
+    max_events: int | None,
+) -> None:
+    """Generate context-aware non-speech CC suggestions for VIDEO."""
+    # Without -o, write beside the input video using the chosen format's suffix.
+    target = output if output is not None else video.with_suffix(f".{output_format}")
+
+    pipeline = IntelligentCCPipeline(
+        audio_detector=AudioEventDetector(
+            confidence_threshold=audio_threshold,
+            max_events=max_events,
+        ),
+        reaction_detector=VisualReactionDetector(),
+        decision_engine=CaptionDecisionEngine(decision_threshold=decision_threshold),
+    )
+    result = pipeline.run(video, target, output_format=output_format)
+
+    # Report how many events were detected, how many were kept, and the file written.
+    click.echo(f"Detected audio events: {len(result.audio_events)}")
+    click.echo(f"Accepted CC suggestions: {len(result.suggestions)}")
+    click.echo(f"Wrote: {result.output_path}")
diff --git a/src/intelligent_cc/decision.py b/src/intelligent_cc/decision.py
new file mode 100644
index 0000000..f5a7e6c
--- /dev/null
+++ b/src/intelligent_cc/decision.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from .models import AudioEvent, CaptionSuggestion, ReactionSignal
+
+
+HIGH_IMPACT_LABELS = {  # labels decide() boosts and may accept on audio confidence alone
+    "alarm",
+    "explosion",
+    "glass breaking",
+    "gunshot",
+    "honking",
+    "scream",
+    "siren",
+}
+
+
+class CaptionDecisionEngine:
+    """Blend audio and visual-reaction confidence into an accept/reject decision."""
+
+    def __init__(
+        self,
+        audio_weight: float = 0.55,
+        reaction_weight: float = 0.45,
+        decision_threshold: float = 0.5,
+        high_impact_audio_threshold: float = 0.45,
+    ) -> None:
+        self.audio_weight = audio_weight
+        self.reaction_weight = reaction_weight
+        self.decision_threshold = decision_threshold
+        # Audio confidence at which a HIGH_IMPACT_LABELS event skips the score gate.
+        self.high_impact_audio_threshold = high_impact_audio_threshold
+
+    def decide(
+        self,
+        event: AudioEvent,
+        reaction: ReactionSignal,
+        index: int,
+    ) -> CaptionSuggestion | None:
+        """Return a suggestion numbered ``index`` for ``event``, or None if rejected.
+
+        The score is the weighted audio and reaction confidences plus 0.12 for
+        HIGH_IMPACT_LABELS, capped at 1.0. Events scoring below the decision
+        threshold are rejected unless the high-impact bypass applies.
+        """
+        is_high_impact = event.label in HIGH_IMPACT_LABELS
+        blended = self.audio_weight * event.confidence + self.reaction_weight * reaction.confidence
+        score = min(1.0, blended + (0.12 if is_high_impact else 0.0))
+        bypass = is_high_impact and event.confidence >= self.high_impact_audio_threshold
+        if not bypass and score < self.decision_threshold:
+            return None
+
+        # A bypass-eligible event with almost no visible reaction is labelled as audio-only.
+        reason = "high-impact-audio" if bypass and reaction.confidence < 0.2 else "audio+visual"
+        return CaptionSuggestion(
+            index=index,
+            label=event.label,
+            text=f"[{event.label}]",
+            start=event.start,
+            end=event.end,
+            audio_confidence=event.confidence,
+            reaction_confidence=reaction.confidence,
+            decision_score=float(score),
+            reason=reason,
+        )
diff --git a/src/intelligent_cc/models.py b/src/intelligent_cc/models.py
new file mode 100644
index 0000000..03b5735
--- /dev/null
+++ b/src/intelligent_cc/models.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class AudioEvent:  # one labelled audio span produced by the audio detector
+    label: str  # caption label, e.g. "glass breaking"
+    confidence: float  # detection confidence for the label
+    start: float  # span start in seconds
+    end: float  # span end in seconds
+
+
+@dataclass(frozen=True)
+class ReactionSignal:  # visual-reaction scores for the frames around one event
+    confidence: float  # combined score; score_event uses 0.65 * motion + 0.35 * face shift
+    motion_score: float  # optical-flow motion score, capped at 1.0
+    face_shift_score: float  # face-centre movement score, capped at 1.0
+    frame_count: int  # number of frames sampled for the event
+
+
+@dataclass(frozen=True)
+class CaptionSuggestion:  # an accepted caption produced by CaptionDecisionEngine.decide
+    index: int  # cue number written as the first line of the SRT entry
+    label: str  # label copied from the audio event
+    text: str  # caption text; decide() formats it as "[label]"
+    start: float  # cue start in seconds
+    end: float  # cue end in seconds
+    audio_confidence: float  # confidence of the source audio event
+    reaction_confidence: float  # confidence of the matching ReactionSignal
+    decision_score: float  # blended score computed by decide(), at most 1.0
+    reason: str  # decide() sets "audio+visual" or "high-impact-audio"
+
+
+@dataclass(frozen=True)
+class PipelineResult:  # summary returned by IntelligentCCPipeline.run
+    video_path: Path  # resolved input video path
+    audio_events: list[AudioEvent]  # every detected event, accepted or not
+    suggestions: list[CaptionSuggestion]  # only the events accepted as captions
+    output_path: Path  # resolved path of the caption file that was written
diff --git a/src/intelligent_cc/output.py b/src/intelligent_cc/output.py
new file mode 100644
index 0000000..a816147
--- /dev/null
+++ b/src/intelligent_cc/output.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from pathlib import Path
+
+from .models import CaptionSuggestion
+
+
+def write_srt(suggestions: list[CaptionSuggestion], output_path: Path) -> None:
+    """Write ``suggestions`` to ``output_path`` as UTF-8 SubRip cues.
+
+    Missing parent directories are created. Each cue is an index line, a
+    ``start --> end`` line and the caption text; cues are separated by a
+    blank line.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    cues = [
+        f"{cue.index}\n{format_timestamp(cue.start)} --> {format_timestamp(cue.end)}\n{cue.text}\n"
+        for cue in suggestions
+    ]
+    output_path.write_text("\n".join(cues), encoding="utf-8")
+
+
+def write_sls(suggestions: list[CaptionSuggestion], output_path: Path) -> None:
+    """Write the suggestions to ``output_path`` as an indented JSON array (UTF-8)."""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps([asdict(s) for s in suggestions], indent=2), encoding="utf-8")
+
+
+def format_timestamp(seconds: float) -> str:
+    """Format ``seconds`` as SRT ``HH:MM:SS,mmm``; negative values clamp to zero."""
+    total_ms = max(int(round(seconds * 1000)), 0)
+    hours, remainder = divmod(total_ms, 3_600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    return f"{hours:02}:{minutes:02}:{remainder // 1000:02},{remainder % 1000:03}"
diff --git a/src/intelligent_cc/pipeline.py b/src/intelligent_cc/pipeline.py
new file mode 100644
index 0000000..c668633
--- /dev/null
+++ b/src/intelligent_cc/pipeline.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from .audio import AudioEventDetector
+from .decision import CaptionDecisionEngine
+from .models import PipelineResult
+from .output import write_sls, write_srt
+from .vision import VisualReactionDetector
+
+
+class IntelligentCCPipeline:
+    """Detect audio events, score visual reactions, and write accepted captions."""
+
+    def __init__(
+        self,
+        audio_detector: AudioEventDetector | None = None,
+        reaction_detector: VisualReactionDetector | None = None,
+        decision_engine: CaptionDecisionEngine | None = None,
+    ) -> None:
+        self.audio_detector = audio_detector or AudioEventDetector()
+        self.reaction_detector = reaction_detector or VisualReactionDetector()
+        self.decision_engine = decision_engine or CaptionDecisionEngine()
+
+    def run(self, video_path: Path, output_path: Path, output_format: str = "srt") -> PipelineResult:
+        """Run detection on ``video_path`` and write accepted captions to ``output_path``.
+
+        Raises FileNotFoundError or ValueError for bad inputs before any detection work.
+        """
+        video_path = video_path.resolve()
+        output_path = output_path.resolve()
+        if not video_path.exists():
+            raise FileNotFoundError(video_path)
+        if output_format not in ("srt", "sls"):
+            raise ValueError(f"Unsupported output format: {output_format}")
+
+        audio_events = self.audio_detector.detect(video_path)
+        suggestions = []
+        for event in audio_events:
+            reaction = self.reaction_detector.score_event(video_path, event)
+            suggestion = self.decision_engine.decide(event, reaction, len(suggestions) + 1)
+            if suggestion is not None:
+                suggestions.append(suggestion)
+        writer = write_srt if output_format == "srt" else write_sls
+        writer(suggestions, output_path)
+        return PipelineResult(
+            video_path=video_path,
+            audio_events=audio_events,
+            suggestions=suggestions,
+            output_path=output_path,
+        )
diff --git a/src/intelligent_cc/vision.py b/src/intelligent_cc/vision.py
new file mode 100644
index 0000000..7b6e919
--- /dev/null
+++ b/src/intelligent_cc/vision.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from .models import AudioEvent, ReactionSignal
+
+
+class VisualReactionDetector:
+    """Detects visible motion and face/head-position changes around an audio event."""
+
+    def __init__(
+        self,
+        window_before: float = 0.75,
+        window_after: float = 1.25,
+        sample_fps: float = 6.0,
+    ) -> None:
+        self.window_before = window_before
+        self.window_after = window_after
+        self.sample_fps = sample_fps
+        self._mediapipe_detector = None
+        self._haar_detector = None
+        # Set when the optional mediapipe import fails so it is not retried per frame.
+        self._mediapipe_missing = False
+
+    def score_event(self, video_path: Path, event: AudioEvent) -> ReactionSignal:
+        """Score the reaction in frames around ``event``; all zeros with under two frames."""
+        window_start = max(event.start - self.window_before, 0.0)
+        frames = self._sample_frames(video_path, window_start, event.end + self.window_after)
+        if len(frames) < 2:
+            return ReactionSignal(0.0, 0.0, 0.0, len(frames))
+
+        motion = self._motion_score(frames)
+        face_shift = self._face_shift_score(frames)
+        confidence = min(1.0, 0.65 * motion + 0.35 * face_shift)
+        return ReactionSignal(float(confidence), float(motion), float(face_shift), len(frames))
+
+    def _sample_frames(self, video_path: Path, start: float, end: float) -> list[np.ndarray]:
+        """Return 320x180 frames taken about ``sample_fps`` times per second in [start, end].
+
+        Raises RuntimeError if the video cannot be opened; stops at the first failed read.
+        """
+        capture = cv2.VideoCapture(str(video_path))
+        if not capture.isOpened():
+            raise RuntimeError(f"Unable to open video: {video_path}")
+
+        frames: list[np.ndarray] = []
+        try:
+            source_fps = capture.get(cv2.CAP_PROP_FPS) or 25.0  # 25 when FPS reads as 0
+            step = max(int(round(source_fps / self.sample_fps)), 1)
+            start_frame = int(start * source_fps)
+            end_frame = int(end * source_fps)
+            capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
+            for offset in range(end_frame - start_frame + 1):
+                ok, frame = capture.read()
+                if not ok:
+                    break
+                if offset % step == 0:
+                    frames.append(cv2.resize(frame, (320, 180), interpolation=cv2.INTER_AREA))
+        finally:
+            # Release the capture even when reading or resizing raises.
+            capture.release()
+        return frames
+
+    def _motion_score(self, frames: list[np.ndarray]) -> float:
+        """Return the mean 90th-percentile optical-flow magnitude / 3.5, capped at 1.0."""
+        scores: list[float] = []
+        previous = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
+        for frame in frames[1:]:
+            current = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            flow = cv2.calcOpticalFlowFarneback(
+                previous,
+                current,
+                None,
+                pyr_scale=0.5,
+                levels=2,
+                winsize=15,
+                iterations=2,
+                poly_n=5,
+                poly_sigma=1.1,
+                flags=0,
+            )
+            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
+            scores.append(float(np.percentile(magnitude, 90)))
+            previous = current
+        if not scores:
+            return 0.0
+        return min(1.0, float(np.mean(scores)) / 3.5)
+
+    def _face_shift_score(self, frames: list[np.ndarray]) -> float:
+        """Return 3x the 80th-percentile step between detected face centres, capped at 1.0."""
+        found = (self._detect_face_center(frame) for frame in frames)
+        centers = [center for center in found if center is not None]
+        if len(centers) < 2:
+            return 0.0
+        # Manhattan distance between consecutive detections.
+        deltas = [abs(x1 - x0) + abs(y1 - y0) for (x0, y0), (x1, y1) in zip(centers, centers[1:])]
+        return min(1.0, float(np.percentile(deltas, 80)) * 3.0)
+
+    def _detect_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
+        """Return the MediaPipe face centre, else the Haar-cascade one, else None."""
+        mediapipe_center = self._detect_mediapipe_face_center(frame)
+        if mediapipe_center is not None:
+            return mediapipe_center
+        return self._detect_haar_face_center(frame)
+
+    def _detect_mediapipe_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
+        """Return the first MediaPipe detection's box centre; None if unavailable or no face."""
+        if self._mediapipe_missing:
+            return None
+        try:
+            import mediapipe as mp
+        except ImportError:
+            self._mediapipe_missing = True
+            return None
+
+        if self._mediapipe_detector is None:
+            self._mediapipe_detector = mp.solutions.face_detection.FaceDetection(
+                model_selection=0,
+                min_detection_confidence=0.45,
+            )
+
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        result = self._mediapipe_detector.process(rgb)
+        if not result.detections:
+            return None
+
+        box = result.detections[0].location_data.relative_bounding_box
+        return (float(box.xmin + box.width / 2.0), float(box.ymin + box.height / 2.0))
+
+    def _detect_haar_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
+        """Return the largest Haar-detected face's centre as frame fractions, or None."""
+        if self._haar_detector is None:
+            cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
+            self._haar_detector = cv2.CascadeClassifier(cascade_path)
+
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        faces = self._haar_detector.detectMultiScale(
+            gray,
+            scaleFactor=1.1,
+            minNeighbors=5,
+            minSize=(24, 24),
+        )
+        if len(faces) == 0:
+            return None
+
+        x, y, width, height = max(faces, key=lambda face: face[2] * face[3])
+        frame_height, frame_width = gray.shape[:2]
+        return (
+            float((x + width / 2.0) / frame_width),
+            float((y + height / 2.0) / frame_height),
+        )
diff --git a/srt/applause_output.srt b/srt/applause_output.srt
new file mode 100644
index 0000000..f67a8b7
--- /dev/null
+++ b/srt/applause_output.srt
@@ -0,0 +1,3 @@
+1
+00:00:00,480 --> 00:00:24,960
+[applause]
diff --git a/srt/glass_output.srt b/srt/glass_output.srt
new file mode 100644
index 0000000..a20d2d9
--- /dev/null
+++ b/srt/glass_output.srt
@@ -0,0 +1,55 @@
+1
+00:00:07,200 --> 00:00:10,560
+[glass breaking]
+
+2
+00:00:07,680 --> 00:00:10,080
+[breaking]
+
+3
+00:00:08,640 --> 00:00:10,080
+[shatter]
+
+4
+00:00:12,000 --> 00:00:17,280
+[glass breaking]
+
+5
+00:00:12,000 --> 00:00:12,960
+[breaking]
+
+6
+00:00:12,000 --> 00:00:12,960
+[shatter]
+
+7
+00:00:13,920 --> 00:00:16,320
+[breaking]
+
+8
+00:00:13,920 --> 00:00:14,400
+[shatter]
+
+9
+00:00:15,360 --> 00:00:16,320
+[shatter]
+
+10
+00:00:18,720 --> 00:00:19,680
+[glass breaking]
+
+11
+00:00:18,720 --> 00:00:19,680
+[breaking]
+
+12
+00:00:18,720 --> 00:00:19,680
+[shatter]
+
+13
+00:00:21,600 --> 00:00:24,960
+[glass breaking]
+
+14
+00:00:21,600 --> 00:00:24,480
+[shatter]
diff --git a/srt/gun_output.srt b/srt/gun_output.srt
new file mode 100644
index 0000000..99e1a9a
--- /dev/null
+++ b/srt/gun_output.srt
@@ -0,0 +1,27 @@
+1
+00:00:05,760 --> 00:00:09,120
+[explosion]
+
+2
+00:00:05,760 --> 00:00:06,720
+[gunshot]
+
+3
+00:00:10,080 --> 00:00:23,520
+[explosion]
+
+4
+00:00:12,000 --> 00:00:18,720
+[gunshot]
+
+5
+00:00:18,240 --> 00:00:21,600
+[fireworks]
+
+6
+00:00:21,120 --> 00:00:23,040
+[gunshot]
+
+7
+00:00:22,560 --> 00:00:23,040
+[artillery fire]
diff --git a/srt/siren_output.srt b/srt/siren_output.srt
new file mode 100644
index 0000000..15b7143
--- /dev/null
+++ b/srt/siren_output.srt
@@ -0,0 +1,47 @@
+1
+00:00:00,960 --> 00:00:01,440
+[siren]
+
+2
+00:00:00,960 --> 00:00:01,440
+[alarm]
+
+3
+00:00:02,880 --> 00:00:07,680
+[alarm]
+
+4
+00:00:02,880 --> 00:00:07,680
+[siren]
+
+5
+00:00:08,640 --> 00:00:10,560
+[siren]
+
+6
+00:00:08,640 --> 00:00:10,560
+[alarm]
+
+7
+00:00:10,560 --> 00:00:12,000
+[music]
+
+8
+00:00:12,000 --> 00:00:16,320
+[siren]
+
+9
+00:00:12,000 --> 00:00:16,320
+[alarm]
+
+10
+00:00:16,320 --> 00:00:17,280
+[music]
+
+11
+00:00:17,280 --> 00:00:28,800
+[alarm]
+
+12
+00:00:17,280 --> 00:00:28,800
+[siren]
diff --git a/srt/smoke_output.srt b/srt/smoke_output.srt
new file mode 100644
index 0000000..643af20
--- /dev/null
+++ b/srt/smoke_output.srt
@@ -0,0 +1,11 @@
+1
+00:00:00,000 --> 00:00:00,480
+[music]
+
+2
+00:00:01,440 --> 00:00:35,040
+[music]
+
+3
+00:00:49,440 --> 00:00:50,880
+[music]
diff --git a/srt/test_sound_output.srt b/srt/test_sound_output.srt
new file mode 100644
index 0000000..518c855
--- /dev/null
+++ b/srt/test_sound_output.srt
@@ -0,0 +1,55 @@
+1
+00:00:00,000 --> 00:00:10,560
+[applause]
+
+2
+00:00:12,960 --> 00:00:21,120
+[honking]
+
+3
+00:00:22,080 --> 00:00:23,520
+[honking]
+
+4
+00:00:24,960 --> 00:00:29,280
+[glass breaking]
+
+5
+00:00:24,960 --> 00:00:25,440
+[breaking]
+
+6
+00:00:24,960 --> 00:00:25,440
+[shatter]
+
+7
+00:00:26,880 --> 00:00:29,280
+[breaking]
+
+8
+00:00:26,880 --> 00:00:27,360
+[shatter]
+
+9
+00:00:28,320 --> 00:00:29,280
+[shatter]
+
+10
+00:00:31,200 --> 00:00:32,640
+[glass breaking]
+
+11
+00:00:31,680 --> 00:00:32,640
+[breaking]
+
+12
+00:00:31,680 --> 00:00:32,640
+[shatter]
+
+13
+00:00:33,600 --> 00:00:43,200
+[explosion]
+
+14
+00:00:33,600 --> 00:00:38,400
+[gunshot]
diff --git a/tests/test_decision.py b/tests/test_decision.py
new file mode 100644
index 0000000..0dc6cb1
--- /dev/null
+++ b/tests/test_decision.py
@@ -0,0 +1,23 @@
+from intelligent_cc.decision import CaptionDecisionEngine
+from intelligent_cc.models import AudioEvent, ReactionSignal
+
+
+def test_accepts_event_when_audio_and_visual_scores_are_meaningful() -> None:
+    """A confident high-impact event with a clear reaction yields its bracketed caption."""
+    event = AudioEvent("glass breaking", 0.62, 3.0, 4.0)
+    reaction = ReactionSignal(0.55, 0.5, 0.6, 10)
+
+    suggestion = CaptionDecisionEngine().decide(event, reaction, 1)
+
+    assert suggestion is not None
+    assert suggestion.text == "[glass breaking]"
+
+
+def test_rejects_low_impact_event_without_reaction() -> None:
+    """Weak "music" audio with almost no visual reaction is rejected."""
+    event = AudioEvent("music", 0.28, 3.0, 4.0)
+    reaction = ReactionSignal(0.05, 0.04, 0.0, 10)
+
+    suggestion = CaptionDecisionEngine().decide(event, reaction, 1)
+
+    assert suggestion is None
diff --git a/tests/test_output.py b/tests/test_output.py
new file mode 100644
index 0000000..e6ca694
--- /dev/null
+++ b/tests/test_output.py
@@ -0,0 +1,32 @@
+from intelligent_cc.models import CaptionSuggestion
+from intelligent_cc.output import format_timestamp, write_srt
+
+
+def test_format_timestamp_uses_srt_millisecond_format() -> None:
+    assert format_timestamp(3723.456) == "01:02:03,456"  # 3723.456 s = 1 h 2 min 3.456 s
+
+
+def test_write_srt(tmp_path) -> None:
+    """A single cue is written as index, timing and text lines."""
+    honking = CaptionSuggestion(
+        index=1,
+        label="honking",
+        text="[honking]",
+        start=1.0,
+        end=2.25,
+        audio_confidence=0.9,
+        reaction_confidence=0.7,
+        decision_score=0.8,
+        reason="audio+visual",
+    )
+    output = tmp_path / "captions.srt"
+
+    write_srt([honking], output)
+
+    # Each cue line is newline-terminated; a lone cue has no trailing blank line.
+    expected = (
+        "1\n"
+        "00:00:01,000 --> 00:00:02,250\n"
+        "[honking]\n"
+    )
+    assert output.read_text(encoding="utf-8") == expected
diff --git a/videos/applause.mp4 b/videos/applause.mp4
new file mode 100644
index 0000000..06e887d
Binary files /dev/null and b/videos/applause.mp4 differ
diff --git a/videos/glass.mp4 b/videos/glass.mp4
new file mode 100644
index 0000000..301be9c
Binary files /dev/null and b/videos/glass.mp4 differ
diff --git a/videos/gun.mp4 b/videos/gun.mp4
new file mode 100644
index 0000000..16dfc25
Binary files /dev/null and b/videos/gun.mp4 differ
diff --git a/videos/honking.mp4 b/videos/honking.mp4
new file mode 100644
index 0000000..cae377a
Binary files /dev/null and b/videos/honking.mp4 differ
diff --git a/videos/siren.mp4 b/videos/siren.mp4
new file mode 100644
index 0000000..36a1bb1
Binary files /dev/null and b/videos/siren.mp4 differ
diff --git a/videos/smoke.mp4 b/videos/smoke.mp4
new file mode 100644
index 0000000..b2e4bad
Binary files /dev/null and b/videos/smoke.mp4 differ