diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b11fd1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +.venv/ +__pycache__/ +*.py[cod] +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +*.srt +*.sls +*.wav +*.log + +models/ +.cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..6ab4371 --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# Intelligent Closed Caption Suggestion Tool + +AI-assisted backend pipeline for finding meaningful non-speech moments in a video and exporting closed-caption suggestions as SRT or SLS. The pipeline combines: + +- YAMNet sound event detection for non-speech audio events. +- OpenCV frame sampling and optical-flow motion analysis. +- Face-position shift detection using MediaPipe when installed, with an OpenCV Haar-cascade fallback in the default setup. +- A decision engine that avoids captioning low-impact ambient sounds unless the audio event and scene reaction justify it. + +## Python And Dependencies + +Use Python `3.10.x`. The project pins `>=3.10,<3.11` because this machine has Python 3.10 installed and TensorFlow `2.10.x` provides compatible native Windows wheels for that runtime. + +The app uses `imageio-ffmpeg` to provide an FFmpeg executable, so you do not need a separate system FFmpeg install for normal CLI use. + +```powershell +py -3.10 -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip +pip install -e . +``` + +YAMNet is loaded from TensorFlow Hub on first use, so the first run needs internet access to download the model cache. + +## Usage + +```powershell +intelligent-cc video.mp4 -o output.srt +``` + +For structured JSON-style output: + +```powershell +intelligent-cc video.mp4 --format sls -o output.sls +``` + +Useful tuning flags: + +```powershell +intelligent-cc video.mp4 --audio-threshold 0.30 --decision-threshold 0.55 --max-events 20 +``` + +## Pipeline + +1. Extract mono 16 kHz audio from the input video with FFmpeg. +2. 
Run YAMNet and keep captionable non-speech classes such as honking, glass breaking, alarms, applause, explosions, sirens, laughter, and music. +3. Merge adjacent detections into timestamped audio events with confidence scores. +4. Sample video frames around each event timestamp. +5. Score visible reaction using optical flow and face-center movement (MediaPipe when installed, otherwise the OpenCV Haar-cascade fallback). +6. Combine audio confidence, visual reaction confidence, and high-impact label rules. +7. Export accepted suggestions as SRT captions like `[honking]`, or as SLS (JSON) records with the confidence scores and decision reason. + +## Development + +```powershell +pip install -e ".[dev]" +pytest +``` + +## Notes + +- The included `video.mp4` can be used for a smoke test after dependencies are installed. +- The reaction detector is intentionally conservative: routine background sounds are rejected unless paired with visible motion/reaction or a high-impact audio label. +- For production review workflows, keep rejected events as diagnostic metadata by using the Python API and inspecting `PipelineResult.audio_events`. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..74fb913 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "intelligent-cc" +version = "0.1.0" +description = "AI-assisted closed-caption suggestion pipeline for meaningful non-speech events in video."
+readme = "README.md" +requires-python = ">=3.10,<3.11" +dependencies = [ + "click==8.1.7", + "ffmpeg-python==0.2.0", + "imageio-ffmpeg==0.4.9", + "librosa==0.10.1", + "numpy==1.26.4", + "opencv-python-headless==4.10.0.84", + "protobuf==3.19.6", + "scipy==1.10.1", + "tensorflow==2.10.1", + "tensorflow-hub==0.12.0", +] + +[project.optional-dependencies] +dev = [ + "pytest==8.2.2", +] + +[project.scripts] +intelligent-cc = "intelligent_cc.cli:main" + +[tool.setuptools.package-dir] +"" = "src" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..917e846 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +click==8.1.7 +ffmpeg-python==0.2.0 +imageio-ffmpeg==0.4.9 +librosa==0.10.1 +numpy==1.26.4 +opencv-python-headless==4.10.0.84 +protobuf==3.19.6 +scipy==1.10.1 +tensorflow==2.10.1 +tensorflow-hub==0.12.0 diff --git a/sls/applause_output.sls b/sls/applause_output.sls new file mode 100644 index 0000000..42045ca --- /dev/null +++ b/sls/applause_output.sls @@ -0,0 +1,13 @@ +[ + { + "index": 1, + "label": "applause", + "text": "[applause]", + "start": 0.48, + "end": 24.96, + "audio_confidence": 0.9995602965354919, + "reaction_confidence": 0.65, + "decision_score": 0.8422581630945207, + "reason": "audio+visual" + } +] \ No newline at end of file diff --git a/sls/glass_output.sls b/sls/glass_output.sls new file mode 100644 index 0000000..c18cbaa --- /dev/null +++ b/sls/glass_output.sls @@ -0,0 +1,156 @@ +[ + { + "index": 1, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 7.199999999999999, + "end": 10.56, + "audio_confidence": 0.9861820936203003, + "reaction_confidence": 0.65, + "decision_score": 0.9549001514911651, + "reason": "audio+visual" + }, + { + "index": 2, + "label": "breaking", + "text": "[breaking]", + "start": 7.68, + "end": 10.08, + "audio_confidence": 0.9511262774467468, + 
"reaction_confidence": 0.65, + "decision_score": 0.8156194525957108, + "reason": "audio+visual" + }, + { + "index": 3, + "label": "shatter", + "text": "[shatter]", + "start": 8.64, + "end": 10.08, + "audio_confidence": 0.9510408043861389, + "reaction_confidence": 0.65, + "decision_score": 0.8155724424123765, + "reason": "audio+visual" + }, + { + "index": 4, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 12.0, + "end": 17.28, + "audio_confidence": 0.9526554942131042, + "reaction_confidence": 0.65, + "decision_score": 0.9364605218172074, + "reason": "audio+visual" + }, + { + "index": 5, + "label": "breaking", + "text": "[breaking]", + "start": 12.0, + "end": 12.96, + "audio_confidence": 0.8753526210784912, + "reaction_confidence": 0.65, + "decision_score": 0.7739439415931703, + "reason": "audio+visual" + }, + { + "index": 6, + "label": "shatter", + "text": "[shatter]", + "start": 12.0, + "end": 12.96, + "audio_confidence": 0.6574118137359619, + "reaction_confidence": 0.65, + "decision_score": 0.6540764975547791, + "reason": "audio+visual" + }, + { + "index": 7, + "label": "breaking", + "text": "[breaking]", + "start": 13.92, + "end": 16.32, + "audio_confidence": 0.9251334071159363, + "reaction_confidence": 0.65, + "decision_score": 0.8013233739137651, + "reason": "audio+visual" + }, + { + "index": 8, + "label": "shatter", + "text": "[shatter]", + "start": 13.92, + "end": 14.4, + "audio_confidence": 0.6074072122573853, + "reaction_confidence": 0.65, + "decision_score": 0.626573966741562, + "reason": "audio+visual" + }, + { + "index": 9, + "label": "shatter", + "text": "[shatter]", + "start": 15.36, + "end": 16.32, + "audio_confidence": 0.8594706654548645, + "reaction_confidence": 0.65, + "decision_score": 0.7652088660001756, + "reason": "audio+visual" + }, + { + "index": 10, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 18.72, + "end": 19.68, + "audio_confidence": 0.8928834795951843, + "reaction_confidence": 0.65, + 
"decision_score": 0.9035859137773515, + "reason": "audio+visual" + }, + { + "index": 11, + "label": "breaking", + "text": "[breaking]", + "start": 18.72, + "end": 19.68, + "audio_confidence": 0.8732807040214539, + "reaction_confidence": 0.65, + "decision_score": 0.7728043872117997, + "reason": "audio+visual" + }, + { + "index": 12, + "label": "shatter", + "text": "[shatter]", + "start": 18.72, + "end": 19.68, + "audio_confidence": 0.8596591949462891, + "reaction_confidence": 0.65, + "decision_score": 0.7653125572204591, + "reason": "audio+visual" + }, + { + "index": 13, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 21.599999999999998, + "end": 24.96, + "audio_confidence": 0.9833531975746155, + "reaction_confidence": 0.46931650208883047, + "decision_score": 0.8720366846060122, + "reason": "audio+visual" + }, + { + "index": 14, + "label": "shatter", + "text": "[shatter]", + "start": 21.599999999999998, + "end": 24.48, + "audio_confidence": 0.9815574884414673, + "reaction_confidence": 0.4739797139495354, + "decision_score": 0.7531474899200981, + "reason": "audio+visual" + } +] \ No newline at end of file diff --git a/sls/gun_output.sls b/sls/gun_output.sls new file mode 100644 index 0000000..f805f92 --- /dev/null +++ b/sls/gun_output.sls @@ -0,0 +1,79 @@ +[ + { + "index": 1, + "label": "explosion", + "text": "[explosion]", + "start": 5.76, + "end": 9.120000000000001, + "audio_confidence": 0.7828652858734131, + "reaction_confidence": 0.24199659019974723, + "decision_score": 0.6594743728202636, + "reason": "audio+visual" + }, + { + "index": 2, + "label": "gunshot", + "text": "[gunshot]", + "start": 5.76, + "end": 6.720000000000001, + "audio_confidence": 0.5234716534614563, + "reaction_confidence": 0.38832228271901104, + "decision_score": 0.5826544366273559, + "reason": "audio+visual" + }, + { + "index": 3, + "label": "explosion", + "text": "[explosion]", + "start": 10.08, + "end": 23.52, + "audio_confidence": 0.9754327535629272, + 
"reaction_confidence": 0.45062498595623396, + "decision_score": 0.8592692581399153, + "reason": "audio+visual" + }, + { + "index": 4, + "label": "gunshot", + "text": "[gunshot]", + "start": 12.0, + "end": 18.72, + "audio_confidence": 0.9698405265808105, + "reaction_confidence": 0.4639640204182693, + "decision_score": 0.862196098807667, + "reason": "audio+visual" + }, + { + "index": 5, + "label": "fireworks", + "text": "[fireworks]", + "start": 18.24, + "end": 21.599999999999998, + "audio_confidence": 0.8429054021835327, + "reaction_confidence": 0.19450016282831448, + "decision_score": 0.5511230444736845, + "reason": "audio+visual" + }, + { + "index": 6, + "label": "gunshot", + "text": "[gunshot]", + "start": 21.119999999999997, + "end": 23.04, + "audio_confidence": 0.8048539757728577, + "reaction_confidence": 0.6408123896081255, + "decision_score": 0.8510352619987281, + "reason": "audio+visual" + }, + { + "index": 7, + "label": "artillery fire", + "text": "[artillery fire]", + "start": 22.56, + "end": 23.04, + "audio_confidence": 0.6131650805473328, + "reaction_confidence": 0.65, + "decision_score": 0.6297407943010331, + "reason": "audio+visual" + } +] \ No newline at end of file diff --git a/sls/honking_output.sls b/sls/honking_output.sls new file mode 100644 index 0000000..83f969f --- /dev/null +++ b/sls/honking_output.sls @@ -0,0 +1,79 @@ +[ + { + "index": 1, + "label": "honking", + "text": "[honking]", + "start": 2.4, + "end": 12.48, + "audio_confidence": 0.9964298009872437, + "reaction_confidence": 0.65, + "decision_score": 0.9605363905429841, + "reason": "audio+visual" + }, + { + "index": 2, + "label": "alarm", + "text": "[alarm]", + "start": 10.08, + "end": 10.56, + "audio_confidence": 0.3690028488636017, + "reaction_confidence": 0.65, + "decision_score": 0.6154515668749809, + "reason": "audio+visual" + }, + { + "index": 3, + "label": "honking", + "text": "[honking]", + "start": 15.36, + "end": 18.24, + "audio_confidence": 0.531771719455719, + 
"reaction_confidence": 0.65, + "decision_score": 0.7049744457006455, + "reason": "audio+visual" + }, + { + "index": 4, + "label": "honking", + "text": "[honking]", + "start": 24.0, + "end": 25.919999999999998, + "audio_confidence": 0.9008418917655945, + "reaction_confidence": 0.65, + "decision_score": 0.9079630404710771, + "reason": "audio+visual" + }, + { + "index": 5, + "label": "honking", + "text": "[honking]", + "start": 28.799999999999997, + "end": 29.279999999999998, + "audio_confidence": 0.7434176802635193, + "reaction_confidence": 0.6412603682918208, + "decision_score": 0.8174468898762549, + "reason": "audio+visual" + }, + { + "index": 6, + "label": "honking", + "text": "[honking]", + "start": 31.2, + "end": 41.279999999999994, + "audio_confidence": 0.9966819882392883, + "reaction_confidence": 0.5461075806537908, + "decision_score": 0.9139235048258145, + "reason": "audio+visual" + }, + { + "index": 7, + "label": "alarm", + "text": "[alarm]", + "start": 40.32, + "end": 40.8, + "audio_confidence": 0.46439415216445923, + "reaction_confidence": 0.0, + "decision_score": 0.3754167836904526, + "reason": "high-impact-audio" + } +] \ No newline at end of file diff --git a/sls/siren_output.sls b/sls/siren_output.sls new file mode 100644 index 0000000..3ddb772 --- /dev/null +++ b/sls/siren_output.sls @@ -0,0 +1,134 @@ +[ + { + "index": 1, + "label": "siren", + "text": "[siren]", + "start": 0.96, + "end": 1.44, + "audio_confidence": 0.8404152989387512, + "reaction_confidence": 0.011924344025995747, + "decision_score": 0.5875943692280112, + "reason": "high-impact-audio" + }, + { + "index": 2, + "label": "alarm", + "text": "[alarm]", + "start": 0.96, + "end": 1.44, + "audio_confidence": 0.45202356576919556, + "reaction_confidence": 0.011924344025995747, + "decision_score": 0.3739789159847557, + "reason": "high-impact-audio" + }, + { + "index": 3, + "label": "alarm", + "text": "[alarm]", + "start": 2.88, + "end": 7.68, + "audio_confidence": 0.9621760845184326, + 
"reaction_confidence": 0.5794830769953337, + "decision_score": 0.9099642311330381, + "reason": "audio+visual" + }, + { + "index": 4, + "label": "siren", + "text": "[siren]", + "start": 2.88, + "end": 7.68, + "audio_confidence": 0.9531972408294678, + "reaction_confidence": 0.5794830769953337, + "decision_score": 0.9050258671041075, + "reason": "audio+visual" + }, + { + "index": 5, + "label": "siren", + "text": "[siren]", + "start": 8.64, + "end": 10.56, + "audio_confidence": 0.9823512434959412, + "reaction_confidence": 0.65, + "decision_score": 0.9527931839227678, + "reason": "audio+visual" + }, + { + "index": 6, + "label": "alarm", + "text": "[alarm]", + "start": 8.64, + "end": 10.56, + "audio_confidence": 0.9796193838119507, + "reaction_confidence": 0.65, + "decision_score": 0.9512906610965729, + "reason": "audio+visual" + }, + { + "index": 7, + "label": "music", + "text": "[music]", + "start": 10.559999999999999, + "end": 12.0, + "audio_confidence": 0.9774209856987, + "reaction_confidence": 0.021797618726838597, + "decision_score": 0.5473904705613624, + "reason": "audio+visual" + }, + { + "index": 8, + "label": "siren", + "text": "[siren]", + "start": 12.0, + "end": 16.32, + "audio_confidence": 0.9961757063865662, + "reaction_confidence": 0.02027477735210032, + "decision_score": 0.6770202883210565, + "reason": "high-impact-audio" + }, + { + "index": 9, + "label": "alarm", + "text": "[alarm]", + "start": 12.0, + "end": 16.32, + "audio_confidence": 0.9948476552963257, + "reaction_confidence": 0.02027477735210032, + "decision_score": 0.6762898602214243, + "reason": "high-impact-audio" + }, + { + "index": 10, + "label": "music", + "text": "[music]", + "start": 16.32, + "end": 17.28, + "audio_confidence": 0.9585652947425842, + "reaction_confidence": 0.018021383470899995, + "decision_score": 0.5353205346703264, + "reason": "audio+visual" + }, + { + "index": 11, + "label": "alarm", + "text": "[alarm]", + "start": 17.28, + "end": 28.8, + "audio_confidence": 
0.9847885370254517, + "reaction_confidence": 0.086813772000233, + "decision_score": 0.7006998927641033, + "reason": "high-impact-audio" + }, + { + "index": 12, + "label": "siren", + "text": "[siren]", + "start": 17.28, + "end": 28.8, + "audio_confidence": 0.9837005138397217, + "reaction_confidence": 0.086813772000233, + "decision_score": 0.7001014800119518, + "reason": "high-impact-audio" + } +] \ No newline at end of file diff --git a/sls/smoke_output.sls b/sls/smoke_output.sls new file mode 100644 index 0000000..4917ba5 --- /dev/null +++ b/sls/smoke_output.sls @@ -0,0 +1,35 @@ +[ + { + "index": 1, + "label": "music", + "text": "[music]", + "start": 0.0, + "end": 0.48, + "audio_confidence": 0.9944783449172974, + "reaction_confidence": 0.65, + "decision_score": 0.8394630897045137, + "reason": "audio+visual" + }, + { + "index": 2, + "label": "music", + "text": "[music]", + "start": 1.44, + "end": 35.04, + "audio_confidence": 0.9990421533584595, + "reaction_confidence": 0.9930364583333335, + "decision_score": 0.9963395905971528, + "reason": "audio+visual" + }, + { + "index": 3, + "label": "music", + "text": "[music]", + "start": 49.44, + "end": 50.879999999999995, + "audio_confidence": 0.7439404129981995, + "reaction_confidence": 0.65, + "decision_score": 0.7016672271490098, + "reason": "audio+visual" + } +] \ No newline at end of file diff --git a/sls/test_sound_output.sls b/sls/test_sound_output.sls new file mode 100644 index 0000000..ff83c15 --- /dev/null +++ b/sls/test_sound_output.sls @@ -0,0 +1,156 @@ +[ + { + "index": 1, + "label": "applause", + "text": "[applause]", + "start": 0.0, + "end": 10.56, + "audio_confidence": 0.999273419380188, + "reaction_confidence": 0.65, + "decision_score": 0.8421003806591034, + "reason": "audio+visual" + }, + { + "index": 2, + "label": "honking", + "text": "[honking]", + "start": 12.959999999999999, + "end": 21.12, + "audio_confidence": 0.9998518824577332, + "reaction_confidence": 0.65, + "decision_score": 0.9624185353517533, + 
"reason": "audio+visual" + }, + { + "index": 3, + "label": "honking", + "text": "[honking]", + "start": 22.08, + "end": 23.52, + "audio_confidence": 0.9345638751983643, + "reaction_confidence": 0.4904459294825792, + "decision_score": 0.854710799626261, + "reason": "audio+visual" + }, + { + "index": 4, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 24.96, + "end": 29.279999999999998, + "audio_confidence": 0.9425867199897766, + "reaction_confidence": 0.65, + "decision_score": 0.9309226959943772, + "reason": "audio+visual" + }, + { + "index": 5, + "label": "breaking", + "text": "[breaking]", + "start": 24.96, + "end": 25.44, + "audio_confidence": 0.7915558218955994, + "reaction_confidence": 0.65, + "decision_score": 0.7278557020425798, + "reason": "audio+visual" + }, + { + "index": 6, + "label": "shatter", + "text": "[shatter]", + "start": 24.96, + "end": 25.44, + "audio_confidence": 0.4363442063331604, + "reaction_confidence": 0.65, + "decision_score": 0.5324893134832382, + "reason": "audio+visual" + }, + { + "index": 7, + "label": "breaking", + "text": "[breaking]", + "start": 26.88, + "end": 29.279999999999998, + "audio_confidence": 0.9276607036590576, + "reaction_confidence": 0.65, + "decision_score": 0.8027133870124818, + "reason": "audio+visual" + }, + { + "index": 8, + "label": "shatter", + "text": "[shatter]", + "start": 26.88, + "end": 27.36, + "audio_confidence": 0.4864293038845062, + "reaction_confidence": 0.65, + "decision_score": 0.5600361171364785, + "reason": "audio+visual" + }, + { + "index": 9, + "label": "shatter", + "text": "[shatter]", + "start": 28.32, + "end": 29.279999999999998, + "audio_confidence": 0.8208039402961731, + "reaction_confidence": 0.65, + "decision_score": 0.7439421671628953, + "reason": "audio+visual" + }, + { + "index": 10, + "label": "glass breaking", + "text": "[glass breaking]", + "start": 31.2, + "end": 32.63999999999999, + "audio_confidence": 0.8777748942375183, + "reaction_confidence": 0.65, + 
"decision_score": 0.8952761918306352, + "reason": "audio+visual" + }, + { + "index": 11, + "label": "breaking", + "text": "[breaking]", + "start": 31.68, + "end": 32.63999999999999, + "audio_confidence": 0.8007316589355469, + "reaction_confidence": 0.65, + "decision_score": 0.7329024124145509, + "reason": "audio+visual" + }, + { + "index": 12, + "label": "shatter", + "text": "[shatter]", + "start": 31.68, + "end": 32.63999999999999, + "audio_confidence": 0.7463002800941467, + "reaction_confidence": 0.65, + "decision_score": 0.7029651540517807, + "reason": "audio+visual" + }, + { + "index": 13, + "label": "explosion", + "text": "[explosion]", + "start": 33.6, + "end": 43.199999999999996, + "audio_confidence": 0.9554585814476013, + "reaction_confidence": 0.4912832531764888, + "decision_score": 0.8665796837256007, + "reason": "audio+visual" + }, + { + "index": 14, + "label": "gunshot", + "text": "[gunshot]", + "start": 33.6, + "end": 38.4, + "audio_confidence": 0.9355015754699707, + "reaction_confidence": 0.41342752319574355, + "decision_score": 0.8205682519465686, + "reason": "audio+visual" + } +] \ No newline at end of file diff --git a/src/intelligent_cc.egg-info/PKG-INFO b/src/intelligent_cc.egg-info/PKG-INFO new file mode 100644 index 0000000..d20229e --- /dev/null +++ b/src/intelligent_cc.egg-info/PKG-INFO @@ -0,0 +1,83 @@ +Metadata-Version: 2.4 +Name: intelligent-cc +Version: 0.1.0 +Summary: AI-assisted closed-caption suggestion pipeline for meaningful non-speech events in video. 
+Requires-Python: <3.11,>=3.10 +Description-Content-Type: text/markdown +Requires-Dist: click==8.1.7 +Requires-Dist: ffmpeg-python==0.2.0 +Requires-Dist: imageio-ffmpeg==0.4.9 +Requires-Dist: librosa==0.10.1 +Requires-Dist: numpy==1.26.4 +Requires-Dist: opencv-python-headless==4.10.0.84 +Requires-Dist: protobuf==3.19.6 +Requires-Dist: scipy==1.10.1 +Requires-Dist: tensorflow==2.10.1 +Requires-Dist: tensorflow-hub==0.12.0 +Provides-Extra: dev +Requires-Dist: pytest==8.2.2; extra == "dev" + +# Intelligent Closed Caption Suggestion Tool + +AI-assisted backend pipeline for finding meaningful non-speech moments in a video and exporting closed-caption suggestions as SRT or SLS. The pipeline combines: + +- YAMNet sound event detection for non-speech audio events. +- OpenCV frame sampling and optical-flow motion analysis. +- MediaPipe face detection for visible head/face-position shifts. +- A decision engine that avoids captioning low-impact ambient sounds unless the audio event and scene reaction justify it. + +## Python And Dependencies + +Use Python `3.10.x`. The project pins `>=3.10,<3.11` because this machine has Python 3.10 installed and TensorFlow `2.10.x` provides compatible native Windows wheels for that runtime. + +The app uses `imageio-ffmpeg` to provide an FFmpeg executable, so you do not need a separate system FFmpeg install for normal CLI use. + +```powershell +py -3.10 -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip +pip install -e . +``` + +YAMNet is loaded from TensorFlow Hub on first use, so the first run needs internet access to download the model cache. + +## Usage + +```powershell +intelligent-cc video.mp4 -o output.srt +``` + +For structured JSON-style output: + +```powershell +intelligent-cc video.mp4 --format sls -o output.sls +``` + +Useful tuning flags: + +```powershell +intelligent-cc video.mp4 --audio-threshold 0.30 --decision-threshold 0.55 --max-events 20 +``` + +## Pipeline + +1. 
Extract mono 16 kHz audio from the input video with FFmpeg. +2. Run YAMNet and keep captionable non-speech classes such as honking, glass breaking, alarms, applause, explosions, sirens, laughter, and music. +3. Merge adjacent detections into timestamped audio events with confidence scores. +4. Sample video frames around each event timestamp. +5. Score visible reaction using optical flow and MediaPipe face-center movement. +6. Combine audio confidence, visual reaction confidence, and high-impact label rules. +7. Export accepted suggestions as SRT captions like `[honking]`. + +## Development + +```powershell +pip install -e ".[dev]" +pytest +``` + +## Notes + +- The included `video.mp4` can be used for a smoke test after dependencies are installed. +- The reaction detector is intentionally conservative: routine background sounds are rejected unless paired with visible motion/reaction or a high-impact audio label. +- For production review workflows, keep rejected events as diagnostic metadata by using the Python API and inspecting `PipelineResult.audio_events`. 
diff --git a/src/intelligent_cc.egg-info/SOURCES.txt b/src/intelligent_cc.egg-info/SOURCES.txt new file mode 100644 index 0000000..02fdf89 --- /dev/null +++ b/src/intelligent_cc.egg-info/SOURCES.txt @@ -0,0 +1,18 @@ +README.md +pyproject.toml +src/intelligent_cc/__init__.py +src/intelligent_cc/audio.py +src/intelligent_cc/cli.py +src/intelligent_cc/decision.py +src/intelligent_cc/models.py +src/intelligent_cc/output.py +src/intelligent_cc/pipeline.py +src/intelligent_cc/vision.py +src/intelligent_cc.egg-info/PKG-INFO +src/intelligent_cc.egg-info/SOURCES.txt +src/intelligent_cc.egg-info/dependency_links.txt +src/intelligent_cc.egg-info/entry_points.txt +src/intelligent_cc.egg-info/requires.txt +src/intelligent_cc.egg-info/top_level.txt +tests/test_decision.py +tests/test_output.py \ No newline at end of file diff --git a/src/intelligent_cc.egg-info/dependency_links.txt b/src/intelligent_cc.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/intelligent_cc.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/intelligent_cc.egg-info/entry_points.txt b/src/intelligent_cc.egg-info/entry_points.txt new file mode 100644 index 0000000..359c3d7 --- /dev/null +++ b/src/intelligent_cc.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +intelligent-cc = intelligent_cc.cli:main diff --git a/src/intelligent_cc.egg-info/requires.txt b/src/intelligent_cc.egg-info/requires.txt new file mode 100644 index 0000000..ff0f514 --- /dev/null +++ b/src/intelligent_cc.egg-info/requires.txt @@ -0,0 +1,13 @@ +click==8.1.7 +ffmpeg-python==0.2.0 +imageio-ffmpeg==0.4.9 +librosa==0.10.1 +numpy==1.26.4 +opencv-python-headless==4.10.0.84 +protobuf==3.19.6 +scipy==1.10.1 +tensorflow==2.10.1 +tensorflow-hub==0.12.0 + +[dev] +pytest==8.2.2 diff --git a/src/intelligent_cc.egg-info/top_level.txt b/src/intelligent_cc.egg-info/top_level.txt new file mode 100644 index 0000000..27ffce2 --- /dev/null +++ 
YAMNET_URL = "https://tfhub.dev/google/yamnet/1"
SAMPLE_RATE = 16000
# NOTE(review): patch and hop are both 0.48 s, so consecutive score frames tile
# the timeline without overlap. YAMNet's published analysis window is longer
# than its hop -- confirm that attributing each score to one hop is intended.
YAMNET_PATCH_SECONDS = 0.48
YAMNET_HOP_SECONDS = 0.48

# Lower-case substrings; a YAMNet display name containing any of them is
# treated as a captionable non-speech sound (see ``_is_captionable_audio``).
NOISE_LABEL_HINTS = {
    "alarm",
    "applause",
    "bang",
    "bell",
    "breaking",
    "cheering",
    "clap",
    "crash",
    "cry",
    "door",
    "explosion",
    "fire",
    "glass",
    "gunshot",
    "horn",
    "laughter",
    "music",
    "scream",
    "shatter",
    "siren",
    "thunder",
    "vehicle",
}

# Substring -> caption label, tested in insertion order (first match wins).
# "french horn" must precede the generic "horn" needle: otherwise the musical
# instrument would be captioned as "[honking]" and treated as high-impact.
LABEL_NORMALIZATION = {
    "french horn": "music",
    "air horn, truck horn": "honking",
    "car horn, truck horn": "honking",
    "horn": "honking",
    "glass": "glass breaking",
    "glass shatter": "glass breaking",
    "gunshot, gunfire": "gunshot",
    "music": "music",
    "applause": "applause",
    "cheering": "crowd cheering",
    "siren": "siren",
    "alarm": "alarm",
}


class AudioEventDetector:
    """YAMNet-based non-speech event detector.

    Per-frame class scores are filtered by confidence and by
    ``NOISE_LABEL_HINTS``, normalised to caption labels, and merged per label
    into timestamped ``AudioEvent`` spans.
    """

    def __init__(
        self,
        confidence_threshold: float = 0.25,
        min_duration: float = 0.35,
        merge_gap: float = 0.6,
        max_events: int | None = None,
    ) -> None:
        """Store the tuning knobs; the model itself is loaded lazily.

        Args:
            confidence_threshold: Minimum per-frame class score to keep.
            min_duration: Merged events shorter than this (seconds) are dropped.
            merge_gap: Same-label events whose gap is at most this many seconds
                are fused into one event.
            max_events: Optional cap on the number of returned events.
        """
        self.confidence_threshold = confidence_threshold
        self.min_duration = min_duration
        self.merge_gap = merge_gap
        self.max_events = max_events
        self._yamnet = None
        self._class_names: list[str] | None = None

    def detect(self, video_path: Path) -> list[AudioEvent]:
        """Return the merged audio events found in ``video_path``.

        The audio track is extracted to a temporary WAV file, loaded as mono at
        ``SAMPLE_RATE``, scored, and merged. Events are ordered by start time
        (ties: highest confidence first) and truncated to ``max_events``.
        An empty waveform yields an empty list.
        """
        with tempfile.TemporaryDirectory(prefix="intelligent_cc_") as tmpdir:
            wav_path = Path(tmpdir) / "audio.wav"
            self._extract_audio(video_path, wav_path)
            # Load inside the ``with`` block: the file is deleted on exit.
            waveform, _ = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)

        if waveform.size == 0:
            return []

        scores = self._score_waveform(waveform)
        raw_events = self._scores_to_events(scores)
        events = self._merge_events(raw_events)
        events.sort(key=lambda event: (event.start, -event.confidence))
        if self.max_events is not None:
            events = events[: self.max_events]
        return events

    def _extract_audio(self, video_path: Path, wav_path: Path) -> None:
        """Write the mono ``SAMPLE_RATE`` audio track of ``video_path`` to ``wav_path``.

        Raises:
            RuntimeError: If ffmpeg fails; the message carries ffmpeg's stderr.
        """
        try:
            (
                ffmpeg.input(str(video_path))
                .output(str(wav_path), ac=1, ar=SAMPLE_RATE, format="wav", loglevel="error")
                .overwrite_output()
                .run(
                    cmd=imageio_ffmpeg.get_ffmpeg_exe(),
                    capture_stdout=True,
                    capture_stderr=True,
                )
            )
        except ffmpeg.Error as exc:
            detail = exc.stderr.decode("utf-8", errors="ignore") if exc.stderr else str(exc)
            raise RuntimeError(f"Unable to extract audio with ffmpeg: {detail}") from exc

    def _load_yamnet(self):
        """Load YAMNet and its class names on first use; return ``(model, names)``."""
        if self._yamnet is None:
            # Imported here so the dependency is only pulled in when scoring.
            import tensorflow_hub as hub

            self._yamnet = hub.load(YAMNET_URL)
            class_map_path = self._yamnet.class_map_path().numpy().decode("utf-8")
            with open(class_map_path, newline="", encoding="utf-8") as handle:
                reader = csv.DictReader(handle)
                self._class_names = [row["display_name"] for row in reader]
        return self._yamnet, self._class_names or []

    def _score_waveform(self, waveform: np.ndarray) -> np.ndarray:
        """Run the model on ``waveform`` (cast to float32) and return its scores array."""
        yamnet, _ = self._load_yamnet()
        scores, _, _ = yamnet(waveform.astype(np.float32))
        return scores.numpy()

    def _scores_to_events(self, scores: np.ndarray) -> list[AudioEvent]:
        """Turn per-frame class scores into one short event per kept class.

        For every frame the eight highest-scoring classes are considered; a
        class is kept when its score reaches ``confidence_threshold`` and its
        name is captionable.
        """
        _, class_names = self._load_yamnet()
        events: list[AudioEvent] = []
        for frame_index, frame_scores in enumerate(scores):
            top_indices = np.argsort(frame_scores)[-8:][::-1]
            # Round to milliseconds so binary float artefacts such as
            # 7.199999999999999 do not leak into the exported timestamps.
            start = round(frame_index * YAMNET_HOP_SECONDS, 3)
            end = round(start + YAMNET_PATCH_SECONDS, 3)
            for class_index in top_indices:
                label = class_names[int(class_index)]
                confidence = float(frame_scores[class_index])
                if confidence < self.confidence_threshold:
                    continue
                if not self._is_captionable_audio(label):
                    continue
                events.append(
                    AudioEvent(
                        label=self._normalize_label(label),
                        confidence=confidence,
                        start=start,
                        end=end,
                    )
                )
        return events

    def _is_long_enough(self, event) -> bool:
        """Return True when ``event`` lasts at least ``min_duration`` seconds."""
        return event.end - event.start >= self.min_duration

    def _merge_events(self, events: list[AudioEvent]) -> list[AudioEvent]:
        """Fuse same-label events separated by at most ``merge_gap`` seconds.

        A fused event spans from the first start to the latest end and carries
        the highest confidence of its members. Results shorter than
        ``min_duration`` are discarded. Output is grouped by label in order of
        first appearance; within a label it is ordered by start time.
        """
        from dataclasses import replace

        grouped: dict[str, list[AudioEvent]] = {}
        for event in events:
            grouped.setdefault(event.label, []).append(event)

        merged: list[AudioEvent] = []
        for label_events in grouped.values():
            # Every group holds at least one event, so indexing [0] is safe.
            label_events.sort(key=lambda event: event.start)
            current = label_events[0]
            for event in label_events[1:]:
                if event.start <= current.end + self.merge_gap:
                    # A duration-weighted mean can never exceed the larger of
                    # the two confidences, so the peak alone is the result.
                    current = replace(
                        current,
                        confidence=float(max(current.confidence, event.confidence)),
                        end=max(current.end, event.end),
                    )
                    continue
                if self._is_long_enough(current):
                    merged.append(current)
                current = event
            if self._is_long_enough(current):
                merged.append(current)
        return merged

    def _is_captionable_audio(self, label: str) -> bool:
        """Return True for non-speech labels containing a ``NOISE_LABEL_HINTS`` entry."""
        lowered = label.lower()
        if "speech" in lowered or "conversation" in lowered or "narration" in lowered:
            return False
        return any(hint in lowered for hint in NOISE_LABEL_HINTS)

    def _normalize_label(self, label: str) -> str:
        """Map a model display name to a caption label.

        The first ``LABEL_NORMALIZATION`` needle contained in the lower-cased
        name wins; otherwise the name is lower-cased with ``_`` and ``/``
        replaced by single spaces.
        """
        lowered = label.lower()
        for needle, replacement in LABEL_NORMALIZATION.items():
            if needle in lowered:
                return replacement
        cleaned = lowered.replace("_", " ").replace("/", " ")
        return " ".join(cleaned.split())
# Labels considered high impact: they receive a score boost and, with strong
# enough audio confidence, are accepted regardless of the decision threshold.
HIGH_IMPACT_LABELS = {
    "alarm",
    "explosion",
    "glass breaking",
    "gunshot",
    "honking",
    "scream",
    "siren",
}


class CaptionDecisionEngine:
    """Decide whether an audio event should become a caption suggestion.

    The decision score is the weighted sum of the audio confidence and the
    visual reaction confidence, plus a fixed 0.12 boost for labels listed in
    ``HIGH_IMPACT_LABELS``, capped at 1.0.
    """

    def __init__(
        self,
        audio_weight: float = 0.55,
        reaction_weight: float = 0.45,
        decision_threshold: float = 0.5,
        high_impact_audio_threshold: float = 0.45,
    ) -> None:
        """Store the scoring weights and the two acceptance thresholds."""
        self.audio_weight = audio_weight
        self.reaction_weight = reaction_weight
        self.decision_threshold = decision_threshold
        self.high_impact_audio_threshold = high_impact_audio_threshold

    def decide(
        self,
        event: AudioEvent,
        reaction: ReactionSignal,
        index: int,
    ) -> CaptionSuggestion | None:
        """Return a suggestion for *event*, or ``None`` when it is rejected.

        An event is rejected only when its score is below
        ``decision_threshold`` and it does not qualify for the high-impact
        bypass (a high-impact label whose audio confidence reaches
        ``high_impact_audio_threshold``).
        """
        is_high_impact = event.label in HIGH_IMPACT_LABELS
        weighted = (
            self.audio_weight * event.confidence
            + self.reaction_weight * reaction.confidence
            + (0.12 if is_high_impact else 0.0)
        )
        score = min(1.0, weighted)

        bypass = (
            is_high_impact
            and event.confidence >= self.high_impact_audio_threshold
        )
        if not bypass and score < self.decision_threshold:
            return None

        # The reason records whether the audio alone carried the decision:
        # a bypassing event with almost no visible reaction is audio-driven.
        if bypass and reaction.confidence < 0.2:
            reason = "high-impact-audio"
        else:
            reason = "audio+visual"

        return CaptionSuggestion(
            index=index,
            label=event.label,
            text=f"[{event.label}]",
            start=event.start,
            end=event.end,
            audio_confidence=event.confidence,
            reaction_confidence=reaction.confidence,
            decision_score=float(score),
            reason=reason,
        )
def write_srt(suggestions: list[CaptionSuggestion], output_path: Path) -> None:
    """Write *suggestions* to *output_path* as SRT cues in UTF-8.

    Missing parent directories are created first. Each cue is its index, a
    ``start --> end`` timing line and the caption text; cues are separated
    by one blank line. An empty list produces an empty file.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    cues = [
        f"{cue.index}\n"
        f"{format_timestamp(cue.start)} --> {format_timestamp(cue.end)}\n"
        f"{cue.text}\n"
        for cue in suggestions
    ]
    # Every cue already ends with a newline, so joining on "\n" yields the
    # blank separator line between consecutive cues.
    output_path.write_text("\n".join(cues), encoding="utf-8")
class IntelligentCCPipeline:
    """Run audio detection, visual reaction scoring and caption export.

    Each stage can be injected; any stage left as ``None`` is built with its
    default constructor.
    """

    def __init__(
        self,
        audio_detector: AudioEventDetector | None = None,
        reaction_detector: VisualReactionDetector | None = None,
        decision_engine: CaptionDecisionEngine | None = None,
    ) -> None:
        """Store the supplied stages, creating defaults for missing ones."""
        self.audio_detector = audio_detector or AudioEventDetector()
        self.reaction_detector = reaction_detector or VisualReactionDetector()
        self.decision_engine = decision_engine or CaptionDecisionEngine()

    def run(self, video_path: Path, output_path: Path, output_format: str = "srt") -> PipelineResult:
        """Process *video_path* and write accepted suggestions to *output_path*.

        Args:
            video_path: Video to analyse; resolved to an absolute path.
            output_path: Caption file to write; resolved to an absolute path.
            output_format: ``"srt"`` or ``"sls"``.

        Returns:
            A ``PipelineResult`` holding the resolved paths, every detected
            audio event and the accepted suggestions.

        Raises:
            FileNotFoundError: If the video does not exist.
            ValueError: If *output_format* is not supported.
        """
        video_path = video_path.resolve()
        output_path = output_path.resolve()
        if not video_path.exists():
            raise FileNotFoundError(video_path)
        # Reject an unknown format before the expensive audio/vision passes
        # instead of discovering it only when the results are written.
        if output_format not in ("srt", "sls"):
            raise ValueError(f"Unsupported output format: {output_format}")

        audio_events = self.audio_detector.detect(video_path)
        suggestions = []
        for event in audio_events:
            reaction = self.reaction_detector.score_event(video_path, event)
            # Suggestion indices are 1-based and count accepted events only.
            suggestion = self.decision_engine.decide(event, reaction, len(suggestions) + 1)
            if suggestion is not None:
                suggestions.append(suggestion)

        writer = write_srt if output_format == "srt" else write_sls
        writer(suggestions, output_path)

        return PipelineResult(
            video_path=video_path,
            audio_events=audio_events,
            suggestions=suggestions,
            output_path=output_path,
        )
class VisualReactionDetector:
    """Detects visible motion and face/head-position changes around an audio event."""

    def __init__(
        self,
        window_before: float = 0.75,
        window_after: float = 1.25,
        sample_fps: float = 6.0,
    ) -> None:
        """Configure the sampling window (seconds) and sampling rate.

        Args:
            window_before: Seconds of video inspected before the event start.
            window_after: Seconds of video inspected after the event end.
            sample_fps: Target number of sampled frames per second.
        """
        self.window_before = window_before
        self.window_after = window_after
        self.sample_fps = sample_fps
        # Face detectors are created lazily on first use.
        self._mediapipe_detector = None
        self._haar_detector = None
        # Set once ``import mediapipe`` has failed so the failing import is
        # not retried for every sampled frame.
        self._mediapipe_unavailable = False

    def score_event(self, video_path: Path, event: AudioEvent) -> ReactionSignal:
        """Score the visible reaction around *event*.

        Frames are sampled from ``event.start - window_before`` (clamped at
        0) to ``event.end + window_after``. With fewer than two frames the
        signal is all zeros. Otherwise the confidence is
        ``0.65 * motion + 0.35 * face shift``, capped at 1.0.
        """
        frames = self._sample_frames(
            video_path,
            max(event.start - self.window_before, 0.0),
            event.end + self.window_after,
        )
        if len(frames) < 2:
            return ReactionSignal(0.0, 0.0, 0.0, len(frames))

        motion_score = self._motion_score(frames)
        face_shift_score = self._face_shift_score(frames)
        confidence = min(1.0, 0.65 * motion_score + 0.35 * face_shift_score)
        return ReactionSignal(
            confidence=float(confidence),
            motion_score=float(motion_score),
            face_shift_score=float(face_shift_score),
            frame_count=len(frames),
        )

    def _sample_frames(self, video_path: Path, start: float, end: float) -> list[np.ndarray]:
        """Return frames between *start* and *end* seconds, resized to 320x180.

        Roughly ``sample_fps`` frames per second are kept by taking every
        ``step``-th frame. A reported frame rate of 0 falls back to 25 fps.

        Raises:
            RuntimeError: If the video cannot be opened.
        """
        capture = cv2.VideoCapture(str(video_path))
        try:
            if not capture.isOpened():
                raise RuntimeError(f"Unable to open video: {video_path}")

            source_fps = capture.get(cv2.CAP_PROP_FPS) or 25.0
            step = max(int(round(source_fps / self.sample_fps)), 1)
            start_frame = int(start * source_fps)
            end_frame = int(end * source_fps)
            capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            frames: list[np.ndarray] = []
            for frame_number in range(start_frame, end_frame + 1):
                ok, frame = capture.read()
                if not ok:
                    # End of stream or a decode failure: keep what we have.
                    break
                if (frame_number - start_frame) % step == 0:
                    frames.append(cv2.resize(frame, (320, 180), interpolation=cv2.INTER_AREA))
            return frames
        finally:
            # Always free the decoder, including when reading/resizing raises.
            capture.release()

    def _motion_score(self, frames: list[np.ndarray]) -> float:
        """Return a 0..1 motion score from dense optical flow.

        For each consecutive frame pair the 90th percentile of the Farneback
        flow magnitude is taken; the mean over all pairs is divided by 3.5
        and capped at 1.0.
        """
        scores: list[float] = []
        previous = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
        for frame in frames[1:]:
            current = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            flow = cv2.calcOpticalFlowFarneback(
                previous,
                current,
                None,
                pyr_scale=0.5,
                levels=2,
                winsize=15,
                iterations=2,
                poly_n=5,
                poly_sigma=1.1,
                flags=0,
            )
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            scores.append(float(np.percentile(magnitude, 90)))
            previous = current
        if not scores:
            return 0.0
        return min(1.0, float(np.mean(scores)) / 3.5)

    def _face_shift_score(self, frames: list[np.ndarray]) -> float:
        """Return a 0..1 score for how far the detected face centre moves.

        Frames without a detected face are skipped. The score is the 80th
        percentile of the L1 distance between successive detected centres,
        multiplied by 3.0 and capped at 1.0; fewer than two detections
        give 0.0.
        """
        detected = (self._detect_face_center(frame) for frame in frames)
        centers = [center for center in detected if center is not None]
        if len(centers) < 2:
            return 0.0

        deltas = [
            abs(current[0] - previous[0]) + abs(current[1] - previous[1])
            for previous, current in zip(centers, centers[1:])
        ]
        return min(1.0, float(np.percentile(deltas, 80)) * 3.0)

    def _detect_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
        """Return a face centre, preferring MediaPipe and falling back to Haar."""
        mediapipe_center = self._detect_mediapipe_face_center(frame)
        if mediapipe_center is not None:
            return mediapipe_center
        return self._detect_haar_face_center(frame)

    def _detect_mediapipe_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
        """Return the centre of the first MediaPipe detection, if any.

        Returns ``None`` when MediaPipe is not installed or no face is
        found. The centre comes from ``relative_bounding_box``, so it is
        presumably in normalised image coordinates (confirm against the
        MediaPipe docs).
        """
        if self._mediapipe_unavailable:
            return None

        if self._mediapipe_detector is None:
            try:
                import mediapipe as mp
            except ImportError:
                # Failed imports are not cached by Python; remember the
                # failure so later frames skip straight to the fallback.
                self._mediapipe_unavailable = True
                return None

            self._mediapipe_detector = mp.solutions.face_detection.FaceDetection(
                model_selection=0,
                min_detection_confidence=0.45,
            )

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = self._mediapipe_detector.process(rgb)
        if not result.detections:
            return None

        box = result.detections[0].location_data.relative_bounding_box
        return (float(box.xmin + box.width / 2.0), float(box.ymin + box.height / 2.0))

    def _detect_haar_face_center(self, frame: np.ndarray) -> tuple[float, float] | None:
        """Return the centre of the largest Haar-cascade face, if any.

        The centre is divided by the frame width and height, giving
        coordinates relative to the frame size.
        """
        if self._haar_detector is None:
            cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
            self._haar_detector = cv2.CascadeClassifier(cascade_path)

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = self._haar_detector.detectMultiScale(
            gray,
            scaleFactor=1.1,
            minNeighbors=5,
            minSize=(24, 24),
        )
        if len(faces) == 0:
            return None

        # Pick the detection with the largest area (width * height).
        x, y, width, height = max(faces, key=lambda face: face[2] * face[3])
        frame_height, frame_width = gray.shape[:2]
        return (
            float((x + width / 2.0) / frame_width),
            float((y + height / 2.0) / frame_height),
        )
+[explosion] + +2 +00:00:05,760 --> 00:00:06,720 +[gunshot] + +3 +00:00:10,080 --> 00:00:23,520 +[explosion] + +4 +00:00:12,000 --> 00:00:18,720 +[gunshot] + +5 +00:00:18,240 --> 00:00:21,600 +[fireworks] + +6 +00:00:21,120 --> 00:00:23,040 +[gunshot] + +7 +00:00:22,560 --> 00:00:23,040 +[artillery fire] diff --git a/srt/siren_output.srt b/srt/siren_output.srt new file mode 100644 index 0000000..15b7143 --- /dev/null +++ b/srt/siren_output.srt @@ -0,0 +1,47 @@ +1 +00:00:00,960 --> 00:00:01,440 +[siren] + +2 +00:00:00,960 --> 00:00:01,440 +[alarm] + +3 +00:00:02,880 --> 00:00:07,680 +[alarm] + +4 +00:00:02,880 --> 00:00:07,680 +[siren] + +5 +00:00:08,640 --> 00:00:10,560 +[siren] + +6 +00:00:08,640 --> 00:00:10,560 +[alarm] + +7 +00:00:10,560 --> 00:00:12,000 +[music] + +8 +00:00:12,000 --> 00:00:16,320 +[siren] + +9 +00:00:12,000 --> 00:00:16,320 +[alarm] + +10 +00:00:16,320 --> 00:00:17,280 +[music] + +11 +00:00:17,280 --> 00:00:28,800 +[alarm] + +12 +00:00:17,280 --> 00:00:28,800 +[siren] diff --git a/srt/smoke_output.srt b/srt/smoke_output.srt new file mode 100644 index 0000000..643af20 --- /dev/null +++ b/srt/smoke_output.srt @@ -0,0 +1,11 @@ +1 +00:00:00,000 --> 00:00:00,480 +[music] + +2 +00:00:01,440 --> 00:00:35,040 +[music] + +3 +00:00:49,440 --> 00:00:50,880 +[music] diff --git a/srt/test_sound_output.srt b/srt/test_sound_output.srt new file mode 100644 index 0000000..518c855 --- /dev/null +++ b/srt/test_sound_output.srt @@ -0,0 +1,55 @@ +1 +00:00:00,000 --> 00:00:10,560 +[applause] + +2 +00:00:12,960 --> 00:00:21,120 +[honking] + +3 +00:00:22,080 --> 00:00:23,520 +[honking] + +4 +00:00:24,960 --> 00:00:29,280 +[glass breaking] + +5 +00:00:24,960 --> 00:00:25,440 +[breaking] + +6 +00:00:24,960 --> 00:00:25,440 +[shatter] + +7 +00:00:26,880 --> 00:00:29,280 +[breaking] + +8 +00:00:26,880 --> 00:00:27,360 +[shatter] + +9 +00:00:28,320 --> 00:00:29,280 +[shatter] + +10 +00:00:31,200 --> 00:00:32,640 +[glass breaking] + +11 +00:00:31,680 --> 00:00:32,640 
+[breaking] + +12 +00:00:31,680 --> 00:00:32,640 +[shatter] + +13 +00:00:33,600 --> 00:00:43,200 +[explosion] + +14 +00:00:33,600 --> 00:00:38,400 +[gunshot] diff --git a/tests/test_decision.py b/tests/test_decision.py new file mode 100644 index 0000000..0dc6cb1 --- /dev/null +++ b/tests/test_decision.py @@ -0,0 +1,23 @@ +from intelligent_cc.decision import CaptionDecisionEngine +from intelligent_cc.models import AudioEvent, ReactionSignal + + +def test_accepts_event_when_audio_and_visual_scores_are_meaningful() -> None: + suggestion = CaptionDecisionEngine().decide( + AudioEvent("glass breaking", 0.62, 3.0, 4.0), + ReactionSignal(0.55, 0.5, 0.6, 10), + 1, + ) + + assert suggestion is not None + assert suggestion.text == "[glass breaking]" + + +def test_rejects_low_impact_event_without_reaction() -> None: + suggestion = CaptionDecisionEngine().decide( + AudioEvent("music", 0.28, 3.0, 4.0), + ReactionSignal(0.05, 0.04, 0.0, 10), + 1, + ) + + assert suggestion is None diff --git a/tests/test_output.py b/tests/test_output.py new file mode 100644 index 0000000..e6ca694 --- /dev/null +++ b/tests/test_output.py @@ -0,0 +1,32 @@ +from intelligent_cc.models import CaptionSuggestion +from intelligent_cc.output import format_timestamp, write_srt + + +def test_format_timestamp_uses_srt_millisecond_format() -> None: + assert format_timestamp(3723.456) == "01:02:03,456" + + +def test_write_srt(tmp_path) -> None: + output = tmp_path / "captions.srt" + write_srt( + [ + CaptionSuggestion( + index=1, + label="honking", + text="[honking]", + start=1.0, + end=2.25, + audio_confidence=0.9, + reaction_confidence=0.7, + decision_score=0.8, + reason="audio+visual", + ) + ], + output, + ) + + assert output.read_text(encoding="utf-8") == ( + "1\n" + "00:00:01,000 --> 00:00:02,250\n" + "[honking]\n" + ) diff --git a/videos/applause.mp4 b/videos/applause.mp4 new file mode 100644 index 0000000..06e887d Binary files /dev/null and b/videos/applause.mp4 differ diff --git a/videos/glass.mp4 
b/videos/glass.mp4 new file mode 100644 index 0000000..301be9c Binary files /dev/null and b/videos/glass.mp4 differ diff --git a/videos/gun.mp4 b/videos/gun.mp4 new file mode 100644 index 0000000..16dfc25 Binary files /dev/null and b/videos/gun.mp4 differ diff --git a/videos/honking.mp4 b/videos/honking.mp4 new file mode 100644 index 0000000..cae377a Binary files /dev/null and b/videos/honking.mp4 differ diff --git a/videos/siren.mp4 b/videos/siren.mp4 new file mode 100644 index 0000000..36a1bb1 Binary files /dev/null and b/videos/siren.mp4 differ diff --git a/videos/smoke.mp4 b/videos/smoke.mp4 new file mode 100644 index 0000000..b2e4bad Binary files /dev/null and b/videos/smoke.mp4 differ