⚡ Fall Down
+ ⚡ Punching
+ ⚡ Punching
+ ⚡ Fall Down
+ ⚡ Punching
+ ⚡ Fall Down
+ ⚡ Punching
+ 👁 helicopter (mixing bowl)
+ ⚡ Fall Down
+ 👁 dog (American black bear, black bear, Ursus americanus, Euarctos americanus); 👁 cat (American black bear, black bear, Ursus americanus, Euarctos americanus)
+ 👁 helicopter (modem)
+ 👁 helicopter (modem)
+ ⚡ Punching
+ ⚡ Fall Down
+ ⚡ Punching
+ 👁 dog (Appenzeller)
+ ⚡ Fall Down
+ ⚡ Fall Down
+ ⚡ Punching
+ ⚡ Punching
+ ⚡ Fall Down
+ ⚡ Punching
+ [Engine] (33%); [Idling] (20%); [Medium engine (mid frequency)] (20%)
+ [Vehicle horn, car horn, honking] (41%)
+ [Vehicle horn, car horn, honking] (26%)
+ [Vehicle horn, car horn, honking] (33%)
+ [Vehicle horn, car horn, honking] (33%)
+ [Vehicle horn, car horn, honking] (41%)
+ [Vehicle horn, car horn, honking] (26%)
+ [Vehicle horn, car horn, honking] (33%)
+
+
{_xml_escape(e.caption)}' + ) + lines.append(_SLS_FOOTER) + text = "\n".join(lines) + if filepath: + with open(filepath, "w", encoding="utf-8") as f: + f.write(text) + return text + + +def _xml_escape(s: str) -> str: + return ( + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) diff --git a/create_demo_samples.py b/create_demo_samples.py new file mode 100644 index 0000000..a5f6835 --- /dev/null +++ b/create_demo_samples.py @@ -0,0 +1,27 @@ +""" +Download sample audio clips with real sounds (ESC-50 dataset, CC-BY-NC). +""" + +import os, urllib.request + +DEST = os.path.join(os.path.dirname(__file__), "sample_audio") +os.makedirs(DEST, exist_ok=True) + +BASE = "https://github.com/karoldvl/ESC-50/raw/master/audio/" + +SAMPLES = [ + ("car_horn.wav", "1-17124-A-43.wav", "Car horn"), + ("siren.wav", "1-31482-A-42.wav", "Police siren"), + ("dog_bark.wav", "1-100032-A-0.wav", "Dog barking"), + ("glass_break.wav", "1-20133-A-39.wav", "Glass breaking"), + ("gunshot.wav", "1-115545-A-48.wav", "Fireworks (detected as gunshot/explosion)"), + ("engine.wav", "1-18527-A-44.wav", "Engine running"), +] + +print("Downloading sample audio clips...") +for local, remote, desc in SAMPLES: + path = os.path.join(DEST, local) + if not os.path.exists(path): + print(f" {local} ({desc})") + urllib.request.urlretrieve(BASE + remote, path) +print(f"\nDone. {len(SAMPLES)} files in '{DEST}'") diff --git a/demo_module1.py b/demo_module1.py new file mode 100644 index 0000000..2a6e612 --- /dev/null +++ b/demo_module1.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Sound Event Detection Demo (Module 1). + +Usage: + python demo_module1.py sample_audio/car_horn.wav + python demo_module1.py --all + python demo_module1.py my_video.mp4 +""" + +import sys, os, json + +sys.path.insert(0, os.path.dirname(__file__)) +from sound_event_detection import SoundEventDetector + +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample_audio") +THRESHOLD = 0.15 + +ALL_SAMPLES = [ + ("car_horn.wav", ["Vehicle horn, car horn, honking"]), + ("siren.wav", ["Siren", "Alarm"]), + ("dog_bark.wav", ["Dog", "Bark"]), + ("glass_break.wav", ["Breaking"]), + ("gunshot.wav", ["Gunshot, gunfire", "Explosion", "Fireworks"]), + ("engine.wav", ["Vehicle", "Engine"]), +] + + +def detect(path, threshold=THRESHOLD): + detector = SoundEventDetector(confidence_threshold=threshold) + if path.lower().endswith(".wav"): + result = detector.detect_from_file(path) + else: + result = detector.detect_from_video(path) + detector.close() + return result + + +def run_all(): + print("=" * 55) + print(" Sound Event Detection Demo (Module 1)") + print("=" * 55) + + if not os.path.isdir(SAMPLE_DIR): + print("Run 'python create_demo_samples.py' first.") + return + + for fname, expected in ALL_SAMPLES: + path = os.path.join(SAMPLE_DIR, fname) + if not os.path.exists(path): + continue + + result = detect(path) + detected = {e.sound_class for e in result.events} + matched = detected & set(expected) + + print(f"\n {fname}") + print(f" Duration: {result.duration_seconds:.1f}s") + if result.events: + for e in result.events: + icon = " <<<" if e.sound_class in expected else "" + print( + f" [{e.start_time:.1f}s] {e.sound_class:35s} " + f"({e.confidence:.0%}){icon}" + ) + else: + print(" (no events above threshold)") + if matched: + print(f" => Detected: {', '.join(sorted(matched))}") + else: + print(f" => Expected: {', '.join(expected)}") + + print() + + +def main(): + if len(sys.argv) < 2: + print(__doc__.strip()) + return + + if sys.argv[1] == "--all": + run_all() + return + + path = sys.argv[1] + if not os.path.exists(path): + print(f"File not found: {path}") + sys.exit(1) + + result = detect(path) + print(f"\n File: {os.path.basename(path)}") + print(f" Duration: {result.duration_seconds:.1f}s") + print(f" Events:") + for e in result.events: + print( + f" [{e.start_time:.1f}s - {e.end_time:.1f}s] " + f"{e.sound_class:35s} ({e.confidence:.0%})" + ) + + out = os.path.splitext(path)[0] + "_events.json" + result.to_json(out) + print(f"\n Results saved to: {out}") + + +if __name__ == "__main__": + main() diff --git a/demo_module2.py b/demo_module2.py new file mode 100644 index 0000000..45769de --- /dev/null +++ b/demo_module2.py @@ -0,0 +1,187 @@ +import sys, os, json, urllib.request + +import cv2 +import numpy as np + +import mediapipe as mp +from mediapipe.tasks import python +from mediapipe.tasks.python import vision + +CLASSIFIER_URL = ( + "https://storage.googleapis.com/mediapipe-models/" + "image_classifier/efficientnet_lite0/float32/1/" + "efficientnet_lite0.tflite" +) +LABELS_URL = "https://raw.githubusercontent.com/google-coral/test_data/master/imagenet_labels.txt" +MODEL_PATH = "efficientnet_lite0.tflite" +LABELS_PATH = "imagenet_labels.txt" + +TARGETS = { + "helicopter": list(range(650, 670)), + "airplane": [404, 895], + "vehicle": [654, 468, 511, 627, 661, 581, 609, 864, 817, 656], + "person": [708], + "dog": list(range(151, 300)), + "cat": list(range(281, 300)), + "motorcycle": [661, 670], + "bicycle": [444, 671], +} + + +def _download(url, path): + if not os.path.exists(path): + print(f" Downloading {os.path.basename(path)}...") + urllib.request.urlretrieve(url, path) + + +def _detect_actions(landmarks, h, w): + actions = [] + if not landmarks: + return actions + + def y(lm): + return (1 - lm.y) * h + + nose_y = y(landmarks[0]) + hip_y = (y(landmarks[23]) + y(landmarks[24])) / 2 + ankle_y = (y(landmarks[27]) + y(landmarks[28])) / 2 + height = abs(nose_y - ankle_y) + width = abs(nose_y - hip_y) + if height > 30 and width < height * 0.4: + actions.append("fall_down") + mx = (landmarks[11].x + landmarks[12].x) / 2 * w + wrist_dist = abs(landmarks[15].x * w - mx) + abs(landmarks[16].x * w - mx) + if wrist_dist > w * 0.7: + actions.append("punching") + return actions + + +def analyze(video_path, interval=0.5): + _download(CLASSIFIER_URL, MODEL_PATH) + _download(LABELS_URL, LABELS_PATH) + + with open(LABELS_PATH) as f: + all_labels = [l.strip() for l in f.readlines()] + + # MediaPipe Image Classifier + base_opts = python.BaseOptions(model_asset_path=MODEL_PATH) + img_opts = vision.ImageClassifierOptions(base_options=base_opts, max_results=5) + classifier = vision.ImageClassifier.create_from_options(img_opts) + + # MediaPipe Pose + pose_path = "pose_landmarker.task" + if not os.path.exists(pose_path): + print(" Downloading pose model...") + urllib.request.urlretrieve( + "https://storage.googleapis.com/mediapipe-models/" + "pose_landmarker/pose_landmarker_heavy/float16/1/" + "pose_landmarker_heavy.task", + pose_path, + ) + pose_opts = vision.PoseLandmarkerOptions( + base_options=python.BaseOptions(model_asset_path=pose_path), + min_pose_detection_confidence=0.4, + ) + pose_det = vision.PoseLandmarker.create_from_options(pose_opts) + + cap = cv2.VideoCapture(video_path) + fps = cap.get(cv2.CAP_PROP_FPS) or 30 + skip = int(fps * interval) + events, prev_hist, frame_idx = [], None, 0 + + while True: + ret, frame = cap.read() + if not ret: + break + if frame_idx % skip != 0: + frame_idx += 1 + continue + + t = round(frame_idx / fps, 1) + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + h, w = frame.shape[:2] + mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb) + + # Object classification + cls_result = classifier.classify(mp_img) + if cls_result.classifications: + for cat in cls_result.classifications[0].categories: + if cat.score > 0.3 and cat.index < len(all_labels): + label = all_labels[cat.index] + for name, ids in TARGETS.items(): + if cat.index in ids: + events.append( + { + "time": t, + "type": "object", + "object": name, + "label": label, + "confidence": round(cat.score, 3), + } + ) + + # Scene change + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + hist = cv2.calcHist([gray], [0], None, [64], [0, 256]) + cv2.normalize(hist, hist) + hist = hist.flatten() + if prev_hist is not None: + diff = cv2.compareHist(prev_hist, hist, cv2.HISTCMP_CHISQR) + if diff > 40: + events.append( + {"time": t, "type": "scene_change", "score": round(diff, 1)} + ) + prev_hist = hist + + # Pose actions + pose_res = pose_det.detect(mp_img) + for lm in ( + pose_res.pose_landmarks if hasattr(pose_res, "pose_landmarks") else [] + ): + for action in _detect_actions(lm, h, w): + events.append({"time": t, "type": "action", "action": action}) + + frame_idx += 1 + + cap.release() + classifier.close() + pose_det.close() + + return { + "video": os.path.basename(video_path), + "duration": round(frame_idx / fps, 1), + "total_events": len(events), + "events": events, + } + + +def main(): + if len(sys.argv) < 2: + print("Usage: python demo_module2.py video.mp4") + return + path = sys.argv[1] + if not os.path.exists(path): + print(f"File not found: {path}") + return + print(f"Analyzing: {path}") + result = analyze(path) + print(f"\nDuration: {result['duration']}s | Events: {result['total_events']}") + for e in result["events"][:20]: + if e["type"] == "object": + print( + f" [{e['time']}s] {e['object']} ({e['label']}) {e['confidence']:.0%}" + ) + elif e["type"] == "scene_change": + print(f" [{e['time']}s] scene change") + elif e["type"] == "action": + print(f" [{e['time']}s] action: {e['action']}") + if len(result["events"]) > 20: + print(f" ... and {len(result['events']) - 20} more") + out = os.path.splitext(path)[0] + "_visual.json" + with open(out, "w") as f: + json.dump(result, f, indent=2) + print(f"Saved: {out}") + + +if __name__ == "__main__": + main() diff --git a/demo_pipeline.py b/demo_pipeline.py new file mode 100644 index 0000000..4d214e9 --- /dev/null +++ b/demo_pipeline.py @@ -0,0 +1,151 @@ +import sys +import os +import json +import argparse +from typing import List + +from cc_decision_engine import ( + fuse, + fuse_from_files, + parse_audio_events, + parse_visual_events, + parse_speech_srt, + UnifiedEvent, + fused_to_dicts, +) +from cc_output import to_srt, to_sls + + +def run_pipeline( + video_path: str, + speech_srt_path: str = None, + reuse: bool = False, + separate: bool = False, +): + base = os.path.splitext(video_path)[0] + + # -- Step 1: Sound event detection (Module 1) -- + audio_json_path = base + "_events.json" + if reuse and os.path.exists(audio_json_path): + print(f"[1/3] Reusing audio events from {audio_json_path}") + audio_events = None # loaded via fuse_from_files + else: + print("[1/3] Running Sound Event Detection (Module 1)...") + from sound_event_detection import SoundEventDetector + + detector = SoundEventDetector(confidence_threshold=0.15, use_separation=separate) + result = detector.detect_from_video(video_path) + result.to_json(audio_json_path) + audio_events = parse_audio_events(result.to_dict()) + detector.close() + print(f" Found {len(audio_events)} audio events") + + # -- Step 2: Visual detection (Module 2) -- + visual_json_path = base + "_visual.json" + if reuse and os.path.exists(visual_json_path): + print(f"[2/3] Reusing visual events from {visual_json_path}") + visual_events = None + else: + print("[2/3] Running Visual Detection (Module 2)...") + from demo_module2 import analyze + + visual_result = analyze(video_path, interval=0.5) + visual_events = parse_visual_events(visual_result) + print(f" Found {len(visual_events)} visual events") + + # Load speech + speech_events = None + if speech_srt_path: + if os.path.exists(speech_srt_path): + print(f"[2b/3] Loading speech transcript from {speech_srt_path}") + speech_events = parse_speech_srt( + open(speech_srt_path, encoding="utf-8").read() + ) + print(f" Found {len(speech_events)} speech segments") + else: + print(f" [!] Speech file not found: {speech_srt_path}") + + # -- Step 3: Fusion -- + print("[3/3] Fusing signals into unified timeline...") + if reuse: + fused = fuse_from_files( + audio_json=audio_json_path if audio_events is None else None, + visual_json=visual_json_path if visual_events is None else None, + speech_srt=speech_srt_path, + ) + else: + if audio_events is None: + from cc_decision_engine import parse_audio_file + + audio_events = parse_audio_file(audio_json_path) + if visual_events is None: + from cc_decision_engine import parse_visual_file + + visual_events = parse_visual_file(visual_json_path) + fused = fuse(audio_events, visual_events, speech_events) + + print(f" Unified timeline: {len(fused)} caption events") + + # -- Step 4: Output -- + srt_path = base + "_cc.srt" + sls_path = base + "_cc.sls" + json_path = base + "_cc_fused.json" + + to_srt(fused, srt_path) + print(f" SRT -> {srt_path}") + + to_sls(fused, sls_path) + print(f" SLS -> {sls_path}") + + with open(json_path, "w", encoding="utf-8") as f: + json.dump(fused_to_dicts(fused), f, indent=2) + print(f" JSON -> {json_path}") + + # -- Summary -- + _print_summary(fused, video_path) + return fused + + +def _print_summary(events: List[UnifiedEvent], video_path: str): + counts: dict = {} + for e in events: + counts[e.event_type] = counts.get(e.event_type, 0) + 1 + + print(f"\n{'=' * 50}") + print(f"Pipeline complete: {os.path.basename(video_path)}") + print(f"{'=' * 50}") + for etype, cnt in sorted(counts.items()): + print(f" {etype:20s}: {cnt}") + print(f"{'=' * 50}") + print("\nSample captions (first 10):") + for e in events[:10]: + try: + sys.stdout.write( + f" [{e.start_time:.1f}s-{e.end_time:.1f}s] [{e.event_type}] {e.caption}\n" + ) + except UnicodeEncodeError: + safe = e.caption.encode("ascii", errors="replace").decode("ascii") + print(f" [{e.start_time:.1f}s-{e.end_time:.1f}s] [{e.event_type}] {safe}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Intelligent CC Generation Pipeline") + parser.add_argument("video", help="Path to video file") + parser.add_argument("--speech", help="Path to SRT speech transcript", default=None) + parser.add_argument( + "--reuse", + action="store_true", + help="Reuse existing _events.json and _visual.json instead of re-running detection", + ) + parser.add_argument( + "--separate", + action="store_true", + help="Separate vocals/dialogue from background audio before running classification", + ) + args = parser.parse_args() + + if not os.path.exists(args.video): + print(f"File not found: {args.video}") + sys.exit(1) + + run_pipeline(args.video, speech_srt_path=args.speech, reuse=args.reuse, separate=args.separate) diff --git a/efficientnet_lite0.tflite b/efficientnet_lite0.tflite new file mode 100644 index 0000000..9ff2e5b Binary files /dev/null and b/efficientnet_lite0.tflite differ diff --git a/face_detector.tflite b/face_detector.tflite new file mode 100644 index 0000000..1b2b522 Binary files /dev/null and b/face_detector.tflite differ diff --git a/imagenet_labels.txt b/imagenet_labels.txt new file mode 100644 index 0000000..0f975ff --- /dev/null +++ b/imagenet_labels.txt @@ -0,0 +1,1001 @@ +background +tench, Tinca tinca +goldfish, Carassius auratus +great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +tiger shark, Galeocerdo cuvieri +hammerhead, hammerhead shark +electric ray, crampfish, numbfish, torpedo +stingray +cock +hen +ostrich, Struthio camelus +brambling, Fringilla montifringilla +goldfinch, Carduelis carduelis +house finch, linnet, Carpodacus mexicanus +junco, snowbird +indigo bunting, indigo finch, indigo bird, Passerina cyanea +robin, American robin, Turdus migratorius +bulbul +jay +magpie +chickadee +water ouzel, dipper +kite +bald eagle, American eagle, Haliaeetus leucocephalus +vulture +great grey owl, great gray owl, Strix nebulosa +European fire salamander, Salamandra salamandra +common newt, Triturus vulgaris +eft +spotted salamander, Ambystoma maculatum +axolotl, mud puppy, Ambystoma mexicanum +bullfrog, Rana catesbeiana +tree frog, tree-frog +tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +loggerhead, loggerhead turtle, Caretta caretta +leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +mud turtle +terrapin +box turtle, box tortoise +banded gecko +common iguana, iguana, Iguana iguana +American chameleon, anole, Anolis carolinensis +whiptail, whiptail lizard +agama +frilled lizard, Chlamydosaurus kingi +alligator lizard +Gila monster, Heloderma suspectum +green lizard, Lacerta viridis +African chameleon, Chamaeleo chamaeleon +Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +African crocodile, Nile crocodile, Crocodylus niloticus +American alligator, Alligator mississipiensis +triceratops +thunder snake, worm snake, Carphophis amoenus +ringneck snake, ring-necked snake, ring snake +hognose snake, puff adder, sand viper +green snake, grass snake +king snake, kingsnake +garter snake, grass snake +water snake +vine snake +night snake, Hypsiglena torquata +boa constrictor, Constrictor constrictor +rock python, rock snake, Python sebae +Indian cobra, Naja naja +green mamba +sea snake +horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +diamondback, diamondback rattlesnake, Crotalus adamanteus +sidewinder, horned rattlesnake, Crotalus cerastes +trilobite +harvestman, daddy longlegs, Phalangium opilio +scorpion +black and gold garden spider, Argiope aurantia +barn spider, Araneus cavaticus +garden spider, Aranea diademata +black widow, Latrodectus mactans +tarantula +wolf spider, hunting spider +tick +centipede +black grouse +ptarmigan +ruffed grouse, partridge, Bonasa umbellus +prairie chicken, prairie grouse, prairie fowl +peacock +quail +partridge +African grey, African gray, Psittacus erithacus +macaw +sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +lorikeet +coucal +bee eater +hornbill +hummingbird +jacamar +toucan +drake +red-breasted merganser, Mergus serrator +goose +black swan, Cygnus atratus +tusker +echidna, spiny anteater, anteater +platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +wallaby, brush kangaroo +koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +wombat +jellyfish +sea anemone, anemone +brain coral +flatworm, platyhelminth +nematode, nematode worm, roundworm +conch +snail +slug +sea slug, nudibranch +chiton, coat-of-mail shell, sea cradle, polyplacophore +chambered nautilus, pearly nautilus, nautilus +Dungeness crab, Cancer magister +rock crab, Cancer irroratus +fiddler crab +king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +American lobster, Northern lobster, Maine lobster, Homarus americanus +spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +crayfish, crawfish, crawdad, crawdaddy +hermit crab +isopod +white stork, Ciconia ciconia +black stork, Ciconia nigra +spoonbill +flamingo +little blue heron, Egretta caerulea +American egret, great white heron, Egretta albus +bittern +crane +limpkin, Aramus pictus +European gallinule, Porphyrio porphyrio +American coot, marsh hen, mud hen, water hen, Fulica americana +bustard +ruddy turnstone, Arenaria interpres +red-backed sandpiper, dunlin, Erolia alpina +redshank, Tringa totanus +dowitcher +oystercatcher, oyster catcher +pelican +king penguin, Aptenodytes patagonica +albatross, mollymawk +grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +killer whale, killer, orca, grampus, sea wolf, Orcinus orca +dugong, Dugong dugon +sea lion +Chihuahua +Japanese spaniel +Maltese dog, Maltese terrier, Maltese +Pekinese, Pekingese, Peke +Shih-Tzu +Blenheim spaniel +papillon +toy terrier +Rhodesian ridgeback +Afghan hound, Afghan +basset, basset hound +beagle +bloodhound, sleuthhound +bluetick +black-and-tan coonhound +Walker hound, Walker foxhound +English foxhound +redbone +borzoi, Russian wolfhound +Irish wolfhound +Italian greyhound +whippet +Ibizan hound, Ibizan Podenco +Norwegian elkhound, elkhound +otterhound, otter hound +Saluki, gazelle hound +Scottish deerhound, deerhound +Weimaraner +Staffordshire bullterrier, Staffordshire bull terrier +American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +Bedlington terrier +Border terrier +Kerry blue terrier +Irish terrier +Norfolk terrier +Norwich terrier +Yorkshire terrier +wire-haired fox terrier +Lakeland terrier +Sealyham terrier, Sealyham +Airedale, Airedale terrier +cairn, cairn terrier +Australian terrier +Dandie Dinmont, Dandie Dinmont terrier +Boston bull, Boston terrier +miniature schnauzer +giant schnauzer +standard schnauzer +Scotch terrier, Scottish terrier, Scottie +Tibetan terrier, chrysanthemum dog +silky terrier, Sydney silky +soft-coated wheaten terrier +West Highland white terrier +Lhasa, Lhasa apso +flat-coated retriever +curly-coated retriever +golden retriever +Labrador retriever +Chesapeake Bay retriever +German short-haired pointer +vizsla, Hungarian pointer +English setter +Irish setter, red setter +Gordon setter +Brittany spaniel +clumber, clumber spaniel +English springer, English springer spaniel +Welsh springer spaniel +cocker spaniel, English cocker spaniel, cocker +Sussex spaniel +Irish water spaniel +kuvasz +schipperke +groenendael +malinois +briard +kelpie +komondor +Old English sheepdog, bobtail +Shetland sheepdog, Shetland sheep dog, Shetland +collie +Border collie +Bouvier des Flandres, Bouviers des Flandres +Rottweiler +German shepherd, German shepherd dog, German police dog, alsatian +Doberman, Doberman pinscher +miniature pinscher +Greater Swiss Mountain dog +Bernese mountain dog +Appenzeller +EntleBucher +boxer +bull mastiff +Tibetan mastiff +French bulldog +Great Dane +Saint Bernard, St Bernard +Eskimo dog, husky +malamute, malemute, Alaskan malamute +Siberian husky +dalmatian, coach dog, carriage dog +affenpinscher, monkey pinscher, monkey dog +basenji +pug, pug-dog +Leonberg +Newfoundland, Newfoundland dog +Great Pyrenees +Samoyed, Samoyede +Pomeranian +chow, chow chow +keeshond +Brabancon griffon +Pembroke, Pembroke Welsh corgi +Cardigan, Cardigan Welsh corgi +toy poodle +miniature poodle +standard poodle +Mexican hairless +timber wolf, grey wolf, gray wolf, Canis lupus +white wolf, Arctic wolf, Canis lupus tundrarum +red wolf, maned wolf, Canis rufus, Canis niger +coyote, prairie wolf, brush wolf, Canis latrans +dingo, warrigal, warragal, Canis dingo +dhole, Cuon alpinus +African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +hyena, hyaena +red fox, Vulpes vulpes +kit fox, Vulpes macrotis +Arctic fox, white fox, Alopex lagopus +grey fox, gray fox, Urocyon cinereoargenteus +tabby, tabby cat +tiger cat +Persian cat +Siamese cat, Siamese +Egyptian cat +cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +lynx, catamount +leopard, Panthera pardus +snow leopard, ounce, Panthera uncia +jaguar, panther, Panthera onca, Felis onca +lion, king of beasts, Panthera leo +tiger, Panthera tigris +cheetah, chetah, Acinonyx jubatus +brown bear, bruin, Ursus arctos +American black bear, black bear, Ursus americanus, Euarctos americanus +ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +sloth bear, Melursus ursinus, Ursus ursinus +mongoose +meerkat, mierkat +tiger beetle +ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +ground beetle, carabid beetle +long-horned beetle, longicorn, longicorn beetle +leaf beetle, chrysomelid +dung beetle +rhinoceros beetle +weevil +fly +bee +ant, emmet, pismire +grasshopper, hopper +cricket +walking stick, walkingstick, stick insect +cockroach, roach +mantis, mantid +cicada, cicala +leafhopper +lacewing, lacewing fly +dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +damselfly +admiral +ringlet, ringlet butterfly +monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +cabbage butterfly +sulphur butterfly, sulfur butterfly +lycaenid, lycaenid butterfly +starfish, sea star +sea urchin +sea cucumber, holothurian +wood rabbit, cottontail, cottontail rabbit +hare +Angora, Angora rabbit +hamster +porcupine, hedgehog +fox squirrel, eastern fox squirrel, Sciurus niger +marmot +beaver +guinea pig, Cavia cobaya +sorrel +zebra +hog, pig, grunter, squealer, Sus scrofa +wild boar, boar, Sus scrofa +warthog +hippopotamus, hippo, river horse, Hippopotamus amphibius +ox +water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +bison +ram, tup +bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +ibex, Capra ibex +hartebeest +impala, Aepyceros melampus +gazelle +Arabian camel, dromedary, Camelus dromedarius +llama +weasel +mink +polecat, fitch, foulmart, foumart, Mustela putorius +black-footed ferret, ferret, Mustela nigripes +otter +skunk, polecat, wood pussy +badger +armadillo +three-toed sloth, ai, Bradypus tridactylus +orangutan, orang, orangutang, Pongo pygmaeus +gorilla, Gorilla gorilla +chimpanzee, chimp, Pan troglodytes +gibbon, Hylobates lar +siamang, Hylobates syndactylus, Symphalangus syndactylus +guenon, guenon monkey +patas, hussar monkey, Erythrocebus patas +baboon +macaque +langur +colobus, colobus monkey +proboscis monkey, Nasalis larvatus +marmoset +capuchin, ringtail, Cebus capucinus +howler monkey, howler +titi, titi monkey +spider monkey, Ateles geoffroyi +squirrel monkey, Saimiri sciureus +Madagascar cat, ring-tailed lemur, Lemur catta +indri, indris, Indri indri, Indri brevicaudatus +Indian elephant, Elephas maximus +African elephant, Loxodonta africana +lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +barracouta, snoek +eel +coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +rock beauty, Holocanthus tricolor +anemone fish +sturgeon +gar, garfish, garpike, billfish, Lepisosteus osseus +lionfish +puffer, pufferfish, blowfish, globefish +abacus +abaya +academic gown, academic robe, judge's robe +accordion, piano accordion, squeeze box +acoustic guitar +aircraft carrier, carrier, flattop, attack aircraft carrier +airliner +airship, dirigible +altar +ambulance +amphibian, amphibious vehicle +analog clock +apiary, bee house +apron +ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +assault rifle, assault gun +backpack, back pack, knapsack, packsack, rucksack, haversack +bakery, bakeshop, bakehouse +balance beam, beam +balloon +ballpoint, ballpoint pen, ballpen, Biro +Band Aid +banjo +bannister, banister, balustrade, balusters, handrail +barbell +barber chair +barbershop +barn +barometer +barrel, cask +barrow, garden cart, lawn cart, wheelbarrow +baseball +basketball +bassinet +bassoon +bathing cap, swimming cap +bath towel +bathtub, bathing tub, bath, tub +beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +beacon, lighthouse, beacon light, pharos +beaker +bearskin, busby, shako +beer bottle +beer glass +bell cote, bell cot +bib +bicycle-built-for-two, tandem bicycle, tandem +bikini, two-piece +binder, ring-binder +binoculars, field glasses, opera glasses +birdhouse +boathouse +bobsled, bobsleigh, bob +bolo tie, bolo, bola tie, bola +bonnet, poke bonnet +bookcase +bookshop, bookstore, bookstall +bottlecap +bow +bow tie, bow-tie, bowtie +brass, memorial tablet, plaque +brassiere, bra, bandeau +breakwater, groin, groyne, mole, bulwark, seawall, jetty +breastplate, aegis, egis +broom +bucket, pail +buckle +bulletproof vest +bullet train, bullet +butcher shop, meat market +cab, hack, taxi, taxicab +caldron, cauldron +candle, taper, wax light +cannon +canoe +can opener, tin opener +cardigan +car mirror +carousel, carrousel, merry-go-round, roundabout, whirligig +carpenter's kit, tool kit +carton +car wheel +cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +cassette +cassette player +castle +catamaran +CD player +cello, violoncello +cellular telephone, cellular phone, cellphone, cell, mobile phone +chain +chainlink fence +chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +chain saw, chainsaw +chest +chiffonier, commode +chime, bell, gong +china cabinet, china closet +Christmas stocking +church, church building +cinema, movie theater, movie theatre, movie house, picture palace +cleaver, meat cleaver, chopper +cliff dwelling +cloak +clog, geta, patten, sabot +cocktail shaker +coffee mug +coffeepot +coil, spiral, volute, whorl, helix +combination lock +computer keyboard, keypad +confectionery, confectionary, candy store +container ship, containership, container vessel +convertible +corkscrew, bottle screw +cornet, horn, trumpet, trump +cowboy boot +cowboy hat, ten-gallon hat +cradle +crane +crash helmet +crate +crib, cot +Crock Pot +croquet ball +crutch +cuirass +dam, dike, dyke +desk +desktop computer +dial telephone, dial phone +diaper, nappy, napkin +digital clock +digital watch +dining table, board +dishrag, dishcloth +dishwasher, dish washer, dishwashing machine +disk brake, disc brake +dock, dockage, docking facility +dogsled, dog sled, dog sleigh +dome +doormat, welcome mat +drilling platform, offshore rig +drum, membranophone, tympan +drumstick +dumbbell +Dutch oven +electric fan, blower +electric guitar +electric locomotive +entertainment center +envelope +espresso maker +face powder +feather boa, boa +file, file cabinet, filing cabinet +fireboat +fire engine, fire truck +fire screen, fireguard +flagpole, flagstaff +flute, transverse flute +folding chair +football helmet +forklift +fountain +fountain pen +four-poster +freight car +French horn, horn +frying pan, frypan, skillet +fur coat +garbage truck, dustcart +gasmask, respirator, gas helmet +gas pump, gasoline pump, petrol pump, island dispenser +goblet +go-kart +golf ball +golfcart, golf cart +gondola +gong, tam-tam +gown +grand piano, grand +greenhouse, nursery, glasshouse +grille, radiator grille +grocery store, grocery, food market, market +guillotine +hair slide +hair spray +half track +hammer +hamper +hand blower, blow dryer, blow drier, hair dryer, hair drier +hand-held computer, hand-held microcomputer +handkerchief, hankie, hanky, hankey +hard disc, hard disk, fixed disk +harmonica, mouth organ, harp, mouth harp +harp +harvester, reaper +hatchet +holster +home theater, home theatre +honeycomb +hook, claw +hoopskirt, crinoline +horizontal bar, high bar +horse cart, horse-cart +hourglass +iPod +iron, smoothing iron +jack-o'-lantern +jean, blue jean, denim +jeep, landrover +jersey, T-shirt, tee shirt +jigsaw puzzle +jinrikisha, ricksha, rickshaw +joystick +kimono +knee pad +knot +lab coat, laboratory coat +ladle +lampshade, lamp shade +laptop, laptop computer +lawn mower, mower +lens cap, lens cover +letter opener, paper knife, paperknife +library +lifeboat +lighter, light, igniter, ignitor +limousine, limo +liner, ocean liner +lipstick, lip rouge +Loafer +lotion +loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +loupe, jeweler's loupe +lumbermill, sawmill +magnetic compass +mailbag, postbag +mailbox, letter box +maillot +maillot, tank suit +manhole cover +maraca +marimba, xylophone +mask +matchstick +maypole +maze, labyrinth +measuring cup +medicine chest, medicine cabinet +megalith, megalithic structure +microphone, mike +microwave, microwave oven +military uniform +milk can +minibus +miniskirt, mini +minivan +missile +mitten +mixing bowl +mobile home, manufactured home +Model T +modem +monastery +monitor +moped +mortar +mortarboard +mosque +mosquito net +motor scooter, scooter +mountain bike, all-terrain bike, off-roader +mountain tent +mouse, computer mouse +mousetrap +moving van +muzzle +nail +neck brace +necklace +nipple +notebook, notebook computer +obelisk +oboe, hautboy, hautbois +ocarina, sweet potato +odometer, hodometer, mileometer, milometer +oil filter +organ, pipe organ +oscilloscope, scope, cathode-ray oscilloscope, CRO +overskirt +oxcart +oxygen mask +packet +paddle, boat paddle +paddlewheel, paddle wheel +padlock +paintbrush +pajama, pyjama, pj's, jammies +palace +panpipe, pandean pipe, syrinx +paper towel +parachute, chute +parallel bars, bars +park bench +parking meter +passenger car, coach, carriage +patio, terrace +pay-phone, pay-station +pedestal, plinth, footstall +pencil box, pencil case +pencil sharpener +perfume, essence +Petri dish +photocopier +pick, plectrum, plectron +pickelhaube +picket fence, paling +pickup, pickup truck +pier +piggy bank, penny bank +pill bottle +pillow +ping-pong ball +pinwheel +pirate, pirate ship +pitcher, ewer +plane, carpenter's plane, woodworking plane +planetarium +plastic bag +plate rack +plow, plough +plunger, plumber's helper +Polaroid camera, Polaroid Land camera +pole +police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +poncho +pool table, billiard table, snooker table +pop bottle, soda bottle +pot, flowerpot +potter's wheel +power drill +prayer rug, prayer mat +printer +prison, prison house +projectile, missile +projector +puck, hockey puck +punching bag, punch bag, punching ball, punchball +purse +quill, quill pen +quilt, comforter, comfort, puff +racer, race car, racing car +racket, racquet +radiator +radio, wireless +radio telescope, radio reflector +rain barrel +recreational vehicle, RV, R.V. +reel +reflex camera +refrigerator, icebox +remote control, remote +restaurant, eating house, eating place, eatery +revolver, six-gun, six-shooter +rifle +rocking chair, rocker +rotisserie +rubber eraser, rubber, pencil eraser +rugby ball +rule, ruler +running shoe +safe +safety pin +saltshaker, salt shaker +sandal +sarong +sax, saxophone +scabbard +scale, weighing machine +school bus +schooner +scoreboard +screen, CRT screen +screw +screwdriver +seat belt, seatbelt +sewing machine +shield, buckler +shoe shop, shoe-shop, shoe store +shoji +shopping basket +shopping cart +shovel +shower cap +shower curtain +ski +ski mask +sleeping bag +slide rule, slipstick +sliding door +slot, one-armed bandit +snorkel +snowmobile +snowplow, snowplough +soap dispenser +soccer ball +sock +solar dish, solar collector, solar furnace +sombrero +soup bowl +space bar +space heater +space shuttle +spatula +speedboat +spider web, spider's web +spindle +sports car, sport car +spotlight, spot +stage +steam locomotive +steel arch bridge +steel drum +stethoscope +stole +stone wall +stopwatch, stop watch +stove +strainer +streetcar, tram, tramcar, trolley, trolley car +stretcher +studio couch, day bed +stupa, tope +submarine, pigboat, sub, U-boat +suit, suit of clothes +sundial +sunglass +sunglasses, dark glasses, shades +sunscreen, sunblock, sun blocker +suspension bridge +swab, swob, mop +sweatshirt +swimming trunks, bathing trunks +swing +switch, electric switch, electrical switch +syringe +table lamp +tank, army tank, armored combat vehicle, armoured combat vehicle +tape player +teapot +teddy, teddy bear +television, television system +tennis ball +thatch, thatched roof +theater curtain, theatre curtain +thimble +thresher, thrasher, threshing machine +throne +tile roof +toaster +tobacco shop, tobacconist shop, tobacconist +toilet seat +torch +totem pole +tow truck, tow car, wrecker +toyshop +tractor +trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +tray +trench coat +tricycle, trike, velocipede +trimaran +tripod +triumphal arch +trolleybus, trolley coach, trackless trolley +trombone +tub, vat +turnstile +typewriter keyboard +umbrella +unicycle, monocycle +upright, upright piano +vacuum, vacuum cleaner +vase +vault +velvet +vending machine +vestment +viaduct +violin, fiddle +volleyball +waffle iron +wall clock +wallet, billfold, notecase, pocketbook +wardrobe, closet, press +warplane, military plane +washbasin, handbasin, washbowl, lavabo, wash-hand basin +washer, automatic washer, washing machine +water bottle +water jug +water tower +whiskey jug +whistle +wig +window screen +window shade +Windsor tie +wine bottle +wing +wok +wooden spoon +wool, woolen, woollen +worm fence, snake fence, snake-rail fence, Virginia fence +wreck +yawl +yurt +web site, website, internet site, site +comic book +crossword puzzle, crossword +street sign +traffic light, traffic signal, stoplight +book jacket, dust cover, dust jacket, dust wrapper +menu +plate +guacamole +consomme +hot pot, hotpot +trifle +ice cream, icecream +ice lolly, lolly, lollipop, popsicle +French loaf +bagel, beigel +pretzel +cheeseburger +hotdog, hot dog, red hot +mashed potato +head cabbage +broccoli +cauliflower +zucchini, courgette +spaghetti squash +acorn squash +butternut squash +cucumber, cuke +artichoke, globe artichoke +bell pepper +cardoon +mushroom +Granny Smith +strawberry +orange +lemon +fig +pineapple, ananas +banana +jackfruit, jak, jack +custard apple +pomegranate +hay +carbonara +chocolate sauce, chocolate syrup +dough +meat loaf, meatloaf +pizza, pizza pie +potpie +burrito +red wine +espresso +cup +eggnog +alp +bubble +cliff, drop, drop-off +coral reef +geyser +lakeside, lakeshore +promontory, headland, head, foreland +sandbar, sand bar +seashore, coast, seacoast, sea-coast +valley, vale +volcano +ballplayer, baseball player +groom, bridegroom +scuba diver +rapeseed +daisy +yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +corn +acorn +hip, rose hip, rosehip +buckeye, horse chestnut, conker +coral fungus +agaric +gyromitra +stinkhorn, carrion fungus +earthstar +hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +bolete +ear, spike, capitulum +toilet tissue, toilet paper, bathroom tissue diff --git a/pose_landmarker.task b/pose_landmarker.task new file mode 100644 index 0000000..45449d9 Binary files /dev/null and b/pose_landmarker.task differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cd18d16 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +mediapipe>=0.10.0 +moviepy>=1.0.0 +scipy>=1.10.0 +numpy>=1.24.0 +noisereduce>=3.0.0 diff --git a/sample_audio/car_horn.wav b/sample_audio/car_horn.wav new file mode 100644 index 0000000..32064fa Binary files /dev/null and b/sample_audio/car_horn.wav differ diff --git a/sample_audio/dog_bark.wav b/sample_audio/dog_bark.wav new file mode 100644 index 0000000..827f934 Binary files /dev/null and b/sample_audio/dog_bark.wav differ diff --git a/sample_audio/engine.wav b/sample_audio/engine.wav new file mode 100644 index 0000000..69e9d20 Binary files /dev/null and b/sample_audio/engine.wav differ diff --git a/sample_audio/glass_break.wav b/sample_audio/glass_break.wav new file mode 100644 index 0000000..79a0d34 Binary files /dev/null and b/sample_audio/glass_break.wav differ diff --git a/sample_audio/gunshot.wav b/sample_audio/gunshot.wav new file mode 100644 index 0000000..3367505 Binary files /dev/null and b/sample_audio/gunshot.wav differ diff --git a/sample_audio/siren.wav b/sample_audio/siren.wav new file mode 100644 index 0000000..624bffc Binary files /dev/null and b/sample_audio/siren.wav differ diff --git a/sound_event_detection.py b/sound_event_detection.py new file mode 100644 index 0000000..7baac7c --- /dev/null +++ b/sound_event_detection.py @@ -0,0 +1,429 @@ +""" +Sound Event Detection (SED) Module - Module 1 of the Intelligent CC pipeline. + +Detects environmental sounds (car horns, gunshots, dog barks, sirens, etc.) +from audio using the YAMNet model via MediaPipe Audio Classifier. + +Outputs detected events with timestamps and confidence scores. +""" + +import os +import json +import urllib.request +import tempfile +from dataclasses import dataclass, field +from typing import List, Optional + +import numpy as np +from scipy.io import wavfile + +import mediapipe as mp +from mediapipe.tasks import python +from mediapipe.tasks.python.components import containers +from mediapipe.tasks.python import audio + +YAMNET_MODEL_URL = ( + "https://storage.googleapis.com/mediapipe-models/" + "audio_classifier/yamnet/float32/1/yamnet.tflite" +) +YAMNET_WINDOW_SEC = 0.975 + +DEFAULT_TARGET_SOUNDS = [ + "Vehicle horn, car horn, honking", + "Gunshot, gunfire", + "Dog", + "Bark", + "Siren", + "Alarm", + "Explosion", + "Fireworks", + "Glass", + "Breaking", + "Vehicle", + "Engine", +] + + +@dataclass +class SoundEvent: + """A single detected sound event with time bounds and confidence.""" + + start_time: float + end_time: float + sound_class: str + confidence: float + class_index: int + + +@dataclass +class DetectionResult: + """Collection of detected events for one audio file.""" + + events: List[SoundEvent] = field(default_factory=list) + duration_seconds: float = 0.0 + source_file: str = "" + + def to_dict(self) -> dict: + return { + "source_file": self.source_file, + "duration_seconds": self.duration_seconds, + "events": [ + { + "start_time": round(e.start_time, 3), + "end_time": round(e.end_time, 3), + "sound_class": e.sound_class, + "confidence": round(e.confidence, 3), + "class_index": e.class_index, + } + for e in self.events + ], + } + + def to_json(self, filepath: Optional[str] = None, indent: int = 2) -> str: + text = json.dumps(self.to_dict(), indent=indent) + if filepath: + with open(filepath, "w") as f: + f.write(text) + return text + + def to_srt( + self, filepath: Optional[str] = None, min_confidence: float = 0.0 + ) -> str: + lines = [] + counter = 1 + for e in self.events: + if e.confidence < min_confidence: + continue + lines.append(str(counter)) + lines.append(f"{_fmt_srt(e.start_time)} --> {_fmt_srt(e.end_time)}") + lines.append(f"[{e.sound_class}] ({e.confidence:.0%})") + lines.append("") + counter += 1 + text = "\n".join(lines) + if filepath: + with open(filepath, "w") as f: + f.write(text) + return text + + +class SoundEventDetector: + """Sound Event Detection using MediaPipe's YAMNet classifier. + + Args: + model_path: Path to YAMNet TFLite model. + confidence_threshold: Minimum confidence (0-1) to report an event. + target_classes: List of sound class names to filter for. + If None, all classes above threshold are reported. + """ + + def __init__( + self, + model_path: str = "yamnet.tflite", + confidence_threshold: float = 0.20, + target_classes: Optional[List[str]] = None, + use_separation: bool = False, + use_noise_reduction: bool = False, + ): + self.model_path = model_path + self.confidence_threshold = confidence_threshold + self.target_classes = target_classes + self.use_separation = use_separation + self.use_noise_reduction = use_noise_reduction + self._classifier: Optional[audio.AudioClassifier] = None + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def detect_from_file(self, audio_path: str) -> DetectionResult: + """Run detection on a WAV audio file.""" + if not audio_path.lower().endswith(".wav"): + raise ValueError( + "Input must be a .wav file. Use detect_from_video() for video files." + ) + + import shutil + temp_items: list[str] = [] + try: + current_path = audio_path + + if self.use_separation: + inst_path, temp_dir = self._separate_audio(current_path) + temp_items.append(temp_dir) + current_path = inst_path + + if self.use_noise_reduction: + denoised_path = self._reduce_noise(current_path) + temp_items.append(denoised_path) + current_path = denoised_path + + result = self._detect(current_path) + result.source_file = os.path.basename(audio_path) + return result + finally: + for item in reversed(temp_items): + if os.path.isdir(item): + shutil.rmtree(item, ignore_errors=True) + elif os.path.isfile(item): + try: + os.remove(item) + except OSError: + pass + + def detect_from_video(self, video_path: str) -> DetectionResult: + """Extract audio from video, run detection, clean up temp file.""" + import shutil + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_wav = tmp.name + + temp_items: list[str] = [tmp_wav] + try: + self._extract_audio(video_path, tmp_wav) + current_path = tmp_wav + + if self.use_separation: + inst_path, temp_dir = self._separate_audio(current_path) + temp_items.append(temp_dir) + current_path = inst_path + + if self.use_noise_reduction: + denoised_path = self._reduce_noise(current_path) + temp_items.append(denoised_path) + current_path = denoised_path + + result = self._detect(current_path) + result.source_file = os.path.basename(video_path) + return result + finally: + for item in reversed(temp_items): + if os.path.isdir(item): + shutil.rmtree(item, ignore_errors=True) + elif os.path.isfile(item): + try: + os.remove(item) + except OSError: + pass + + def close(self) -> None: + if self._classifier is not None: + self._classifier.close() + self._classifier = None + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + @staticmethod + def _ensure_ffmpeg() -> None: + """Dynamically find and setup ffmpeg from imageio_ffmpeg if not in PATH.""" + import shutil + if shutil.which("ffmpeg") is not None: + return + + try: + import imageio_ffmpeg + ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe() + except ImportError: + return + + if not ffmpeg_exe or not os.path.exists(ffmpeg_exe): + return + + import tempfile + temp_dir = os.path.join(tempfile.gettempdir(), "intelligent_cc_ffmpeg") + os.makedirs(temp_dir, exist_ok=True) + + dest_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" + ffmpeg_dest = os.path.join(temp_dir, dest_name) + + if not os.path.exists(ffmpeg_dest): + try: + shutil.copy(ffmpeg_exe, ffmpeg_dest) + except Exception: + pass + + if os.path.exists(ffmpeg_dest): + os.environ["PATH"] = temp_dir + os.pathsep + os.environ["PATH"] + + def _separate_audio(self, wav_path: str) -> tuple[str, str]: + """Separate vocals and instrumental/background stems. + Returns a tuple of (instrumental_path, temp_dir_to_clean_up) + """ + import tempfile + from audio_separator.separator import Separator + + self._ensure_ffmpeg() + + # Create a temp directory for outputs + temp_dir = tempfile.mkdtemp(prefix="cc_separation_") + + # Initialize separator + separator = Separator(output_dir=temp_dir) + separator.load_model(model_filename='UVR-MDX-NET-Inst_HQ_3.onnx') + + print(f" Separating vocals/speech from background audio...") + outputs = separator.separate(wav_path) + + # outputs contains filenames. Find the instrumental stem + inst_file = None + for filename in outputs: + if "Instrumental" in filename: + inst_file = filename + break + + if not inst_file: + inst_file = outputs[0] if outputs else None + + if not inst_file: + raise RuntimeError("Audio stem separation failed, no output files generated.") + + inst_path = os.path.join(temp_dir, inst_file) + return inst_path, temp_dir + + def _reduce_noise(self, wav_path: str) -> str: + """Apply spectral-gate noise reduction. Returns path to denoised WAV.""" + import tempfile + import noisereduce as nr + + sr, data = wavfile.read(wav_path) + + if data.dtype == np.int16: + float_data = data.astype(np.float32) / np.iinfo(np.int16).max + elif data.dtype == np.int32: + float_data = data.astype(np.float32) / np.iinfo(np.int32).max + elif data.dtype == np.uint8: + float_data = data.astype(np.float32) / 255.0 * 2.0 - 1.0 + else: + float_data = data.astype(np.float32) + + if float_data.ndim > 1: + reduced = np.stack([ + nr.reduce_noise(y=float_data[:, c], sr=sr, prop_decrease=0.8) + for c in range(float_data.shape[1]) + ], axis=1) + else: + reduced = nr.reduce_noise(y=float_data, sr=sr, prop_decrease=0.8) + + reduced = np.clip(reduced, -1.0, 1.0) + reduced_int16 = (reduced * np.iinfo(np.int16).max).astype(np.int16) + + fd, out_path = tempfile.mkstemp(suffix="_denoised.wav") + os.close(fd) + wavfile.write(out_path, sr, reduced_int16) + return out_path + + # ------------------------------------------------------------------ + # Internal methods + # ------------------------------------------------------------------ + + def _get_classifier(self) -> audio.AudioClassifier: + if self._classifier is not None: + return self._classifier + if not os.path.exists(self.model_path): + print(f"Downloading YAMNet model to {self.model_path} ...") + urllib.request.urlretrieve(YAMNET_MODEL_URL, self.model_path) + base_opts = python.BaseOptions(model_asset_path=self.model_path) + opts = audio.AudioClassifierOptions( + base_options=base_opts, + running_mode=audio.RunningMode.AUDIO_CLIPS, + max_results=5, + ) + self._classifier = audio.AudioClassifier.create_from_options(opts) + return self._classifier + + @staticmethod + def _extract_audio(video_path: str, output_wav: str) -> None: + from moviepy import VideoFileClip + + with VideoFileClip(video_path) as clip: + if clip.audio is None: + raise ValueError(f"No audio track in {video_path}") + clip.audio.write_audiofile( + output_wav, + fps=16000, + nbytes=2, + codec="pcm_s16le", + ffmpeg_params=["-ac", "1"], + logger=None, + ) + + @staticmethod + def _load_wav(path: str): + sr, data = wavfile.read(path) + if data.ndim > 1: + data = data.mean(axis=1) + if data.dtype == np.int16: + data = data.astype(np.float32) / np.iinfo(np.int16).max + elif data.dtype == np.int32: + data = data.astype(np.float32) / np.iinfo(np.int32).max + elif data.dtype == np.uint8: + data = data.astype(np.float32) / 255.0 * 2.0 - 1.0 + return data, sr + + def _detect(self, wav_path: str) -> DetectionResult: + classifier = self._get_classifier() + waveform, sr = self._load_wav(wav_path) + duration = len(waveform) / sr + + audio_clip = containers.AudioData.create_from_array(waveform, sr) + raw_results = classifier.classify(audio_clip) + + raw_events: List[SoundEvent] = [] + for idx, frame_result in enumerate(raw_results): + ts_ms = getattr( + frame_result, + "timestamp_ms", + idx * YAMNET_WINDOW_SEC * 1000, + ) + start = ts_ms / 1000.0 + + for classification in frame_result.classifications: + for cat in classification.categories: + if cat.score < self.confidence_threshold: + continue + if ( + self.target_classes + and cat.category_name not in self.target_classes + ): + continue + raw_events.append( + SoundEvent( + start_time=start, + end_time=start + YAMNET_WINDOW_SEC, + sound_class=cat.category_name, + confidence=cat.score, + class_index=getattr(cat, "index", -1), + ) + ) + + merged = self._merge_events(raw_events) + return DetectionResult(events=merged, duration_seconds=duration) + + @staticmethod + def _merge_events( + events: List[SoundEvent], max_gap: float = 0.5 + ) -> List[SoundEvent]: + if not events: + return [] + events = sorted(events, key=lambda e: (e.start_time, e.sound_class)) + merged: List[SoundEvent] = [events[0]] + for e in events[1:]: + prev = merged[-1] + gap = e.start_time - prev.end_time + if e.sound_class == prev.sound_class and gap <= max_gap: + prev.end_time = max(prev.end_time, e.end_time) + prev.confidence = max(prev.confidence, e.confidence) + else: + merged.append(e) + return merged + + +def _fmt_srt(seconds: float) -> str: + """Format seconds to HH:MM:SS,mmm for SRT.""" + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + ms = int(round((seconds - int(seconds)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" diff --git a/test.mp4 b/test.mp4 new file mode 100644 index 0000000..c8faf62 Binary files /dev/null and b/test.mp4 differ diff --git a/test_visual.json b/test_visual.json new file mode 100644 index 0000000..c703708 --- /dev/null +++ b/test_visual.json @@ -0,0 +1,7 @@ +{ + "video": "test.mp4", + "duration": 3.0, + "max_faces_detected": 0, + "scene_changes": 0, + "events": [] +} \ No newline at end of file diff --git a/yamnet.tflite b/yamnet.tflite new file mode 100644 index 0000000..4d46551 Binary files /dev/null and b/yamnet.tflite differ