diff --git a/.github/ISSUE_TEMPLATE/DMP_2026.yml b/.github/ISSUE_TEMPLATE/DMP_2026.yml deleted file mode 100644 index 9d68f7f..0000000 --- a/.github/ISSUE_TEMPLATE/DMP_2026.yml +++ /dev/null @@ -1,345 +0,0 @@ -name: DMP 2026 Project Template -description: List a new project for Dedicated Mentoring Program (DMP) 2026 -title: "[DMP 2026]: " -labels: ["DMP 2026"] -body: - - type: textarea - id: ticket-description - validations: - required: true - attributes: - label: Ticket Contents - value: | - ## Description - [Provide a brief description of the feature, including why it is needed and what it will accomplish.] - - - type: textarea - id: ticket-goals - validations: - required: true - attributes: - label: Goals & Mid-Point Milestone - description: List the goals of the feature. Please add the goals that must be achieved by Mid-point check-in i.e 1.5 months into the coding period. - value: | - ## Goals - - [ ] [Goal 1] - - [ ] [Goal 2] - - [ ] [Goal 3] - - [ ] [Goal 4] - - [ ] [Goals Achieved By Mid-point Milestone] - - - type: textarea - id: ticket-setup - attributes: - label: Setup/Installation - description: Please list or link setup or installation guide (if any) - - - type: textarea - id: ticket-expected-outcome - attributes: - label: Expected Outcome - description: Describe in detail what the final product or result should look like and how it should behave. - - - type: textarea - id: ticket-acceptance-criteria - attributes: - label: Acceptance Criteria - description: List the acceptance criteria for this feature. - - - type: textarea - id: ticket-implementation-details - validations: - required: true - attributes: - label: Implementation Details - description: List any technical details about the proposed implementation, including any specific technologies that will be used. - - - type: textarea - id: ticket-mockups - attributes: - label: Mockups/Wireframes - description: Include links to any visual aids, mockups, wireframes, or diagrams that help illustrate what the final product should look like. This is not always necessary, but can be very helpful in many cases. - - - type: input - id: ticket-product - attributes: - label: Product Name - placeholder: Enter Product Name - validations: - required: true - - - type: dropdown - id: ticket-organisation - attributes: - label: Organisation Name - description: Enter Organisation Name - multiple: false - options: - - Agami - - Argusoft - - ARMMAN - - Avanti Fellows - - Bandhu - - Beckn - - Belongg - - Blockster Global (CREDBEL) - - Blockster Labs / AyanWorks - - CBoard - - CHAOSS - - CHAOSS Africa + GWU - - Civis - - ConveGenius - - Consul Democracy - - COSS - - CranberryFit - - Development Gateway - - DHIS2 - - Dhiway - - Dhwani - - Digital Green - - Digital India - - Dimagi - - Drupal - - Education Initiative - - eGov - - EkShop Marketplace - - FIDE - - FinternetLabs - - Flywheel - - GovDirectory - - Haqdarshak - - Healthsites.io - - IDinsight - - If Me - - IIIT Delhi - - IIT Bombay - - IIT Delhi - - Impactyaan - - Indus Action - - Intel Health - - Key Education Foundation - - Khushi Baby - - Learning Economy - - Linux Foundation - - Mecha Systems - - Medic Mobile - - Medtronic Labs - - MetaBrainz - - Mifos - - Mojaloop - - MOSIP - - NASSCOM Foundation - - NHA - - NIUA - - Norwegian Meteorological Institute - - NSUT x SEETA x AIC - - ONDC - - ONEST - - Open Healthcare Network - - OpenCRVS - - OpenFn - - OpenIMIS - - OpenMRS - - OpenSPP - - Piramal Swasthya - - Planet Read - - Policy Engine - - Pratham Books - - Project Second Chance - - Project Tech4Dev - - Protean - - RCTS-IIITH - - Reap Benefit - - Resolve to Save Lives - - Rocket Learning - - Rumsan - - Sahamati - - SamagraX - - Samanvay Foundation - - Sampatti Card - - Sanketika - - ShikshaLokam - - SimPPL - - Sugar Labs - - Swasth Alliance - - Swecha - - Tarento - - Tattle - - Tech4Dev - - Tekdi - - The Apprentice Project - - The Mifos Initiative - - Thoughtworks - - Tibil - - TinkerHub - - Trustin - - Tuner Labs - - TYCIA - - UNICEF - - United Nations - - Ushahidi - - Win Over Cancer - - WRI - - Zendalona - - Zenysis - - Arghyam - validations: - required: true - - - type: dropdown - id: ticket-governance-domain - attributes: - label: Domain - options: - - ⁠Healthcare - - ⁠Education - - Financial Inclusion - - ⁠Livelihoods - - ⁠Skilling - - ⁠Learning & Development - - ⁠Agriculture - - ⁠Service Delivery - - Open Source Library - - Water - validations: - required: true - - - - type: dropdown - id: ticket-technical-skills-required - attributes: - label: Tech Skills Needed - description: Select the technologies needed for this ticket (use Ctrl or Command to select multiple) - multiple: true - options: - - .NET - - Angular - - Artificial Intelligence - - ASP.NET - - AWS - - Babel - - Bootstrap - - C# - - Chart.js - - CI/CD - - Computer Vision - - CORS - - cURL - - Cypress - - D3.js - - Database - - Debugging - - Design - - DevOps - - Django - - Docker - - Electron - - ESLint - - Express.js - - Feature - - Flask - - Go - - GraphQL - - HTML - - Ionic - - Jest - - Java - - JavaScript - - Jenkins - - JWT - - Kubernetes - - Laravel - - Machine Learning - - Maintenance - - Markdown - - Material-UI - - Microservices - - MongoDB - - Mobile - - Mockups - - Mocha - - Natural Language Processing - - NestJS - - Node.js - - NUnit - - OAuth - - Performance Improvement - - Prettier - - Python - - Question - - React - - React Native - - Redux - - RESTful APIs - - Ruby - - Ruby on Rails - - Rust - - Scala - - Security - - Selenium - - SEO - - Serverless - - Solidity - - Spring Boot - - SQL - - Swagger - - Tailwind CSS - - Test - - Testing Library - - Three.js - - TypeScript - - UI/UX/Design - - Virtual Reality - - Vue.js - - WebSockets - - Webpack - - Other - validations: - required: true - - - type: textarea - id: ticket-mentors - attributes: - label: Mentor(s) - description: Please tag relevant mentors for the ticket - validations: - required: true - - - type: dropdown - id: ticket-category - attributes: - label: Category - description: Choose the categories that best describe your ticket - multiple: true - options: - - API - - Analytics - - Accessibility - - Backend - - Breaking Change - - Beginner Friendly - - Configuration - - CI/CD - - Database - - Data Science - - Deprecation - - Documentation - - Delpoyment - - Frontend - - Internationalization - - Localization - - Machine Learning - - Maintenance - - Mobile - - Performance Improvement - - Question - - Refactoring - - Research - - Needs Reproduction - - SEO - - Security - - Testing - - AI - - Other - validations: - required: true diff --git a/Avengers vs Ultron.mp4 b/Avengers vs Ultron.mp4 new file mode 100644 index 0000000..3cb2c2b Binary files /dev/null and b/Avengers vs Ultron.mp4 differ diff --git a/Avengers vs Ultron_cc.sls b/Avengers vs Ultron_cc.sls new file mode 100644 index 0000000..676c882 --- /dev/null +++ b/Avengers vs Ultron_cc.sls @@ -0,0 +1,77 @@ + + + +Intelligent Closed Captions + + + + + +

⚡ Fall Down + +

⚡ Punching + +

⚡ Punching + +

⚡ Fall Down + +

⚡ Punching + +

⚡ Fall Down + +

⚡ Punching + +

👁 helicopter (mixing bowl) + +

⚡ Fall Down + +

👁 dog (American black bear, black bear, Ursus americanus, Euarctos americanus); 👁 cat (American black bear, black bear, Ursus americanus, Euarctos americanus) + +

👁 helicopter (modem) + +

👁 helicopter (modem) + +

⚡ Punching + +

⚡ Fall Down + +

⚡ Punching + +

👁 dog (Appenzeller) + +

⚡ Fall Down + +

⚡ Fall Down + +

⚡ Punching + +

⚡ Punching + +

⚡ Fall Down + +

⚡ Punching + +

[Engine] (33%); [Idling] (20%); [Medium engine (mid frequency)] (20%) + +

[Vehicle horn, car horn, honking] (41%) + +

[Vehicle horn, car horn, honking] (26%) + +

[Vehicle horn, car horn, honking] (33%) + +

[Vehicle horn, car horn, honking] (33%) + +

[Vehicle horn, car horn, honking] (41%) + +

[Vehicle horn, car horn, honking] (26%) + +

[Vehicle horn, car horn, honking] (33%) + + diff --git a/Avengers vs Ultron_cc.srt b/Avengers vs Ultron_cc.srt new file mode 100644 index 0000000..61a9d66 --- /dev/null +++ b/Avengers vs Ultron_cc.srt @@ -0,0 +1,119 @@ +1 +00:00:00,500 --> 00:00:04,300 +⚡ Fall Down + +2 +00:00:02,900 --> 00:00:03,900 +⚡ Punching + +3 +00:00:04,300 --> 00:00:05,300 +⚡ Punching + +4 +00:00:08,400 --> 00:00:10,900 +⚡ Fall Down + +5 +00:00:18,200 --> 00:00:19,200 +⚡ Punching + +6 +00:00:19,200 --> 00:00:21,600 +⚡ Fall Down + +7 +00:00:21,600 --> 00:00:23,000 +⚡ Punching + +8 +00:00:24,300 --> 00:00:25,800 +👁 helicopter (mixing bowl) + +9 +00:00:29,000 --> 00:00:31,000 +⚡ Fall Down + +10 +00:00:35,500 --> 00:00:37,000 +👁 dog (American black bear, black bear, Ursus americanus, Euarctos americanus); 👁 cat (American black bear, black bear, Ursus americanus, Euarctos americanus) + +11 +00:00:46,200 --> 00:00:49,100 +👁 helicopter (modem) + +12 +00:00:56,100 --> 00:00:57,600 +👁 helicopter (modem) + +13 +00:01:02,100 --> 00:01:05,100 +⚡ Punching + +14 +00:01:21,700 --> 00:01:24,200 +⚡ Fall Down + +15 +00:01:25,000 --> 00:01:27,000 +⚡ Punching + +16 +00:01:51,600 --> 00:01:52,600 +👁 dog (Appenzeller) + +17 +00:01:52,600 --> 00:01:54,600 +⚡ Fall Down + +18 +00:01:54,900 --> 00:01:56,900 +⚡ Fall Down + +19 +00:02:05,200 --> 00:02:07,200 +⚡ Punching + +20 +00:02:15,500 --> 00:02:17,500 +⚡ Punching + +21 +00:02:45,800 --> 00:02:47,800 +⚡ Fall Down + +22 +00:02:49,600 --> 00:02:51,600 +⚡ Punching + +23 +00:03:03,300 --> 00:03:04,300 +[Engine] (33%); [Idling] (20%); [Medium engine (mid frequency)] (20%) + +24 +00:03:04,275 --> 00:03:06,225 +[Vehicle horn, car horn, honking] (41%) + +25 +00:03:07,200 --> 00:03:08,200 +[Vehicle horn, car horn, honking] (26%) + +26 +00:03:08,175 --> 00:03:15,150 +[Vehicle horn, car horn, honking] (33%) + +27 +00:03:15,975 --> 00:03:16,975 +[Vehicle horn, car horn, honking] (33%) + +28 +00:03:16,950 --> 00:03:20,850 +[Vehicle horn, car horn, honking] (41%) + +29 +00:03:20,850 --> 00:03:21,850 +[Vehicle horn, car horn, honking] (26%) + +30 +00:03:21,825 --> 00:03:24,750 +[Vehicle horn, car horn, honking] (33%) diff --git a/Avengers vs Ultron_cc_fused.json b/Avengers vs Ultron_cc_fused.json new file mode 100644 index 0000000..bd8a934 --- /dev/null +++ b/Avengers vs Ultron_cc_fused.json @@ -0,0 +1,272 @@ +[ + { + "start_time": 0.5, + "end_time": 4.3, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 2.9, + "end_time": 3.9, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 4.3, + "end_time": 5.3, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 8.4, + "end_time": 10.9, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 18.2, + "end_time": 19.2, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 19.2, + "end_time": 21.6, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 21.6, + "end_time": 23.0, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 24.3, + "end_time": 25.8, + "caption": "\ud83d\udc41 helicopter (mixing bowl)", + "event_type": "visual_object", + "source_label": "helicopter", + "confidence": 0.316, + "priority": 3 + }, + { + "start_time": 29.0, + "end_time": 31.0, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 35.5, + "end_time": 37.0, + "caption": "\ud83d\udc41 dog (American black bear, black bear, Ursus americanus, Euarctos americanus); \ud83d\udc41 cat (American black bear, black bear, Ursus americanus, Euarctos americanus)", + "event_type": "visual_object", + "source_label": "dog", + "confidence": 0.495, + "priority": 3 + }, + { + "start_time": 46.2, + "end_time": 49.1, + "caption": "\ud83d\udc41 helicopter (modem)", + "event_type": "visual_object", + "source_label": "helicopter", + "confidence": 0.431, + "priority": 3 + }, + { + "start_time": 56.1, + "end_time": 57.6, + "caption": "\ud83d\udc41 helicopter (modem)", + "event_type": "visual_object", + "source_label": "helicopter", + "confidence": 0.31, + "priority": 3 + }, + { + "start_time": 62.1, + "end_time": 65.1, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 81.7, + "end_time": 84.2, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 85.0, + "end_time": 87.0, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 111.6, + "end_time": 112.6, + "caption": "\ud83d\udc41 dog (Appenzeller)", + "event_type": "visual_object", + "source_label": "dog", + "confidence": 0.659, + "priority": 3 + }, + { + "start_time": 112.6, + "end_time": 114.6, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 114.9, + "end_time": 116.9, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 125.2, + "end_time": 127.2, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 135.5, + "end_time": 137.5, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 165.8, + "end_time": 167.8, + "caption": "\u26a1 Fall Down", + "event_type": "visual_action", + "source_label": "fall_down", + "confidence": 1.0, + "priority": 1 + }, + { + "start_time": 169.6, + "end_time": 171.6, + "caption": "\u26a1 Punching", + "event_type": "visual_action", + "source_label": "punching", + "confidence": 1.0, + "priority": 2 + }, + { + "start_time": 183.3, + "end_time": 184.3, + "caption": "[Engine] (33%); [Idling] (20%); [Medium engine (mid frequency)] (20%)", + "event_type": "audio", + "source_label": "Engine", + "confidence": 0.332, + "priority": 3 + }, + { + "start_time": 184.275, + "end_time": 186.225, + "caption": "[Vehicle horn, car horn, honking] (41%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.414, + "priority": 2 + }, + { + "start_time": 187.2, + "end_time": 188.2, + "caption": "[Vehicle horn, car horn, honking] (26%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.262, + "priority": 2 + }, + { + "start_time": 188.175, + "end_time": 195.15, + "caption": "[Vehicle horn, car horn, honking] (33%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "priority": 2 + }, + { + "start_time": 195.975, + "end_time": 196.975, + "caption": "[Vehicle horn, car horn, honking] (33%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "priority": 2 + }, + { + "start_time": 196.95, + "end_time": 200.85, + "caption": "[Vehicle horn, car horn, honking] (41%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.414, + "priority": 2 + }, + { + "start_time": 200.85, + "end_time": 201.85, + "caption": "[Vehicle horn, car horn, honking] (26%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.262, + "priority": 2 + }, + { + "start_time": 201.825, + "end_time": 204.75, + "caption": "[Vehicle horn, car horn, honking] (33%)", + "event_type": "audio", + "source_label": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "priority": 2 + } +] \ No newline at end of file diff --git a/Avengers vs Ultron_events.json b/Avengers vs Ultron_events.json new file mode 100644 index 0000000..132c844 --- /dev/null +++ b/Avengers vs Ultron_events.json @@ -0,0 +1,146 @@ +{ + "source_file": "Avengers vs Ultron.mp4", + "duration_seconds": 204.32, + "events": [ + { + "start_time": 183.3, + "end_time": 184.275, + "sound_class": "Engine", + "confidence": 0.332, + "class_index": 337 + }, + { + "start_time": 183.3, + "end_time": 184.275, + "sound_class": "Idling", + "confidence": 0.199, + "class_index": 346 + }, + { + "start_time": 183.3, + "end_time": 184.275, + "sound_class": "Medium engine (mid frequency)", + "confidence": 0.199, + "class_index": 342 + }, + { + "start_time": 184.275, + "end_time": 186.225, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.414, + "class_index": 302 + }, + { + "start_time": 187.2, + "end_time": 188.175, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 187.2, + "end_time": 188.175, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.262, + "class_index": 302 + }, + { + "start_time": 188.175, + "end_time": 189.15, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 188.175, + "end_time": 189.15, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "class_index": 302 + }, + { + "start_time": 189.15, + "end_time": 190.125, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 189.15, + "end_time": 195.975, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "class_index": 302 + }, + { + "start_time": 195.975, + "end_time": 196.95, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 195.975, + "end_time": 196.95, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "class_index": 302 + }, + { + "start_time": 196.95, + "end_time": 197.925, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 196.95, + "end_time": 197.925, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.414, + "class_index": 302 + }, + { + "start_time": 197.925, + "end_time": 198.9, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 197.925, + "end_time": 200.85, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.414, + "class_index": 302 + }, + { + "start_time": 200.85, + "end_time": 201.825, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 200.85, + "end_time": 201.825, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.262, + "class_index": 302 + }, + { + "start_time": 201.825, + "end_time": 202.8, + "sound_class": "Vehicle", + "confidence": 0.199, + "class_index": 294 + }, + { + "start_time": 201.825, + "end_time": 204.75, + "sound_class": "Vehicle horn, car horn, honking", + "confidence": 0.332, + "class_index": 302 + } + ] +} \ No newline at end of file diff --git a/Avengers vs Ultron_visual.json b/Avengers vs Ultron_visual.json new file mode 100644 index 0000000..d9e368f --- /dev/null +++ b/Avengers vs Ultron_visual.json @@ -0,0 +1,365 @@ +{ + "video": "Avengers vs Ultron.mp4", + "duration": 204.2, + "total_events": 68, + "events": [ + { + "time": 0.5, + "type": "action", + "action": "fall_down" + }, + { + "time": 0.5, + "type": "action", + "action": "punching" + }, + { + "time": 0.9, + "type": "action", + "action": "fall_down" + }, + { + "time": 0.9, + "type": "action", + "action": "punching" + }, + { + "time": 1.4, + "type": "action", + "action": "punching" + }, + { + "time": 1.9, + "type": "action", + "action": "punching" + }, + { + "time": 2.3, + "type": "action", + "action": "fall_down" + }, + { + "time": 2.3, + "type": "action", + "action": "punching" + }, + { + "time": 2.8, + "type": "action", + "action": "punching" + }, + { + "time": 3.3, + "type": "scene_change", + "score": 1985.4 + }, + { + "time": 6.1, + "type": "scene_change", + "score": 57.6 + }, + { + "time": 6.5, + "type": "scene_change", + "score": 81.4 + }, + { + "time": 8.4, + "type": "action", + "action": "fall_down" + }, + { + "time": 8.9, + "type": "action", + "action": "fall_down" + }, + { + "time": 18.2, + "type": "action", + "action": "punching" + }, + { + "time": 18.7, + "type": "action", + "action": "punching" + }, + { + "time": 19.2, + "type": "action", + "action": "fall_down" + }, + { + "time": 19.2, + "type": "action", + "action": "punching" + }, + { + "time": 19.6, + "type": "action", + "action": "fall_down" + }, + { + "time": 19.6, + "type": "action", + "action": "punching" + }, + { + "time": 20.1, + "type": "action", + "action": "punching" + }, + { + "time": 20.6, + "type": "action", + "action": "punching" + }, + { + "time": 21.0, + "type": "action", + "action": "punching" + }, + { + "time": 24.3, + "type": "object", + "object": "helicopter", + "label": "mixing bowl", + "confidence": 0.316 + }, + { + "time": 28.5, + "type": "scene_change", + "score": 59.5 + }, + { + "time": 29.0, + "type": "action", + "action": "fall_down" + }, + { + "time": 32.2, + "type": "scene_change", + "score": 43.9 + }, + { + "time": 35.5, + "type": "object", + "object": "dog", + "label": "American black bear, black bear, Ursus americanus, Euarctos americanus", + "confidence": 0.495 + }, + { + "time": 35.5, + "type": "object", + "object": "cat", + "label": "American black bear, black bear, Ursus americanus, Euarctos americanus", + "confidence": 0.495 + }, + { + "time": 37.4, + "type": "scene_change", + "score": 575.2 + }, + { + "time": 44.8, + "type": "scene_change", + "score": 290.8 + }, + { + "time": 46.2, + "type": "object", + "object": "helicopter", + "label": "modem", + "confidence": 0.431 + }, + { + "time": 46.7, + "type": "object", + "object": "helicopter", + "label": "modem", + "confidence": 0.423 + }, + { + "time": 47.2, + "type": "object", + "object": "helicopter", + "label": "modem", + "confidence": 0.637 + }, + { + "time": 47.6, + "type": "object", + "object": "helicopter", + "label": "modem", + "confidence": 0.465 + }, + { + "time": 48.1, + "type": "scene_change", + "score": 61.0 + }, + { + "time": 56.1, + "type": "object", + "object": "helicopter", + "label": "modem", + "confidence": 0.31 + }, + { + "time": 61.7, + "type": "scene_change", + "score": 1091.5 + }, + { + "time": 62.1, + "type": "action", + "action": "punching" + }, + { + "time": 62.6, + "type": "action", + "action": "punching" + }, + { + "time": 63.1, + "type": "action", + "action": "punching" + }, + { + "time": 73.8, + "type": "scene_change", + "score": 108.2 + }, + { + "time": 81.7, + "type": "action", + "action": "fall_down" + }, + { + "time": 82.2, + "type": "action", + "action": "fall_down" + }, + { + "time": 83.1, + "type": "scene_change", + "score": 47.6 + }, + { + "time": 85.0, + "type": "action", + "action": "punching" + }, + { + "time": 87.4, + "type": "scene_change", + "score": 154.7 + }, + { + "time": 111.6, + "type": "object", + "object": "dog", + "label": "Appenzeller", + "confidence": 0.659 + }, + { + "time": 112.6, + "type": "action", + "action": "fall_down" + }, + { + "time": 114.9, + "type": "action", + "action": "fall_down" + }, + { + "time": 125.2, + "type": "action", + "action": "punching" + }, + { + "time": 128.5, + "type": "scene_change", + "score": 41.4 + }, + { + "time": 129.9, + "type": "scene_change", + "score": 120.8 + }, + { + "time": 134.1, + "type": "scene_change", + "score": 98.9 + }, + { + "time": 135.5, + "type": "action", + "action": "punching" + }, + { + "time": 137.8, + "type": "scene_change", + "score": 77.1 + }, + { + "time": 151.8, + "type": "scene_change", + "score": 43.4 + }, + { + "time": 165.8, + "type": "action", + "action": "fall_down" + }, + { + "time": 169.6, + "type": "action", + "action": "punching" + }, + { + "time": 170.5, + "type": "scene_change", + "score": 128.1 + }, + { + "time": 171.9, + "type": "scene_change", + "score": 99.4 + }, + { + "time": 173.3, + "type": "scene_change", + "score": 2631.0 + }, + { + "time": 176.1, + "type": "scene_change", + "score": 505.2 + }, + { + "time": 176.6, + "type": "scene_change", + "score": 685.1 + }, + { + "time": 179.4, + "type": "scene_change", + "score": 97.9 + }, + { + "time": 181.7, + "type": "scene_change", + "score": 58.8 + }, + { + "time": 184.1, + "type": "scene_change", + "score": 206.3 + }, + { + "time": 184.5, + "type": "scene_change", + "score": 46.5 + } + ] +} \ No newline at end of file diff --git "a/C4GT \342\200\223 DMP 2026 _ Planet Read.pdf" "b/C4GT \342\200\223 DMP 2026 _ Planet Read.pdf" new file mode 100644 index 0000000..bbf35e9 Binary files /dev/null and "b/C4GT \342\200\223 DMP 2026 _ Planet Read.pdf" differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..d8db70a --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# Intelligent Closed Caption Generation + +This project implements an intelligent, multi-modal closed captioning (CC) pipeline. It extracts non-verbal audio cues, visual actions, and objects, and fuses them alongside speech transcripts to create rich, context-aware subtitles. + +## Project File Structure + +```text +Intelligent-cc-generation/ +├── .git/ +├── .ruff_cache/ +├── __pycache__/ +├── sample_audio/ +├── README.md # Project documentation +├── requirements.txt # Python dependencies +├── create_demo_samples.py # Script to generate demo/sample data +├── demo_pipeline.py # Main execution script for the pipeline +├── demo_module1.py # CLI demo for Module 1 (Audio) +├── demo_module2.py # CLI demo for Module 2 (Visual) +├── cc_decision_engine.py # Module 3: Logic for prioritizing/merging events +├── cc_output.py # Formatter for generating SRT/SLS/JSON +├── sound_event_detection.py # Audio event classification logic +├── yamnet.tflite # TFLite model for audio event detection +├── efficientnet_lite0.tflite # TFLite model for object detection +├── face_detector.tflite # TFLite model for face detection +├── pose_landmarker.task # MediaPipe task for action detection +└── imagenet_labels.txt # Labels for visual object classification +``` + +## Workflow Modules + +### Module 1: Sound Event Detection +Detects environmental sounds (car horns, gunshots, dog barks, sirens, etc.) from audio using YAMNet and MediaPipe. Outputs sound events with precise timestamps and confidence scores. + +* **Files**: `sound_event_detection.py`, `demo_module1.py` +* **Usage**: `python demo_module1.py my_file.wav` + +### Module 2: Visual Detection +Analyzes video frames for visual context using MediaPipe and TensorFlow Lite models. It detects objects (e.g., cars, weapons), actions/poses (e.g., punching, falling), and facial expressions. + +* **Files**: `demo_module2.py` +* **Models used**: `efficientnet_lite0.tflite`, `face_detector.tflite`, `pose_landmarker.task` + +### Module 3: Fusion & Decision Engine +Fuses the outputs from Module 1 (audio events), Module 2 (visual events), and a speech transcript (SRT) into a unified timeline. The Decision Engine resolves conflicts, prioritizes overlapping events, and formats them into cohesive captions. Finally, it outputs standard subtitle files (SRT, SLS, JSON). + +* **Files**: `cc_decision_engine.py`, `cc_output.py`, `demo_pipeline.py` + +## Quick Start (End-to-End Pipeline) + +```bash +# 1. Install dependencies +pip install -r requirements.txt + +# 2. Run the end-to-end pipeline on a video +python demo_pipeline.py "Avengers vs Ultron.mp4" + +# 3. To include an existing speech transcript (SRT): +python demo_pipeline.py "Avengers vs Ultron.mp4" --speech transcript.srt + +# 4. To reuse previously generated intermediate JSON files: +python demo_pipeline.py "Avengers vs Ultron.mp4" --reuse +``` + +## Example Output (Sample Captions) + +``` +[0.0s-1.0s] [audio] [Music] (41%); [Speech] (80%) +[0.5s-4.3s] [visual_action] ⚡ Fall Down +[2.9s-3.9s] [visual_action] ⚡ Punching +[4.8s-5.8s] [audio] [Music] (26%); [Speech] (80%) +[6.8s-7.8s] [audio] [Smash, crash] (20%) +``` + +## Limitations + +- ESC-50 samples downloaded via `create_demo_samples.py` are CC-BY-NC. +- YAMNet is limited to the 521 AudioSet classes. diff --git a/__pycache__/cc_decision_engine.cpython-313.pyc b/__pycache__/cc_decision_engine.cpython-313.pyc new file mode 100644 index 0000000..af3c1d3 Binary files /dev/null and b/__pycache__/cc_decision_engine.cpython-313.pyc differ diff --git a/__pycache__/cc_output.cpython-313.pyc b/__pycache__/cc_output.cpython-313.pyc new file mode 100644 index 0000000..0179b61 Binary files /dev/null and b/__pycache__/cc_output.cpython-313.pyc differ diff --git a/__pycache__/demo_module2.cpython-313.pyc b/__pycache__/demo_module2.cpython-313.pyc new file mode 100644 index 0000000..003fb1c Binary files /dev/null and b/__pycache__/demo_module2.cpython-313.pyc differ diff --git a/__pycache__/demo_pipeline.cpython-313.pyc b/__pycache__/demo_pipeline.cpython-313.pyc new file mode 100644 index 0000000..e51f288 Binary files /dev/null and b/__pycache__/demo_pipeline.cpython-313.pyc differ diff --git a/__pycache__/sound_event_detection.cpython-313.pyc b/__pycache__/sound_event_detection.cpython-313.pyc new file mode 100644 index 0000000..be84ef8 Binary files /dev/null and b/__pycache__/sound_event_detection.cpython-313.pyc differ diff --git a/cc_decision_engine.py b/cc_decision_engine.py new file mode 100644 index 0000000..f1b772b --- /dev/null +++ b/cc_decision_engine.py @@ -0,0 +1,412 @@ +""" +CC Decision Engine - Fuses audio events, visual events, and speech transcripts +into a unified closed caption timeline with intelligent prioritization. + +Inputs (all optional): + - Audio events: sound_event_detection.DetectionResult.to_dict() or JSON file + - Visual events: demo_module2.analyze() output dict or JSON file + - Speech transcript: path to existing SRT file or SRT string + +Algorithm: + 1. Parse all inputs into UnifiedEvent objects with priority levels + 2. Remove non-caption events (scene changes used as metadata only) + 3. Sort by start_time, then priority (ascending) + 4. Resolve overlaps: higher priority preempts lower priority + 5. Merge adjacent same-priority events + 6. Apply duration constraints (min 1.0s, max 6.0s) + 7. Output the unified timeline +""" + +import json +import re +from dataclasses import dataclass +from typing import List, Optional, Dict, Any +from enum import IntEnum + + +# --------------------------------------------------------------------------- +# Priority levels (lower number = higher priority) +# --------------------------------------------------------------------------- + + +class Priority(IntEnum): + CRITICAL = 1 # gunshots, explosions, falls + HIGH = 2 # horns, punches, sirens, alarms + MEDIUM = 3 # vehicles, engines, animals + LOW = 4 # scene changes (metadata only) + SPEECH = 5 # speech transcription + + +_MAX_CAPTION_DURATION = 6.0 +_MIN_CAPTION_DURATION = 1.0 +_COOLDOWN = 0.3 +_SCENE_PRIORITY = int(Priority.LOW) +_DEFAULT_PRIORITY = int(Priority.MEDIUM) + +# Sound class keywords -> priority +_SOUND_PRIORITY: Dict[str, int] = { + "gunshot": int(Priority.CRITICAL), + "gunfire": int(Priority.CRITICAL), + "explosion": int(Priority.CRITICAL), + "glass": int(Priority.CRITICAL), + "breaking": int(Priority.CRITICAL), + "siren": int(Priority.HIGH), + "alarm": int(Priority.HIGH), + "horn": int(Priority.HIGH), + "honking": int(Priority.HIGH), + "vehicle": int(Priority.MEDIUM), + "engine": int(Priority.MEDIUM), + "dog": int(Priority.MEDIUM), + "bark": int(Priority.MEDIUM), + "cat": int(Priority.MEDIUM), + "person": int(Priority.MEDIUM), + "speech": int(Priority.SPEECH), + "music": int(Priority.SPEECH), +} + +_ACTION_PRIORITY: Dict[str, int] = { + "fall_down": int(Priority.CRITICAL), + "punching": int(Priority.HIGH), +} + + +# --------------------------------------------------------------------------- +# Data models +# --------------------------------------------------------------------------- + + +@dataclass +class UnifiedEvent: + """A single caption event in the unified timeline.""" + + start_time: float + end_time: float + caption: str + event_type: str # audio / visual_object / visual_action / speech + source_label: str + confidence: float + priority: int + + def duration(self) -> float: + return self.end_time - self.start_time + + def to_dict(self) -> dict: + return { + "start_time": round(self.start_time, 3), + "end_time": round(self.end_time, 3), + "caption": self.caption, + "event_type": self.event_type, + "source_label": self.source_label, + "confidence": round(self.confidence, 3), + "priority": self.priority, + } + + +# --------------------------------------------------------------------------- +# Parsers +# --------------------------------------------------------------------------- + + +def _priority_for_sound(sound_class: str) -> int: + cls = sound_class.lower() + for kw, pri in _SOUND_PRIORITY.items(): + if kw in cls: + return pri + return _DEFAULT_PRIORITY + + +def _priority_for_action(action: str) -> int: + return _ACTION_PRIORITY.get(action, int(Priority.HIGH)) + + +def _fmt_caption(event_type: str, label: str, detail: str, confidence: float) -> str: + if event_type == "speech": + return detail + if event_type == "visual_action": + return f"\u26a1 {detail}" + if event_type == "visual_object": + return f"\U0001f441 {detail}" + conf_str = f"({confidence:.0%})" if event_type == "audio" else "" + return f"[{label}] {conf_str}".strip() + + +def parse_audio_events(audio_data: Dict[str, Any]) -> List[UnifiedEvent]: + events = [] + for e in audio_data.get("events", []): + sc = e.get("sound_class", "") + pri = _priority_for_sound(sc) + caption = _fmt_caption("audio", sc, sc, e.get("confidence", 0)) + events.append( + UnifiedEvent( + start_time=e["start_time"], + end_time=e["end_time"], + caption=caption, + event_type="audio", + source_label=sc, + confidence=e.get("confidence", 0.0), + priority=pri, + ) + ) + return events + + +def parse_audio_file(path: str) -> List[UnifiedEvent]: + with open(path) as f: + return parse_audio_events(json.load(f)) + + +def parse_visual_events(visual_data: Dict[str, Any]) -> List[UnifiedEvent]: + events = [] + for e in visual_data.get("events", []): + t = e.get("type", "") + time = e.get("time", 0.0) + + if t == "object": + obj = e.get("object", "unknown") + label = e.get("label", "") + pri = _SOUND_PRIORITY.get(obj, int(Priority.MEDIUM)) + detail = f"{obj} ({label})" if label else obj + events.append( + UnifiedEvent( + start_time=time, + end_time=time + 1.5, + caption=_fmt_caption( + "visual_object", obj, detail, e.get("confidence", 0) + ), + event_type="visual_object", + source_label=obj, + confidence=e.get("confidence", 0.0), + priority=pri, + ) + ) + + elif t == "action": + action = e.get("action", "") + pri = _priority_for_action(action) + detail = action.replace("_", " ").title() + events.append( + UnifiedEvent( + start_time=time, + end_time=time + 2.0, + caption=_fmt_caption("visual_action", action, detail, 1.0), + event_type="visual_action", + source_label=action, + confidence=1.0, + priority=pri, + ) + ) + + elif t == "scene_change": + events.append( + UnifiedEvent( + start_time=time, + end_time=time + 0.5, + caption="", + event_type="scene_change", + source_label="scene_change", + confidence=0.0, + priority=_SCENE_PRIORITY, + ) + ) + + return events + + +def parse_visual_file(path: str) -> List[UnifiedEvent]: + with open(path) as f: + return parse_visual_events(json.load(f)) + + +_SRT_TIME_RE = re.compile(r"(\d+):(\d+):(\d+)[,.](\d+)") + + +def _parse_srt_time(s: str) -> float: + m = _SRT_TIME_RE.match(s.strip()) + if not m: + return 0.0 + h, mi, sec, ms = int(m[1]), int(m[2]), int(m[3]), int(m[4]) + return h * 3600 + mi * 60 + sec + ms / 1000.0 + + +def parse_speech_srt(srt_text: str) -> List[UnifiedEvent]: + if not srt_text or not srt_text.strip(): + return [] + events = [] + blocks = re.split(r"\n\s*\n", srt_text.strip()) + for block in blocks: + lines = [l.strip() for l in block.split("\n") if l.strip()] + if len(lines) < 2: + continue + time_line = None + text_lines = [] + for line in lines: + if "-->" in line: + time_line = line + elif not line.isdigit(): + text_lines.append(line) + if not time_line or not text_lines: + continue + parts = time_line.split("-->") + start = _parse_srt_time(parts[0]) + end = _parse_srt_time(parts[1]) + text = " ".join(text_lines) + events.append( + UnifiedEvent( + start_time=start, + end_time=end, + caption=text, + event_type="speech", + source_label="speech", + confidence=1.0, + priority=int(Priority.SPEECH), + ) + ) + return events + + +def parse_speech_file(path: str) -> List[UnifiedEvent]: + with open(path, encoding="utf-8") as f: + return parse_speech_srt(f.read()) + + +# --------------------------------------------------------------------------- +# Fusion logic +# --------------------------------------------------------------------------- + + +def _overlaps(a: UnifiedEvent, b: UnifiedEvent) -> bool: + return a.start_time < b.end_time and b.start_time < a.end_time + + +def _merge_events(events: List[UnifiedEvent]) -> List[UnifiedEvent]: + """Merge adjacent events that have the same caption within a small gap.""" + if not events: + return [] + merged = [events[0]] + for e in events[1:]: + prev = merged[-1] + gap = e.start_time - prev.end_time + if ( + gap <= _COOLDOWN + and e.caption == prev.caption + and e.priority == prev.priority + ): + prev.end_time = max(prev.end_time, e.end_time) + else: + merged.append(e) + return merged + + +def fuse( + audio_events: Optional[List[UnifiedEvent]] = None, + visual_events: Optional[List[UnifiedEvent]] = None, + speech_events: Optional[List[UnifiedEvent]] = None, +) -> List[UnifiedEvent]: + """Fuse all event sources into a single prioritized caption timeline. + + Algorithm: + 1. Collect all events, marking scene changes as metadata + 2. Sort by start_time then priority (lower number = higher priority) + 3. Walk through events; when higher-priority event overlaps a + lower-priority one, truncate/preempt the lower-priority event + 4. Clamp durations to [MIN_CAPTION_DURATION, MAX_CAPTION_DURATION] + 5. Merge adjacent same-caption events + """ + all_events: List[UnifiedEvent] = [] + if audio_events: + all_events.extend(audio_events) + if visual_events: + all_events.extend(visual_events) + if speech_events: + all_events.extend(speech_events) + + if not all_events: + return [] + + # Sort: time ascending, then priority ascending (higher priority first) + all_events.sort(key=lambda e: (e.start_time, e.priority)) + + # --- conflict resolution pass --- + resolved: List[UnifiedEvent] = [] + for e in all_events: + if e.event_type == "scene_change": + continue # scene changes are metadata, not captions + if not e.caption: + continue + + # Check overlaps with already-resolved events + inserted = False + for i, existing in enumerate(resolved): + if not _overlaps(e, existing): + continue + # Same source label: extend end_time + if ( + e.source_label == existing.source_label + and e.caption == existing.caption + ): + existing.end_time = max(existing.end_time, e.end_time) + inserted = True + break + # Different: higher priority wins + if e.priority < existing.priority: + # New event is higher priority — truncate existing + if e.start_time <= existing.start_time: + # New event completely covers the old + resolved[i] = e + else: + # Truncate existing, insert new + existing.end_time = e.start_time + resolved.insert(i + 1, e) + inserted = True + break + elif e.priority > existing.priority: + # Existing event is higher priority — skip or truncate new + if existing.end_time >= e.end_time: + inserted = True # fully covered, discard + break + else: + e.start_time = existing.end_time + # fall through to try next or append + else: + # Same priority: merge captions if overlapping + existing.end_time = max(existing.end_time, e.end_time) + if e.caption not in existing.caption: + existing.caption = f"{existing.caption}; {e.caption}" + existing.confidence = max(existing.confidence, e.confidence) + inserted = True + break + + if not inserted: + resolved.append(e) + + # --- clamp durations --- + for e in resolved: + if e.duration() > _MAX_CAPTION_DURATION: + e.end_time = e.start_time + _MAX_CAPTION_DURATION + if e.duration() < _MIN_CAPTION_DURATION and e.event_type != "speech": + e.end_time = e.start_time + _MIN_CAPTION_DURATION + + # --- merge adjacent same-caption events --- + resolved = _merge_events(resolved) + + return resolved + + +# --------------------------------------------------------------------------- +# Convenience: load everything from files and fuse +# --------------------------------------------------------------------------- + + +def fuse_from_files( + audio_json: Optional[str] = None, + visual_json: Optional[str] = None, + speech_srt: Optional[str] = None, +) -> List[UnifiedEvent]: + audio = parse_audio_file(audio_json) if audio_json else None + visual = parse_visual_file(visual_json) if visual_json else None + speech = parse_speech_file(speech_srt) if speech_srt else None + return fuse(audio, visual, speech) + + +def fused_to_dicts(events: List[UnifiedEvent]) -> List[dict]: + return [e.to_dict() for e in events] diff --git a/cc_output.py b/cc_output.py new file mode 100644 index 0000000..e443203 --- /dev/null +++ b/cc_output.py @@ -0,0 +1,98 @@ +""" +CC Output Generators - Produces SRT and SLS caption files from a fused +UnifiedEvent timeline. + +Formats: + - SRT: Standard SubRip subtitle format with enriched captions combining + audio, visual, and speech signals. + - SLS: Styled caption format (SAMI-like) with CSS classes per event type, + enabling color-coded display of audio events, visual events, and + speech in media players that support SAMI. +""" + +from typing import List, Optional +from cc_decision_engine import UnifiedEvent + + +# --------------------------------------------------------------------------- +# SRT output +# --------------------------------------------------------------------------- + + +def _fmt_srt_time(seconds: float) -> str: + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + ms = int(round((seconds - int(seconds)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + +def to_srt(events: List[UnifiedEvent], filepath: Optional[str] = None) -> str: + lines: List[str] = [] + for idx, e in enumerate(events, start=1): + lines.append(str(idx)) + lines.append(f"{_fmt_srt_time(e.start_time)} --> {_fmt_srt_time(e.end_time)}") + lines.append(e.caption) + lines.append("") + text = "\n".join(lines) + if filepath: + with open(filepath, "w", encoding="utf-8") as f: + f.write(text) + return text + + +# --------------------------------------------------------------------------- +# SLS (Styled Caption) output +# --------------------------------------------------------------------------- + +_SLS_CSS_CLASSES = { + "audio": "cc-audio", + "visual_object": "cc-visual-object", + "visual_action": "cc-visual-action", + "speech": "cc-speech", +} + +_SLS_HEADER = """ + + +Intelligent Closed Captions + + + +""" + +_SLS_FOOTER = """ + +""" + + +def to_sls(events: List[UnifiedEvent], filepath: Optional[str] = None) -> str: + lines: List[str] = [_SLS_HEADER] + for e in events: + css = _SLS_CSS_CLASSES.get(e.event_type, "cc-speech") + start_ms = int(e.start_time * 1000) + lines.append( + f' \n

{_xml_escape(e.caption)}' + ) + lines.append(_SLS_FOOTER) + text = "\n".join(lines) + if filepath: + with open(filepath, "w", encoding="utf-8") as f: + f.write(text) + return text + + +def _xml_escape(s: str) -> str: + return ( + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) diff --git a/create_demo_samples.py b/create_demo_samples.py new file mode 100644 index 0000000..a5f6835 --- /dev/null +++ b/create_demo_samples.py @@ -0,0 +1,27 @@ +""" +Download sample audio clips with real sounds (ESC-50 dataset, CC-BY-NC). +""" + +import os, urllib.request + +DEST = os.path.join(os.path.dirname(__file__), "sample_audio") +os.makedirs(DEST, exist_ok=True) + +BASE = "https://github.com/karoldvl/ESC-50/raw/master/audio/" + +SAMPLES = [ + ("car_horn.wav", "1-17124-A-43.wav", "Car horn"), + ("siren.wav", "1-31482-A-42.wav", "Police siren"), + ("dog_bark.wav", "1-100032-A-0.wav", "Dog barking"), + ("glass_break.wav", "1-20133-A-39.wav", "Glass breaking"), + ("gunshot.wav", "1-115545-A-48.wav", "Fireworks (detected as gunshot/explosion)"), + ("engine.wav", "1-18527-A-44.wav", "Engine running"), +] + +print("Downloading sample audio clips...") +for local, remote, desc in SAMPLES: + path = os.path.join(DEST, local) + if not os.path.exists(path): + print(f" {local} ({desc})") + urllib.request.urlretrieve(BASE + remote, path) +print(f"\nDone. {len(SAMPLES)} files in '{DEST}'") diff --git a/demo_module1.py b/demo_module1.py new file mode 100644 index 0000000..2a6e612 --- /dev/null +++ b/demo_module1.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Sound Event Detection Demo (Module 1). + +Usage: + python demo_module1.py sample_audio/car_horn.wav + python demo_module1.py --all + python demo_module1.py my_video.mp4 +""" + +import sys, os, json + +sys.path.insert(0, os.path.dirname(__file__)) +from sound_event_detection import SoundEventDetector + +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample_audio") +THRESHOLD = 0.15 + +ALL_SAMPLES = [ + ("car_horn.wav", ["Vehicle horn, car horn, honking"]), + ("siren.wav", ["Siren", "Alarm"]), + ("dog_bark.wav", ["Dog", "Bark"]), + ("glass_break.wav", ["Breaking"]), + ("gunshot.wav", ["Gunshot, gunfire", "Explosion", "Fireworks"]), + ("engine.wav", ["Vehicle", "Engine"]), +] + + +def detect(path, threshold=THRESHOLD): + detector = SoundEventDetector(confidence_threshold=threshold) + if path.lower().endswith(".wav"): + result = detector.detect_from_file(path) + else: + result = detector.detect_from_video(path) + detector.close() + return result + + +def run_all(): + print("=" * 55) + print(" Sound Event Detection Demo (Module 1)") + print("=" * 55) + + if not os.path.isdir(SAMPLE_DIR): + print("Run 'python create_demo_samples.py' first.") + return + + for fname, expected in ALL_SAMPLES: + path = os.path.join(SAMPLE_DIR, fname) + if not os.path.exists(path): + continue + + result = detect(path) + detected = {e.sound_class for e in result.events} + matched = detected & set(expected) + + print(f"\n {fname}") + print(f" Duration: {result.duration_seconds:.1f}s") + if result.events: + for e in result.events: + icon = " <<<" if e.sound_class in expected else "" + print( + f" [{e.start_time:.1f}s] {e.sound_class:35s} " + f"({e.confidence:.0%}){icon}" + ) + else: + print(" (no events above threshold)") + if matched: + print(f" => Detected: {', '.join(sorted(matched))}") + else: + print(f" => Expected: {', '.join(expected)}") + + print() + + +def main(): + if len(sys.argv) < 2: + print(__doc__.strip()) + return + + if sys.argv[1] == "--all": + run_all() + return + + path = sys.argv[1] + if not os.path.exists(path): + print(f"File not found: {path}") + sys.exit(1) + + result = detect(path) + print(f"\n File: {os.path.basename(path)}") + print(f" Duration: {result.duration_seconds:.1f}s") + print(f" Events:") + for e in result.events: + print( + f" [{e.start_time:.1f}s - {e.end_time:.1f}s] " + f"{e.sound_class:35s} ({e.confidence:.0%})" + ) + + out = os.path.splitext(path)[0] + "_events.json" + result.to_json(out) + print(f"\n Results saved to: {out}") + + +if __name__ == "__main__": + main() diff --git a/demo_module2.py b/demo_module2.py new file mode 100644 index 0000000..45769de --- /dev/null +++ b/demo_module2.py @@ -0,0 +1,187 @@ +import sys, os, json, urllib.request + +import cv2 +import numpy as np + +import mediapipe as mp +from mediapipe.tasks import python +from mediapipe.tasks.python import vision + +CLASSIFIER_URL = ( + "https://storage.googleapis.com/mediapipe-models/" + "image_classifier/efficientnet_lite0/float32/1/" + "efficientnet_lite0.tflite" +) +LABELS_URL = "https://raw.githubusercontent.com/google-coral/test_data/master/imagenet_labels.txt" +MODEL_PATH = "efficientnet_lite0.tflite" +LABELS_PATH = "imagenet_labels.txt" + +TARGETS = { + "helicopter": list(range(650, 670)), + "airplane": [404, 895], + "vehicle": [654, 468, 511, 627, 661, 581, 609, 864, 817, 656], + "person": [708], + "dog": list(range(151, 300)), + "cat": list(range(281, 300)), + "motorcycle": [661, 670], + "bicycle": [444, 671], +} + + +def _download(url, path): + if not os.path.exists(path): + print(f" Downloading {os.path.basename(path)}...") + urllib.request.urlretrieve(url, path) + + +def _detect_actions(landmarks, h, w): + actions = [] + if not landmarks: + return actions + + def y(lm): + return (1 - lm.y) * h + + nose_y = y(landmarks[0]) + hip_y = (y(landmarks[23]) + y(landmarks[24])) / 2 + ankle_y = (y(landmarks[27]) + y(landmarks[28])) / 2 + height = abs(nose_y - ankle_y) + width = abs(nose_y - hip_y) + if height > 30 and width < height * 0.4: + actions.append("fall_down") + mx = (landmarks[11].x + landmarks[12].x) / 2 * w + wrist_dist = abs(landmarks[15].x * w - mx) + abs(landmarks[16].x * w - mx) + if wrist_dist > w * 0.7: + actions.append("punching") + return actions + + +def analyze(video_path, interval=0.5): + _download(CLASSIFIER_URL, MODEL_PATH) + _download(LABELS_URL, LABELS_PATH) + + with open(LABELS_PATH) as f: + all_labels = [l.strip() for l in f.readlines()] + + # MediaPipe Image Classifier + base_opts = python.BaseOptions(model_asset_path=MODEL_PATH) + img_opts = vision.ImageClassifierOptions(base_options=base_opts, max_results=5) + classifier = vision.ImageClassifier.create_from_options(img_opts) + + # MediaPipe Pose + pose_path = "pose_landmarker.task" + if not os.path.exists(pose_path): + print(" Downloading pose model...") + urllib.request.urlretrieve( + "https://storage.googleapis.com/mediapipe-models/" + "pose_landmarker/pose_landmarker_heavy/float16/1/" + "pose_landmarker_heavy.task", + pose_path, + ) + pose_opts = vision.PoseLandmarkerOptions( + base_options=python.BaseOptions(model_asset_path=pose_path), + min_pose_detection_confidence=0.4, + ) + pose_det = vision.PoseLandmarker.create_from_options(pose_opts) + + cap = cv2.VideoCapture(video_path) + fps = cap.get(cv2.CAP_PROP_FPS) or 30 + skip = int(fps * interval) + events, prev_hist, frame_idx = [], None, 0 + + while True: + ret, frame = cap.read() + if not ret: + break + if frame_idx % skip != 0: + frame_idx += 1 + continue + + t = round(frame_idx / fps, 1) + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + h, w = frame.shape[:2] + mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb) + + # Object classification + cls_result = classifier.classify(mp_img) + if cls_result.classifications: + for cat in cls_result.classifications[0].categories: + if cat.score > 0.3 and cat.index < len(all_labels): + label = all_labels[cat.index] + for name, ids in TARGETS.items(): + if cat.index in ids: + events.append( + { + "time": t, + "type": "object", + "object": name, + "label": label, + "confidence": round(cat.score, 3), + } + ) + + # Scene change + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + hist = cv2.calcHist([gray], [0], None, [64], [0, 256]) + cv2.normalize(hist, hist) + hist = hist.flatten() + if prev_hist is not None: + diff = cv2.compareHist(prev_hist, hist, cv2.HISTCMP_CHISQR) + if diff > 40: + events.append( + {"time": t, "type": "scene_change", "score": round(diff, 1)} + ) + prev_hist = hist + + # Pose actions + pose_res = pose_det.detect(mp_img) + for lm in ( + pose_res.pose_landmarks if hasattr(pose_res, "pose_landmarks") else [] + ): + for action in _detect_actions(lm, h, w): + events.append({"time": t, "type": "action", "action": action}) + + frame_idx += 1 + + cap.release() + classifier.close() + pose_det.close() + + return { + "video": os.path.basename(video_path), + "duration": round(frame_idx / fps, 1), + "total_events": len(events), + "events": events, + } + + +def main(): + if len(sys.argv) < 2: + print("Usage: python demo_module2.py video.mp4") + return + path = sys.argv[1] + if not os.path.exists(path): + print(f"File not found: {path}") + return + print(f"Analyzing: {path}") + result = analyze(path) + print(f"\nDuration: {result['duration']}s | Events: {result['total_events']}") + for e in result["events"][:20]: + if e["type"] == "object": + print( + f" [{e['time']}s] {e['object']} ({e['label']}) {e['confidence']:.0%}" + ) + elif e["type"] == "scene_change": + print(f" [{e['time']}s] scene change") + elif e["type"] == "action": + print(f" [{e['time']}s] action: {e['action']}") + if len(result["events"]) > 20: + print(f" ... and {len(result['events']) - 20} more") + out = os.path.splitext(path)[0] + "_visual.json" + with open(out, "w") as f: + json.dump(result, f, indent=2) + print(f"Saved: {out}") + + +if __name__ == "__main__": + main() diff --git a/demo_pipeline.py b/demo_pipeline.py new file mode 100644 index 0000000..4d214e9 --- /dev/null +++ b/demo_pipeline.py @@ -0,0 +1,151 @@ +import sys +import os +import json +import argparse +from typing import List + +from cc_decision_engine import ( + fuse, + fuse_from_files, + parse_audio_events, + parse_visual_events, + parse_speech_srt, + UnifiedEvent, + fused_to_dicts, +) +from cc_output import to_srt, to_sls + + +def run_pipeline( + video_path: str, + speech_srt_path: str = None, + reuse: bool = False, + separate: bool = False, +): + base = os.path.splitext(video_path)[0] + + # -- Step 1: Sound event detection (Module 1) -- + audio_json_path = base + "_events.json" + if reuse and os.path.exists(audio_json_path): + print(f"[1/3] Reusing audio events from {audio_json_path}") + audio_events = None # loaded via fuse_from_files + else: + print("[1/3] Running Sound Event Detection (Module 1)...") + from sound_event_detection import SoundEventDetector + + detector = SoundEventDetector(confidence_threshold=0.15, use_separation=separate) + result = detector.detect_from_video(video_path) + result.to_json(audio_json_path) + audio_events = parse_audio_events(result.to_dict()) + detector.close() + print(f" Found {len(audio_events)} audio events") + + # -- Step 2: Visual detection (Module 2) -- + visual_json_path = base + "_visual.json" + if reuse and os.path.exists(visual_json_path): + print(f"[2/3] Reusing visual events from {visual_json_path}") + visual_events = None + else: + print("[2/3] Running Visual Detection (Module 2)...") + from demo_module2 import analyze + + visual_result = analyze(video_path, interval=0.5) + visual_events = parse_visual_events(visual_result) + print(f" Found {len(visual_events)} visual events") + + # Load speech + speech_events = None + if speech_srt_path: + if os.path.exists(speech_srt_path): + print(f"[2b/3] Loading speech transcript from {speech_srt_path}") + speech_events = parse_speech_srt( + open(speech_srt_path, encoding="utf-8").read() + ) + print(f" Found {len(speech_events)} speech segments") + else: + print(f" [!] Speech file not found: {speech_srt_path}") + + # -- Step 3: Fusion -- + print("[3/3] Fusing signals into unified timeline...") + if reuse: + fused = fuse_from_files( + audio_json=audio_json_path if audio_events is None else None, + visual_json=visual_json_path if visual_events is None else None, + speech_srt=speech_srt_path, + ) + else: + if audio_events is None: + from cc_decision_engine import parse_audio_file + + audio_events = parse_audio_file(audio_json_path) + if visual_events is None: + from cc_decision_engine import parse_visual_file + + visual_events = parse_visual_file(visual_json_path) + fused = fuse(audio_events, visual_events, speech_events) + + print(f" Unified timeline: {len(fused)} caption events") + + # -- Step 4: Output -- + srt_path = base + "_cc.srt" + sls_path = base + "_cc.sls" + json_path = base + "_cc_fused.json" + + to_srt(fused, srt_path) + print(f" SRT -> {srt_path}") + + to_sls(fused, sls_path) + print(f" SLS -> {sls_path}") + + with open(json_path, "w", encoding="utf-8") as f: + json.dump(fused_to_dicts(fused), f, indent=2) + print(f" JSON -> {json_path}") + + # -- Summary -- + _print_summary(fused, video_path) + return fused + + +def _print_summary(events: List[UnifiedEvent], video_path: str): + counts: dict = {} + for e in events: + counts[e.event_type] = counts.get(e.event_type, 0) + 1 + + print(f"\n{'=' * 50}") + print(f"Pipeline complete: {os.path.basename(video_path)}") + print(f"{'=' * 50}") + for etype, cnt in sorted(counts.items()): + print(f" {etype:20s}: {cnt}") + print(f"{'=' * 50}") + print("\nSample captions (first 10):") + for e in events[:10]: + try: + sys.stdout.write( + f" [{e.start_time:.1f}s-{e.end_time:.1f}s] [{e.event_type}] {e.caption}\n" + ) + except UnicodeEncodeError: + safe = e.caption.encode("ascii", errors="replace").decode("ascii") + print(f" [{e.start_time:.1f}s-{e.end_time:.1f}s] [{e.event_type}] {safe}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Intelligent CC Generation Pipeline") + parser.add_argument("video", help="Path to video file") + parser.add_argument("--speech", help="Path to SRT speech transcript", default=None) + parser.add_argument( + "--reuse", + action="store_true", + help="Reuse existing _events.json and _visual.json instead of re-running detection", + ) + parser.add_argument( + "--separate", + action="store_true", + help="Separate vocals/dialogue from background audio before running classification", + ) + args = parser.parse_args() + + if not os.path.exists(args.video): + print(f"File not found: {args.video}") + sys.exit(1) + + run_pipeline(args.video, speech_srt_path=args.speech, reuse=args.reuse, separate=args.separate) diff --git a/efficientnet_lite0.tflite b/efficientnet_lite0.tflite new file mode 100644 index 0000000..9ff2e5b Binary files /dev/null and b/efficientnet_lite0.tflite differ diff --git a/face_detector.tflite b/face_detector.tflite new file mode 100644 index 0000000..1b2b522 Binary files /dev/null and b/face_detector.tflite differ diff --git a/imagenet_labels.txt b/imagenet_labels.txt new file mode 100644 index 0000000..0f975ff --- /dev/null +++ b/imagenet_labels.txt @@ -0,0 +1,1001 @@ +background +tench, Tinca tinca +goldfish, Carassius auratus +great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +tiger shark, Galeocerdo cuvieri +hammerhead, hammerhead shark +electric ray, crampfish, numbfish, torpedo +stingray +cock +hen +ostrich, Struthio camelus +brambling, Fringilla montifringilla +goldfinch, Carduelis carduelis +house finch, linnet, Carpodacus mexicanus +junco, snowbird +indigo bunting, indigo finch, indigo bird, Passerina cyanea +robin, American robin, Turdus migratorius +bulbul +jay +magpie +chickadee +water ouzel, dipper +kite +bald eagle, American eagle, Haliaeetus leucocephalus +vulture +great grey owl, great gray owl, Strix nebulosa +European fire salamander, Salamandra salamandra +common newt, Triturus vulgaris +eft +spotted salamander, Ambystoma maculatum +axolotl, mud puppy, Ambystoma mexicanum +bullfrog, Rana catesbeiana +tree frog, tree-frog +tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +loggerhead, loggerhead turtle, Caretta caretta +leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +mud turtle +terrapin +box turtle, box tortoise +banded gecko +common iguana, iguana, Iguana iguana +American chameleon, anole, Anolis carolinensis +whiptail, whiptail lizard +agama +frilled lizard, Chlamydosaurus kingi +alligator lizard +Gila monster, Heloderma suspectum +green lizard, Lacerta viridis +African chameleon, Chamaeleo chamaeleon +Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +African crocodile, Nile crocodile, Crocodylus niloticus +American alligator, Alligator mississipiensis +triceratops +thunder snake, worm snake, Carphophis amoenus +ringneck snake, ring-necked snake, ring snake +hognose snake, puff adder, sand viper +green snake, grass snake +king snake, kingsnake +garter snake, grass snake +water snake +vine snake +night snake, Hypsiglena torquata +boa constrictor, Constrictor constrictor +rock python, rock snake, Python sebae +Indian cobra, Naja naja +green mamba +sea snake +horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +diamondback, diamondback rattlesnake, Crotalus adamanteus +sidewinder, horned rattlesnake, Crotalus cerastes +trilobite +harvestman, daddy longlegs, Phalangium opilio +scorpion +black and gold garden spider, Argiope aurantia +barn spider, Araneus cavaticus +garden spider, Aranea diademata +black widow, Latrodectus mactans +tarantula +wolf spider, hunting spider +tick +centipede +black grouse +ptarmigan +ruffed grouse, partridge, Bonasa umbellus +prairie chicken, prairie grouse, prairie fowl +peacock +quail +partridge +African grey, African gray, Psittacus erithacus +macaw +sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +lorikeet +coucal +bee eater +hornbill +hummingbird +jacamar +toucan +drake +red-breasted merganser, Mergus serrator +goose +black swan, Cygnus atratus +tusker +echidna, spiny anteater, anteater +platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +wallaby, brush kangaroo +koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +wombat +jellyfish +sea anemone, anemone +brain coral +flatworm, platyhelminth +nematode, nematode worm, roundworm +conch +snail +slug +sea slug, nudibranch +chiton, coat-of-mail shell, sea cradle, polyplacophore +chambered nautilus, pearly nautilus, nautilus +Dungeness crab, Cancer magister +rock crab, Cancer irroratus +fiddler crab +king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +American lobster, Northern lobster, Maine lobster, Homarus americanus +spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +crayfish, crawfish, crawdad, crawdaddy +hermit crab +isopod +white stork, Ciconia ciconia +black stork, Ciconia nigra +spoonbill +flamingo +little blue heron, Egretta caerulea +American egret, great white heron, Egretta albus +bittern +crane +limpkin, Aramus pictus +European gallinule, Porphyrio porphyrio +American coot, marsh hen, mud hen, water hen, Fulica americana +bustard +ruddy turnstone, Arenaria interpres +red-backed sandpiper, dunlin, Erolia alpina +redshank, Tringa totanus +dowitcher +oystercatcher, oyster catcher +pelican +king penguin, Aptenodytes patagonica +albatross, mollymawk +grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +killer whale, killer, orca, grampus, sea wolf, Orcinus orca +dugong, Dugong dugon +sea lion +Chihuahua +Japanese spaniel +Maltese dog, Maltese terrier, Maltese +Pekinese, Pekingese, Peke +Shih-Tzu +Blenheim spaniel +papillon +toy terrier +Rhodesian ridgeback +Afghan hound, Afghan +basset, basset hound +beagle +bloodhound, sleuthhound +bluetick +black-and-tan coonhound +Walker hound, Walker foxhound +English foxhound +redbone +borzoi, Russian wolfhound +Irish wolfhound +Italian greyhound +whippet +Ibizan hound, Ibizan Podenco +Norwegian elkhound, elkhound +otterhound, otter hound +Saluki, gazelle hound +Scottish deerhound, deerhound +Weimaraner +Staffordshire bullterrier, Staffordshire bull terrier +American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +Bedlington terrier +Border terrier +Kerry blue terrier +Irish terrier +Norfolk terrier +Norwich terrier +Yorkshire terrier +wire-haired fox terrier +Lakeland terrier +Sealyham terrier, Sealyham +Airedale, Airedale terrier +cairn, cairn terrier +Australian terrier +Dandie Dinmont, Dandie Dinmont terrier +Boston bull, Boston terrier +miniature schnauzer +giant schnauzer +standard schnauzer +Scotch terrier, Scottish terrier, Scottie +Tibetan terrier, chrysanthemum dog +silky terrier, Sydney silky +soft-coated wheaten terrier +West Highland white terrier +Lhasa, Lhasa apso +flat-coated retriever +curly-coated retriever +golden retriever +Labrador retriever +Chesapeake Bay retriever +German short-haired pointer +vizsla, Hungarian pointer +English setter +Irish setter, red setter +Gordon setter +Brittany spaniel +clumber, clumber spaniel +English springer, English springer spaniel +Welsh springer spaniel +cocker spaniel, English cocker spaniel, cocker +Sussex spaniel +Irish water spaniel +kuvasz +schipperke +groenendael +malinois +briard +kelpie +komondor +Old English sheepdog, bobtail +Shetland sheepdog, Shetland sheep dog, Shetland +collie +Border collie +Bouvier des Flandres, Bouviers des Flandres +Rottweiler +German shepherd, German shepherd dog, German police dog, alsatian +Doberman, Doberman pinscher +miniature pinscher +Greater Swiss Mountain dog +Bernese mountain dog +Appenzeller +EntleBucher +boxer +bull mastiff +Tibetan mastiff +French bulldog +Great Dane +Saint Bernard, St Bernard +Eskimo dog, husky +malamute, malemute, Alaskan malamute +Siberian husky +dalmatian, coach dog, carriage dog +affenpinscher, monkey pinscher, monkey dog +basenji +pug, pug-dog +Leonberg +Newfoundland, Newfoundland dog +Great Pyrenees +Samoyed, Samoyede +Pomeranian +chow, chow chow +keeshond +Brabancon griffon +Pembroke, Pembroke Welsh corgi +Cardigan, Cardigan Welsh corgi +toy poodle +miniature poodle +standard poodle +Mexican hairless +timber wolf, grey wolf, gray wolf, Canis lupus +white wolf, Arctic wolf, Canis lupus tundrarum +red wolf, maned wolf, Canis rufus, Canis niger +coyote, prairie wolf, brush wolf, Canis latrans +dingo, warrigal, warragal, Canis dingo +dhole, Cuon alpinus +African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +hyena, hyaena +red fox, Vulpes vulpes +kit fox, Vulpes macrotis +Arctic fox, white fox, Alopex lagopus +grey fox, gray fox, Urocyon cinereoargenteus +tabby, tabby cat +tiger cat +Persian cat +Siamese cat, Siamese +Egyptian cat +cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +lynx, catamount +leopard, Panthera pardus +snow leopard, ounce, Panthera uncia +jaguar, panther, Panthera onca, Felis onca +lion, king of beasts, Panthera leo +tiger, Panthera tigris +cheetah, chetah, Acinonyx jubatus +brown bear, bruin, Ursus arctos +American black bear, black bear, Ursus americanus, Euarctos americanus +ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +sloth bear, Melursus ursinus, Ursus ursinus +mongoose +meerkat, mierkat +tiger beetle +ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +ground beetle, carabid beetle +long-horned beetle, longicorn, longicorn beetle +leaf beetle, chrysomelid +dung beetle +rhinoceros beetle +weevil +fly +bee +ant, emmet, pismire +grasshopper, hopper +cricket +walking stick, walkingstick, stick insect +cockroach, roach +mantis, mantid +cicada, cicala +leafhopper +lacewing, lacewing fly +dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +damselfly +admiral +ringlet, ringlet butterfly +monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +cabbage butterfly +sulphur butterfly, sulfur butterfly +lycaenid, lycaenid butterfly +starfish, sea star +sea urchin +sea cucumber, holothurian +wood rabbit, cottontail, cottontail rabbit +hare +Angora, Angora rabbit +hamster +porcupine, hedgehog +fox squirrel, eastern fox squirrel, Sciurus niger +marmot +beaver +guinea pig, Cavia cobaya +sorrel +zebra +hog, pig, grunter, squealer, Sus scrofa +wild boar, boar, Sus scrofa +warthog +hippopotamus, hippo, river horse, Hippopotamus amphibius +ox +water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +bison +ram, tup +bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +ibex, Capra ibex +hartebeest +impala, Aepyceros melampus +gazelle +Arabian camel, dromedary, Camelus dromedarius +llama +weasel +mink +polecat, fitch, foulmart, foumart, Mustela putorius +black-footed ferret, ferret, Mustela nigripes +otter +skunk, polecat, wood pussy +badger +armadillo +three-toed sloth, ai, Bradypus tridactylus +orangutan, orang, orangutang, Pongo pygmaeus +gorilla, Gorilla gorilla +chimpanzee, chimp, Pan troglodytes +gibbon, Hylobates lar +siamang, Hylobates syndactylus, Symphalangus syndactylus +guenon, guenon monkey +patas, hussar monkey, Erythrocebus patas +baboon +macaque +langur +colobus, colobus monkey +proboscis monkey, Nasalis larvatus +marmoset +capuchin, ringtail, Cebus capucinus +howler monkey, howler +titi, titi monkey +spider monkey, Ateles geoffroyi +squirrel monkey, Saimiri sciureus +Madagascar cat, ring-tailed lemur, Lemur catta +indri, indris, Indri indri, Indri brevicaudatus +Indian elephant, Elephas maximus +African elephant, Loxodonta africana +lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +barracouta, snoek +eel +coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +rock beauty, Holocanthus tricolor +anemone fish +sturgeon +gar, garfish, garpike, billfish, Lepisosteus osseus +lionfish +puffer, pufferfish, blowfish, globefish +abacus +abaya +academic gown, academic robe, judge's robe +accordion, piano accordion, squeeze box +acoustic guitar +aircraft carrier, carrier, flattop, attack aircraft carrier +airliner +airship, dirigible +altar +ambulance +amphibian, amphibious vehicle +analog clock +apiary, bee house +apron +ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +assault rifle, assault gun +backpack, back pack, knapsack, packsack, rucksack, haversack +bakery, bakeshop, bakehouse +balance beam, beam +balloon +ballpoint, ballpoint pen, ballpen, Biro +Band Aid +banjo +bannister, banister, balustrade, balusters, handrail +barbell +barber chair +barbershop +barn +barometer +barrel, cask +barrow, garden cart, lawn cart, wheelbarrow +baseball +basketball +bassinet +bassoon +bathing cap, swimming cap +bath towel +bathtub, bathing tub, bath, tub +beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +beacon, lighthouse, beacon light, pharos +beaker +bearskin, busby, shako +beer bottle +beer glass +bell cote, bell cot +bib +bicycle-built-for-two, tandem bicycle, tandem +bikini, two-piece +binder, ring-binder +binoculars, field glasses, opera glasses +birdhouse +boathouse +bobsled, bobsleigh, bob +bolo tie, bolo, bola tie, bola +bonnet, poke bonnet +bookcase +bookshop, bookstore, bookstall +bottlecap +bow +bow tie, bow-tie, bowtie +brass, memorial tablet, plaque +brassiere, bra, bandeau +breakwater, groin, groyne, mole, bulwark, seawall, jetty +breastplate, aegis, egis +broom +bucket, pail +buckle +bulletproof vest +bullet train, bullet +butcher shop, meat market +cab, hack, taxi, taxicab +caldron, cauldron +candle, taper, wax light +cannon +canoe +can opener, tin opener +cardigan +car mirror +carousel, carrousel, merry-go-round, roundabout, whirligig +carpenter's kit, tool kit +carton +car wheel +cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +cassette +cassette player +castle +catamaran +CD player +cello, violoncello +cellular telephone, cellular phone, cellphone, cell, mobile phone +chain +chainlink fence +chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +chain saw, chainsaw +chest +chiffonier, commode +chime, bell, gong +china cabinet, china closet +Christmas stocking +church, church building +cinema, movie theater, movie theatre, movie house, picture palace +cleaver, meat cleaver, chopper +cliff dwelling +cloak +clog, geta, patten, sabot +cocktail shaker +coffee mug +coffeepot +coil, spiral, volute, whorl, helix +combination lock +computer keyboard, keypad +confectionery, confectionary, candy store +container ship, containership, container vessel +convertible +corkscrew, bottle screw +cornet, horn, trumpet, trump +cowboy boot +cowboy hat, ten-gallon hat +cradle +crane +crash helmet +crate +crib, cot +Crock Pot +croquet ball +crutch +cuirass +dam, dike, dyke +desk +desktop computer +dial telephone, dial phone +diaper, nappy, napkin +digital clock +digital watch +dining table, board +dishrag, dishcloth +dishwasher, dish washer, dishwashing machine +disk brake, disc brake +dock, dockage, docking facility +dogsled, dog sled, dog sleigh +dome +doormat, welcome mat +drilling platform, offshore rig +drum, membranophone, tympan +drumstick +dumbbell +Dutch oven +electric fan, blower +electric guitar +electric locomotive +entertainment center +envelope +espresso maker +face powder +feather boa, boa +file, file cabinet, filing cabinet +fireboat +fire engine, fire truck +fire screen, fireguard +flagpole, flagstaff +flute, transverse flute +folding chair +football helmet +forklift +fountain +fountain pen +four-poster +freight car +French horn, horn +frying pan, frypan, skillet +fur coat +garbage truck, dustcart +gasmask, respirator, gas helmet +gas pump, gasoline pump, petrol pump, island dispenser +goblet +go-kart +golf ball +golfcart, golf cart +gondola +gong, tam-tam +gown +grand piano, grand +greenhouse, nursery, glasshouse +grille, radiator grille +grocery store, grocery, food market, market +guillotine +hair slide +hair spray +half track +hammer +hamper +hand blower, blow dryer, blow drier, hair dryer, hair drier +hand-held computer, hand-held microcomputer +handkerchief, hankie, hanky, hankey +hard disc, hard disk, fixed disk +harmonica, mouth organ, harp, mouth harp +harp +harvester, reaper +hatchet +holster +home theater, home theatre +honeycomb +hook, claw +hoopskirt, crinoline +horizontal bar, high bar +horse cart, horse-cart +hourglass +iPod +iron, smoothing iron +jack-o'-lantern +jean, blue jean, denim +jeep, landrover +jersey, T-shirt, tee shirt +jigsaw puzzle +jinrikisha, ricksha, rickshaw +joystick +kimono +knee pad +knot +lab coat, laboratory coat +ladle +lampshade, lamp shade +laptop, laptop computer +lawn mower, mower +lens cap, lens cover +letter opener, paper knife, paperknife +library +lifeboat +lighter, light, igniter, ignitor +limousine, limo +liner, ocean liner +lipstick, lip rouge +Loafer +lotion +loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +loupe, jeweler's loupe +lumbermill, sawmill +magnetic compass +mailbag, postbag +mailbox, letter box +maillot +maillot, tank suit +manhole cover +maraca +marimba, xylophone +mask +matchstick +maypole +maze, labyrinth +measuring cup +medicine chest, medicine cabinet +megalith, megalithic structure +microphone, mike +microwave, microwave oven +military uniform +milk can +minibus +miniskirt, mini +minivan +missile +mitten +mixing bowl +mobile home, manufactured home +Model T +modem +monastery +monitor +moped +mortar +mortarboard +mosque +mosquito net +motor scooter, scooter +mountain bike, all-terrain bike, off-roader +mountain tent +mouse, computer mouse +mousetrap +moving van +muzzle +nail +neck brace +necklace +nipple +notebook, notebook computer +obelisk +oboe, hautboy, hautbois +ocarina, sweet potato +odometer, hodometer, mileometer, milometer +oil filter +organ, pipe organ +oscilloscope, scope, cathode-ray oscilloscope, CRO +overskirt +oxcart +oxygen mask +packet +paddle, boat paddle +paddlewheel, paddle wheel +padlock +paintbrush +pajama, pyjama, pj's, jammies +palace +panpipe, pandean pipe, syrinx +paper towel +parachute, chute +parallel bars, bars +park bench +parking meter +passenger car, coach, carriage +patio, terrace +pay-phone, pay-station +pedestal, plinth, footstall +pencil box, pencil case +pencil sharpener +perfume, essence +Petri dish +photocopier +pick, plectrum, plectron +pickelhaube +picket fence, paling +pickup, pickup truck +pier +piggy bank, penny bank +pill bottle +pillow +ping-pong ball +pinwheel +pirate, pirate ship +pitcher, ewer +plane, carpenter's plane, woodworking plane +planetarium +plastic bag +plate rack +plow, plough +plunger, plumber's helper +Polaroid camera, Polaroid Land camera +pole +police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +poncho +pool table, billiard table, snooker table +pop bottle, soda bottle +pot, flowerpot +potter's wheel +power drill +prayer rug, prayer mat +printer +prison, prison house +projectile, missile +projector +puck, hockey puck +punching bag, punch bag, punching ball, punchball +purse +quill, quill pen +quilt, comforter, comfort, puff +racer, race car, racing car +racket, racquet +radiator +radio, wireless +radio telescope, radio reflector +rain barrel +recreational vehicle, RV, R.V. +reel +reflex camera +refrigerator, icebox +remote control, remote +restaurant, eating house, eating place, eatery +revolver, six-gun, six-shooter +rifle +rocking chair, rocker +rotisserie +rubber eraser, rubber, pencil eraser +rugby ball +rule, ruler +running shoe +safe +safety pin +saltshaker, salt shaker +sandal +sarong +sax, saxophone +scabbard +scale, weighing machine +school bus +schooner +scoreboard +screen, CRT screen +screw +screwdriver +seat belt, seatbelt +sewing machine +shield, buckler +shoe shop, shoe-shop, shoe store +shoji +shopping basket +shopping cart +shovel +shower cap +shower curtain +ski +ski mask +sleeping bag +slide rule, slipstick +sliding door +slot, one-armed bandit +snorkel +snowmobile +snowplow, snowplough +soap dispenser +soccer ball +sock +solar dish, solar collector, solar furnace +sombrero +soup bowl +space bar +space heater +space shuttle +spatula +speedboat +spider web, spider's web +spindle +sports car, sport car +spotlight, spot +stage +steam locomotive +steel arch bridge +steel drum +stethoscope +stole +stone wall +stopwatch, stop watch +stove +strainer +streetcar, tram, tramcar, trolley, trolley car +stretcher +studio couch, day bed +stupa, tope +submarine, pigboat, sub, U-boat +suit, suit of clothes +sundial +sunglass +sunglasses, dark glasses, shades +sunscreen, sunblock, sun blocker +suspension bridge +swab, swob, mop +sweatshirt +swimming trunks, bathing trunks +swing +switch, electric switch, electrical switch +syringe +table lamp +tank, army tank, armored combat vehicle, armoured combat vehicle +tape player +teapot +teddy, teddy bear +television, television system +tennis ball +thatch, thatched roof +theater curtain, theatre curtain +thimble +thresher, thrasher, threshing machine +throne +tile roof +toaster +tobacco shop, tobacconist shop, tobacconist +toilet seat +torch +totem pole +tow truck, tow car, wrecker +toyshop +tractor +trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +tray +trench coat +tricycle, trike, velocipede +trimaran +tripod +triumphal arch +trolleybus, trolley coach, trackless trolley +trombone +tub, vat +turnstile +typewriter keyboard +umbrella +unicycle, monocycle +upright, upright piano +vacuum, vacuum cleaner +vase +vault +velvet +vending machine +vestment +viaduct +violin, fiddle +volleyball +waffle iron +wall clock +wallet, billfold, notecase, pocketbook +wardrobe, closet, press +warplane, military plane +washbasin, handbasin, washbowl, lavabo, wash-hand basin +washer, automatic washer, washing machine +water bottle +water jug +water tower +whiskey jug +whistle +wig +window screen +window shade +Windsor tie +wine bottle +wing +wok +wooden spoon +wool, woolen, woollen +worm fence, snake fence, snake-rail fence, Virginia fence +wreck +yawl +yurt +web site, website, internet site, site +comic book +crossword puzzle, crossword +street sign +traffic light, traffic signal, stoplight +book jacket, dust cover, dust jacket, dust wrapper +menu +plate +guacamole +consomme +hot pot, hotpot +trifle +ice cream, icecream +ice lolly, lolly, lollipop, popsicle +French loaf +bagel, beigel +pretzel +cheeseburger +hotdog, hot dog, red hot +mashed potato +head cabbage +broccoli +cauliflower +zucchini, courgette +spaghetti squash +acorn squash +butternut squash +cucumber, cuke +artichoke, globe artichoke +bell pepper +cardoon +mushroom +Granny Smith +strawberry +orange +lemon +fig +pineapple, ananas +banana +jackfruit, jak, jack +custard apple +pomegranate +hay +carbonara +chocolate sauce, chocolate syrup +dough +meat loaf, meatloaf +pizza, pizza pie +potpie +burrito +red wine +espresso +cup +eggnog +alp +bubble +cliff, drop, drop-off +coral reef +geyser +lakeside, lakeshore +promontory, headland, head, foreland +sandbar, sand bar +seashore, coast, seacoast, sea-coast +valley, vale +volcano +ballplayer, baseball player +groom, bridegroom +scuba diver +rapeseed +daisy +yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +corn +acorn +hip, rose hip, rosehip +buckeye, horse chestnut, conker +coral fungus +agaric +gyromitra +stinkhorn, carrion fungus +earthstar +hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +bolete +ear, spike, capitulum +toilet tissue, toilet paper, bathroom tissue diff --git a/pose_landmarker.task b/pose_landmarker.task new file mode 100644 index 0000000..45449d9 Binary files /dev/null and b/pose_landmarker.task differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cd18d16 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +mediapipe>=0.10.0 +moviepy>=1.0.0 +scipy>=1.10.0 +numpy>=1.24.0 +noisereduce>=3.0.0 diff --git a/sample_audio/car_horn.wav b/sample_audio/car_horn.wav new file mode 100644 index 0000000..32064fa Binary files /dev/null and b/sample_audio/car_horn.wav differ diff --git a/sample_audio/dog_bark.wav b/sample_audio/dog_bark.wav new file mode 100644 index 0000000..827f934 Binary files /dev/null and b/sample_audio/dog_bark.wav differ diff --git a/sample_audio/engine.wav b/sample_audio/engine.wav new file mode 100644 index 0000000..69e9d20 Binary files /dev/null and b/sample_audio/engine.wav differ diff --git a/sample_audio/glass_break.wav b/sample_audio/glass_break.wav new file mode 100644 index 0000000..79a0d34 Binary files /dev/null and b/sample_audio/glass_break.wav differ diff --git a/sample_audio/gunshot.wav b/sample_audio/gunshot.wav new file mode 100644 index 0000000..3367505 Binary files /dev/null and b/sample_audio/gunshot.wav differ diff --git a/sample_audio/siren.wav b/sample_audio/siren.wav new file mode 100644 index 0000000..624bffc Binary files /dev/null and b/sample_audio/siren.wav differ diff --git a/sound_event_detection.py b/sound_event_detection.py new file mode 100644 index 0000000..7baac7c --- /dev/null +++ b/sound_event_detection.py @@ -0,0 +1,429 @@ +""" +Sound Event Detection (SED) Module - Module 1 of the Intelligent CC pipeline. + +Detects environmental sounds (car horns, gunshots, dog barks, sirens, etc.) +from audio using the YAMNet model via MediaPipe Audio Classifier. + +Outputs detected events with timestamps and confidence scores. +""" + +import os +import json +import urllib.request +import tempfile +from dataclasses import dataclass, field +from typing import List, Optional + +import numpy as np +from scipy.io import wavfile + +import mediapipe as mp +from mediapipe.tasks import python +from mediapipe.tasks.python.components import containers +from mediapipe.tasks.python import audio + +YAMNET_MODEL_URL = ( + "https://storage.googleapis.com/mediapipe-models/" + "audio_classifier/yamnet/float32/1/yamnet.tflite" +) +YAMNET_WINDOW_SEC = 0.975 + +DEFAULT_TARGET_SOUNDS = [ + "Vehicle horn, car horn, honking", + "Gunshot, gunfire", + "Dog", + "Bark", + "Siren", + "Alarm", + "Explosion", + "Fireworks", + "Glass", + "Breaking", + "Vehicle", + "Engine", +] + + +@dataclass +class SoundEvent: + """A single detected sound event with time bounds and confidence.""" + + start_time: float + end_time: float + sound_class: str + confidence: float + class_index: int + + +@dataclass +class DetectionResult: + """Collection of detected events for one audio file.""" + + events: List[SoundEvent] = field(default_factory=list) + duration_seconds: float = 0.0 + source_file: str = "" + + def to_dict(self) -> dict: + return { + "source_file": self.source_file, + "duration_seconds": self.duration_seconds, + "events": [ + { + "start_time": round(e.start_time, 3), + "end_time": round(e.end_time, 3), + "sound_class": e.sound_class, + "confidence": round(e.confidence, 3), + "class_index": e.class_index, + } + for e in self.events + ], + } + + def to_json(self, filepath: Optional[str] = None, indent: int = 2) -> str: + text = json.dumps(self.to_dict(), indent=indent) + if filepath: + with open(filepath, "w") as f: + f.write(text) + return text + + def to_srt( + self, filepath: Optional[str] = None, min_confidence: float = 0.0 + ) -> str: + lines = [] + counter = 1 + for e in self.events: + if e.confidence < min_confidence: + continue + lines.append(str(counter)) + lines.append(f"{_fmt_srt(e.start_time)} --> {_fmt_srt(e.end_time)}") + lines.append(f"[{e.sound_class}] ({e.confidence:.0%})") + lines.append("") + counter += 1 + text = "\n".join(lines) + if filepath: + with open(filepath, "w") as f: + f.write(text) + return text + + +class SoundEventDetector: + """Sound Event Detection using MediaPipe's YAMNet classifier. + + Args: + model_path: Path to YAMNet TFLite model. + confidence_threshold: Minimum confidence (0-1) to report an event. + target_classes: List of sound class names to filter for. + If None, all classes above threshold are reported. + """ + + def __init__( + self, + model_path: str = "yamnet.tflite", + confidence_threshold: float = 0.20, + target_classes: Optional[List[str]] = None, + use_separation: bool = False, + use_noise_reduction: bool = False, + ): + self.model_path = model_path + self.confidence_threshold = confidence_threshold + self.target_classes = target_classes + self.use_separation = use_separation + self.use_noise_reduction = use_noise_reduction + self._classifier: Optional[audio.AudioClassifier] = None + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def detect_from_file(self, audio_path: str) -> DetectionResult: + """Run detection on a WAV audio file.""" + if not audio_path.lower().endswith(".wav"): + raise ValueError( + "Input must be a .wav file. Use detect_from_video() for video files." + ) + + import shutil + temp_items: list[str] = [] + try: + current_path = audio_path + + if self.use_separation: + inst_path, temp_dir = self._separate_audio(current_path) + temp_items.append(temp_dir) + current_path = inst_path + + if self.use_noise_reduction: + denoised_path = self._reduce_noise(current_path) + temp_items.append(denoised_path) + current_path = denoised_path + + result = self._detect(current_path) + result.source_file = os.path.basename(audio_path) + return result + finally: + for item in reversed(temp_items): + if os.path.isdir(item): + shutil.rmtree(item, ignore_errors=True) + elif os.path.isfile(item): + try: + os.remove(item) + except OSError: + pass + + def detect_from_video(self, video_path: str) -> DetectionResult: + """Extract audio from video, run detection, clean up temp file.""" + import shutil + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_wav = tmp.name + + temp_items: list[str] = [tmp_wav] + try: + self._extract_audio(video_path, tmp_wav) + current_path = tmp_wav + + if self.use_separation: + inst_path, temp_dir = self._separate_audio(current_path) + temp_items.append(temp_dir) + current_path = inst_path + + if self.use_noise_reduction: + denoised_path = self._reduce_noise(current_path) + temp_items.append(denoised_path) + current_path = denoised_path + + result = self._detect(current_path) + result.source_file = os.path.basename(video_path) + return result + finally: + for item in reversed(temp_items): + if os.path.isdir(item): + shutil.rmtree(item, ignore_errors=True) + elif os.path.isfile(item): + try: + os.remove(item) + except OSError: + pass + + def close(self) -> None: + if self._classifier is not None: + self._classifier.close() + self._classifier = None + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + @staticmethod + def _ensure_ffmpeg() -> None: + """Dynamically find and setup ffmpeg from imageio_ffmpeg if not in PATH.""" + import shutil + if shutil.which("ffmpeg") is not None: + return + + try: + import imageio_ffmpeg + ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe() + except ImportError: + return + + if not ffmpeg_exe or not os.path.exists(ffmpeg_exe): + return + + import tempfile + temp_dir = os.path.join(tempfile.gettempdir(), "intelligent_cc_ffmpeg") + os.makedirs(temp_dir, exist_ok=True) + + dest_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" + ffmpeg_dest = os.path.join(temp_dir, dest_name) + + if not os.path.exists(ffmpeg_dest): + try: + shutil.copy(ffmpeg_exe, ffmpeg_dest) + except Exception: + pass + + if os.path.exists(ffmpeg_dest): + os.environ["PATH"] = temp_dir + os.pathsep + os.environ["PATH"] + + def _separate_audio(self, wav_path: str) -> tuple[str, str]: + """Separate vocals and instrumental/background stems. + Returns a tuple of (instrumental_path, temp_dir_to_clean_up) + """ + import tempfile + from audio_separator.separator import Separator + + self._ensure_ffmpeg() + + # Create a temp directory for outputs + temp_dir = tempfile.mkdtemp(prefix="cc_separation_") + + # Initialize separator + separator = Separator(output_dir=temp_dir) + separator.load_model(model_filename='UVR-MDX-NET-Inst_HQ_3.onnx') + + print(f" Separating vocals/speech from background audio...") + outputs = separator.separate(wav_path) + + # outputs contains filenames. Find the instrumental stem + inst_file = None + for filename in outputs: + if "Instrumental" in filename: + inst_file = filename + break + + if not inst_file: + inst_file = outputs[0] if outputs else None + + if not inst_file: + raise RuntimeError("Audio stem separation failed, no output files generated.") + + inst_path = os.path.join(temp_dir, inst_file) + return inst_path, temp_dir + + def _reduce_noise(self, wav_path: str) -> str: + """Apply spectral-gate noise reduction. Returns path to denoised WAV.""" + import tempfile + import noisereduce as nr + + sr, data = wavfile.read(wav_path) + + if data.dtype == np.int16: + float_data = data.astype(np.float32) / np.iinfo(np.int16).max + elif data.dtype == np.int32: + float_data = data.astype(np.float32) / np.iinfo(np.int32).max + elif data.dtype == np.uint8: + float_data = data.astype(np.float32) / 255.0 * 2.0 - 1.0 + else: + float_data = data.astype(np.float32) + + if float_data.ndim > 1: + reduced = np.stack([ + nr.reduce_noise(y=float_data[:, c], sr=sr, prop_decrease=0.8) + for c in range(float_data.shape[1]) + ], axis=1) + else: + reduced = nr.reduce_noise(y=float_data, sr=sr, prop_decrease=0.8) + + reduced = np.clip(reduced, -1.0, 1.0) + reduced_int16 = (reduced * np.iinfo(np.int16).max).astype(np.int16) + + fd, out_path = tempfile.mkstemp(suffix="_denoised.wav") + os.close(fd) + wavfile.write(out_path, sr, reduced_int16) + return out_path + + # ------------------------------------------------------------------ + # Internal methods + # ------------------------------------------------------------------ + + def _get_classifier(self) -> audio.AudioClassifier: + if self._classifier is not None: + return self._classifier + if not os.path.exists(self.model_path): + print(f"Downloading YAMNet model to {self.model_path} ...") + urllib.request.urlretrieve(YAMNET_MODEL_URL, self.model_path) + base_opts = python.BaseOptions(model_asset_path=self.model_path) + opts = audio.AudioClassifierOptions( + base_options=base_opts, + running_mode=audio.RunningMode.AUDIO_CLIPS, + max_results=5, + ) + self._classifier = audio.AudioClassifier.create_from_options(opts) + return self._classifier + + @staticmethod + def _extract_audio(video_path: str, output_wav: str) -> None: + from moviepy import VideoFileClip + + with VideoFileClip(video_path) as clip: + if clip.audio is None: + raise ValueError(f"No audio track in {video_path}") + clip.audio.write_audiofile( + output_wav, + fps=16000, + nbytes=2, + codec="pcm_s16le", + ffmpeg_params=["-ac", "1"], + logger=None, + ) + + @staticmethod + def _load_wav(path: str): + sr, data = wavfile.read(path) + if data.ndim > 1: + data = data.mean(axis=1) + if data.dtype == np.int16: + data = data.astype(np.float32) / np.iinfo(np.int16).max + elif data.dtype == np.int32: + data = data.astype(np.float32) / np.iinfo(np.int32).max + elif data.dtype == np.uint8: + data = data.astype(np.float32) / 255.0 * 2.0 - 1.0 + return data, sr + + def _detect(self, wav_path: str) -> DetectionResult: + classifier = self._get_classifier() + waveform, sr = self._load_wav(wav_path) + duration = len(waveform) / sr + + audio_clip = containers.AudioData.create_from_array(waveform, sr) + raw_results = classifier.classify(audio_clip) + + raw_events: List[SoundEvent] = [] + for idx, frame_result in enumerate(raw_results): + ts_ms = getattr( + frame_result, + "timestamp_ms", + idx * YAMNET_WINDOW_SEC * 1000, + ) + start = ts_ms / 1000.0 + + for classification in frame_result.classifications: + for cat in classification.categories: + if cat.score < self.confidence_threshold: + continue + if ( + self.target_classes + and cat.category_name not in self.target_classes + ): + continue + raw_events.append( + SoundEvent( + start_time=start, + end_time=start + YAMNET_WINDOW_SEC, + sound_class=cat.category_name, + confidence=cat.score, + class_index=getattr(cat, "index", -1), + ) + ) + + merged = self._merge_events(raw_events) + return DetectionResult(events=merged, duration_seconds=duration) + + @staticmethod + def _merge_events( + events: List[SoundEvent], max_gap: float = 0.5 + ) -> List[SoundEvent]: + if not events: + return [] + events = sorted(events, key=lambda e: (e.start_time, e.sound_class)) + merged: List[SoundEvent] = [events[0]] + for e in events[1:]: + prev = merged[-1] + gap = e.start_time - prev.end_time + if e.sound_class == prev.sound_class and gap <= max_gap: + prev.end_time = max(prev.end_time, e.end_time) + prev.confidence = max(prev.confidence, e.confidence) + else: + merged.append(e) + return merged + + +def _fmt_srt(seconds: float) -> str: + """Format seconds to HH:MM:SS,mmm for SRT.""" + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + ms = int(round((seconds - int(seconds)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" diff --git a/test.mp4 b/test.mp4 new file mode 100644 index 0000000..c8faf62 Binary files /dev/null and b/test.mp4 differ diff --git a/test_visual.json b/test_visual.json new file mode 100644 index 0000000..c703708 --- /dev/null +++ b/test_visual.json @@ -0,0 +1,7 @@ +{ + "video": "test.mp4", + "duration": 3.0, + "max_faces_detected": 0, + "scene_changes": 0, + "events": [] +} \ No newline at end of file diff --git a/yamnet.tflite b/yamnet.tflite new file mode 100644 index 0000000..4d46551 Binary files /dev/null and b/yamnet.tflite differ