10 changes: 6 additions & 4 deletions dataset_configs/portuguese/unlabeled/config.yaml
@@ -49,6 +49,7 @@ processors:
output_manifest_file: ${manifest_dir}/manifest2.json
input_audio_key: audio_filepath
output_lang_key: audio_lang
+ should_run: False
device: cuda
pretrained_model: "langid_ambernet"
segment_duration: 20
@@ -57,6 +58,7 @@ processors:
- _target_: sdp.processors.PreserveByValue
output_manifest_file: ${manifest_dir}/manifest3.json
input_value_key: audio_lang
+ should_run: False
target_value: ${language_short}

- _target_: sdp.processors.PreserveByValue
@@ -66,14 +68,14 @@
target_value: 20000.0

- _target_: sdp.processors.Subprocess
cmd: "rm -rf ${manifest_dir}/vad/*"
cmd: 'rm -rf ${manifest_dir}/vad/*'

- _target_: sdp.processors.Subprocess
input_manifest_file: ${manifest_dir}/manifest4.json
output_manifest_file: ${manifest_dir}/vad
input_manifest_arg: "manifest_filepath"
output_manifest_arg: "output_dir"
cmd: "python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
cmd: 'python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml'

- _target_: sdp.processors.RenameFields
input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
@@ -103,4 +105,4 @@ processors:

- _target_: sdp.processors.KeepOnlySpecifiedFields
output_manifest_file: ${final_manifest}
- fields_to_keep: ["audio_filepath", "duration"]
+ fields_to_keep: ["audio_filepath", "duration"]
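For context, the Subprocess processor above composes its final command from cmd plus the two manifest arguments (see the common.py change below). A minimal sketch of how the updated VAD entry could expand into an argument list; the "=" separator, the whitespace split of cmd, and the shortened paths are assumptions for illustration, not code copied from the processor:

# Hypothetical reconstruction of the argument list built for the VAD step
# (separator and paths are illustrative only).
cmd = (
    "python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav "
    "vad_model=vad_multilingual_frame_marblenet "
    "vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml"
)
process_args = cmd.split()
process_args.append("manifest_filepath" + "=" + "manifest4.json")  # input_manifest_arg
process_args.append("output_dir" + "=" + "vad")  # output_manifest_arg
print(process_args)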
2 changes: 1 addition & 1 deletion sdp/processors/modify_manifest/common.py
@@ -91,8 +91,8 @@ def process(self):
process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file])
if self.output_manifest_arg:
process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file])
subprocess.run(" ".join(process_args), shell=True)

+ subprocess.run(process_args)


class CombineSources(BaseParallelProcessor):
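The switch from a shell-joined string to a plain argument list means the command is no longer re-tokenized by a shell, so spaces and similar characters in manifest paths pass through literally. A minimal sketch of the behavioral difference, using echo as a stand-in command and a made-up path:

import subprocess

# Illustrative only: one argument that happens to contain a space.
args = ["echo", "manifest_filepath=/data/my manifest.json"]

# Old behavior: the joined string is re-split by the shell, so the argument
# arrives as two separate words.
subprocess.run(" ".join(args), shell=True)

# New behavior: each list element reaches the program as exactly one
# argument; no shell is involved.
subprocess.run(args)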
2 changes: 1 addition & 1 deletion sdp/processors/modify_manifest/data_to_dropbool.py
@@ -75,7 +75,7 @@ def __init__(
'Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)'
)

- def process_dataset_entry(self, data_entry):
+ def process_dataset_entry(self, data_entry):
input_value = data_entry[self.input_value_key]
target = self.target_value
if self.operator(input_value, target):
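For reference, a minimal sketch of how the operator names listed in that error message are commonly mapped onto comparison functions; the dictionary below is an illustrative assumption, not code copied from PreserveByValue:

import operator

# Hypothetical mapping from the operator names in the error message to
# functions from Python's operator module.
OPERATORS = {
    "lt": operator.lt,
    "le": operator.le,
    "eq": operator.eq,
    "ne": operator.ne,
    "ge": operator.ge,
    "gt": operator.gt,
}

# Example: a "ge" check keeps an entry whose value is at least the target.
print(OPERATORS["ge"](22050.0, 20000.0))  # True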
39 changes: 39 additions & 0 deletions sdp/processors/nemo/frame_vad_infer_postprocess.yaml
@@ -0,0 +1,39 @@
name: &name "vad_inference_postprocessing"

input_manifest: null # Path to the JSON manifest of evaluation data. Audio files should have unique names
output_dir: null # Path to output directory where results will be stored
num_workers: 12
sample_rate: 16000
evaluate: false # whether to compute AUROC and DERs; the manifest must contain ground-truth labels if enabled

prepare_manifest:
auto_split: true # whether to automatically split manifest entries by split_duration to avoid potential CUDA out-of-memory issues.
split_duration: 400 # max length in seconds, try smaller number if you still have CUDA memory issue

vad:
model_path: "vad_multilingual_frame_marblenet" #.nemo local model path or pretrained model name or none
use_rttm: True # set True to output as RTTM format
parameters: # Parameters not tuned on large datasets, please use default parameters with caution
normalize_audio_db: null # set to a non-null value to normalize the RMS dB of the audio before preprocessing
window_length_in_sec: 0.0 # window length in sec for VAD context input, must be 0 for frame-VAD
shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD, must be 0.02 for the pretrained NeMo VAD model
smoothing: False # Deprecated for Frame-VAD. false or type of smoothing method (eg: median, mean)
overlap: 0.875 # Deprecated for Frame-VAD. overlap ratio for overlapped mean/median smoothing filter. If smoothing=False, ignore this value.
postprocessing:
onset: 0.3 # onset threshold for detecting the beginning of a speech segment
offset: 0.3 # offset threshold for detecting the end of a speech segment
pad_onset: 0.2 # adding durations before each speech segment
pad_offset: 0.2 # adding durations after each speech segment
min_duration_on: 0.2 # threshold for short speech deletion
min_duration_off: 0.2 # threshold for short non-speech segment deletion
filter_speech_first: True

prepared_manifest_vad_input: null # if not specified, it will automatically be generated as "manifest_vad_input.json"
frame_out_dir: "vad_frame_outputs"
smoothing_out_dir: null # if not specified, it will automatically be generated as frame_out_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
rttm_out_dir: null # if not specified, it will automatically be frame_out_dir + "/seg_output_" followed by the postprocessing parameter keys and values
out_manifest_filepath: null # if not specified, it will automatically be "manifest_vad_out.json"


# json manifest line example
# {"audio_filepath": "/path/to/audio_file.wav", "offset": 0, "duration": 1.23, "label": "infer", "text": "-"}