10 changes: 6 additions & 4 deletions dataset_configs/portuguese/unlabeled/config.yaml
@@ -49,6 +49,7 @@ processors:
output_manifest_file: ${manifest_dir}/manifest2.json
input_audio_key: audio_filepath
output_lang_key: audio_lang
+ should_run: False
device: cuda
pretrained_model: "langid_ambernet"
segment_duration: 20
@@ -57,6 +58,7 @@ processors:
- _target_: sdp.processors.PreserveByValue
output_manifest_file: ${manifest_dir}/manifest3.json
input_value_key: audio_lang
+ should_run: False
target_value: ${language_short}

- _target_: sdp.processors.PreserveByValue
@@ -66,14 +68,14 @@
target_value: 20000.0

- _target_: sdp.processors.Subprocess
cmd: "rm -rf ${manifest_dir}/vad/*"
cmd: 'rm -rf ${manifest_dir}/vad/*'

- _target_: sdp.processors.Subprocess
input_manifest_file: ${manifest_dir}/manifest4.json
output_manifest_file: ${manifest_dir}/vad
input_manifest_arg: "manifest_filepath"
output_manifest_arg: "output_dir"
cmd: "python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
cmd: 'python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml'

- _target_: sdp.processors.RenameFields
input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
@@ -103,4 +105,4 @@ processors:

- _target_: sdp.processors.KeepOnlySpecifiedFields
output_manifest_file: ${final_manifest}
- fields_to_keep: ["audio_filepath", "duration"]
+ fields_to_keep: ["audio_filepath", "duration"]
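For context, the Subprocess processor above composes its final command from cmd plus the two manifest arguments (see the common.py change below). A minimal sketch of how the updated VAD entry could expand into an argument list; the "=" separator, the whitespace split of cmd, and the shortened paths are assumptions for illustration, not code copied from the processor:

# Hypothetical reconstruction of the argument list built for the VAD step
# (separator and paths are illustrative only).
cmd = (
    "python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav "
    "vad_model=vad_multilingual_frame_marblenet "
    "vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml"
)
process_args = cmd.split()
process_args.append("manifest_filepath" + "=" + "manifest4.json")  # input_manifest_arg
process_args.append("output_dir" + "=" + "vad")  # output_manifest_arg
print(process_args)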
2 changes: 1 addition & 1 deletion sdp/processors/modify_manifest/common.py
@@ -91,8 +91,8 @@ def process(self):
process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file])
if self.output_manifest_arg:
process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file])
subprocess.run(" ".join(process_args), shell=True)

+ subprocess.run(process_args)


class CombineSources(BaseParallelProcessor):
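The switch from a shell-joined string to a plain argument list means the command is no longer re-tokenized by a shell, so spaces and similar characters in manifest paths pass through literally. A minimal sketch of the behavioral difference, using echo as a stand-in command and a made-up path:

import subprocess

# Illustrative only: one argument that happens to contain a space.
args = ["echo", "manifest_filepath=/data/my manifest.json"]

# Old behavior: the joined string is re-split by the shell, so the argument
# arrives as two separate words.
subprocess.run(" ".join(args), shell=True)

# New behavior: each list element reaches the program as exactly one
# argument; no shell is involved.
subprocess.run(args)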
2 changes: 1 addition & 1 deletion sdp/processors/modify_manifest/data_to_dropbool.py
@@ -75,7 +75,7 @@ def __init__(
'Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)'
)

- def process_dataset_entry(self, data_entry):
+ def process_dataset_entry(self, data_entry):
input_value = data_entry[self.input_value_key]
target = self.target_value
if self.operator(input_value, target):
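For reference, a minimal sketch of how the operator names listed in that error message are commonly mapped onto comparison functions; the dictionary below is an illustrative assumption, not code copied from PreserveByValue:

import operator

# Hypothetical mapping from the operator names in the error message to
# functions from Python's operator module.
OPERATORS = {
    "lt": operator.lt,
    "le": operator.le,
    "eq": operator.eq,
    "ne": operator.ne,
    "ge": operator.ge,
    "gt": operator.gt,
}

# Example: a "ge" check keeps an entry whose value is at least the target.
print(OPERATORS["ge"](22050.0, 20000.0))  # True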
39 changes: 39 additions & 0 deletions sdp/processors/nemo/frame_vad_infer_postprocess.yaml
@@ -0,0 +1,39 @@
name: &name "vad_inference_postprocessing"

input_manifest: null # Path to the JSON manifest of evaluation data. Audio files should have unique names
output_dir: null # Path to output directory where results will be stored
num_workers: 12
sample_rate: 16000
evaluate: false # whether to compute AUROC and DERs; the manifest must contain ground-truth labels if enabled

prepare_manifest:
auto_split: true # whether to automatically split manifest entries by split_duration to avoid potential CUDA out-of-memory issues.
split_duration: 400 # max length in seconds, try smaller number if you still have CUDA memory issue

vad:
model_path: "vad_multilingual_frame_marblenet" #.nemo local model path or pretrained model name or none
use_rttm: True # set True to output as RTTM format
parameters: # Parameters not tuned on large datasets, please use default parameters with caution
normalize_audio_db: null # set to a non-null value to normalize the RMS dB of the audio before preprocessing
window_length_in_sec: 0.0 # window length in sec for VAD context input, must be 0 for frame-VAD
shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD, must be 0.02 for the pretrained NeMo VAD model
smoothing: False # Deprecated for Frame-VAD. false or type of smoothing method (eg: median, mean)
overlap: 0.875 # Deprecated for Frame-VAD. overlap ratio for overlapped mean/median smoothing filter. If smoothing=False, ignore this value.
postprocessing:
onset: 0.3 # onset threshold for detecting the beginning of a speech segment
offset: 0.3 # offset threshold for detecting the end of a speech segment
pad_onset: 0.2 # adding durations before each speech segment
pad_offset: 0.2 # adding durations after each speech segment
min_duration_on: 0.2 # threshold for short speech deletion
min_duration_off: 0.2 # threshold for short non-speech segment deletion
filter_speech_first: True

prepared_manifest_vad_input: null # if not specified, it will automatically be generated as "manifest_vad_input.json"
frame_out_dir: "vad_frame_outputs"
smoothing_out_dir: null # if not specified, it will automatically be generated as frame_out_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
rttm_out_dir: null # if not specified, it will automatically be frame_out_dir + "/seg_output_" followed by the postprocessing parameter keys and values
out_manifest_filepath: null # if not specified, it will automatically be "manifest_vad_out.json"


# json manifest line example
# {"audio_filepath": "/path/to/audio_file.wav", "offset": 0, "duration": 1.23, "label": "infer", "text": "-"}