From 738e39d7f11e436ea835ce3f17731088cb2dd23a Mon Sep 17 00:00:00 2001 From: Nune Date: Fri, 13 Jun 2025 11:54:02 +0400 Subject: [PATCH 01/13] Test check Signed-off-by: Nune --- .../portuguese/unlabeled/config.yaml | 10 ++++--- sdp/run_processors.py | 8 ++++-- tests/test_cfg_end_to_end_tests.py | 26 ++++++++++++++++--- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/dataset_configs/portuguese/unlabeled/config.yaml b/dataset_configs/portuguese/unlabeled/config.yaml index 87ca0f27..608d9ced 100644 --- a/dataset_configs/portuguese/unlabeled/config.yaml +++ b/dataset_configs/portuguese/unlabeled/config.yaml @@ -49,6 +49,7 @@ processors: output_manifest_file: ${manifest_dir}/manifest2.json input_audio_key: audio_filepath output_lang_key: audio_lang + should_run: False device: cuda pretrained_model: "langid_ambernet" segment_duration: 20 @@ -57,6 +58,7 @@ processors: - _target_: sdp.processors.PreserveByValue output_manifest_file: ${manifest_dir}/manifest3.json input_value_key: audio_lang + should_run: False target_value: ${language_short} - _target_: sdp.processors.PreserveByValue @@ -66,14 +68,14 @@ processors: target_value: 20000.0 - _target_: sdp.processors.Subprocess - cmd: "rm -rf ${manifest_dir}/vad/*" - + cmd: 'rm -rf ${manifest_dir}/vad/*' + - _target_: sdp.processors.Subprocess input_manifest_file: ${manifest_dir}/manifest4.json output_manifest_file: ${manifest_dir}/vad input_manifest_arg: "manifest_filepath" output_manifest_arg: "output_dir" - cmd: "python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml" + cmd: "bash -c 'NEMO_PATH=$(python -c \"import nemo, pathlib; print(pathlib.Path(nemo.__file__).parent.parent)\") && python \"$NEMO_PATH/examples/asr/asr_vad/speech_to_text_with_vad.py\" audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=\"$NEMO_PATH/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml\"'" - _target_: sdp.processors.RenameFields input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json @@ -103,4 +105,4 @@ processors: - _target_: sdp.processors.KeepOnlySpecifiedFields output_manifest_file: ${final_manifest} - fields_to_keep: ["audio_filepath", "duration"] \ No newline at end of file + fields_to_keep: ["audio_filepath", "duration"] diff --git a/sdp/run_processors.py b/sdp/run_processors.py index 8c498cf2..43c4b06d 100644 --- a/sdp/run_processors.py +++ b/sdp/run_processors.py @@ -160,6 +160,8 @@ def run_processors(cfg): # filtering out any processors that have should_run=False processors_cfgs = [] for processor_cfg in selected_cfgs: + print("********************************************************************************") + print(f"processor_cfg {processor_cfg}") with open_dict(processor_cfg): should_run = processor_cfg.pop("should_run", True) if should_run: @@ -214,7 +216,6 @@ def run_processors(cfg): use_dask_flag = global_use_dask else: use_dask_flag = flag - processor = hydra.utils.instantiate(processor_cfg) processor.use_dask = use_dask_flag # running runtime tests to fail right-away if something is not @@ -222,10 +223,10 @@ def run_processors(cfg): processor.test() processors.append(processor) - # Start Dask client if any processor requires it dask_client = None if any(p.use_dask for p in processors): + try: num_cpus = psutil.cpu_count(logical=False) or 4 logger.info(f"Starting Dask client with {num_cpus} workers") @@ -243,6 +244,9 @@ def run_processors(cfg): logger.info('=> Running processor "%s" with Dask', proc) else: logger.info('=> Running processor "%s" with Multiprocessing', proc) + + print(f"p {proc}") + print(f"") proc.process() finally: if dask_client is not None: diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index db0425e7..3e88eabe 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -88,6 +88,22 @@ def data_check_fn_uzbekvoice(raw_data_dir: str) -> None: else: raise ValueError(f"No such file {str(expected_file)} at {str(raw_data_dir)}") +def data_check_fn_unlabeled(raw_data_dir: str) -> None: + """Checks for data and sets it up for unlabeled processing. + + Args: + raw_data_dir: Directory where data should be + language: Language code (e.g. 'portuguese') + """ + # Get the MLS directory path (one level up from unlabeled) + if (Path(raw_data_dir) / "unlabeled").exists(): + return + expected_file = Path(raw_data_dir) / "unlabeled.tar.gz" + if not expected_file.exists(): + raise ValueError(f"No such file {str(expected_file)}") + + + def data_check_fn_armenian_toloka_pipeline_start(raw_data_dir: str) -> None: """Checks for the Armenian Toloka test data. @@ -244,7 +260,12 @@ def get_test_cases() -> List[Tuple[str, Callable]]: reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", fields_to_ignore=['audio_filepath', 'duration'], processors_to_run="1:6" - ) + ), + + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/unlabeled/config.yaml", + data_check_fn=partial(data_check_fn_unlabeled) + ), ] def get_test_names(): @@ -356,7 +377,6 @@ def test_configs(setup_data, tmp_path): cfg.processors[2].workspace_dir = (data_dir / "pipeline_get_final_res").as_posix() # Set input_manifest_file for ASRFileCheck to use the existing manifest.json cfg.processors[1].input_manifest_file = (data_dir / "pipeline_get_final_res" / "manifest.json").as_posix() - run_processors(cfg) # additionally, let's test that final generated manifest matches the # reference file (ignoring the file paths and additional fields explicitly specified to ignore) @@ -411,4 +431,4 @@ def test_get_e2e_test_data_path(tmp_path): assert mock_bucket.download_file.call_count == 2 if __name__ == "__main__": - pytest.main([__file__, "-v", "--durations=0"]) \ No newline at end of file + pytest.main([__file__, "-v", "--durations=0"]) From ba5e085ac42fc3d438c410f036a0eb4ec85c0409 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 16:35:58 +0400 Subject: [PATCH 02/13] Update Signed-off-by: Nune --- dataset_configs/portuguese/unlabeled/config.yaml | 2 +- sdp/processors/modify_manifest/common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataset_configs/portuguese/unlabeled/config.yaml b/dataset_configs/portuguese/unlabeled/config.yaml index 608d9ced..46feb0ab 100644 --- a/dataset_configs/portuguese/unlabeled/config.yaml +++ b/dataset_configs/portuguese/unlabeled/config.yaml @@ -75,7 +75,7 @@ processors: output_manifest_file: ${manifest_dir}/vad input_manifest_arg: "manifest_filepath" output_manifest_arg: "output_dir" - cmd: "bash -c 'NEMO_PATH=$(python -c \"import nemo, pathlib; print(pathlib.Path(nemo.__file__).parent.parent)\") && python \"$NEMO_PATH/examples/asr/asr_vad/speech_to_text_with_vad.py\" audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=\"$NEMO_PATH/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml\"'" + cmd: 'export NEMO_PATH=$(python -c "import nemo, pathlib; print(pathlib.Path(nemo.__file__).parent.parent)") && python $NEMO_PATH/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=$NEMO_PATH/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml' - _target_: sdp.processors.RenameFields input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 30ba5fec..69a29083 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -91,8 +91,8 @@ def process(self): process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) if self.output_manifest_arg: process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) + subprocess.run(" ".join(process_args), shell=True) - subprocess.run(process_args) class CombineSources(BaseParallelProcessor): From 2fd425da73c6410ceb03be9c3f1d17fc185d6518 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 16:37:22 +0400 Subject: [PATCH 03/13] Small changes Signed-off-by: Nune --- sdp/processors/modify_manifest/data_to_dropbool.py | 4 +++- tests/test_cfg_end_to_end_tests.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 49132c09..08118228 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -75,7 +75,9 @@ def __init__( 'Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)' ) - def process_dataset_entry(self, data_entry): + def process_dataset_entry(self, data_entry): + print(f"self.input_value_key {self.input_value_key}") + print(f"data_entry {data_entry}") input_value = data_entry[self.input_value_key] target = self.target_value if self.operator(input_value, target): diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 3e88eabe..63ca6a02 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -101,6 +101,8 @@ def data_check_fn_unlabeled(raw_data_dir: str) -> None: expected_file = Path(raw_data_dir) / "unlabeled.tar.gz" if not expected_file.exists(): raise ValueError(f"No such file {str(expected_file)}") + with tarfile.open(expected_file, 'r:gz') as tar: + tar.extractall(path=raw_data_dir) From 86dd23dc5cd9f14b151bd03fffb804b0b2554d6e Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 17:18:08 +0400 Subject: [PATCH 04/13] Small changes Signed-off-by: Nune --- dataset_configs/portuguese/mtedx/config.yaml | 94 ++++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/dataset_configs/portuguese/mtedx/config.yaml b/dataset_configs/portuguese/mtedx/config.yaml index cdd3c324..daa4ae16 100644 --- a/dataset_configs/portuguese/mtedx/config.yaml +++ b/dataset_configs/portuguese/mtedx/config.yaml @@ -28,14 +28,14 @@ documentation: | processors_to_run: all -workspace_dir: ??? -data_split: ??? -final_manifest: ??? +workspace_dir: /home/ntadevosyan/Documents/sdp_main/NeMo-speech-data-processor/test_data/portuguese/mtedx/ +data_split: train +final_manifest: none.json processors: - _target_: sdp.processors.CreateInitialManifestMTEDX - raw_data_dir: ${workspace_dir}/raw_data + raw_data_dir: ${workspace_dir}/ data_split: ${data_split} language_id: pt already_extracted: False @@ -45,10 +45,10 @@ processors: converted_audio_dir: ${workspace_dir}/resampled input_file_key: audio_filepath output_file_key: audio_filepath + output_manifest_file: ${workspace_dir}/${data_split}_manifest1.json - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir}/manifest_vtt.json - input_manifest_file: ${workspace_dir}/${data_split}_manifest0.json splited_audio_dir: ${workspace_dir}/splited source_audio_field: audio_filepath target_audio_field: audio_filepath @@ -56,46 +56,46 @@ processors: text_field: text vtt_field: vtt_filepath additional_fields: [] - duration_threshold: 20.0 + duration_threshold: 24 - - _target_: sdp.processors.SubRegex - regex_params_list: - - {"pattern": "(Aplausos)", "repl": " "} - - {"pattern": "(Risos)", "repl": " "} - - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} - - {"pattern": "'", "repl": " "} - - {"pattern": '[\$\&\¡\(\)]', "repl": " "} - - {"pattern": '[\«\°\´\·\»]', "repl": " "} - - {"pattern": '[\«\°\´\·\»]', "repl": " "} - - {"pattern": '[\‘\’\“\”\„]', "repl": " "} - - {"pattern": '[\:\;\`\ʻ]', "repl": " "} - - {"pattern": "!", "repl": "."} - - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end - - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end - - # remove remaining repeated periods since most of the time they are unnecessary in this data - - {"pattern": "\\.{2,20}", "repl": " "} - - - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} - - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} - - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} - - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } - - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} - - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} - - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} - - - {"pattern": " um km ", "repl" : " um quilômetro "} - - {"pattern": " km ", "repl" : " quilômetros "} - - - _target_: sdp.processors.DropHighLowDuration - high_duration_threshold: 20 - low_duration_threshold: 1 - - - _target_: sdp.processors.DropHighLowCharrate - high_charrate_threshold: 21 - low_charrate_threshold: 1 - - - _target_: sdp.processors.DropNonAlphabet - output_manifest_file: ${final_manifest} - alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" - +# - _target_: sdp.processors.SubRegex +# regex_params_list: +# - {"pattern": "(Aplausos)", "repl": " "} +# - {"pattern": "(Risos)", "repl": " "} +# - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} +# - {"pattern": "'", "repl": " "} +# - {"pattern": '[\$\&\¡\(\)]', "repl": " "} +# - {"pattern": '[\«\°\´\·\»]', "repl": " "} +# - {"pattern": '[\«\°\´\·\»]', "repl": " "} +# - {"pattern": '[\‘\’\“\”\„]', "repl": " "} +# - {"pattern": '[\:\;\`\ʻ]', "repl": " "} +# - {"pattern": "!", "repl": "."} +# - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end +# - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end +# +# # remove remaining repeated periods since most of the time they are unnecessary in this data +# - {"pattern": "\\.{2,20}", "repl": " "} +# +# - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} +# - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} +# - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} +# - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } +# - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} +# - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} +# - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} +# +# - {"pattern": " um km ", "repl" : " um quilômetro "} +# - {"pattern": " km ", "repl" : " quilômetros "} +# +# - _target_: sdp.processors.DropHighLowDuration +# high_duration_threshold: 20 +# low_duration_threshold: 1 +# +# - _target_: sdp.processors.DropHighLowCharrate +# high_charrate_threshold: 21 +# low_charrate_threshold: 1 +# +# - _target_: sdp.processors.DropNonAlphabet +# output_manifest_file: ${final_manifest} +# alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" +# From dbe8b2eedba4f2c8cbff4fdf702eca8512639e59 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 17:20:17 +0400 Subject: [PATCH 05/13] Small changes Signed-off-by: Nune --- dataset_configs/portuguese/mtedx/config.yaml | 94 ++++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/dataset_configs/portuguese/mtedx/config.yaml b/dataset_configs/portuguese/mtedx/config.yaml index daa4ae16..cdd3c324 100644 --- a/dataset_configs/portuguese/mtedx/config.yaml +++ b/dataset_configs/portuguese/mtedx/config.yaml @@ -28,14 +28,14 @@ documentation: | processors_to_run: all -workspace_dir: /home/ntadevosyan/Documents/sdp_main/NeMo-speech-data-processor/test_data/portuguese/mtedx/ -data_split: train -final_manifest: none.json +workspace_dir: ??? +data_split: ??? +final_manifest: ??? processors: - _target_: sdp.processors.CreateInitialManifestMTEDX - raw_data_dir: ${workspace_dir}/ + raw_data_dir: ${workspace_dir}/raw_data data_split: ${data_split} language_id: pt already_extracted: False @@ -45,10 +45,10 @@ processors: converted_audio_dir: ${workspace_dir}/resampled input_file_key: audio_filepath output_file_key: audio_filepath - output_manifest_file: ${workspace_dir}/${data_split}_manifest1.json - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir}/manifest_vtt.json + input_manifest_file: ${workspace_dir}/${data_split}_manifest0.json splited_audio_dir: ${workspace_dir}/splited source_audio_field: audio_filepath target_audio_field: audio_filepath @@ -56,46 +56,46 @@ processors: text_field: text vtt_field: vtt_filepath additional_fields: [] - duration_threshold: 24 + duration_threshold: 20.0 -# - _target_: sdp.processors.SubRegex -# regex_params_list: -# - {"pattern": "(Aplausos)", "repl": " "} -# - {"pattern": "(Risos)", "repl": " "} -# - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} -# - {"pattern": "'", "repl": " "} -# - {"pattern": '[\$\&\¡\(\)]', "repl": " "} -# - {"pattern": '[\«\°\´\·\»]', "repl": " "} -# - {"pattern": '[\«\°\´\·\»]', "repl": " "} -# - {"pattern": '[\‘\’\“\”\„]', "repl": " "} -# - {"pattern": '[\:\;\`\ʻ]', "repl": " "} -# - {"pattern": "!", "repl": "."} -# - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end -# - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end -# -# # remove remaining repeated periods since most of the time they are unnecessary in this data -# - {"pattern": "\\.{2,20}", "repl": " "} -# -# - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} -# - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} -# - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} -# - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } -# - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} -# - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} -# - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} -# -# - {"pattern": " um km ", "repl" : " um quilômetro "} -# - {"pattern": " km ", "repl" : " quilômetros "} -# -# - _target_: sdp.processors.DropHighLowDuration -# high_duration_threshold: 20 -# low_duration_threshold: 1 -# -# - _target_: sdp.processors.DropHighLowCharrate -# high_charrate_threshold: 21 -# low_charrate_threshold: 1 -# -# - _target_: sdp.processors.DropNonAlphabet -# output_manifest_file: ${final_manifest} -# alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" -# + - _target_: sdp.processors.SubRegex + regex_params_list: + - {"pattern": "(Aplausos)", "repl": " "} + - {"pattern": "(Risos)", "repl": " "} + - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} + - {"pattern": "'", "repl": " "} + - {"pattern": '[\$\&\¡\(\)]', "repl": " "} + - {"pattern": '[\«\°\´\·\»]', "repl": " "} + - {"pattern": '[\«\°\´\·\»]', "repl": " "} + - {"pattern": '[\‘\’\“\”\„]', "repl": " "} + - {"pattern": '[\:\;\`\ʻ]', "repl": " "} + - {"pattern": "!", "repl": "."} + - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end + - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end + + # remove remaining repeated periods since most of the time they are unnecessary in this data + - {"pattern": "\\.{2,20}", "repl": " "} + + - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} + - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} + - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} + - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } + - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} + - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} + - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} + + - {"pattern": " um km ", "repl" : " um quilômetro "} + - {"pattern": " km ", "repl" : " quilômetros "} + + - _target_: sdp.processors.DropHighLowDuration + high_duration_threshold: 20 + low_duration_threshold: 1 + + - _target_: sdp.processors.DropHighLowCharrate + high_charrate_threshold: 21 + low_charrate_threshold: 1 + + - _target_: sdp.processors.DropNonAlphabet + output_manifest_file: ${final_manifest} + alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" + From 7d8db0b1b6aac3cc7290bf6b928a3497e9644fe4 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 17:22:09 +0400 Subject: [PATCH 06/13] Small changes Signed-off-by: Nune --- tests/test_cfg_end_to_end_tests.py | 248 +++++++++++++------------- tests/test_data_to_dropbool.py | 269 ----------------------------- 2 files changed, 124 insertions(+), 393 deletions(-) delete mode 100644 tests/test_data_to_dropbool.py diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 77964b60..f75bf26c 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -139,130 +139,130 @@ def data_check_fn_armenian_toloka_pipeline_get_final_res(raw_data_dir: str) -> N def get_test_cases() -> List[Tuple[str, Callable]]: return [ - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", - data_check_fn=partial(data_check_fn_mls, language="spanish"), - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", - data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", - data_check_fn=data_check_fn_voxpopuli - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", - data_check_fn=partial(data_check_fn_mls, language="italian") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", - data_check_fn=partial(data_check_fn_mls, language="portuguese") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", - data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", - data_check_fn=partial(data_check_fn_mtedx, language_id="pt") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", - data_check_fn=data_check_fn_coraa - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", - data_check_fn=lambda raw_data_dir: True - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", - data_check_fn=lambda raw_data_dir: True - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", - data_check_fn=data_check_fn_librispeech - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", - data_check_fn=data_check_fn_fleurs - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", - data_check_fn=lambda raw_data_dir: True - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", - data_check_fn=lambda raw_data_dir: True, - fields_to_ignore=['text'], - ), - TestCase( - f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", - partial(data_check_fn_mcv, archive_file_stem="mcv_kk") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", - data_check_fn=data_check_fn_slr140 - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="slr102_kk.tar.gz") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml", - data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml", - data_check_fn=data_check_fn_uzbekvoice - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml", - data_check_fn=data_check_fn_fleurs - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config_filter_noisy_train.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz"), - reference_manifest_filename="test_data_reference_filter.json" - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mcv/config.yaml", - data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv.ar") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/fleurs/config.yaml", - data_check_fn=data_check_fn_fleurs - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mediaspeech/config.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="AR.tar.gz") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/arabic/everyayah/config.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", - data_check_fn=data_check_fn_armenian_toloka_pipeline_start, - fields_to_ignore=['source_filepath'], - processors_to_run="2:14", - reference_manifest_filename="pipeline_start/test_data_reference.json" - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", - data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, - reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", - fields_to_ignore=['audio_filepath', 'duration'], - processors_to_run="1:6" - ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", + # data_check_fn=partial(data_check_fn_mls, language="spanish"), + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", + # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", + # data_check_fn=data_check_fn_voxpopuli + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", + # data_check_fn=partial(data_check_fn_mls, language="italian") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", + # data_check_fn=partial(data_check_fn_mls, language="portuguese") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", + # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", + # data_check_fn=partial(data_check_fn_mtedx, language_id="pt") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", + # data_check_fn=data_check_fn_coraa + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", + # data_check_fn=lambda raw_data_dir: True + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", + # data_check_fn=lambda raw_data_dir: True + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", + # data_check_fn=data_check_fn_librispeech + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", + # data_check_fn=data_check_fn_fleurs + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", + # data_check_fn=lambda raw_data_dir: True + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", + # data_check_fn=lambda raw_data_dir: True, + # fields_to_ignore=['text'], + # ), + #TestCase( + # f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", + # partial(data_check_fn_mcv, archive_file_stem="mcv_kk") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", + # data_check_fn=data_check_fn_slr140 + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="slr102_kk.tar.gz") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml", + # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml", + # data_check_fn=data_check_fn_uzbekvoice + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml", + # data_check_fn=data_check_fn_fleurs + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config_filter_noisy_train.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz"), + # reference_manifest_filename="test_data_reference_filter.json" + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mcv/config.yaml", + # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv.ar") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/fleurs/config.yaml", + # data_check_fn=data_check_fn_fleurs + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mediaspeech/config.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="AR.tar.gz") + # ), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/everyayah/config.yaml", + # data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") + #), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", + # data_check_fn=data_check_fn_armenian_toloka_pipeline_start, + # fields_to_ignore=['source_filepath'], + # processors_to_run="2:14", + # reference_manifest_filename="pipeline_start/test_data_reference.json" + #), + #TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", + # data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, + # reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", + # fields_to_ignore=['audio_filepath', 'duration'], + # processors_to_run="1:6" + #), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/unlabeled/config.yaml", diff --git a/tests/test_data_to_dropbool.py b/tests/test_data_to_dropbool.py deleted file mode 100644 index 743c7620..00000000 --- a/tests/test_data_to_dropbool.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from sdp.processors.modify_manifest.data_to_dropbool import ( - DropASRErrorBeginningEnd, - DropHighCER, - DropHighLowCharrate, - DropHighLowDuration, - DropHighLowWordrate, - DropHighWER, - DropIfNoneOfRegexMatch, - DropIfRegexMatch, - DropIfSubstringInInsertion, - DropLowWordMatchRate, - DropNonAlphabet, -) - -test_params_list = [] - -test_params_list.extend( - [ - ( - DropHighLowCharrate, - {"high_charrate_threshold": 9.9, "low_charrate_threshold": 0}, - {"text": "0123456789", "duration": 1}, - True, - ), - ( - DropHighLowCharrate, - {"high_charrate_threshold": 99, "low_charrate_threshold": 10.1}, - {"text": "0123456789", "duration": 1}, - True, - ), - ( - DropHighLowCharrate, - {"high_charrate_threshold": 10.1, "low_charrate_threshold": 9.9}, - {"text": "0123456789", "duration": 1}, - False, - ), - ] -) - -test_params_list.extend( - [ - ( - DropHighLowWordrate, - {"high_wordrate_threshold": 3.9, "low_wordrate_threshold": 0}, - {"text": "11 22 33 44", "duration": 1}, - True, - ), - ( - DropHighLowWordrate, - {"high_wordrate_threshold": 99, "low_wordrate_threshold": 4.1}, - {"text": "11 22 33 44", "duration": 1}, - True, - ), - ( - DropHighLowWordrate, - {"high_wordrate_threshold": 4.1, "low_wordrate_threshold": 3.9}, - {"text": "11 22 33 44", "duration": 1}, - False, - ), - ] -) - -test_params_list.extend( - [ - ( - DropHighLowDuration, - {"high_duration_threshold": 3.9, "low_duration_threshold": 0}, - {"duration": 4}, - True, - ), - ( - DropHighLowDuration, - {"high_duration_threshold": 99, "low_duration_threshold": 4.1}, - {"duration": 4}, - True, - ), - ( - DropHighLowDuration, - {"high_duration_threshold": 4.1, "low_duration_threshold": 3.9}, - {"duration": 4}, - False, - ), - ] -) - - -test_params_list.extend( - [ - ( - DropNonAlphabet, - {"alphabet": " abc"}, - {"text": "ab ba cab dac"}, - True, - ), - ( - DropNonAlphabet, - {"alphabet": " abcd"}, - {"text": "ab ba cab dac"}, - False, - ), - ] -) - - -test_params_list.extend( - [ - ( - DropASRErrorBeginningEnd, - {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, - {"text": "2", "pred_text": "1 2 3"}, - True, - ), - ( - DropASRErrorBeginningEnd, - {"beginning_error_char_threshold": 2, "end_error_char_threshold": 0}, - {"text": "2", "pred_text": "1 2 3"}, - True, - ), - ( - DropASRErrorBeginningEnd, - {"beginning_error_char_threshold": 2, "end_error_char_threshold": 2}, - {"text": "2", "pred_text": "1 2 3"}, - False, - ), - ( - DropASRErrorBeginningEnd, - {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, - {"text": "sentence with some text here", "pred_text": "sentence with some text her"}, - False, - ), - ( - DropASRErrorBeginningEnd, - {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, - { - "text": "sentence with some text here but actually more text was spoken", - "pred_text": "sentence with some text her", - }, - True, - ), - ] -) - -test_params_list.extend( - [ - ( - DropHighCER, - {"cer_threshold": 9.9}, - {"text": "0123456789", "pred_text": "012345678"}, - True, - ), - ( - DropHighCER, - {"cer_threshold": 10.1}, - {"text": "0123456789", "pred_text": "012345678"}, - False, - ), - ] -) - -test_params_list.extend( - [ - ( - DropHighWER, - {"wer_threshold": 0}, - {"text": "11 22", "pred_text": "11 22"}, - False, - ), - ( - DropHighWER, - {"wer_threshold": 50.1}, - {"text": "11 22", "pred_text": "11 22 33"}, - False, - ), - ( - DropHighWER, - {"wer_threshold": 49.9}, - {"text": "11 22", "pred_text": "11 22 33"}, - True, - ), - ] -) - -test_params_list.extend( - [ - (DropIfNoneOfRegexMatch, {"regex_patterns": ["keep this", "also this"]}, {"text": "I don't want this"}, True), - ( - DropIfNoneOfRegexMatch, - {"regex_patterns": ["keep this", "also this"]}, - {"text": "I want to keep this"}, - False, - ), - ] -) - -test_params_list.extend( - [ - (DropIfRegexMatch, {"regex_patterns": ["incorrect_text"]}, {"text": "incorrect_text"}, True), - ( - DropIfRegexMatch, - {"regex_patterns": ["001/002"], "text_key": "audio_filepath"}, - {"audio_filepath": "001/002/003.wav"}, - True, - ), - ] -) - -test_params_list.extend( - [ - ( - DropLowWordMatchRate, - {"wmr_threshold": 50.1}, - {"text": "hello world i'm nemo", "pred_text": "hello world"}, - True, - ), - ( - DropLowWordMatchRate, - {"wmr_threshold": 49.9}, - {"text": "hello world i'm nemo", "pred_text": "hello world"}, - False, - ), - ] -) - -test_params_list.extend( - [ - ( - DropIfSubstringInInsertion, - {"substrings_in_insertion": ["might "]}, - {"text": "we miss certain words", "pred_text": "we might miss certain words"}, - True, - ), - ( - DropIfSubstringInInsertion, - {"substrings_in_insertion": ["might "]}, - {"text": "we may certain words", "pred_text": "we might miss certain words"}, - False, - ), - ] -) - - -@pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str) -def test_data_to_data(test_class, class_kwargs, test_input, expected_output): - processor = test_class(**class_kwargs, output_manifest_file=None) - - output = processor.process_dataset_entry(test_input) - if output: - output = output[0].data - - if expected_output: - assert output is None - else: - assert output == test_input From 2d93b8a634d4ba9209f06d1a2fe3c13e922ef695 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 17:43:31 +0400 Subject: [PATCH 07/13] Small changes Signed-off-by: Nune --- .../portuguese/unlabeled/config.yaml | 2 +- .../nemo/frame_vad_infer_postprocess.yaml | 39 ++ .../nemo/speech_to_text_with_vad.py | 649 ++++++++++++++++++ 3 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 sdp/processors/nemo/frame_vad_infer_postprocess.yaml create mode 100644 sdp/processors/nemo/speech_to_text_with_vad.py diff --git a/dataset_configs/portuguese/unlabeled/config.yaml b/dataset_configs/portuguese/unlabeled/config.yaml index 46feb0ab..fc1c0c80 100644 --- a/dataset_configs/portuguese/unlabeled/config.yaml +++ b/dataset_configs/portuguese/unlabeled/config.yaml @@ -75,7 +75,7 @@ processors: output_manifest_file: ${manifest_dir}/vad input_manifest_arg: "manifest_filepath" output_manifest_arg: "output_dir" - cmd: 'export NEMO_PATH=$(python -c "import nemo, pathlib; print(pathlib.Path(nemo.__file__).parent.parent)") && python $NEMO_PATH/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=$NEMO_PATH/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml' + cmd: 'python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml' - _target_: sdp.processors.RenameFields input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json diff --git a/sdp/processors/nemo/frame_vad_infer_postprocess.yaml b/sdp/processors/nemo/frame_vad_infer_postprocess.yaml new file mode 100644 index 00000000..1d00eca6 --- /dev/null +++ b/sdp/processors/nemo/frame_vad_infer_postprocess.yaml @@ -0,0 +1,39 @@ +name: &name "vad_inference_postprocessing" + +input_manifest: null # Path of json file of evaluation data. Audio files should have unique names +output_dir: null # Path to output directory where results will be stored +num_workers: 12 +sample_rate: 16000 +evaluate: false # whether to get AUROC and DERs, the manifest must contains groundtruth if enabled + +prepare_manifest: + auto_split: true # whether to automatically split manifest entry by split_duration to avoid potential CUDA out of memory issue. + split_duration: 400 # max length in seconds, try smaller number if you still have CUDA memory issue + +vad: + model_path: "vad_multilingual_frame_marblenet" #.nemo local model path or pretrained model name or none + use_rttm: True # set True to output as RTTM format + parameters: # Parameters not tuned on large datasets, please use default parameters with caution + normalize_audio_db: null # set to non null value to normalize RMS DB of audio before preprocessing + window_length_in_sec: 0.0 # window length in sec for VAD context input, must be 0 for frame-VAD + shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD, must be 0.02 for the pretrained NeMo VAD model + smoothing: False # Deprecated for Frame-VAD. false or type of smoothing method (eg: median, mean) + overlap: 0.875 # Deprecated for Frame-VAD. overlap ratio for overlapped mean/median smoothing filter. If smoothing=False, ignore this value. + postprocessing: + onset: 0.3 # onset threshold for detecting the beginning and end of a speech + offset: 0.3 # offset threshold for detecting the end of a speech. + pad_onset: 0.2 # adding durations before each speech segment + pad_offset: 0.2 # adding durations after each speech segment + min_duration_on: 0.2 # threshold for short speech deletion + min_duration_off: 0.2 # threshold for short non-speech segment deletion + filter_speech_first: True + +prepared_manifest_vad_input: null # if not specify, it will automatically generated be "manifest_vad_input.json" +frame_out_dir: "vad_frame_outputs" +smoothing_out_dir: null # if not specify, it will automatically generated be frame_out_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap) +rttm_out_dir: null # if not specify, it will automatically be frame_out_dir + "/seg_output_" + key and value in postprocessing params +out_manifest_filepath: null # if not specify it will automatically be "manifest_vad_out.json" + + +# json manifest line example +# {"audio_filepath": "/path/to/audio_file.wav", "offset": 0, "duration": 1.23, "label": "infer", "text": "-"} diff --git a/sdp/processors/nemo/speech_to_text_with_vad.py b/sdp/processors/nemo/speech_to_text_with_vad.py new file mode 100644 index 00000000..6fdd183d --- /dev/null +++ b/sdp/processors/nemo/speech_to_text_with_vad.py @@ -0,0 +1,649 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file provides the ASR+VAD inference pipeline, with the option to perform only ASR or VAD alone. + +There are two types of input, the first one is a manifest passed to `manifest_filepath`, +and the other one is to pass a directory containing audios to `audio_dir` and specify `audio_type`. + +The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "text"] are required. An example of a manifest file is: +``` +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "text": "a b c d e"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "text": "f g h i j"} +``` + +To run the code with ASR+VAD default settings: + +```bash +python speech_to_text_with_vad.py \ + manifest_filepath=/PATH/TO/MANIFEST.json \ + vad_model=vad_multilingual_frame_marblenet\ + asr_model=stt_en_conformer_ctc_large \ + vad_config=../conf/vad/frame_vad_inference_postprocess.yaml +``` + +To use only ASR and disable VAD, set `vad_model=None` and `use_rttm=False`. + +To use only VAD, set `asr_model=None` and specify both `vad_model` and `vad_config`. + +To enable profiling, set `profiling=True`, but this will significantly slow down the program. + +To use or disable feature masking/droping based on RTTM files, set `use_rttm` to `True` or `False`. +There are two ways to use RTTM files, either by masking the features (`rttm_mode=mask`) or by dropping the features (`rttm_mode=drop`). +For audios that have long non-speech audios between speech segments, dropping frames is recommended. + +To normalize feature before masking, set `normalize=pre_norm`, +and set `normalize=post_norm` for masking before normalization. + +To use a specific value for feature masking, set `feat_mask_val` to the desired value. +Default is `feat_mask_val=None`, where -16.635 will be used for `post_norm` and 0 will be used for `pre_norm`. + +See more options in the `InferenceConfig` class. +""" + + +import contextlib +import json +import os + +import time +from dataclasses import dataclass, is_dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import torch +import torch.amp +import yaml +from omegaconf import DictConfig, OmegaConf +from torch.profiler import ProfilerActivity, profile, record_function +from tqdm import tqdm + +from nemo.collections.asr.data import feature_to_text_dataset +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.asr.models import ASRModel, EncDecClassificationModel +from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig +from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.asr.parts.utils.vad_utils import ( + generate_overlap_vad_seq, + generate_vad_segment_table, + get_vad_stream_status, + init_frame_vad_model, + init_vad_model, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@dataclass +class InferenceConfig: + # Required configs + asr_model: Optional[str] = None # Path to a .nemo file or a pretrained NeMo model on NGC + vad_model: Optional[str] = None # Path to a .nemo file or a pretrained NeMo model on NGC + vad_config: Optional[str] = None # Path to a yaml file containing VAD post-processing configs + manifest_filepath: Optional[str] = None # Path to dataset's JSON manifest + audio_dir: Optional[str] = None # Path to a directory containing audio files, use this if no manifest is provided + + use_rttm: bool = True # whether to use RTTM + rttm_mode: str = "mask" # how to use RTTM files, choices=[`mask`, `drop`] + feat_mask_val: Optional[float] = None # value used to mask features based on RTTM, set None to use defaults + normalize: Optional[str] = ( + "post_norm" # whether and where to normalize audio feature, choices=[None, `pre_norm`, `post_norm`] + ) + normalize_type: str = "per_feature" # how to determine mean and std used for normalization + normalize_audio_db: Optional[float] = None # set to normalize RMS DB of audio before extracting audio features + + profiling: bool = False # whether to enable pytorch profiling + + # General configs + batch_size: int = 1 # batch size for ASR. Feature extraction and VAD only support single sample per batch. + num_workers: int = 8 + sample_rate: int = 16000 + frame_unit_time_secs: float = ( + 0.01 # unit time per frame in seconds, equal to `window_stride` in ASR configs, typically 10ms. + ) + audio_type: str = "wav" + + # Output settings, no need to change + output_dir: Optional[str] = None # will be automatically set by the program + output_filename: Optional[str] = None # will be automatically set by the program + pred_name_postfix: Optional[str] = None # If you need to use another model name, other than the standard one. + + # Set to True to output language ID information + compute_langs: bool = False + + # Decoding strategy for CTC models + ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig) + + # Decoding strategy for RNNT models + rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1)) + + # VAD model type + vad_type: str = "frame" # which type of VAD to use, choices=[`frame`, `segment`] + + +@hydra_runner(config_name="InferenceConfig", schema=InferenceConfig) +def main(cfg): + + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) + + if cfg.output_dir is None: + cfg.output_dir = "./outputs" + output_dir = Path(cfg.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + # setup profiling, note that profiling will significantly increast the total runtime + if cfg.profiling: + logging.info("Profiling enabled") + profile_fn = profile + record_fn = record_function + else: + logging.info("Profiling disabled") + + @contextlib.contextmanager + def profile_fn(*args, **kwargs): + yield + + @contextlib.contextmanager + def record_fn(*args, **kwargs): + yield + + input_manifest_file = prepare_inference_manifest(cfg) + + if cfg.manifest_filepath is None: + cfg.manifest_filepath = str(input_manifest_file) + + with profile_fn( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True + ) as prof: + + input_manifest_file = extract_audio_features(input_manifest_file, cfg, record_fn) + + if cfg.vad_model is not None: + logging.info(f"Running VAD with model: {cfg.vad_model}") + input_manifest_file = run_vad_inference(input_manifest_file, cfg, record_fn) + + if cfg.asr_model is not None: + logging.info(f"Running ASR with model: {cfg.asr_model}") + run_asr_inference(input_manifest_file, cfg, record_fn) + + if cfg.profiling: + print("--------------------------------------------------------------------\n") + print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15)) + print("--------------------------------------------------------------------\n") + logging.info("Done.") + + +def prepare_inference_manifest(cfg: DictConfig) -> str: + + if cfg.audio_dir is not None and cfg.manifest_filepath is None: + manifest_data = [] + for audio_file in Path(cfg.audio_dir).glob(f"**/*.{cfg.audio_type}"): + item = {"audio_filepath": str(audio_file.absolute()), "duration": 1000000, "offset": 0} + manifest_data.append(item) + parent_dir = Path(cfg.audio_dir) + else: + manifest_data = read_manifest(cfg.manifest_filepath) + parent_dir = Path(cfg.manifest_filepath).parent + + new_manifest_data = [] + + for item in manifest_data: + audio_file = Path(item["audio_filepath"]) + if len(str(audio_file)) < 255 and not audio_file.is_file() and not audio_file.is_absolute(): + new_audio_file = parent_dir / audio_file + if new_audio_file.is_file(): + item["audio_filepath"] = str(new_audio_file.absolute()) + else: + item["audio_filepath"] = os.path.expanduser(str(audio_file)) + else: + item["audio_filepath"] = os.path.expanduser(str(audio_file)) + item["label"] = "infer" + item["text"] = "-" + new_manifest_data.append(item) + + new_manifest_filepath = str(Path(cfg.output_dir) / Path("temp_manifest_input.json")) + write_manifest(new_manifest_filepath, new_manifest_data) + return new_manifest_filepath + + +def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: Callable) -> str: + file_list = [] + manifest_data = [] + out_dir = Path(cfg.output_dir) / Path("features") + new_manifest_filepath = str(Path(cfg.output_dir) / Path("temp_manifest_input_feature.json")) + + if Path(new_manifest_filepath).is_file(): + logging.info("Features already exist in output_dir, skipping feature extraction.") + return new_manifest_filepath + + has_feat = False + with open(manifest_filepath, 'r', encoding='utf-8') as fin: + for line in fin.readlines(): + item = json.loads(line.strip()) + manifest_data.append(item) + file_list.append(Path(item['audio_filepath']).stem) + if "feature_file" in item: + has_feat = True + if has_feat: + logging.info("Features already exist in manifest, skipping feature extraction.") + return manifest_filepath + + out_dir.mkdir(parents=True, exist_ok=True) + torch.set_grad_enabled(False) + if cfg.vad_model: + vad_model = init_frame_vad_model(cfg.vad_model) + else: + vad_model = EncDecClassificationModel.from_pretrained("vad_multilingual_marblenet") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + vad_model = vad_model.to(device) + vad_model.eval() + vad_model.setup_test_data( + test_data_config={ + 'batch_size': 1, + 'vad_stream': False, + 'sample_rate': cfg.sample_rate, + 'manifest_filepath': manifest_filepath, + 'labels': [ + 'infer', + ], + 'num_workers': cfg.num_workers, + 'shuffle': False, + 'normalize_audio_db': cfg.normalize_audio_db, + } + ) + + logging.info(f"Extracting features on {len(file_list)} audio files...") + with record_fn("feat_extract_loop"): + for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): + test_batch = [x.to(vad_model.device) for x in test_batch] + with torch.amp.autocast(vad_model.device.type): + with record_fn("feat_extract_infer"): + processed_signal, processed_signal_length = vad_model.preprocessor( + input_signal=test_batch[0], + length=test_batch[1], + ) + with record_fn("feat_extract_other"): + processed_signal = processed_signal.squeeze(0)[:, :processed_signal_length] + processed_signal = processed_signal.cpu() + outpath = os.path.join(out_dir, file_list[i] + ".pt") + outpath = str(Path(outpath).absolute()) + torch.save(processed_signal, outpath) + manifest_data[i]["feature_file"] = outpath + del test_batch + + logging.info(f"Features saved at: {out_dir}") + write_manifest(new_manifest_filepath, manifest_data) + return new_manifest_filepath + + +def run_vad_inference(manifest_filepath: str, cfg: DictConfig, record_fn: Callable) -> str: + logging.info("Start VAD inference pipeline...") + if cfg.vad_type == "segment": + vad_model = init_vad_model(cfg.vad_model) + elif cfg.vad_type == "frame": + vad_model = init_frame_vad_model(cfg.vad_model) + else: + raise ValueError(f"Unknown VAD type: {cfg.vad_type}, supported types: ['segment', 'frame']") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + vad_model = vad_model.to(device) + vad_model.eval() + + vad_yaml = Path(cfg.vad_config) + if not vad_yaml.is_file(): + raise ValueError(f"VAD config file not found: {cfg.vad_config}") + + with vad_yaml.open("r") as fp: + vad_cfg = yaml.safe_load(fp) + vad_cfg = DictConfig(vad_cfg) + + test_data_config = { + 'vad_stream': True, + 'manifest_filepath': manifest_filepath, + 'labels': [ + 'infer', + ], + 'num_workers': cfg.num_workers, + 'shuffle': False, + 'window_length_in_sec': vad_cfg.vad.parameters.window_length_in_sec, + 'shift_length_in_sec': vad_cfg.vad.parameters.shift_length_in_sec, + } + vad_model.setup_test_data(test_data_config=test_data_config, use_feat=True) + + pred_dir = Path(cfg.output_dir) / Path("vad_frame_pred") + if pred_dir.is_dir(): + logging.info(f"VAD frame-level prediction already exists: {pred_dir}, skipped") + else: + logging.info("Generating VAD frame-level prediction") + pred_dir.mkdir(parents=True) + t0 = time.time() + pred_dir = generate_vad_frame_pred( + vad_model=vad_model, + window_length_in_sec=vad_cfg.vad.parameters.window_length_in_sec, + shift_length_in_sec=vad_cfg.vad.parameters.shift_length_in_sec, + manifest_vad_input=manifest_filepath, + out_dir=str(pred_dir), + use_feat=True, + record_fn=record_fn, + ) + t1 = time.time() + logging.info(f"Time elapsed: {t1 - t0: .2f} seconds") + logging.info( + f"Finished generating VAD frame level prediction with window_length_in_sec={vad_cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={vad_cfg.vad.parameters.shift_length_in_sec}" + ) + + frame_length_in_sec = vad_cfg.vad.parameters.shift_length_in_sec + # overlap smoothing filter + if vad_cfg.vad.parameters.smoothing: + # Generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. + # smoothing_method would be either in majority vote (median) or average (mean) + logging.info("Generating predictions with overlapping input segments") + t0 = time.time() + smoothing_pred_dir = generate_overlap_vad_seq( + frame_pred_dir=pred_dir, + smoothing_method=vad_cfg.vad.parameters.smoothing, + overlap=vad_cfg.vad.parameters.overlap, + window_length_in_sec=vad_cfg.vad.parameters.window_length_in_sec, + shift_length_in_sec=vad_cfg.vad.parameters.shift_length_in_sec, + num_workers=cfg.num_workers, + out_dir=vad_cfg.smoothing_out_dir, + ) + logging.info( + f"Finish generating predictions with overlapping input segments with smoothing_method={vad_cfg.vad.parameters.smoothing} and overlap={vad_cfg.vad.parameters.overlap}" + ) + t1 = time.time() + logging.info(f"Time elapsed: {t1 - t0: .2f} seconds") + pred_dir = smoothing_pred_dir + frame_length_in_sec = 0.01 + + # Turn frame-wise prediction into speech intervals + logging.info(f"Generating segment tables with postprocessing params: {vad_cfg.vad.parameters.postprocessing}") + segment_dir_name = "vad_rttm" + for key, val in vad_cfg.vad.parameters.postprocessing.items(): + segment_dir_name = segment_dir_name + "-" + str(key) + str(val) + + segment_dir = Path(cfg.output_dir) / Path(segment_dir_name) + if segment_dir.is_dir(): + logging.info(f"VAD speech segments already exists: {segment_dir}, skipped") + else: + segment_dir.mkdir(parents=True) + t0 = time.time() + segment_dir = generate_vad_segment_table( + vad_pred_dir=pred_dir, + postprocessing_params=vad_cfg.vad.parameters.postprocessing, + frame_length_in_sec=frame_length_in_sec, + num_workers=cfg.num_workers, + out_dir=segment_dir, + use_rttm=True, + ) + t1 = time.time() + logging.info(f"Time elapsed: {t1 - t0: .2f} seconds") + logging.info("Finished generating RTTM files from VAD predictions.") + + rttm_map = {} + for filepath in Path(segment_dir).glob("*.rttm"): + rttm_map[filepath.stem] = str(filepath.absolute()) + + manifest_data = read_manifest(manifest_filepath) + for i in range(len(manifest_data)): + key = Path(manifest_data[i]["audio_filepath"]).stem + manifest_data[i]["rttm_file"] = rttm_map[key] + + new_manifest_filepath = str(Path(cfg.output_dir) / Path(f"temp_manifest_{segment_dir_name}.json")) + write_manifest(new_manifest_filepath, manifest_data) + return new_manifest_filepath + + +def generate_vad_frame_pred( + vad_model: EncDecClassificationModel, + window_length_in_sec: float, + shift_length_in_sec: float, + manifest_vad_input: str, + out_dir: str, + use_feat: bool = False, + record_fn: Callable = None, +) -> str: + """ + Generate VAD frame level prediction and write to out_dir + """ + time_unit = int(window_length_in_sec / shift_length_in_sec) + trunc = int(time_unit / 2) + trunc_l = time_unit - trunc + all_len = 0 + + data = [] + with open(manifest_vad_input, 'r', encoding='utf-8') as fin: + for line in fin.readlines(): + file = json.loads(line)['audio_filepath'].split("/")[-1] + data.append(file.split(".wav")[0]) + logging.info(f"Inference on {len(data)} audio files/json lines!") + + status = get_vad_stream_status(data) + + with record_fn("vad_infer_loop"): + for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): + test_batch = [x.to(vad_model.device) for x in test_batch] + with torch.amp.autocast(vad_model.device.type): + with record_fn("vad_infer_model"): + if use_feat: + log_probs = vad_model(processed_signal=test_batch[0], processed_signal_length=test_batch[1]) + else: + log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) + + with record_fn("vad_infer_other"): + probs = torch.softmax(log_probs, dim=-1) + if len(probs.shape) == 3: + # squeeze the batch dimension, since batch size is 1 + probs = probs.squeeze(0) # [1,T,C] -> [T,C] + pred = probs[:, 1] + + if window_length_in_sec == 0: + to_save = pred + elif status[i] == 'start': + to_save = pred[:-trunc] + elif status[i] == 'next': + to_save = pred[trunc:-trunc_l] + elif status[i] == 'end': + to_save = pred[trunc_l:] + else: + to_save = pred + + to_save = to_save.cpu().tolist() + all_len += len(to_save) + + outpath = os.path.join(out_dir, data[i] + ".frame") + with open(outpath, "a", encoding='utf-8') as fout: + for p in to_save: + fout.write(f'{p:0.4f}\n') + + del test_batch + if status[i] == 'end' or status[i] == 'single': + all_len = 0 + return out_dir + + +def init_asr_model(model_path: str) -> ASRModel: + if model_path.endswith('.nemo'): + logging.info(f"Using local ASR model from {model_path}") + asr_model = ASRModel.restore_from(restore_path=model_path) + elif model_path.endswith('.ckpt'): + asr_model = ASRModel.load_from_checkpoint(checkpoint_path=model_path) + else: + logging.info(f"Using NGC ASR model {model_path}") + asr_model = ASRModel.from_pretrained(model_name=model_path) + return asr_model + + +def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: + logging.info("Start ASR inference pipeline...") + asr_model = init_asr_model(cfg.asr_model) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + asr_model = asr_model.to(device) + asr_model.eval() + + # Setup decoding strategy + decode_function = None + decoder_type = cfg.get("decoder_type", None) + if not hasattr(asr_model, 'change_decoding_strategy'): + raise ValueError(f"ASR model {cfg.asr_model} does not support decoding strategy.") + if decoder_type is not None: # Hybrid model + if decoder_type == 'rnnt': + cfg.rnnt_decoding.fused_batch_size = -1 + cfg.rnnt_decoding.compute_langs = cfg.compute_langs + asr_model.change_decoding_strategy(cfg.rnnt_decoding, decoder_type=decoder_type) + decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor + elif decoder_type == 'ctc': + asr_model.change_decoding_strategy(cfg.ctc_decoding, decoder_type=decoder_type) + decode_function = asr_model.decoding.ctc_decoder_predictions_tensor + else: + raise ValueError( + f"Unknown decoder type for hybrid model: {decoder_type}, supported types: ['rnnt', 'ctc']" + ) + elif hasattr(asr_model, 'joint'): # RNNT model + cfg.rnnt_decoding.fused_batch_size = -1 + cfg.rnnt_decoding.compute_langs = cfg.compute_langs + asr_model.change_decoding_strategy(cfg.rnnt_decoding) + decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor + else: + asr_model.change_decoding_strategy(cfg.ctc_decoding) + decode_function = asr_model.decoding.ctc_decoder_predictions_tensor + + # Compute output filename + if cfg.output_filename is None: + # create default output filename + if cfg.pred_name_postfix is not None: + cfg.output_filename = cfg.manifest_filepath.replace('.json', f'_{cfg.pred_name_postfix}.json') + else: + tag = f"{cfg.normalize}_{cfg.normalize_type}" + if cfg.use_rttm: + vad_tag = Path(manifest_filepath).stem + vad_tag = vad_tag[len("temp_manifest_vad_rttm_") :] + if cfg.rttm_mode == "mask": + tag += f"-mask{cfg.feat_mask_val}-{vad_tag}" + else: + tag += f"-dropframe-{vad_tag}" + cfg.output_filename = cfg.manifest_filepath.replace('.json', f'-{Path(cfg.asr_model).stem}-{tag}.json') + cfg.output_filename = Path(cfg.output_dir) / Path(cfg.output_filename).name + + logging.info("Setting up dataloader for ASR...") + data_config = { + "manifest_filepath": manifest_filepath, + "normalize": cfg.normalize, + "normalize_type": cfg.normalize_type, + "use_rttm": cfg.use_rttm, + "rttm_mode": cfg.rttm_mode, + "feat_mask_val": cfg.feat_mask_val, + "frame_unit_time_secs": cfg.frame_unit_time_secs, + } + logging.info(f"use_rttm = {cfg.use_rttm}, rttm_mode = {cfg.rttm_mode}, feat_mask_val = {cfg.feat_mask_val}") + + if hasattr(asr_model, "tokenizer"): + dataset = feature_to_text_dataset.get_bpe_dataset(config=data_config, tokenizer=asr_model.tokenizer) + else: + data_config["labels"] = asr_model.decoder.vocabulary + dataset = feature_to_text_dataset.get_char_dataset(config=data_config) + + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=cfg.batch_size, + collate_fn=dataset._collate_fn, + drop_last=False, + shuffle=False, + num_workers=cfg.get('num_workers', 0), + pin_memory=cfg.get('pin_memory', False), + ) + + logging.info("Start transcribing...") + hypotheses = [] + all_hypotheses = [] + t0 = time.time() + with torch.amp.autocast(asr_model.device.type): + with torch.no_grad(): + with record_fn("asr_infer_loop"): + for test_batch in tqdm(dataloader, desc="Transcribing"): + with record_fn("asr_infer_model"): + outputs = asr_model.forward( + processed_signal=test_batch[0].to(device), + processed_signal_length=test_batch[1].to(device), + ) + + with record_fn("asr_infer_other"): + logits, logits_len = outputs[0], outputs[1] + + current_hypotheses, all_hyp = decode_function( + logits, + logits_len, + return_hypotheses=False, + ) + if isinstance(current_hypotheses, tuple) and len(current_hypotheses) == 2: + current_hypotheses = current_hypotheses[0] # handle RNNT output + + hypotheses += current_hypotheses + if all_hyp is not None: + all_hypotheses += all_hyp + else: + all_hypotheses += current_hypotheses + + del logits + del test_batch + t1 = time.time() + logging.info(f"Time elapsed: {t1 - t0: .2f} seconds") + + logging.info("Finished transcribing.") + # Save output to manifest + input_manifest_data = read_manifest(manifest_filepath) + manifest_data = read_manifest(cfg.manifest_filepath) + + if "text" not in manifest_data[0]: + has_groundtruth = False + else: + has_groundtruth = True + + groundtruth = [] + for i in range(len(manifest_data)): + if has_groundtruth: + groundtruth.append(manifest_data[i]["text"]) + manifest_data[i]["pred_text"] = hypotheses[i] + manifest_data[i]["feature_file"] = input_manifest_data[i]["feature_file"] + if "rttm_file" in input_manifest_data[i]: + manifest_data[i]["feature_file"] = input_manifest_data[i]["feature_file"] + + write_manifest(cfg.output_filename, manifest_data) + + if not has_groundtruth: + hypotheses = " ".join(hypotheses) + words = hypotheses.split() + chars = "".join(words) + logging.info("-----------------------------------------") + logging.info(f"Number of generated characters={len(chars)}") + logging.info(f"Number of generated words={len(words)}") + logging.info("-----------------------------------------") + else: + wer_score = word_error_rate(hypotheses=hypotheses, references=groundtruth) + cer_score = word_error_rate(hypotheses=hypotheses, references=groundtruth, use_cer=True) + logging.info("-----------------------------------------") + logging.info(f"WER={wer_score:.4f}, CER={cer_score:.4f}") + logging.info("-----------------------------------------") + + logging.info(f"ASR output saved at {cfg.output_filename}") + return cfg.output_filename + + +if __name__ == "__main__": + main() From 3d34b1c55e305499a4ce24059cf018ebb28a36d0 Mon Sep 17 00:00:00 2001 From: Nune Date: Sat, 14 Jun 2025 18:36:29 +0400 Subject: [PATCH 08/13] Small changes Signed-off-by: Nune --- tests/test_bootstrap_estimate.py | 101 ------------------ tests/test_data_to_data.py | 100 ------------------ tests/test_modify_manifest.py | 169 ------------------------------- 3 files changed, 370 deletions(-) delete mode 100644 tests/test_bootstrap_estimate.py delete mode 100644 tests/test_data_to_data.py delete mode 100644 tests/test_modify_manifest.py diff --git a/tests/test_bootstrap_estimate.py b/tests/test_bootstrap_estimate.py deleted file mode 100644 index 18ac7bf7..00000000 --- a/tests/test_bootstrap_estimate.py +++ /dev/null @@ -1,101 +0,0 @@ -import json -import tempfile -from pathlib import Path -from sdp.utils import BootstrapProcessor - -def _write_manifest(manifest_path: Path, entries): - with manifest_path.open("w") as f: - for entry in entries: - f.write(json.dumps(entry) + "\n") - -def test_bootstrap_processor(): - manifest1_data = [ - {"audio_filepath": "path1.wav", "duration": 3.744, "text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", - "pred_text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", "wer": 0.142857, "tokens": 7, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.142857}, - {"audio_filepath": "path2.wav", "duration": 5.76, "text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմինդրների արտադրությունը։", - "pred_text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կաղնիտների արտադրությունը։", "wer": 0.285714, "tokens": 7, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.285714}, - {"audio_filepath": "path3.wav", "duration": 6.984, "text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվազի մոտ երեք անգամ։", - "pred_text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվագի մոտ երեք անգամ։", "wer": 0.1, "tokens": 10, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.1}, - {"audio_filepath": "path4.wav", "duration": 4.104, "text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", - "pred_text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", "wer": 0.0, "tokens": 6, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.0} - ] - - manifest2_data = [ - {"audio_filepath": "path1.wav", "duration": 3.744, "text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", - "pred_text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", "wer": 0.142857, "tokens": 7, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.142857}, - {"audio_filepath": "path2.wav", "duration": 5.76, "text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմինդրների արտադրությունը։", - "pred_text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմիտների արտադրությունը։", "wer": 0.285714, "tokens": 7, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.285714}, - {"audio_filepath": "path3.wav", "duration": 6.984, "text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվազի մոտ երեք անգամ։", - "pred_text": "Եթե մոտակայքում չկան մեղվափետներ, ապա բերքատվությունը կնվացի մոտ երեք անգամ։", "wer": 0.2, "tokens": 10, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.2}, - {"audio_filepath": "path4.wav", "duration": 4.104, "text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", - "pred_text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", "wer": 0.0, "tokens": 6, - "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.0} - ] - - # Expected output for comparison - expected_output = { - "individual_results": { - "manifest1.json": { - "mean_wer": 5.358, - "ci_lower": 0.5625, - "ci_upper": 10.992625 - }, - "manifest2.json": { - "mean_wer": 9.0725, - "ci_lower": 5.0, - "ci_upper": 15.234875 - } - }, - "pairwise_comparisons": [ - { - "file_1": "manifest1.json", - "file_2": "manifest2.json", - "delta_wer_mean": -1.75, - "ci_lower": -5.0, - "ci_upper": 0.0, - "poi": 0.0 - } - ] - } - - with tempfile.TemporaryDirectory() as tmpdir: - # Create temporary paths - manifest1_path = Path(tmpdir) / "manifest1.json" - manifest2_path = Path(tmpdir) / "manifest2.json" - output_path = Path(tmpdir) / "output_manifest.json" - - # Write manifest data to temporary files - _write_manifest(manifest1_path, manifest1_data) - _write_manifest(manifest2_path, manifest2_data) - - # Run BootstrapProcessor with test parameters - processor = BootstrapProcessor( - bootstrap_manifest_files=[str(manifest1_path), str(manifest2_path)], - raw_data_dir=str(tmpdir), - output_file=str(output_path), - num_bootstraps=10, - bootstrap_sample_ratio=1.0, - calculate_pairwise=True, - metric_type='wer', - text_key='text', - pred_text_key='pred_text', - ci_lower=2.5, - ci_upper=97.5, - random_state=42, - output_manifest_file=None # A placeholder to skip BaseProcessor failing - ) - - processor.process() - - # Load and compare the processor output - with open(output_path, "r") as f: - output = json.load(f) - - assert output == expected_output, f"Expected {expected_output}, but got {output}" diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py deleted file mode 100644 index 5bd75f47..00000000 --- a/tests/test_data_to_data.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from sdp.processors.modify_manifest.data_to_data import ( - InsIfASRInsertion, - SubIfASRSubstitution, - SubMakeLowercase, - SubRegex, -) - -test_params_list = [] - -test_params_list.extend( - [ - ( - InsIfASRInsertion, - {"insert_words": [" nemo", "nemo ", " nemo "]}, - {"text": "i love the toolkit", "pred_text": "i love the nemo toolkit"}, - {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, - ), - ( - InsIfASRInsertion, - {"insert_words": [" nemo", "nemo ", " nemo "]}, - {"text": "i love the toolkit", "pred_text": "i love the new nemo toolkit"}, - {"text": "i love the toolkit", "pred_text": "i love the new nemo toolkit"}, - ), - ] -) - -test_params_list.extend( - [ - ( - SubIfASRSubstitution, - {"sub_words": {"nmo ": "nemo "}}, - {"text": "i love the nmo toolkit", "pred_text": "i love the nemo toolkit"}, - {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, - ), - ] -) - -test_params_list.extend( - [ - ( - SubIfASRSubstitution, - {"sub_words": {"nmo ": "nemo "}}, - {"text": "i love the nmo toolkit", "pred_text": "i love the nemo toolkit"}, - {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, - ), - ] -) - -test_params_list.extend( - [ - ( - SubMakeLowercase, - {}, - {"text": "Hello Привет 123"}, - {"text": "hello привет 123"}, - ), - ( - SubMakeLowercase, - {"text_key": "text_new"}, - {"text_new": "Hello Привет 123"}, - {"text_new": "hello привет 123"}, - ), - ] -) - -test_params_list.extend( - [ - ( - SubRegex, - {"regex_params_list": [{"pattern": "\s<.*>\s", "repl": " "}]}, - {"text": "hello world"}, - {"text": "hello world"}, - ), - ] -) - - -@pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str) -def test_data_to_data(test_class, class_kwargs, test_input, expected_output): - processor = test_class(**class_kwargs, output_manifest_file=None) - - output = processor.process_dataset_entry(test_input)[0].data - - assert output == expected_output diff --git a/tests/test_modify_manifest.py b/tests/test_modify_manifest.py deleted file mode 100644 index 99583c26..00000000 --- a/tests/test_modify_manifest.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import tempfile -from pathlib import Path -from typing import Dict, List, Union - -import pytest - -from sdp.processors import ApplyInnerJoin, DropNonAlphabet - - -def _write_manifest(manifest: Path, entries: List[Dict[str, Union[str, float]]]): - with open(manifest, "w") as f: - for line in entries: - f.write(json.dumps(line) + "\n") - - -def test_empty_test_cases(): - """Testing that empty test cases don't raise an error.""" - processor = DropNonAlphabet("123", output_manifest_file="tmp") - processor.test() - - -inner_join_entries = [] -inner_join_entries.extend( - [ - ( - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc3", "duration": 11, "audio_filepath": "path_2"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - ], - None, - ), - ( - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - ], - None, - ), - ( - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc_2", "duration": 11, "audio_filepath": "path2"}, - ], - [ - { - "id": 1, - "text_x": "abc1", - "duration_x": 10, - "audio_filepath_x": "path1", - "text_y": "abc1", - "duration_y": 10, - "audio_filepath_y": "path1", - }, - { - "id": 2, - "text_x": "abc2", - "duration_x": 11, - "audio_filepath_x": "path2", - "text_y": "abc_2", - "duration_y": 11, - "audio_filepath_y": "path2", - }, - ], - "id", - ), - ( - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, - {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, - ], - [ - {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, - {"id": 2, "text": "abc_2", "duration": 11, "audio_filepath": "path2"}, - ], - [ - { - "id": 1, - "text_x": "abc1", - "duration": 10, - "audio_filepath": "path1", - "text_y": "abc1", - }, - { - "id": 2, - "text_x": "abc2", - "duration": 11, - "audio_filepath": "path2", - "text_y": "abc_2", - }, - ], - ["id", "duration", "audio_filepath"], - ), - ( - [{"id": 1, "text": "text1"}], - [{"id": 1, "text": "text2"}], - [{"id": 1, "text_x": "text1", "text_y": "text2"}], - "id", - ), - ] -) - - -@pytest.mark.parametrize("input1,input2,output,coloumn_id", inner_join_entries) -def test_apply_inner_join( - input1: List[Dict[str, Union[str, float]]], - input2: List[Dict[str, Union[str, float]]], - output: List[Dict[str, Union[str, float]]], - coloumn_id: Union[str, List[str], None], -): - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir_path = Path(tmpdir) - manifest1 = tmpdir_path / "manifest1.json" - manifest2 = tmpdir_path / "manifest2.json" - manifest_out = tmpdir_path / "output_manifest.json" - - _write_manifest(manifest1, input1) - _write_manifest(manifest2, input2) - - processor = ApplyInnerJoin( - left_manifest_file=manifest1, - right_manifest_file=manifest2, - column_id=coloumn_id, - output_manifest_file=manifest_out, - ) - - processor.process() - - with open(manifest_out, "r") as f: - output_lines = [json.loads(line) for line in f] - - assert output_lines == output From 9999008d0ef0696b165e1cd64464b09c7144778b Mon Sep 17 00:00:00 2001 From: Nune Date: Mon, 16 Jun 2025 12:50:23 +0400 Subject: [PATCH 09/13] Ignore duration as it differs based on setup Signed-off-by: Nune --- tests/test_cfg_end_to_end_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index f75bf26c..ca1e10f8 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -266,7 +266,8 @@ def get_test_cases() -> List[Tuple[str, Callable]]: TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/unlabeled/config.yaml", - data_check_fn=partial(data_check_fn_unlabeled) + data_check_fn=partial(data_check_fn_unlabeled), + fields_to_ignore=['duration'], ), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml", From cfe3aaf9f26535ec0938480bc25414177a7a5a23 Mon Sep 17 00:00:00 2001 From: Nune Date: Mon, 16 Jun 2025 14:43:30 +0400 Subject: [PATCH 10/13] Bringing back tests Signed-off-by: Nune --- tests/test_bootstrap_estimate.py | 101 ++++++++++++ tests/test_data_to_data.py | 100 ++++++++++++ tests/test_data_to_dropbool.py | 269 +++++++++++++++++++++++++++++++ tests/test_modify_manifest.py | 169 +++++++++++++++++++ 4 files changed, 639 insertions(+) create mode 100644 tests/test_bootstrap_estimate.py create mode 100644 tests/test_data_to_data.py create mode 100644 tests/test_data_to_dropbool.py create mode 100644 tests/test_modify_manifest.py diff --git a/tests/test_bootstrap_estimate.py b/tests/test_bootstrap_estimate.py new file mode 100644 index 00000000..18ac7bf7 --- /dev/null +++ b/tests/test_bootstrap_estimate.py @@ -0,0 +1,101 @@ +import json +import tempfile +from pathlib import Path +from sdp.utils import BootstrapProcessor + +def _write_manifest(manifest_path: Path, entries): + with manifest_path.open("w") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + +def test_bootstrap_processor(): + manifest1_data = [ + {"audio_filepath": "path1.wav", "duration": 3.744, "text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", + "pred_text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", "wer": 0.142857, "tokens": 7, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.142857}, + {"audio_filepath": "path2.wav", "duration": 5.76, "text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմինդրների արտադրությունը։", + "pred_text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կաղնիտների արտադրությունը։", "wer": 0.285714, "tokens": 7, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.285714}, + {"audio_filepath": "path3.wav", "duration": 6.984, "text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվազի մոտ երեք անգամ։", + "pred_text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվագի մոտ երեք անգամ։", "wer": 0.1, "tokens": 10, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.1}, + {"audio_filepath": "path4.wav", "duration": 4.104, "text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", + "pred_text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", "wer": 0.0, "tokens": 6, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.0} + ] + + manifest2_data = [ + {"audio_filepath": "path1.wav", "duration": 3.744, "text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", + "pred_text": "Նա նաև լավ էր գրում մանկական ոտանավորներ։", "wer": 0.142857, "tokens": 7, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.142857}, + {"audio_filepath": "path2.wav", "duration": 5.76, "text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմինդրների արտադրությունը։", + "pred_text": "Ամենամեծ ջանքերը պահանջեց աղյուսների և կղմիտների արտադրությունը։", "wer": 0.285714, "tokens": 7, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.285714}, + {"audio_filepath": "path3.wav", "duration": 6.984, "text": "Եթե մոտակայքում չկան մեղվափեթակներ, ապա բերքատվությունը կնվազի մոտ երեք անգամ։", + "pred_text": "Եթե մոտակայքում չկան մեղվափետներ, ապա բերքատվությունը կնվացի մոտ երեք անգամ։", "wer": 0.2, "tokens": 10, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.2}, + {"audio_filepath": "path4.wav", "duration": 4.104, "text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", + "pred_text": "Դպրոցը հիմնականում պահվել է եկեղեցու եկամուտներով։", "wer": 0.0, "tokens": 6, + "ins_rate": 0.0, "del_rate": 0.0, "sub_rate": 0.0} + ] + + # Expected output for comparison + expected_output = { + "individual_results": { + "manifest1.json": { + "mean_wer": 5.358, + "ci_lower": 0.5625, + "ci_upper": 10.992625 + }, + "manifest2.json": { + "mean_wer": 9.0725, + "ci_lower": 5.0, + "ci_upper": 15.234875 + } + }, + "pairwise_comparisons": [ + { + "file_1": "manifest1.json", + "file_2": "manifest2.json", + "delta_wer_mean": -1.75, + "ci_lower": -5.0, + "ci_upper": 0.0, + "poi": 0.0 + } + ] + } + + with tempfile.TemporaryDirectory() as tmpdir: + # Create temporary paths + manifest1_path = Path(tmpdir) / "manifest1.json" + manifest2_path = Path(tmpdir) / "manifest2.json" + output_path = Path(tmpdir) / "output_manifest.json" + + # Write manifest data to temporary files + _write_manifest(manifest1_path, manifest1_data) + _write_manifest(manifest2_path, manifest2_data) + + # Run BootstrapProcessor with test parameters + processor = BootstrapProcessor( + bootstrap_manifest_files=[str(manifest1_path), str(manifest2_path)], + raw_data_dir=str(tmpdir), + output_file=str(output_path), + num_bootstraps=10, + bootstrap_sample_ratio=1.0, + calculate_pairwise=True, + metric_type='wer', + text_key='text', + pred_text_key='pred_text', + ci_lower=2.5, + ci_upper=97.5, + random_state=42, + output_manifest_file=None # A placeholder to skip BaseProcessor failing + ) + + processor.process() + + # Load and compare the processor output + with open(output_path, "r") as f: + output = json.load(f) + + assert output == expected_output, f"Expected {expected_output}, but got {output}" diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py new file mode 100644 index 00000000..5bd75f47 --- /dev/null +++ b/tests/test_data_to_data.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from sdp.processors.modify_manifest.data_to_data import ( + InsIfASRInsertion, + SubIfASRSubstitution, + SubMakeLowercase, + SubRegex, +) + +test_params_list = [] + +test_params_list.extend( + [ + ( + InsIfASRInsertion, + {"insert_words": [" nemo", "nemo ", " nemo "]}, + {"text": "i love the toolkit", "pred_text": "i love the nemo toolkit"}, + {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, + ), + ( + InsIfASRInsertion, + {"insert_words": [" nemo", "nemo ", " nemo "]}, + {"text": "i love the toolkit", "pred_text": "i love the new nemo toolkit"}, + {"text": "i love the toolkit", "pred_text": "i love the new nemo toolkit"}, + ), + ] +) + +test_params_list.extend( + [ + ( + SubIfASRSubstitution, + {"sub_words": {"nmo ": "nemo "}}, + {"text": "i love the nmo toolkit", "pred_text": "i love the nemo toolkit"}, + {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, + ), + ] +) + +test_params_list.extend( + [ + ( + SubIfASRSubstitution, + {"sub_words": {"nmo ": "nemo "}}, + {"text": "i love the nmo toolkit", "pred_text": "i love the nemo toolkit"}, + {"text": "i love the nemo toolkit", "pred_text": "i love the nemo toolkit"}, + ), + ] +) + +test_params_list.extend( + [ + ( + SubMakeLowercase, + {}, + {"text": "Hello Привет 123"}, + {"text": "hello привет 123"}, + ), + ( + SubMakeLowercase, + {"text_key": "text_new"}, + {"text_new": "Hello Привет 123"}, + {"text_new": "hello привет 123"}, + ), + ] +) + +test_params_list.extend( + [ + ( + SubRegex, + {"regex_params_list": [{"pattern": "\s<.*>\s", "repl": " "}]}, + {"text": "hello world"}, + {"text": "hello world"}, + ), + ] +) + + +@pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str) +def test_data_to_data(test_class, class_kwargs, test_input, expected_output): + processor = test_class(**class_kwargs, output_manifest_file=None) + + output = processor.process_dataset_entry(test_input)[0].data + + assert output == expected_output diff --git a/tests/test_data_to_dropbool.py b/tests/test_data_to_dropbool.py new file mode 100644 index 00000000..743c7620 --- /dev/null +++ b/tests/test_data_to_dropbool.py @@ -0,0 +1,269 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from sdp.processors.modify_manifest.data_to_dropbool import ( + DropASRErrorBeginningEnd, + DropHighCER, + DropHighLowCharrate, + DropHighLowDuration, + DropHighLowWordrate, + DropHighWER, + DropIfNoneOfRegexMatch, + DropIfRegexMatch, + DropIfSubstringInInsertion, + DropLowWordMatchRate, + DropNonAlphabet, +) + +test_params_list = [] + +test_params_list.extend( + [ + ( + DropHighLowCharrate, + {"high_charrate_threshold": 9.9, "low_charrate_threshold": 0}, + {"text": "0123456789", "duration": 1}, + True, + ), + ( + DropHighLowCharrate, + {"high_charrate_threshold": 99, "low_charrate_threshold": 10.1}, + {"text": "0123456789", "duration": 1}, + True, + ), + ( + DropHighLowCharrate, + {"high_charrate_threshold": 10.1, "low_charrate_threshold": 9.9}, + {"text": "0123456789", "duration": 1}, + False, + ), + ] +) + +test_params_list.extend( + [ + ( + DropHighLowWordrate, + {"high_wordrate_threshold": 3.9, "low_wordrate_threshold": 0}, + {"text": "11 22 33 44", "duration": 1}, + True, + ), + ( + DropHighLowWordrate, + {"high_wordrate_threshold": 99, "low_wordrate_threshold": 4.1}, + {"text": "11 22 33 44", "duration": 1}, + True, + ), + ( + DropHighLowWordrate, + {"high_wordrate_threshold": 4.1, "low_wordrate_threshold": 3.9}, + {"text": "11 22 33 44", "duration": 1}, + False, + ), + ] +) + +test_params_list.extend( + [ + ( + DropHighLowDuration, + {"high_duration_threshold": 3.9, "low_duration_threshold": 0}, + {"duration": 4}, + True, + ), + ( + DropHighLowDuration, + {"high_duration_threshold": 99, "low_duration_threshold": 4.1}, + {"duration": 4}, + True, + ), + ( + DropHighLowDuration, + {"high_duration_threshold": 4.1, "low_duration_threshold": 3.9}, + {"duration": 4}, + False, + ), + ] +) + + +test_params_list.extend( + [ + ( + DropNonAlphabet, + {"alphabet": " abc"}, + {"text": "ab ba cab dac"}, + True, + ), + ( + DropNonAlphabet, + {"alphabet": " abcd"}, + {"text": "ab ba cab dac"}, + False, + ), + ] +) + + +test_params_list.extend( + [ + ( + DropASRErrorBeginningEnd, + {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, + {"text": "2", "pred_text": "1 2 3"}, + True, + ), + ( + DropASRErrorBeginningEnd, + {"beginning_error_char_threshold": 2, "end_error_char_threshold": 0}, + {"text": "2", "pred_text": "1 2 3"}, + True, + ), + ( + DropASRErrorBeginningEnd, + {"beginning_error_char_threshold": 2, "end_error_char_threshold": 2}, + {"text": "2", "pred_text": "1 2 3"}, + False, + ), + ( + DropASRErrorBeginningEnd, + {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, + {"text": "sentence with some text here", "pred_text": "sentence with some text her"}, + False, + ), + ( + DropASRErrorBeginningEnd, + {"beginning_error_char_threshold": 0, "end_error_char_threshold": 2}, + { + "text": "sentence with some text here but actually more text was spoken", + "pred_text": "sentence with some text her", + }, + True, + ), + ] +) + +test_params_list.extend( + [ + ( + DropHighCER, + {"cer_threshold": 9.9}, + {"text": "0123456789", "pred_text": "012345678"}, + True, + ), + ( + DropHighCER, + {"cer_threshold": 10.1}, + {"text": "0123456789", "pred_text": "012345678"}, + False, + ), + ] +) + +test_params_list.extend( + [ + ( + DropHighWER, + {"wer_threshold": 0}, + {"text": "11 22", "pred_text": "11 22"}, + False, + ), + ( + DropHighWER, + {"wer_threshold": 50.1}, + {"text": "11 22", "pred_text": "11 22 33"}, + False, + ), + ( + DropHighWER, + {"wer_threshold": 49.9}, + {"text": "11 22", "pred_text": "11 22 33"}, + True, + ), + ] +) + +test_params_list.extend( + [ + (DropIfNoneOfRegexMatch, {"regex_patterns": ["keep this", "also this"]}, {"text": "I don't want this"}, True), + ( + DropIfNoneOfRegexMatch, + {"regex_patterns": ["keep this", "also this"]}, + {"text": "I want to keep this"}, + False, + ), + ] +) + +test_params_list.extend( + [ + (DropIfRegexMatch, {"regex_patterns": ["incorrect_text"]}, {"text": "incorrect_text"}, True), + ( + DropIfRegexMatch, + {"regex_patterns": ["001/002"], "text_key": "audio_filepath"}, + {"audio_filepath": "001/002/003.wav"}, + True, + ), + ] +) + +test_params_list.extend( + [ + ( + DropLowWordMatchRate, + {"wmr_threshold": 50.1}, + {"text": "hello world i'm nemo", "pred_text": "hello world"}, + True, + ), + ( + DropLowWordMatchRate, + {"wmr_threshold": 49.9}, + {"text": "hello world i'm nemo", "pred_text": "hello world"}, + False, + ), + ] +) + +test_params_list.extend( + [ + ( + DropIfSubstringInInsertion, + {"substrings_in_insertion": ["might "]}, + {"text": "we miss certain words", "pred_text": "we might miss certain words"}, + True, + ), + ( + DropIfSubstringInInsertion, + {"substrings_in_insertion": ["might "]}, + {"text": "we may certain words", "pred_text": "we might miss certain words"}, + False, + ), + ] +) + + +@pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str) +def test_data_to_data(test_class, class_kwargs, test_input, expected_output): + processor = test_class(**class_kwargs, output_manifest_file=None) + + output = processor.process_dataset_entry(test_input) + if output: + output = output[0].data + + if expected_output: + assert output is None + else: + assert output == test_input diff --git a/tests/test_modify_manifest.py b/tests/test_modify_manifest.py new file mode 100644 index 00000000..99583c26 --- /dev/null +++ b/tests/test_modify_manifest.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import tempfile +from pathlib import Path +from typing import Dict, List, Union + +import pytest + +from sdp.processors import ApplyInnerJoin, DropNonAlphabet + + +def _write_manifest(manifest: Path, entries: List[Dict[str, Union[str, float]]]): + with open(manifest, "w") as f: + for line in entries: + f.write(json.dumps(line) + "\n") + + +def test_empty_test_cases(): + """Testing that empty test cases don't raise an error.""" + processor = DropNonAlphabet("123", output_manifest_file="tmp") + processor.test() + + +inner_join_entries = [] +inner_join_entries.extend( + [ + ( + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc3", "duration": 11, "audio_filepath": "path_2"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + ], + None, + ), + ( + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + ], + None, + ), + ( + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc_2", "duration": 11, "audio_filepath": "path2"}, + ], + [ + { + "id": 1, + "text_x": "abc1", + "duration_x": 10, + "audio_filepath_x": "path1", + "text_y": "abc1", + "duration_y": 10, + "audio_filepath_y": "path1", + }, + { + "id": 2, + "text_x": "abc2", + "duration_x": 11, + "audio_filepath_x": "path2", + "text_y": "abc_2", + "duration_y": 11, + "audio_filepath_y": "path2", + }, + ], + "id", + ), + ( + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc2", "duration": 11, "audio_filepath": "path2"}, + {"id": 3, "text": "abc3", "duration": 11, "audio_filepath": "path3"}, + ], + [ + {"id": 1, "text": "abc1", "duration": 10, "audio_filepath": "path1"}, + {"id": 2, "text": "abc_2", "duration": 11, "audio_filepath": "path2"}, + ], + [ + { + "id": 1, + "text_x": "abc1", + "duration": 10, + "audio_filepath": "path1", + "text_y": "abc1", + }, + { + "id": 2, + "text_x": "abc2", + "duration": 11, + "audio_filepath": "path2", + "text_y": "abc_2", + }, + ], + ["id", "duration", "audio_filepath"], + ), + ( + [{"id": 1, "text": "text1"}], + [{"id": 1, "text": "text2"}], + [{"id": 1, "text_x": "text1", "text_y": "text2"}], + "id", + ), + ] +) + + +@pytest.mark.parametrize("input1,input2,output,coloumn_id", inner_join_entries) +def test_apply_inner_join( + input1: List[Dict[str, Union[str, float]]], + input2: List[Dict[str, Union[str, float]]], + output: List[Dict[str, Union[str, float]]], + coloumn_id: Union[str, List[str], None], +): + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + manifest1 = tmpdir_path / "manifest1.json" + manifest2 = tmpdir_path / "manifest2.json" + manifest_out = tmpdir_path / "output_manifest.json" + + _write_manifest(manifest1, input1) + _write_manifest(manifest2, input2) + + processor = ApplyInnerJoin( + left_manifest_file=manifest1, + right_manifest_file=manifest2, + column_id=coloumn_id, + output_manifest_file=manifest_out, + ) + + processor.process() + + with open(manifest_out, "r") as f: + output_lines = [json.loads(line) for line in f] + + assert output_lines == output From 5fedad0e93af06228055454565fba457c287e2c4 Mon Sep 17 00:00:00 2001 From: Nune Date: Mon, 16 Jun 2025 14:48:06 +0400 Subject: [PATCH 11/13] remove prints Signed-off-by: Nune --- .../datasets/commoncrawl/commoncrawl.py | 3 +- .../modify_manifest/data_to_dropbool.py | 2 - sdp/run_processors.py | 5 - tests/test_cfg_end_to_end_tests.py | 249 +++++++++--------- 4 files changed, 125 insertions(+), 134 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 8a5cc2c6..b020d4cf 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -67,8 +67,7 @@ def process_dataset_entry(self, data_entry): pass end_c = end_sr if len(text_c) > 0 and ( - end_c - start_c > self.duration_threshold * samplerate or - text_c[-1] == "." or text_c[-1] == "?"): + end_c - start_c > self.duration_threshold * samplerate): res_list.append( self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) text_c = '' diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index fda5651e..eeeebd1e 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -76,8 +76,6 @@ def __init__( ) def process_dataset_entry(self, data_entry): - print(f"self.input_value_key {self.input_value_key}") - print(f"data_entry {data_entry}") input_value = data_entry[self.input_value_key] target = self.target_value if self.operator(input_value, target): diff --git a/sdp/run_processors.py b/sdp/run_processors.py index 43c4b06d..6ddf27f4 100644 --- a/sdp/run_processors.py +++ b/sdp/run_processors.py @@ -160,8 +160,6 @@ def run_processors(cfg): # filtering out any processors that have should_run=False processors_cfgs = [] for processor_cfg in selected_cfgs: - print("********************************************************************************") - print(f"processor_cfg {processor_cfg}") with open_dict(processor_cfg): should_run = processor_cfg.pop("should_run", True) if should_run: @@ -244,9 +242,6 @@ def run_processors(cfg): logger.info('=> Running processor "%s" with Dask', proc) else: logger.info('=> Running processor "%s" with Multiprocessing', proc) - - print(f"p {proc}") - print(f"") proc.process() finally: if dask_client is not None: diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index ca1e10f8..ab2d9a7b 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -139,131 +139,130 @@ def data_check_fn_armenian_toloka_pipeline_get_final_res(raw_data_dir: str) -> N def get_test_cases() -> List[Tuple[str, Callable]]: return [ - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", - # data_check_fn=partial(data_check_fn_mls, language="spanish"), - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", - # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", - # data_check_fn=data_check_fn_voxpopuli - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", - # data_check_fn=partial(data_check_fn_mls, language="italian") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", - # data_check_fn=partial(data_check_fn_mls, language="portuguese") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", - # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", - # data_check_fn=partial(data_check_fn_mtedx, language_id="pt") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", - # data_check_fn=data_check_fn_coraa - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", - # data_check_fn=lambda raw_data_dir: True - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", - # data_check_fn=lambda raw_data_dir: True - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", - # data_check_fn=data_check_fn_librispeech - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", - # data_check_fn=data_check_fn_fleurs - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", - # data_check_fn=lambda raw_data_dir: True - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", - # data_check_fn=lambda raw_data_dir: True, - # fields_to_ignore=['text'], - # ), - #TestCase( - # f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", - # partial(data_check_fn_mcv, archive_file_stem="mcv_kk") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", - # data_check_fn=data_check_fn_slr140 - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="slr102_kk.tar.gz") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml", - # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml", - # data_check_fn=data_check_fn_uzbekvoice - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml", - # data_check_fn=data_check_fn_fleurs - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config_filter_noisy_train.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz"), - # reference_manifest_filename="test_data_reference_filter.json" - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mcv/config.yaml", - # data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv.ar") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/fleurs/config.yaml", - # data_check_fn=data_check_fn_fleurs - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mediaspeech/config.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="AR.tar.gz") - # ), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/arabic/everyayah/config.yaml", - # data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") - #), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", - # data_check_fn=data_check_fn_armenian_toloka_pipeline_start, - # fields_to_ignore=['source_filepath'], - # processors_to_run="2:14", - # reference_manifest_filename="pipeline_start/test_data_reference.json" - #), - #TestCase( - # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", - # data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, - # reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", - # fields_to_ignore=['audio_filepath', 'duration'], - # processors_to_run="1:6" - #), - + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", + data_check_fn=partial(data_check_fn_mls, language="spanish"), + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", + data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", + data_check_fn=data_check_fn_voxpopuli + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", + data_check_fn=partial(data_check_fn_mls, language="italian") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", + data_check_fn=partial(data_check_fn_mls, language="portuguese") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", + data_check_fn=partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", + data_check_fn=partial(data_check_fn_mtedx, language_id="pt") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", + data_check_fn=data_check_fn_coraa + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", + data_check_fn=lambda raw_data_dir: True + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", + data_check_fn=lambda raw_data_dir: True + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", + data_check_fn=data_check_fn_librispeech + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", + data_check_fn=data_check_fn_fleurs + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", + data_check_fn=lambda raw_data_dir: True + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", + data_check_fn=lambda raw_data_dir: True, + fields_to_ignore=['text'], + ), + TestCase( + f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", + partial(data_check_fn_mcv, archive_file_stem="mcv_kk") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", + data_check_fn=data_check_fn_slr140 + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="slr102_kk.tar.gz") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml", + data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml", + data_check_fn=data_check_fn_uzbekvoice + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml", + data_check_fn=data_check_fn_fleurs + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/masc/config_filter_noisy_train.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="masc.tar.gz"), + reference_manifest_filename="test_data_reference_filter.json" + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mcv/config.yaml", + data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv.ar") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/fleurs/config.yaml", + data_check_fn=data_check_fn_fleurs + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/mediaspeech/config.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="AR.tar.gz") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/arabic/everyayah/config.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", + data_check_fn=data_check_fn_armenian_toloka_pipeline_start, + fields_to_ignore=['source_filepath'], + processors_to_run="2:14", + reference_manifest_filename="pipeline_start/test_data_reference.json" + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", + data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, + reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", + fields_to_ignore=['audio_filepath', 'duration'], + processors_to_run="1:6" + ), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/unlabeled/config.yaml", data_check_fn=partial(data_check_fn_unlabeled), From aac6f82eff43da88f22739fa544469de48be0fc1 Mon Sep 17 00:00:00 2001 From: Nune Date: Mon, 16 Jun 2025 14:49:01 +0400 Subject: [PATCH 12/13] remove prints Signed-off-by: Nune --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index b020d4cf..8a5cc2c6 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -67,7 +67,8 @@ def process_dataset_entry(self, data_entry): pass end_c = end_sr if len(text_c) > 0 and ( - end_c - start_c > self.duration_threshold * samplerate): + end_c - start_c > self.duration_threshold * samplerate or + text_c[-1] == "." or text_c[-1] == "?"): res_list.append( self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) text_c = '' From deec51363765d3204bceaababb37b06d3ae20338 Mon Sep 17 00:00:00 2001 From: Nune Date: Mon, 16 Jun 2025 14:50:24 +0400 Subject: [PATCH 13/13] Remove Signed-off-by: Nune --- tests/test_cfg_end_to_end_tests.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index ab2d9a7b..05fdfcb3 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -104,8 +104,6 @@ def data_check_fn_unlabeled(raw_data_dir: str) -> None: with tarfile.open(expected_file, 'r:gz') as tar: tar.extractall(path=raw_data_dir) - - def data_check_fn_armenian_toloka_pipeline_start(raw_data_dir: str) -> None: """Checks for the Armenian Toloka test data.