diff --git a/Jenkinsfile b/Jenkinsfile index c7d16ec5560e..87dc4782f6e9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -102,7 +102,7 @@ pipeline { } } - stage('L0: Unit Tests Speech Dataset Processor') { + stage('L0: Unit Tests Speech Data Processor') { when { anyOf { branch 'main' @@ -110,8 +110,8 @@ pipeline { } } steps { - sh 'pip install -r tools/speech_dataset_processor/requirements.txt' - sh 'cd tools/speech_dataset_processor && CUDA_VISIBLE_DEVICES="" pytest tests -m "not pleasefixme"' + sh 'pip install -r tools/speech_data_processor/requirements.txt' + sh 'cd tools/speech_data_processor && CUDA_VISIBLE_DEVICES="" pytest tests -m "not pleasefixme"' } } diff --git a/tools/speech_dataset_processor/README.md b/tools/speech_data_processor/README.md similarity index 94% rename from tools/speech_dataset_processor/README.md rename to tools/speech_data_processor/README.md index 31f22f5d81bf..58547c2e8317 100644 --- a/tools/speech_dataset_processor/README.md +++ b/tools/speech_data_processor/README.md @@ -1,6 +1,6 @@ -# Speech Dataset Processor +# Speech Data Processor -Speech Dataset Processor (SDP) is a toolkit to make it easy to: +Speech Data Processor (SDP) is a toolkit to make it easy to: 1. write code to process a new dataset, minimizing the amount of boilerplate code required. 2. share the steps for processing a speech dataset. Sharing processing steps can be as easy as sharing a YAML file. @@ -8,7 +8,7 @@ SDP's philosophy is to represent processing operations as 'processor' classes. M SDP is specifically intended for the use case when you have an existing dataset with the audio & text pairs already specified in some form, and you wish to create a JSON manifest suitable for use with NeMo. SDP allows for intermediate cleaning and filtering steps which involve amending the 'ground truth' `"text"` or dropping utterances which are deemed to be too inaccurate for training on. -## Quick intro to Speech Dataset Processor +## Quick intro to Speech Data Processor * The steps to process a dataset are specified by a YAML config file. * The YAML config file contains a list of processor classes & the args to pass into the constructor. @@ -44,7 +44,7 @@ processors: ... ``` ## Existing processor classes -In addition to those mentioned in the example config file, many more classes are already included in Speech Dataset Processor, for example: +In addition to those mentioned in the example config file, many more classes are already included in Speech Data Processor, for example: * `sdp.processors.ASRInference` will run inference on the manifest using a specified `pretrained_model`. * `sdp.processors.DropHighWER` will compute WER between `text` and `pred_text` of each utterance and remove the utterance if WER is greater than the specified `wer_threshold`. * `sdp.processors.DropHighLowCharrate` will compute the character rate in the utterance using `text` and `duration`, and drop the utterance if it is outside the bounds of the specified `high_charrate_threshold` and `low_charrate_threshold`. Carefully chosen thresholds will allow us to drop utterances with incorrect ground truth `text`. diff --git a/tools/speech_dataset_processor/__init__.py b/tools/speech_data_processor/__init__.py similarity index 100% rename from tools/speech_dataset_processor/__init__.py rename to tools/speech_data_processor/__init__.py diff --git a/tools/speech_dataset_processor/dataset_configs/spanish/mls/config_mls_es.yaml b/tools/speech_data_processor/dataset_configs/spanish/mls/config_mls_es.yaml similarity index 100% rename from tools/speech_dataset_processor/dataset_configs/spanish/mls/config_mls_es.yaml rename to tools/speech_data_processor/dataset_configs/spanish/mls/config_mls_es.yaml diff --git a/tools/speech_dataset_processor/dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py b/tools/speech_data_processor/dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py similarity index 100% rename from tools/speech_dataset_processor/dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py rename to tools/speech_data_processor/dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py diff --git a/tools/speech_dataset_processor/main.py b/tools/speech_data_processor/main.py similarity index 100% rename from tools/speech_dataset_processor/main.py rename to tools/speech_data_processor/main.py diff --git a/tools/speech_dataset_processor/requirements.txt b/tools/speech_data_processor/requirements.txt similarity index 100% rename from tools/speech_dataset_processor/requirements.txt rename to tools/speech_data_processor/requirements.txt diff --git a/tools/speech_dataset_processor/sdp/__init__.py b/tools/speech_data_processor/sdp/__init__.py similarity index 100% rename from tools/speech_dataset_processor/sdp/__init__.py rename to tools/speech_data_processor/sdp/__init__.py diff --git a/tools/speech_dataset_processor/sdp/processors/__init__.py b/tools/speech_data_processor/sdp/processors/__init__.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/__init__.py rename to tools/speech_data_processor/sdp/processors/__init__.py diff --git a/tools/speech_dataset_processor/sdp/processors/asr_inference.py b/tools/speech_data_processor/sdp/processors/asr_inference.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/asr_inference.py rename to tools/speech_data_processor/sdp/processors/asr_inference.py diff --git a/tools/speech_dataset_processor/sdp/processors/base_processor.py b/tools/speech_data_processor/sdp/processors/base_processor.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/base_processor.py rename to tools/speech_data_processor/sdp/processors/base_processor.py diff --git a/tools/speech_dataset_processor/sdp/processors/create_initial_manifest/__init__.py b/tools/speech_data_processor/sdp/processors/create_initial_manifest/__init__.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/create_initial_manifest/__init__.py rename to tools/speech_data_processor/sdp/processors/create_initial_manifest/__init__.py diff --git a/tools/speech_dataset_processor/sdp/processors/create_initial_manifest/create_initial_manifest_mls.py b/tools/speech_data_processor/sdp/processors/create_initial_manifest/create_initial_manifest_mls.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/create_initial_manifest/create_initial_manifest_mls.py rename to tools/speech_data_processor/sdp/processors/create_initial_manifest/create_initial_manifest_mls.py diff --git a/tools/speech_dataset_processor/sdp/processors/modify_manifest/__init__.py b/tools/speech_data_processor/sdp/processors/modify_manifest/__init__.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/modify_manifest/__init__.py rename to tools/speech_data_processor/sdp/processors/modify_manifest/__init__.py diff --git a/tools/speech_dataset_processor/sdp/processors/modify_manifest/data_to_data.py b/tools/speech_data_processor/sdp/processors/modify_manifest/data_to_data.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/modify_manifest/data_to_data.py rename to tools/speech_data_processor/sdp/processors/modify_manifest/data_to_data.py diff --git a/tools/speech_dataset_processor/sdp/processors/modify_manifest/data_to_dropbool.py b/tools/speech_data_processor/sdp/processors/modify_manifest/data_to_dropbool.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/modify_manifest/data_to_dropbool.py rename to tools/speech_data_processor/sdp/processors/modify_manifest/data_to_dropbool.py diff --git a/tools/speech_dataset_processor/sdp/processors/modify_manifest/modify_manifest.py b/tools/speech_data_processor/sdp/processors/modify_manifest/modify_manifest.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/modify_manifest/modify_manifest.py rename to tools/speech_data_processor/sdp/processors/modify_manifest/modify_manifest.py diff --git a/tools/speech_dataset_processor/sdp/processors/write_manifest.py b/tools/speech_data_processor/sdp/processors/write_manifest.py similarity index 100% rename from tools/speech_dataset_processor/sdp/processors/write_manifest.py rename to tools/speech_data_processor/sdp/processors/write_manifest.py diff --git a/tools/speech_dataset_processor/sdp/run_processors.py b/tools/speech_data_processor/sdp/run_processors.py similarity index 100% rename from tools/speech_dataset_processor/sdp/run_processors.py rename to tools/speech_data_processor/sdp/run_processors.py diff --git a/tools/speech_dataset_processor/sdp/utils/__init__.py b/tools/speech_data_processor/sdp/utils/__init__.py similarity index 100% rename from tools/speech_dataset_processor/sdp/utils/__init__.py rename to tools/speech_data_processor/sdp/utils/__init__.py diff --git a/tools/speech_dataset_processor/sdp/utils/common.py b/tools/speech_data_processor/sdp/utils/common.py similarity index 100% rename from tools/speech_dataset_processor/sdp/utils/common.py rename to tools/speech_data_processor/sdp/utils/common.py diff --git a/tools/speech_dataset_processor/sdp/utils/edit_spaces.py b/tools/speech_data_processor/sdp/utils/edit_spaces.py similarity index 100% rename from tools/speech_dataset_processor/sdp/utils/edit_spaces.py rename to tools/speech_data_processor/sdp/utils/edit_spaces.py diff --git a/tools/speech_dataset_processor/sdp/utils/get_diff.py b/tools/speech_data_processor/sdp/utils/get_diff.py similarity index 100% rename from tools/speech_dataset_processor/sdp/utils/get_diff.py rename to tools/speech_data_processor/sdp/utils/get_diff.py diff --git a/tools/speech_dataset_processor/sdp/utils/metrics_computation.py b/tools/speech_data_processor/sdp/utils/metrics_computation.py similarity index 100% rename from tools/speech_dataset_processor/sdp/utils/metrics_computation.py rename to tools/speech_data_processor/sdp/utils/metrics_computation.py diff --git a/tools/speech_dataset_processor/tests/__init__.py b/tools/speech_data_processor/tests/__init__.py similarity index 100% rename from tools/speech_dataset_processor/tests/__init__.py rename to tools/speech_data_processor/tests/__init__.py diff --git a/tools/speech_dataset_processor/tests/prepare_test_data/prepare_mls_data.py b/tools/speech_data_processor/tests/prepare_test_data/prepare_mls_data.py similarity index 100% rename from tools/speech_dataset_processor/tests/prepare_test_data/prepare_mls_data.py rename to tools/speech_data_processor/tests/prepare_test_data/prepare_mls_data.py diff --git a/tools/speech_dataset_processor/tests/test_all_cfgs.py b/tools/speech_data_processor/tests/test_all_cfgs.py similarity index 100% rename from tools/speech_dataset_processor/tests/test_all_cfgs.py rename to tools/speech_data_processor/tests/test_all_cfgs.py diff --git a/tools/speech_dataset_processor/tests/test_data_to_data.py b/tools/speech_data_processor/tests/test_data_to_data.py similarity index 100% rename from tools/speech_dataset_processor/tests/test_data_to_data.py rename to tools/speech_data_processor/tests/test_data_to_data.py diff --git a/tools/speech_dataset_processor/tests/test_data_to_dropbool.py b/tools/speech_data_processor/tests/test_data_to_dropbool.py similarity index 100% rename from tools/speech_dataset_processor/tests/test_data_to_dropbool.py rename to tools/speech_data_processor/tests/test_data_to_dropbool.py diff --git a/tools/speech_dataset_processor/tests/test_utils.py b/tools/speech_data_processor/tests/test_utils.py similarity index 100% rename from tools/speech_dataset_processor/tests/test_utils.py rename to tools/speech_data_processor/tests/test_utils.py