Jenkinsfile (6 changes: 3 additions & 3 deletions)
@@ -102,16 +102,16 @@ pipeline {
      }
    }

-    stage('L0: Unit Tests Speech Dataset Processor') {
+    stage('L0: Unit Tests Speech Data Processor') {
      when {
        anyOf {
          branch 'main'
          changeRequest target: 'main'
        }
      }
      steps {
-        sh 'pip install -r tools/speech_dataset_processor/requirements.txt'
-        sh 'cd tools/speech_dataset_processor && CUDA_VISIBLE_DEVICES="" pytest tests -m "not pleasefixme"'
+        sh 'pip install -r tools/speech_data_processor/requirements.txt'
+        sh 'cd tools/speech_data_processor && CUDA_VISIBLE_DEVICES="" pytest tests -m "not pleasefixme"'
      }
    }

@@ -1,14 +1,14 @@
-# Speech Dataset Processor
+# Speech Data Processor

-Speech Dataset Processor (SDP) is a toolkit to make it easy to:
+Speech Data Processor (SDP) is a toolkit to make it easy to:
1. write code to process a new dataset, minimizing the amount of boilerplate code required.
2. share the steps for processing a speech dataset. Sharing processing steps can be as easy as sharing a YAML file.

SDP's philosophy is to represent processing operations as 'processor' classes. Many common processing operations are provided, and it is easy to add your own. In some cases, all you will need to do to process a new dataset is simply to write a YAML file containing the parameters needed to process your dataset.

SDP is specifically intended for the use case when you have an existing dataset with the audio & text pairs already specified in some form, and you wish to create a JSON manifest suitable for use with NeMo. SDP allows for intermediate cleaning and filtering steps which involve amending the 'ground truth' `"text"` or dropping utterances which are deemed to be too inaccurate for training on.
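For reference, each line of such a NeMo-style manifest is a standalone JSON object with the audio path, duration, and transcript; a minimal sketch of one entry is shown below (the file path and values are hypothetical):

```json
{"audio_filepath": "/data/audio/utt0001.wav", "duration": 3.42, "text": "the quick brown fox"}
```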

-## Quick intro to Speech Dataset Processor
+## Quick intro to Speech Data Processor

* The steps to process a dataset are specified by a YAML config file.
* The YAML config file contains a list of processor classes & the args to pass into the constructor.
@@ -44,7 +44,7 @@ processors:
...
```
## Existing processor classes
-In addition to those mentioned in the example config file, many more classes are already included in Speech Dataset Processor, for example:
+In addition to those mentioned in the example config file, many more classes are already included in Speech Data Processor, for example:
* `sdp.processors.ASRInference` will run inference on the manifest using a specified `pretrained_model`.
* `sdp.processors.DropHighWER` will compute WER between `text` and `pred_text` of each utterance and remove the utterance if WER is greater than the specified `wer_threshold`.
* `sdp.processors.DropHighLowCharrate` will compute the character rate in the utterance using `text` and `duration`, and drop the utterance if it is outside the bounds of the specified `high_charrate_threshold` and `low_charrate_threshold`. Carefully chosen thresholds will allow us to drop utterances with incorrect ground truth `text`.
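
As a rough illustration of how these processors could be chained in a config, here is a hypothetical fragment; the `_target_` key convention, the model name, and the threshold values are assumptions for the sketch, not taken from the repository:

```yaml
processors:
  ...
  # Transcribe each utterance with a pretrained ASR model, adding `pred_text`.
  - _target_: sdp.processors.ASRInference
    pretrained_model: stt_en_conformer_ctc_large   # placeholder model name
  # Drop utterances whose characters-per-second rate is implausible for the language.
  - _target_: sdp.processors.DropHighLowCharrate
    high_charrate_threshold: 21.0
    low_charrate_threshold: 1.0
  # Drop utterances where the WER between `text` and `pred_text` is too high.
  - _target_: sdp.processors.DropHighWER
    wer_threshold: 75.0
```

Note that order matters in such a chain: inference has to run before the WER-based filter so that `pred_text` exists by the time `DropHighWER` reads it.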