From 68bfac9bb968370f1c1b6258bcbc94a6e7b29d9f Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Tue, 10 May 2022 15:20:07 +0300
Subject: [PATCH 1/5] Add draft of fixing tmp files overwritting

Signed-off-by: PeganovAnton
---
 .../data/data_split.py                        |  4 ++--
 .../data/en/data_preprocessing.py             |  4 ++--
 ...nctuation_capitalization_tarred_dataset.py |  6 +++---
 ...are_data_for_punctuation_capitalization.py |  2 +-
 ...nctuation_capitalization_tarred_dataset.py | 19 +++++++++++++++++--
 .../glue_benchmark/glue_benchmark_model.py    |  2 +-
 .../token_classification_model.py             |  2 +-
 .../convert_hf_dataset_to_nemo.py             |  6 +++---
 tools/ctc_segmentation/run_segmentation.sh    |  6 +++---
 tools/ctc_segmentation/scripts/cut_audio.py   |  4 ++--
 .../ctc_segmentation/scripts/prepare_data.py  |  2 +-
 .../scripts/run_ctc_segmentation.py           |  2 +-
 .../pynini_export.py                          |  2 +-
 13 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/examples/nlp/duplex_text_normalization/data/data_split.py b/examples/nlp/duplex_text_normalization/data/data_split.py
index b05cf6d43788..69306242c4f6 100644
--- a/examples/nlp/duplex_text_normalization/data/data_split.py
+++ b/examples/nlp/duplex_text_normalization/data/data_split.py
@@ -22,7 +22,7 @@
 3. Run this script
 # python data_split.py \
        --data_dir=en_with_types/ \
-       --output_dir=data_split/ \
+       --dir_=data_split/ \
        --lang=en
 
 In this example, the split files will be stored in the `data_split` folder.
@@ -88,7 +88,7 @@ def read_google_data(data_file: str, lang: str, split: str, add_test_full=False)
 if __name__ == '__main__':
     parser = ArgumentParser(description='Preprocess Google text normalization dataset')
     parser.add_argument('--data_dir', type=str, required=True, help='Path to folder with data')
-    parser.add_argument('--output_dir', type=str, default='preprocessed', help='Path to folder with preprocessed data')
+    parser.add_argument('--dir_', type=str, default='preprocessed', help='Path to folder with preprocessed data')
     parser.add_argument(
         '--lang', type=str, default=constants.ENGLISH, choices=constants.SUPPORTED_LANGS, help='Language'
     )
diff --git a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
index 9523d0974db8..0a1484da60d1 100644
--- a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
+++ b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
@@ -34,7 +34,7 @@
 4. Run this script on the different splits
 # python data_preprocessing.py \
        --input_path=data_split/train \
-       --output_dir=train_processed \
+       --dir_=train_processed \
        --max_integer_length=4 \
        --max_denominator_length=3
 
@@ -383,7 +383,7 @@ def main():
 
 if __name__ == "__main__":
     parser = ArgumentParser(description="Text Normalization Data Preprocessing for English")
-    parser.add_argument("--output_dir", required=True, type=str, help='Path to output directory.')
+    parser.add_argument("--dir_", required=True, type=str, help='Path to output directory.')
     parser.add_argument("--input_path", required=True, type=str, help='Path to input file or input directory.')
     parser.add_argument(
         "--max_integer_length",
diff --git a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
index d42c4d37121c..a7c48a23b788 100644
--- a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
+++ b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
@@ -53,7 +53,7 @@
 python create_punctuation_capitalization_tarred_dataset.py \
   --text \
   --labels \
-  --output_dir \
+  --dir_ \
   --lines_per_dataset_fragment 10000 \
   --tokens_in_batch 8000 \
   --num_batches_per_tarfile 5 \
@@ -102,7 +102,7 @@ def get_args() -> argparse.Namespace:
         "file.",
     )
     parser.add_argument(
-        "--output_dir",
+        "--dir_",
         "-o",
         type=Path,
         required=True,
@@ -245,7 +245,7 @@ def get_args() -> argparse.Namespace:
     for name in [
         "text",
         "labels",
-        "output_dir",
+        "dir_",
         "tokenizer_model",
         "vocab_file",
         "merges_file",
diff --git a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
index 0ff4306a9306..9ceadd8880d9 100644
--- a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
+++ b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
@@ -87,7 +87,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Prepare data for punctuation and capitalization tasks')
     parser.add_argument("-s", "--source_file", required=True, type=str, help="Path to the source file")
-    parser.add_argument("-o", "--output_dir", required=True, type=str, help="Path to the output directory")
+    parser.add_argument("-o", "--dir_", required=True, type=str, help="Path to the output directory")
     args = parser.parse_args()
 
     if not os.path.exists(args.source_file):
diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
index 363c1a0a1e9d..7430ae801cd4 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
@@ -127,6 +127,21 @@ def get_fragment_start_bytes(
     return num_lines, text_start_bytes, label_start_bytes
 
 
+def select_unique_file_name(dir_: Union[os.PathLike, str], file_name: Union[os.PathLike, str]) -> Path:
+    dir_ = Path(dir_)
+    file_name = str(file_name)
+    fn = dir_ / file_name
+    if fn.exists():
+        index = 2
+        parts = file_name.split('.')
+        stem, suffixes = parts[0], parts[1:]
+        fn = dir_ / '.'.join([stem + f'_{index}'] + suffixes)
+        while fn.exists():
+            index += 1
+            fn = dir_ / '.'.join([stem + f'_{index}'] + suffixes)
+    return fn
+
+
 def process_fragment(
     text_file: Path,
     labels_file: Path,
@@ -160,8 +175,8 @@ def process_fragment(
         special_tokens=special_tokens,
         use_fast=use_fast_tokenizer,
     )
-    tmp_text = output_dir / f'tmp_text_{fragment_idx}.txt'
-    tmp_labels = output_dir / f'tmp_labels_{fragment_idx}.txt'
+    tmp_text = select_unique_file_name(output_dir, f'tmp_text_{fragment_idx}.txt')
+    tmp_labels = select_unique_file_name(output_dir, f'tmp_labels_{fragment_idx}.txt')
     with text_file.open() as tf, labels_file.open() as lf, tmp_text.open('w') as otf, tmp_labels.open('w') as olf:
         tf.seek(text_start_pos)
         lf.seek(label_start_pos)
diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
index c86108e10390..3434b0922b57 100644
--- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
+++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
@@ -209,7 +209,7 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
         val_name = self._validation_names[dataloader_idx].upper()
         logging.info(f'{val_name} evaluation: {results}')
 
-        # writing labels and predictions to a file in output_dir is specified in the config
+        # writing labels and predictions to a file in dir_ is specified in the config
         output_dir = self._cfg.output_dir
         if output_dir:
             os.makedirs(output_dir, exist_ok=True)
diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py
index 7b9531119fa7..3c9a74a4ace5 100644
--- a/nemo/collections/nlp/models/token_classification/token_classification_model.py
+++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py
@@ -455,7 +455,7 @@ def evaluate_from_file(
             all_labels_str = f.readlines()
             all_labels_str = ' '.join([labels.strip() for labels in all_labels_str])
 
-        # writing labels and predictions to a file in output_dir is specified in the config
+        # writing labels and predictions to a file in dir_ is specified in the config
         os.makedirs(output_dir, exist_ok=True)
         filename = os.path.join(output_dir, 'infer_' + os.path.basename(text_file))
         try:
diff --git a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
index 8445a8f11754..71668ba30285 100644
--- a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
+++ b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
@@ -46,14 +46,14 @@
 
 1) The .arrow files for HF cache
 2) The extracted dataset in HF cache
-3) The preprocessed audio files preserved in the output_dir provided in the script.
+3) The preprocessed audio files preserved in the dir_ provided in the script.
 
 Due to this, make sure your HDD is large enough to store the processed dataset !
 
 ## Usage - Offline Mode
 
 python convert_hf_dataset_to_nemo.py \
-    output_dir= \
+    dir_= \
     path=<`path` argument in HF datasets, cannot be null> \
     name=<`name` argument in HF datasets, can be null> \
     split=<`split` argument in HF datasets, can be null> \
@@ -70,7 +70,7 @@
 
 NOTE:
     This mode is not well supported. It trades of speed for storage by only having one copy of the dataset in
-    output_dir, however the speed of processing is around 10x slower than offline mode. Some datasets (such as MCV)
+    dir_, however the speed of processing is around 10x slower than offline mode. Some datasets (such as MCV)
     fail to run entirely.
 
     DO NOT USE if you have sufficient disk space.
diff --git a/tools/ctc_segmentation/run_segmentation.sh b/tools/ctc_segmentation/run_segmentation.sh
index a87f4d1e782b..7c6247719051 100644
--- a/tools/ctc_segmentation/run_segmentation.sh
+++ b/tools/ctc_segmentation/run_segmentation.sh
@@ -66,7 +66,7 @@ echo "TEXT AND AUDIO PREPROCESSING..."
 python $SCRIPTS_DIR/prepare_data.py \
 --in_text=$DATA_DIR/text \
 --audio_dir=$DATA_DIR/audio \
---output_dir=$OUTPUT_DIR/processed/ \
+--dir_=$OUTPUT_DIR/processed/ \
 --language=$LANGUAGE \
 --cut_prefix=$CUT_PREFIX \
 --model=$MODEL_NAME_OR_PATH \
@@ -81,7 +81,7 @@ echo "SEGMENTATION STEP..."
 for WINDOW in 8000 12000
 do
   python $SCRIPTS_DIR/run_ctc_segmentation.py \
-  --output_dir=$OUTPUT_DIR \
+  --dir_=$OUTPUT_DIR \
   --data=$OUTPUT_DIR/processed \
   --sample_rate=$SAMPLE_RATE \
   --model=$MODEL_NAME_OR_PATH \
@@ -99,7 +99,7 @@ python $SCRIPTS_DIR/verify_segments.py \
 # above the MIN_SCORE value will be saved to $OUTPUT_DIR/manifests/manifest.json
 echo "CUTTING AUDIO..."
 python $SCRIPTS_DIR/cut_audio.py \
---output_dir=$OUTPUT_DIR \
+--dir_=$OUTPUT_DIR \
 --alignment=$OUTPUT_DIR/verified_segments \
 --threshold=$MIN_SCORE \
 --offset=$OFFSET \
diff --git a/tools/ctc_segmentation/scripts/cut_audio.py b/tools/ctc_segmentation/scripts/cut_audio.py
index 5698733c4803..e40ffc8ba17c 100644
--- a/tools/ctc_segmentation/scripts/cut_audio.py
+++ b/tools/ctc_segmentation/scripts/cut_audio.py
@@ -22,7 +22,7 @@
 from tqdm import tqdm
 
 parser = argparse.ArgumentParser(description="Cut audio on the segments based on segments")
-parser.add_argument("--output_dir", type=str, help="Path to output directory", required=True)
+parser.add_argument("--dir_", type=str, help="Path to output directory", required=True)
 parser.add_argument(
     "--alignment",
     type=str,
@@ -72,7 +72,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
         line = line[0].split()
         segments.append((float(line[0]) + args.offset / 1000, float(line[1]) + args.offset / 1000, float(line[2])))
 
-    # cut the audio into segments and save the final manifests at output_dir
+    # cut the audio into segments and save the final manifests at dir_
     sampling_rate, signal = wavfile.read(audio_file)
     original_duration = len(signal) / sampling_rate
 
diff --git a/tools/ctc_segmentation/scripts/prepare_data.py b/tools/ctc_segmentation/scripts/prepare_data.py
index 2231c50820c3..3e01cd4e74ec 100644
--- a/tools/ctc_segmentation/scripts/prepare_data.py
+++ b/tools/ctc_segmentation/scripts/prepare_data.py
@@ -39,7 +39,7 @@
 
 parser = argparse.ArgumentParser(description="Prepares text and audio files for segmentation")
 parser.add_argument("--in_text", type=str, default=None, help="Path to a text file or a directory with .txt files")
-parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory")
+parser.add_argument("--dir_", type=str, required=True, help="Path to output directory")
 parser.add_argument("--audio_dir", type=str, help="Path to folder with .mp3 or .wav audio files")
 parser.add_argument("--sample_rate", type=int, default=16000, help="Sampling rate used during ASR model training, Hz")
 parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
diff --git a/tools/ctc_segmentation/scripts/run_ctc_segmentation.py b/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
index 772af8c5ccce..b588d02540b6 100644
--- a/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
+++ b/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
@@ -29,7 +29,7 @@
 import nemo.collections.asr as nemo_asr
 
 parser = argparse.ArgumentParser(description="CTC Segmentation")
-parser.add_argument("--output_dir", default="output", type=str, help="Path to output directory")
+parser.add_argument("--dir_", default="output", type=str, help="Path to output directory")
 parser.add_argument(
     "--data",
     type=str,
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 87d260e85f06..9efa46fd46f6 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -86,7 +86,7 @@ def export_grammars(output_dir, grammars):
 
 def parse_args():
     parser = ArgumentParser()
-    parser.add_argument("--output_dir", help="output directory for grammars", required=True, type=str)
+    parser.add_argument("--dir_", help="output directory for grammars", required=True, type=str)
     parser.add_argument(
         "--language", help="language", choices=["en", "de", "es", "ru", 'fr', 'vi'], type=str, default='en'
     )

From 0394e24ea01371091cbbdc2672fc83a4a55d5ad3 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Tue, 10 May 2022 15:32:50 +0300
Subject: [PATCH 2/5] Remove accidental changes

Signed-off-by: PeganovAnton
---
 examples/nlp/duplex_text_normalization/data/data_split.py   | 4 ++--
 .../duplex_text_normalization/data/en/data_preprocessing.py | 4 ++--
 .../create_punctuation_capitalization_tarred_dataset.py     | 4 ++--
 .../data/prepare_data_for_punctuation_capitalization.py     | 2 +-
 .../nlp/models/glue_benchmark/glue_benchmark_model.py       | 2 +-
 .../token_classification/token_classification_model.py      | 2 +-
 scripts/speech_recognition/convert_hf_dataset_to_nemo.py    | 6 +++---
 tools/ctc_segmentation/run_segmentation.sh                  | 6 +++---
 tools/ctc_segmentation/scripts/cut_audio.py                 | 2 +-
 tools/ctc_segmentation/scripts/prepare_data.py              | 2 +-
 tools/ctc_segmentation/scripts/run_ctc_segmentation.py      | 2 +-
 tools/text_processing_deployment/pynini_export.py           | 2 +-
 12 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/examples/nlp/duplex_text_normalization/data/data_split.py b/examples/nlp/duplex_text_normalization/data/data_split.py
index 69306242c4f6..b05cf6d43788 100644
--- a/examples/nlp/duplex_text_normalization/data/data_split.py
+++ b/examples/nlp/duplex_text_normalization/data/data_split.py
@@ -22,7 +22,7 @@
 3. Run this script
 # python data_split.py \
        --data_dir=en_with_types/ \
-       --dir_=data_split/ \
+       --output_dir=data_split/ \
        --lang=en
 
 In this example, the split files will be stored in the `data_split` folder.
@@ -88,7 +88,7 @@ def read_google_data(data_file: str, lang: str, split: str, add_test_full=False)
 if __name__ == '__main__':
     parser = ArgumentParser(description='Preprocess Google text normalization dataset')
     parser.add_argument('--data_dir', type=str, required=True, help='Path to folder with data')
-    parser.add_argument('--dir_', type=str, default='preprocessed', help='Path to folder with preprocessed data')
+    parser.add_argument('--output_dir', type=str, default='preprocessed', help='Path to folder with preprocessed data')
     parser.add_argument(
         '--lang', type=str, default=constants.ENGLISH, choices=constants.SUPPORTED_LANGS, help='Language'
     )
diff --git a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
index 0a1484da60d1..9523d0974db8 100644
--- a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
+++ b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
@@ -34,7 +34,7 @@
 4. Run this script on the different splits
 # python data_preprocessing.py \
        --input_path=data_split/train \
-       --dir_=train_processed \
+       --output_dir=train_processed \
        --max_integer_length=4 \
        --max_denominator_length=3
 
@@ -383,7 +383,7 @@ def main():
 
 if __name__ == "__main__":
     parser = ArgumentParser(description="Text Normalization Data Preprocessing for English")
-    parser.add_argument("--dir_", required=True, type=str, help='Path to output directory.')
+    parser.add_argument("--output_dir", required=True, type=str, help='Path to output directory.')
     parser.add_argument("--input_path", required=True, type=str, help='Path to input file or input directory.')
     parser.add_argument(
         "--max_integer_length",
diff --git a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
index a7c48a23b788..02e1e91e09bd 100644
--- a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
+++ b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
@@ -53,7 +53,7 @@
 python create_punctuation_capitalization_tarred_dataset.py \
   --text \
   --labels \
-  --dir_ \
+  --output_dir \
   --lines_per_dataset_fragment 10000 \
   --tokens_in_batch 8000 \
   --num_batches_per_tarfile 5 \
@@ -102,7 +102,7 @@ def get_args() -> argparse.Namespace:
         "file.",
     )
     parser.add_argument(
-        "--dir_",
+        "--output_dir",
         "-o",
         type=Path,
         required=True,
diff --git a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
index 9ceadd8880d9..0ff4306a9306 100644
--- a/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
+++ b/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py
@@ -87,7 +87,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Prepare data for punctuation and capitalization tasks')
     parser.add_argument("-s", "--source_file", required=True, type=str, help="Path to the source file")
-    parser.add_argument("-o", "--dir_", required=True, type=str, help="Path to the output directory")
+    parser.add_argument("-o", "--output_dir", required=True, type=str, help="Path to the output directory")
     args = parser.parse_args()
 
     if not os.path.exists(args.source_file):
diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
index 3434b0922b57..c86108e10390 100644
--- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
+++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
@@ -209,7 +209,7 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
         val_name = self._validation_names[dataloader_idx].upper()
         logging.info(f'{val_name} evaluation: {results}')
 
-        # writing labels and predictions to a file in dir_ is specified in the config
+        # writing labels and predictions to a file in output_dir is specified in the config
         output_dir = self._cfg.output_dir
         if output_dir:
             os.makedirs(output_dir, exist_ok=True)
diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py
index 3c9a74a4ace5..7b9531119fa7 100644
--- a/nemo/collections/nlp/models/token_classification/token_classification_model.py
+++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py
@@ -455,7 +455,7 @@ def evaluate_from_file(
             all_labels_str = f.readlines()
             all_labels_str = ' '.join([labels.strip() for labels in all_labels_str])
 
-        # writing labels and predictions to a file in dir_ is specified in the config
+        # writing labels and predictions to a file in output_dir is specified in the config
         os.makedirs(output_dir, exist_ok=True)
         filename = os.path.join(output_dir, 'infer_' + os.path.basename(text_file))
         try:
diff --git a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
index 71668ba30285..8445a8f11754 100644
--- a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
+++ b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
@@ -46,14 +46,14 @@
 
 1) The .arrow files for HF cache
 2) The extracted dataset in HF cache
-3) The preprocessed audio files preserved in the dir_ provided in the script.
+3) The preprocessed audio files preserved in the output_dir provided in the script.
 
 Due to this, make sure your HDD is large enough to store the processed dataset !
 
 ## Usage - Offline Mode
 
 python convert_hf_dataset_to_nemo.py \
-    dir_= \
+    output_dir= \
     path=<`path` argument in HF datasets, cannot be null> \
     name=<`name` argument in HF datasets, can be null> \
     split=<`split` argument in HF datasets, can be null> \
@@ -70,7 +70,7 @@
 
 NOTE:
     This mode is not well supported. It trades of speed for storage by only having one copy of the dataset in
-    dir_, however the speed of processing is around 10x slower than offline mode. Some datasets (such as MCV)
+    output_dir, however the speed of processing is around 10x slower than offline mode. Some datasets (such as MCV)
     fail to run entirely.
 
     DO NOT USE if you have sufficient disk space.
diff --git a/tools/ctc_segmentation/run_segmentation.sh b/tools/ctc_segmentation/run_segmentation.sh
index 7c6247719051..a87f4d1e782b 100644
--- a/tools/ctc_segmentation/run_segmentation.sh
+++ b/tools/ctc_segmentation/run_segmentation.sh
@@ -66,7 +66,7 @@ echo "TEXT AND AUDIO PREPROCESSING..."
 python $SCRIPTS_DIR/prepare_data.py \
 --in_text=$DATA_DIR/text \
 --audio_dir=$DATA_DIR/audio \
---dir_=$OUTPUT_DIR/processed/ \
+--output_dir=$OUTPUT_DIR/processed/ \
 --language=$LANGUAGE \
 --cut_prefix=$CUT_PREFIX \
 --model=$MODEL_NAME_OR_PATH \
@@ -81,7 +81,7 @@ echo "SEGMENTATION STEP..."
 for WINDOW in 8000 12000
 do
   python $SCRIPTS_DIR/run_ctc_segmentation.py \
-  --dir_=$OUTPUT_DIR \
+  --output_dir=$OUTPUT_DIR \
   --data=$OUTPUT_DIR/processed \
   --sample_rate=$SAMPLE_RATE \
   --model=$MODEL_NAME_OR_PATH \
@@ -99,7 +99,7 @@ python $SCRIPTS_DIR/verify_segments.py \
 # above the MIN_SCORE value will be saved to $OUTPUT_DIR/manifests/manifest.json
 echo "CUTTING AUDIO..."
 python $SCRIPTS_DIR/cut_audio.py \
---dir_=$OUTPUT_DIR \
+--output_dir=$OUTPUT_DIR \
 --alignment=$OUTPUT_DIR/verified_segments \
 --threshold=$MIN_SCORE \
 --offset=$OFFSET \
diff --git a/tools/ctc_segmentation/scripts/cut_audio.py b/tools/ctc_segmentation/scripts/cut_audio.py
index e40ffc8ba17c..2e571e67432c 100644
--- a/tools/ctc_segmentation/scripts/cut_audio.py
+++ b/tools/ctc_segmentation/scripts/cut_audio.py
@@ -22,7 +22,7 @@
 from tqdm import tqdm
 
 parser = argparse.ArgumentParser(description="Cut audio on the segments based on segments")
-parser.add_argument("--dir_", type=str, help="Path to output directory", required=True)
+parser.add_argument("--output_dir", type=str, help="Path to output directory", required=True)
 parser.add_argument(
     "--alignment",
     type=str,
diff --git a/tools/ctc_segmentation/scripts/prepare_data.py b/tools/ctc_segmentation/scripts/prepare_data.py
index 3e01cd4e74ec..2231c50820c3 100644
--- a/tools/ctc_segmentation/scripts/prepare_data.py
+++ b/tools/ctc_segmentation/scripts/prepare_data.py
@@ -39,7 +39,7 @@
 
 parser = argparse.ArgumentParser(description="Prepares text and audio files for segmentation")
 parser.add_argument("--in_text", type=str, default=None, help="Path to a text file or a directory with .txt files")
-parser.add_argument("--dir_", type=str, required=True, help="Path to output directory")
+parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory")
 parser.add_argument("--audio_dir", type=str, help="Path to folder with .mp3 or .wav audio files")
 parser.add_argument("--sample_rate", type=int, default=16000, help="Sampling rate used during ASR model training, Hz")
 parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
diff --git a/tools/ctc_segmentation/scripts/run_ctc_segmentation.py b/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
index b588d02540b6..772af8c5ccce 100644
--- a/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
+++ b/tools/ctc_segmentation/scripts/run_ctc_segmentation.py
@@ -29,7 +29,7 @@
 import nemo.collections.asr as nemo_asr
 
 parser = argparse.ArgumentParser(description="CTC Segmentation")
-parser.add_argument("--dir_", default="output", type=str, help="Path to output directory")
+parser.add_argument("--output_dir", default="output", type=str, help="Path to output directory")
 parser.add_argument(
     "--data",
     type=str,
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 9efa46fd46f6..87d260e85f06 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -86,7 +86,7 @@ def export_grammars(output_dir, grammars):
 
 def parse_args():
     parser = ArgumentParser()
-    parser.add_argument("--dir_", help="output directory for grammars", required=True, type=str)
+    parser.add_argument("--output_dir", help="output directory for grammars", required=True, type=str)
     parser.add_argument(
         "--language", help="language", choices=["en", "de", "es", "ru", 'fr', 'vi'], type=str, default='en'
     )

From 2f9fba7a744edf2b2a266bb2d0995e8118c72da0 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Tue, 10 May 2022 15:33:45 +0300
Subject: [PATCH 3/5] Remove accidental changes

Signed-off-by: PeganovAnton
---
 .../data/create_punctuation_capitalization_tarred_dataset.py | 2 +-
 tools/ctc_segmentation/scripts/cut_audio.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
index 02e1e91e09bd..d42c4d37121c 100644
--- a/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
+++ b/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
@@ -245,7 +245,7 @@ def get_args() -> argparse.Namespace:
     for name in [
         "text",
         "labels",
-        "dir_",
+        "output_dir",
         "tokenizer_model",
         "vocab_file",
         "merges_file",
diff --git a/tools/ctc_segmentation/scripts/cut_audio.py b/tools/ctc_segmentation/scripts/cut_audio.py
index 2e571e67432c..5698733c4803 100644
--- a/tools/ctc_segmentation/scripts/cut_audio.py
+++ b/tools/ctc_segmentation/scripts/cut_audio.py
@@ -72,7 +72,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
         line = line[0].split()
         segments.append((float(line[0]) + args.offset / 1000, float(line[1]) + args.offset / 1000, float(line[2])))
 
-    # cut the audio into segments and save the final manifests at dir_
+    # cut the audio into segments and save the final manifests at output_dir
     sampling_rate, signal = wavfile.read(audio_file)
    original_duration = len(signal) / sampling_rate
 

From 912b511b639181124202e63e28a1416be11ce61f Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Tue, 10 May 2022 15:58:26 +0300
Subject: [PATCH 4/5] Use built-in tempfile library

Signed-off-by: PeganovAnton
---
 ...nctuation_capitalization_tarred_dataset.py | 82 +++++++++----------
 1 file changed, 37 insertions(+), 45 deletions(-)

diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
index 7430ae801cd4..e17e13f9966d 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
@@ -19,6 +19,7 @@
 import pickle
 import re
 import shutil
+import tempfile
 from collections import deque
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Type, Union
@@ -127,21 +128,6 @@ def get_fragment_start_bytes(
     return num_lines, text_start_bytes, label_start_bytes
 
 
-def select_unique_file_name(dir_: Union[os.PathLike, str], file_name: Union[os.PathLike, str]) -> Path:
-    dir_ = Path(dir_)
-    file_name = str(file_name)
-    fn = dir_ / file_name
-    if fn.exists():
-        index = 2
-        parts = file_name.split('.')
-        stem, suffixes = parts[0], parts[1:]
-        fn = dir_ / '.'.join([stem + f'_{index}'] + suffixes)
-        while fn.exists():
-            index += 1
-            fn = dir_ / '.'.join([stem + f'_{index}'] + suffixes)
-    return fn
-
-
 def process_fragment(
     text_file: Path,
     labels_file: Path,
@@ -175,36 +161,42 @@ def process_fragment(
         special_tokens=special_tokens,
         use_fast=use_fast_tokenizer,
     )
-    tmp_text = select_unique_file_name(output_dir, f'tmp_text_{fragment_idx}.txt')
-    tmp_labels = select_unique_file_name(output_dir, f'tmp_labels_{fragment_idx}.txt')
-    with text_file.open() as tf, labels_file.open() as lf, tmp_text.open('w') as otf, tmp_labels.open('w') as olf:
-        tf.seek(text_start_pos)
-        lf.seek(label_start_pos)
-        for _ in range(lines_per_dataset_fragment):
-            text_line = tf.readline()
-            if not text_line:
-                break
-            otf.write(text_line)
-            olf.write(lf.readline())
-    dataset = BertPunctuationCapitalizationDataset(
-        tmp_text,
-        tmp_labels,
-        max_seq_length,
-        tokenizer,
-        tokens_in_batch=tokens_in_batch,
-        pad_label=pad_label,
-        punct_label_ids=punct_label_ids,
-        capit_label_ids=capit_label_ids,
-        n_jobs=0,
-        use_cache=False,
-        add_masks_and_segment_ids_to_batch=False,
-        verbose=False,
-        tokenization_progress_queue=tokenization_progress_queue,
-        batch_mark_up_progress_queue=batch_mark_up_progress_queue,
-        batch_building_progress_queue=batch_building_progress_queue,
-    )
-    tmp_text.unlink()
-    tmp_labels.unlink()
+    tmp_text: Optional[str] = None
+    tmp_labels: Optional[str] = None
+    try:
+        otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
+        olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
+        with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen(olfd, 'w') as olf:
+            tf.seek(text_start_pos)
+            lf.seek(label_start_pos)
+            for _ in range(lines_per_dataset_fragment):
+                text_line = tf.readline()
+                if not text_line:
+                    break
+                otf.write(text_line)
+                olf.write(lf.readline())
+        dataset = BertPunctuationCapitalizationDataset(
+            tmp_text,
+            tmp_labels,
+            max_seq_length,
+            tokenizer,
+            tokens_in_batch=tokens_in_batch,
+            pad_label=pad_label,
+            punct_label_ids=punct_label_ids,
+            capit_label_ids=capit_label_ids,
+            n_jobs=0,
+            use_cache=False,
+            add_masks_and_segment_ids_to_batch=False,
+            verbose=False,
+            tokenization_progress_queue=tokenization_progress_queue,
+            batch_mark_up_progress_queue=batch_mark_up_progress_queue,
+            batch_building_progress_queue=batch_building_progress_queue,
+        )
+    finally:
+        if tmp_text is not None and os.path.exists(tmp_text):
+            os.remove(tmp_text)
+        if tmp_labels is not None and os.path.exists(tmp_labels):
+            os.remove(tmp_labels)
     dataset.features_pkl.unlink()
     tar_ctr = 0
     current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format(fragment_idx=fragment_idx, file_idx=tar_ctr)

From a85a1a04c7479c576ea83409958f0c113c1f7134 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Tue, 10 May 2022 16:18:59 +0300
Subject: [PATCH 5/5] Fix code style

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_tarred_dataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
index e17e13f9966d..2bfcb7969b6e 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_tarred_dataset.py
@@ -166,7 +166,9 @@ def process_fragment(
     try:
         otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
         olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
-        with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen(olfd, 'w') as olf:
+        with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen(
+            olfd, 'w'
+        ) as olf:
             tf.seek(text_start_pos)
             lf.seek(label_start_pos)
             for _ in range(lines_per_dataset_fragment):
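
The final shape of the fix (patches 4 and 5) drops the hand-rolled select_unique_file_name helper in favour of Python's built-in tempfile module, which creates uniquely named files atomically, so concurrent workers cannot collide even when they write into the same directory. The snippet below is a minimal standalone sketch of that pattern, not the NeMo code itself; the function name, its arguments, and the write loop are simplified placeholders for illustration.

import os
import tempfile
from pathlib import Path


def write_fragment(output_dir: Path, fragment_idx: int, lines: list) -> None:
    """Sketch of the tempfile pattern adopted in patches 4-5; names here are illustrative only."""
    tmp_text = None
    try:
        # mkstemp atomically creates a uniquely named file and returns an open descriptor,
        # so two workers can never end up writing to the same temporary file.
        fd, tmp_text = tempfile.mkstemp(
            suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True
        )
        with os.fdopen(fd, 'w') as f:  # wrap the raw descriptor in a regular file object
            for line in lines:
                f.write(line)
        # ... a dataset would be built from tmp_text here ...
    finally:
        # Remove the temporary file even if the processing step raises.
        if tmp_text is not None and os.path.exists(tmp_text):
            os.remove(tmp_text)


if __name__ == '__main__':
    write_fragment(Path('.'), 0, ['hello\n', 'world\n'])

Compared with the first draft's name-probing loop, this avoids the check-then-create race entirely, because the operating system guarantees the name is unique at creation time.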