From bac2ccc85be128514bd7596b37d0d7cc64f12e3a Mon Sep 17 00:00:00 2001 From: nithinraok Date: Fri, 6 May 2022 17:16:54 -0700 Subject: [PATCH 1/4] initial commit Signed-off-by: nithinraok --- nemo/collections/asr/models/label_models.py | 1 - scripts/speaker_tasks/scp_to_manifest.py | 91 ++++++++------------- 2 files changed, 35 insertions(+), 57 deletions(-) diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index 83f0572c6afd..9c3a468bbdff 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -251,7 +251,6 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: "embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), } - @typecheck() def forward_for_export(self, processed_signal, processed_signal_len): encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len) logits, embs = self.decoder(encoder_output=encoded, length=length) diff --git a/scripts/speaker_tasks/scp_to_manifest.py b/scripts/speaker_tasks/scp_to_manifest.py index c146b13e1742..fd3941461a97 100644 --- a/scripts/speaker_tasks/scp_to_manifest.py +++ b/scripts/speaker_tasks/scp_to_manifest.py @@ -14,7 +14,6 @@ import argparse import json -import multiprocessing import os import random @@ -31,23 +30,28 @@ This scipt converts a scp file where each line contains to a manifest json file. +Optionally post processes the manifest file to create dev and train split for speaker embedding +training, also optionally chunk an audio file in to segments of random DURATIONS and create those +wav files in CWD. + +While creating chunks, if audio is not sampled at 16Khz, it resamples to 16Khz and write the wav file. 
Args: --scp: scp file name --manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring scp --id: index of speaker label in filename present in scp file that is separated by '/' --out: output manifest file name ---split: True / False if you would want to split the manifest file for training purposes +--split: if you would want to split the manifest file for training purposes you may not need this for test set. output file names is _.json Defaults to False ---create_chunks: bool if you would want to chunk each manifest line to chunks of 3 sec or less +--create_chunks:if you would want to chunk each manifest line to chunks of 4 sec or less you may not need this for test set, Defaults to False ---write_chunks: writes chunked files based on offset to {current working directory}/chunks/{label}/{original_file_name}_{offset}_{duration}.wav --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise """ -DURATIONS = [1.5, 2, 3] +DURATIONS = [1.5, 2, 3, 4] MIN_ENERGY = 0.01 -CWD = './' +CWD = os.getcwd() +SAMPLE_RATE = 16000 def filter_manifest_line(manifest_line): @@ -55,19 +59,35 @@ def filter_manifest_line(manifest_line): audio_path = manifest_line['audio_filepath'] start = manifest_line.get('offset', 0) dur = manifest_line['duration'] + label = manifest_line['label'] + endname = os.path.splitext(audio_path.split(label, 1)[-1])[0] + to_path = os.path.join(CWD, 'chunks', label) + to_path = os.path.join(to_path, endname[1:]) + os.makedirs(os.path.dirname(to_path), exist_ok=True) if dur >= min(DURATIONS): - signal, sr = l.load(audio_path, sr=None) + signal, sr = l.load(audio_path, sr=SAMPLE_RATE) remaining_dur = dur temp_dur = random.choice(DURATIONS) remaining_dur = remaining_dur - temp_dur while remaining_dur >= 0: segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)] if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY: + final_string = '_' 
+ str(start) + '_' + str(temp_dur) + final_string = final_string.replace('.', '-') + to_file = to_path + final_string + '.wav' + + start = int(float(start * sr)) + end = start + int(float(temp_dur * sr)) + chunk = signal[start:end] + sf.write(to_file, chunk, sr) + meta = manifest_line.copy() - meta['offset'] = start + meta['audio_filepath'] = to_file + meta['offset'] = 0 meta['duration'] = temp_dur split_manifest.append(meta) + start = start + temp_dur temp_dur = random.choice(DURATIONS) remaining_dur = remaining_dur - temp_dur @@ -146,41 +166,21 @@ def get_labels(lines): return labels -def write_audio_file(line): - filename = line['audio_filepath'] - label = line['label'] - offset = line['offset'] - duration = line['duration'] - basename = os.path.basename(filename).replace('.wav', '') - to_path = os.path.join(CWD, 'chunks', label) - os.makedirs(to_path, exist_ok=True) - to_path = os.path.join(to_path, basename) - final_string = '_' + str(offset) + '_' + str(duration) - final_string = final_string.replace('.', '-') - samples, sr = sf.read(filename) - start = int(float(offset * sr)) - end = start + int(float(duration * sr)) - chunk = samples[start:end] - to_file = to_path + final_string + '.wav' - sf.write(to_file, chunk, sr) - - line['offset'] = 0 - line['audio_filepath'] = to_file - return line - - -def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=False, min_count=10, workers=4): +def main(scp, manifest, id, out, split=False, create_chunks=False, min_count=10): if os.path.exists(out): os.remove(out) if scp: lines = read_file(scp_file=scp, id=id) + lines = process_map(get_duration, lines, chunksize=100) + out_file = os.path.splitext(scp)[0] + '_manifest.json' + write_file(out_file, lines, range(len(lines))) else: lines = read_manifest(manifest) lines = process_map(get_duration, lines, chunksize=100) if create_chunks: - print("creating chunk") + print("creating and writing chunks to {}".format(CWD)) lines = 
process_map(filter_manifest_line, lines, chunksize=100) temp = [] for line in lines: @@ -188,10 +188,6 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= del lines lines = temp - if create_chunks and write_chunks: - print("writing chunks created before as new wav files") - lines = process_map(write_audio_file, lines, chunksize=100) - speakers = [x['label'] for x in lines] if min_count: @@ -231,13 +227,7 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= ) parser.add_argument( "--create_chunks", - help="bool if you would want to chunk each manifest line to chunks of 3 sec or less", - required=False, - action='store_true', - ) - parser.add_argument( - "--write_chunks", - help="bool if you would want to write the chunks created with --create_chunk to CWD ", + help="bool if you would want to chunk each manifest line to chunks of 4 sec or less", required=False, action='store_true', ) @@ -247,20 +237,9 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= type=int, help="min number of samples per speaker to consider and ignore otherwise", ) - parser.add_argument( - "--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset." 
- ) args = parser.parse_args() main( - args.scp, - args.manifest, - args.id, - args.out, - args.split, - args.create_chunks, - args.write_chunks, - args.min_spkrs_count, - args.num_workers, + args.scp, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count, ) From 37601bb2a3d2b5bd85e8dd5dec4007c38d1a3f3a Mon Sep 17 00:00:00 2001 From: nithinraok Date: Fri, 6 May 2022 17:49:59 -0700 Subject: [PATCH 2/4] start fix Signed-off-by: nithinraok --- scripts/speaker_tasks/scp_to_manifest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/speaker_tasks/scp_to_manifest.py b/scripts/speaker_tasks/scp_to_manifest.py index fd3941461a97..faef0604fd30 100644 --- a/scripts/speaker_tasks/scp_to_manifest.py +++ b/scripts/speaker_tasks/scp_to_manifest.py @@ -48,7 +48,7 @@ --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise """ -DURATIONS = [1.5, 2, 3, 4] +DURATIONS = [1, 2, 3, 4] MIN_ENERGY = 0.01 CWD = os.getcwd() SAMPLE_RATE = 16000 @@ -77,9 +77,9 @@ def filter_manifest_line(manifest_line): final_string = final_string.replace('.', '-') to_file = to_path + final_string + '.wav' - start = int(float(start * sr)) - end = start + int(float(temp_dur * sr)) - chunk = signal[start:end] + c_start = int(float(start * sr)) + c_end = c_start + int(float(temp_dur * sr)) + chunk = signal[c_start:c_end] sf.write(to_file, chunk, sr) meta = manifest_line.copy() From 544f18c42a890e0af6b284cdc691554893ddb87b Mon Sep 17 00:00:00 2001 From: nithinraok Date: Fri, 6 May 2022 19:31:27 -0700 Subject: [PATCH 3/4] improve multiprocessing speed while creating speaker dataset Signed-off-by: nithinraok --- scripts/speaker_tasks/scp_to_manifest.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/speaker_tasks/scp_to_manifest.py b/scripts/speaker_tasks/scp_to_manifest.py index faef0604fd30..bd0b8589f119 100644 --- a/scripts/speaker_tasks/scp_to_manifest.py +++ 
b/scripts/speaker_tasks/scp_to_manifest.py @@ -48,7 +48,7 @@ --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise """ -DURATIONS = [1, 2, 3, 4] +DURATIONS = sorted([1, 2, 3, 4], reverse=True) MIN_ENERGY = 0.01 CWD = os.getcwd() SAMPLE_RATE = 16000 @@ -67,10 +67,19 @@ def filter_manifest_line(manifest_line): if dur >= min(DURATIONS): signal, sr = l.load(audio_path, sr=SAMPLE_RATE) - remaining_dur = dur - temp_dur = random.choice(DURATIONS) - remaining_dur = remaining_dur - temp_dur - while remaining_dur >= 0: + remaining_dur = dur - start + + segments = DURATIONS.copy() + mode = int(remaining_dur // sum(DURATIONS)) + rem = remaining_dur % sum(DURATIONS) + segments = mode * segments + + for val in DURATIONS: + if rem >= val: + segments.append(val) + rem = rem - val + + for temp_dur in segments: segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)] if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY: final_string = '_' + str(start) + '_' + str(temp_dur) @@ -89,8 +98,6 @@ def filter_manifest_line(manifest_line): split_manifest.append(meta) start = start + temp_dur - temp_dur = random.choice(DURATIONS) - remaining_dur = remaining_dur - temp_dur return split_manifest From 2006a80366ebbef0046c26791ccf02b002f0c118 Mon Sep 17 00:00:00 2001 From: nithinraok Date: Tue, 10 May 2022 15:51:18 -0700 Subject: [PATCH 4/4] updated scp to filelist Signed-off-by: nithinraok --- ...to_manifest.py => filelist_to_manifest.py} | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) rename scripts/speaker_tasks/{scp_to_manifest.py => filelist_to_manifest.py} (89%) diff --git a/scripts/speaker_tasks/scp_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py similarity index 89% rename from scripts/speaker_tasks/scp_to_manifest.py rename to scripts/speaker_tasks/filelist_to_manifest.py index bd0b8589f119..18ad6579a551 100644 --- a/scripts/speaker_tasks/scp_to_manifest.py +++ 
b/scripts/speaker_tasks/filelist_to_manifest.py @@ -27,18 +27,17 @@ random.seed(42) """ -This scipt converts a scp file where each line contains - -to a manifest json file. +This script converts a filelist file where each line contains +<path to an audio file> to a manifest json file. Optionally post processes the manifest file to create dev and train split for speaker embedding training, also optionally chunk an audio file in to segments of random DURATIONS and create those wav files in CWD. While creating chunks, if audio is not sampled at 16Khz, it resamples to 16Khz and write the wav file. Args: ---scp: scp file name ---manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring scp ---id: index of speaker label in filename present in scp file that is separated by '/' +--filelist: path to file containing list of audio files +--manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring filelist +--id: index of speaker label in filename present in filelist file that is separated by '/' --out: output manifest file name --split: if you would want to split the manifest file for training purposes you may not need this for test set. 
output file names is _.json @@ -133,9 +132,9 @@ def write_file(name, lines, idx): print("wrote", name) -def read_file(scp_file, id=-1): +def read_file(filelist, id=-1): json_lines = [] - with open(scp_file, 'r') as fo: + with open(filelist, 'r') as fo: lines = fo.readlines() lines = sorted(lines) for line in lines: @@ -173,13 +172,13 @@ def get_labels(lines): return labels -def main(scp, manifest, id, out, split=False, create_chunks=False, min_count=10): +def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10): if os.path.exists(out): os.remove(out) - if scp: - lines = read_file(scp_file=scp, id=id) + if filelist: + lines = read_file(filelist=filelist, id=id) lines = process_map(get_duration, lines, chunksize=100) - out_file = os.path.splitext(scp)[0] + '_manifest.json' + out_file = os.path.splitext(filelist)[0] + '_manifest.json' write_file(out_file, lines, range(len(lines))) else: lines = read_manifest(manifest) @@ -187,7 +186,7 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, min_count=10) lines = process_map(get_duration, lines, chunksize=100) if create_chunks: - print("creating and writing chunks to {}".format(CWD)) + print(f"creating and writing chunks to {CWD}") lines = process_map(filter_manifest_line, lines, chunksize=100) temp = [] for line in lines: @@ -216,11 +215,11 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, min_count=10) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--scp", help="scp file name", type=str, required=False, default=None) + parser.add_argument("--filelist", help="path to filelist file", type=str, required=False, default=None) parser.add_argument("--manifest", help="manifest file name", type=str, required=False, default=None) parser.add_argument( "--id", - help="field num seperated by '/' to be considered as speaker label from scp file, can be ignored if manifest file is already provided with labels", + help="field num 
separated by '/' to be considered as speaker label from filelist file, can be ignored if manifest file is already provided with labels", type=int, required=False, default=None, @@ -248,5 +247,5 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, min_count=10) args = parser.parse_args() main( - args.scp, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count, + args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count, )