diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index 83f0572c6afd..9c3a468bbdff 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -251,7 +251,6 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: "embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), } - @typecheck() def forward_for_export(self, processed_signal, processed_signal_len): encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len) logits, embs = self.decoder(encoder_output=encoded, length=length) diff --git a/scripts/speaker_tasks/scp_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py similarity index 67% rename from scripts/speaker_tasks/scp_to_manifest.py rename to scripts/speaker_tasks/filelist_to_manifest.py index c146b13e1742..18ad6579a551 100644 --- a/scripts/speaker_tasks/scp_to_manifest.py +++ b/scripts/speaker_tasks/filelist_to_manifest.py @@ -14,7 +14,6 @@ import argparse import json -import multiprocessing import os import random @@ -28,26 +27,30 @@ random.seed(42) """ -This scipt converts a scp file where each line contains -<absolute path of wav file> -to a manifest json file. +This script converts a filelist file where each line contains +<absolute path of wav file> to a manifest json file. +Optionally post-processes the manifest file to create dev and train splits for speaker embedding +training, and optionally chunks each audio file into segments whose lengths are taken from DURATIONS and creates those +wav files under CWD. + +While creating chunks, if the audio is not sampled at 16 kHz, it is resampled to 16 kHz before the wav file is written. 
Args: ---scp: scp file name ---manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring scp ---id: index of speaker label in filename present in scp file that is separated by '/' +--filelist: path to file containing list of audio files +--manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring filelist +--id: index of speaker label in filename present in filelist file that is separated by '/' --out: output manifest file name ---split: True / False if you would want to split the manifest file for training purposes +--split: if you would want to split the manifest file for training purposes you may not need this for test set. output file names is <out>_<train/dev>.json Defaults to False ---create_chunks: bool if you would want to chunk each manifest line to chunks of 3 sec or less +--create_chunks: if you would want to chunk each manifest line to chunks of 4 sec or less you may not need this for test set, Defaults to False ---write_chunks: writes chunked files based on offset to {current working directory}/chunks/{label}/{original_file_name}_{offset}_{duration}.wav --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise """ -DURATIONS = [1.5, 2, 3] +DURATIONS = sorted([1, 2, 3, 4], reverse=True) MIN_ENERGY = 0.01 -CWD = './' +CWD = os.getcwd() +SAMPLE_RATE = 16000 def filter_manifest_line(manifest_line): @@ -55,22 +58,45 @@ def filter_manifest_line(manifest_line): audio_path = manifest_line['audio_filepath'] start = manifest_line.get('offset', 0) dur = manifest_line['duration'] + label = manifest_line['label'] + endname = os.path.splitext(audio_path.split(label, 1)[-1])[0] + to_path = os.path.join(CWD, 'chunks', label) + to_path = os.path.join(to_path, endname[1:]) + os.makedirs(os.path.dirname(to_path), exist_ok=True) if dur >= min(DURATIONS): - signal, sr = l.load(audio_path, 
sr=None) - remaining_dur = dur - temp_dur = random.choice(DURATIONS) - remaining_dur = remaining_dur - temp_dur - while remaining_dur >= 0: + signal, sr = l.load(audio_path, sr=SAMPLE_RATE) + remaining_dur = dur - start + + segments = DURATIONS.copy() + mode = int(remaining_dur // sum(DURATIONS)) + rem = remaining_dur % sum(DURATIONS) + segments = mode * segments + + for val in DURATIONS: + if rem >= val: + segments.append(val) + rem = rem - val + + for temp_dur in segments: segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)] if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY: + final_string = '_' + str(start) + '_' + str(temp_dur) + final_string = final_string.replace('.', '-') + to_file = to_path + final_string + '.wav' + + c_start = int(float(start * sr)) + c_end = c_start + int(float(temp_dur * sr)) + chunk = signal[c_start:c_end] + sf.write(to_file, chunk, sr) + meta = manifest_line.copy() - meta['offset'] = start + meta['audio_filepath'] = to_file + meta['offset'] = 0 meta['duration'] = temp_dur split_manifest.append(meta) + start = start + temp_dur - temp_dur = random.choice(DURATIONS) - remaining_dur = remaining_dur - temp_dur return split_manifest @@ -106,9 +132,9 @@ def write_file(name, lines, idx): print("wrote", name) -def read_file(scp_file, id=-1): +def read_file(filelist, id=-1): json_lines = [] - with open(scp_file, 'r') as fo: + with open(filelist, 'r') as fo: lines = fo.readlines() lines = sorted(lines) for line in lines: @@ -146,41 +172,21 @@ def get_labels(lines): return labels -def write_audio_file(line): - filename = line['audio_filepath'] - label = line['label'] - offset = line['offset'] - duration = line['duration'] - basename = os.path.basename(filename).replace('.wav', '') - to_path = os.path.join(CWD, 'chunks', label) - os.makedirs(to_path, exist_ok=True) - to_path = os.path.join(to_path, basename) - final_string = '_' + str(offset) + '_' + str(duration) - final_string = final_string.replace('.', '-') - samples, 
sr = sf.read(filename) - start = int(float(offset * sr)) - end = start + int(float(duration * sr)) - chunk = samples[start:end] - to_file = to_path + final_string + '.wav' - sf.write(to_file, chunk, sr) - - line['offset'] = 0 - line['audio_filepath'] = to_file - return line - - -def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=False, min_count=10, workers=4): +def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10): if os.path.exists(out): os.remove(out) - if scp: - lines = read_file(scp_file=scp, id=id) + if filelist: + lines = read_file(filelist=filelist, id=id) + lines = process_map(get_duration, lines, chunksize=100) + out_file = os.path.splitext(filelist)[0] + '_manifest.json' + write_file(out_file, lines, range(len(lines))) else: lines = read_manifest(manifest) lines = process_map(get_duration, lines, chunksize=100) if create_chunks: - print("creating chunk") + print(f"creating and writing chunks to {CWD}") lines = process_map(filter_manifest_line, lines, chunksize=100) temp = [] for line in lines: @@ -188,10 +194,6 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= del lines lines = temp - if create_chunks and write_chunks: - print("writing chunks created before as new wav files") - lines = process_map(write_audio_file, lines, chunksize=100) - speakers = [x['label'] for x in lines] if min_count: @@ -213,11 +215,11 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--scp", help="scp file name", type=str, required=False, default=None) + parser.add_argument("--filelist", help="path to filelist file", type=str, required=False, default=None) parser.add_argument("--manifest", help="manifest file name", type=str, required=False, default=None) parser.add_argument( "--id", - help="field num seperated by '/' to be considered as speaker label from scp file, can 
be ignored if manifest file is already provided with labels", + help="field num seperated by '/' to be considered as speaker label from filelist file, can be ignored if manifest file is already provided with labels", type=int, required=False, default=None, @@ -231,13 +233,7 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= ) parser.add_argument( "--create_chunks", - help="bool if you would want to chunk each manifest line to chunks of 3 sec or less", - required=False, - action='store_true', - ) - parser.add_argument( - "--write_chunks", - help="bool if you would want to write the chunks created with --create_chunk to CWD ", + help="bool if you would want to chunk each manifest line to chunks of 4 sec or less", required=False, action='store_true', ) @@ -247,20 +243,9 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks= type=int, help="min number of samples per speaker to consider and ignore otherwise", ) - parser.add_argument( - "--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset." - ) args = parser.parse_args() main( - args.scp, - args.manifest, - args.id, - args.out, - args.split, - args.create_chunks, - args.write_chunks, - args.min_spkrs_count, - args.num_workers, + args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count, )