Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion nemo/collections/asr/models/label_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
"embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()),
}

@typecheck()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can just replace the last two lines of your forward() with a call to forward_for_export() there. Do that in a later PR.

def forward_for_export(self, processed_signal, processed_signal_len):
encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len)
logits, embs = self.decoder(encoder_output=encoded, length=length)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import argparse
import json
import multiprocessing
import os
import random

Expand All @@ -28,49 +27,76 @@
random.seed(42)

"""
This script converts an scp file where each line contains
<absolute path of wav file>
to a manifest json file.
This script converts a filelist file where each line contains
<absolute path of wav file> to a manifest json file.
Optionally post-processes the manifest file to create dev and train splits for speaker embedding
training, and optionally chunks each audio file into segments of random DURATIONS and creates those
wav files in CWD.

While creating chunks, if the audio is not sampled at 16 kHz, it is resampled to 16 kHz before the wav file is written.
Args:
--scp: scp file name
--manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring scp
--id: index of speaker label in filename present in scp file that is separated by '/'
--filelist: path to file containing list of audio files
--manifest (optional): if you already have a manifest file but would like to process it for creating chunks and splitting, pass --manifest and omit --filelist
--id: index of speaker label in filename present in filelist file that is separated by '/'
--out: output manifest file name
--split: True / False if you would want to split the manifest file for training purposes
--split: set this if you want to split the manifest file for training purposes;
you may not need this for a test set. Output file names are <out>_<train/dev>.json
Defaults to False
--create_chunks: bool if you would want to chunk each manifest line to chunks of 3 sec or less
--create_chunks: set this if you want to chunk each manifest line into chunks of 4 sec or less
you may not need this for test set, Defaults to False
--write_chunks: writes chunked files based on offset to {current working directory}/chunks/{label}/{original_file_name}_{offset}_{duration}.wav
--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise
"""

DURATIONS = [1.5, 2, 3]
DURATIONS = sorted([1, 2, 3, 4], reverse=True)
MIN_ENERGY = 0.01
CWD = './'
CWD = os.getcwd()
SAMPLE_RATE = 16000


def filter_manifest_line(manifest_line):
split_manifest = []
audio_path = manifest_line['audio_filepath']
start = manifest_line.get('offset', 0)
dur = manifest_line['duration']
label = manifest_line['label']
endname = os.path.splitext(audio_path.split(label, 1)[-1])[0]
to_path = os.path.join(CWD, 'chunks', label)
to_path = os.path.join(to_path, endname[1:])
os.makedirs(os.path.dirname(to_path), exist_ok=True)

if dur >= min(DURATIONS):
signal, sr = l.load(audio_path, sr=None)
remaining_dur = dur
temp_dur = random.choice(DURATIONS)
remaining_dur = remaining_dur - temp_dur
while remaining_dur >= 0:
signal, sr = l.load(audio_path, sr=SAMPLE_RATE)
remaining_dur = dur - start

segments = DURATIONS.copy()
mode = int(remaining_dur // sum(DURATIONS))
rem = remaining_dur % sum(DURATIONS)
segments = mode * segments

for val in DURATIONS:
if rem >= val:
segments.append(val)
rem = rem - val

for temp_dur in segments:
segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)]
if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY:
final_string = '_' + str(start) + '_' + str(temp_dur)
final_string = final_string.replace('.', '-')
to_file = to_path + final_string + '.wav'

c_start = int(float(start * sr))
c_end = c_start + int(float(temp_dur * sr))
chunk = signal[c_start:c_end]
sf.write(to_file, chunk, sr)

meta = manifest_line.copy()
meta['offset'] = start
meta['audio_filepath'] = to_file
meta['offset'] = 0
meta['duration'] = temp_dur
split_manifest.append(meta)

start = start + temp_dur
temp_dur = random.choice(DURATIONS)
remaining_dur = remaining_dur - temp_dur

return split_manifest

Expand Down Expand Up @@ -106,9 +132,9 @@ def write_file(name, lines, idx):
print("wrote", name)


def read_file(scp_file, id=-1):
def read_file(filelist, id=-1):
json_lines = []
with open(scp_file, 'r') as fo:
with open(filelist, 'r') as fo:
lines = fo.readlines()
lines = sorted(lines)
for line in lines:
Expand Down Expand Up @@ -146,52 +172,28 @@ def get_labels(lines):
return labels


def write_audio_file(line):
filename = line['audio_filepath']
label = line['label']
offset = line['offset']
duration = line['duration']
basename = os.path.basename(filename).replace('.wav', '')
to_path = os.path.join(CWD, 'chunks', label)
os.makedirs(to_path, exist_ok=True)
to_path = os.path.join(to_path, basename)
final_string = '_' + str(offset) + '_' + str(duration)
final_string = final_string.replace('.', '-')
samples, sr = sf.read(filename)
start = int(float(offset * sr))
end = start + int(float(duration * sr))
chunk = samples[start:end]
to_file = to_path + final_string + '.wav'
sf.write(to_file, chunk, sr)

line['offset'] = 0
line['audio_filepath'] = to_file
return line


def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=False, min_count=10, workers=4):
def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10):
if os.path.exists(out):
os.remove(out)
if scp:
lines = read_file(scp_file=scp, id=id)
if filelist:
lines = read_file(filelist=filelist, id=id)
lines = process_map(get_duration, lines, chunksize=100)
out_file = os.path.splitext(filelist)[0] + '_manifest.json'
write_file(out_file, lines, range(len(lines)))
else:
lines = read_manifest(manifest)

lines = process_map(get_duration, lines, chunksize=100)

if create_chunks:
print("creating chunk")
print(f"creating and writing chunks to {CWD}")
lines = process_map(filter_manifest_line, lines, chunksize=100)
temp = []
for line in lines:
temp.extend(line)
del lines
lines = temp

if create_chunks and write_chunks:
print("writing chunks created before as new wav files")
lines = process_map(write_audio_file, lines, chunksize=100)

speakers = [x['label'] for x in lines]

if min_count:
Expand All @@ -213,11 +215,11 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--scp", help="scp file name", type=str, required=False, default=None)
parser.add_argument("--filelist", help="path to filelist file", type=str, required=False, default=None)
parser.add_argument("--manifest", help="manifest file name", type=str, required=False, default=None)
parser.add_argument(
"--id",
help="field num seperated by '/' to be considered as speaker label from scp file, can be ignored if manifest file is already provided with labels",
help="field num seperated by '/' to be considered as speaker label from filelist file, can be ignored if manifest file is already provided with labels",
type=int,
required=False,
default=None,
Expand All @@ -231,13 +233,7 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=
)
parser.add_argument(
"--create_chunks",
help="bool if you would want to chunk each manifest line to chunks of 3 sec or less",
required=False,
action='store_true',
)
parser.add_argument(
"--write_chunks",
help="bool if you would want to write the chunks created with --create_chunk to CWD ",
help="bool if you would want to chunk each manifest line to chunks of 4 sec or less",
required=False,
action='store_true',
)
Expand All @@ -247,20 +243,9 @@ def main(scp, manifest, id, out, split=False, create_chunks=False, write_chunks=
type=int,
help="min number of samples per speaker to consider and ignore otherwise",
)
parser.add_argument(
"--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset."
)

args = parser.parse_args()

main(
args.scp,
args.manifest,
args.id,
args.out,
args.split,
args.create_chunks,
args.write_chunks,
args.min_spkrs_count,
args.num_workers,
args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count,
)