Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 20 additions & 11 deletions docs/src/sdp/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,22 +99,22 @@ UzbekVoice
Earnings21/22
'''''''''''''

.. autodata:: sdp.processors.datasets.earnings21.CreateInitialAudioAndManifest
.. autodata:: sdp.processors.datasets.earnings.CreateInitialAudioAndManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings21.CreateFullAudioManifestEarnings21
.. autodata:: sdp.processors.datasets.earnings.CreateFullAudioManifestEarnings21
:annotation:

.. autodata:: sdp.processors.datasets.earnings21.SpeakerSegmentedManifest
.. autodata:: sdp.processors.datasets.earnings.SpeakerSegmentedManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings21.CreateSentenceSegmentedManifest
.. autodata:: sdp.processors.datasets.earnings.CreateSentenceSegmentedManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings21.NeMoForcedAligner
.. autodata:: sdp.processors.datasets.earnings.NeMoForcedAligner
:annotation:

.. autodata:: sdp.processors.datasets.earnings21.ApplyEarnings21Normalizations
.. autodata:: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
:annotation:


Expand Down Expand Up @@ -278,13 +278,25 @@ ASR-based processors
Data modifications
''''''''''''''''''

.. autodata:: sdp.processors.InsIfASRInsertion
:annotation:

.. autodata:: sdp.processors.SubIfASRSubstitution
:annotation:

Files management
''''''''''''''''

.. autodata:: sdp.processors.SoxConvert
:annotation:

.. autodata:: sdp.processors.InsIfASRInsertion
.. autodata:: sdp.processors.FfmpegConvert
:annotation:

.. autodata:: sdp.processors.SubIfASRSubstitution
.. autodata:: sdp.processors.ExtractTar
:annotation:

.. autodata:: sdp.processors.RemoveFiles
:annotation:

Data filtering
Expand Down Expand Up @@ -379,9 +391,6 @@ Miscellaneous
.. autodata:: sdp.processors.GetAudioDuration
:annotation:

.. autodata:: sdp.processors.FfmpegConvert
:annotation:

.. autodata:: sdp.processors.CreateInitialManifestByExt
:annotation:

Expand Down
16 changes: 15 additions & 1 deletion docs/src/sdp/existing_configs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -424,4 +424,18 @@ NemoRunIPL
:hidden:

config-docs/ipl/config
config-docs/ipl/nemo_run_config
config-docs/ipl/nemo_run_config

Earnings21/22
~~~~~~~~~~~~~

**Supported configs**.

* **English**:
`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/earnings/config.yaml>`__ |
:doc:`documentation <config-docs/english/earnings/config>`

.. toctree::
:hidden:

config-docs/english/earnings/config
12 changes: 10 additions & 2 deletions sdp/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@
CopyManifestData,
CountNumWords,
ExtractFromBrackets,
FfmpegConvert,
GetAudioDuration,
GetWER,
InsIfASRInsertion,
Expand All @@ -108,7 +107,6 @@
MakeSentence,
ReadDocxLines,
ReadTxtLines,
SoxConvert,
SplitLineBySentence,
SubIfASRSubstitution,
SubMakeLowercase,
Expand Down Expand Up @@ -136,6 +134,16 @@
from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
MakeLettersUppercaseAfterPeriod,
)
from sdp.processors.manage_files.convert_audio import (
FfmpegConvert,
SoxConvert,
)
from sdp.processors.manage_files.extract import (
ExtractTar,
)
from sdp.processors.manage_files.remove import (
RemoveFiles,
)
from sdp.processors.nemo.asr_inference import ASRInference
from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
from sdp.processors.nemo.pc_inference import PCInference
Expand Down
1 change: 0 additions & 1 deletion sdp/processors/base_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from itertools import chain
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm
Expand Down
13 changes: 13 additions & 0 deletions sdp/processors/manage_files/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
170 changes: 170 additions & 0 deletions sdp/processors/manage_files/convert_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Optional
from sox import Transformer

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry

from sdp.utils.common import ffmpeg_convert


class FfmpegConvert(BaseParallelProcessor):
"""
Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio.
If ``id_key`` is not None, the output file path will be ``<resampled_audio_dir>/<id_key>.wav``.
If ``id_key`` is None, the output file path will be ``<resampled_audio_dir>/<input file name without extension>.wav``.

.. note:: ``id_key`` can be used to create subdirectories inside ``resampled_audio_dir`` (by using forward slashes ``/``).
e.g. if ``id_key`` takes the form ``dir_name1/dir_name2/filename``, the output file path will be

``<resampled_audio_dir>/dir_name1/dirname2/filename.wav``.

Args:
converted_audio_dir (str): The directory to store the resampled audio files.
input_file_key (str): The field in the dataset representing the path to the input video or audio files.
output_file_key (str): The field in the dataset representing the path to the resampled audio files with ``output_format``. If ``id_key`` is None, the output file path will be ``<resampled_audio_dir>/<input file name without extension>.wav``.
id_key (str): (Optional) The field in the dataset representing the unique ID or identifier for each entry. If ``id_key`` is not None, the output file path will be ``<resampled_audio_dir>/<id_key>.wav``. Defaults to None.
output_format (str): (Optional) Format of the output audio files. Defaults to `wav`.
target_samplerate (int): (Optional) The target sampling rate for the resampled audio. Defaults to 16000.
target_nchannels (int): (Optional) The target number of channels for the resampled audio. Defaults to 1.
**kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.

"""

def __init__(
self,
converted_audio_dir: str,
input_file_key: str,
output_file_key: str,
id_key: str = None,
output_format: str = "wav",
base_dir: str = None,
target_samplerate: int = 16000,
target_nchannels: int = 1,
**kwargs,
):
super().__init__(**kwargs)
self.converted_audio_dir = converted_audio_dir
self.input_file_key = input_file_key
self.output_file_key = output_file_key
self.output_format = output_format
self.id_key = id_key
self.base_dir = base_dir
self.target_samplerate = target_samplerate
self.target_nchannels = target_nchannels

def prepare(self):
assert self.output_format == "wav", "Currently only wav format is supported"
os.makedirs(self.converted_audio_dir, exist_ok=True)

def process_dataset_entry(self, data_entry):
input_file = data_entry[self.input_file_key]
if self.id_key:
key = data_entry[self.id_key]
os.makedirs(os.path.join(self.converted_audio_dir, *key.split("/")[:-1]), exist_ok=True)
else:
key = os.path.splitext(input_file)[0].split("/")[-1]

if self.base_dir:
new_dir = os.path.dirname(os.path.relpath(input_file, self.base_dir))
os.makedirs(os.path.join(self.converted_audio_dir, new_dir), exist_ok=True)

key = os.path.join(new_dir, key)

audio_file = os.path.join(self.converted_audio_dir, key) + "." + self.output_format

if not os.path.isfile(audio_file):
ffmpeg_convert(input_file, audio_file, self.target_samplerate, self.target_nchannels)

data_entry[self.output_file_key] = audio_file
return [DataEntry(data=data_entry)]


class SoxConvert(BaseParallelProcessor):
"""Processor for Sox to convert audio files to specified format.

Args:
output_manifest_file (str): Path to the output manifest file.
input_audio_file_key (str): Key in the manifest file that contains the path to the input audio file.
output_audio_file_key (str): Key in the manifest file that contains the path to the output audio file.
converted_audio_dir (str): Path to the directory where the converted audio files will be stored.
output_format (str): Format of the output audio file.
rate (int): Sample rate of the output audio file.
channels (int): Number of channels of the output audio file.
workspace_dir (str, Optional): Path to the workspace directory. Defaults to None.
"""

def __init__(
self,
converted_audio_dir: str,
input_audio_file_key: str = "audio_filepath",
output_audio_file_key: str = "audio_filepath",
output_format: str = "wav",
rate: int = 16000,
channels: int = 1,
workspace_dir: Optional[str] = None,
**kwargs,
):
# Extract workspace_dir from kwargs to avoid passing it to BaseProcessor
if "workspace_dir" in kwargs:
workspace_dir = kwargs.pop("workspace_dir")

super().__init__(**kwargs)
self.input_audio_file_key = input_audio_file_key
self.output_audio_file_key = output_audio_file_key
self.converted_audio_dir = converted_audio_dir
self.output_format = output_format
self.workspace_dir = workspace_dir

# Store the new parameters for later use:
self.rate = rate
self.channels = channels

def prepare(self):
# Debug print for workspace_dir
logger.info(f"SoxConvert workspace_dir: {self.workspace_dir}")
os.makedirs(self.converted_audio_dir, exist_ok=True)

def process_dataset_entry(self, data_entry):
audio_path = data_entry[self.input_audio_file_key]

# If workspace_dir is provided, join it with audio_path to get absolute path
if self.workspace_dir is not None:
full_audio_path = os.path.join(self.workspace_dir, audio_path)
else:
full_audio_path = audio_path

# Debug print first file path
if not hasattr(self, '_debug_printed'):
logger.info(f"First audio_path from manifest: {audio_path}")
logger.info(f"First full_audio_path: {full_audio_path}")
logger.info(f"Path exists: {os.path.exists(full_audio_path)}")
self._debug_printed = True

key = os.path.splitext(audio_path)[0].split("/")[-1]
converted_file = os.path.join(self.converted_audio_dir, key) + f".{self.output_format}"

if not os.path.isfile(converted_file):
transformer = Transformer()

transformer.rate(self.rate)
transformer.channels(self.channels)

transformer.build(full_audio_path, converted_file)

data_entry[self.output_audio_file_key] = converted_file
return [DataEntry(data=data_entry)]
Loading
Loading