From fd8156183800bccc51918288caba6600d1ac16b0 Mon Sep 17 00:00:00 2001 From: Nithin Rao Koluguri Date: Tue, 3 Jun 2025 11:41:38 -0700 Subject: [PATCH 1/3] Add earnings config Signed-off-by: Nithin Rao Koluguri --- .../english/earnings21/config.yaml | 197 ++++ sdp/processors/__init__.py | 7 + .../datasets/earnings21/__init__.py | 24 + .../earnings21/apply_normalizations.py | 90 ++ .../earnings21/create_initial_manifest.py | 1010 +++++++++++++++++ 5 files changed, 1328 insertions(+) create mode 100644 dataset_configs/english/earnings21/config.yaml create mode 100644 sdp/processors/datasets/earnings21/__init__.py create mode 100644 sdp/processors/datasets/earnings21/apply_normalizations.py create mode 100644 sdp/processors/datasets/earnings21/create_initial_manifest.py diff --git a/dataset_configs/english/earnings21/config.yaml b/dataset_configs/english/earnings21/config.yaml new file mode 100644 index 00000000..d09ac5f2 --- /dev/null +++ b/dataset_configs/english/earnings21/config.yaml @@ -0,0 +1,197 @@ +# Configuration for processing Earnings21/22 datasets to NeMo format +# This config implements a 5-step pipeline with forced alignment: +# 1. CreateInitialAudioAndManifest: Create full audio manifest with duration +# 2. CreateFullAudioManifestEarnings21: Add ground truth text from NLP files +# 3. SubRegex: Clean text patterns +# 4. NeMoForcedAligner: Generate word-level CTM files using NeMo Forced Aligner +# 5. CreateSentenceSegmentedManifest: Create sentence-level segments based on NeMo Forced Aligner CTM files +# 6. SpeakerSegmentedManifest: Create speaker-level segments (optional) + +# Global parameters (ensure these are set, e.g., via command line or here) +output_directory: ?? # E.g., /path/to/your/main_output_sdp/ +dataset_root: ?? # E.g., /disk7/datasets/speech-datasets/earnings21 or /disk7/datasets/speech-datasets/earnings22 +raw_audio_input_dir: ${dataset_root}/media # Raw audio source directory + +# Dataset configuration +dataset_type: "earnings21" # Options: "earnings21" or "earnings22" +subset: "full" # Options: "full" or "eval10" (earnings21 only) +test_mode: false # Set to true to process only 2 files for testing + +# Dask configuration +use_dask: false + +# Text processing parameters +preserve_punctuation: true +preserve_capitalization: true + +# Output options +include_speaker_info: true +include_tags: false # Set to true to include entity tags (earnings21 only) +use_speaker_metadata_csv: false # Set to true to map speaker IDs to names from speaker-metadata.csv (earnings21 only) + +# Forced Alignment parameters +forced_alignment_model: nvidia/parakeet-tdt_ctc-1.1b # NeMo ASR model for forced alignment with CTC head +device: "cuda" # Device for forced alignment + +processors: + # Step 1: Create initial manifest with full audio files and duration + - _target_: sdp.processors.datasets.earnings21.CreateInitialAudioAndManifest + dataset_root: ${dataset_root} + raw_audio_source_dir: ${raw_audio_input_dir} + output_manifest_file: ${output_directory}/01_initial_audio_manifest.json + dataset_type: ${dataset_type} + subset: ${subset} + test_mode: ${test_mode} + + # Step 2: Add ground truth text from NLP files to the manifest + - _target_: sdp.processors.datasets.earnings21.CreateFullAudioManifestEarnings21 + input_manifest_file: ${output_directory}/01_initial_audio_manifest.json + dataset_root: ${dataset_root} + output_manifest_file: ${output_directory}/02_full_audio_with_text_manifest.json + dataset_type: ${dataset_type} + preserve_punctuation: ${preserve_punctuation} + 
preserve_capitalization: ${preserve_capitalization} + + # Step 3: Clean text patterns + - _target_: sdp.processors.SubRegex + input_manifest_file: ${output_directory}/02_full_audio_with_text_manifest.json + output_manifest_file: ${output_directory}/03_full_audio_with_text_manifest_cleaned.json + regex_params_list: + - {"pattern": "[…+×]", "repl": ""} + # remove text inside <> + - {"pattern": "<.*?>", "repl": ""} + - {"pattern": "\\[.*?\\]", "repl": ""} + + # Step 4: NeMo Forced Alignment - Generate word-level CTM files + - _target_: sdp.processors.datasets.earnings21.NeMoForcedAligner + input_manifest_file: ${output_directory}/03_full_audio_with_text_manifest_cleaned.json + output_manifest_file: ${output_directory}/04_aligned_manifest.json + output_dir: ${output_directory}/forced_alignment_output + pretrained_name: ${forced_alignment_model} + device: ${device} + batch_size: 1 + + # Step 5: Create sentence-level segments based on CTM files + - _target_: sdp.processors.datasets.earnings21.CreateSentenceSegmentedManifest + input_manifest_file: ${output_directory}/04_aligned_manifest.json + ctm_dir: ${output_directory}/forced_alignment_output/ctm/words + output_manifest_file: ${output_directory}/05_sentence_segmented_manifest.json + + # Step 6: Create speaker-level segments (optional) + - _target_: sdp.processors.datasets.earnings21.SpeakerSegmentedManifest + input_manifest_file: ${output_directory}/03_full_audio_with_text_manifest_cleaned.json + dataset_root: ${dataset_root} + output_manifest_file: ${output_directory}/06_speaker_segmented_manifest.json + dataset_type: ${dataset_type} + preserve_punctuation: ${preserve_punctuation} + preserve_capitalization: ${preserve_capitalization} + include_speaker_info: ${include_speaker_info} + include_tags: ${include_tags} + use_speaker_metadata_csv: ${use_speaker_metadata_csv} + + # Step 7: Filter manifest to keep only required fields + - _target_: sdp.processors.KeepOnlySpecifiedFields + input_manifest_file: ${output_directory}/05_sentence_segmented_manifest.json + output_manifest_file: ${output_directory}/07_final_filtered_manifest.json + fields_to_keep: ["audio_filepath", "duration", "offset", "text"] + +# Expected output from this 5-step pipeline: +# 1. ${output_directory}/01_initial_audio_manifest.json - Full audio manifest with duration +# 2. ${output_directory}/02_full_audio_with_text_manifest.json - Full audio with ground truth text +# 3. ${output_directory}/03_full_audio_with_text_manifest_cleaned.json - Cleaned audio with text +# 4. ${output_directory}/04_aligned_manifest.json - Final aligned manifest with word-level timestamps +# 5. ${output_directory}/05_sentence_segmented_manifest.json - Sentence-level segments based on CTM files +# 6. 
${output_directory}/06_speaker_segmented_manifest.json - Speaker-level segments + +# Usage examples: +# For Earnings21: +# python main.py --config-path=dataset_configs/english/earnings21 --config-name=config dataset_type=earnings21 dataset_root=/path/to/earnings21 output_directory=/path/to/output +# +# For Earnings22: +# python main.py --config-path=dataset_configs/english/earnings21 --config-name=config dataset_type=earnings22 dataset_root=/path/to/earnings22 output_directory=/path/to/output +# +# For eval10 subset (earnings21 only): +# python main.py --config-path=dataset_configs/english/earnings21 --config-name=config dataset_type=earnings21 subset=eval10 dataset_root=/path/to/earnings21 output_directory=/path/to/output + +# Expected output format for Step 1 (full audio manifest): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 1800.0, # Actual audio duration in seconds +# "text": "", # Placeholder text +# "file_id": "original_file_id" +# } + +# Expected output format for Step 2 (full audio with text): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 1800.0, # Actual audio duration in seconds +# "text": "Complete transcribed text with punctuation and capitalization.", +# "file_id": "original_file_id" +# } + +# Expected output format for Step 3 (cleaned audio with text): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 1800.0, # Actual audio duration in seconds +# "text": "Complete transcribed text with punctuation and capitalization.", +# "file_id": "original_file_id" +# } + +# Expected output format for Step 4 (aligned manifest): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 15.2, # Actual segment duration from forced alignment +# "text": "This is the transcribed text for this speaker segment.", +# "file_id": "original_file_id", +# "segment_id": 0, +# "start_time": null, +# "end_time": null, +# "speaker": "speaker_1", +# "alignment": [ # Word-level alignments from NeMo Forced Aligner +# {"word": "This", "start": 0.0, "end": 0.3}, +# {"word": "is", "start": 0.3, "end": 0.5}, +# {"word": "the", "start": 0.5, "end": 0.7}, +# ... +# ] +# } + +# Expected output format for Step 5 (sentence-level segments): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 15.2, # Actual segment duration from forced alignment +# "text": "This is the transcribed text for this speaker segment.", +# "file_id": "original_file_id", +# "segment_id": 0, +# "start_time": null, +# "end_time": null, +# "speaker": "speaker_1", +# "alignment": [ # Word-level alignments from NeMo Forced Aligner +# {"word": "This", "start": 0.0, "end": 0.3}, +# {"word": "is", "start": 0.3, "end": 0.5}, +# {"word": "the", "start": 0.5, "end": 0.7}, +# ... 
+# ] +# } + +# Expected output format for Step 6 (speaker-level segments): +# { +# "audio_filepath": "/path/to/dataset/media/file_id.mp3", +# "duration": 0, # No duration calculation +# "text": "This is the transcribed text for this speaker segment.", +# "file_id": "original_file_id", +# "segment_id": 0, +# "start_time": null, # No timing information +# "end_time": null, # No timing information +# "speaker": "speaker_1" # If include_speaker_info=true +# } + +# Key features of this 5-step pipeline: +# - Step 1: Creates full audio manifest with actual duration from audio files +# - Step 2: Adds ground truth text from NLP files (full transcript per file) +# - Step 3: Cleans text patterns +# - Step 4: Adds word-level alignments using NeMo Forced Aligner while preserving ground truth text +# - Step 5: Creates sentence-level segments based on CTM files +# - Step 6: Creates speaker-level segments (optional) +# - Final output includes precise timing information for each word +# - Supports both earnings21 and earnings22 +# - Clean separation of concerns between steps \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index df860331..13ef0eae 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -21,6 +21,13 @@ CreateInitialManifestCORAAL, TrainDevTestSplitCORAAL, ) +from sdp.processors.datasets.earnings21 import ( + CreateInitialAudioAndManifest, + CreateFullAudioManifestEarnings21, + SpeakerSegmentedManifest, + CreateSentenceSegmentedManifest, + ApplyEarnings21Normalizations, +) from sdp.processors.datasets.fleurs.create_initial_manifest import ( CreateInitialManifestFleurs, ) diff --git a/sdp/processors/datasets/earnings21/__init__.py b/sdp/processors/datasets/earnings21/__init__.py new file mode 100644 index 00000000..dc0470ad --- /dev/null +++ b/sdp/processors/datasets/earnings21/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sdp.processors.datasets.earnings21.create_initial_manifest import ( + CreateInitialAudioAndManifest, + CreateFullAudioManifestEarnings21, + SpeakerSegmentedManifest, + CreateSentenceSegmentedManifest, + NeMoForcedAligner, +) +from sdp.processors.datasets.earnings21.apply_normalizations import ( + ApplyEarnings21Normalizations, +) \ No newline at end of file diff --git a/sdp/processors/datasets/earnings21/apply_normalizations.py b/sdp/processors/datasets/earnings21/apply_normalizations.py new file mode 100644 index 00000000..2f70874f --- /dev/null +++ b/sdp/processors/datasets/earnings21/apply_normalizations.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Any + +from sdp.processors.base_processor import BaseProcessor, DataEntry + + +class ApplyEarnings21Normalizations(BaseProcessor): + """Apply text normalizations using Earnings 21 normalization data. + + This processor uses the normalization files provided with the Earnings 21 dataset + to apply text normalizations based on probability scores. + + Args: + earnings21_root (str): path to the root directory of Earnings 21 dataset. + use_top_candidate (bool): whether to use the highest probability candidate. Defaults to True. + fallback_to_original (bool): whether to fallback to original text if no normalization available. Defaults to True. + preserve_entity_tags (bool): whether to preserve entity tags during normalization. Defaults to True. + """ + + def __init__( + self, + earnings21_root: str, + use_top_candidate: bool = True, + fallback_to_original: bool = True, + preserve_entity_tags: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.earnings21_root = Path(earnings21_root) + self.use_top_candidate = use_top_candidate + self.fallback_to_original = fallback_to_original + self.preserve_entity_tags = preserve_entity_tags + + def process_dataset_entry(self, data_entry: DataEntry) -> List[DataEntry]: + """Process a single dataset entry to apply normalizations.""" + data = data_entry.data + + # Extract file_id to load corresponding normalization file + file_id = data.get('file_id') + if not file_id: + # If no file_id, return original entry + return [data_entry] + + # Load normalization data for this file + norm_file = self.earnings21_root / "transcripts" / "normalizations" / f"{file_id}.norm.json" + + if not norm_file.exists(): + # If no normalization file, return original entry + return [data_entry] + + try: + with open(norm_file, 'r', encoding='utf-8') as f: + normalizations = json.load(f) + except (json.JSONDecodeError, FileNotFoundError): + # If can't load normalization file, return original entry + return [data_entry] + + # Apply normalizations to text + normalized_text = self._apply_normalizations(data.get('text', ''), normalizations) + + # Create new data entry with normalized text + new_data = data.copy() + new_data['text'] = normalized_text + + return [DataEntry(data=new_data)] + + def _apply_normalizations(self, text: str, normalizations: Dict[str, Any]) -> str: + """Apply normalizations to text based on normalization data.""" + # This is a simplified implementation + # In practice, you would need to map tokens to normalization IDs + # and apply the appropriate normalizations + + # For now, just return the original text + # This can be extended to implement actual normalization logic + return text \ No newline at end of file diff --git a/sdp/processors/datasets/earnings21/create_initial_manifest.py b/sdp/processors/datasets/earnings21/create_initial_manifest.py new file mode 100644 index 00000000..0fce7757 --- /dev/null +++ b/sdp/processors/datasets/earnings21/create_initial_manifest.py @@ -0,0 +1,1010 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import json +import os +import re +from pathlib import Path +from typing import List, Dict, Any, Optional + +import pandas as pd +import librosa +import soundfile as sf + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, BaseProcessor, DataEntry +from sdp.utils.common import extract_archive + + +# Step 1: Create Initial Audio and Manifest (Full Audio) +class CreateInitialAudioAndManifest(BaseParallelProcessor): + """ + Step 1: Create initial manifest with full audio files. + + Features: + - Supports both earnings21 and earnings22 + - Creates manifest pointing to original audio files + - No text processing (placeholder text) + - Gets audio duration from files + """ + + def __init__( + self, + dataset_root: str, + raw_audio_source_dir: str, + output_manifest_file: str, + dataset_type: str = "earnings21", # "earnings21" or "earnings22" + subset: str = "full", + test_mode: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.dataset_root = Path(dataset_root) + self.raw_audio_source_dir = Path(raw_audio_source_dir) + self.output_manifest_file = output_manifest_file + self.dataset_type = dataset_type + self.subset = subset + self.test_mode = test_mode + + # Create converted audio directory + self.converted_audio_dir = Path(self.output_manifest_file).parent / "converted_audio" + self.converted_audio_dir.mkdir(parents=True, exist_ok=True) + + def prepare(self): + """Prepare the processor by loading file metadata.""" + # Extract archives if needed (earnings21 only) + if self.dataset_type == "earnings21": + for archive_name in ["transcripts.tar.gz", "audio.tar.gz"]: + archive_path = self.dataset_root / archive_name + if archive_path.exists(): + extract_archive(str(archive_path), str(self.dataset_root)) + + # Load file list based on dataset type and subset + if self.dataset_type == "earnings21": + if self.subset == "eval10": + metadata_file = self.dataset_root / "eval10-file-metadata.csv" + else: + metadata_file = self.dataset_root / "earnings21-file-metadata.csv" + else: # earnings22 + metadata_file = self.dataset_root / "metadata.csv" + + # If metadata file doesn't exist, discover files from audio directory + if not metadata_file.exists(): + logger.warning(f"Metadata file not found: {metadata_file}. 
Discovering files from audio directory.") + file_ids = [] + for ext in ['*.mp3', '*.wav']: + file_ids.extend([f.stem for f in self.raw_audio_source_dir.glob(ext)]) + self.file_ids = file_ids + else: + file_metadata_df = pd.read_csv(metadata_file) + # Handle different column names between earnings21 and earnings22 + if 'file_id' in file_metadata_df.columns: + self.file_ids = file_metadata_df['file_id'].astype(str).tolist() + elif 'File ID' in file_metadata_df.columns: + self.file_ids = file_metadata_df['File ID'].astype(str).tolist() + else: + raise ValueError(f"Neither 'file_id' nor 'File ID' column found in {metadata_file}") + + if self.test_mode: + self.file_ids = self.file_ids[:2] + + logger.info(f"Loaded {len(self.file_ids)} file IDs for {self.dataset_type} subset {self.subset}.") + + def _convert_audio_if_needed(self, audio_file: Path, file_id: str) -> Path: + """ + Convert audio file to single-channel 16kHz WAV if needed. + + Args: + audio_file: Path to the original audio file + file_id: File ID for naming the converted file + + Returns: + Path to the audio file to use (original or converted) + """ + try: + # Load audio to check properties + audio_data, sample_rate = librosa.load(str(audio_file), sr=None, mono=False) + + # Check if conversion is needed + needs_conversion = False + conversion_reasons = [] + + # Check if it's MP3 + if audio_file.suffix.lower() == '.mp3': + needs_conversion = True + conversion_reasons.append("MP3 format") + + # Check if it's multi-channel + if audio_data.ndim > 1: + needs_conversion = True + conversion_reasons.append(f"{audio_data.shape[0]} channels") + + # Check if sample rate is not 16kHz + if sample_rate != 16000: + needs_conversion = True + conversion_reasons.append(f"{sample_rate}Hz sample rate") + + if not needs_conversion: + logger.debug(f"No conversion needed for {file_id}") + return audio_file + + # Convert audio + logger.info(f"Converting {file_id}: {', '.join(conversion_reasons)} -> single-channel 16kHz WAV") + + # Load as mono and resample to 16kHz + audio_mono, _ = librosa.load(str(audio_file), sr=16000, mono=True) + + # Save as WAV + converted_file = self.converted_audio_dir / f"{file_id}.wav" + sf.write(str(converted_file), audio_mono, 16000) + + logger.debug(f"Converted audio saved to {converted_file}") + return converted_file + + except Exception as e: + logger.error(f"Error converting audio file {audio_file}: {e}") + # Return original file if conversion fails + return audio_file + + def read_manifest(self): + """Read and process all files to create manifest entries.""" + return self.file_ids + + def process_dataset_entry(self, file_id: str) -> List[DataEntry]: + """Process a single file to create full audio manifest entry.""" + file_id = str(file_id) + + # Find audio file + audio_file = None + for ext in ['.mp3', '.wav']: + potential_path = self.raw_audio_source_dir / f"{file_id}{ext}" + if potential_path.exists(): + audio_file = potential_path + break + + if not audio_file: + logger.warning(f"Audio file not found for {file_id}") + return [] + + try: + # Convert audio if needed (handles MP3, multi-channel, non-16kHz) + final_audio_file = self._convert_audio_if_needed(audio_file, file_id) + + # Get audio duration from the final audio file + duration = librosa.get_duration(path=str(final_audio_file)) + + # Create manifest entry + entry_data = { + "audio_filepath": str(final_audio_file), + "duration": duration, + "text": "", # Placeholder text + "file_id": file_id, + } + + return [DataEntry(data=entry_data)] + + except Exception as e: + 
logger.error(f"Error processing audio file {file_id}: {e}") + return [] + + +# Step 2: Populate Full Text for Manifest +class CreateFullAudioManifestEarnings21(BaseParallelProcessor): + """ + Step 2: Add ground truth text from NLP files to the manifest. + + Features: + - Supports both earnings21 and earnings22 + - Reconstructs full text from NLP tokens + - Preserves punctuation and capitalization + """ + + def __init__( + self, + input_manifest_file: str, + dataset_root: str, + output_manifest_file: str, + dataset_type: str = "earnings21", # "earnings21" or "earnings22" + preserve_punctuation: bool = True, + preserve_capitalization: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_file = input_manifest_file + self.dataset_root = Path(dataset_root) + self.output_manifest_file = output_manifest_file + self.dataset_type = dataset_type + self.preserve_punctuation = preserve_punctuation + self.preserve_capitalization = preserve_capitalization + + def _get_nlp_file_path(self, file_id: str) -> Path: + """Get NLP file path based on dataset type.""" + if self.dataset_type == "earnings21": + return self.dataset_root / "transcripts" / "nlp_references" / f"{file_id}.nlp" + else: # earnings22 + # Check both possible locations for earnings22 + nlp_path1 = self.dataset_root / "transcripts" / "nlp_references" / f"{file_id}.nlp" + nlp_path2 = self.dataset_root / "subset10" / "nonverbatim_transcripts" / "nlp_references" / f"{file_id}.nlp" + + if nlp_path1.exists(): + return nlp_path1 + elif nlp_path2.exists(): + return nlp_path2 + else: + return nlp_path1 # Return first path for error reporting + + def _load_nlp_file(self, file_id: str) -> List[Dict[str, Any]]: + """Load NLP file containing tokens and metadata.""" + nlp_file = self._get_nlp_file_path(file_id) + + if not nlp_file.exists(): + logger.warning(f"NLP file not found: {nlp_file}") + return [] + + tokens_list = [] + try: + with open(nlp_file, 'r', encoding='utf-8') as f: + reader = csv.reader(f, delimiter='|') + try: + header = next(reader) + except StopIteration: + logger.warning(f"NLP file {nlp_file} is empty or has no header.") + return [] + + for i, row_values in enumerate(reader): + if len(row_values) == len(header): + token_data = dict(zip(header, row_values)) + + # Parse 'tags' and 'wer_tags' fields if they are string representations of lists + for key_to_parse in ['tags', 'wer_tags']: + if key_to_parse in token_data: + field_value = token_data[key_to_parse] + if isinstance(field_value, str): + try: + token_data[key_to_parse] = json.loads(field_value) + except json.JSONDecodeError: + if field_value and field_value != "[]": + logger.debug(f"Field '{key_to_parse}' in {nlp_file} non-JSON: {field_value}") + tokens_list.append(token_data) + else: + logger.warning(f"Skipping malformed row in {nlp_file} (row {i+2})") + return tokens_list + + except Exception as e: + logger.error(f"Error processing NLP file {nlp_file}: {e}") + return [] + + def _reconstruct_text(self, tokens: List[Dict[str, Any]]) -> str: + """Reconstruct text from tokens with proper spacing and punctuation.""" + if not tokens: + return "" + + text_parts = [] + for token in tokens: + token_text = token.get('token', '').strip() + if not token_text: + continue + + text_parts.append(token_text) + # Add punctuation if preserving and it exists + if self.preserve_punctuation and token.get('punctuation'): + text_parts.append(token.get('punctuation')) + + # Join with spaces and clean up punctuation spacing + text = " ".join(text_parts) + if 
self.preserve_punctuation: + # Remove spaces before common punctuation marks + text = re.sub(r'\s+([,.!?;:])', r'\1', text) + + if not self.preserve_capitalization: + text = text.lower() + + # Final cleanup of multiple spaces + text = re.sub(r'\s+', ' ', text).strip() + return text + + def process_dataset_entry(self, data_entry: Dict[str, Any]) -> List[DataEntry]: + """Process a single manifest entry to add full text.""" + file_id = data_entry['file_id'] + tokens = self._load_nlp_file(file_id) + + if not tokens: + logger.warning(f"No NLP tokens for {file_id}, text will be empty.") + data_entry['text'] = data_entry.get('text', '') + else: + data_entry['text'] = self._reconstruct_text(tokens) + + return [DataEntry(data=data_entry)] + + +# Step 3: Create Speaker-level Segmented Manifest (renamed from CreateFinalSegmentedManifest) +class SpeakerSegmentedManifest(BaseParallelProcessor): + """ + Step 6: Create speaker-segmented manifest without duration calculation. + + Features: + - Supports both earnings21 and earnings22 + - Speaker-level segmentation based on NLP files + - No duration calculation (set to None) + - Optional speaker name mapping + """ + + def __init__( + self, + input_manifest_file: str, + dataset_root: str, + output_manifest_file: str, + dataset_type: str = "earnings21", # "earnings21" or "earnings22" + preserve_punctuation: bool = True, + preserve_capitalization: bool = True, + include_speaker_info: bool = True, + include_tags: bool = False, + use_speaker_metadata_csv: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_file = input_manifest_file + self.dataset_root = Path(dataset_root) + self.output_manifest_file = output_manifest_file + self.dataset_type = dataset_type + self.preserve_punctuation = preserve_punctuation + self.preserve_capitalization = preserve_capitalization + self.include_speaker_info = include_speaker_info + self.include_tags = include_tags + self.use_speaker_metadata_csv = use_speaker_metadata_csv + self.speaker_name_map = {} + + def prepare(self): + """Prepare the processor by loading speaker metadata if needed.""" + # Load speaker metadata if requested (earnings21 only) + if self.use_speaker_metadata_csv and self.dataset_type == "earnings21": + self._load_speaker_metadata() + + def _load_speaker_metadata(self): + """Load speaker metadata for earnings21.""" + metadata_file = self.dataset_root / "speaker-metadata.csv" + if not metadata_file.exists(): + logger.warning(f"Speaker metadata file not found: {metadata_file}") + return + + try: + df = pd.read_csv(metadata_file) + for _, row in df.iterrows(): + file_id_key = str(row['file_id']) + speaker_id_key = str(row['speaker_id']) + if file_id_key not in self.speaker_name_map: + self.speaker_name_map[file_id_key] = {} + self.speaker_name_map[file_id_key][speaker_id_key] = row['speaker_name'] + logger.info(f"Loaded speaker metadata from {metadata_file}") + except Exception as e: + logger.error(f"Error loading speaker metadata {metadata_file}: {e}") + + def _get_nlp_file_path(self, file_id: str) -> Path: + """Get NLP file path based on dataset type.""" + if self.dataset_type == "earnings21": + return self.dataset_root / "transcripts" / "nlp_references" / f"{file_id}.nlp" + else: # earnings22 + # Check both possible locations for earnings22 + nlp_path1 = self.dataset_root / "transcripts" / "nlp_references" / f"{file_id}.nlp" + nlp_path2 = self.dataset_root / "subset10" / "nonverbatim_transcripts" / "nlp_references" / f"{file_id}.nlp" + + if nlp_path1.exists(): + return nlp_path1 + 
elif nlp_path2.exists(): + return nlp_path2 + else: + return nlp_path1 # Return first path for error reporting + + def _load_nlp_file(self, file_id: str) -> List[Dict[str, Any]]: + """Load NLP file containing tokens and metadata.""" + nlp_file = self._get_nlp_file_path(file_id) + + if not nlp_file.exists(): + logger.warning(f"NLP file not found: {nlp_file}") + return [] + + tokens_list = [] + try: + with open(nlp_file, 'r', encoding='utf-8') as f: + reader = csv.reader(f, delimiter='|') + try: + header = next(reader) + except StopIteration: + logger.warning(f"NLP file {nlp_file} is empty or has no header.") + return [] + + for i, row_values in enumerate(reader): + if len(row_values) == len(header): + token_data = dict(zip(header, row_values)) + + # Parse 'tags' and 'wer_tags' fields if they are string representations of lists + for key_to_parse in ['tags', 'wer_tags']: + if key_to_parse in token_data: + field_value = token_data[key_to_parse] + if isinstance(field_value, str): + try: + token_data[key_to_parse] = json.loads(field_value) + except json.JSONDecodeError: + if field_value and field_value != "[]": + logger.debug(f"Field '{key_to_parse}' in {nlp_file} non-JSON: {field_value}") + tokens_list.append(token_data) + else: + logger.warning(f"Skipping malformed row in {nlp_file} (row {i+2})") + return tokens_list + + except Exception as e: + logger.error(f"Error processing NLP file {nlp_file}: {e}") + return [] + + def _load_entity_tags(self, file_id: str) -> Dict[str, Dict[str, str]]: + """Load entity tags file (earnings21 only).""" + if self.dataset_type != "earnings21": + return {} + + tags_file = self.dataset_root / "transcripts" / "tags" / f"{file_id}.tags.json" + if not tags_file.exists(): + return {} + + try: + with open(tags_file, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + logger.warning(f"Error loading tags file {tags_file}: {e}") + return {} + + def _reconstruct_text(self, tokens: List[Dict[str, Any]]) -> str: + """Reconstruct text from tokens with proper spacing and punctuation.""" + if not tokens: + return "" + + text_parts = [] + for token in tokens: + token_text = token.get('token', '').strip() + if not token_text: + continue + + text_parts.append(token_text) + # Add punctuation if preserving and it exists + if self.preserve_punctuation and token.get('punctuation'): + text_parts.append(token.get('punctuation')) + + # Join with spaces and clean up punctuation spacing + text = " ".join(text_parts) + if self.preserve_punctuation: + # Remove spaces before common punctuation marks + text = re.sub(r'\s+([,.!?;:])', r'\1', text) + + if not self.preserve_capitalization: + text = text.lower() + + # Final cleanup of multiple spaces + text = re.sub(r'\s+', ' ', text).strip() + return text + + def _create_segments(self, tokens: List[Dict[str, Any]], file_id: str) -> List[Dict[str, Any]]: + """Create segments based on speaker changes.""" + if not tokens: + return [] + + segments = [] + current_segment_tokens = [] + current_speaker_id = tokens[0].get('speaker', 'unknown_speaker_0') if tokens else 'unknown_speaker_0' + + for token in tokens: + token_speaker_id = token.get('speaker', current_speaker_id) + + # Check for speaker change + if token_speaker_id != current_speaker_id and current_segment_tokens: + # Finalize current segment + segment_text = self._reconstruct_text(current_segment_tokens) + if segment_text.strip(): + segments.append({ + 'tokens': current_segment_tokens, + 'text': segment_text, + 'speaker_id': current_speaker_id, + 'file_id': 
file_id, + }) + + # Start new segment + current_segment_tokens = [token] + current_speaker_id = token_speaker_id + else: + current_segment_tokens.append(token) + + # Handle last segment + if current_segment_tokens: + segment_text = self._reconstruct_text(current_segment_tokens) + if segment_text.strip(): + segments.append({ + 'tokens': current_segment_tokens, + 'text': segment_text, + 'speaker_id': current_speaker_id, + 'file_id': file_id, + }) + + return segments + + def process_dataset_entry(self, full_audio_manifest_entry: Dict[str, Any]) -> List[DataEntry]: + """Process a single full audio manifest entry to create segmented entries.""" + file_id = full_audio_manifest_entry['file_id'] + audio_filepath = full_audio_manifest_entry['audio_filepath'] + + logger.info(f"Processing file {file_id} for segmentation") + + # Load NLP tokens + tokens = self._load_nlp_file(file_id) + if not tokens: + logger.warning(f"No NLP tokens for {file_id}, cannot create segments.") + return [] + + # Load entity tags if requested + entity_tags = self._load_entity_tags(file_id) if self.include_tags else {} + + # Create segments + segments = self._create_segments(tokens, file_id) + logger.info(f"Created {len(segments)} segments for file {file_id}") + + # Create manifest entries + output_entries = [] + for idx, segment_dict in enumerate(segments): + segment_text = segment_dict['text'] + speaker_id = segment_dict['speaker_id'] + + # Create manifest entry + manifest_entry_data = { + "audio_filepath": audio_filepath, # Point to original audio file + "duration": 0, # Set to 0 instead of None to avoid TypeError in base processor + "text": segment_text, + "file_id": file_id, + "segment_id": idx, + "start_time": None, # No timing information + "end_time": None, # No timing information + } + + # Add speaker information + if self.include_speaker_info: + speaker_name = speaker_id # Default to ID + if (self.use_speaker_metadata_csv and + file_id in self.speaker_name_map and + speaker_id in self.speaker_name_map[file_id]): + speaker_name = self.speaker_name_map[file_id][speaker_id] + manifest_entry_data["speaker"] = speaker_name + + # Add tags if requested + if self.include_tags: + segment_tags = [] + segment_entities = [] + + # Extract basic tags from tokens + for token in segment_dict.get('tokens', []): + if token.get('tags') and str(token['tags']).strip(): + tag_val = str(token['tags']).strip() + tag_type = tag_val.split(':', 1)[1].strip() if ':' in tag_val else tag_val + if tag_type and tag_type not in segment_tags: + segment_tags.append(tag_type) + + manifest_entry_data["tags"] = segment_tags + manifest_entry_data["entities"] = segment_entities + + output_entries.append(DataEntry(data=manifest_entry_data)) + + logger.info(f"Successfully processed {len(output_entries)} segments for file {file_id}") + return output_entries + + +# Step 5: Create Sentence-level Segmented Manifest based on CTM files +class CreateSentenceSegmentedManifest(BaseParallelProcessor): + """ + Step 5: Create sentence-level segments based on CTM files. + + This processor reads CTM files generated by the NeMo Forced Aligner and creates + sentence-level segments based on punctuation patterns. It segments on words ending + with !, ?, or . (excluding numbers like 42.12) where the next segment starts with + a capital letter. 
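+    For example, "guidance." followed by "We" ends a sentence, while a "."
+    belonging to a number (e.g., "42."), a common abbreviation such as "Dr.",
+    or a "." followed by a lowercase word does not.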
+ + Features: + - Reads word-level CTM files with timing information + - Creates sentence-level segments based on punctuation + - Preserves word-level alignments within each segment + - Calculates accurate segment durations from CTM data + """ + + def __init__( + self, + input_manifest_file: str, + ctm_dir: str, + output_manifest_file: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_file = input_manifest_file + self.ctm_dir = Path(ctm_dir) + self.output_manifest_file = output_manifest_file + + def _parse_ctm_file(self, ctm_path: str) -> List[Dict[str, Any]]: + """Parse CTM file to extract word alignments.""" + alignments = [] + try: + with open(ctm_path, 'r') as f: + for line in f: + parts = line.strip().split() + if len(parts) >= 5: + # CTM format: utt_id channel start_time duration word + utt_id = parts[0] + channel = parts[1] + start_time = float(parts[2]) + duration = float(parts[3]) + word = parts[4] + end_time = start_time + duration + + alignments.append({ + 'word': word, + 'start': round(start_time, 3), + 'end': round(end_time, 3), + 'utt_id': utt_id, + 'channel': channel + }) + except Exception as e: + logger.error(f"Error parsing CTM file {ctm_path}: {e}") + + return alignments + + def _is_sentence_end(self, word: str, next_word: str = None) -> bool: + """ + Check if a word marks the end of a sentence. + + Rules: + - Word ends with !, ?, or . + - Exclude numbers like 42.12 (. within numbers) + - Exclude common abbreviations like Ms., Mr., Dr., etc. + - Next word should start with capital letter (if available) + """ + if not word: + return False + + # Check if word ends with sentence-ending punctuation + if not word.endswith(('.', '!', '?')): + return False + + # Handle exclamation and question marks - these are always sentence endings + if word.endswith(('!', '?')): + return True + + # For words ending with '.', do additional checks + if word.endswith('.'): + # Remove the final '.' 
and check if what remains is a number + word_without_dot = word[:-1] + try: + # If it's a pure number, it's likely part of a decimal + float(word_without_dot) + return False + except ValueError: + # Not a number, continue with other checks + pass + + # Check for common abbreviations (case-insensitive) + common_abbreviations = { + 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'jr', 'vs', 'etc', 'inc', 'corp', 'ltd', 'co', + 'st', 'ave', 'blvd', 'rd', 'ln', 'ct', 'pl', 'sq', 'ft', 'in', 'cm', 'mm', 'kg', 'lb', + 'oz', 'pt', 'qt', 'gal', 'mph', 'rpm', 'vol', 'no', 'pg', 'pp', 'ch', 'sec', 'min', + 'hr', 'hrs', 'am', 'pm', 'est', 'pst', 'cst', 'mst', 'utc', 'gmt', 'jan', 'feb', 'mar', + 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'mon', 'tue', 'wed', + 'thu', 'fri', 'sat', 'sun', 'dept', 'div', 'mgr', 'dir', 'pres', 'vp', 'ceo', 'cfo', + 'cto', 'coo', 'evp', 'svp', 'avp' + } + + if word_without_dot.lower() in common_abbreviations: + return False + + # If we have a next word, check if it starts with capital letter + if next_word: + return next_word[0].isupper() + + # If no next word, assume it's sentence end + return True + + def _create_sentence_segments(self, alignments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Create sentence-level segments from word alignments.""" + if not alignments: + return [] + + segments = [] + current_segment_words = [] + + for i, alignment in enumerate(alignments): + current_segment_words.append(alignment) + + # Check if this word ends a sentence + next_word = alignments[i + 1]['word'] if i + 1 < len(alignments) else None + if self._is_sentence_end(alignment['word'], next_word): + # Create segment from current words + if current_segment_words: + segment_text = ' '.join([w['word'] for w in current_segment_words]) + segment_start = current_segment_words[0]['start'] + segment_end = current_segment_words[-1]['end'] + segment_duration = round(segment_end - segment_start, 3) + + segments.append({ + 'text': segment_text, + 'start_time': segment_start, + 'end_time': segment_end, + 'duration': segment_duration, + 'alignment': current_segment_words.copy() + }) + + current_segment_words = [] + + # Handle any remaining words + if current_segment_words: + segment_text = ' '.join([w['word'] for w in current_segment_words]) + segment_start = current_segment_words[0]['start'] + segment_end = current_segment_words[-1]['end'] + segment_duration = round(segment_end - segment_start, 3) + + segments.append({ + 'text': segment_text, + 'start_time': segment_start, + 'end_time': segment_end, + 'duration': segment_duration, + 'alignment': current_segment_words.copy() + }) + + return segments + + def process_dataset_entry(self, aligned_manifest_entry: Dict[str, Any]) -> List[DataEntry]: + """Process a single aligned manifest entry to create sentence-level segments.""" + file_id = aligned_manifest_entry['file_id'] + audio_filepath = aligned_manifest_entry['audio_filepath'] + + # Find corresponding CTM file + ctm_file = self.ctm_dir / f"{file_id}.ctm" + if not ctm_file.exists(): + logger.warning(f"CTM file not found: {ctm_file}") + return [] + + # Parse CTM file + alignments = self._parse_ctm_file(str(ctm_file)) + if not alignments: + logger.warning(f"No alignments found in CTM file: {ctm_file}") + return [] + + # Create sentence segments + segments = self._create_sentence_segments(alignments) + logger.info(f"Created {len(segments)} sentence segments for file {file_id}") + + # Create manifest entries + output_entries = [] + for idx, segment in enumerate(segments): + 
manifest_entry_data = { + "audio_filepath": audio_filepath, + "duration": segment['duration'], + "text": segment['text'], + "file_id": file_id, + "segment_id": idx, + "offset": segment['start_time'], # Use offset instead of start_time + "end_time": segment['end_time'], + "alignment": segment['alignment'] + } + + output_entries.append(DataEntry(data=manifest_entry_data)) + + logger.info(f"Successfully processed {len(output_entries)} sentence segments for file {file_id}") + return output_entries + + +class NeMoForcedAligner(BaseProcessor): + """ + Step 4: Apply NeMo Forced Aligner to get word-level timestamps. + + This processor wraps the NeMo Forced Aligner (NFA) script to generate + word-level alignments for the earnings21 segments. It uses the ground + truth text from the earnings21 dataset and aligns it with the audio + to produce precise timing information. + + Features: + - Uses NeMo's dedicated forced alignment script + - Preserves ground truth text from earnings21 + - Generates word-level timestamps + - Outputs CTM files with alignment information + """ + + def __init__( + self, + input_manifest_file: str, + output_manifest_file: str, + output_dir: str, + pretrained_name: str = "/disk7/projects/models/small-parakeet/oci-N-1_G-8_config-parakeet-wav2vec-600m-am-fl-mc-mm-yt-yo_En-d0.5-rnnt_ctc-quality_LR-1e-4_wup-0_ts-2500.nemo", + device: str = "cuda", + nemo_path: str = None, + **kwargs, + ): + super().__init__(output_manifest_file=output_manifest_file, input_manifest_file=input_manifest_file, **kwargs) + self.output_dir = Path(output_dir) + self.pretrained_name = pretrained_name + self.device = device + self.nemo_path = nemo_path + + # Create output directory + self.output_dir.mkdir(parents=True, exist_ok=True) + + def process(self): + """Process the manifest using NeMo Forced Aligner script.""" + import subprocess + import json + + try: + # Find NeMo forced aligner script + if self.nemo_path: + align_script = Path(self.nemo_path) / "tools" / "nemo_forced_aligner" / "align.py" + else: + # Try to find NeMo installation + try: + import nemo + nemo_dir = Path(nemo.__file__).parent.parent + align_script = nemo_dir / "tools" / "nemo_forced_aligner" / "align.py" + except ImportError: + raise ImportError("NeMo not found. 
Please install NeMo or specify nemo_path.") + + if not align_script.exists(): + raise FileNotFoundError(f"NeMo Forced Aligner script not found at {align_script}") + + logger.info(f"Using NeMo Forced Aligner script at: {align_script}") + + # Prepare manifest for forced alignment + input_manifest = [] + with open(self.input_manifest_file, 'r') as f: + for line in f: + if line.strip(): + input_manifest.append(json.loads(line)) + + # Create temporary manifest with absolute paths + temp_manifest_path = self.output_dir / "temp_manifest_for_alignment.json" + with open(temp_manifest_path, 'w') as f: + for entry in input_manifest: + if entry.get('text', '').strip(): # Only process entries with text + # Ensure absolute path + audio_path = Path(entry['audio_filepath']) + if not audio_path.is_absolute(): + audio_path = audio_path.resolve() + + alignment_entry = { + "audio_filepath": str(audio_path), + "text": entry['text'].strip() + } + f.write(json.dumps(alignment_entry) + '\n') + + # Run NeMo Forced Aligner + # Determine if we should use pretrained_name or model_path + if self.pretrained_name.endswith('.nemo'): + # Local model file path - use model_path + model_param = f"model_path={self.pretrained_name}" + else: + # Pretrained model name - use pretrained_name + model_param = f"pretrained_name={self.pretrained_name}" + + cmd = [ + "python", str(align_script), + model_param, + f"manifest_filepath={temp_manifest_path}", + f"output_dir={self.output_dir}", + f"transcribe_device={self.device}", + f"viterbi_device={self.device}", + "batch_size=1", + 'save_output_file_formats=["ctm"]' + ] + + logger.info(f"Running NeMo Forced Aligner: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + logger.info("NeMo Forced Aligner completed successfully") + + # Process the output and merge with original manifest + output_manifest_path = self.output_dir / f"{temp_manifest_path.stem}_with_output_file_paths.json" + + if output_manifest_path.exists(): + # Load alignment results + alignment_results = [] + with open(output_manifest_path, 'r') as f: + for line in f: + if line.strip(): + alignment_results.append(json.loads(line)) + + # Create mapping from audio filepath to alignment results + alignment_map = {} + for result in alignment_results: + audio_path = result['audio_filepath'] + alignment_map[audio_path] = result + + # Merge alignments with original manifest + output_entries = [] + for entry in input_manifest: + output_entry = entry.copy() + + if entry.get('text', '').strip(): + # Find corresponding alignment + audio_path = str(Path(entry['audio_filepath']).resolve()) + if audio_path in alignment_map: + alignment_result = alignment_map[audio_path] + + # Load word-level CTM file if available + if 'word_level_ctm_filepath' in alignment_result: + ctm_path = alignment_result['word_level_ctm_filepath'] + word_alignments = self._parse_ctm_file(ctm_path) + output_entry['alignment'] = word_alignments + + # Calculate duration from alignments + if word_alignments: + output_entry['duration'] = round( + word_alignments[-1]['end'] - word_alignments[0]['start'], 3 + ) + else: + output_entry['duration'] = 0.0 + else: + output_entry['alignment'] = [] + output_entry['duration'] = 0.0 + else: + output_entry['alignment'] = [] + output_entry['duration'] = 0.0 + else: + output_entry['alignment'] = [] + output_entry['duration'] = 0.0 + + output_entries.append(output_entry) + + # Save final manifest + with open(self.output_manifest_file, 'w') as f: + for entry in output_entries: + 
f.write(json.dumps(entry) + '\n') + + logger.info(f"Saved aligned manifest to {self.output_manifest_file}") + + # Clean up temporary files + temp_manifest_path.unlink(missing_ok=True) + + else: + logger.error(f"Expected output file not found: {output_manifest_path}") + raise FileNotFoundError(f"NeMo Forced Aligner did not produce expected output") + + except subprocess.CalledProcessError as e: + logger.error(f"NeMo Forced Aligner failed: {e}") + logger.error(f"stdout: {e.stdout}") + logger.error(f"stderr: {e.stderr}") + raise + except Exception as e: + logger.error(f"Error in forced alignment: {e}") + raise + + def _parse_ctm_file(self, ctm_path: str) -> List[Dict[str, Any]]: + """Parse CTM file to extract word alignments.""" + alignments = [] + try: + with open(ctm_path, 'r') as f: + for line in f: + parts = line.strip().split() + if len(parts) >= 5: + # CTM format: utt_id channel start_time duration word + start_time = float(parts[2]) + duration = float(parts[3]) + word = parts[4] + end_time = start_time + duration + + alignments.append({ + 'word': word, + 'start': round(start_time, 3), + 'end': round(end_time, 3) + }) + except Exception as e: + logger.error(f"Error parsing CTM file {ctm_path}: {e}") + + return alignments \ No newline at end of file From 6bd3da446d31667d17e612bd2c0b9b5f4d85802a Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Jun 2025 06:16:50 -0700 Subject: [PATCH 2/3] add docs in existing con Signed-off-by: George Zelenfroind --- docs/src/sdp/existing_configs.rst | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 233a05dd..16deea65 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -376,9 +376,17 @@ YouTube Commons (YTC) .. toctree:: :hidden: + config-docs/tts/ytc/config + +Earnings (21/22) +~~~~~~~~~~~~~~~~~~~~~~ - config-docs/armenian/toloka/pipeline_start - config-docs/armenian/toloka/pipeline_validate_answers - config-docs/armenian/toloka/pipeline_get_final_res +**Dataset link:** https://huggingface.co/datasets/Revai/earnings21, https://huggingface.co/datasets/distil-whisper/earnings22 - config-docs/tts/ytc/config +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/english/earnings21/config \ No newline at end of file From 5e5c9780eb5ff8c3ccd89bfe9a4385012dc43a84 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Jun 2025 06:31:54 -0700 Subject: [PATCH 3/3] fixconflict Signed-off-by: George Zelenfroind --- docs/src/sdp/existing_configs.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 16deea65..ec21fd2b 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -376,8 +376,31 @@ YouTube Commons (YTC) .. toctree:: :hidden: + config-docs/tts/ytc/config +HiFiTTS-2 +~~~~~~~~~~~~~~~~~~~~~~~ + +**Dataset link:** https://huggingface.co/datasets/nvidia/hifitts-2 + +* **22kHz**: + `config `__ | + :doc:`documentation ` +* **44kHz**: + `config `__ | + :doc:`documentation ` +* **Bandwidth Estimation**: + `config `__ | + :doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/english/hifitts2/config_22khz + config-docs/english/hifitts2/config_44khz + config-docs/english/hifitts2/config_bandwidth + Earnings (21/22) ~~~~~~~~~~~~~~~~~~~~~~