NVIDIA · ssh-meister · Mar 18, 2024 · Mar 17, 2024 · Mar 18, 2024 · Mar 18, 2024
diff --git a/dataset_configs/youtube/de.yaml b/dataset_configs/youtube/de.yaml
@@ -0,0 +1,253 @@
+processors_to_run: "0:"  # NOTE(review): looks like a slice over `processors` ("0:" = all of them) — confirm
+base_dir: "/ws/test_subset/"  # input directory, handed to CreateInitialManifest as data_dir
+workspace_dir: "/ws/test_subset_out/"  # output root; NOTE(review): trailing "/" makes "${workspace_dir}/x" contain "//"
+
+# Target language and filtering thresholds, referenced by the processors through ${...}
+lang: de  # used by both language filters, the ASR model name, the TN command and the final manifest name
+min_duration: 1.0  # low_duration_threshold of DropHighLowDuration
+max_duration: 40.0  # AggregateSegments limit and high_duration_threshold of DropHighLowDuration
+max_wer: 75.0  # wer_threshold of DropHighWER
+max_cer: 30.0  # cer_threshold of DropHighCER
+
+
+processors:
+  # Create initial manifests based on pairs of .opus audio + .srt transcript (with ground-truth timestamps)
+  - _target_: sdp.processors.datasets.youtube.CreateInitialManifest
+    data_dir: ${base_dir}
+    output_audio_dir: ${workspace_dir}/audio/wav_samples
+    output_manifest_file: ${workspace_dir}/manifest1.json
+    chunksize: 10
+    in_memory_chunksize: 400
+
+  # Aggregate consecutive ground-truth segments into longer ones, bounded by max_duration
+  - _target_: sdp.processors.datasets.youtube.AggregateSegments
+    max_duration: ${max_duration}
+    output_segments_audio_dir: ${workspace_dir}/audio/wav_segments  # created by the processor's prepare()
+    output_manifest_file: ${workspace_dir}/manifest2.json
+
+  # Filter out samples whose duration is outside the min_duration..max_duration range (1-40 sec.)
+  - _target_: sdp.processors.DropHighLowDuration
+    output_manifest_file: ${workspace_dir}/manifest3.json
+    low_duration_threshold: ${min_duration}
+    high_duration_threshold: ${max_duration}
+
+  # Identify language of the text (read from orig_text, written to text_lang)
+  - _target_: sdp.processors.datasets.commoncrawl.TextLid
+    output_manifest_file: ${workspace_dir}/manifest4.json
+    input_text_key: orig_text
+    output_lang_key: text_lang
+    device: cuda
+    pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection"
+    drop_text_duplicates: true
+
+  - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso  # NOTE(review): presumably maps the label to an ISO code
+    output_manifest_file: ${workspace_dir}/manifest5.json
+    input_lang_key: text_lang
+    output_lang_key: text_lang  # same key as the input: the value is overwritten in place
+
+  ## Filter out samples with text in non-target language
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${workspace_dir}/manifest6.json
+    input_value_key: text_lang
+    target_value: ${lang}
+
+  # Identify language of the audio (read from audio_filepath, written to audio_lang)
+  - _target_: sdp.processors.datasets.commoncrawl.AudioLid
+    output_manifest_file: ${workspace_dir}/manifest7.json
+    input_audio_key: audio_filepath
+    output_lang_key: audio_lang
+    device: cuda
+    pretrained_model: "langid_ambernet"
+
+  ## Filter out samples with audio in non-target language
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${workspace_dir}/manifest8.json
+    input_value_key: audio_lang
+    target_value: ${lang}
+
+  # ASR inference; the model name is derived from ${lang}
+  - _target_: sdp.processors.ASRInferenceParallel
+    output_manifest_file: ${workspace_dir}/manifest9.json
+    pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc
+    batch_size: 64
+    devices: 2
+
+  ## Merge manifests: combine manifest8 (pre-ASR) with manifest9 (ASR output), matched on audio_filepath
+  - _target_: sdp.processors.datasets.youtube.MergeManifests
+    input_manifest_file: ${workspace_dir}/manifest8.json
+    input_manifest_file2: ${workspace_dir}/manifest9.json
+    output_manifest_file: ${workspace_dir}/manifest10.json
+    key_field: audio_filepath
+    fields_to_merge:  # NOTE(review): presumably {field in second manifest: name in merged manifest} — confirm
+      - {"pred_text" : "pred_text_pc"}
+
+  # Filter out samples with empty pred_text_pc
+  - _target_: sdp.processors.DropIfRegexMatch
+    output_manifest_file: ${workspace_dir}/manifest11.json
+    text_key: pred_text_pc
+    regex_patterns:
+      - "^\\s*$"  # empty or whitespace-only string
+
+  # Preprocess orig text for audio-based TN (all edits are applied to the pre_normalized field)
+  - _target_: sdp.processors.DuplicateFields
+    output_manifest_file: ${workspace_dir}/manifest12.json
+    duplicate_fields: {"orig_text": "pre_normalized"}
+
+  - _target_: sdp.processors.SubRegex  # replace unwanted punctuation and symbols
+    output_manifest_file: ${workspace_dir}/manifest13.json
+    text_key: pre_normalized
+    regex_params_list:
+      - {"pattern": '\\[hn]', "repl": " "}  # single-quoted: regex \\[hn] = literal backslash followed by h or n
+      - {"pattern": "\\s+", "repl": " "}
+      - {"pattern": "\\[", "repl": " "}
+      - {"pattern": "\\]", "repl": " "}
+      - {"pattern": "!", "repl": "."}
+      - {"pattern": "\\)", "repl": " "}
+      - {"pattern": "\\(", "repl": " "}
+      - {"pattern": "“", "repl": " "}
+      - {"pattern": "„", "repl": " "}
+      - {"pattern": "–", "repl": " "}
+      - {"pattern": ";", "repl": ","}
+      - {"pattern": "'", "repl": " "}
+      - {"pattern": "…", "repl": "."}
+      - {"pattern": "«", "repl": " "}
+      - {"pattern": "»", "repl": " "}
+      - {"pattern": "’", "repl": " "}
+      - {"pattern": "‘", "repl": " "}
+      - {"pattern": "”", "repl": " "}
+      - {"pattern": "—", "repl": " "}
+      - {"pattern": "´", "repl": " "}
+      - {"pattern": "″", "repl": " "}
+      - {"pattern": "`", "repl": " "}
+      - {"pattern": "\\|", "repl": " "}
+      - {"pattern": "−", "repl": " "}
+      - {"pattern": "‟", "repl": " "}
+      - {"pattern": "‒", "repl": " "}
+      - {"pattern": "\t", "repl": " "}  # TAB as an escape; NOTE(review): likely already consumed by the \s+ rule above
+      - {"pattern": "", "repl": " "}  # FIXME(review): empty pattern matches at every position — invisible char lost?
+      - {"pattern": "‐", "repl": " "}
+      - {"pattern": "ʻ", "repl": " "}
+      - {"pattern": "′", "repl": " "}
+      - {"pattern": "\\\\", "repl": " "}  # a literal backslash
+      - {"pattern": "^\\s?\\.\\.\\.", "repl": ""}  # drop a leading "..."
+      - {"pattern": "\\s?\\.\\.\\.$", "repl": "."}  # trailing "..." becomes a single period
+
+  ## Remove extra spaces: collapse whitespace runs, then trim both ends
+  - _target_: sdp.processors.SubRegex
+    output_manifest_file: ${workspace_dir}/manifest14.json
+    text_key: pre_normalized
+    regex_params_list:
+      - {"pattern": "\\s+", "repl": " "}
+      - {"pattern": "^\\s+", "repl": ""}
+      - {"pattern": "\\s+$", "repl": ""}
+
+  ## Drop samples that contain characters outside the allowed set below
+  - _target_: sdp.processors.DropIfNoneOfRegexMatch
+    output_manifest_file: ${workspace_dir}/manifest15.json
+    text_key: pre_normalized
+    regex_patterns:  # the pattern is anchored, so it matches only if every character is in the class
+      - "^[ !#$%&'*+,\\-.0-9:=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_abcdefghijklmnopqrstuvwxyz{}~£¥°²³µÄÖÜßäöüμω₩€/]+$"
+
+  # Run audio-based text normalization (TN) on pre_normalized, using pred_text_pc as the ASR prediction
+  - _target_: sdp.processors.datasets.commoncrawl.Subprocess
+    output_manifest_file: ${workspace_dir}/manifest16.json
+    input_manifest_arg: "--manifest"
+    output_manifest_arg: "--output_filename"
+    arg_separator: "="  # cmd below: each trailing "\" is a YAML line continuation, so cmd is one single line
+    cmd: "python /ws/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \
+        --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=pre_normalized  --manifest_asr_pred_field=pred_text_pc \
+        --cache_dir=${workspace_dir}/cache \
+        --whitelist=/ws/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv"
+
+  # Post-normalization processing (all edits are applied to the post_normalized field)
+  - _target_: sdp.processors.DuplicateFields
+    output_manifest_file: ${workspace_dir}/manifest17.json
+    duplicate_fields: {"normalized": "post_normalized"}  # NOTE(review): "normalized" presumably comes from the TN step
+
+  ## Remove extra characters from the normalized text
+  - _target_: sdp.processors.SubRegex
+    output_manifest_file: ${workspace_dir}/manifest18.json
+    text_key: post_normalized
+    regex_params_list:
+      - {"pattern": "['\\-:{}\\/]", "repl": " "}
+      - {"pattern": "!", "repl": "."}
+      - {"pattern": "\\s+", "repl": " "}
+      - {"pattern": "^\\s+", "repl": ""}
+      - {"pattern": "\\s+$", "repl": ""}
+
+  ## Remove samples with chars out of list (letters, comma, period, question mark, space)
+  - _target_: sdp.processors.DropIfNoneOfRegexMatch
+    output_manifest_file: ${workspace_dir}/manifest19.json
+    text_key: post_normalized
+    regex_patterns:
+      - "^[a-zA-ZäÄöÖüÜß,\\.?\\s]+$"
+
+  # Build "text": a lowercased, punctuation-free copy of post_normalized
+  - _target_: sdp.processors.DuplicateFields
+    output_manifest_file: ${workspace_dir}/manifest20.json
+    duplicate_fields: {post_normalized: text}
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/manifest21.json
+    text_key: text
+
+  - _target_: sdp.processors.SubRegex
+    output_manifest_file: ${workspace_dir}/manifest22.json
+    text_key: text
+    regex_params_list:
+      - {pattern: '[\.\?\,]', repl: ' '}  # period, question mark, comma -> space
+      - {pattern: '\s+', repl: ' '}  # collapse whitespace runs
+      - {pattern: '^\s+', repl: ''}  # trim the start
+      - {pattern: '\s+$', repl: ''}  # trim the end
+
+  # Build "pred_text": the same lowercased, punctuation-free form of pred_text_pc
+  - _target_: sdp.processors.DuplicateFields
+    output_manifest_file: ${workspace_dir}/manifest23.json
+    duplicate_fields: {pred_text_pc: pred_text}
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/manifest24.json
+    text_key: pred_text
+
+  - _target_: sdp.processors.SubRegex
+    output_manifest_file: ${workspace_dir}/manifest25.json
+    text_key: pred_text
+    regex_params_list:
+      - {pattern: '[\.\?\,]', repl: ' '}  # period, question mark, comma -> space
+      - {pattern: '\s+', repl: ' '}  # collapse whitespace runs
+      - {pattern: '^\s+', repl: ''}  # trim the start
+      - {pattern: '\s+$', repl: ''}  # trim the end
+
+  # Filtration: compare "text" with "pred_text" and apply the max_cer / max_wer thresholds
+  - _target_: sdp.processors.DropHighCER
+    output_manifest_file: ${workspace_dir}/manifest26.json
+    cer_threshold: ${max_cer}
+    text_key: "text"
+    pred_text_key: "pred_text"
+
+  - _target_: sdp.processors.DropHighWER
+    output_manifest_file: ${workspace_dir}/manifest27.json
+    wer_threshold: ${max_wer}
+    text_key: "text"
+    pred_text_key: "pred_text"
+
+  # Finalization: keep audio path, duration and post_normalized (which was never lowercased above)
+  - _target_: sdp.processors.KeepOnlySpecifiedFields
+    output_manifest_file: ${workspace_dir}/manifest28.json
+    fields_to_keep: ["audio_filepath", "duration", "post_normalized"]
+
+  - _target_: sdp.processors.RenameFields
+    output_manifest_file: ${workspace_dir}/manifest29.json
+    rename_fields: {"post_normalized":"text"}  # post_normalized is published as the final "text" field
+
+  - _target_: sdp.processors.datasets.commoncrawl.CopyFiles  # NOTE(review): no output_manifest_file here — confirm
+    file_field: audio_filepath
+    path_to_copy: ${workspace_dir}/clean_data/audio/
+    path_levels: 1
+
+  - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath
+    output_manifest_file: ${workspace_dir}/clean_data/${lang}_manifest.json
+    path_key: audio_filepath
+    abs_path_to_drop: ${workspace_dir}  # NOTE(review): ends with "/" (see workspace_dir) — check the resulting paths
+
+
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -74,5 +74,5 @@
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,
 )
-from sdp.processors.nemo.asr_inference import ASRInference
+from sdp.processors.nemo.asr_inference import ASRInference, ASRInferenceParallel
 from sdp.processors.nemo.pc_inference import PCInference
diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \
-    Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \
-        ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \
-        SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \
+from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, \
+    Lang2Iso, SplitByVttSentence, AudioLid, TextLid, AllVttText, TxtToVtt, \
+        ReadParquet, CreateInitialManifestCC, ASR_HF, AlignerSubprocess, \
+        SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, \
         TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8
diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py
@@ -1107,7 +1107,7 @@ def __init__(
         }
 
     def process_dataset_entry(self, data_entry):
-        data_entry[self.output_lang_key] = self.iso_m[data_entry[self.input_lang_key]]
+        data_entry[self.output_lang_key] = self.iso_m.get(data_entry[self.input_lang_key], None)
         return [DataEntry(data=data_entry)]
 
 

diff --git a/sdp/processors/datasets/youtube/__init__.py b/sdp/processors/datasets/youtube/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .create_initial_manifest import CreateInitialManifest
+from .utils import parse_srt
+from .aggregate_segments import *
+from .merge_manifests import MergeManifests
diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py
@@ -0,0 +1,71 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pydub import AudioSegment
+import os
+
+from sdp.processors.base_processor import BaseParallelProcessor
+from sdp.processors.datasets.youtube.utils import RawSegment, AggregatedSegment, get_audio_segment
+
+
+class AggregateSegments(BaseParallelProcessor):
+    def __init__(
+        self,
+        max_duration: float = 40.0,
+        crop_audio_segments: bool = True,
+        output_segments_audio_dir: str = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_duration = max_duration
+        self.crop_audio_segments = crop_audio_segments
+        self.output_segments_audio_dir = output_segments_audio_dir
+
+    def prepare(self):
+        if self.crop_audio_segments and self.output_segments_audio_dir:
+            os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True)
+
+    def process_dataset_entry(self, data_entry: dict):
+        sample_id = data_entry['sample_id']
+        segments = data_entry['segments']
+        agg_segments = []
+
+        if len(segments) == 0:
+            return agg_segments
+
+        first_segment = RawSegment(**segments[0])
+        agg_segment = AggregatedSegment(segment=first_segment, segment_id=1, sample_id=sample_id, 
+                                        output_audio_dir = self.output_segments_audio_dir)
+
+        for segment in segments[1 : ]:
+            segment = RawSegment(**segment)
+
+            if (not agg_segment.duration_match or 
+                agg_segment.duration >= self.max_duration or
+                segment.end_time - agg_segment.start_time >= self.max_duration):
+                agg_segments.append(agg_segment.to_dataentry())
+                agg_segment = AggregatedSegment(segment=segment, 
+                                                segment_id=len(agg_segments) + 1, sample_id=sample_id, 
+                                                output_audio_dir = self.output_segments_audio_dir)
+            else:
+                agg_segment.aggregate(segment)
+        else:
+            agg_segments.append(agg_segment.to_dataentry())
+
+        if self.crop_audio_segments:
+            audio = AudioSegment.from_wav(data_entry['audio_filepath'])
+            for agg_segment in agg_segments:
+                get_audio_segment(audio = audio, 
+                                  start_time = agg_segment.data['start_time'], 
+                                  end_time = agg_segment.data['end_time'], 
+                                  output_audio_filepath = agg_segment.data['audio_filepath'])
+
+        return agg_segments