Changes from all commits (73 commits)
fa7d6e0
parakeet tdt integration
Oct 13, 2025
f2b4938
Add TDT decoder support for Parakeet ASR models
lmaksym Feb 20, 2026
fa36657
Add expected outputs for TDT, small fixes.
ebezzam Feb 25, 2026
05e2e34
Separate CTC and TDT generate outputs.
ebezzam Feb 25, 2026
bb5ff33
Work with auto device, better init.
ebezzam Feb 25, 2026
9ec79b0
Test timestamps and expose token duration.
ebezzam Feb 26, 2026
33f128e
Add reproducer link.
ebezzam Feb 26, 2026
760b4b6
fix: align TDT training and decoding with NeMo implementation
lmaksym Feb 27, 2026
b33002f
revert: restore lasr generated files to original state
lmaksym Feb 27, 2026
48b39dd
warn: torchaudio rnnt_loss does not train duration head
lmaksym Feb 27, 2026
e9f23ab
Relax timestamp test, and test nits.
ebezzam Mar 2, 2026
e2b97aa
feat: TDT training
lmaksym Mar 3, 2026
6b9fc73
chore: fix cuda detection and run without patching
lmaksym Mar 3, 2026
6c879bc
Equivalent timestamp processing as Nemo, and various nits/cleanup.
ebezzam Mar 3, 2026
149e17f
Merge branch 'parakeet-tdt' of github.com:lmaksym/transformers into p…
ebezzam Mar 3, 2026
36bfa63
Simplify durations config.
ebezzam Mar 3, 2026
2df0ccc
Update training examples.
ebezzam Mar 3, 2026
388c6d3
chore: enable parallelism
lmaksym Mar 3, 2026
08b2b55
chore: performance optimization
lmaksym Mar 4, 2026
0c4e05a
fix: formatting
lmaksym Mar 4, 2026
1ddd804
Doc and testing nits
ebezzam Mar 5, 2026
f512670
Use active mask from current step, and nits.
ebezzam Mar 6, 2026
07d8e35
Better pre-allocate.
ebezzam Mar 6, 2026
fab050a
TDT has separate pad token and blank token.
ebezzam Mar 6, 2026
c438565
Merge branch 'main' into parakeet-tdt
ebezzam Mar 6, 2026
86d980c
Regenerate lasr.
ebezzam Mar 6, 2026
895c4a0
Merge branch 'parakeet-tdt' of github.com:lmaksym/transformers into p…
ebezzam Mar 6, 2026
ab21380
Style checks and nits
ebezzam Mar 7, 2026
d0141d5
Nits, put back ctc loss test
ebezzam Mar 7, 2026
f7529d4
More standard model output.
ebezzam Mar 10, 2026
77b95d7
Style
ebezzam Mar 10, 2026
94eae66
Remove compute_loss flag and allow monkey patching to tdt loss
ebezzam Mar 23, 2026
f7d4067
Update src/transformers/models/parakeet/modular_parakeet.py
ebezzam Mar 23, 2026
f75c17b
Address various comments.
ebezzam Mar 23, 2026
5a49b65
More compatible with Transformers forward/generate approach
ebezzam Mar 24, 2026
881233f
compile option for generation and decoder cache
ebezzam Mar 24, 2026
b41a8ee
Cleaner, better conventions.
ebezzam Mar 24, 2026
897753a
Merge branch 'main' into parakeet-tdt
ebezzam Mar 24, 2026
6c914db
Update with main.
ebezzam Mar 24, 2026
756cee1
doc nits
ebezzam Mar 26, 2026
f30c536
Imitate whisper for encoder outputs as input
ebezzam Mar 26, 2026
fa95fc8
Address tests and nits.
ebezzam Mar 26, 2026
5df7f28
Inherit from GenerateMixIn for get_compiled_call
ebezzam Mar 26, 2026
cd706d4
Comment nit
ebezzam Mar 26, 2026
a47ed8a
forward cleanup
eustlb Apr 15, 2026
13b68ce
generate cleanup + separate generation file
eustlb Apr 15, 2026
72c1ad0
generate: add _supported_generation_modes
eustlb Apr 15, 2026
8e23b3d
automatic init of the loss
eustlb Apr 15, 2026
1cc39fd
modular cleanups
eustlb Apr 15, 2026
531f297
use is_encoder_decoder
eustlb Apr 15, 2026
2c0f23a
timestamp processing fully from tokens + durations
eustlb Apr 15, 2026
cef6639
conversion script update
eustlb Apr 15, 2026
fd3cf9b
test update
eustlb Apr 15, 2026
e63a5bf
make
eustlb Apr 15, 2026
f9d1a4f
Merge branch 'main' into parakeet-tdt
eustlb Apr 15, 2026
43ee7cd
test update
eustlb Apr 15, 2026
c2a0f78
test update
eustlb Apr 15, 2026
1fd7ed7
ensure correct loss computation
eustlb Apr 16, 2026
7cc9d2e
kernel loss
eustlb Apr 16, 2026
e753eab
test loss integration
eustlb Apr 16, 2026
ed3fa4d
push to hub pr
eustlb Apr 16, 2026
ab66b23
integration tests to rely fully on transcripts
eustlb Apr 16, 2026
a5ba0c6
update fixtures
eustlb Apr 16, 2026
48279a6
we don't need to monkey patch with numba anymore!
eustlb Apr 16, 2026
1d7680d
fix pipeline usage
eustlb Apr 16, 2026
59ddced
nit
eustlb Apr 16, 2026
31490d1
fix usage
eustlb Apr 16, 2026
d8eb1b6
Pass through tests and examples: improve kernel fallback, update with…
ebezzam Apr 17, 2026
1f1b912
Update checkpoint
ebezzam Apr 17, 2026
9ab08d1
Merge branch 'main' into parakeet-tdt
ebezzam Apr 17, 2026
fd9f8b1
Add TDT to mapping after merge.
ebezzam Apr 17, 2026
136f676
Fix lasr generate test.
ebezzam Apr 20, 2026
833d289
Output attention mask if labels provided for computing loss.
ebezzam Apr 20, 2026
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/auto.md
@@ -225,6 +225,10 @@ The following auto classes are available for the following audio tasks.

[[autodoc]] AutoModelForCTC

### AutoModelForTDT

[[autodoc]] AutoModelForTDT

### AutoModelForSpeechSeq2Seq

[[autodoc]] AutoModelForSpeechSeq2Seq
158 changes: 137 additions & 21 deletions docs/source/en/model_doc/parakeet.md
@@ -34,15 +34,20 @@ Parakeet models, [introduced by NVIDIA NeMo](https://developer.nvidia.com/blog/p
- 1D convolution projection from encoder hidden size to vocabulary size (for optimal NeMo compatibility).
- CTC loss computation for training.
- Greedy CTC decoding for inference.
- [**ParakeetForTDT**](#parakeetfortdt): a Fast Conformer Encoder + a TDT (Token-and-Duration Transducer) decoder
- **TDT Decoder**: Jointly predicts tokens and their durations, enabling efficient decoding:
- LSTM prediction network maintains language context across token predictions.
- Joint network combines encoder and decoder outputs.
- Duration head predicts how many frames to skip, enabling fast inference (see the decoding sketch below).
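
A minimal, self-contained sketch of this greedy loop with toy weights is below. The module names, sizes, duration set, and per-frame emission cap are illustrative assumptions rather than the library's API; actual inference runs through `ParakeetForTDT.generate`.

```py
import torch
import torch.nn as nn

torch.manual_seed(0)
VOCAB, HIDDEN = 8, 16
BLANK = VOCAB                                     # blank id placed after the vocabulary (assumed)
DURATIONS = [0, 1, 2, 3, 4]                       # frame skips the duration head can choose
MAX_SYMBOLS = 10                                  # cap emissions per frame so decoding always advances

embed = nn.Embedding(VOCAB + 1, HIDDEN)
pred_net = nn.LSTMCell(HIDDEN, HIDDEN)            # stand-in for the LSTM prediction network
joint = nn.Linear(2 * HIDDEN, (VOCAB + 1) + len(DURATIONS))  # token logits + duration logits

encoder_out = torch.randn(50, HIDDEN)             # (time, hidden) from the Fast Conformer encoder
h = c = dec_h = torch.zeros(1, HIDDEN)
tokens, t, per_frame = [], 0, 0
while t < encoder_out.size(0):
    logits = joint(torch.cat([encoder_out[t : t + 1], dec_h], dim=-1))
    token = logits[:, : VOCAB + 1].argmax(-1).item()
    skip = DURATIONS[logits[:, VOCAB + 1 :].argmax(-1).item()]
    if token == BLANK:
        t += max(skip, 1)                         # blank: skip ahead, never stall
        per_frame = 0
    else:
        tokens.append(token)
        h, c = pred_net(embed(torch.tensor([token])), (h, c))
        dec_h = h                                 # language context advances only on real tokens
        per_frame += 1
        if skip > 0 or per_frame >= MAX_SYMBOLS:  # duration 0 keeps emitting on the same frame
            t += max(skip, 1)
            per_frame = 0
print(tokens)
```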

The original implementation can be found in [NVIDIA NeMo](https://github.com/NVIDIA/NeMo).
Model checkpoints can be found under [the NVIDIA organization](https://huggingface.co/nvidia/models?search=parakeet).

This model was contributed by [Nithin Rao Koluguri](https://huggingface.co/nithinraok), [Eustache Le Bihan](https://huggingface.co/eustlb) and [Eric Bezzam](https://huggingface.co/bezzam).
This model was contributed by [Nithin Rao Koluguri](https://huggingface.co/nithinraok), [Eustache Le Bihan](https://huggingface.co/eustlb), [Eric Bezzam](https://huggingface.co/bezzam), [Maksym Lypivskyi](https://huggingface.co/MaksL), and [Hainan Xu](https://huggingface.co/hainanx).

## Usage

### Basic usage
### `ParakeetForCTC` usage

<hfoptions id="usage">
<hfoption id="Pipeline">
@@ -53,6 +58,7 @@ from transformers import pipeline
pipe = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-1.1b")
out = pipe("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")
print(out)
# {'text': 'yesterday it was thirty five degrees in barcelona but today the temperature will go down to minus twenty degrees'}
```

</hfoption>
@@ -61,12 +67,10 @@ print(out)
```py
from transformers import AutoModelForCTC, AutoProcessor
from datasets import load_dataset, Audio
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b")
model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device)
model_id = "nvidia/parakeet-ctc-1.1b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCTC.from_pretrained(model_id, dtype="auto", device_map="auto")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
@@ -75,7 +79,80 @@ speech_samples = [el['array'] for el in ds["audio"][:5]]
inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate)
inputs.to(model.device, dtype=model.dtype)
outputs = model.generate(**inputs)
print(processor.batch_decode(outputs))
print(processor.decode(outputs))
```

</hfoption>
</hfoptions>

### `ParakeetForTDT` usage

<hfoptions id="tdt-usage">
<hfoption id="Pipeline">

Parakeet TDT transcripts include casing and punctuation, and the model can also produce token-level timestamps.

```py
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="nvidia/parakeet-tdt-0.6b-v3")
out = pipe("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")
print(out)
# {'text': 'Yesterday it was 35 degrees in Barcelona, but today the temperature will go down to minus 20 degrees.'}
```

</hfoption>
<hfoption id="AutoModel">

```py
from transformers import AutoModelForTDT, AutoProcessor
from datasets import load_dataset, Audio

model_id = "nvidia/parakeet-tdt-0.6b-v3"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForTDT.from_pretrained(model_id, dtype="auto", device_map="auto")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
speech_samples = [el['array'] for el in ds["audio"][:5]]

inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate)
inputs.to(model.device, dtype=model.dtype)
output = model.generate(**inputs, return_dict_in_generate=True)
print(processor.decode(output.sequences, skip_special_tokens=True))
```

</hfoption>
<hfoption id="Timestamping">

```py
from datasets import Audio, load_dataset
from transformers import AutoModelForTDT, AutoProcessor

model_id = "nvidia/parakeet-tdt-0.6b-v3"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForTDT.from_pretrained(model_id, dtype="auto", device_map="auto")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
speech_samples = [el['array'] for el in ds["audio"][:1]]

inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate)
inputs.to(model.device, dtype=model.dtype)
output = model.generate(**inputs, return_dict_in_generate=True)
decoded_output, decoded_timestamps = processor.decode(
output.sequences,
durations=output.durations,
skip_special_tokens=True,
)
print("Transcription:", decoded_output)
print("\nTimestamped tokens:", decoded_timestamps)

"""
Transcription: ['mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.']

Timestamped tokens: [[{'token': 'm', 'start': 0.24, 'end': 0.48}, {'token': 'ister', 'start': 0.48, 'end': 0.64}, {'token': 'Qu', 'start': 0.64, 'end': 0.88}, {'token': 'il', 'start': 0.88, 'end': 1.12}, {'token': 'ter', 'start': 1.12, 'end': 1.36}, {'token': 'is', 'start': 1.36, 'end': 1.44}, {'token': 'the', 'start': 1.44, 'end': 1.6}, {'token': 'ap', 'start': 1.6, 'end': 1.76}, {'token': 'ost', 'start': 1.76, 'end': 1.92}, {'token': 'le', 'start': 2.0, 'end': 2.16}, {'token': 'of', 'start': 2.16, 'end': 2.24}, {'token': 'the', 'start': 2.24, 'end': 2.4}, {'token': 'mid', 'start': 2.4, 'end': 2.48}, {'token': 'd', 'start': 2.48, 'end': 2.56}, {'token': 'le', 'start': 2.56, 'end': 2.64}, {'token': 'clas', 'start': 2.72, 'end': 2.88}, {'token': 's', 'start': 2.88, 'end': 3.04}, {'token': 'es', 'start': 3.04, 'end': 3.12}, {'token': ',', 'start': 3.12, 'end': 3.12}, {'token': 'and', 'start': 3.2800000000000002, 'end': 3.44}, {'token': 'we', 'start': 3.44, 'end': 3.6}, {'token': 'are', 'start': 3.6, 'end': 3.7600000000000002}, {'token': 'gl', 'start': 3.7600000000000002, 'end': 3.92}, {'token': 'ad', 'start': 3.92, 'end': 4.08}, {'token': 'to', 'start': 4.08, 'end': 4.24}, {'token': 'wel', 'start': 4.24, 'end': 4.4}, {'token': 'c', 'start': 4.4, 'end': 4.48}, {'token': 'ome', 'start': 4.48, 'end': 4.72}, {'token': 'his', 'start': 4.72, 'end': 4.96}, {'token': 'gos', 'start': 4.96, 'end': 5.12}, {'token': 'pel', 'start': 5.36, 'end': 5.6000000000000005}, {'token': '.', 'start': 5.6000000000000005, 'end': 5.6000000000000005}]]
"""
```
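
For intuition, the start/end values above plausibly come from accumulating the predicted durations (counted in encoder frames) and scaling by the frame stride, roughly 0.08 s for Parakeet. The sketch below reuses `output` from the example above; the blank/pad filtering and the exact stride are assumptions, and `processor.decode` remains the supported way to obtain these offsets.

```py
# Assumed arithmetic, for illustration only; prefer processor.decode(...) as shown above.
frame_seconds = 0.08                              # ~10 ms hop x 8x subsampling (assumed)
tokenizer = processor.tokenizer
offsets, t = [], 0
for token_id, dur in zip(output.sequences[0].tolist(), output.durations[0].tolist()):
    start = t * frame_seconds
    t += dur                                      # durations advance the encoder-frame cursor
    if token_id != tokenizer.pad_token_id:        # skip blank/pad decoding steps (assumed check)
        offsets.append({"token": tokenizer.decode([token_id]), "start": start, "end": t * frame_seconds})
print(offsets[:3])
```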

</hfoption>
@@ -136,58 +213,90 @@ print("First generation - compiling...")
# Generate with the compiled model
with TimerContext("First generation"):
outputs = model.generate(**inputs)
print(processor.batch_decode(outputs))
print(processor.decode(outputs))

inputs = processor(speech_samples[1], **processor_kwargs)
inputs.to(device, dtype=model.dtype)
print("\n" + "="*50)
print("Second generation - recording CUDA graphs...")
with TimerContext("Second generation"):
outputs = model.generate(**inputs)
print(processor.batch_decode(outputs))
print(processor.decode(outputs))

inputs = processor(speech_samples[2], **processor_kwargs)
inputs.to(device, dtype=model.dtype)
print("\n" + "="*50)
print("Third generation - fast !!!")
with TimerContext("Third generation"):
outputs = model.generate(**inputs)
print(processor.batch_decode(outputs))
print(processor.decode(outputs))

inputs = processor(speech_samples[3], **processor_kwargs)
inputs.to(device, dtype=model.dtype)
print("\n" + "="*50)
print("Fourth generation - still fast !!!")
with TimerContext("Fourth generation"):
outputs = model.generate(**inputs)
print(processor.batch_decode(outputs))
print(processor.decode(outputs))
```

### Training
### CTC Training

```python
import torch
from datasets import Audio, load_dataset
from transformers import AutoModelForCTC, AutoProcessor
from datasets import load_dataset, Audio

model_id = "nvidia/parakeet-ctc-1.1b"
NUM_SAMPLES = 5

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCTC.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
model.train()

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
speech_samples = [el['array'] for el in ds["audio"][:NUM_SAMPLES]]
text_samples = ds["text"][:NUM_SAMPLES]

# passing `text` to the processor will prepare inputs' `labels` key
inputs = processor(audio=speech_samples, text=text_samples, sampling_rate=processor.feature_extractor.sampling_rate)
inputs.to(device=model.device, dtype=model.dtype)

outputs = model(**inputs)
print("Loss:", outputs.loss.item())
outputs.loss.backward()
```

### TDT Training

```py
from datasets import Audio, load_dataset
import torch
from transformers import AutoModelForTDT, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "nvidia/parakeet-tdt-0.6b-v3"
NUM_SAMPLES = 4

processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b")
model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device)
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForTDT.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
model.train()

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
speech_samples = [el['array'] for el in ds["audio"][:5]]
text_samples = [el for el in ds["text"][:5]]
speech_samples = [el['array'] for el in ds["audio"][:NUM_SAMPLES]]
text_samples = ds["text"][:NUM_SAMPLES]

# passing `text` to the processor will prepare inputs' `labels` key
inputs = processor(audio=speech_samples, text=text_samples, sampling_rate=processor.feature_extractor.sampling_rate)
inputs.to(device, dtype=model.dtype)
inputs.to(device=model.device, dtype=model.dtype)

outputs = model(**inputs)
print("Loss:", outputs.loss.item())
outputs.loss.backward()
```


## ParakeetTokenizer

[[autodoc]] ParakeetTokenizer
@@ -201,7 +310,6 @@

[[autodoc]] ParakeetProcessor
- __call__
- batch_decode
- decode

## ParakeetEncoderConfig
@@ -212,10 +320,18 @@

[[autodoc]] ParakeetCTCConfig

## ParakeetTDTConfig

[[autodoc]] ParakeetTDTConfig

## ParakeetEncoder

[[autodoc]] ParakeetEncoder

## ParakeetForCTC

[[autodoc]] ParakeetForCTC

## ParakeetForTDT

[[autodoc]] ParakeetForTDT
6 changes: 4 additions & 2 deletions src/transformers/convert_slow_tokenizer.py
@@ -735,7 +735,8 @@ def tokenizer(self, proto):
)

elif model_type == 2:
_, merges = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
result = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(None)
merges = result["merges"]
bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(
@@ -1842,7 +1843,8 @@ def __init__(self, vocab_file=None, *args):
def tokenizer(self, proto):
vocab_scores = self.vocab(proto)

_, merges = self.SpmExtractor(self.vocab_file).extract(vocab_scores)
result = self.SpmExtractor(self.vocab_file).extract(None)
merges = result["merges"]
bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(
7 changes: 7 additions & 0 deletions src/transformers/generation/utils.py
@@ -1459,6 +1459,13 @@ def compute_transition_scores(
def _validate_generation_mode(
self: "GenerativePreTrainedModel", generation_mode, generation_config, generation_mode_kwargs
):
supported_modes = getattr(self, "_supported_generation_modes", None)
if supported_modes is not None and generation_mode not in supported_modes:
raise ValueError(
f"{self.__class__.__name__} only supports {supported_modes}, but got "
f"generation mode '{generation_mode}'."
)

Comment on lines +1462 to +1468 (Contributor):
added this to be able to do

```py
class ParakeetForTDT(ParakeetPreTrainedModel, ParakeetTDTGenerationMixin):
    _supported_generation_modes = [GenerationMode.GREEDY_SEARCH]
```

if generation_mode == GenerationMode.BEAM_SEARCH and "streamer" in generation_mode_kwargs:
raise ValueError(
"`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
7 changes: 5 additions & 2 deletions src/transformers/integrations/hub_kernels.py
@@ -286,6 +286,7 @@ def register_kernel_mapping_transformers(*args, **kwargs):
"falcon_mamba-ssm": {"repo_id": "kernels-community/mamba-ssm", "version": 1},
"finegrained-fp8": {"repo_id": "kernels-community/finegrained-fp8", "version": 1},
"deep-gemm": {"repo_id": "kernels-community/deep-gemm", "version": 1},
"tdt-loss": {"repo_id": "eustlb/tdt-loss", "revision": "v1"},
}
Comment on lines 288 to 290 (Contributor):

@ErikKaum pinging you here because your YouTube kernel tutorial helped a lot for this 😊 What are the next steps to move my tdt kernel from my repo to kernels-community and compile it for other environments?

(Contributor):

@eustlb thanks for creating the kernel! By the way, I changed `"version": 1` to `"revision": "v1"`, as your kernel lives in a `v1` branch; otherwise it wasn't loading as expected, since the main branch is empty.

And maybe we also need to add the source to the main branch? I was a bit confused about where the content was at first 😝

I guess @ErikKaum will have the best-practice tips!

(Contributor):

Here I just used the same convention as for the other hub kernels, where `"version": 1` corresponds to a `v1` branch, so I am not so sure about changing `"version": 1` to `"revision": "v1"`.

_KERNEL_MODULE_MAPPING: dict[str, ModuleType | None] = {}
@@ -372,10 +373,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
repo_id = _HUB_KERNEL_MAPPING[kernel_name]["repo_id"]
revision = _HUB_KERNEL_MAPPING[kernel_name].get("revision", None)
version = _HUB_KERNEL_MAPPING[kernel_name].get("version", None)
kernel = get_kernel(repo_id, revision=revision, version=version)
# Since we only read from `_HUB_KERNEL_MAPPING`, we can allow all kernels
kernel = get_kernel(repo_id, revision=revision, version=version, allow_all_kernels=True)
Comment on lines +376 to +377 (Contributor):

Can we hardcode `allow_all_kernels=True`, since we only read kernels from the library-defined `_HUB_KERNEL_MAPPING`?
mapping[kernel_name] = kernel
except FileNotFoundError:
except FileNotFoundError as e:
mapping[kernel_name] = None
logger.warning_once(f"Failed to load kernel {kernel_name}: {e}")
(Contributor):

Adding a helpful error message; otherwise the kernel may fail to load without notifying the user, e.g. due to a different Torch version. For example, it will now print:

[transformers] Failed to load kernel tdt-loss: Cannot find a build variant for this system in eustlb/tdt-loss (revision: v1). Available variants: torch211-cxx11-cu128-x86_64-linux

except AssertionError:
# Happens when torch is built without an accelerator backend; fall back to slow path.
mapping[kernel_name] = None