Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/source/en/api/pipelines/helios.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

# Helios

[Helios: Real Real-Time Long Video Generation Model](https://huggingface.co/papers/) from Peking University & ByteDance & etc, by Shenghai Yuan, Yuanyang Yin, Xinwei Huang, Xiao Yang, Li Yuan.
[Helios: Real Real-Time Long Video Generation Model](https://huggingface.co/papers/) from Peking University, ByteDance, et al., by Shenghai Yuan, Yuanyang Yin, Zongjian Li, Xinwei Huang, Xiao Yang, Li Yuan.

* <u>We introduce Helios, the first 14B video generation model that runs at 17 FPS on a single NVIDIA H100 GPU and supports minute-scale generation while matching a strong baseline in quality.</u> We make breakthroughs along three key dimensions: (1) robustness to long-video drifting without commonly used anti-drift heuristics such as self-forcing, error banks, or keyframe sampling; (2) real-time generation without standard acceleration techniques such as KV-cache, causal masking, or sparse attention; and (3) training without parallelism or sharding frameworks, enabling image-diffusion-scale batch sizes while fitting up to four 14B models within 80 GB of GPU memory. Specifically, Helios is a 14B autoregressive diffusion model with a unified input representation that natively supports T2V, I2V, and V2V tasks. To mitigate drifting in long-video generation, we characterize its typical failure modes and propose simple yet effective training strategies that explicitly simulate drifting during training, while eliminating repetitive motion at its source. For efficiency, we heavily compress the historical and noisy context and reduce the number of sampling steps, yielding computational costs comparable to—or lower than—those of 1.3B video generative models. Moreover, we introduce infrastructure-level optimizations that accelerate both inference and training while reducing memory consumption. Extensive experiments demonstrate that Helios consistently outperforms prior methods on both short- and long-video generation. All the code and models are available at [this https URL](https://pku-yuangroup.github.io/Helios-Page).

Expand Down Expand Up @@ -360,10 +360,10 @@ import torch
from diffusers import AutoModel, HeliosPyramidPipeline
from diffusers.utils import export_to_video, load_video, load_image

vae = AutoModel.from_pretrained("BestWishYsh//Helios-Distilled", subfolder="vae", torch_dtype=torch.float32)
vae = AutoModel.from_pretrained("BestWishYsh/Helios-Distilled", subfolder="vae", torch_dtype=torch.float32)

pipeline = HeliosPyramidPipeline.from_pretrained(
"BestWishYsh//Helios-Distilled",
"BestWishYsh/Helios-Distilled",
vae=vae,
torch_dtype=torch.bfloat16
)
Expand Down
19 changes: 11 additions & 8 deletions src/diffusers/models/transformers/transformer_helios.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,14 @@ class HeliosTransformer3DModel(
"""

_supports_gradient_checkpointing = True
_skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
_skip_layerwise_casting_patterns = [
"patch_embedding",
"patch_short",
"patch_mid",
"patch_long",
"condition_embedder",
"norm",
]
_no_split_modules = ["HeliosTransformerBlock", "HeliosOutputNorm"]
_keep_in_fp32_modules = [
"time_embedder",
Expand Down Expand Up @@ -594,18 +601,17 @@ def __init__(

# 2. Initial Multi Term Memory Patch
self.zero_history_timestep = zero_history_timestep
self.inner_dim = inner_dim
if has_multi_term_memory_patch:
self.patch_short = nn.Conv3d(in_channels, self.inner_dim, kernel_size=patch_size, stride=patch_size)
self.patch_short = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
self.patch_mid = nn.Conv3d(
in_channels,
self.inner_dim,
inner_dim,
kernel_size=tuple(2 * p for p in patch_size),
stride=tuple(2 * p for p in patch_size),
)
self.patch_long = nn.Conv3d(
in_channels,
self.inner_dim,
inner_dim,
kernel_size=tuple(4 * p for p in patch_size),
stride=tuple(4 * p for p in patch_size),
)
Expand Down Expand Up @@ -683,7 +689,6 @@ def forward(

# 3. Process short history latents
if latents_history_short is not None and indices_latents_history_short is not None:
latents_history_short = latents_history_short.to(hidden_states)
latents_history_short = self.patch_short(latents_history_short)
_, _, _, H1, W1 = latents_history_short.shape
latents_history_short = latents_history_short.flatten(2).transpose(1, 2)
Expand All @@ -701,7 +706,6 @@ def forward(

# 4. Process mid history latents
if latents_history_mid is not None and indices_latents_history_mid is not None:
latents_history_mid = latents_history_mid.to(hidden_states)
latents_history_mid = pad_for_3d_conv(latents_history_mid, (2, 4, 4))
latents_history_mid = self.patch_mid(latents_history_mid)
latents_history_mid = latents_history_mid.flatten(2).transpose(1, 2)
Expand All @@ -721,7 +725,6 @@ def forward(

# 5. Process long history latents
if latents_history_long is not None and indices_latents_history_long is not None:
latents_history_long = latents_history_long.to(hidden_states)
latents_history_long = pad_for_3d_conv(latents_history_long, (4, 8, 8))
latents_history_long = self.patch_long(latents_history_long)
latents_history_long = latents_history_long.flatten(2).transpose(1, 2)
Expand Down
15 changes: 9 additions & 6 deletions src/diffusers/pipelines/helios/pipeline_helios.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,9 @@ def __call__(
timestep = t.expand(latents.shape[0])

latent_model_input = latents.to(transformer_dtype)
latents_history_short = latents_history_short.to(transformer_dtype)
latents_history_mid = latents_history_mid.to(transformer_dtype)
latents_history_long = latents_history_long.to(transformer_dtype)
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
Expand All @@ -824,9 +827,9 @@ def __call__(
indices_latents_history_short=indices_latents_history_short,
indices_latents_history_mid=indices_latents_history_mid,
indices_latents_history_long=indices_latents_history_long,
latents_history_short=latents_history_short.to(transformer_dtype),
latents_history_mid=latents_history_mid.to(transformer_dtype),
latents_history_long=latents_history_long.to(transformer_dtype),
latents_history_short=latents_history_short,
latents_history_mid=latents_history_mid,
latents_history_long=latents_history_long,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
Expand All @@ -841,9 +844,9 @@ def __call__(
indices_latents_history_short=indices_latents_history_short,
indices_latents_history_mid=indices_latents_history_mid,
indices_latents_history_long=indices_latents_history_long,
latents_history_short=latents_history_short.to(transformer_dtype),
latents_history_mid=latents_history_mid.to(transformer_dtype),
latents_history_long=latents_history_long.to(transformer_dtype),
latents_history_short=latents_history_short,
latents_history_mid=latents_history_mid,
latents_history_long=latents_history_long,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
Expand Down