Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/source/en/api/pipelines/helios.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

# Helios

[Helios: Real Real-Time Long Video Generation Model](https://huggingface.co/papers/) from Peking University & ByteDance & etc, by Shenghai Yuan, Yuanyang Yin, Xinwei Huang, Xiao Yang, Li Yuan.
[Helios: Real Real-Time Long Video Generation Model](https://huggingface.co/papers/) from Peking University, ByteDance, et al., by Shenghai Yuan, Yuanyang Yin, Zongjian Li, Xinwei Huang, Xiao Yang, Li Yuan.

* <u>We introduce Helios, the first 14B video generation model that runs at 17 FPS on a single NVIDIA H100 GPU and supports minute-scale generation while matching a strong baseline in quality.</u> We make breakthroughs along three key dimensions: (1) robustness to long-video drifting without commonly used anti-drift heuristics such as self-forcing, error banks, or keyframe sampling; (2) real-time generation without standard acceleration techniques such as KV-cache, causal masking, or sparse attention; and (3) training without parallelism or sharding frameworks, enabling image-diffusion-scale batch sizes while fitting up to four 14B models within 80 GB of GPU memory. Specifically, Helios is a 14B autoregressive diffusion model with a unified input representation that natively supports T2V, I2V, and V2V tasks. To mitigate drifting in long-video generation, we characterize its typical failure modes and propose simple yet effective training strategies that explicitly simulate drifting during training, while eliminating repetitive motion at its source. For efficiency, we heavily compress the historical and noisy context and reduce the number of sampling steps, yielding computational costs comparable to—or lower than—those of 1.3B video generative models. Moreover, we introduce infrastructure-level optimizations that accelerate both inference and training while reducing memory consumption. Extensive experiments demonstrate that Helios consistently outperforms prior methods on both short- and long-video generation. All the code and models are available at [this https URL](https://pku-yuangroup.github.io/Helios-Page).

Expand Down Expand Up @@ -360,10 +360,10 @@ import torch
from diffusers import AutoModel, HeliosPyramidPipeline
from diffusers.utils import export_to_video, load_video, load_image

vae = AutoModel.from_pretrained("BestWishYsh//Helios-Distilled", subfolder="vae", torch_dtype=torch.float32)
vae = AutoModel.from_pretrained("BestWishYsh/Helios-Distilled", subfolder="vae", torch_dtype=torch.float32)

pipeline = HeliosPyramidPipeline.from_pretrained(
"BestWishYsh//Helios-Distilled",
"BestWishYsh/Helios-Distilled",
vae=vae,
torch_dtype=torch.bfloat16
)
Expand Down
19 changes: 11 additions & 8 deletions src/diffusers/models/transformers/transformer_helios.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,14 @@ class HeliosTransformer3DModel(
"""

_supports_gradient_checkpointing = True
_skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
_skip_layerwise_casting_patterns = [
"patch_embedding",
"patch_short",
"patch_mid",
"patch_long",
"condition_embedder",
"norm",
]
_no_split_modules = ["HeliosTransformerBlock", "HeliosOutputNorm"]
_keep_in_fp32_modules = [
"time_embedder",
Expand Down Expand Up @@ -594,18 +601,17 @@ def __init__(

# 2. Initial Multi Term Memory Patch
self.zero_history_timestep = zero_history_timestep
self.inner_dim = inner_dim
if has_multi_term_memory_patch:
self.patch_short = nn.Conv3d(in_channels, self.inner_dim, kernel_size=patch_size, stride=patch_size)
self.patch_short = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
self.patch_mid = nn.Conv3d(
in_channels,
self.inner_dim,
inner_dim,
kernel_size=tuple(2 * p for p in patch_size),
stride=tuple(2 * p for p in patch_size),
)
self.patch_long = nn.Conv3d(
in_channels,
self.inner_dim,
inner_dim,
kernel_size=tuple(4 * p for p in patch_size),
stride=tuple(4 * p for p in patch_size),
)
Expand Down Expand Up @@ -683,7 +689,6 @@ def forward(

# 3. Process short history latents
if latents_history_short is not None and indices_latents_history_short is not None:
latents_history_short = latents_history_short.to(hidden_states)
latents_history_short = self.patch_short(latents_history_short)
_, _, _, H1, W1 = latents_history_short.shape
latents_history_short = latents_history_short.flatten(2).transpose(1, 2)
Expand All @@ -701,7 +706,6 @@ def forward(

# 4. Process mid history latents
if latents_history_mid is not None and indices_latents_history_mid is not None:
latents_history_mid = latents_history_mid.to(hidden_states)
latents_history_mid = pad_for_3d_conv(latents_history_mid, (2, 4, 4))
latents_history_mid = self.patch_mid(latents_history_mid)
latents_history_mid = latents_history_mid.flatten(2).transpose(1, 2)
Expand All @@ -721,7 +725,6 @@ def forward(

# 5. Process long history latents
if latents_history_long is not None and indices_latents_history_long is not None:
latents_history_long = latents_history_long.to(hidden_states)
latents_history_long = pad_for_3d_conv(latents_history_long, (4, 8, 8))
latents_history_long = self.patch_long(latents_history_long)
latents_history_long = latents_history_long.flatten(2).transpose(1, 2)
Expand Down
15 changes: 9 additions & 6 deletions src/diffusers/pipelines/helios/pipeline_helios.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,9 @@ def __call__(
timestep = t.expand(latents.shape[0])

latent_model_input = latents.to(transformer_dtype)
latents_history_short = latents_history_short.to(transformer_dtype)
latents_history_mid = latents_history_mid.to(transformer_dtype)
latents_history_long = latents_history_long.to(transformer_dtype)
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
Expand All @@ -824,9 +827,9 @@ def __call__(
indices_latents_history_short=indices_latents_history_short,
indices_latents_history_mid=indices_latents_history_mid,
indices_latents_history_long=indices_latents_history_long,
latents_history_short=latents_history_short.to(transformer_dtype),
latents_history_mid=latents_history_mid.to(transformer_dtype),
latents_history_long=latents_history_long.to(transformer_dtype),
latents_history_short=latents_history_short,
latents_history_mid=latents_history_mid,
latents_history_long=latents_history_long,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
Expand All @@ -841,9 +844,9 @@ def __call__(
indices_latents_history_short=indices_latents_history_short,
indices_latents_history_mid=indices_latents_history_mid,
indices_latents_history_long=indices_latents_history_long,
latents_history_short=latents_history_short.to(transformer_dtype),
latents_history_mid=latents_history_mid.to(transformer_dtype),
latents_history_long=latents_history_long.to(transformer_dtype),
latents_history_short=latents_history_short,
latents_history_mid=latents_history_mid,
latents_history_long=latents_history_long,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
Expand Down