Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(tests/ltx_parity)
endif()

set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
Expand Down
126 changes: 126 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,10 @@ ArgOptions SDContextParams::get_options() {
"--qwen2vl_vision",
"alias of --llm_vision. Deprecated.",
&llm_vision_path},
{"",
"--gemma-tokenizer",
"path to Gemma's tokenizer.json (HF format). Required for LTX-2 text conditioning.",
&gemma_tokenizer_path},
{"",
"--diffusion-model",
"path to the standalone diffusion model",
Expand Down Expand Up @@ -376,6 +380,25 @@ ArgOptions SDContextParams::get_options() {
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&chroma_t5_mask_pad},
{"",
"--fit-target",
"auto-fit: MiB of free memory to leave on each GPU (default: 512)",
&auto_fit_target_mb},
{"",
"--fit-compute-reserve-dit",
"auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
"(default: 2048, 0 keeps the built-in default)",
&auto_fit_compute_reserve_dit_mb},
{"",
"--fit-compute-reserve-vae",
"auto-fit: MiB reserved on the VAE's GPU for its compute buffer "
"(default: 1024, 0 keeps the built-in default)",
&auto_fit_compute_reserve_vae_mb},
{"",
"--fit-compute-reserve-cond",
"auto-fit: MiB reserved on the conditioner's GPU for its compute "
"buffer (default: 512, 0 keeps the built-in default)",
&auto_fit_compute_reserve_cond_mb},
};

options.float_options = {};
Expand Down Expand Up @@ -445,6 +468,27 @@ ArgOptions SDContextParams::get_options() {
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &chroma_use_t5_mask},
{"",
"--auto-fit",
"automatically pick DiT/VAE/Conditioner device placements based on "
"free GPU memory (default ON; priority: DiT+compute > VAE > "
"Conditioner; overflow goes to CPU or DiT-params-offload mode)",
true, &auto_fit},
{"",
"--no-auto-fit",
"disable auto-fit and use the explicit placement flags / env vars "
"(--clip-on-cpu, --vae-on-cpu, SD_CUDA_DEVICE*, etc.)",
false, &auto_fit},
{"",
"--no-tensor-split",
"disable auto tensor split: keep the DiT on a single GPU even when "
"more than one CUDA device is detected. SD_CUDA_TENSOR_SPLIT env "
"still wins when set.",
false, &auto_tensor_split},
{"",
"--fit-dry-run",
"auto-fit: print the computed plan and exit without loading models",
true, &auto_fit_dry_run},
};

auto on_type_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -517,6 +561,12 @@ ArgOptions SDContextParams::get_options() {
return 1;
};

// Handler for --no-lazy-load: switch off deferred weight loading for both the
// DiT and the conditioner LLM in one shot. Consumes no extra argv tokens, so
// it reports 0 arguments eaten.
auto on_no_lazy_load_arg = [&](int /*argc*/, const char** /*argv*/, int /*index*/) {
    lazy_load_cond = false;
    lazy_load_dit  = false;
    return 0;
};

options.manual_options = {
{"",
"--type",
Expand All @@ -543,6 +593,12 @@ ArgOptions SDContextParams::get_options() {
"but it usually offers faster inference speed and, in some cases, lower memory usage. "
"The at_runtime mode, on the other hand, is exactly the opposite.",
on_lora_apply_mode_arg},
{"",
"--no-lazy-load",
"disable lazy load of DiT and conditioner-LLM weights (default ON). "
"Lazy load defers per-component allocation+read until first compute() "
"so the working set never holds all components resident.",
on_no_lazy_load_arg},
};

return options;
Expand Down Expand Up @@ -638,6 +694,7 @@ std::string SDContextParams::to_string() const {
<< " t5xxl_path: \"" << t5xxl_path << "\",\n"
<< " llm_path: \"" << llm_path << "\",\n"
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " gemma_tokenizer_path: \"" << gemma_tokenizer_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n"
Expand Down Expand Up @@ -693,6 +750,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
t5xxl_path.c_str(),
llm_path.c_str(),
llm_vision_path.c_str(),
gemma_tokenizer_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
vae_path.c_str(),
Expand Down Expand Up @@ -727,6 +785,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
auto_fit,
auto_fit_target_mb,
auto_fit_dry_run,
auto_fit_compute_reserve_dit_mb,
auto_fit_compute_reserve_vae_mb,
auto_fit_compute_reserve_cond_mb,
lazy_load_dit,
lazy_load_cond,
auto_tensor_split,
};
return sd_ctx_params;
}
Expand Down Expand Up @@ -841,6 +908,18 @@ ArgOptions SDGenerationParams::get_options() {
"--guidance",
"distilled guidance scale for models with guidance input (default: 3.5)",
&sample_params.guidance.distilled_guidance},
{"",
"--rescale-scale",
"CFG-rescale to combat oversaturation (default: 0; LTX-2.3 expects 0.7)",
&sample_params.guidance.rescale_scale},
{"",
"--stg-scale",
"Spatio-Temporal Guidance scale (default: 0; LTX-2.3 expects 1.0 with --stg-blocks [28])",
&sample_params.guidance.stg_scale},
{"",
"--high-noise-stg-scale",
"(high noise) Spatio-Temporal Guidance scale (default: 0)",
&high_noise_sample_params.guidance.stg_scale},
{"",
"--slg-scale",
"skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
Expand Down Expand Up @@ -1042,6 +1121,36 @@ ArgOptions SDGenerationParams::get_options() {
return 1;
};

// Parse a comma/space-separated list of ints, optionally wrapped in square
// brackets (e.g. "[28]", "1,2 3"). On success writes the values to `out` and
// returns true; on any malformed or out-of-range token returns false and
// leaves `out` untouched. An empty input string is an error; "[]" yields an
// empty list (which downstream treats as "STG disabled").
auto parse_int_list = [](std::string s, std::vector<int>& out) -> bool {
    if (s.empty()) return false;
    if (s.front() == '[') s.erase(0, 1);
    if (!s.empty() && s.back() == ']') s.pop_back();
    std::regex regex("[, ]+");
    std::sregex_token_iterator iter(s.begin(), s.end(), regex, -1);
    std::sregex_token_iterator end;
    std::vector<int> tmp;
    for (auto it = iter; it != end; ++it) {
        std::string token = *it;
        if (token.empty()) continue;
        try {
            tmp.push_back(std::stoi(token));
        } catch (const std::exception&) {
            // std::stoi throws std::invalid_argument for non-numeric tokens
            // AND std::out_of_range for values that overflow int; catching
            // only the former let e.g. "--stg-blocks 99999999999" escape the
            // parser as an uncaught exception. Treat both as a parse failure.
            return false;
        }
    }
    out = std::move(tmp);
    return true;
};

// Handler for --stg-blocks: parse the next argv token as an int list into
// stg_blocks. Returns 1 (one token consumed) on success, -1 on a missing or
// unparseable argument.
auto on_stg_blocks_arg = [&](int argc, const char** argv, int index) {
    ++index;
    if (index >= argc) {
        return -1;
    }
    return parse_int_list(argv[index], stg_blocks) ? 1 : -1;
};
// Handler for --high-noise-stg-blocks: same parse as --stg-blocks, but the
// result lands in high_noise_stg_blocks. Returns 1 on success, -1 on error.
auto on_high_noise_stg_blocks_arg = [&](int argc, const char** argv, int index) {
    ++index;
    if (index >= argc) {
        return -1;
    }
    return parse_int_list(argv[index], high_noise_stg_blocks) ? 1 : -1;
};

auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
Expand Down Expand Up @@ -1209,6 +1318,14 @@ ArgOptions SDGenerationParams::get_options() {
"--high-noise-skip-layers",
"(high noise) layers to skip for SLG steps (default: [7,8,9])",
on_high_noise_skip_layers_arg},
{"",
"--stg-blocks",
"blocks for STG perturbed pass (LTX-2.3 default: [28]). Empty disables STG.",
on_stg_blocks_arg},
{"",
"--high-noise-stg-blocks",
"(high noise) blocks for STG perturbed pass.",
on_high_noise_stg_blocks_arg},
{"-r",
"--ref-image",
"reference image for Flux Kontext models (can be used multiple times)",
Expand Down Expand Up @@ -1932,6 +2049,10 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
sample_params.guidance.slg.layer_count = skip_layers.size();
high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data();
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.guidance.stg_blocks = stg_blocks.empty() ? nullptr : stg_blocks.data();
sample_params.guidance.stg_blocks_count = stg_blocks.size();
high_noise_sample_params.guidance.stg_blocks = high_noise_stg_blocks.empty() ? nullptr : high_noise_stg_blocks.data();
high_noise_sample_params.guidance.stg_blocks_count = high_noise_stg_blocks.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
Expand Down Expand Up @@ -1991,6 +2112,10 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
sample_params.guidance.slg.layer_count = skip_layers.size();
high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data();
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.guidance.stg_blocks = stg_blocks.empty() ? nullptr : stg_blocks.data();
sample_params.guidance.stg_blocks_count = stg_blocks.size();
high_noise_sample_params.guidance.stg_blocks = high_noise_stg_blocks.empty() ? nullptr : high_noise_stg_blocks.data();
high_noise_sample_params.guidance.stg_blocks_count = high_noise_stg_blocks.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
Expand All @@ -2012,6 +2137,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = static_cast<float>(fps);
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
Expand Down
24 changes: 24 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ struct SDContextParams {
std::string t5xxl_path;
std::string llm_path;
std::string llm_vision_path;
std::string gemma_tokenizer_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string vae_path;
Expand Down Expand Up @@ -127,6 +128,25 @@ struct SDContextParams {

bool qwen_image_zero_cond_t = false;

// Auto-fit: pick DiT/VAE/Conditioner device placements from free GPU memory.
// Default ON; pass --no-auto-fit to opt out.
bool auto_fit = true;
int auto_fit_target_mb = 512;
bool auto_fit_dry_run = false;
int auto_fit_compute_reserve_dit_mb = 0; // 0 = use header default
int auto_fit_compute_reserve_vae_mb = 0;
int auto_fit_compute_reserve_cond_mb = 0;

// Lazy load: defer DiT and conditioner-LLM weight allocation+read until
// the first compute(). Default ON; pass --no-lazy-load to opt out.
bool lazy_load_dit = true;
bool lazy_load_cond = true;

// Auto tensor split: when >1 CUDA device is detected, split DiT row-wise
// across all GPUs by free-VRAM ratio. Default ON; pass --no-tensor-split
// to opt out. SD_CUDA_TENSOR_SPLIT env still wins when set.
bool auto_tensor_split = true;

prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;

Expand Down Expand Up @@ -168,6 +188,10 @@ struct SDGenerationParams {
sd_sample_params_t high_noise_sample_params;
std::vector<int> skip_layers = {7, 8, 9};
std::vector<int> high_noise_skip_layers = {7, 8, 9};
// STG (Spatio-Temporal Guidance) blocks (LTX-2.3 default: [28]). Empty means
// STG disabled even if --stg-scale is set.
std::vector<int> stg_blocks = {};
std::vector<int> high_noise_stg_blocks = {};

std::vector<float> custom_sigmas;

Expand Down
20 changes: 20 additions & 0 deletions ggml-patch.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index cc80eb3f..dd79d9c0 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -865,7 +865,14 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
}

static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+ // Views: the view's storage comes from view_src, so we don't allocate
+ // anything on the split buffer for it. The split buffer's per-device
+ // shards aren't applicable to a view — the view simply reuses view_src's
+ // memory layout. Sched will route any op that consumes this view through
+ // view_src's backend.
+ if (tensor->view_src != nullptr) {
+ return GGML_STATUS_SUCCESS;
+ }
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
56 changes: 56 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ enum prediction_t {
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
LTX2_FLOW_PRED,
PREDICTION_COUNT
};

Expand Down Expand Up @@ -169,6 +170,11 @@ typedef struct {
const char* t5xxl_path;
const char* llm_path;
const char* llm_vision_path;
// Path to a HuggingFace-format tokenizer.json file. Currently only read by the
// LTX-2 Gemma 3 conditioner, which requires Gemma's tokenizer for BPE + metaspace
// encoding of prompts. If empty for LTX-2, the conditioner aborts with a clear
// message. Non-LTX-2 pipelines ignore this field.
const char* gemma_tokenizer_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* vae_path;
Expand Down Expand Up @@ -203,6 +209,36 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;

// Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
// When `auto_fit` is true (default), the CLI placement overrides (env vars,
// keep_*_on_cpu) are ignored and the plan is computed automatically.
// `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
// `auto_fit_dry_run` prints the plan and aborts init before loading.
// `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
// per-component compute-buffer reserve; 0 means use the built-in default.
bool auto_fit;
int auto_fit_target_mb;
bool auto_fit_dry_run;
int auto_fit_compute_reserve_dit_mb;
int auto_fit_compute_reserve_vae_mb;
int auto_fit_compute_reserve_cond_mb;

// Lazy load: defer DiT and conditioner-LLM weight allocation+read until
// the first compute() call, so the working set never holds all components
// resident simultaneously. Required when sum-of-components exceeds combined
// VRAM (LTX-2 + Gemma + VAE all at once won't fit on a 24 GB rig).
// Defaults to true. Env-var overrides (SD_LAZY_LOAD_DIT/SD_LAZY_LOAD_COND)
// still work and force-enable when set; they cannot disable.
bool lazy_load_dit;
bool lazy_load_cond;

// Auto tensor split: when more than one CUDA device is detected and the
// user did NOT explicitly set SD_CUDA_TENSOR_SPLIT, automatically split
// the DiT row-wise across all available GPUs with ratios proportional to
// each device's free VRAM. Defaults to true. Set false to keep the DiT
// on a single GPU (with auto-fit choosing which one).
bool auto_tensor_split;
} sd_ctx_params_t;

typedef struct {
Expand All @@ -225,6 +261,20 @@ typedef struct {
float img_cfg;
float distilled_guidance;
sd_slg_params_t slg;
// CFG-rescale (LTX-2.3 default 0.7, no effect when 0). After CFG mixing,
// pred is rescaled toward cond's std to combat oversaturation:
// factor = cond.std() / pred.std()
// factor = rescale_scale * factor + (1 - rescale_scale)
// pred *= factor
float rescale_scale;
// Spatio-Temporal Guidance (LTX-2.3 default stg_scale=1.0, stg_blocks=[28]).
// Adds a third forward pass with self-attention skipped on the listed
// transformer blocks; the resulting "weakened" prediction is mixed into
// the guided pred: pred += stg_scale * (cond - perturbed).
// No effect when stg_scale==0 or stg_blocks_count==0.
float stg_scale;
int* stg_blocks;
size_t stg_blocks_count;
} sd_guidance_params_t;

typedef struct {
Expand Down Expand Up @@ -332,6 +382,12 @@ typedef struct {
float strength;
int64_t seed;
int video_frames;
// Output video fps. Carried through to models that use it for temporal
// positional embeddings — LTX-2's RoPE divides the time axis by fps
// (ltx_core/tools.py::VideoLatentTools.create_initial_state), so the
// default 24 on LTXRunner silently produces wrong positions at any
// other target fps. 0 means "don't override runner default".
float fps;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
Expand Down
Loading