Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(tests/ltx_parity)
endif()

set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
Expand Down
126 changes: 126 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,10 @@ ArgOptions SDContextParams::get_options() {
"--qwen2vl_vision",
"alias of --llm_vision. Deprecated.",
&llm_vision_path},
{"",
"--gemma-tokenizer",
"path to Gemma's tokenizer.json (HF format). Required for LTX-2 text conditioning.",
&gemma_tokenizer_path},
{"",
"--diffusion-model",
"path to the standalone diffusion model",
Expand Down Expand Up @@ -376,6 +380,25 @@ ArgOptions SDContextParams::get_options() {
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&chroma_t5_mask_pad},
{"",
"--fit-target",
"auto-fit: MiB of free memory to leave on each GPU (default: 512)",
&auto_fit_target_mb},
{"",
"--fit-compute-reserve-dit",
"auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
"(default: 2048, 0 keeps the built-in default)",
&auto_fit_compute_reserve_dit_mb},
{"",
"--fit-compute-reserve-vae",
"auto-fit: MiB reserved on the VAE's GPU for its compute buffer "
"(default: 1024, 0 keeps the built-in default)",
&auto_fit_compute_reserve_vae_mb},
{"",
"--fit-compute-reserve-cond",
"auto-fit: MiB reserved on the conditioner's GPU for its compute "
"buffer (default: 512, 0 keeps the built-in default)",
&auto_fit_compute_reserve_cond_mb},
};

options.float_options = {};
Expand Down Expand Up @@ -445,6 +468,27 @@ ArgOptions SDContextParams::get_options() {
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &chroma_use_t5_mask},
{"",
"--auto-fit",
"automatically pick DiT/VAE/Conditioner device placements based on "
"free GPU memory (default ON; priority: DiT+compute > VAE > "
"Conditioner; overflow goes to CPU or DiT-params-offload mode)",
true, &auto_fit},
{"",
"--no-auto-fit",
"disable auto-fit and use the explicit placement flags / env vars "
"(--clip-on-cpu, --vae-on-cpu, SD_CUDA_DEVICE*, etc.)",
false, &auto_fit},
{"",
"--no-tensor-split",
"disable auto tensor split: keep the DiT on a single GPU even when "
"more than one CUDA device is detected. SD_CUDA_TENSOR_SPLIT env "
"still wins when set.",
false, &auto_tensor_split},
{"",
"--fit-dry-run",
"auto-fit: print the computed plan and exit without loading models",
true, &auto_fit_dry_run},
};

auto on_type_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -517,6 +561,12 @@ ArgOptions SDContextParams::get_options() {
return 1;
};

// Handler for --no-lazy-load: switch off deferred weight loading for both the
// DiT and the conditioner LLM in one shot. Consumes no extra argv tokens, so
// it reports 0 arguments eaten.
auto on_no_lazy_load_arg = [&](int /*argc*/, const char** /*argv*/, int /*index*/) {
    lazy_load_cond = false;
    lazy_load_dit  = false;
    return 0;
};

options.manual_options = {
{"",
"--type",
Expand All @@ -543,6 +593,12 @@ ArgOptions SDContextParams::get_options() {
"but it usually offers faster inference speed and, in some cases, lower memory usage. "
"The at_runtime mode, on the other hand, is exactly the opposite.",
on_lora_apply_mode_arg},
{"",
"--no-lazy-load",
"disable lazy load of DiT and conditioner-LLM weights (default ON). "
"Lazy load defers per-component allocation+read until first compute() "
"so the working set never holds all components resident.",
on_no_lazy_load_arg},
};

return options;
Expand Down Expand Up @@ -638,6 +694,7 @@ std::string SDContextParams::to_string() const {
<< " t5xxl_path: \"" << t5xxl_path << "\",\n"
<< " llm_path: \"" << llm_path << "\",\n"
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " gemma_tokenizer_path: \"" << gemma_tokenizer_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n"
Expand Down Expand Up @@ -693,6 +750,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
t5xxl_path.c_str(),
llm_path.c_str(),
llm_vision_path.c_str(),
gemma_tokenizer_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
vae_path.c_str(),
Expand Down Expand Up @@ -727,6 +785,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
auto_fit,
auto_fit_target_mb,
auto_fit_dry_run,
auto_fit_compute_reserve_dit_mb,
auto_fit_compute_reserve_vae_mb,
auto_fit_compute_reserve_cond_mb,
lazy_load_dit,
lazy_load_cond,
auto_tensor_split,
};
return sd_ctx_params;
}
Expand Down Expand Up @@ -841,6 +908,18 @@ ArgOptions SDGenerationParams::get_options() {
"--guidance",
"distilled guidance scale for models with guidance input (default: 3.5)",
&sample_params.guidance.distilled_guidance},
{"",
"--rescale-scale",
"CFG-rescale to combat oversaturation (default: 0; LTX-2.3 expects 0.7)",
&sample_params.guidance.rescale_scale},
{"",
"--stg-scale",
"Spatio-Temporal Guidance scale (default: 0; LTX-2.3 expects 1.0 with --stg-blocks [28])",
&sample_params.guidance.stg_scale},
{"",
"--high-noise-stg-scale",
"(high noise) Spatio-Temporal Guidance scale (default: 0)",
&high_noise_sample_params.guidance.stg_scale},
{"",
"--slg-scale",
"skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
Expand Down Expand Up @@ -1042,6 +1121,36 @@ ArgOptions SDGenerationParams::get_options() {
return 1;
};

// Parse a comma/space-separated list of ints, optionally wrapped in square
// brackets (e.g. "[28]", "1,2 3"). On success writes the values to `out` and
// returns true; on any malformed or out-of-range token returns false and
// leaves `out` untouched. An empty input string is an error; "[]" yields an
// empty list (which downstream treats as "STG disabled").
auto parse_int_list = [](std::string s, std::vector<int>& out) -> bool {
    if (s.empty()) return false;
    if (s.front() == '[') s.erase(0, 1);
    if (!s.empty() && s.back() == ']') s.pop_back();
    std::regex regex("[, ]+");
    std::sregex_token_iterator iter(s.begin(), s.end(), regex, -1);
    std::sregex_token_iterator end;
    std::vector<int> tmp;
    for (auto it = iter; it != end; ++it) {
        std::string token = *it;
        if (token.empty()) continue;
        try {
            tmp.push_back(std::stoi(token));
        } catch (const std::exception&) {
            // std::stoi throws std::invalid_argument for non-numeric tokens
            // AND std::out_of_range for values that overflow int; catching
            // only the former let e.g. "--stg-blocks 99999999999" escape the
            // parser as an uncaught exception. Treat both as a parse failure.
            return false;
        }
    }
    out = std::move(tmp);
    return true;
};

// Handler for --stg-blocks: parse the next argv token as an int list into
// stg_blocks. Returns 1 (one token consumed) on success, -1 on a missing or
// unparseable argument.
auto on_stg_blocks_arg = [&](int argc, const char** argv, int index) {
    ++index;
    if (index >= argc) {
        return -1;
    }
    return parse_int_list(argv[index], stg_blocks) ? 1 : -1;
};
// Handler for --high-noise-stg-blocks: same parse as --stg-blocks, but the
// result lands in high_noise_stg_blocks. Returns 1 on success, -1 on error.
auto on_high_noise_stg_blocks_arg = [&](int argc, const char** argv, int index) {
    ++index;
    if (index >= argc) {
        return -1;
    }
    return parse_int_list(argv[index], high_noise_stg_blocks) ? 1 : -1;
};

auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
Expand Down Expand Up @@ -1209,6 +1318,14 @@ ArgOptions SDGenerationParams::get_options() {
"--high-noise-skip-layers",
"(high noise) layers to skip for SLG steps (default: [7,8,9])",
on_high_noise_skip_layers_arg},
{"",
"--stg-blocks",
"blocks for STG perturbed pass (LTX-2.3 default: [28]). Empty disables STG.",
on_stg_blocks_arg},
{"",
"--high-noise-stg-blocks",
"(high noise) blocks for STG perturbed pass.",
on_high_noise_stg_blocks_arg},
{"-r",
"--ref-image",
"reference image for Flux Kontext models (can be used multiple times)",
Expand Down Expand Up @@ -1932,6 +2049,10 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
sample_params.guidance.slg.layer_count = skip_layers.size();
high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data();
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.guidance.stg_blocks = stg_blocks.empty() ? nullptr : stg_blocks.data();
sample_params.guidance.stg_blocks_count = stg_blocks.size();
high_noise_sample_params.guidance.stg_blocks = high_noise_stg_blocks.empty() ? nullptr : high_noise_stg_blocks.data();
high_noise_sample_params.guidance.stg_blocks_count = high_noise_stg_blocks.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
Expand Down Expand Up @@ -1991,6 +2112,10 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
sample_params.guidance.slg.layer_count = skip_layers.size();
high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data();
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.guidance.stg_blocks = stg_blocks.empty() ? nullptr : stg_blocks.data();
sample_params.guidance.stg_blocks_count = stg_blocks.size();
high_noise_sample_params.guidance.stg_blocks = high_noise_stg_blocks.empty() ? nullptr : high_noise_stg_blocks.data();
high_noise_sample_params.guidance.stg_blocks_count = high_noise_stg_blocks.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
Expand All @@ -2012,6 +2137,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = static_cast<float>(fps);
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
Expand Down
24 changes: 24 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ struct SDContextParams {
std::string t5xxl_path;
std::string llm_path;
std::string llm_vision_path;
std::string gemma_tokenizer_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string vae_path;
Expand Down Expand Up @@ -127,6 +128,25 @@ struct SDContextParams {

bool qwen_image_zero_cond_t = false;

// Auto-fit: pick DiT/VAE/Conditioner device placements from free GPU memory.
// Default ON; pass --no-auto-fit to opt out.
bool auto_fit = true;
int auto_fit_target_mb = 512;
bool auto_fit_dry_run = false;
int auto_fit_compute_reserve_dit_mb = 0; // 0 = use header default
int auto_fit_compute_reserve_vae_mb = 0;
int auto_fit_compute_reserve_cond_mb = 0;

// Lazy load: defer DiT and conditioner-LLM weight allocation+read until
// the first compute(). Default ON; pass --no-lazy-load to opt out.
bool lazy_load_dit = true;
bool lazy_load_cond = true;

// Auto tensor split: when >1 CUDA device is detected, split DiT row-wise
// across all GPUs by free-VRAM ratio. Default ON; pass --no-tensor-split
// to opt out. SD_CUDA_TENSOR_SPLIT env still wins when set.
bool auto_tensor_split = true;

prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;

Expand Down Expand Up @@ -168,6 +188,10 @@ struct SDGenerationParams {
sd_sample_params_t high_noise_sample_params;
std::vector<int> skip_layers = {7, 8, 9};
std::vector<int> high_noise_skip_layers = {7, 8, 9};
// STG (Spatio-Temporal Guidance) blocks (LTX-2.3 default: [28]). Empty means
// STG disabled even if --stg-scale is set.
std::vector<int> stg_blocks = {};
std::vector<int> high_noise_stg_blocks = {};

std::vector<float> custom_sigmas;

Expand Down
20 changes: 20 additions & 0 deletions ggml-patch.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index cc80eb3f..dd79d9c0 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -865,7 +865,14 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
}

static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+ // Views: the view's storage comes from view_src, so we don't allocate
+ // anything on the split buffer for it. The split buffer's per-device
+ // shards aren't applicable to a view — the view simply reuses view_src's
+ // memory layout. Sched will route any op that consumes this view through
+ // view_src's backend.
+ if (tensor->view_src != nullptr) {
+ return GGML_STATUS_SUCCESS;
+ }
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
56 changes: 56 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ enum prediction_t {
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
LTX2_FLOW_PRED,
PREDICTION_COUNT
};

Expand Down Expand Up @@ -169,6 +170,11 @@ typedef struct {
const char* t5xxl_path;
const char* llm_path;
const char* llm_vision_path;
// Path to a HuggingFace-format tokenizer.json file. Currently only read by the
// LTX-2 Gemma 3 conditioner, which requires Gemma's tokenizer for BPE + metaspace
// encoding of prompts. If empty for LTX-2, the conditioner aborts with a clear
// message. Non-LTX-2 pipelines ignore this field.
const char* gemma_tokenizer_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* vae_path;
Expand Down Expand Up @@ -203,6 +209,36 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;

// Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
// When `auto_fit` is true (default), the CLI placement overrides (env vars,
// keep_*_on_cpu) are ignored and the plan is computed automatically.
// `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
// `auto_fit_dry_run` prints the plan and aborts init before loading.
// `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
// per-component compute-buffer reserve; 0 means use the built-in default.
bool auto_fit;
int auto_fit_target_mb;
bool auto_fit_dry_run;
int auto_fit_compute_reserve_dit_mb;
int auto_fit_compute_reserve_vae_mb;
int auto_fit_compute_reserve_cond_mb;

// Lazy load: defer DiT and conditioner-LLM weight allocation+read until
// the first compute() call, so the working set never holds all components
// resident simultaneously. Required when sum-of-components exceeds combined
// VRAM (LTX-2 + Gemma + VAE all at once won't fit on a 24 GB rig).
// Defaults to true. Env-var overrides (SD_LAZY_LOAD_DIT/SD_LAZY_LOAD_COND)
// still work and force-enable when set; they cannot disable.
bool lazy_load_dit;
bool lazy_load_cond;

// Auto tensor split: when more than one CUDA device is detected and the
// user did NOT explicitly set SD_CUDA_TENSOR_SPLIT, automatically split
// the DiT row-wise across all available GPUs with ratios proportional to
// each device's free VRAM. Defaults to true. Set false to keep the DiT
// on a single GPU (with auto-fit choosing which one).
bool auto_tensor_split;
} sd_ctx_params_t;

typedef struct {
Expand All @@ -225,6 +261,20 @@ typedef struct {
float img_cfg;
float distilled_guidance;
sd_slg_params_t slg;
// CFG-rescale (LTX-2.3 default 0.7, no effect when 0). After CFG mixing,
// pred is rescaled toward cond's std to combat oversaturation:
// factor = cond.std() / pred.std()
// factor = rescale_scale * factor + (1 - rescale_scale)
// pred *= factor
float rescale_scale;
// Spatio-Temporal Guidance (LTX-2.3 default stg_scale=1.0, stg_blocks=[28]).
// Adds a third forward pass with self-attention skipped on the listed
// transformer blocks; the resulting "weakened" prediction is mixed into
// the guided pred: pred += stg_scale * (cond - perturbed).
// No effect when stg_scale==0 or stg_blocks_count==0.
float stg_scale;
int* stg_blocks;
size_t stg_blocks_count;
} sd_guidance_params_t;

typedef struct {
Expand Down Expand Up @@ -332,6 +382,12 @@ typedef struct {
float strength;
int64_t seed;
int video_frames;
// Output video fps. Carried through to models that use it for temporal
// positional embeddings — LTX-2's RoPE divides the time axis by fps
// (ltx_core/tools.py::VideoLatentTools.create_initial_state), so the
// default 24 on LTXRunner silently produces wrong positions at any
// other target fps. 0 means "don't override runner default".
float fps;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
Expand Down
Loading