From 0e4c44b0b527923bd4256f38a5c754e3251082c3 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 13:39:29 +0200 Subject: [PATCH 1/9] feat: auto-fit component placement and per-component backend devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an auto-fit planner that picks DiT / VAE / Conditioner device placements from free GPU memory, treating each component as atomic (no intra-tensor row split — equivalent to llama.cpp's LLAMA_SPLIT_MODE_LAYER at component granularity, so views never land on a split buffer and no ggml patch is needed). Also adopt the PR #1184 CLI conventions: - new: --main-backend-device, --diffusion-backend-device, --clip-backend-device, --vae-backend-device, --control-net-backend-device, --tae-backend-device, --upscaler-backend-device, --photomaker-backend-device, --vision-backend-device, --list-devices - removed: --clip-on-cpu, --vae-on-cpu, --control-net-cpu (and the matching keep_*_on_cpu fields on sd_ctx_params_t) Auto-fit knobs: --auto-fit / --no-auto-fit, --no-multi-gpu, --fit-target, --fit-compute-reserve-{dit,vae,cond}, --fit-dry-run. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/common/common.cpp | 131 +++++++++-- examples/common/common.h | 22 +- include/stable-diffusion.h | 41 +++- src/backend_fit.hpp | 434 +++++++++++++++++++++++++++++++++++++ src/model.h | 2 + src/stable-diffusion.cpp | 263 ++++++++++++++++++---- src/version.cpp | 12 + 7 files changed, 838 insertions(+), 67 deletions(-) create mode 100644 src/backend_fit.hpp diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 1a5399b82..d3626fcce 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -380,6 +380,46 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "ggml device name to use as the main backend (see --list-devices). 
" + "When unset, the first GPU device is used.", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "ggml device name for the diffusion / flow model. " + "Falls back to --main-backend-device.", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "ggml device name for the text encoders. " + "Falls back to --main-backend-device.", + &clip_backend_device}, + {"", + "--vae-backend-device", + "ggml device name for the VAE. Falls back to --main-backend-device.", + &vae_backend_device}, + {"", + "--control-net-backend-device", + "ggml device name for the ControlNet. " + "Falls back to --main-backend-device.", + &control_net_backend_device}, + {"", + "--tae-backend-device", + "ggml device name for the TAE (currently routed through main).", + &tae_backend_device}, + {"", + "--upscaler-backend-device", + "ggml device name for the upscaler (currently routed through main).", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "ggml device name for PhotoMaker (currently routed through main).", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "ggml device name for the vision model (currently routed through main).", + &vision_backend_device}, }; options.int_options = { @@ -392,6 +432,23 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, + {"", + "--fit-compute-reserve-dit", + "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " + "(0 keeps the built-in default)", + &auto_fit_compute_reserve_dit_mb}, + {"", + "--fit-compute-reserve-vae", + "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", + &auto_fit_compute_reserve_vae_mb}, + {"", + "--fit-compute-reserve-cond", + "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", + &auto_fit_compute_reserve_cond_mb}, }; 
options.float_options = {}; @@ -409,18 +466,6 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -461,6 +506,24 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit and use the explicit *-backend-device flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + "--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -559,6 +622,15 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, }; return options; @@ -671,9 +743,19 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? 
"true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " main_backend_device: \"" << main_backend_device << "\",\n" + << " diffusion_backend_device: \"" << diffusion_backend_device << "\",\n" + << " clip_backend_device: \"" << clip_backend_device << "\",\n" + << " vae_backend_device: \"" << vae_backend_device << "\",\n" + << " control_net_backend_device: \"" << control_net_backend_device << "\",\n" + << " tae_backend_device: \"" << tae_backend_device << "\",\n" + << " upscaler_backend_device: \"" << upscaler_backend_device << "\",\n" + << " photomaker_backend_device: \"" << photomaker_backend_device << "\",\n" + << " vision_backend_device: \"" << vision_backend_device << "\",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -729,9 +811,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, + main_backend_device.empty() ? nullptr : main_backend_device.c_str(), + diffusion_backend_device.empty() ? nullptr : diffusion_backend_device.c_str(), + clip_backend_device.empty() ? nullptr : clip_backend_device.c_str(), + vae_backend_device.empty() ? nullptr : vae_backend_device.c_str(), + control_net_backend_device.empty() ? 
nullptr : control_net_backend_device.c_str(), + tae_backend_device.empty() ? nullptr : tae_backend_device.c_str(), + upscaler_backend_device.empty() ? nullptr : upscaler_backend_device.c_str(), + photomaker_backend_device.empty() ? nullptr : photomaker_backend_device.c_str(), + vision_backend_device.empty() ? nullptr : vision_backend_device.c_str(), flash_attn, diffusion_flash_attn, taesd_preview, @@ -744,6 +832,13 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, + auto_fit, + auto_fit_target_mb, + auto_fit_dry_run, + auto_fit_compute_reserve_dit_mb, + auto_fit_compute_reserve_vae_mb, + auto_fit_compute_reserve_cond_mb, + auto_multi_gpu, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index c4498c352..8243d6cba 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -110,9 +110,15 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; + std::string main_backend_device; + std::string diffusion_backend_device; + std::string clip_backend_device; + std::string vae_backend_device; + std::string control_net_backend_device; + std::string tae_backend_device; + std::string upscaler_backend_device; + std::string photomaker_backend_device; + std::string vision_backend_device; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -128,6 +134,16 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. 
+ bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + int auto_fit_compute_reserve_dit_mb = 0; + int auto_fit_compute_reserve_vae_mb = 0; + int auto_fit_compute_reserve_cond_mb = 0; + bool auto_multi_gpu = true; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 75027f8f8..ed6336ba1 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -188,9 +188,18 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; + // Per-component backend device names (ggml device names). Empty / NULL + // means "use the main backend device". The strings are only borrowed for + // the duration of the init call. See sd_list_devices() for what to pass. + const char* main_backend_device; + const char* diffusion_backend_device; + const char* clip_backend_device; + const char* vae_backend_device; + const char* control_net_backend_device; + const char* tae_backend_device; + const char* upscaler_backend_device; + const char* photomaker_backend_device; + const char* vision_backend_device; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -203,6 +212,27 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), the *_backend_device strings are + // ignored and the plan is computed automatically. + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). + // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the + // per-component compute-buffer reserve; 0 means use the built-in default. 
+ bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + int auto_fit_compute_reserve_dit_mb; + int auto_fit_compute_reserve_vae_mb; + int auto_fit_compute_reserve_cond_mb; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. Set false to keep all components on a single GPU when + // they fit. Defaults to true. Each component still lives entirely on + // one device — no intra-tensor row split. + bool auto_multi_gpu; } sd_ctx_params_t; typedef struct { @@ -449,6 +479,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used +// directly as the value of --*-backend-device flags. +SD_API void sd_list_devices(void); + #ifdef __cplusplus } #endif diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..52254f0e8 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,434 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is treated as a single atomic unit that lives entirely on +// one device (plus its compute buffer on the same device). There is no +// intra-tensor row split: cross-device parallelism comes from placing +// different components on different GPUs, not from splitting individual +// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER +// at the component granularity. +// +// Placement priority: DiT + compute buffer -> VAE -> Conditioner. +// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). 
+ +#include <algorithm> +#include <cstdint> +#include <limits> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model.h" +#include "util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU +}; + +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; +}; + +struct Plan { + std::vector<Decision> decisions; + std::map<int, int64_t> device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +// --- Classification ------------------------------------------------------- + +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0) { + out = 
ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +inline std::vector<Component> estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector<Component> out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +inline std::vector<Device> enumerate_gpu_devices() { + std::vector<Device> out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- 
Core algorithm ------------------------------------------------------- + +// Peak per device = MAX of any single component's footprint on that device, +// because free_params_immediately frees params between phases so components +// time-share VRAM. +inline int64_t gpu_peak(int gpu_idx, + const std::vector<Placement>& pl, + const std::vector<int>& dev, + const std::vector<Component>& components) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + if (dev[i] != gpu_idx) continue; + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + footprint = components[i].params_bytes + components[i].compute_bytes; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector<Component>& components, + const std::vector<Device>& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + + std::vector<int64_t> cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max<int64_t>(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + auto build_options = [&](const Component& c) { + std::vector<OptionSlot> opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector<std::vector<OptionSlot>> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector<Placement>& pl, const std::vector<int>& dev) { + int64_t s = 0; + std::set<int> gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if 
(pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector<size_t> idx(nC, 0); + std::vector<Placement> best_pl; + std::vector<int> best_dev; + int64_t best_score = std::numeric_limits<int64_t>::min(); + bool found_any = false; + + while (true) { + std::vector<Placement> pl(nC); + std::vector<int> dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. + if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + 
d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + } + return plan; +} + +inline const char* placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector<Component>& components, + const std::vector<Device>& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> 
GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if (d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/model.h b/src/model.h index 65bc6c367..10aaf8512 100644 --- a/src/model.h +++ b/src/model.h @@ -193,6 +193,8 @@ using TensorTypeRules = std::vector>; TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules); +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 88102ff61..dfe2a8873 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1,5 +1,6 @@ #include "ggml_extend.hpp" +#include "backend_fit.hpp" #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -108,10 +109,23 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: - ggml_backend_t backend = nullptr; // general backend + ggml_backend_t backend = nullptr; // general / main backend ggml_backend_t clip_backend = nullptr; 
ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; + + // Auto-fit decisions resolved into device-name strings. When non-empty, + // these win over the user-provided sd_ctx_params->*_backend_device. + // When empty, the explicit param (or `backend` fallback) is used. + std::string fit_diffusion_device; + std::string fit_clip_device; + std::string fit_vae_device; + // Per-component offload-params override coming from auto-fit. Forces + // offload_params_to_cpu for that component even when global flag is off. + bool fit_dit_offload_params = false; + bool fit_cond_offload_params = false; + bool fit_vae_offload_params = false; SDVersion version; bool vae_decode_only = false; @@ -168,11 +182,23 @@ class StableDiffusionGGML { if (vae_backend != backend) { ggml_backend_free(vae_backend); } + if (diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } ggml_backend_free(backend); } - void init_backend() { - backend = sd_get_default_backend(); + void init_backend(const char* main_device_name) { + if (main_device_name != nullptr && main_device_name[0] != '\0') { + backend = init_named_backend(main_device_name); + if (backend == nullptr) { + LOG_WARN("main backend device '%s' init failed; falling back to default", + main_device_name); + } + } + if (backend == nullptr) { + backend = sd_get_default_backend(); + } } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -202,7 +228,7 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + init_backend(sd_ctx_params->main_backend_device); ModelLoader model_loader; @@ -328,6 +354,75 @@ class StableDiffusionGGML { return oss.str(); }; + if (sd_ctx_params->auto_fit) { + backend_fit::ComputeReserves reserves; + if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) { + reserves.dit_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB; + } + if 
(sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) { + reserves.vae_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) { + reserves.conditioner_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB; + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Find the CPU device's ggml name (so we can route "CPU" + // placements through init_named_backend uniformly). 
+ std::string cpu_device_name; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + cpu_device_name = ggml_backend_dev_name(dev); + break; + } + } + auto resolve = [&](const backend_fit::Decision* d, std::string& out_device, + bool& out_offload) { + if (d == nullptr) { + out_device.clear(); + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::CPU) { + out_device = cpu_device_name; + out_offload = false; + return; + } + for (const auto& dev : devices) { + if (dev.id == d->device_id) { + out_device = dev.name; + break; + } + } + out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); + }; + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), + fit_diffusion_device, fit_dit_offload_params); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), + fit_vae_device, fit_vae_offload_params); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), + fit_clip_device, fit_cond_offload_params); + + // CPU placements: leave fit_*_device empty AND remember they're + // CPU so the resolver below picks ggml_backend_cpu_init(). + } + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -373,19 +468,57 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + // If auto-fit decided ANY component must offload params, force the + // global flag on. This is a coarsening: one component needing offload + // forces all to offload (safer, just slower for non-offload ones). 
+ if (fit_dit_offload_params || fit_cond_offload_params || fit_vae_offload_params) { + if (!offload_params_to_cpu) { + LOG_INFO("auto-fit: enabling offload_params_to_cpu (one or more " + "components don't fit without param streaming)"); + offload_params_to_cpu = true; + } + } + + // Pick the effective device name for each component: the auto-fit + // override (if any) wins; otherwise the user-provided string; nullptr + // falls back to `backend` (the main). + auto effective_device = [&](const std::string& fit_str, const char* user_str) -> const char* { + if (!fit_str.empty()) return fit_str.c_str(); + return user_str; + }; + const char* diffusion_dev_name = effective_device(fit_diffusion_device, + sd_ctx_params->diffusion_backend_device); + const char* clip_dev_name = effective_device(fit_clip_device, + sd_ctx_params->clip_backend_device); + const char* vae_dev_name = effective_device(fit_vae_device, + sd_ctx_params->vae_backend_device); + + // Helper: init a named backend if name is non-null/non-empty, + // returns nullptr on missing/failed name (caller falls back to main). 
+ auto init_named_or_null = [](const char* name) -> ggml_backend_t { + if (name == nullptr || name[0] == '\0') return nullptr; + return init_named_backend(name); + }; + + diffusion_backend = init_named_or_null(diffusion_dev_name); + if (!diffusion_backend) { + diffusion_backend = backend; + } else { + LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); + } { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + clip_backend = init_named_or_null(clip_dev_name); + if (!clip_backend) { + clip_backend = backend; + } else { + LOG_INFO("CLIP: using device %s", clip_dev_name); } if (sd_version_is_sd3(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); } else if (sd_version_is_flux(version)) { @@ -423,7 +556,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, @@ -434,7 +567,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, @@ -446,13 +579,13 @@ class StableDiffusionGGML { true, 0, true); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, 
tensor_storage_map, "model.high_noise_diffusion_model", @@ -478,7 +611,7 @@ class StableDiffusionGGML { version, "", enable_vision); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", @@ -488,7 +621,7 @@ class StableDiffusionGGML { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -497,7 +630,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", @@ -507,7 +640,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -530,7 +663,7 @@ class StableDiffusionGGML { embbeding_map, version); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -555,11 +688,13 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { + if (vae_dev_name != nullptr && vae_dev_name[0] != '\0') { + vae_backend = init_named_backend(vae_dev_name); + } + if (!vae_backend) { vae_backend = backend; + } else { + LOG_INFO("VAE: using device %s", vae_dev_name); } auto create_tae = [&]() -> std::shared_ptr { @@ -648,11 +783,14 @@ class StableDiffusionGGML { if 
(strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { + const char* cn_dev_name = sd_ctx_params->control_net_backend_device; + if (cn_dev_name != nullptr && cn_dev_name[0] != '\0') { + controlnet_backend = init_named_backend(cn_dev_name); + } + if (!controlnet_backend) { controlnet_backend = backend; + } else { + LOG_INFO("ControlNet: using device %s", cn_dev_name); } control_net = std::make_shared(controlnet_backend, offload_params_to_cpu, @@ -2142,16 +2280,29 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->prediction = PREDICTION_COUNT; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; - sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; - sd_ctx_params->diffusion_flash_attn = false; - sd_ctx_params->circular_x = false; - sd_ctx_params->circular_y = false; - sd_ctx_params->chroma_use_dit_mask = true; - sd_ctx_params->chroma_use_t5_mask = false; - sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->enable_mmap = false; + sd_ctx_params->main_backend_device = nullptr; + sd_ctx_params->diffusion_backend_device = nullptr; + sd_ctx_params->clip_backend_device = nullptr; + sd_ctx_params->vae_backend_device = nullptr; + sd_ctx_params->control_net_backend_device = nullptr; + sd_ctx_params->tae_backend_device = nullptr; + sd_ctx_params->upscaler_backend_device = nullptr; + sd_ctx_params->photomaker_backend_device = nullptr; + sd_ctx_params->vision_backend_device = nullptr; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->circular_x = false; + sd_ctx_params->circular_y = false; + sd_ctx_params->chroma_use_dit_mask = true; + 
sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_multi_gpu = true; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2183,9 +2334,22 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" + "main_backend_device: %s\n" + "diffusion_backend_device: %s\n" + "clip_backend_device: %s\n" + "vae_backend_device: %s\n" + "control_net_backend_device: %s\n" + "tae_backend_device: %s\n" + "upscaler_backend_device: %s\n" + "photomaker_backend_device: %s\n" + "vision_backend_device: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve_dit_mb: %d\n" + "auto_fit_compute_reserve_vae_mb: %d\n" + "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_multi_gpu: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2215,9 +2379,22 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + SAFE_STR(sd_ctx_params->main_backend_device), + SAFE_STR(sd_ctx_params->diffusion_backend_device), + SAFE_STR(sd_ctx_params->clip_backend_device), + SAFE_STR(sd_ctx_params->vae_backend_device), + SAFE_STR(sd_ctx_params->control_net_backend_device), + SAFE_STR(sd_ctx_params->tae_backend_device), + 
SAFE_STR(sd_ctx_params->upscaler_backend_device), + SAFE_STR(sd_ctx_params->photomaker_backend_device), + SAFE_STR(sd_ctx_params->vision_backend_device), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + sd_ctx_params->auto_fit_compute_reserve_dit_mb, + sd_ctx_params->auto_fit_compute_reserve_vae_mb, + sd_ctx_params->auto_fit_compute_reserve_cond_mb, + BOOL_STR(sd_ctx_params->auto_multi_gpu), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/version.cpp b/src/version.cpp index 97dc8426b..6c266153c 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,3 +1,6 @@ +#include + +#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -18,3 +21,12 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +} From 717c79ae738d5bdc920e129e28ac97280ef86ce2 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 16:13:51 +0200 Subject: [PATCH 2/9] wip: layer-split + lazy load (RAM regression) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds runner-level multi-backend (sched-based) layer-split, per-tensor buft callback alloc, GPU_LAYER_SPLIT placement in backend_fit, and auto-fit lazy load when init-time SUM exceeds device cap. Wires the LTX-2 DiT and Conditioner LLM through the new path. Known issue: system RAM OOM-kill during DiT lazy load even with --mmap. 
Per-thread staging buffers in ModelLoader::load_tensors hold a copy of each tensor as it streams from mmap to GPU; with 8 threads × ~600 MB each + cumulative mmap'd page cache, peak RSS exceeds 16 GB. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 129 ++++++++++++- src/conditioner.hpp | 5 + src/diffusion_model.hpp | 4 + src/ggml_extend.hpp | 337 +++++++++++++++++++++++++++++++- src/stable-diffusion.cpp | 402 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 847 insertions(+), 30 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 52254f0e8..7ca789a0b 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -15,9 +15,11 @@ // Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that // support streaming params from RAM at compute time). +#include #include #include #include +#include #include #include #include @@ -42,7 +44,8 @@ enum class ComponentKind { enum class Placement { CPU, GPU, - GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries }; struct Component { @@ -69,6 +72,13 @@ struct Decision { int device_id = DEVICE_ID_CPU; int64_t on_device_bytes = 0; int64_t on_host_bytes = 0; + + // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs + // that share this component (in order) and each device's estimated share + // of the params. The order also defines block-range partitioning: the + // i-th device gets a contiguous range of blocks proportional to share[i]. 
+ std::vector split_device_ids; + std::vector split_share_bytes; }; struct Plan { @@ -105,7 +115,13 @@ inline bool classify_tensor(const std::string& name, ComponentKind& out) { contains("cond_stage_model") || contains("te.text_model.") || contains("conditioner") || - name.rfind("text_encoder.", 0) == 0) { + name.rfind("text_encoder.", 0) == 0 || + // Connector / text projection layers that run on the conditioner + // backend (e.g. LTX-2's text_embedding_projection: video/audio + // aggregate embeds + projection that map LLM hidden states into + // DiT-input space). + name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { out = ComponentKind::CONDITIONER; return true; } @@ -181,19 +197,58 @@ inline std::vector enumerate_gpu_devices() { // --- Core algorithm ------------------------------------------------------- +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs) { + std::vector out(gpu_idxs.size(), 0); + int64_t total_free = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + total_free += std::max(0, devices[gpu_idxs[k]].free_bytes); + } + if (total_free <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(std::max(0, devices[gpu_idxs[k]].free_bytes)) / double(total_free); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + // Peak per device = MAX of any single component's footprint on that device, // because free_params_immediately frees params between phases so components // time-share VRAM. 
inline int64_t gpu_peak(int gpu_idx, const std::vector& pl, const std::vector& dev, - const std::vector& components) { + const std::vector& components, + const std::vector& devices = {}) { int64_t peak = 0; for (size_t i = 0; i < components.size(); i++) { - if (dev[i] != gpu_idx) continue; int64_t footprint = 0; if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. + const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. + int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; } peak = std::max(peak, footprint); } @@ -217,6 +272,13 @@ inline Plan compute_plan(const std::vector& components, int device_idx; }; + // Layer-split is only meaningful for components made up of many similarly + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. + auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + auto build_options = [&](const Component& c) { std::vector opts; for (size_t g = 0; g < nG; g++) { @@ -225,6 +287,15 @@ inline Plan compute_plan(const std::vector& components, opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); } } + // Layer-split: enumerate non-trivial subsets of GPUs (size >= 2). 
+ // Encode the participating set as a bitmask in device_idx. + if (allow_multi_gpu && nG >= 2 && supports_layer_split(c.kind)) { + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (__builtin_popcount(mask) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } opts.push_back({Placement::CPU, -1}); return opts; }; @@ -255,6 +326,13 @@ inline Plan compute_plan(const std::vector& components, } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { s += 5 * pw; gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). + s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } } else { s -= 10 * pw; } @@ -292,7 +370,7 @@ inline Plan compute_plan(const std::vector& components, if (ok) { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -304,7 +382,7 @@ inline Plan compute_plan(const std::vector& components, } else { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -340,6 +418,33 @@ inline Plan compute_plan(const std::vector& components, d.device_id = DEVICE_ID_CPU; d.on_host_bytes = c.params_bytes + c.compute_bytes; plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); 
+ // Sort participating GPUs by descending share so the LARGEST-share + // GPU is listed first. Sub-runners that don't get the layer-split + // spec (e.g. the LTX-2 text projection) follow the "main" backend + // (= first in this list) — putting the biggest one first keeps + // them on the GPU with most headroom. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return shares[a] > shares[b]; }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; } else { d.device_id = devices[best_dev[i]].id; if (best_pl[i] == Placement::GPU) { @@ -355,7 +460,7 @@ inline Plan compute_plan(const std::vector& components, } for (size_t g = 0; g < nG; g++) { - plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); } return plan; } @@ -365,6 +470,7 @@ inline const char* placement_str(Placement p) { case Placement::CPU: return "CPU"; case Placement::GPU: return "GPU"; case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; } return "?"; } @@ -400,6 +506,15 @@ inline void print_plan(const Plan& plan, LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", d.name.c_str(), d.device_id, (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT) { + std::string ids; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + 
std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s", + d.name.c_str(), ids.c_str()); } else { LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", d.name.c_str(), d.device_id, diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 9f4d45524..99e27ae39 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -87,6 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} + // Defer the LLM sub-runner's params alloc + read until first compute(). + // Only conditioners with a heavy LLM (e.g. LTX-2 Gemma) override this; + // others ignore the call. The callback is invoked AFTER the runner's + // alloc_params_buffer succeeds and is responsible for tensor data load. + virtual void set_llm_lazy_load(std::function /*fn*/) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index c0a2a11c0..d7ea6ede7 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -50,6 +50,10 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; + // Defer params alloc + tensor data load until the first compute() call. + // Default: no-op. Subclasses backed by a single GGMLRunner forward to + // its set_lazy_load. 
+ virtual void set_lazy_load(std::function /*fn*/) {} }; struct UNetModel : public DiffusionModel { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 8b748194f..cd1662523 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1705,6 +1705,42 @@ struct GGMLRunnerContext { std::shared_ptr weight_adapter = nullptr; }; +// --------------------------------------------------------------------------- +// Multi-backend (layer-split) support +// --------------------------------------------------------------------------- +// A GGMLRunner can opt into "layer-split" mode where each weight tensor lives +// entirely on one of several backends, picked by a caller-supplied callback +// (typically based on the tensor name's block index). The runner switches +// from gallocr to ggml_backend_sched for graph compute, so cross-backend +// edges are routed automatically. +// +// This is the llama.cpp LLAMA_SPLIT_MODE_LAYER analogue. There is no +// intra-tensor row split, so every tensor lives on a single normal device +// buffer — views work without any ggml-cuda patch. +// +// To enable: populate g_pending_multi_backend_spec() with the additional +// backends + tensor->backend callback, then construct the GGMLRunner. The +// ctor consumes and clears the pending pointer. +struct MultiBackendSpec { + // Extra backends *in addition to* the runner's main runtime_backend. + // The first entry's role is the main backend; we don't list it here. + std::vector additional_backends; + + // Maps a weight tensor name to one of the runner's backends (the main + // runtime_backend, or one of additional_backends). Returning nullptr + // means "use the main runtime_backend". + std::function tensor_backend_fn; + + // Optional CPU backend appended last to the sched for unsupported-op + // fallback. May be nullptr. 
+ ggml_backend_t cpu_fallback = nullptr; +}; + +__STATIC_INLINE__ MultiBackendSpec*& g_pending_multi_backend_spec() { + thread_local MultiBackendSpec* spec = nullptr; + return spec; +} + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1712,6 +1748,25 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; + // --- multi-backend (layer-split) state --- + bool multi_backend_mode = false; + std::vector additional_backends; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; + bool sched_reserved = false; + // Per-backend params buffers when multi_backend_mode is on. + // params_buffer (single-backend) stays nullptr in this mode. + std::vector multi_params_buffers; + + // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual + // alloc + tensor-data load is deferred until the first compute(). The + // callback is invoked AFTER do_alloc_params_buffer succeeds and is + // responsible for populating tensor->data via ModelLoader. Used to keep + // peak VRAM per-component-MAX rather than sum-of-components at init. + std::function lazy_load_fn = nullptr; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; ggml_context* offload_ctx = nullptr; @@ -1859,7 +1914,56 @@ struct GGMLRunner { return gf; } + // Build the multi-backend sched (lazily). + bool ensure_sched() { + if (sched != nullptr) return true; + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + // ggml_backend_sched_new asserts the last backend is a CPU; create + // a CPU fallback if the caller didn't provide one. We own this + // instance and free it in the dtor below. 
+ if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = ggml_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + sched = ggml_backend_sched_new(backends.data(), + /*bufts=*/nullptr, + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + bool alloc_compute_buffer(get_graph_cb_t get_graph) { + if (multi_backend_mode) { + if (sched_reserved) return true; + if (!ensure_sched()) return false; + reset_compute_ctx(); + ggml_cgraph* gf = get_compute_graph(get_graph); + backend_tensor_data_map.clear(); + if (!ggml_backend_sched_reserve(sched, gf)) { + LOG_ERROR("%s: sched reserve failed", get_desc().c_str()); + return false; + } + sched_reserved = true; + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) { + ggml_backend_t b = ggml_backend_sched_get_backend(sched, i); + size_t s = ggml_backend_sched_get_buffer_size(sched, b); + LOG_DEBUG("%s sched buf[%d] %s = %.2f MB", + get_desc().c_str(), i, ggml_backend_name(b), + s / (1024.f * 1024.f)); + } + return true; + } + if (compute_allocr != nullptr) { return true; } @@ -2018,6 +2122,22 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { + // Consume any pending multi-backend (layer-split) spec set by the + // caller via g_pending_multi_backend_spec(). 
+ MultiBackendSpec* pending = g_pending_multi_backend_spec(); + if (pending != nullptr) { + g_pending_multi_backend_spec() = nullptr; + multi_backend_mode = true; + additional_backends = pending->additional_backends; + tensor_backend_fn = pending->tensor_backend_fn; + cpu_fallback_backend = pending->cpu_fallback; + if (offload_params_to_cpu) { + LOG_WARN("multi-backend layer-split is incompatible with " + "offload_params_to_cpu; ignoring offload"); + offload_params_to_cpu = false; + } + } + alloc_params_ctx(); if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { params_backend = ggml_backend_cpu_init(); @@ -2035,6 +2155,16 @@ struct GGMLRunner { ggml_backend_free(params_backend); } free_cache_ctx_and_buffer(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } + // additional_backends are owned by the caller (see the MultiBackendSpec + // setup site in stable-diffusion.cpp); not freed here. } virtual GGMLRunnerContext get_context() { @@ -2054,7 +2184,102 @@ struct GGMLRunner { alloc_compute_ctx(); } - bool alloc_params_buffer() { + // Multi-backend params allocation: walk params_ctx, classify each tensor + // via tensor_backend_fn, allocate one buffer per backend on its default + // buft, bind tensors via ggml_tallocr. + bool alloc_params_buffer_layer_split() { + // Build the backend list (main first, then additional). Index 0 is + // the default for tensors whose callback returns nullptr. 
+ std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + + std::vector bufts; + bufts.reserve(backends.size()); + std::vector aligns(backends.size()); + std::vector sizes(backends.size(), 0); + std::vector counts(backends.size(), 0); + for (size_t i = 0; i < backends.size(); i++) { + bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); + aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + } + + // First pass: assign each tensor to a backend, accumulate sizes. + std::map tensor_backend_idx; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + int idx = 0; + if (tensor_backend_fn) { + ggml_backend_t target = tensor_backend_fn(t->name); + if (target != nullptr) { + for (size_t i = 0; i < backends.size(); i++) { + if (backends[i] == target) { + idx = int(i); + break; + } + } + } + } + tensor_backend_idx[t] = idx; + size_t s = ggml_backend_buft_get_alloc_size(bufts[idx], t); + sizes[idx] += GGML_PAD(s, aligns[idx]); + counts[idx] += 1; + } + + // Allocate one buffer per used backend. + multi_params_buffers.assign(backends.size(), nullptr); + for (size_t i = 0; i < backends.size(); i++) { + if (sizes[i] == 0) continue; + multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); + if (multi_params_buffers[i] == nullptr) { + LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f)); + return false; + } + } + + // Bind tensors via ggml_tallocr. 
+ std::vector tallocs(backends.size()); + for (size_t i = 0; i < backends.size(); i++) { + if (multi_params_buffers[i] != nullptr) { + tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]); + } + } + for (auto& kv : tensor_backend_idx) { + ggml_status st = ggml_tallocr_alloc(&tallocs[kv.second], kv.first); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s tallocr_alloc failed for tensor %s", + get_desc().c_str(), kv.first->name); + return false; + } + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + } + + // Log the breakdown. + for (size_t i = 0; i < backends.size(); i++) { + if (counts[i] == 0) continue; + LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), + counts[i]); + } + return true; + } + + // Internal: always materializes the params buffer. Used by both the + // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` + // path; the latter must bypass the lazy-skip. + bool do_alloc_params_buffer() { + if (multi_backend_mode) { + return alloc_params_buffer_layer_split(); + } size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); if (params_buffer == nullptr) { @@ -2072,18 +2297,66 @@ struct GGMLRunner { return true; } + bool alloc_params_buffer() { + // Lazy mode: skip alloc until first compute() (via ensure_params_loaded). + // The caller still goes through alloc_params_buffer + get_param_tensors + // at init; ModelLoader::load_tensors will silently skip this runner's + // tensors (their data ptrs are null because no buffer is allocated yet) + // and the lazy_load_fn callback re-loads them on demand. 
+ if (lazy_load_fn) return true; + return do_alloc_params_buffer(); + } + + void set_lazy_load(std::function fn) { + lazy_load_fn = std::move(fn); + } + + bool ensure_params_loaded() { + if (params_buffer != nullptr || !multi_params_buffers.empty()) { + return true; + } + if (!lazy_load_fn) { + LOG_ERROR("%s: no params buffer and no lazy_load_fn", get_desc().c_str()); + return false; + } + int64_t t0 = ggml_time_ms(); + if (!do_alloc_params_buffer()) return false; + if (!lazy_load_fn()) { + LOG_ERROR("%s: lazy load callback failed", get_desc().c_str()); + return false; + } + int64_t t1 = ggml_time_ms(); + LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + return true; + } + void free_params_buffer() { if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } } size_t get_params_buffer_size() { + size_t total = 0; if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); + total += ggml_backend_buffer_get_size(params_buffer); } - return 0; + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); + } + return total; } void free_cache_ctx_and_buffer() { @@ -2096,11 +2369,23 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset rather than free: keeping the sched alive across compute() + // calls of a sampling loop avoids the per-step rebuild cost. 
+ ggml_backend_sched_reset(sched); + sched_reserved = false; + } offload_params_to_params_backend(); } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so + // it gets a backend assignment (otherwise tensors with no producers + // and no consumers leave sched at backend_id=-1). + if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -2160,6 +2445,9 @@ struct GGMLRunner { int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { + if (!ensure_params_loaded()) { + return std::nullopt; + } if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return std::nullopt; @@ -2168,18 +2456,41 @@ struct GGMLRunner { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); return std::nullopt; } - reset_compute_ctx(); - ggml_cgraph* gf = get_compute_graph(get_graph); - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { - LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return std::nullopt; + ggml_cgraph* gf = nullptr; + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc graph failed", get_desc().c_str()); + return std::nullopt; + } + } else { + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); + return std::nullopt; + } } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend && + ggml_backend_is_cpu(cpu_fallback_backend)) { + ggml_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } - 
ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); return std::nullopt; @@ -2259,6 +2570,14 @@ class GGMLBlock { prefix = prefix + "."; } init_params(ctx, tensor_storage_map, prefix); + // Tag each param tensor with its full (prefix-qualified) name so the + // multi-backend runner's tensor_backend_fn callback can route it. + // Without this, init_params leaves tensors with empty t->name. + for (auto& pair : params) { + if (pair.second != nullptr) { + ggml_set_name(pair.second, (prefix + pair.first).c_str()); + } + } init_blocks(ctx, tensor_storage_map, prefix); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index dfe2a8873..356038146 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -127,6 +127,26 @@ class StableDiffusionGGML { bool fit_cond_offload_params = false; bool fit_vae_offload_params = false; + // Layer-split state (when auto-fit picks GPU_LAYER_SPLIT). Holds the + // ordered list of device names and per-device share bytes; the actual + // backend handles are init'd at construction time and stored in + // *_extra_backends so the destructor can free them. + std::vector fit_dit_split_device_names; + std::vector fit_dit_split_share_bytes; + std::vector fit_dit_extra_backends; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + std::vector fit_cond_extra_backends; + + // Owned model loader: kept alive across init() so lazy_load callbacks + // can re-read tensor data from disk on demand. Only set when at least + // one component is configured for lazy load. 
+ std::unique_ptr owned_model_loader; + // Auto-fit decided init-time SUM exceeds device cap; defer cond + DiT + // allocation until first compute() so peaks don't pile up. + bool auto_lazy_load = false; + bool enable_mmap_member = false; + SDVersion version; bool vae_decode_only = false; bool external_vae_is_invalid = false; @@ -185,6 +205,18 @@ class StableDiffusionGGML { if (diffusion_backend != backend) { ggml_backend_free(diffusion_backend); } + for (auto* b : fit_dit_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } + for (auto* b : fit_cond_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } ggml_backend_free(backend); } @@ -230,7 +262,12 @@ class StableDiffusionGGML { init_backend(sd_ctx_params->main_backend_device); - ModelLoader model_loader; + // Use a stack-local handle that points into `owned_model_loader` if we + // need lazy callbacks (decided after auto-fit), otherwise a temp local + // is fine. Defer the unique_ptr decision; for now always own it so the + // pointer is stable even if lazy load is enabled later in this init(). 
+ owned_model_loader = std::make_unique(); + ModelLoader& model_loader = *owned_model_loader; if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -392,8 +429,19 @@ class StableDiffusionGGML { break; } } - auto resolve = [&](const backend_fit::Decision* d, std::string& out_device, - bool& out_offload) { + auto device_id_to_name = [&](int dev_id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == dev_id) return dev.name; + } + return {}; + }; + auto resolve = [&](const backend_fit::Decision* d, + std::string& out_device, + bool& out_offload, + std::vector& out_split_devices, + std::vector& out_split_shares) { + out_split_devices.clear(); + out_split_shares.clear(); if (d == nullptr) { out_device.clear(); out_offload = false; @@ -404,23 +452,67 @@ class StableDiffusionGGML { out_offload = false; return; } - for (const auto& dev : devices) { - if (dev.id == d->device_id) { - out_device = dev.name; - break; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + // Primary device drives main_backend choice for the model; + // the rest become additional backends in the spec. 
+ for (size_t k = 0; k < d->split_device_ids.size(); k++) { + out_split_devices.push_back(device_id_to_name(d->split_device_ids[k])); + out_split_shares.push_back(d->split_share_bytes[k]); } + if (!out_split_devices.empty()) out_device = out_split_devices[0]; + out_offload = false; + return; } + out_device = device_id_to_name(d->device_id); out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); }; + std::vector dummy_devs; + std::vector dummy_shares; resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), - fit_diffusion_device, fit_dit_offload_params); + fit_diffusion_device, fit_dit_offload_params, + fit_dit_split_device_names, fit_dit_split_share_bytes); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), - fit_vae_device, fit_vae_offload_params); + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), - fit_clip_device, fit_cond_offload_params); + fit_clip_device, fit_cond_offload_params, + fit_cond_split_device_names, fit_cond_split_share_bytes); // CPU placements: leave fit_*_device empty AND remember they're // CPU so the resolver below picks ggml_backend_cpu_init(). + + // Decide auto-lazy-load: if the per-component MAX-based plan fits + // but the SUM-of-components on any device would exceed cap, defer + // alloc until first compute() so peaks don't pile up. Heuristic: + // sum the per-device on_device_bytes across all GPU decisions + // (excluding VAE which is small) and compare to free_bytes. 
+ std::map sum_per_device; + auto add_sum = [&](const backend_fit::Decision* d) { + if (!d) return; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; + } + } else if (d->placement == backend_fit::Placement::GPU || + d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + sum_per_device[d->device_id] += d->on_device_bytes; + } + }; + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER)); + for (const auto& dev : devices) { + int64_t cap = dev.free_bytes - margin_bytes; + int64_t sum = sum_per_device.count(dev.id) ? sum_per_device[dev.id] : 0; + if (sum > cap) { + LOG_INFO("auto-fit: enabling lazy load (init-time SUM %lld MiB on %s " + "exceeds cap %lld MiB; per-component MAX plan needs lazy alloc)", + (long long)(sum / backend_fit::MiB), + dev.name.c_str(), + (long long)(cap / backend_fit::MiB)); + auto_lazy_load = true; + break; + } + } } LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); @@ -493,6 +585,145 @@ class StableDiffusionGGML { const char* vae_dev_name = effective_device(fit_vae_device, sd_ctx_params->vae_backend_device); + // Build the layer-split MultiBackendSpec for a component. Only used + // when auto-fit picked GPU_LAYER_SPLIT for this component. 
+ // - main_backend: the runner's primary backend (also first in the spec) + // - extra_device_names: additional device names to span + // - share_bytes: per-device share (for proportional block partition) + // - tensor_prefix: the model's weight name prefix (e.g., + // "model.diffusion_model.") — used to locate block-indexed tensors + // Returns true if a spec was prepared and pending_spec_storage was + // populated; the caller must set g_pending_multi_backend_spec() + // immediately before constructing the model. + auto prepare_layer_split_spec = [&](ggml_backend_t main_backend, + const std::vector& extra_device_names, + const std::vector& share_bytes, + const std::string& tensor_prefix, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (extra_device_names.size() < 2) return false; // only [main] -> single GPU + // Init the additional backends (skip [0] which is main_backend). + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < extra_device_names.size(); k++) { + ggml_backend_t b = init_named_backend(extra_device_names[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init extra backend %s; falling back to single backend", + extra_device_names[k].c_str()); + return false; + } + out_extra_backends.push_back(b); + all_backends.push_back(b); + } + + // Walk tensor_storage_map to get per-block byte sizes and the + // total non-block bytes that will land on backend[0]. Then + // greedy-partition blocks by byte budget to balance per-backend + // bytes (accounting for non-block fixed load on backend[0]). 
+ int max_block_idx = -1; + static const std::regex block_re( + R"((?:transformer_blocks|joint_blocks|double_blocks|single_blocks|blocks|layers)\.([0-9]+)\.)"); + std::map block_bytes; // block idx -> bytes + int64_t non_block_bytes = 0; + for (const auto& kv : tensor_storage_map) { + if (!tensor_prefix.empty() && kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + std::smatch m; + if (std::regex_search(kv.first, m, block_re)) { + int idx = std::stoi(m[1]); + if (idx > max_block_idx) max_block_idx = idx; + block_bytes[idx] += bytes; + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no blocks found under prefix '%s'; aborting split", + tensor_prefix.c_str()); + return false; + } + const int n_blocks = max_block_idx + 1; + + // Build per-backend byte budgets from share_bytes (ratios). The + // first backend absorbs `non_block_bytes` as a fixed load, so we + // SHRINK its remaining budget for blocks accordingly. + int64_t total_share = 0; + for (auto s : share_bytes) total_share += s; + int64_t total_block_bytes = 0; + for (const auto& kv : block_bytes) total_block_bytes += kv.second; + std::vector backend_block_budgets(share_bytes.size(), 0); + for (size_t k = 0; k < share_bytes.size(); k++) { + int64_t share = int64_t(double(total_block_bytes + non_block_bytes) * + double(share_bytes[k]) / double(total_share)); + if (k == 0) share = std::max(share - non_block_bytes, 0); + backend_block_budgets[k] = share; + } + // Greedy assign each block (in order) to the current backend + // until its budget is filled, then move to the next. 
+ std::vector boundaries(share_bytes.size(), 0); + size_t cur_backend = 0; + int64_t cur_used = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur_backend + 1 < share_bytes.size() && + cur_used + bb > backend_block_budgets[cur_backend] && + cur_used > 0) { + boundaries[cur_backend] = b; + cur_backend++; + cur_used = 0; + } + cur_used += bb; + } + // The remaining backends get the rest, terminating at n_blocks. + for (size_t k = cur_backend; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + // Safety: ensure each backend has at least one block. + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) boundaries[k] = std::min(min_bound, n_blocks); + } + std::string boundary_log = "layer-split [" + tensor_prefix + "] " + + std::to_string(n_blocks) + " blocks: "; + int prev = 0; + for (size_t k = 0; k < all_backends.size() && k < boundaries.size(); k++) { + if (k > 0) boundary_log += ", "; + boundary_log += std::string(ggml_backend_name(all_backends[k])) + "=[" + + std::to_string(prev) + ".." + std::to_string(boundaries[k]) + ")"; + prev = boundaries[k]; + } + LOG_INFO("%s", boundary_log.c_str()); + + // Build the tensor_backend_fn closure. 
+ std::vector backends_capture = all_backends; + std::vector boundaries_capture = boundaries; + std::string prefix_capture = tensor_prefix; + out_spec.tensor_backend_fn = + [backends_capture, boundaries_capture, prefix_capture](const std::string& name) -> ggml_backend_t { + if (!prefix_capture.empty() && + name.compare(0, prefix_capture.size(), prefix_capture) != 0) { + return backends_capture[0]; + } + std::smatch m; + if (!std::regex_search(name, m, block_re)) { + return backends_capture[0]; + } + int idx = std::stoi(m[1]); + for (size_t k = 0; k < boundaries_capture.size(); k++) { + if (idx < boundaries_capture[k]) { + return backends_capture[std::min(k, backends_capture.size() - 1)]; + } + } + return backends_capture.back(); + }; + // Spec contains the additional backends only (main is implicit). + out_spec.additional_backends.assign(out_extra_backends.begin(), out_extra_backends.end()); + out_spec.cpu_fallback = nullptr; + return true; + }; + // Helper: init a named backend if name is non-null/non-empty, // returns nullptr on missing/failed name (caller falls back to main). auto init_named_or_null = [](const char* name) -> ggml_backend_t { @@ -507,6 +738,59 @@ class StableDiffusionGGML { LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); } + // Tensor name sets for components that are configured for lazy load. + // Populated below right before/after the cond + DiT construction; + // consumed by the bulk-load step's ignore_tensors. + std::set cond_lazy_tensor_names; + std::set dit_lazy_tensor_names; + + // Build the layer-split MultiBackendSpec for DiT (when auto-fit picked + // GPU_LAYER_SPLIT). The spec is consumed by the diffusion_model's + // GGMLRunner ctor when we set g_pending_multi_backend_spec() to it. 
+ MultiBackendSpec dit_spec; + bool dit_spec_active = false; + if (!fit_dit_split_device_names.empty()) { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } + // Lambda to set the pending spec immediately before constructing the + // diffusion model. Caller must invoke this on the same line / right + // before the std::make_shared<...Model>(diffusion_backend, ...) call. + auto prime_dit_spec = [&]() { + if (dit_spec_active) { + g_pending_multi_backend_spec() = &dit_spec; + } + }; + + // Same dance for the conditioner. The conditioner uses clip_backend as + // its main backend; we need to set up the spec BEFORE the cond_stage + // ctor runs (which is BEFORE the DiT ctor). Each cond model wraps one + // or more sub-runners; the spec's tensor_backend_fn handles all of + // them since it's keyed on tensor names with a generic block regex. + // (Some conditioners construct multiple sub-runners — only the FIRST + // ggml runner ctor consumes the pending spec, so we re-prime between + // sub-runners' allocs by leaving cond_spec_active true; the runner's + // multi_backend_mode is per-runner.) + // For LTX-2 specifically: LTXAVEmbedder constructs LLMRunner first + // (consumes spec), then LTXAVTextProjectionRunner (no spec consumed). + // The LLM has block-named tensors so layer-split applies; the + // projector has only 4 tensors and they should ride along on its + // single backend (clip_backend = main). Auto-fit's cond share counts + // both, so the share is over-counted on backend[0] for the projector. + // Acceptable for now — small correction. 
+ ggml_backend_t clip_main_backend_for_spec = nullptr; // resolved below + MultiBackendSpec cond_spec; + bool cond_spec_active = false; + auto prime_cond_spec = [&]() { + if (cond_spec_active) { + g_pending_multi_backend_spec() = &cond_spec; + } + }; + { clip_backend = init_named_or_null(clip_dev_name); if (!clip_backend) { @@ -514,10 +798,22 @@ class StableDiffusionGGML { } else { LOG_INFO("CLIP: using device %s", clip_dev_name); } + // Now that clip_backend is resolved, build the conditioner's + // layer-split spec if auto-fit picked it. + if (!fit_cond_split_device_names.empty()) { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", // covers text_encoders.llm.* and text_encoders.t5xxl.* + fit_cond_extra_backends, + cond_spec); + } if (sd_version_is_sd3(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); @@ -539,12 +835,14 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -552,10 +850,12 @@ class StableDiffusionGGML { "", false); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); } + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -563,28 +863,33 @@ class StableDiffusionGGML { sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = 
false; + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, true, 0, true); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { + prime_dit_spec(); high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -605,12 +910,14 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -618,28 +925,34 @@ class StableDiffusionGGML { version, sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { + prime_cond_spec(); cond_stage_model = 
std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -650,6 +963,7 @@ class StableDiffusionGGML { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -657,12 +971,14 @@ class StableDiffusionGGML { version, PM_VERSION_2); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -673,11 +989,61 @@ class StableDiffusionGGML { } } - cond_stage_model->alloc_params_buffer(); - cond_stage_model->get_param_tensors(tensors); + // Conditioner: publish its tensors to the global map, EXCEPT the + // ones that are about to be configured for lazy load (we want the + // bulk loader to skip them — they have no buffer yet). 
+ std::map cond_only_tensors; + cond_stage_model->get_param_tensors(cond_only_tensors); + std::map llm_lazy_map; + if (auto_lazy_load) { + for (const auto& kv : cond_only_tensors) { + if (kv.first.rfind("text_encoders.llm.", 0) == 0) { + llm_lazy_map[kv.first] = kv.second; + cond_lazy_tensor_names.insert(kv.first); + } + } + } + for (const auto& kv : cond_only_tensors) { + if (cond_lazy_tensor_names.find(kv.first) == cond_lazy_tensor_names.end()) { + tensors[kv.first] = kv.second; // eager — bulk loader will fill + } + } + if (auto_lazy_load && !llm_lazy_map.empty()) { + ModelLoader* loader_ptr = owned_model_loader.get(); + int n_threads_capture = sd_ctx_params->n_threads; + bool mmap_capture = sd_ctx_params->enable_mmap; + cond_stage_model->set_llm_lazy_load([=]() -> bool { + auto local_map = llm_lazy_map; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture); + }); + LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", + llm_lazy_map.size()); + } + cond_stage_model->alloc_params_buffer(); // no-op for the lazy LLM - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + std::map dit_only_tensors; + diffusion_model->get_param_tensors(dit_only_tensors); + if (auto_lazy_load) { + for (const auto& kv : dit_only_tensors) { + dit_lazy_tensor_names.insert(kv.first); + } + ModelLoader* loader_ptr = owned_model_loader.get(); + int n_threads_capture = sd_ctx_params->n_threads; + bool mmap_capture = sd_ctx_params->enable_mmap; + diffusion_model->set_lazy_load([=]() -> bool { + auto local_map = dit_only_tensors; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture); + }); + LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", + dit_only_tensors.size()); + } else { + for (const auto& kv : dit_only_tensors) { + tensors[kv.first] = kv.second; + } + } + 
diffusion_model->alloc_params_buffer(); // no-op when lazy_load_fn is set if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -892,6 +1258,14 @@ class StableDiffusionGGML { std::set ignore_tensors; tensors["alphas_cumprod"] = alphas_cumprod_tensor; + // Lazy-loaded components: skip them in the bulk load; their lazy + // callbacks will load them on first compute(). + for (const auto& name : cond_lazy_tensor_names) { + ignore_tensors.insert(name); + } + for (const auto& name : dit_lazy_tensor_names) { + ignore_tensors.insert(name); + } if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } From b8d1c992c39a89401cd3cc0c38a6b300b523079f Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 16:52:00 +0200 Subject: [PATCH 3/9] fix: drop pagecache after each lazy load to bound RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without an explicit posix_fadvise(POSIX_FADV_DONTNEED), the kernel keeps a model file's pages cached as buff/cache long after we're done with it, so loading the LLM (13.7 GB) followed by the DiT (17 GB) piles up to 30+ GB of cached pages on a 32 GB box and triggers the OOM-killer. - Keep the file descriptor alive in MmapWrapperImpl so we can posix_fadvise(POSIX_FADV_DONTNEED) on it before munmap. madvise alone only unmaps the address range — it does not evict pagecache. - Add POSIX_FADV_SEQUENTIAL on open: nudges the kernel toward a smaller working set during the read. - Make the "using mmap" log line INFO instead of DEBUG so the user can confirm at a glance. - Bound the lazy-load worker count to 2: the per-thread staging buffers grow to the largest tensor seen, so n_threads=8 doubles RAM peak for no measurable read-throughput gain. Result on 32 GB box: peak RSS ~6 GB, peak buff/cache ~12 GB during LLM lazy load — comfortably within budget. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/model.cpp | 9 +++++++-- src/stable-diffusion.cpp | 22 ++++++++++++++++++++-- src/util.cpp | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index 8fdde3b76..32dfbed3c 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -783,11 +783,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::unique_ptr mmapped; if (enable_mmap && !is_zip) { - LOG_DEBUG("using mmap for I/O"); mmapped = MmapWrapper::create(file_path); if (!mmapped) { - LOG_WARN("failed to memory-map '%s'", file_path.c_str()); + LOG_WARN("failed to memory-map '%s' (falling back to read())", + file_path.c_str()); + } else { + LOG_INFO("using mmap for '%s'", file_path.c_str()); } + } else if (!is_zip) { + LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", + file_path.c_str()); } int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 356038146..520d17b48 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1010,7 +1010,16 @@ class StableDiffusionGGML { } if (auto_lazy_load && !llm_lazy_map.empty()) { ModelLoader* loader_ptr = owned_model_loader.get(); - int n_threads_capture = sd_ctx_params->n_threads; + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? 
sd_ctx_params->n_threads : 2, + 2); bool mmap_capture = sd_ctx_params->enable_mmap; cond_stage_model->set_llm_lazy_load([=]() -> bool { auto local_map = llm_lazy_map; @@ -1029,7 +1038,16 @@ class StableDiffusionGGML { dit_lazy_tensor_names.insert(kv.first); } ModelLoader* loader_ptr = owned_model_loader.get(); - int n_threads_capture = sd_ctx_params->n_threads; + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? sd_ctx_params->n_threads : 2, + 2); bool mmap_capture = sd_ctx_params->enable_mmap; diffusion_model->set_lazy_load([=]() -> bool { auto local_map = dit_only_tensors; diff --git a/src/util.cpp b/src/util.cpp index 0b514bb73..743738813 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -174,12 +174,33 @@ bool is_directory(const std::string& path) { class MmapWrapperImpl : public MmapWrapper { public: - MmapWrapperImpl(void* data, size_t size) - : MmapWrapper(data, size) {} + MmapWrapperImpl(void* data, size_t size, int fd) + : MmapWrapper(data, size), fd_(fd) {} ~MmapWrapperImpl() override { +#ifdef __linux__ + // Drop the kernel pagecache pages for this file. madvise(DONTNEED) + // alone only unmaps from the process address space; pagecache + // entries persist (`free` reports them as buff/cache and the OOM + // killer doesn't touch them, but they ARE counted against + // overcommit and can starve other allocations on tight-RAM + // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented + // way to evict pagecache for a specific fd's pages. 
+ if (data_ != nullptr && size_ > 0) { + madvise(data_, size_, MADV_DONTNEED); + } + if (fd_ >= 0) { + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + } +#endif munmap(data_, size_); + if (fd_ >= 0) { + close(fd_); + } } + +private: + int fd_; }; std::unique_ptr MmapWrapper::create(const std::string& filename) { @@ -191,9 +212,10 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { int mmap_flags = MAP_PRIVATE; #ifdef __linux__ - // performance flags used by llama.cpp - // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); - // mmap_flags |= MAP_POPULATE; + // Sequential access hint helps the kernel read-ahead efficiently and + // also encourages eviction of already-read pages (the kernel keeps + // a smaller working set when this is set). + posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); #endif struct stat sb; @@ -206,9 +228,8 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0); - close(file_descriptor); - if (mapped_data == MAP_FAILED) { + close(file_descriptor); return nullptr; } @@ -217,7 +238,7 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); #endif - return std::make_unique(mapped_data, file_size); + return std::make_unique(mapped_data, file_size, file_descriptor); } #endif From f8d4a585ff428f00df7a89eddcce4c5ceedb31d8 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 17:26:44 +0200 Subject: [PATCH 4/9] wip: restrict layer-split to DiT, fall back to single-GPU/CPU for Cond MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The layer-split path for the Conditioner has a bug where some sub-runners (LTX-2 text projection) and possibly some Gemma ops route through host memory despite the planner placing them on GPU bufts — process RSS climbs to ~13 GB worth of supposedly-on-GPU 
tensors and CUDA1 reports only ~150 MB used after the lazy load completes. Until that is root-caused, restrict layer-split to the DiT only. The Cond falls back to whatever auto-fit picks (single GPU when it fits, OFFLOAD or CPU otherwise — slow but correct). DiT layer-split is unaffected and continues to work for the user's primary use case (split a 17 GB Q6_K LTX-2 DiT across two GPUs). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 7ca789a0b..59fa491cb 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -273,10 +273,14 @@ inline Plan compute_plan(const std::vector& components, }; // Layer-split is only meaningful for components made up of many similarly - // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE - // is too structurally heterogeneous for naive block partitioning. + // shaped blocks. Currently restricted to the DiT — the Conditioner's + // layer-split path has a known issue where some sub-runners (e.g. LTX-2 + // text projection) and possibly some Gemma ops route through CPU, + // dragging weights back into RAM and tanking performance. Until that + // is fixed, the planner keeps the Conditioner on a single GPU (or + // OFFLOAD / CPU when it doesn't fit). auto supports_layer_split = [](ComponentKind k) { - return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + return k == ComponentKind::DIT; }; auto build_options = [&](const Component& c) { From 155f5235e937dcee556016d3a18532e790f99be8 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 21:40:06 +0200 Subject: [PATCH 5/9] fix: layer-split alloc actually lands on GPU; was a diagnostic artifact Re-enable Conditioner layer-split and add detailed diagnostics around alloc_params_buffer_layer_split + ensure_params_loaded. 
The added prints query ggml_backend_dev_memory before and after each per-backend buft_alloc_buffer and confirm cudaMalloc DOES reserve the requested GPU memory: gemma3_12b layer-split alloc[0] CUDA1 req=10464 MB dev_free 13495 -> 3029 MB (drop 10466 MB) is_host=0 gemma3_12b layer-split alloc[1] CUDA0 req=5177 MB dev_free 7903 -> 2725 MB (drop 5178 MB) is_host=0 Combined Q6_K LTX-2 DiT (17 GB) + Gemma 3 12B Q8_K_XL (15 GB) + text projection (2.2 GB) + VAE (1.4 GB) end-to-end on a 9.8 GB + 15.7 GB GPU pair via layer-split + lazy-load now completes: generate_video completed in 82.72s The earlier "tensors in RAM" symptom was from a stale binary state during the iterative build cycle, not an actual correctness bug. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 10 +++----- src/ggml_extend.hpp | 61 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 59fa491cb..7ca789a0b 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -273,14 +273,10 @@ inline Plan compute_plan(const std::vector& components, }; // Layer-split is only meaningful for components made up of many similarly - // shaped blocks. Currently restricted to the DiT — the Conditioner's - // layer-split path has a known issue where some sub-runners (e.g. LTX-2 - // text projection) and possibly some Gemma ops route through CPU, - // dragging weights back into RAM and tanking performance. Until that - // is fixed, the planner keeps the Conditioner on a single GPU (or - // OFFLOAD / CPU when it doesn't fit). + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. 
auto supports_layer_split = [](ComponentKind k) { - return k == ComponentKind::DIT; + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; }; auto build_options = [&](const Component& c) { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index cd1662523..64095764e 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2202,6 +2202,16 @@ struct GGMLRunner { for (size_t i = 0; i < backends.size(); i++) { bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + // Diagnostic: confirm we got a sensible buft from each backend. + const char* buft_name = ggml_backend_buft_name(bufts[i]); + const char* backend_name = ggml_backend_name(backends[i]); + ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); + enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; + const char* dev_name = dev ? ggml_backend_dev_name(dev) : "(none)"; + LOG_INFO("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + get_desc().c_str(), i, backend_name ? backend_name : "(null)", + buft_name ? buft_name : "(null)", dev_name, + (int)dev_type); } // First pass: assign each tensor to a backend, accumulate sizes. @@ -2230,6 +2240,10 @@ struct GGMLRunner { multi_params_buffers.assign(backends.size(), nullptr); for (size_t i = 0; i < backends.size(); i++) { if (sizes[i] == 0) continue; + // Diagnostic: query the device's free memory BEFORE alloc. + ggml_backend_dev_t dev_pre = ggml_backend_buft_get_device(bufts[i]); + size_t free_pre = 0, total_pre = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_pre, &total_pre); multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); if (multi_params_buffers[i] == nullptr) { LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", @@ -2238,6 +2252,22 @@ struct GGMLRunner { sizes[i] / (1024.f * 1024.f)); return false; } + // Diagnostic: query AFTER alloc. 
The drop in free memory tells + // us whether the alloc actually went to GPU device memory or + // to a virtual reservation that's not yet committed. + size_t free_post = 0, total_post = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_post, &total_post); + int64_t actual_drop = (int64_t)free_pre - (int64_t)free_post; + void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); + size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); + bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); + LOG_INFO("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", + get_desc().c_str(), i, ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), + free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f), + actual_drop / (1024.f * 1024.f), + base, (int)is_host); } // Bind tensors via ggml_tallocr. @@ -2255,6 +2285,18 @@ struct GGMLRunner { return false; } } + // Diagnostic: pick a sample tensor per backend and confirm its + // buffer + data pointer. + std::vector sampled(backends.size(), false); + for (auto& kv : tensor_backend_idx) { + int idx = kv.second; + if (sampled[idx]) continue; + sampled[idx] = true; + ggml_tensor* t = kv.first; + LOG_INFO("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, + t->buffer ? (int)ggml_backend_buffer_is_host(t->buffer) : -1); + } for (auto* buf : multi_params_buffers) { if (buf != nullptr) { ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); @@ -2327,6 +2369,25 @@ struct GGMLRunner { } int64_t t1 = ggml_time_ms(); LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + // Diagnostic: report device-memory free per backend AFTER load. 
+ // If the bytes actually went to GPU, free should have decreased + // by ~params_size for each layer-split shard. + if (multi_backend_mode) { + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + for (size_t i = 0; i < backends.size(); i++) { + ggml_backend_dev_t dev = ggml_backend_get_device(backends[i]); + if (!dev) continue; + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + LOG_INFO("%s post-load device %s free=%.1f MB / %.1f MB", + get_desc().c_str(), + ggml_backend_dev_name(dev), + free_b / (1024.f * 1024.f), + total_b / (1024.f * 1024.f)); + } + } return true; } From 3c874965c905fc227f5639f4f5d24fe0882550f0 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 21:46:58 +0200 Subject: [PATCH 6/9] feat: add --quiet-unknown-tensors; demote layer-split diagnostics to DEBUG - New CLI flag --quiet-unknown-tensors (sd_ctx_params_t.quiet_unknown_tensors) suppresses the per-tensor 'unknown tensor X in model file' log lines emitted during model loading. LTX-2 ships ~4600 audio-branch and encoder tensors a video-only pipeline doesn't consume; without this flag the load output is drowned out by them. A single summary line is emitted at the end with the count of skipped tensors. - The flag is plumbed through all three load paths: - bulk loader at init (eager components: VAE, projector) - lazy LLM load callback (Conditioner) - lazy DiT load callback - ModelLoader::load_tensors gains a quiet_unknown_tensors=false default parameter so existing callers keep their current behaviour. - The four layer-split diagnostic LOG_INFO lines (backend, alloc, sample, post-load) are demoted to LOG_DEBUG; they're noisy and only useful when triaging the bug we just chased. The 'layer-split params on CUDA*: X MB' summary line stays at INFO since it shows the user how params were distributed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/common/common.cpp | 8 ++++++++ examples/common/common.h | 4 ++++ include/stable-diffusion.h | 7 +++++++ src/ggml_extend.hpp | 8 ++++---- src/model.cpp | 14 ++++++++++++-- src/model.h | 3 ++- src/stable-diffusion.cpp | 15 ++++++++++++--- 7 files changed, 49 insertions(+), 10 deletions(-) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index d3626fcce..2bbd1d216 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -524,6 +524,12 @@ ArgOptions SDContextParams::get_options() { "--fit-dry-run", "auto-fit: print the computed plan and exit without loading models", true, &auto_fit_dry_run}, + {"", + "--quiet-unknown-tensors", + "suppress per-tensor 'unknown tensor X in model file' log lines " + "(useful for LTX-2 and similar models that ship many unused " + "tensors); a single summary line with the count is logged instead", + true, &quiet_unknown_tensors}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -756,6 +762,7 @@ std::string SDContextParams::to_string() const { << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? 
"true" : "false") << ",\n" @@ -839,6 +846,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f auto_fit_compute_reserve_vae_mb, auto_fit_compute_reserve_cond_mb, auto_multi_gpu, + quiet_unknown_tensors, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index 8243d6cba..e21a68142 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -144,6 +144,10 @@ struct SDContextParams { int auto_fit_compute_reserve_cond_mb = 0; bool auto_multi_gpu = true; + // When set, the model loader skips per-tensor "unknown tensor" log + // lines and instead emits a single summary count at the end of load. + bool quiet_unknown_tensors = false; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index ed6336ba1..2de064288 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -233,6 +233,13 @@ typedef struct { // they fit. Defaults to true. Each component still lives entirely on // one device — no intra-tensor row split. bool auto_multi_gpu; + + // Suppress per-tensor "unknown tensor 'X' in model file" log lines + // emitted during model loading. Useful for models like LTX-2 that + // ship hundreds of audio-branch / encoder tensors a video-only + // pipeline doesn't consume. A single summary line is logged at the + // end with the count of skipped tensors. + bool quiet_unknown_tensors; } sd_ctx_params_t; typedef struct { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 64095764e..54ec2fd1a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2208,7 +2208,7 @@ struct GGMLRunner { ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; const char* dev_name = dev ? 
ggml_backend_dev_name(dev) : "(none)"; - LOG_INFO("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + LOG_DEBUG("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", get_desc().c_str(), i, backend_name ? backend_name : "(null)", buft_name ? buft_name : "(null)", dev_name, (int)dev_type); @@ -2261,7 +2261,7 @@ struct GGMLRunner { void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); - LOG_INFO("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + LOG_DEBUG("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), @@ -2293,7 +2293,7 @@ struct GGMLRunner { if (sampled[idx]) continue; sampled[idx] = true; ggml_tensor* t = kv.first; - LOG_INFO("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + LOG_DEBUG("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, t->buffer ? 
(int)ggml_backend_buffer_is_host(t->buffer) : -1); } @@ -2381,7 +2381,7 @@ struct GGMLRunner { if (!dev) continue; size_t free_b = 0, total_b = 0; ggml_backend_dev_memory(dev, &free_b, &total_b); - LOG_INFO("%s post-load device %s free=%.1f MB / %.1f MB", + LOG_DEBUG("%s post-load device %s free=%.1f MB / %.1f MB", get_desc().c_str(), ggml_backend_dev_name(dev), free_b / (1024.f * 1024.f), diff --git a/src/model.cpp b/src/model.cpp index 32dfbed3c..2f7e2b78f 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1008,9 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, int n_threads, - bool enable_mmap) { + bool enable_mmap, + bool quiet_unknown_tensors) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; + std::atomic unknown_tensor_count{0}; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); @@ -1028,7 +1030,11 @@ bool ModelLoader::load_tensors(std::map& tensors, return true; } } - LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + if (quiet_unknown_tensors) { + unknown_tensor_count.fetch_add(1); + } else { + LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + } return true; } @@ -1077,6 +1083,10 @@ bool ModelLoader::load_tensors(std::map& tensors, if (some_tensor_not_init) { return false; } + if (quiet_unknown_tensors && unknown_tensor_count.load() > 0) { + LOG_INFO("skipped %zu unknown tensors (--quiet-unknown-tensors)", + unknown_tensor_count.load()); + } return true; } diff --git a/src/model.h b/src/model.h index 10aaf8512..03d4e3732 100644 --- a/src/model.h +++ b/src/model.h @@ -226,7 +226,8 @@ class ModelLoader { bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, int n_threads = 0, - bool use_mmap = 
false); + bool use_mmap = false, + bool quiet_unknown_tensors = false); std::vector get_tensor_names() const { std::vector names; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 520d17b48..181dd8350 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1021,10 +1021,12 @@ class StableDiffusionGGML { ? sd_ctx_params->n_threads : 2, 2); bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; cond_stage_model->set_llm_lazy_load([=]() -> bool { auto local_map = llm_lazy_map; return loader_ptr->load_tensors(local_map, /*ignore=*/{}, - n_threads_capture, mmap_capture); + n_threads_capture, mmap_capture, + quiet_capture); }); LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", llm_lazy_map.size()); @@ -1049,10 +1051,12 @@ class StableDiffusionGGML { ? sd_ctx_params->n_threads : 2, 2); bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; diffusion_model->set_lazy_load([=]() -> bool { auto local_map = dit_only_tensors; return loader_ptr->load_tensors(local_map, /*ignore=*/{}, - n_threads_capture, mmap_capture); + n_threads_capture, mmap_capture, + quiet_capture); }); LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", dit_only_tensors.size()); @@ -1313,7 +1317,9 @@ class StableDiffusionGGML { ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, + sd_ctx_params->enable_mmap, + sd_ctx_params->quiet_unknown_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -2695,6 +2701,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { 
sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->quiet_unknown_tensors = false; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2742,6 +2749,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "auto_fit_compute_reserve_vae_mb: %d\n" "auto_fit_compute_reserve_cond_mb: %d\n" "auto_multi_gpu: %s\n" + "quiet_unknown_tensors: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2787,6 +2795,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb, sd_ctx_params->auto_fit_compute_reserve_cond_mb, BOOL_STR(sd_ctx_params->auto_multi_gpu), + BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), From d9d38baa58f0765a82dd2d4c81709a4a68d1b8b9 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 22:48:30 +0200 Subject: [PATCH 7/9] feat: add row-split (cuda_split_buffer_type) alongside layer-split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restores the row-split path that worked on the original LTX-2 branch: matmul weight tensors are split row-wise across CUDA devices via cuda_split_buffer_type and the CUDA backend handles cross-device dispatch internally. Sched still wires the additional CUDA backends so it can route copies between devices, but no per-block buffer doubling — the compute buffer is dramatically smaller than layer- split for cross-backend graphs. Surface: - New CLI: --multi-gpu-mode={row,layer,off} (default: row). - New API field: sd_ctx_params_t.multi_gpu_mode. - New planner placement: GPU_TENSOR_SPLIT, scored slightly above GPU_LAYER_SPLIT so the planner prefers it when both fit. 
Changes: - backend_fit::Placement gains GPU_TENSOR_SPLIT and a MultiGpuMode enum; build_options enumerates only the chosen mode's split options. Decision population sorts split_device_ids by descending TOTAL memory (always use the bigger GPU as main). - gpu_peak handles GPU_TENSOR_SPLIT correctly: per-device share + compute reserve added to the biggest-memory GPU only. - MultiBackendSpec gains a `mode` field. ROW_SPLIT carries tensor_split_ratios + main_device; LAYER_SPLIT carries tensor_backend_fn. - GGMLRunner ctor branches on mode: ROW_SPLIT initializes cuda_split_buffer_type; LAYER_SPLIT consumes the backend callback. - alloc_params_buffer_row_split: walks params_ctx, splits into matmul-eligible (row-split-buft) vs main (default buft) buffers, binds via tallocr. is_row_split_eligible excludes views, so the cuda split buft never sees a view tensor. - free_params_buffer + ensure_params_loaded recognize the new row_split_buffer / row_main_buffer fields so lazy load doesn't re-trigger on subsequent compute() calls. - Spec wiring in stable-diffusion.cpp: prepare_row_split_spec computes per-device ratios from the planner's share_bytes, picks main_device by largest share, and inits the additional CUDA backends so sched can schedule cross-device copies. - CMakeLists.txt: add -DSD_USE_CUDA when SD_CUDA is enabled (had been silently undefined, leaving all #ifdef SD_USE_CUDA blocks inactive — a latent bug that broke row-split alloc). - ggml-cuda.cu: re-add the small view-init early-return in ggml_backend_cuda_split_buffer_init_tensor. Without this, sched's per-tensor init crashes on view tensors of split-tensor weights. The row-split path itself routes views to the main buffer (not the split-buft) via is_row_split_eligible, but sched-managed scratch tensors still hit the split buft for op outputs. - Lazy-load auto-detect now counts GPU_TENSOR_SPLIT shares in its init-time-SUM check (was missing — the planner thought everything fit and lazy-load never triggered). 
Known issue: at LTX-2 Q6_K + Gemma 12B Q8_K_XL scale, the per-tensor cudaMalloc fragmentation inside cuda_split_buffer_type's init_tensor exhausts CUDA0's small-alloc pool during DiT load even though the planner's MAX-based peak fits. Pre-existing limitation of the split buft; needs a separate followup (e.g. coalesce tensors into a single big alloc, or use VMM-backed managed memory). Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 1 + examples/common/common.cpp | 10 +++ examples/common/common.h | 3 + include/stable-diffusion.h | 19 ++++- src/backend_fit.hpp | 133 +++++++++++++++++++++++++---- src/ggml_extend.hpp | 171 ++++++++++++++++++++++++++++++++++--- src/stable-diffusion.cpp | 159 ++++++++++++++++++++++++++++------ 7 files changed, 443 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48ce456ea..32375b163 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF if(SD_CUDA) message("-- Use CUDA as backend stable-diffusion") set(GGML_CUDA ON) + add_definitions(-DSD_USE_CUDA) endif() if(SD_METAL) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 2bbd1d216..36f1c6a86 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -420,6 +420,14 @@ ArgOptions SDContextParams::get_options() { "--vision-backend-device", "ggml device name for the vision model (currently routed through main).", &vision_backend_device}, + {"", + "--multi-gpu-mode", + "auto-fit multi-GPU split mechanism: 'row' (default; CUDA-only " + "row-split via cuda_split_buffer_type, single backend, smaller " + "compute buffer), 'layer' (block-indexed tensors split across " + "per-block backends + sched, generic but ~2x activation cost at " + "boundaries), or 'off' (never split a single component)", + &multi_gpu_mode}, }; options.int_options = { @@ -762,6 +770,7 @@ std::string SDContextParams::to_string() const { << " 
auto_fit_target_mb: " << auto_fit_target_mb << ",\n" << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" @@ -846,6 +855,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f auto_fit_compute_reserve_vae_mb, auto_fit_compute_reserve_cond_mb, auto_multi_gpu, + multi_gpu_mode.empty() ? nullptr : multi_gpu_mode.c_str(), quiet_unknown_tensors, }; return sd_ctx_params; diff --git a/examples/common/common.h b/examples/common/common.h index e21a68142..1df32f9c0 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -143,6 +143,9 @@ struct SDContextParams { int auto_fit_compute_reserve_vae_mb = 0; int auto_fit_compute_reserve_cond_mb = 0; bool auto_multi_gpu = true; + // "row" (default), "layer", or "off". Selects the multi-GPU split + // mechanism the auto-fit planner is allowed to emit. + std::string multi_gpu_mode = "row"; // When set, the model loader skips per-tensor "unknown tensor" log // lines and instead emits a single summary count at the end of load. diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 2de064288..7da5324b4 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -230,10 +230,25 @@ typedef struct { // When more than one GPU device is present, prefer placing different // components on different GPUs to balance load and fit larger total // working sets. Set false to keep all components on a single GPU when - // they fit. Defaults to true. Each component still lives entirely on - // one device — no intra-tensor row split. + // they fit. Defaults to true. 
bool auto_multi_gpu; + // When auto_multi_gpu is true and a single component doesn't fit on + // one GPU, the planner can split it across multiple GPUs. Two + // mechanisms: + // "row": matmul weights row-split across CUDA devices via + // cuda_split_buffer_type. Single CUDA backend; no sched. + // Cheaper compute buffer (no cross-backend doubling) but + // CUDA-only. Default. + // "layer": block-indexed tensors assigned to per-block backends + // and routed via ggml_backend_sched. Generic across + // backends but costs ~2x activation memory at boundaries. + // "off": never split a single component across GPUs. Components + // that don't fit fall back to OFFLOAD or CPU. + // The string is parsed by backend_fit::str_to_multi_gpu_mode; if + // unrecognized, "row" is used. + const char* multi_gpu_mode; + // Suppress per-tensor "unknown tensor 'X' in model file" log lines // emitted during model loading. Useful for models like LTX-2 that // ship hundreds of audio-branch / encoder tensors a video-only diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 7ca789a0b..b95632750 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -45,7 +45,8 @@ enum class Placement { CPU, GPU, GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU - GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) }; struct Component { @@ -94,6 +95,28 @@ struct ComputeReserves { int64_t conditioner_bytes = int64_t(512) * MiB; }; +enum class MultiGpuMode { + OFF, // never split a single component across GPUs + ROW, // CUDA-only: row-split matmul weights via cuda_split_buffer_type + LAYER, // generic: assign block-indexed tensors to per-block backends + sched +}; + +inline const char* multi_gpu_mode_str(MultiGpuMode m) { + switch (m) { + case MultiGpuMode::OFF: return "off"; + case 
MultiGpuMode::ROW: return "row"; + case MultiGpuMode::LAYER: return "layer"; + } + return "?"; +} + +inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) { + if (s == "off") return MultiGpuMode::OFF; + if (s == "row") return MultiGpuMode::ROW; + if (s == "layer") return MultiGpuMode::LAYER; + return MultiGpuMode::ROW; // default +} + // --- Classification ------------------------------------------------------- inline bool classify_tensor(const std::string& name, ComponentKind& out) { @@ -231,6 +254,34 @@ inline int64_t gpu_peak(int gpu_idx, if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { if (dev[i] != gpu_idx) continue; footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + /*compute_bytes=*/0, + devices, gpu_idxs); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { // dev[i] holds the bitmask of participating GPU indices into the // devices[] vector (encoded by the planner). Look up our slot. 
@@ -258,9 +309,13 @@ inline int64_t gpu_peak(int gpu_idx, inline Plan compute_plan(const std::vector& components, const std::vector& devices, int64_t margin_bytes, - bool allow_multi_gpu = true) { + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { const size_t nC = components.size(); const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } std::vector cap(nG, 0); for (size_t g = 0; g < nG; g++) { @@ -287,9 +342,15 @@ inline Plan compute_plan(const std::vector& components, opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); } } - // Layer-split: enumerate non-trivial subsets of GPUs (size >= 2). - // Encode the participating set as a bitmask in device_idx. - if (allow_multi_gpu && nG >= 2 && supports_layer_split(c.kind)) { + // Multi-GPU split: one option type per mode. Encoded as a bitmask + // of participating GPUs in device_idx. + if (mode == MultiGpuMode::ROW && nG >= 2 && supports_layer_split(c.kind)) { + // Row-split spans all GPUs; single option with all bits set. + int all_mask = (1 << nG) - 1; + opts.push_back({Placement::GPU_TENSOR_SPLIT, all_mask}); + } + if (mode == MultiGpuMode::LAYER && nG >= 2 && supports_layer_split(c.kind)) { + // Layer-split: enumerate non-trivial subsets (size >= 2). const int max_mask = 1 << nG; for (int mask = 1; mask < max_mask; mask++) { if (__builtin_popcount(mask) < 2) continue; @@ -326,6 +387,15 @@ inline Plan compute_plan(const std::vector& components, } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { s += 5 * pw; gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. 
+ s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { // Better than CPU but worse than fitting on a single GPU // (cross-GPU traffic between blocks). @@ -418,6 +488,36 @@ inline Plan compute_plan(const std::vector& components, d.device_id = DEVICE_ID_CPU; d.on_host_bytes = c.params_bytes + c.compute_bytes; plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, /*compute_bytes=*/0, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (gets the row-split's compute + // buffer + sub-runners that don't get their own spec). This + // matches the user's preference: always use the bigger GPU + // as main for splits. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + int64_t share = shares[k]; + if (pos == 0) share += c.compute_bytes; // main (= biggest) gets compute + d.split_share_bytes.push_back(share); + max_share = std::max(max_share, share); + } + d.device_id = d.split_device_ids.empty() ? 
DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { std::vector gpu_idxs; for (size_t k = 0; k < nG; k++) { @@ -425,15 +525,15 @@ inline Plan compute_plan(const std::vector& components, } auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, devices, gpu_idxs); - // Sort participating GPUs by descending share so the LARGEST-share - // GPU is listed first. Sub-runners that don't get the layer-split - // spec (e.g. the LTX-2 text projection) follow the "main" backend - // (= first in this list) — putting the biggest one first keeps - // them on the GPU with most headroom. + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. std::vector order(gpu_idxs.size()); std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), - [&](size_t a, size_t b) { return shares[a] > shares[b]; }); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); int64_t max_share = 0; for (size_t pos = 0; pos < order.size(); pos++) { @@ -471,6 +571,7 @@ inline const char* placement_str(Placement p) { case Placement::GPU: return "GPU"; case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; } return "?"; } @@ -506,15 +607,17 @@ inline void print_plan(const Plan& plan, LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", d.name.c_str(), d.device_id, (long long)(d.on_device_bytes / MiB)); - } else if (d.placement == Placement::GPU_LAYER_SPLIT) { + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { 
std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? "row" : "layer"; for (size_t k = 0; k < d.split_device_ids.size(); k++) { if (k > 0) ids += "+"; ids += "GPU" + std::to_string(d.split_device_ids[k]); ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; } - LOG_INFO(" %-12s -> %s", - d.name.c_str(), ids.c_str()); + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); } else { LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", d.name.c_str(), d.device_id, diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 54ec2fd1a..de2bbcd2b 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -25,6 +25,9 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" +#ifdef SD_USE_CUDA +#include "ggml-cuda.h" +#endif #include "ggml_extend_backend.hpp" #include "model.h" @@ -1721,16 +1724,29 @@ struct GGMLRunnerContext { // To enable: populate g_pending_multi_backend_spec() with the additional // backends + tensor->backend callback, then construct the GGMLRunner. The // ctor consumes and clears the pending pointer. +enum class MultiBackendMode { + LAYER_SPLIT, // assign block-indexed tensors to per-block backends + sched + ROW_SPLIT, // CUDA split-buft: matmul weights row-split across devices +}; + struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + // Extra backends *in addition to* the runner's main runtime_backend. // The first entry's role is the main backend; we don't list it here. std::vector additional_backends; - // Maps a weight tensor name to one of the runner's backends (the main - // runtime_backend, or one of additional_backends). Returning nullptr - // means "use the main runtime_backend". + // LAYER_SPLIT: maps a weight tensor name to one of the runner's + // backends (the main runtime_backend, or one of additional_backends). + // Returning nullptr means "use the main runtime_backend". 
std::function tensor_backend_fn; + // ROW_SPLIT (CUDA-only): per-device row split ratios (length = total + // CUDA device count) and main device. Empty means use CUDA's default + // free-VRAM proportions. + std::vector tensor_split_ratios; + int main_device = 0; + // Optional CPU backend appended last to the sched for unsupported-op // fallback. May be nullptr. ggml_backend_t cpu_fallback = nullptr; @@ -1748,17 +1764,25 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; - // --- multi-backend (layer-split) state --- - bool multi_backend_mode = false; + // --- multi-backend state (layer-split via sched OR row-split via cuda_split_buft) --- + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; std::vector additional_backends; ggml_backend_t cpu_fallback_backend = nullptr; bool owns_cpu_fallback_backend = false; std::function tensor_backend_fn = nullptr; ggml_backend_sched_t sched = nullptr; bool sched_reserved = false; - // Per-backend params buffers when multi_backend_mode is on. - // params_buffer (single-backend) stays nullptr in this mode. + // Per-backend params buffers when LAYER_SPLIT is active. ROW_SPLIT uses + // a CUDA split-buft buffer + a regular buffer for non-split tensors, + // stored in row_split_buffer + row_main_buffer instead. std::vector multi_params_buffers; + // ROW_SPLIT-only state. + std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; + ggml_backend_buffer_t row_main_buffer = nullptr; // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual // alloc + tensor-data load is deferred until the first compute(). 
The @@ -2122,17 +2146,40 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { - // Consume any pending multi-backend (layer-split) spec set by the - // caller via g_pending_multi_backend_spec(). + // Consume any pending multi-backend spec set by the caller via + // g_pending_multi_backend_spec(). MultiBackendSpec* pending = g_pending_multi_backend_spec(); if (pending != nullptr) { g_pending_multi_backend_spec() = nullptr; multi_backend_mode = true; + multi_backend_kind = pending->mode; additional_backends = pending->additional_backends; tensor_backend_fn = pending->tensor_backend_fn; cpu_fallback_backend = pending->cpu_fallback; - if (offload_params_to_cpu) { - LOG_WARN("multi-backend layer-split is incompatible with " + row_split_ratios = pending->tensor_split_ratios; + row_main_device = pending->main_device; +#ifdef SD_USE_CUDA + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = ggml_backend_cuda_split_buffer_type( + row_main_device, + row_split_ratios.empty() ? nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("multi-backend: cuda split buft init failed; " + "falling back to single-backend mode"); + multi_backend_mode = false; + additional_backends.clear(); + cpu_fallback_backend = nullptr; + } + } +#else + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + LOG_WARN("multi-backend: row-split requires CUDA; " + "falling back to single-backend mode"); + multi_backend_mode = false; + } +#endif + if (multi_backend_mode && offload_params_to_cpu) { + LOG_WARN("multi-backend split is incompatible with " "offload_params_to_cpu; ignoring offload"); offload_params_to_cpu = false; } @@ -2315,10 +2362,99 @@ struct GGMLRunner { return true; } + // Heuristic for row-split eligibility: contiguous, rank-2, both dims + // >= 256, and NOT a view. 
1D biases / norms / embeddings / small + // projections / views fall back to the main GPU's regular per-device + // buft. Excluding views avoids the cuda split buft's + // GGML_ASSERT(view_src == nullptr) — sticking to the buft's documented + // contract instead of patching ggml. + static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + bool alloc_params_buffer_row_split() { +#ifdef SD_USE_CUDA + ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); + const size_t main_align = ggml_backend_buft_get_alignment(main_buft); + const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); + + size_t main_size = 0, split_size = 0; + size_t main_count = 0, split_count = 0; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + if (is_row_split_eligible(t)) { + size_t s = ggml_backend_buft_get_alloc_size(row_split_buft, t); + split_size += GGML_PAD(s, split_align); + split_count++; + } else { + size_t s = ggml_backend_buft_get_alloc_size(main_buft, t); + main_size += GGML_PAD(s, main_align); + main_count++; + } + } + + if (main_size > 0) { + row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size); + if (row_main_buffer == nullptr) { + LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", + get_desc().c_str(), main_size / (1024.f * 1024.f)); + return false; + } + } + if (split_size > 0) { + row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size); + if (row_split_buffer == nullptr) { + LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", + get_desc().c_str(), split_size / (1024.f * 1024.f)); + return false; + } + } + + ggml_tallocr main_alloc{}; + ggml_tallocr split_alloc{}; + if (row_main_buffer != 
nullptr) main_alloc = ggml_tallocr_new(row_main_buffer); + if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer); + + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + ggml_status st = is_row_split_eligible(t) + ? ggml_tallocr_alloc(&split_alloc, t) + : ggml_tallocr_alloc(&main_alloc, t); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s row-split tallocr_alloc failed for tensor %s", + get_desc().c_str(), t->name); + return false; + } + } + + if (row_main_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + if (row_split_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)", + get_desc().c_str(), + main_size / (1024.f * 1024.f), main_count, + split_size / (1024.f * 1024.f), split_count); + return true; +#else + LOG_ERROR("alloc_params_buffer_row_split called without CUDA"); + return false; +#endif + } + // Internal: always materializes the params buffer. Used by both the // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` // path; the latter must bypass the lazy-skip. 
bool do_alloc_params_buffer() { + if (multi_backend_mode && multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } if (multi_backend_mode) { return alloc_params_buffer_layer_split(); } @@ -2354,7 +2490,8 @@ struct GGMLRunner { } bool ensure_params_loaded() { - if (params_buffer != nullptr || !multi_params_buffers.empty()) { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { return true; } if (!lazy_load_fn) { @@ -2402,6 +2539,14 @@ struct GGMLRunner { } } multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } if (sched != nullptr) { ggml_backend_sched_free(sched); sched = nullptr; @@ -2417,6 +2562,8 @@ struct GGMLRunner { for (auto* buf : multi_params_buffers) { if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); } + if (row_split_buffer != nullptr) total += ggml_backend_buffer_get_size(row_split_buffer); + if (row_main_buffer != nullptr) total += ggml_backend_buffer_get_size(row_main_buffer); return total; } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 181dd8350..682b2347d 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -127,16 +127,20 @@ class StableDiffusionGGML { bool fit_cond_offload_params = false; bool fit_vae_offload_params = false; - // Layer-split state (when auto-fit picks GPU_LAYER_SPLIT). Holds the - // ordered list of device names and per-device share bytes; the actual - // backend handles are init'd at construction time and stored in - // *_extra_backends so the destructor can free them. + // Multi-GPU split state (LAYER_SPLIT or ROW_SPLIT). 
Holds the ordered + // list of device names and per-device share bytes; the actual backend + // handles are init'd at construction time and stored in *_extra_backends + // so the destructor can free them. fit_*_row_split=true means use the + // CUDA row-split path (matmul weights split row-wise via cuda_split_buft); + // false means layer-split (per-block backend assignment via sched). std::vector fit_dit_split_device_names; std::vector fit_dit_split_share_bytes; std::vector fit_dit_extra_backends; + bool fit_dit_row_split = false; std::vector fit_cond_split_device_names; std::vector fit_cond_split_share_bytes; std::vector fit_cond_extra_backends; + bool fit_cond_row_split = false; // Owned model loader: kept alive across init() so lazy_load callbacks // can re-read tensor data from disk on demand. Only set when at least @@ -410,8 +414,11 @@ class StableDiffusionGGML { auto devices = backend_fit::enumerate_gpu_devices(); int64_t margin_bytes = int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + backend_fit::MultiGpuMode mode = backend_fit::str_to_multi_gpu_mode( + SAFE_STR(sd_ctx_params->multi_gpu_mode)); auto plan = backend_fit::compute_plan( - components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu); + components, devices, margin_bytes, + sd_ctx_params->auto_multi_gpu, mode); backend_fit::print_plan(plan, components, devices, margin_bytes); if (sd_ctx_params->auto_fit_dry_run) { @@ -439,9 +446,11 @@ class StableDiffusionGGML { std::string& out_device, bool& out_offload, std::vector& out_split_devices, - std::vector& out_split_shares) { + std::vector& out_split_shares, + bool& out_row_split) { out_split_devices.clear(); out_split_shares.clear(); + out_row_split = false; if (d == nullptr) { out_device.clear(); out_offload = false; @@ -452,7 +461,8 @@ class StableDiffusionGGML { out_offload = false; return; } - if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + if (d->placement == 
backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { // Primary device drives main_backend choice for the model; // the rest become additional backends in the spec. for (size_t k = 0; k < d->split_device_ids.size(); k++) { @@ -461,6 +471,7 @@ class StableDiffusionGGML { } if (!out_split_devices.empty()) out_device = out_split_devices[0]; out_offload = false; + out_row_split = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT); return; } out_device = device_id_to_name(d->device_id); @@ -468,14 +479,18 @@ class StableDiffusionGGML { }; std::vector dummy_devs; std::vector dummy_shares; + bool dummy_row_split = false; resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), fit_diffusion_device, fit_dit_offload_params, - fit_dit_split_device_names, fit_dit_split_share_bytes); + fit_dit_split_device_names, fit_dit_split_share_bytes, + fit_dit_row_split); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), - fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares); + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares, + dummy_row_split); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), fit_clip_device, fit_cond_offload_params, - fit_cond_split_device_names, fit_cond_split_share_bytes); + fit_cond_split_device_names, fit_cond_split_share_bytes, + fit_cond_row_split); // CPU placements: leave fit_*_device empty AND remember they're // CPU so the resolver below picks ggml_backend_cpu_init(). 
@@ -488,7 +503,8 @@ class StableDiffusionGGML { std::map sum_per_device; auto add_sum = [&](const backend_fit::Decision* d) { if (!d) return; - if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { for (size_t k = 0; k < d->split_device_ids.size(); k++) { sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; } @@ -585,6 +601,84 @@ class StableDiffusionGGML { const char* vae_dev_name = effective_device(fit_vae_device, sd_ctx_params->vae_backend_device); + // Build the row-split MultiBackendSpec for a component (ROW_SPLIT + // mode). Unlike layer-split, the runner uses a SINGLE CUDA backend; + // matmul weights are row-split across all CUDA devices internally + // by cuda_split_buffer_type. extra_backends stays empty. + // - share_devices/share_bytes: per-device share order from auto-fit + // (largest first by descending share). The first device is the + // "main" CUDA device, where the compute buffer lives. + // Returns true on success; populates out_spec.tensor_split_ratios + // with a vector of length total CUDA device count. + auto prepare_row_split_spec = [&](const std::vector& share_devices, + const std::vector& share_bytes, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { +#ifdef SD_USE_CUDA + const int cuda_count = ggml_backend_cuda_get_device_count(); + if (cuda_count <= 0 || share_devices.size() < 2) return false; + + // Map device names like "CUDA0" -> 0, "CUDA1" -> 1, ... + auto cuda_index_of = [](const std::string& name) -> int { + if (name.rfind("CUDA", 0) != 0) return -1; + try { return std::stoi(name.substr(4)); } catch (...) 
{ return -1; } + }; + + std::vector ratios(cuda_count, 0.0f); + int64_t total = 0; + for (auto b : share_bytes) total += b; + if (total <= 0) return false; + int main_dev = -1; + int64_t max_share = -1; + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = cuda_index_of(share_devices[k]); + if (idx < 0 || idx >= cuda_count) continue; + ratios[idx] = float(double(share_bytes[k]) / double(total)); + if (share_bytes[k] > max_share) { + max_share = share_bytes[k]; + main_dev = idx; + } + } + if (main_dev < 0) return false; + + // Init extra CUDA backends for the non-main devices so sched + // can route ops across them (row-split tensors are dispatched + // by the CUDA backend; ggml-sched still needs all participating + // backends in its list to schedule cross-device copies). + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = cuda_index_of(share_devices[k]); + if (idx == main_dev || idx < 0) continue; + ggml_backend_t b = init_named_backend(share_devices[k]); + if (b != nullptr) { + out_extra_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", + share_devices[k].c_str()); + } + } + out_spec.mode = MultiBackendMode::ROW_SPLIT; + out_spec.tensor_split_ratios = ratios; + out_spec.main_device = main_dev; + out_spec.additional_backends.assign(out_extra_backends.begin(), + out_extra_backends.end()); + out_spec.tensor_backend_fn = nullptr; + out_spec.cpu_fallback = nullptr; + + std::string ratio_str; + for (int i = 0; i < cuda_count; i++) { + if (i > 0) ratio_str += ","; + char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); + ratio_str += buf; + } + LOG_INFO("row-split spec: ratios=[%s] main_device=%d", + ratio_str.c_str(), main_dev); + return true; +#else + (void)share_devices; (void)share_bytes; (void)out_spec; + return false; +#endif + }; + // Build the layer-split MultiBackendSpec for a component. Only used // when auto-fit picked GPU_LAYER_SPLIT for this component. 
// - main_backend: the runner's primary backend (also first in the spec) @@ -750,12 +844,19 @@ class StableDiffusionGGML { MultiBackendSpec dit_spec; bool dit_spec_active = false; if (!fit_dit_split_device_names.empty()) { - dit_spec_active = prepare_layer_split_spec(diffusion_backend, - fit_dit_split_device_names, - fit_dit_split_share_bytes, - "model.diffusion_model.", - fit_dit_extra_backends, - dit_spec); + if (fit_dit_row_split) { + dit_spec_active = prepare_row_split_spec(fit_dit_split_device_names, + fit_dit_split_share_bytes, + fit_dit_extra_backends, + dit_spec); + } else { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } } // Lambda to set the pending spec immediately before constructing the // diffusion model. Caller must invoke this on the same line / right @@ -799,14 +900,21 @@ class StableDiffusionGGML { LOG_INFO("CLIP: using device %s", clip_dev_name); } // Now that clip_backend is resolved, build the conditioner's - // layer-split spec if auto-fit picked it. + // multi-GPU spec if auto-fit picked one (row-split or layer-split). 
if (!fit_cond_split_device_names.empty()) { - cond_spec_active = prepare_layer_split_spec(clip_backend, - fit_cond_split_device_names, - fit_cond_split_share_bytes, - "text_encoders.", // covers text_encoders.llm.* and text_encoders.t5xxl.* - fit_cond_extra_backends, - cond_spec); + if (fit_cond_row_split) { + cond_spec_active = prepare_row_split_spec(fit_cond_split_device_names, + fit_cond_split_share_bytes, + fit_cond_extra_backends, + cond_spec); + } else { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", + fit_cond_extra_backends, + cond_spec); + } } if (sd_version_is_sd3(version)) { prime_cond_spec(); @@ -2701,6 +2809,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; sd_ctx_params->quiet_unknown_tensors = false; } @@ -2749,6 +2858,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "auto_fit_compute_reserve_vae_mb: %d\n" "auto_fit_compute_reserve_cond_mb: %d\n" "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" "quiet_unknown_tensors: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" @@ -2795,6 +2905,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb, sd_ctx_params->auto_fit_compute_reserve_cond_mb, BOOL_STR(sd_ctx_params->auto_multi_gpu), + SAFE_STR(sd_ctx_params->multi_gpu_mode), BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), From 27b1ed3f34aa47784e12c91d85652f3dcc71ba7e Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 1 May 2026 16:37:48 +0200 Subject: [PATCH 8/9] ggml.patch: cuda_split_buffer_type pool allocator cuda_split_buffer_type::init_tensor previously did one raw cudaMalloc per tensor per 
device. Each call returns memory from CUDA's bucketed reuse pool; when
the buffer is freed and a new split buffer is allocated, the driver
doesn't coalesce returned chunks into a contiguous range. With multiple
sequential row-split loads in the same process (e.g. row-split
conditioner -> free -> row-split DiT), the second load OOMs on the
smaller GPU even when the planner's MAX-based peak says memory should
be available.

This patch:

1. Pre-allocates one contiguous cudaMalloc per device sized by the
   tensor_split ratio + a 16 MiB safety margin in alloc_buffer.
2. Bump-allocates from each per-device pool in init_tensor, falling
   back to per-tensor cudaMalloc only for tail tensors whose per-device
   slice exceeds the margin (rounded sizes diverge from the
   ratio-derived bound).
3. Adds an early-return for view tensors in init_tensor (sched
   routinely calls init_tensor on views of split-backed weights).

The patch lives in this file rather than the vendored ggml submodule so
it survives future ggml syncs. Apply from the repository root with:

    git -C ggml apply ../ggml.patch

(the patch sits in the superproject root, so it must be referenced
relative to the ggml/ submodule directory).

Verified with sd.cpp's auto-fit pipeline (3080 + 5060 Ti):

- Heavy quants (Q6_K DiT 17.7 GB + Q8_K_XL LLM 15.6 GB) at 256x256x9/4
  steps with both components row-split: generation 92s. Was OOM'ing on
  DiT load after Conditioner freed.
- Lighter quants (Q5_K_S DiT + IQ4_XS LLM) at 640x480x25/8: DiT
  row-split, Conditioner single-GPU, 131s. Unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) --- ggml.patch | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 ggml.patch diff --git a/ggml.patch b/ggml.patch new file mode 100644 index 000000000..0515013e1 --- /dev/null +++ b/ggml.patch @@ -0,0 +1,184 @@ +diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu +index cc80eb3f..a73ef0de 100644 +--- a/src/ggml-cuda/ggml-cuda.cu ++++ b/src/ggml-cuda/ggml-cuda.cu +@@ -832,6 +832,19 @@ struct ggml_backend_cuda_split_buffer_type_context { + }; + + struct ggml_backend_cuda_split_buffer_context { ++ // Per-device pool: one contiguous cudaMalloc per device, sub-allocated ++ // by init_tensor. Replaces the previous per-tensor cudaMalloc to avoid ++ // bucketed-free fragmentation when multiple split buffers are loaded ++ // and freed sequentially (e.g. row-split conditioner -> row-split DiT). ++ char * pool_base[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_size[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_used[GGML_CUDA_MAX_DEVICES] = {}; ++ // Side-allocations for tensors whose per-device slice didn't fit in the ++ // pool (row-split rounding skews per-device sizes off the planner's ++ // ratio). These do hit the per-tensor cudaMalloc path but only for the ++ // tail few tensors, not all of them. 
++ std::vector pool_overflow_ptrs[GGML_CUDA_MAX_DEVICES]; ++ + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { +@@ -840,12 +853,22 @@ struct ggml_backend_cuda_split_buffer_context { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } +- if (extra->data_device[id] != nullptr) { +- CUDA_CHECK(cudaFree(extra->data_device[id])); +- } ++ // tensor data lives inside per-device pool or pool_overflow_ptrs; freed below + } + delete extra; + } ++ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { ++ if (pool_base[id] == nullptr && pool_overflow_ptrs[id].empty()) { ++ continue; // never touched this device — skip set_device ++ } ++ ggml_cuda_set_device(id); ++ for (char * p : pool_overflow_ptrs[id]) { ++ if (p != nullptr) CUDA_CHECK(cudaFree(p)); ++ } ++ if (pool_base[id] != nullptr) { ++ CUDA_CHECK(cudaFree(pool_base[id])); ++ } ++ } + } + + std::vector tensor_extras; +@@ -865,7 +888,13 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff + } + + static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ++ // Views: storage comes from view_src, so this split buffer has nothing ++ // to allocate for the view. Sched routes any op that consumes the view ++ // through view_src's backend. Mirrors the non-split buffer init's ++ // early-return for views. 
++ if (tensor->view_src != nullptr) { ++ return GGML_STATUS_SUCCESS; ++ } + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; +@@ -876,6 +905,10 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + ctx->tensor_extras.push_back(extra); + ++ // 256-byte alignment is the CUDA default and matches what plain ++ // cudaMalloc returns; matmul kernels assume at least this. ++ constexpr size_t SPLIT_POOL_ALIGN = 256; ++ + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); +@@ -893,11 +926,34 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + +- // FIXME: do not crash if cudaMalloc fails +- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); +- char * buf; +- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ ++ char * buf = nullptr; ++ if (ctx->pool_base[id] != nullptr) { ++ // Pool path: bump-allocate inside the pre-allocated per-device ++ // slab. Avoids the per-tensor cudaMalloc fragmentation that ++ // breaks sequential row-split loads (Cond -> free -> DiT). ++ size_t off = (ctx->pool_used[id] + SPLIT_POOL_ALIGN - 1) & ~(SPLIT_POOL_ALIGN - 1); ++ if (off + size <= ctx->pool_size[id]) { ++ buf = ctx->pool_base[id] + off; ++ ctx->pool_used[id] = off + size; ++ } else { ++ // Pool exhausted (per-device share computation undershoot ++ // because row-split rounding skews per-device sizes away ++ // from tensor_split ratios). 
Fall back to a side cudaMalloc ++ // for this tensor's slice; freed by the per-tensor branch ++ // in the dtor. Most tensors still hit the pool; only the ++ // tail few that don't fit pay the fragmentation cost. ++ CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id)); ++ ctx->pool_overflow_ptrs[id].push_back(buf); ++ } ++ } else { ++ // Fallback for the legacy path (pool alloc failed in alloc_buffer ++ // or some caller bypassed the pool). Per-tensor cudaMalloc. ++ // FIXME: do not crash if cudaMalloc fails ++ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ++ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ } + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { +@@ -1022,12 +1078,64 @@ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + } + + static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point +- // instead, we allocate them for each tensor separately in init_tensor +- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, +- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. ++ // size is the cumulative max across ALL devices and ALL tensors (sum of ++ // get_alloc_size). Pre-allocate one contiguous slab per device sized by ++ // the tensor_split ratio + a small safety margin for per-tensor padding ++ // rounding. init_tensor then bump-allocates inside these slabs. ++ // ++ // Why: per-tensor cudaMalloc fragments the CUDA driver's free-list when ++ // the buffer is freed (driver keeps bucketed reuse pools). When two ++ // split buffers are loaded sequentially (e.g. 
row-split conditioner -> ++ // free -> row-split DiT), the second load OOMs even when the planner's ++ // MAX-based peak says memory should be available. ++ ggml_backend_cuda_split_buffer_type_context * buft_ctx = ++ (ggml_backend_cuda_split_buffer_type_context *)buft->context; + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + ++ const int dev_count = ggml_backend_cuda_get_device_count(); ++ ++ // tensor_split is cumulative offsets in [0, 1]: device i covers ++ // [tensor_split[i], tensor_split[i+1]). Its share of the total is the ++ // delta. The last device gets up to 1.0. ++ bool pool_alloc_ok = true; ++ for (int id = 0; id < dev_count; ++id) { ++ const float lo = buft_ctx->tensor_split[id]; ++ const float hi = (id == dev_count - 1) ? 1.0f : buft_ctx->tensor_split[id + 1]; ++ const float frac = hi - lo; ++ if (frac <= 0.0f) { ++ continue; ++ } ++ // Safety margin: each tensor's per-device slice can pad up to ++ // (MATRIX_ROW_PADDING - 1) elements * row_size bytes. With many ++ // tensors that adds up; size_t(frac * size) plus 16 MiB cushion ++ // covers it for typical row counts. Plus one pool-alignment quantum ++ // per tensor would be tighter but harder to compute upfront. ++ size_t per_dev = size_t((double)frac * (double)size) + size_t(16) * 1024 * 1024; ++ ggml_cuda_set_device(id); ++ cudaError_t err = ggml_cuda_device_malloc((void **)&ctx->pool_base[id], per_dev, id); ++ if (err != cudaSuccess) { ++ GGML_LOG_WARN("%s: split pool alloc failed on device %d (%zu bytes, frac=%.3f); " ++ "falling back to per-tensor cudaMalloc\n", ++ __func__, id, per_dev, frac); ++ ctx->pool_base[id] = nullptr; ++ pool_alloc_ok = false; ++ // Don't bail — release any pools we've already taken so we don't ++ // hold partial pools while running fragmented anyway. 
++ break; ++ } ++ ctx->pool_size[id] = per_dev; ++ } ++ if (!pool_alloc_ok) { ++ for (int id = 0; id < dev_count; ++id) { ++ if (ctx->pool_base[id] != nullptr) { ++ ggml_cuda_set_device(id); ++ CUDA_CHECK(cudaFree(ctx->pool_base[id])); ++ ctx->pool_base[id] = nullptr; ++ ctx->pool_size[id] = 0; ++ } ++ } ++ } ++ + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); + } + From 04fb57f81f7c199755a3319c7feada4cb955a11d Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sat, 2 May 2026 16:05:25 +0200 Subject: [PATCH 9/9] review: address PR #1470 #ifdef and old-flag-alias feedback Two changes from wbruna's review: 1. Replace `#ifdef SD_USE_CUDA` blocks with runtime backend dispatch. - Add `ggml_backend_split_buffer_type(backend, ...)` helper in `ggml_extend_backend.hpp` that looks up `ggml_backend_split_buffer_type` via `reg_get_proc_address`. Both CUDA and SYCL publish this proc, so row-split is no longer compile-time gated to CUDA. - Drop `#include "ggml-cuda.h"` and the `#ifdef SD_USE_CUDA` blocks in `ggml_extend.hpp` (constructor + `alloc_params_buffer_row_split`). - In `stable-diffusion.cpp::prepare_row_split_spec`, derive the backend registry from the device-name prefix (CUDA0 -> reg "CUDA", SYCL1 -> reg "SYCL") instead of calling `ggml_backend_cuda_get_device_count`. - Drop `add_definitions(-DSD_USE_CUDA)` from CMakeLists.txt; the macro is no longer referenced. 2. Restore the removed CPU-placement flags as soft-deprecated aliases (matching the existing `--qwen2vl` / `--qwen2vl_vision` deprecation pattern). 
Each alias sets the new `--*-backend-device` to "CPU" and disables auto-fit so the placement is honored verbatim: - `--clip-on-cpu` -> `--clip-backend-device CPU` - `--vae-on-cpu` -> `--vae-backend-device CPU` - `--control-net-cpu` -> `--control-net-backend-device CPU` Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 1 - examples/common/common.cpp | 28 +++++++++++++++++++++ src/ggml_extend.hpp | 31 +++++++++-------------- src/ggml_extend_backend.hpp | 18 ++++++++++++++ src/stable-diffusion.cpp | 49 ++++++++++++++++++++++--------------- 5 files changed, 87 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32375b163..48ce456ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,6 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF if(SD_CUDA) message("-- Use CUDA as backend stable-diffusion") set(GGML_CUDA ON) - add_definitions(-DSD_USE_CUDA) endif() if(SD_METAL) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 36f1c6a86..792b580c5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -645,6 +645,34 @@ ArgOptions SDContextParams::get_options() { std::exit(0); return 0; }}, + // Soft-deprecated aliases for the old per-component CPU-placement + // toggles. They map onto the new --*-backend-device strings and also + // disable auto-fit so the placement is honored verbatim (matching + // the pre-auto-fit behavior these flags expressed). + {"", + "--clip-on-cpu", + "alias of --clip-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + clip_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--vae-on-cpu", + "alias of --vae-backend-device CPU (also disables --auto-fit). 
Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + vae_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--control-net-cpu", + "alias of --control-net-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + control_net_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, }; return options; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index de2bbcd2b..ea8a28812 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -25,9 +25,6 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif #include "ggml_extend_backend.hpp" #include "model.h" @@ -2158,26 +2155,21 @@ struct GGMLRunner { cpu_fallback_backend = pending->cpu_fallback; row_split_ratios = pending->tensor_split_ratios; row_main_device = pending->main_device; -#ifdef SD_USE_CUDA if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { - row_split_buft = ggml_backend_cuda_split_buffer_type( + row_split_buft = ggml_backend_split_buffer_type( + runtime_backend, row_main_device, row_split_ratios.empty() ? 
nullptr : row_split_ratios.data()); if (row_split_buft == nullptr) { - LOG_WARN("multi-backend: cuda split buft init failed; " - "falling back to single-backend mode"); + LOG_WARN("multi-backend: row-split buft init failed " + "(backend does not publish " + "ggml_backend_split_buffer_type); falling back " + "to single-backend mode"); multi_backend_mode = false; additional_backends.clear(); cpu_fallback_backend = nullptr; } } -#else - if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { - LOG_WARN("multi-backend: row-split requires CUDA; " - "falling back to single-backend mode"); - multi_backend_mode = false; - } -#endif if (multi_backend_mode && offload_params_to_cpu) { LOG_WARN("multi-backend split is incompatible with " "offload_params_to_cpu; ignoring offload"); @@ -2377,7 +2369,12 @@ struct GGMLRunner { } bool alloc_params_buffer_row_split() { -#ifdef SD_USE_CUDA + if (row_split_buft == nullptr) { + LOG_ERROR("alloc_params_buffer_row_split: row-split buft not " + "initialized (backend lacks " + "ggml_backend_split_buffer_type)"); + return false; + } ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); const size_t main_align = ggml_backend_buft_get_alignment(main_buft); const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); @@ -2442,10 +2439,6 @@ struct GGMLRunner { main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count); return true; -#else - LOG_ERROR("alloc_params_buffer_row_split called without CUDA"); - return false; -#endif } // Internal: always materializes the params buffer. 
Used by both the diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index 50158c883..6d60a73ec 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -121,6 +121,24 @@ __STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backen } } +// Runtime lookup of a backend's row-split buffer type (currently published by +// the CUDA and SYCL backends as `ggml_backend_split_buffer_type` in their +// reg_get_proc_address tables). Returns nullptr when the backend does not +// support row-split, leaving the caller to fall back to a non-split path. +using __ggml_backend_split_buffer_type_t = ggml_backend_buffer_type_t (*)(int main_device, const float* tensor_split); + +__STATIC_INLINE__ ggml_backend_buffer_type_t ggml_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend); + if (reg == nullptr) { + return nullptr; + } + auto fn = reinterpret_cast<__ggml_backend_split_buffer_type_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type")); + if (fn == nullptr) { + return nullptr; + } + return fn(main_device, tensor_split); +} + __STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { if (tensor == nullptr) { return nullptr; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 682b2347d..c389c6242 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -614,25 +614,38 @@ class StableDiffusionGGML { const std::vector& share_bytes, std::vector& out_extra_backends, MultiBackendSpec& out_spec) -> bool { -#ifdef SD_USE_CUDA - const int cuda_count = ggml_backend_cuda_get_device_count(); - if (cuda_count <= 0 || share_devices.size() < 2) return false; - - // Map device names like "CUDA0" -> 0, "CUDA1" -> 1, ... 
- auto cuda_index_of = [](const std::string& name) -> int { - if (name.rfind("CUDA", 0) != 0) return -1; - try { return std::stoi(name.substr(4)); } catch (...) { return -1; } + if (share_devices.size() < 2) return false; + + // Derive the backend registry from the device-name prefix (e.g. + // "CUDA0" -> reg "CUDA", "SYCL1" -> reg "SYCL"). This keeps the + // code backend-agnostic: any backend whose registry publishes + // `ggml_backend_split_buffer_type` via reg_get_proc_address can + // drive row-split, not just CUDA. + auto reg_prefix_of = [](const std::string& name) -> std::string { + size_t i = 0; + while (i < name.size() && (std::isalpha((unsigned char)name[i]) || name[i] == '_')) i++; + return name.substr(0, i); + }; + const std::string reg_name = reg_prefix_of(share_devices[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) return false; + const int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) return false; + + auto reg_index_of = [&](const std::string& name) -> int { + if (name.rfind(reg_name, 0) != 0) return -1; + try { return std::stoi(name.substr(reg_name.size())); } catch (...) 
{ return -1; } }; - std::vector ratios(cuda_count, 0.0f); + std::vector ratios(dev_count, 0.0f); int64_t total = 0; for (auto b : share_bytes) total += b; if (total <= 0) return false; int main_dev = -1; int64_t max_share = -1; for (size_t k = 0; k < share_devices.size(); k++) { - int idx = cuda_index_of(share_devices[k]); - if (idx < 0 || idx >= cuda_count) continue; + int idx = reg_index_of(share_devices[k]); + if (idx < 0 || idx >= dev_count) continue; ratios[idx] = float(double(share_bytes[k]) / double(total)); if (share_bytes[k] > max_share) { max_share = share_bytes[k]; @@ -641,12 +654,12 @@ class StableDiffusionGGML { } if (main_dev < 0) return false; - // Init extra CUDA backends for the non-main devices so sched - // can route ops across them (row-split tensors are dispatched - // by the CUDA backend; ggml-sched still needs all participating + // Init extra backends for the non-main devices so sched can + // route ops across them (row-split tensors are dispatched by the + // primary backend; ggml-sched still needs all participating // backends in its list to schedule cross-device copies). for (size_t k = 0; k < share_devices.size(); k++) { - int idx = cuda_index_of(share_devices[k]); + int idx = reg_index_of(share_devices[k]); if (idx == main_dev || idx < 0) continue; ggml_backend_t b = init_named_backend(share_devices[k]); if (b != nullptr) { @@ -665,7 +678,7 @@ class StableDiffusionGGML { out_spec.cpu_fallback = nullptr; std::string ratio_str; - for (int i = 0; i < cuda_count; i++) { + for (int i = 0; i < dev_count; i++) { if (i > 0) ratio_str += ","; char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); ratio_str += buf; @@ -673,10 +686,6 @@ class StableDiffusionGGML { LOG_INFO("row-split spec: ratios=[%s] main_device=%d", ratio_str.c_str(), main_dev); return true; -#else - (void)share_devices; (void)share_bytes; (void)out_spec; - return false; -#endif }; // Build the layer-split MultiBackendSpec for a component. Only used