diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 1a5399b82..792b580c5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -380,6 +380,54 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "ggml device name to use as the main backend (see --list-devices). " + "When unset, the first GPU device is used.", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "ggml device name for the diffusion / flow model. " + "Falls back to --main-backend-device.", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "ggml device name for the text encoders. " + "Falls back to --main-backend-device.", + &clip_backend_device}, + {"", + "--vae-backend-device", + "ggml device name for the VAE. Falls back to --main-backend-device.", + &vae_backend_device}, + {"", + "--control-net-backend-device", + "ggml device name for the ControlNet. " + "Falls back to --main-backend-device.", + &control_net_backend_device}, + {"", + "--tae-backend-device", + "ggml device name for the TAE (currently routed through main).", + &tae_backend_device}, + {"", + "--upscaler-backend-device", + "ggml device name for the upscaler (currently routed through main).", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "ggml device name for PhotoMaker (currently routed through main).", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "ggml device name for the vision model (currently routed through main).", + &vision_backend_device}, + {"", + "--multi-gpu-mode", + "auto-fit multi-GPU split mechanism: 'row' (default; CUDA-only " + "row-split via cuda_split_buffer_type, single backend, smaller " + "compute buffer), 'layer' (block-indexed tensors split across " + "per-block backends + sched, generic but ~2x activation cost at " + "boundaries), or 'off' (never split a single component)", + &multi_gpu_mode}, }; options.int_options = { @@ -392,6 +440,23 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, + {"", + "--fit-compute-reserve-dit", + "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " + "(0 keeps the built-in default)", + &auto_fit_compute_reserve_dit_mb}, + {"", + "--fit-compute-reserve-vae", + "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", + &auto_fit_compute_reserve_vae_mb}, + {"", + "--fit-compute-reserve-cond", + "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", + &auto_fit_compute_reserve_cond_mb}, }; options.float_options = {}; @@ -409,18 +474,6 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -461,6 +514,30 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit 
and use the explicit *-backend-device flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + "--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, + {"", + "--quiet-unknown-tensors", + "suppress per-tensor 'unknown tensor X in model file' log lines " + "(useful for LTX-2 and similar models that ship many unused " + "tensors); a single summary line with the count is logged instead", + true, &quiet_unknown_tensors}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -559,6 +636,43 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, + // Soft-deprecated aliases for the old per-component CPU-placement + // toggles. They map onto the new --*-backend-device strings and also + // disable auto-fit so the placement is honored verbatim (matching + // the pre-auto-fit behavior these flags expressed). + {"", + "--clip-on-cpu", + "alias of --clip-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + clip_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--vae-on-cpu", + "alias of --vae-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + vae_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--control-net-cpu", + "alias of --control-net-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + control_net_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, }; return options; @@ -671,9 +785,21 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " main_backend_device: \"" << main_backend_device << "\",\n" + << " diffusion_backend_device: \"" << diffusion_backend_device << "\",\n" + << " clip_backend_device: \"" << clip_backend_device << "\",\n" + << " vae_backend_device: \"" << vae_backend_device << "\",\n" + << " control_net_backend_device: \"" << control_net_backend_device << "\",\n" + << " tae_backend_device: \"" << tae_backend_device << "\",\n" + << " upscaler_backend_device: \"" << upscaler_backend_device << "\",\n" + << " photomaker_backend_device: \"" << photomaker_backend_device << "\",\n" + << " vision_backend_device: \"" << vision_backend_device << "\",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? 
"true" : "false") << ",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" + << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -729,9 +855,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, + main_backend_device.empty() ? nullptr : main_backend_device.c_str(), + diffusion_backend_device.empty() ? nullptr : diffusion_backend_device.c_str(), + clip_backend_device.empty() ? nullptr : clip_backend_device.c_str(), + vae_backend_device.empty() ? nullptr : vae_backend_device.c_str(), + control_net_backend_device.empty() ? nullptr : control_net_backend_device.c_str(), + tae_backend_device.empty() ? nullptr : tae_backend_device.c_str(), + upscaler_backend_device.empty() ? nullptr : upscaler_backend_device.c_str(), + photomaker_backend_device.empty() ? nullptr : photomaker_backend_device.c_str(), + vision_backend_device.empty() ? nullptr : vision_backend_device.c_str(), flash_attn, diffusion_flash_attn, taesd_preview, @@ -744,6 +876,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, + auto_fit, + auto_fit_target_mb, + auto_fit_dry_run, + auto_fit_compute_reserve_dit_mb, + auto_fit_compute_reserve_vae_mb, + auto_fit_compute_reserve_cond_mb, + auto_multi_gpu, + multi_gpu_mode.empty() ? nullptr : multi_gpu_mode.c_str(), + quiet_unknown_tensors, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index c4498c352..1df32f9c0 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -110,9 +110,15 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; + std::string main_backend_device; + std::string diffusion_backend_device; + std::string clip_backend_device; + std::string vae_backend_device; + std::string control_net_backend_device; + std::string tae_backend_device; + std::string upscaler_backend_device; + std::string photomaker_backend_device; + std::string vision_backend_device; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -128,6 +134,23 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. + bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + int auto_fit_compute_reserve_dit_mb = 0; + int auto_fit_compute_reserve_vae_mb = 0; + int auto_fit_compute_reserve_cond_mb = 0; + bool auto_multi_gpu = true; + // "row" (default), "layer", or "off". Selects the multi-GPU split + // mechanism the auto-fit planner is allowed to emit. + std::string multi_gpu_mode = "row"; + + // When set, the model loader skips per-tensor "unknown tensor" log + // lines and instead emits a single summary count at the end of load. 
+ bool quiet_unknown_tensors = false; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/ggml.patch b/ggml.patch new file mode 100644 index 000000000..0515013e1 --- /dev/null +++ b/ggml.patch @@ -0,0 +1,184 @@ +diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu +index cc80eb3f..a73ef0de 100644 +--- a/src/ggml-cuda/ggml-cuda.cu ++++ b/src/ggml-cuda/ggml-cuda.cu +@@ -832,6 +832,19 @@ struct ggml_backend_cuda_split_buffer_type_context { + }; + + struct ggml_backend_cuda_split_buffer_context { ++ // Per-device pool: one contiguous cudaMalloc per device, sub-allocated ++ // by init_tensor. Replaces the previous per-tensor cudaMalloc to avoid ++ // bucketed-free fragmentation when multiple split buffers are loaded ++ // and freed sequentially (e.g. row-split conditioner -> row-split DiT). ++ char * pool_base[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_size[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_used[GGML_CUDA_MAX_DEVICES] = {}; ++ // Side-allocations for tensors whose per-device slice didn't fit in the ++ // pool (row-split rounding skews per-device sizes off the planner's ++ // ratio). These do hit the per-tensor cudaMalloc path but only for the ++ // tail few tensors, not all of them. ++ std::vector pool_overflow_ptrs[GGML_CUDA_MAX_DEVICES]; ++ + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { +@@ -840,12 +853,22 @@ struct ggml_backend_cuda_split_buffer_context { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } +- if (extra->data_device[id] != nullptr) { +- CUDA_CHECK(cudaFree(extra->data_device[id])); +- } ++ // tensor data lives inside per-device pool or pool_overflow_ptrs; freed below + } + delete extra; + } ++ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { ++ if (pool_base[id] == nullptr && pool_overflow_ptrs[id].empty()) { ++ continue; // never touched this device — skip set_device ++ } ++ ggml_cuda_set_device(id); ++ for (char * p : pool_overflow_ptrs[id]) { ++ if (p != nullptr) CUDA_CHECK(cudaFree(p)); ++ } ++ if (pool_base[id] != nullptr) { ++ CUDA_CHECK(cudaFree(pool_base[id])); ++ } ++ } + } + + std::vector tensor_extras; +@@ -865,7 +888,13 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff + } + + static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ++ // Views: storage comes from view_src, so this split buffer has nothing ++ // to allocate for the view. Sched routes any op that consumes the view ++ // through view_src's backend. Mirrors the non-split buffer init's ++ // early-return for views. ++ if (tensor->view_src != nullptr) { ++ return GGML_STATUS_SUCCESS; ++ } + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; +@@ -876,6 +905,10 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + ctx->tensor_extras.push_back(extra); + ++ // 256-byte alignment is the CUDA default and matches what plain ++ // cudaMalloc returns; matmul kernels assume at least this. 
++ constexpr size_t SPLIT_POOL_ALIGN = 256; ++ + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); +@@ -893,11 +926,34 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + +- // FIXME: do not crash if cudaMalloc fails +- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); +- char * buf; +- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ ++ char * buf = nullptr; ++ if (ctx->pool_base[id] != nullptr) { ++ // Pool path: bump-allocate inside the pre-allocated per-device ++ // slab. Avoids the per-tensor cudaMalloc fragmentation that ++ // breaks sequential row-split loads (Cond -> free -> DiT). ++ size_t off = (ctx->pool_used[id] + SPLIT_POOL_ALIGN - 1) & ~(SPLIT_POOL_ALIGN - 1); ++ if (off + size <= ctx->pool_size[id]) { ++ buf = ctx->pool_base[id] + off; ++ ctx->pool_used[id] = off + size; ++ } else { ++ // Pool exhausted (per-device share computation undershoot ++ // because row-split rounding skews per-device sizes away ++ // from tensor_split ratios). Fall back to a side cudaMalloc ++ // for this tensor's slice; freed by the per-tensor branch ++ // in the dtor. Most tensors still hit the pool; only the ++ // tail few that don't fit pay the fragmentation cost. ++ CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id)); ++ ctx->pool_overflow_ptrs[id].push_back(buf); ++ } ++ } else { ++ // Fallback for the legacy path (pool alloc failed in alloc_buffer ++ // or some caller bypassed the pool). Per-tensor cudaMalloc. ++ // FIXME: do not crash if cudaMalloc fails ++ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ++ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ } + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { +@@ -1022,12 +1078,64 @@ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + } + + static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point +- // instead, we allocate them for each tensor separately in init_tensor +- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, +- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. ++ // size is the cumulative max across ALL devices and ALL tensors (sum of ++ // get_alloc_size). Pre-allocate one contiguous slab per device sized by ++ // the tensor_split ratio + a small safety margin for per-tensor padding ++ // rounding. init_tensor then bump-allocates inside these slabs. ++ // ++ // Why: per-tensor cudaMalloc fragments the CUDA driver's free-list when ++ // the buffer is freed (driver keeps bucketed reuse pools). When two ++ // split buffers are loaded sequentially (e.g. row-split conditioner -> ++ // free -> row-split DiT), the second load OOMs even when the planner's ++ // MAX-based peak says memory should be available. 
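The idea is easier to see stripped of the CUDA plumbing: one up-front slab per device that later requests bump-allocate from, and a single free that releases everything at once. A minimal sketch, with std::malloc standing in for ggml_cuda_device_malloc and the overflow fallback reduced to returning nullptr:

```cpp
#include <cstddef>
#include <cstdlib>

// Minimal sketch of the per-device slab + bump-allocation pattern: one big
// allocation up front, sub-allocations are just an aligned offset bump, and
// teardown is a single free instead of one free per tensor.
struct BumpPool {
    char*  base = nullptr;
    size_t size = 0;
    size_t used = 0;

    bool init(size_t bytes) {
        base = static_cast<char*>(std::malloc(bytes));  // stand-in for the device malloc
        size = bytes;
        used = 0;
        return base != nullptr;
    }
    void* alloc(size_t bytes, size_t align) {
        size_t off = (used + align - 1) & ~(align - 1);  // round up to alignment
        if (off + bytes > size) {
            return nullptr;  // exhausted: caller falls back to an individual allocation
        }
        used = off + bytes;
        return base + off;
    }
    void release() {
        std::free(base);  // everything goes back in one piece, no bucketed free-list
        base = nullptr;
        size = used = 0;
    }
};
```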
++ ggml_backend_cuda_split_buffer_type_context * buft_ctx = ++ (ggml_backend_cuda_split_buffer_type_context *)buft->context; + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + ++ const int dev_count = ggml_backend_cuda_get_device_count(); ++ ++ // tensor_split is cumulative offsets in [0, 1]: device i covers ++ // [tensor_split[i], tensor_split[i+1]). Its share of the total is the ++ // delta. The last device gets up to 1.0. ++ bool pool_alloc_ok = true; ++ for (int id = 0; id < dev_count; ++id) { ++ const float lo = buft_ctx->tensor_split[id]; ++ const float hi = (id == dev_count - 1) ? 1.0f : buft_ctx->tensor_split[id + 1]; ++ const float frac = hi - lo; ++ if (frac <= 0.0f) { ++ continue; ++ } ++ // Safety margin: each tensor's per-device slice can pad up to ++ // (MATRIX_ROW_PADDING - 1) elements * row_size bytes. With many ++ // tensors that adds up; size_t(frac * size) plus 16 MiB cushion ++ // covers it for typical row counts. Plus one pool-alignment quantum ++ // per tensor would be tighter but harder to compute upfront. ++ size_t per_dev = size_t((double)frac * (double)size) + size_t(16) * 1024 * 1024; ++ ggml_cuda_set_device(id); ++ cudaError_t err = ggml_cuda_device_malloc((void **)&ctx->pool_base[id], per_dev, id); ++ if (err != cudaSuccess) { ++ GGML_LOG_WARN("%s: split pool alloc failed on device %d (%zu bytes, frac=%.3f); " ++ "falling back to per-tensor cudaMalloc\n", ++ __func__, id, per_dev, frac); ++ ctx->pool_base[id] = nullptr; ++ pool_alloc_ok = false; ++ // Don't bail — release any pools we've already taken so we don't ++ // hold partial pools while running fragmented anyway. ++ break; ++ } ++ ctx->pool_size[id] = per_dev; ++ } ++ if (!pool_alloc_ok) { ++ for (int id = 0; id < dev_count; ++id) { ++ if (ctx->pool_base[id] != nullptr) { ++ ggml_cuda_set_device(id); ++ CUDA_CHECK(cudaFree(ctx->pool_base[id])); ++ ctx->pool_base[id] = nullptr; ++ ctx->pool_size[id] = 0; ++ } ++ } ++ } ++ + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); + } + diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 75027f8f8..7da5324b4 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -188,9 +188,18 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; + // Per-component backend device names (ggml device names). Empty / NULL + // means "use the main backend device". The strings are only borrowed for + // the duration of the init call. See sd_list_devices() for what to pass. + const char* main_backend_device; + const char* diffusion_backend_device; + const char* clip_backend_device; + const char* vae_backend_device; + const char* control_net_backend_device; + const char* tae_backend_device; + const char* upscaler_backend_device; + const char* photomaker_backend_device; + const char* vision_backend_device; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -203,6 +212,49 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), the *_backend_device strings are + // ignored and the plan is computed automatically. + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). 
+ // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the + // per-component compute-buffer reserve; 0 means use the built-in default. + bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + int auto_fit_compute_reserve_dit_mb; + int auto_fit_compute_reserve_vae_mb; + int auto_fit_compute_reserve_cond_mb; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. Set false to keep all components on a single GPU when + // they fit. Defaults to true. + bool auto_multi_gpu; + + // When auto_multi_gpu is true and a single component doesn't fit on + // one GPU, the planner can split it across multiple GPUs. Two + // mechanisms: + // "row": matmul weights row-split across CUDA devices via + // cuda_split_buffer_type. Single CUDA backend; no sched. + // Cheaper compute buffer (no cross-backend doubling) but + // CUDA-only. Default. + // "layer": block-indexed tensors assigned to per-block backends + // and routed via ggml_backend_sched. Generic across + // backends but costs ~2x activation memory at boundaries. + // "off": never split a single component across GPUs. Components + // that don't fit fall back to OFFLOAD or CPU. + // The string is parsed by backend_fit::str_to_multi_gpu_mode; if + // unrecognized, "row" is used. + const char* multi_gpu_mode; + + // Suppress per-tensor "unknown tensor 'X' in model file" log lines + // emitted during model loading. Useful for models like LTX-2 that + // ship hundreds of audio-branch / encoder tensors a video-only + // pipeline doesn't consume. A single summary line is logged at the + // end with the count of skipped tensors. + bool quiet_unknown_tensors; } sd_ctx_params_t; typedef struct { @@ -449,6 +501,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used +// directly as the value of --*-backend-device flags. +SD_API void sd_list_devices(void); + #ifdef __cplusplus } #endif diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..b95632750 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,652 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is treated as a single atomic unit that lives entirely on +// one device (plus its compute buffer on the same device). There is no +// intra-tensor row split: cross-device parallelism comes from placing +// different components on different GPUs, not from splitting individual +// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER +// at the component granularity. +// +// Placement priority: DiT + compute buffer -> VAE -> Conditioner. +// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). 
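Before the implementation, a hedged usage sketch of the planner this header defines: the device and component sizes below are fabricated (the real path fills them from enumerate_gpu_devices() and estimate_components()), but the types and calls are the ones declared further down.

```cpp
#include "backend_fit.hpp"  // assumes the project's include paths

inline void example_dry_run() {
    using namespace backend_fit;

    Device d0, d1;  // fabricated: a 24 GiB and a 12 GiB GPU
    d0.id = 0; d0.name = "CUDA0"; d0.description = "fake 24 GiB GPU";
    d0.free_bytes = 22000 * MiB; d0.total_bytes = 24576 * MiB;
    d1.id = 1; d1.name = "CUDA1"; d1.description = "fake 12 GiB GPU";
    d1.free_bytes = 11000 * MiB; d1.total_bytes = 12288 * MiB;
    std::vector<Device> devices = {d0, d1};

    Component dit, vae, cond;  // fabricated sizes: params + compute reserve
    dit.kind  = ComponentKind::DIT;         dit.name  = "DiT";
    dit.params_bytes  = 26000 * MiB; dit.compute_bytes  = 2048 * MiB; dit.supports_offload  = true;
    vae.kind  = ComponentKind::VAE;         vae.name  = "VAE";
    vae.params_bytes  = 1600 * MiB;  vae.compute_bytes  = 1024 * MiB;
    cond.kind = ComponentKind::CONDITIONER; cond.name = "Conditioner";
    cond.params_bytes = 9000 * MiB;  cond.compute_bytes = 512 * MiB;  cond.supports_offload = true;
    std::vector<Component> components = {dit, vae, cond};

    const int64_t margin = 512 * MiB;  // corresponds to --fit-target
    Plan plan = compute_plan(components, devices, margin,
                             /*allow_multi_gpu=*/true, MultiGpuMode::ROW);
    print_plan(plan, components, devices, margin);
    // With these numbers the DiT does not fit on either GPU alone, so a
    // row-split decision is the expected outcome in ROW mode.
}
```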
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model.h" +#include "util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) +}; + +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; + + // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs + // that share this component (in order) and each device's estimated share + // of the params. The order also defines block-range partitioning: the + // i-th device gets a contiguous range of blocks proportional to share[i]. + std::vector split_device_ids; + std::vector split_share_bytes; +}; + +struct Plan { + std::vector decisions; + std::map device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +enum class MultiGpuMode { + OFF, // never split a single component across GPUs + ROW, // CUDA-only: row-split matmul weights via cuda_split_buffer_type + LAYER, // generic: assign block-indexed tensors to per-block backends + sched +}; + +inline const char* multi_gpu_mode_str(MultiGpuMode m) { + switch (m) { + case MultiGpuMode::OFF: return "off"; + case MultiGpuMode::ROW: return "row"; + case MultiGpuMode::LAYER: return "layer"; + } + return "?"; +} + +inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) { + if (s == "off") return MultiGpuMode::OFF; + if (s == "row") return MultiGpuMode::ROW; + if (s == "layer") return MultiGpuMode::LAYER; + return MultiGpuMode::ROW; // default +} + +// --- Classification ------------------------------------------------------- + +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0 || + // Connector / text projection layers that run on the conditioner + // backend (e.g. LTX-2's text_embedding_projection: video/audio + // aggregate embeds + projection that map LLM hidden states into + // DiT-input space). 
+ name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { + out = ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +inline std::vector estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +inline std::vector enumerate_gpu_devices() { + std::vector out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- Core algorithm ------------------------------------------------------- + +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs) { + std::vector out(gpu_idxs.size(), 0); + int64_t total_free = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + total_free += std::max(0, devices[gpu_idxs[k]].free_bytes); + } + if (total_free <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(std::max(0, devices[gpu_idxs[k]].free_bytes)) / double(total_free); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + +// Peak per device = MAX of any single component's footprint on that device, +// because free_params_immediately frees params between phases so components +// time-share VRAM. 
+inline int64_t gpu_peak(int gpu_idx, + const std::vector& pl, + const std::vector& dev, + const std::vector& components, + const std::vector& devices = {}) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; + footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + /*compute_bytes=*/0, + devices, gpu_idxs); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. + const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. + int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector& components, + const std::vector& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } + + std::vector cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + // Layer-split is only meaningful for components made up of many similarly + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. + auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + + auto build_options = [&](const Component& c) { + std::vector opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + // Multi-GPU split: one option type per mode. Encoded as a bitmask + // of participating GPUs in device_idx. + if (mode == MultiGpuMode::ROW && nG >= 2 && supports_layer_split(c.kind)) { + // Row-split spans all GPUs; single option with all bits set. 
+ int all_mask = (1 << nG) - 1; + opts.push_back({Placement::GPU_TENSOR_SPLIT, all_mask}); + } + if (mode == MultiGpuMode::LAYER && nG >= 2 && supports_layer_split(c.kind)) { + // Layer-split: enumerate non-trivial subsets (size >= 2). + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (__builtin_popcount(mask) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } + opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector& pl, const std::vector& dev) { + int64_t s = 0; + std::set gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if (pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. + s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). + s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector idx(nC, 0); + std::vector best_pl; + std::vector best_dev; + int64_t best_score = std::numeric_limits::min(); + bool found_any = false; + + while (true) { + std::vector pl(nC); + std::vector dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. 
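For intuition on the preference ordering the score lambda above encodes (the placement multipliers and the DiT priority weight of 300 are the ones used there; the rest is illustrative): a DiT that fits whole on one GPU always beats any split of the same DiT, and any GPU placement beats CPU by a wide margin.

```cpp
#include <cstdio>

// The per-placement multipliers from the score lambda above, applied to a
// single component in isolation (DiT priority weight 300).
enum class P { GPU, ROW_SPLIT, LAYER_SPLIT, OFFLOAD, CPU };

static long score_one(P placement, int priority_weight) {
    switch (placement) {
        case P::GPU:         return  10 * priority_weight;
        case P::ROW_SPLIT:   return   8 * priority_weight;
        case P::LAYER_SPLIT: return   7 * priority_weight;
        case P::OFFLOAD:     return   5 * priority_weight;
        case P::CPU:         return -10 * priority_weight;
    }
    return 0;
}

int main() {
    const int dit_weight = 300;
    std::printf("GPU %ld, row %ld, layer %ld, offload %ld, CPU %ld\n",
                score_one(P::GPU, dit_weight), score_one(P::ROW_SPLIT, dit_weight),
                score_one(P::LAYER_SPLIT, dit_weight), score_one(P::OFFLOAD, dit_weight),
                score_one(P::CPU, dit_weight));
    // -> GPU 3000, row 2400, layer 2100, offload 1500, CPU -3000
}
```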
+ if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, /*compute_bytes=*/0, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (gets the row-split's compute + // buffer + sub-runners that don't get their own spec). This + // matches the user's preference: always use the bigger GPU + // as main for splits. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + int64_t share = shares[k]; + if (pos == 0) share += c.compute_bytes; // main (= biggest) gets compute + d.split_share_bytes.push_back(share); + max_share = std::max(max_share, share); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. 
+ std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); + } + return plan; +} + +inline const char* placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector& components, + const std::vector& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { + std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? 
"row" : "layer"; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if (d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 9f4d45524..99e27ae39 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -87,6 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} + // Defer the LLM sub-runner's params alloc + read until first compute(). + // Only conditioners with a heavy LLM (e.g. LTX-2 Gemma) override this; + // others ignore the call. The callback is invoked AFTER the runner's + // alloc_params_buffer succeeds and is responsible for tensor data load. + virtual void set_llm_lazy_load(std::function /*fn*/) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index c0a2a11c0..d7ea6ede7 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -50,6 +50,10 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; + // Defer params alloc + tensor data load until the first compute() call. + // Default: no-op. Subclasses backed by a single GGMLRunner forward to + // its set_lazy_load. + virtual void set_lazy_load(std::function /*fn*/) {} }; struct UNetModel : public DiffusionModel { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 8b748194f..ea8a28812 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1705,6 +1705,55 @@ struct GGMLRunnerContext { std::shared_ptr weight_adapter = nullptr; }; +// --------------------------------------------------------------------------- +// Multi-backend (layer-split) support +// --------------------------------------------------------------------------- +// A GGMLRunner can opt into "layer-split" mode where each weight tensor lives +// entirely on one of several backends, picked by a caller-supplied callback +// (typically based on the tensor name's block index). The runner switches +// from gallocr to ggml_backend_sched for graph compute, so cross-backend +// edges are routed automatically. 
+// +// This is the llama.cpp LLAMA_SPLIT_MODE_LAYER analogue. There is no +// intra-tensor row split, so every tensor lives on a single normal device +// buffer — views work without any ggml-cuda patch. +// +// To enable: populate g_pending_multi_backend_spec() with the additional +// backends + tensor->backend callback, then construct the GGMLRunner. The +// ctor consumes and clears the pending pointer. +enum class MultiBackendMode { + LAYER_SPLIT, // assign block-indexed tensors to per-block backends + sched + ROW_SPLIT, // CUDA split-buft: matmul weights row-split across devices +}; + +struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + + // Extra backends *in addition to* the runner's main runtime_backend. + // The first entry's role is the main backend; we don't list it here. + std::vector additional_backends; + + // LAYER_SPLIT: maps a weight tensor name to one of the runner's + // backends (the main runtime_backend, or one of additional_backends). + // Returning nullptr means "use the main runtime_backend". + std::function tensor_backend_fn; + + // ROW_SPLIT (CUDA-only): per-device row split ratios (length = total + // CUDA device count) and main device. Empty means use CUDA's default + // free-VRAM proportions. + std::vector tensor_split_ratios; + int main_device = 0; + + // Optional CPU backend appended last to the sched for unsupported-op + // fallback. May be nullptr. + ggml_backend_t cpu_fallback = nullptr; +}; + +__STATIC_INLINE__ MultiBackendSpec*& g_pending_multi_backend_spec() { + thread_local MultiBackendSpec* spec = nullptr; + return spec; +} + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1712,6 +1761,33 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; + // --- multi-backend state (layer-split via sched OR row-split via cuda_split_buft) --- + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; + std::vector additional_backends; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; + bool sched_reserved = false; + // Per-backend params buffers when LAYER_SPLIT is active. ROW_SPLIT uses + // a CUDA split-buft buffer + a regular buffer for non-split tensors, + // stored in row_split_buffer + row_main_buffer instead. + std::vector multi_params_buffers; + // ROW_SPLIT-only state. + std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; + ggml_backend_buffer_t row_main_buffer = nullptr; + + // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual + // alloc + tensor-data load is deferred until the first compute(). The + // callback is invoked AFTER do_alloc_params_buffer succeeds and is + // responsible for populating tensor->data via ModelLoader. Used to keep + // peak VRAM per-component-MAX rather than sum-of-components at init. + std::function lazy_load_fn = nullptr; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; ggml_context* offload_ctx = nullptr; @@ -1859,7 +1935,56 @@ struct GGMLRunner { return gf; } + // Build the multi-backend sched (lazily). 
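On the caller side, the handshake described above (fill a MultiBackendSpec, publish it through g_pending_multi_backend_spec(), then construct the runner, whose constructor consumes and clears the pointer) looks roughly like this — a hedged sketch in which parse_block_index() is a placeholder for whatever name-based routing the caller implements:

```cpp
// Hedged caller-side sketch; relies on the MultiBackendSpec declarations above.
int parse_block_index(const std::string& tensor_name);  // placeholder, defined elsewhere

void publish_layer_split_spec(MultiBackendSpec& spec, ggml_backend_t second_backend) {
    spec.mode                = MultiBackendMode::LAYER_SPLIT;
    spec.additional_backends = {second_backend};  // the main backend is implicit
    spec.tensor_backend_fn   = [second_backend](const std::string& name) -> ggml_backend_t {
        // Route the upper blocks to the second GPU; returning nullptr keeps a
        // tensor on the runner's main runtime_backend. The block threshold is
        // illustrative only.
        return parse_block_index(name) >= 20 ? second_backend : nullptr;
    };
    g_pending_multi_backend_spec() = &spec;
    // The next GGMLRunner constructed on this thread consumes the spec (and
    // clears the pointer), so construct the runner right after this call and
    // keep `spec` alive until its constructor returns.
}
```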
+ bool ensure_sched() { + if (sched != nullptr) return true; + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + // ggml_backend_sched_new asserts the last backend is a CPU; create + // a CPU fallback if the caller didn't provide one. We own this + // instance and free it in the dtor below. + if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = ggml_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + sched = ggml_backend_sched_new(backends.data(), + /*bufts=*/nullptr, + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + bool alloc_compute_buffer(get_graph_cb_t get_graph) { + if (multi_backend_mode) { + if (sched_reserved) return true; + if (!ensure_sched()) return false; + reset_compute_ctx(); + ggml_cgraph* gf = get_compute_graph(get_graph); + backend_tensor_data_map.clear(); + if (!ggml_backend_sched_reserve(sched, gf)) { + LOG_ERROR("%s: sched reserve failed", get_desc().c_str()); + return false; + } + sched_reserved = true; + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) { + ggml_backend_t b = ggml_backend_sched_get_backend(sched, i); + size_t s = ggml_backend_sched_get_buffer_size(sched, b); + LOG_DEBUG("%s sched buf[%d] %s = %.2f MB", + get_desc().c_str(), i, ggml_backend_name(b), + s / (1024.f * 1024.f)); + } + return true; + } + if (compute_allocr != nullptr) { return true; } @@ -2018,6 +2143,40 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { + // Consume any pending multi-backend spec set by the caller via + // g_pending_multi_backend_spec(). + MultiBackendSpec* pending = g_pending_multi_backend_spec(); + if (pending != nullptr) { + g_pending_multi_backend_spec() = nullptr; + multi_backend_mode = true; + multi_backend_kind = pending->mode; + additional_backends = pending->additional_backends; + tensor_backend_fn = pending->tensor_backend_fn; + cpu_fallback_backend = pending->cpu_fallback; + row_split_ratios = pending->tensor_split_ratios; + row_main_device = pending->main_device; + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = ggml_backend_split_buffer_type( + runtime_backend, + row_main_device, + row_split_ratios.empty() ? 
nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("multi-backend: row-split buft init failed " + "(backend does not publish " + "ggml_backend_split_buffer_type); falling back " + "to single-backend mode"); + multi_backend_mode = false; + additional_backends.clear(); + cpu_fallback_backend = nullptr; + } + } + if (multi_backend_mode && offload_params_to_cpu) { + LOG_WARN("multi-backend split is incompatible with " + "offload_params_to_cpu; ignoring offload"); + offload_params_to_cpu = false; + } + } + alloc_params_ctx(); if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { params_backend = ggml_backend_cpu_init(); @@ -2035,6 +2194,16 @@ struct GGMLRunner { ggml_backend_free(params_backend); } free_cache_ctx_and_buffer(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } + // additional_backends are owned by the caller (see the MultiBackendSpec + // setup site in stable-diffusion.cpp); not freed here. } virtual GGMLRunnerContext get_context() { @@ -2054,7 +2223,234 @@ struct GGMLRunner { alloc_compute_ctx(); } - bool alloc_params_buffer() { + // Multi-backend params allocation: walk params_ctx, classify each tensor + // via tensor_backend_fn, allocate one buffer per backend on its default + // buft, bind tensors via ggml_tallocr. + bool alloc_params_buffer_layer_split() { + // Build the backend list (main first, then additional). Index 0 is + // the default for tensors whose callback returns nullptr. + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + + std::vector bufts; + bufts.reserve(backends.size()); + std::vector aligns(backends.size()); + std::vector sizes(backends.size(), 0); + std::vector counts(backends.size(), 0); + for (size_t i = 0; i < backends.size(); i++) { + bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); + aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + // Diagnostic: confirm we got a sensible buft from each backend. + const char* buft_name = ggml_backend_buft_name(bufts[i]); + const char* backend_name = ggml_backend_name(backends[i]); + ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); + enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; + const char* dev_name = dev ? ggml_backend_dev_name(dev) : "(none)"; + LOG_DEBUG("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + get_desc().c_str(), i, backend_name ? backend_name : "(null)", + buft_name ? buft_name : "(null)", dev_name, + (int)dev_type); + } + + // First pass: assign each tensor to a backend, accumulate sizes. + std::map tensor_backend_idx; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + int idx = 0; + if (tensor_backend_fn) { + ggml_backend_t target = tensor_backend_fn(t->name); + if (target != nullptr) { + for (size_t i = 0; i < backends.size(); i++) { + if (backends[i] == target) { + idx = int(i); + break; + } + } + } + } + tensor_backend_idx[t] = idx; + size_t s = ggml_backend_buft_get_alloc_size(bufts[idx], t); + sizes[idx] += GGML_PAD(s, aligns[idx]); + counts[idx] += 1; + } + + // Allocate one buffer per used backend. 
+ multi_params_buffers.assign(backends.size(), nullptr); + for (size_t i = 0; i < backends.size(); i++) { + if (sizes[i] == 0) continue; + // Diagnostic: query the device's free memory BEFORE alloc. + ggml_backend_dev_t dev_pre = ggml_backend_buft_get_device(bufts[i]); + size_t free_pre = 0, total_pre = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_pre, &total_pre); + multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); + if (multi_params_buffers[i] == nullptr) { + LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f)); + return false; + } + // Diagnostic: query AFTER alloc. The drop in free memory tells + // us whether the alloc actually went to GPU device memory or + // to a virtual reservation that's not yet committed. + size_t free_post = 0, total_post = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_post, &total_post); + int64_t actual_drop = (int64_t)free_pre - (int64_t)free_post; + void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); + size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); + bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); + LOG_DEBUG("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", + get_desc().c_str(), i, ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), + free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f), + actual_drop / (1024.f * 1024.f), + base, (int)is_host); + } + + // Bind tensors via ggml_tallocr. + std::vector tallocs(backends.size()); + for (size_t i = 0; i < backends.size(); i++) { + if (multi_params_buffers[i] != nullptr) { + tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]); + } + } + for (auto& kv : tensor_backend_idx) { + ggml_status st = ggml_tallocr_alloc(&tallocs[kv.second], kv.first); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s tallocr_alloc failed for tensor %s", + get_desc().c_str(), kv.first->name); + return false; + } + } + // Diagnostic: pick a sample tensor per backend and confirm its + // buffer + data pointer. + std::vector sampled(backends.size(), false); + for (auto& kv : tensor_backend_idx) { + int idx = kv.second; + if (sampled[idx]) continue; + sampled[idx] = true; + ggml_tensor* t = kv.first; + LOG_DEBUG("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, + t->buffer ? (int)ggml_backend_buffer_is_host(t->buffer) : -1); + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + } + + // Log the breakdown. + for (size_t i = 0; i < backends.size(); i++) { + if (counts[i] == 0) continue; + LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), + counts[i]); + } + return true; + } + + // Heuristic for row-split eligibility: contiguous, rank-2, both dims + // >= 256, and NOT a view. 1D biases / norms / embeddings / small + // projections / views fall back to the main GPU's regular per-device + // buft. Excluding views avoids the cuda split buft's + // GGML_ASSERT(view_src == nullptr) — sticking to the buft's documented + // contract instead of patching ggml. 
+ static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + bool alloc_params_buffer_row_split() { + if (row_split_buft == nullptr) { + LOG_ERROR("alloc_params_buffer_row_split: row-split buft not " + "initialized (backend lacks " + "ggml_backend_split_buffer_type)"); + return false; + } + ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); + const size_t main_align = ggml_backend_buft_get_alignment(main_buft); + const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); + + size_t main_size = 0, split_size = 0; + size_t main_count = 0, split_count = 0; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + if (is_row_split_eligible(t)) { + size_t s = ggml_backend_buft_get_alloc_size(row_split_buft, t); + split_size += GGML_PAD(s, split_align); + split_count++; + } else { + size_t s = ggml_backend_buft_get_alloc_size(main_buft, t); + main_size += GGML_PAD(s, main_align); + main_count++; + } + } + + if (main_size > 0) { + row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size); + if (row_main_buffer == nullptr) { + LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", + get_desc().c_str(), main_size / (1024.f * 1024.f)); + return false; + } + } + if (split_size > 0) { + row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size); + if (row_split_buffer == nullptr) { + LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", + get_desc().c_str(), split_size / (1024.f * 1024.f)); + return false; + } + } + + ggml_tallocr main_alloc{}; + ggml_tallocr split_alloc{}; + if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer); + if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer); + + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + ggml_status st = is_row_split_eligible(t) + ? ggml_tallocr_alloc(&split_alloc, t) + : ggml_tallocr_alloc(&main_alloc, t); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s row-split tallocr_alloc failed for tensor %s", + get_desc().c_str(), t->name); + return false; + } + } + + if (row_main_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + if (row_split_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)", + get_desc().c_str(), + main_size / (1024.f * 1024.f), main_count, + split_size / (1024.f * 1024.f), split_count); + return true; + } + + // Internal: always materializes the params buffer. Used by both the + // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` + // path; the latter must bypass the lazy-skip. 
+ bool do_alloc_params_buffer() { + if (multi_backend_mode && multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } + if (multi_backend_mode) { + return alloc_params_buffer_layer_split(); + } size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); if (params_buffer == nullptr) { @@ -2072,18 +2468,96 @@ struct GGMLRunner { return true; } + bool alloc_params_buffer() { + // Lazy mode: skip alloc until first compute() (via ensure_params_loaded). + // The caller still goes through alloc_params_buffer + get_param_tensors + // at init; ModelLoader::load_tensors will silently skip this runner's + // tensors (their data ptrs are null because no buffer is allocated yet) + // and the lazy_load_fn callback re-loads them on demand. + if (lazy_load_fn) return true; + return do_alloc_params_buffer(); + } + + void set_lazy_load(std::function fn) { + lazy_load_fn = std::move(fn); + } + + bool ensure_params_loaded() { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { + return true; + } + if (!lazy_load_fn) { + LOG_ERROR("%s: no params buffer and no lazy_load_fn", get_desc().c_str()); + return false; + } + int64_t t0 = ggml_time_ms(); + if (!do_alloc_params_buffer()) return false; + if (!lazy_load_fn()) { + LOG_ERROR("%s: lazy load callback failed", get_desc().c_str()); + return false; + } + int64_t t1 = ggml_time_ms(); + LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + // Diagnostic: report device-memory free per backend AFTER load. + // If the bytes actually went to GPU, free should have decreased + // by ~params_size for each layer-split shard. 
+ if (multi_backend_mode) { + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + for (size_t i = 0; i < backends.size(); i++) { + ggml_backend_dev_t dev = ggml_backend_get_device(backends[i]); + if (!dev) continue; + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + LOG_DEBUG("%s post-load device %s free=%.1f MB / %.1f MB", + get_desc().c_str(), + ggml_backend_dev_name(dev), + free_b / (1024.f * 1024.f), + total_b / (1024.f * 1024.f)); + } + } + return true; + } + void free_params_buffer() { if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } } size_t get_params_buffer_size() { + size_t total = 0; if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); + total += ggml_backend_buffer_get_size(params_buffer); } - return 0; + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); + } + if (row_split_buffer != nullptr) total += ggml_backend_buffer_get_size(row_split_buffer); + if (row_main_buffer != nullptr) total += ggml_backend_buffer_get_size(row_main_buffer); + return total; } void free_cache_ctx_and_buffer() { @@ -2096,11 +2570,23 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset rather than free: keeping the sched alive across compute() + // calls of a sampling loop avoids the per-step rebuild cost. + ggml_backend_sched_reset(sched); + sched_reserved = false; + } offload_params_to_params_backend(); } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so + // it gets a backend assignment (otherwise tensors with no producers + // and no consumers leave sched at backend_id=-1). 
+ if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -2160,6 +2646,9 @@ struct GGMLRunner { int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { + if (!ensure_params_loaded()) { + return std::nullopt; + } if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return std::nullopt; @@ -2168,18 +2657,41 @@ struct GGMLRunner { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); return std::nullopt; } - reset_compute_ctx(); - ggml_cgraph* gf = get_compute_graph(get_graph); - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { - LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return std::nullopt; + ggml_cgraph* gf = nullptr; + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc graph failed", get_desc().c_str()); + return std::nullopt; + } + } else { + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); + return std::nullopt; + } } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend && + ggml_backend_is_cpu(cpu_fallback_backend)) { + ggml_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } - ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); return std::nullopt; @@ -2259,6 +2771,14 @@ class GGMLBlock { prefix = prefix + "."; } init_params(ctx, tensor_storage_map, prefix); + // Tag each param tensor with its full (prefix-qualified) name so the + // multi-backend runner's tensor_backend_fn callback can route it. + // Without this, init_params leaves tensors with empty t->name. + for (auto& pair : params) { + if (pair.second != nullptr) { + ggml_set_name(pair.second, (prefix + pair.first).c_str()); + } + } init_blocks(ctx, tensor_storage_map, prefix); } diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index 50158c883..6d60a73ec 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -121,6 +121,24 @@ __STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backen } } +// Runtime lookup of a backend's row-split buffer type (currently published by +// the CUDA and SYCL backends as `ggml_backend_split_buffer_type` in their +// reg_get_proc_address tables). Returns nullptr when the backend does not +// support row-split, leaving the caller to fall back to a non-split path. 
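+// For illustration, a caller on a two-device system with a 0.75/0.25 split
+// might do (sketch; falls back to the default buft when row-split is
+// unavailable):
+//   float ratios[2] = {0.75f, 0.25f};
+//   ggml_backend_buffer_type_t buft =
+//       ggml_backend_split_buffer_type(backend, /*main_device=*/0, ratios);
+//   if (buft == nullptr) { buft = ggml_backend_get_default_buffer_type(backend); }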
+using __ggml_backend_split_buffer_type_t = ggml_backend_buffer_type_t (*)(int main_device, const float* tensor_split); + +__STATIC_INLINE__ ggml_backend_buffer_type_t ggml_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend); + if (reg == nullptr) { + return nullptr; + } + auto fn = reinterpret_cast<__ggml_backend_split_buffer_type_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type")); + if (fn == nullptr) { + return nullptr; + } + return fn(main_device, tensor_split); +} + __STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { if (tensor == nullptr) { return nullptr; diff --git a/src/model.cpp b/src/model.cpp index 8fdde3b76..2f7e2b78f 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -783,11 +783,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::unique_ptr mmapped; if (enable_mmap && !is_zip) { - LOG_DEBUG("using mmap for I/O"); mmapped = MmapWrapper::create(file_path); if (!mmapped) { - LOG_WARN("failed to memory-map '%s'", file_path.c_str()); + LOG_WARN("failed to memory-map '%s' (falling back to read())", + file_path.c_str()); + } else { + LOG_INFO("using mmap for '%s'", file_path.c_str()); } + } else if (!is_zip) { + LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", + file_path.c_str()); } int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); @@ -1003,9 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, int n_threads, - bool enable_mmap) { + bool enable_mmap, + bool quiet_unknown_tensors) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; + std::atomic unknown_tensor_count{0}; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); @@ -1023,7 +1030,11 @@ bool ModelLoader::load_tensors(std::map& tensors, return true; } } - LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + if (quiet_unknown_tensors) { + unknown_tensor_count.fetch_add(1); + } else { + LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + } return true; } @@ -1072,6 +1083,10 @@ bool ModelLoader::load_tensors(std::map& tensors, if (some_tensor_not_init) { return false; } + if (quiet_unknown_tensors && unknown_tensor_count.load() > 0) { + LOG_INFO("skipped %zu unknown tensors (--quiet-unknown-tensors)", + unknown_tensor_count.load()); + } return true; } diff --git a/src/model.h b/src/model.h index 65bc6c367..03d4e3732 100644 --- a/src/model.h +++ b/src/model.h @@ -193,6 +193,8 @@ using TensorTypeRules = std::vector>; TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules); +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; @@ -224,7 +226,8 @@ class ModelLoader { bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, int n_threads = 0, - bool use_mmap = false); + bool use_mmap = false, + bool quiet_unknown_tensors = false); std::vector get_tensor_names() const { std::vector names; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 88102ff61..c389c6242 100644 --- a/src/stable-diffusion.cpp +++ 
b/src/stable-diffusion.cpp @@ -1,5 +1,6 @@ #include "ggml_extend.hpp" +#include "backend_fit.hpp" #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -108,10 +109,47 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: - ggml_backend_t backend = nullptr; // general backend + ggml_backend_t backend = nullptr; // general / main backend ggml_backend_t clip_backend = nullptr; ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; + + // Auto-fit decisions resolved into device-name strings. When non-empty, + // these win over the user-provided sd_ctx_params->*_backend_device. + // When empty, the explicit param (or `backend` fallback) is used. + std::string fit_diffusion_device; + std::string fit_clip_device; + std::string fit_vae_device; + // Per-component offload-params override coming from auto-fit. Forces + // offload_params_to_cpu for that component even when global flag is off. + bool fit_dit_offload_params = false; + bool fit_cond_offload_params = false; + bool fit_vae_offload_params = false; + + // Multi-GPU split state (LAYER_SPLIT or ROW_SPLIT). Holds the ordered + // list of device names and per-device share bytes; the actual backend + // handles are init'd at construction time and stored in *_extra_backends + // so the destructor can free them. fit_*_row_split=true means use the + // CUDA row-split path (matmul weights split row-wise via cuda_split_buft); + // false means layer-split (per-block backend assignment via sched). + std::vector fit_dit_split_device_names; + std::vector fit_dit_split_share_bytes; + std::vector fit_dit_extra_backends; + bool fit_dit_row_split = false; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + std::vector fit_cond_extra_backends; + bool fit_cond_row_split = false; + + // Owned model loader: kept alive across init() so lazy_load callbacks + // can re-read tensor data from disk on demand. Only set when at least + // one component is configured for lazy load. + std::unique_ptr owned_model_loader; + // Auto-fit decided init-time SUM exceeds device cap; defer cond + DiT + // allocation until first compute() so peaks don't pile up. 
+ bool auto_lazy_load = false; + bool enable_mmap_member = false; SDVersion version; bool vae_decode_only = false; @@ -168,11 +206,35 @@ class StableDiffusionGGML { if (vae_backend != backend) { ggml_backend_free(vae_backend); } + if (diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } + for (auto* b : fit_dit_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } + for (auto* b : fit_cond_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } ggml_backend_free(backend); } - void init_backend() { - backend = sd_get_default_backend(); + void init_backend(const char* main_device_name) { + if (main_device_name != nullptr && main_device_name[0] != '\0') { + backend = init_named_backend(main_device_name); + if (backend == nullptr) { + LOG_WARN("main backend device '%s' init failed; falling back to default", + main_device_name); + } + } + if (backend == nullptr) { + backend = sd_get_default_backend(); + } } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -202,9 +264,14 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + init_backend(sd_ctx_params->main_backend_device); - ModelLoader model_loader; + // Use a stack-local handle that points into `owned_model_loader` if we + // need lazy callbacks (decided after auto-fit), otherwise a temp local + // is fine. Defer the unique_ptr decision; for now always own it so the + // pointer is stable even if lazy load is enabled later in this init(). + owned_model_loader = std::make_unique(); + ModelLoader& model_loader = *owned_model_loader; if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -328,6 +395,142 @@ class StableDiffusionGGML { return oss.str(); }; + if (sd_ctx_params->auto_fit) { + backend_fit::ComputeReserves reserves; + if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) { + reserves.dit_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) { + reserves.vae_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) { + reserves.conditioner_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB; + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + backend_fit::MultiGpuMode mode = backend_fit::str_to_multi_gpu_mode( + SAFE_STR(sd_ctx_params->multi_gpu_mode)); + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, + sd_ctx_params->auto_multi_gpu, mode); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Find the CPU device's ggml name (so we can route "CPU" + // placements through init_named_backend uniformly). 
+ std::string cpu_device_name; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + cpu_device_name = ggml_backend_dev_name(dev); + break; + } + } + auto device_id_to_name = [&](int dev_id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == dev_id) return dev.name; + } + return {}; + }; + auto resolve = [&](const backend_fit::Decision* d, + std::string& out_device, + bool& out_offload, + std::vector& out_split_devices, + std::vector& out_split_shares, + bool& out_row_split) { + out_split_devices.clear(); + out_split_shares.clear(); + out_row_split = false; + if (d == nullptr) { + out_device.clear(); + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::CPU) { + out_device = cpu_device_name; + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { + // Primary device drives main_backend choice for the model; + // the rest become additional backends in the spec. + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + out_split_devices.push_back(device_id_to_name(d->split_device_ids[k])); + out_split_shares.push_back(d->split_share_bytes[k]); + } + if (!out_split_devices.empty()) out_device = out_split_devices[0]; + out_offload = false; + out_row_split = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT); + return; + } + out_device = device_id_to_name(d->device_id); + out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); + }; + std::vector dummy_devs; + std::vector dummy_shares; + bool dummy_row_split = false; + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), + fit_diffusion_device, fit_dit_offload_params, + fit_dit_split_device_names, fit_dit_split_share_bytes, + fit_dit_row_split); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares, + dummy_row_split); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), + fit_clip_device, fit_cond_offload_params, + fit_cond_split_device_names, fit_cond_split_share_bytes, + fit_cond_row_split); + + // CPU placements: leave fit_*_device empty AND remember they're + // CPU so the resolver below picks ggml_backend_cpu_init(). + + // Decide auto-lazy-load: if the per-component MAX-based plan fits + // but the SUM-of-components on any device would exceed cap, defer + // alloc until first compute() so peaks don't pile up. Heuristic: + // sum the per-device on_device_bytes across all GPU decisions + // (excluding VAE which is small) and compare to free_bytes. 
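+ // For illustration: with 16 GiB usable on CUDA0 after the margin, a
+ // 14 GiB DiT and a 9 GiB conditioner each fit alone (the MAX-based plan),
+ // but their SUM (23 GiB) exceeds the cap, so auto_lazy_load defers each
+ // component's alloc until its first compute().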
+ std::map sum_per_device; + auto add_sum = [&](const backend_fit::Decision* d) { + if (!d) return; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; + } + } else if (d->placement == backend_fit::Placement::GPU || + d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + sum_per_device[d->device_id] += d->on_device_bytes; + } + }; + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER)); + for (const auto& dev : devices) { + int64_t cap = dev.free_bytes - margin_bytes; + int64_t sum = sum_per_device.count(dev.id) ? sum_per_device[dev.id] : 0; + if (sum > cap) { + LOG_INFO("auto-fit: enabling lazy load (init-time SUM %lld MiB on %s " + "exceeds cap %lld MiB; per-component MAX plan needs lazy alloc)", + (long long)(sum / backend_fit::MiB), + dev.name.c_str(), + (long long)(cap / backend_fit::MiB)); + auto_lazy_load = true; + break; + } + } + } + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -373,19 +576,362 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + // If auto-fit decided ANY component must offload params, force the + // global flag on. This is a coarsening: one component needing offload + // forces all to offload (safer, just slower for non-offload ones). + if (fit_dit_offload_params || fit_cond_offload_params || fit_vae_offload_params) { + if (!offload_params_to_cpu) { + LOG_INFO("auto-fit: enabling offload_params_to_cpu (one or more " + "components don't fit without param streaming)"); + offload_params_to_cpu = true; + } + } + + // Pick the effective device name for each component: the auto-fit + // override (if any) wins; otherwise the user-provided string; nullptr + // falls back to `backend` (the main). + auto effective_device = [&](const std::string& fit_str, const char* user_str) -> const char* { + if (!fit_str.empty()) return fit_str.c_str(); + return user_str; + }; + const char* diffusion_dev_name = effective_device(fit_diffusion_device, + sd_ctx_params->diffusion_backend_device); + const char* clip_dev_name = effective_device(fit_clip_device, + sd_ctx_params->clip_backend_device); + const char* vae_dev_name = effective_device(fit_vae_device, + sd_ctx_params->vae_backend_device); + + // Build the row-split MultiBackendSpec for a component (ROW_SPLIT + // mode). Unlike layer-split, the runner uses a SINGLE CUDA backend; + // matmul weights are row-split across all CUDA devices internally + // by cuda_split_buffer_type. extra_backends stays empty. + // - share_devices/share_bytes: per-device share order from auto-fit + // (largest first by descending share). The first device is the + // "main" CUDA device, where the compute buffer lives. + // Returns true on success; populates out_spec.tensor_split_ratios + // with a vector of length total CUDA device count. 
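+ // For illustration: shares of 12 GiB on CUDA0 and 4 GiB on CUDA1 yield
+ // tensor_split_ratios = [0.75, 0.25] with main_device = 0 (largest share).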
+ auto prepare_row_split_spec = [&](const std::vector& share_devices, + const std::vector& share_bytes, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (share_devices.size() < 2) return false; + + // Derive the backend registry from the device-name prefix (e.g. + // "CUDA0" -> reg "CUDA", "SYCL1" -> reg "SYCL"). This keeps the + // code backend-agnostic: any backend whose registry publishes + // `ggml_backend_split_buffer_type` via reg_get_proc_address can + // drive row-split, not just CUDA. + auto reg_prefix_of = [](const std::string& name) -> std::string { + size_t i = 0; + while (i < name.size() && (std::isalpha((unsigned char)name[i]) || name[i] == '_')) i++; + return name.substr(0, i); + }; + const std::string reg_name = reg_prefix_of(share_devices[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) return false; + const int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) return false; + + auto reg_index_of = [&](const std::string& name) -> int { + if (name.rfind(reg_name, 0) != 0) return -1; + try { return std::stoi(name.substr(reg_name.size())); } catch (...) { return -1; } + }; + + std::vector ratios(dev_count, 0.0f); + int64_t total = 0; + for (auto b : share_bytes) total += b; + if (total <= 0) return false; + int main_dev = -1; + int64_t max_share = -1; + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = reg_index_of(share_devices[k]); + if (idx < 0 || idx >= dev_count) continue; + ratios[idx] = float(double(share_bytes[k]) / double(total)); + if (share_bytes[k] > max_share) { + max_share = share_bytes[k]; + main_dev = idx; + } + } + if (main_dev < 0) return false; + + // Init extra backends for the non-main devices so sched can + // route ops across them (row-split tensors are dispatched by the + // primary backend; ggml-sched still needs all participating + // backends in its list to schedule cross-device copies). + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = reg_index_of(share_devices[k]); + if (idx == main_dev || idx < 0) continue; + ggml_backend_t b = init_named_backend(share_devices[k]); + if (b != nullptr) { + out_extra_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", + share_devices[k].c_str()); + } + } + out_spec.mode = MultiBackendMode::ROW_SPLIT; + out_spec.tensor_split_ratios = ratios; + out_spec.main_device = main_dev; + out_spec.additional_backends.assign(out_extra_backends.begin(), + out_extra_backends.end()); + out_spec.tensor_backend_fn = nullptr; + out_spec.cpu_fallback = nullptr; + + std::string ratio_str; + for (int i = 0; i < dev_count; i++) { + if (i > 0) ratio_str += ","; + char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); + ratio_str += buf; + } + LOG_INFO("row-split spec: ratios=[%s] main_device=%d", + ratio_str.c_str(), main_dev); + return true; + }; + + // Build the layer-split MultiBackendSpec for a component. Only used + // when auto-fit picked GPU_LAYER_SPLIT for this component. 
+ // - main_backend: the runner's primary backend (also first in the spec) + // - extra_device_names: additional device names to span + // - share_bytes: per-device share (for proportional block partition) + // - tensor_prefix: the model's weight name prefix (e.g., + // "model.diffusion_model.") — used to locate block-indexed tensors + // Returns true if a spec was prepared and pending_spec_storage was + // populated; the caller must set g_pending_multi_backend_spec() + // immediately before constructing the model. + auto prepare_layer_split_spec = [&](ggml_backend_t main_backend, + const std::vector& extra_device_names, + const std::vector& share_bytes, + const std::string& tensor_prefix, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (extra_device_names.size() < 2) return false; // only [main] -> single GPU + // Init the additional backends (skip [0] which is main_backend). + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < extra_device_names.size(); k++) { + ggml_backend_t b = init_named_backend(extra_device_names[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init extra backend %s; falling back to single backend", + extra_device_names[k].c_str()); + return false; + } + out_extra_backends.push_back(b); + all_backends.push_back(b); + } + + // Walk tensor_storage_map to get per-block byte sizes and the + // total non-block bytes that will land on backend[0]. Then + // greedy-partition blocks by byte budget to balance per-backend + // bytes (accounting for non-block fixed load on backend[0]). + int max_block_idx = -1; + static const std::regex block_re( + R"((?:transformer_blocks|joint_blocks|double_blocks|single_blocks|blocks|layers)\.([0-9]+)\.)"); + std::map block_bytes; // block idx -> bytes + int64_t non_block_bytes = 0; + for (const auto& kv : tensor_storage_map) { + if (!tensor_prefix.empty() && kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + std::smatch m; + if (std::regex_search(kv.first, m, block_re)) { + int idx = std::stoi(m[1]); + if (idx > max_block_idx) max_block_idx = idx; + block_bytes[idx] += bytes; + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no blocks found under prefix '%s'; aborting split", + tensor_prefix.c_str()); + return false; + } + const int n_blocks = max_block_idx + 1; + + // Build per-backend byte budgets from share_bytes (ratios). The + // first backend absorbs `non_block_bytes` as a fixed load, so we + // SHRINK its remaining budget for blocks accordingly. + int64_t total_share = 0; + for (auto s : share_bytes) total_share += s; + int64_t total_block_bytes = 0; + for (const auto& kv : block_bytes) total_block_bytes += kv.second; + std::vector backend_block_budgets(share_bytes.size(), 0); + for (size_t k = 0; k < share_bytes.size(); k++) { + int64_t share = int64_t(double(total_block_bytes + non_block_bytes) * + double(share_bytes[k]) / double(total_share)); + if (k == 0) share = std::max(share - non_block_bytes, 0); + backend_block_budgets[k] = share; + } + // Greedy assign each block (in order) to the current backend + // until its budget is filled, then move to the next. 
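+ // For illustration: 40 equal-sized blocks with block budgets in a 60/40
+ // ratio give boundaries = [24, 40], i.e. backend[0] owns blocks [0..24)
+ // and backend[1] owns blocks [24..40) (half-open, matching boundary_log).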
+ std::vector boundaries(share_bytes.size(), 0); + size_t cur_backend = 0; + int64_t cur_used = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur_backend + 1 < share_bytes.size() && + cur_used + bb > backend_block_budgets[cur_backend] && + cur_used > 0) { + boundaries[cur_backend] = b; + cur_backend++; + cur_used = 0; + } + cur_used += bb; + } + // The remaining backends get the rest, terminating at n_blocks. + for (size_t k = cur_backend; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + // Safety: ensure each backend has at least one block. + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) boundaries[k] = std::min(min_bound, n_blocks); + } + std::string boundary_log = "layer-split [" + tensor_prefix + "] " + + std::to_string(n_blocks) + " blocks: "; + int prev = 0; + for (size_t k = 0; k < all_backends.size() && k < boundaries.size(); k++) { + if (k > 0) boundary_log += ", "; + boundary_log += std::string(ggml_backend_name(all_backends[k])) + "=[" + + std::to_string(prev) + ".." + std::to_string(boundaries[k]) + ")"; + prev = boundaries[k]; + } + LOG_INFO("%s", boundary_log.c_str()); + + // Build the tensor_backend_fn closure. + std::vector backends_capture = all_backends; + std::vector boundaries_capture = boundaries; + std::string prefix_capture = tensor_prefix; + out_spec.tensor_backend_fn = + [backends_capture, boundaries_capture, prefix_capture](const std::string& name) -> ggml_backend_t { + if (!prefix_capture.empty() && + name.compare(0, prefix_capture.size(), prefix_capture) != 0) { + return backends_capture[0]; + } + std::smatch m; + if (!std::regex_search(name, m, block_re)) { + return backends_capture[0]; + } + int idx = std::stoi(m[1]); + for (size_t k = 0; k < boundaries_capture.size(); k++) { + if (idx < boundaries_capture[k]) { + return backends_capture[std::min(k, backends_capture.size() - 1)]; + } + } + return backends_capture.back(); + }; + // Spec contains the additional backends only (main is implicit). + out_spec.additional_backends.assign(out_extra_backends.begin(), out_extra_backends.end()); + out_spec.cpu_fallback = nullptr; + return true; + }; + + // Helper: init a named backend if name is non-null/non-empty, + // returns nullptr on missing/failed name (caller falls back to main). + auto init_named_or_null = [](const char* name) -> ggml_backend_t { + if (name == nullptr || name[0] == '\0') return nullptr; + return init_named_backend(name); + }; + + diffusion_backend = init_named_or_null(diffusion_dev_name); + if (!diffusion_backend) { + diffusion_backend = backend; + } else { + LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); + } + + // Tensor name sets for components that are configured for lazy load. + // Populated below right before/after the cond + DiT construction; + // consumed by the bulk-load step's ignore_tensors. + std::set cond_lazy_tensor_names; + std::set dit_lazy_tensor_names; + + // Build the layer-split MultiBackendSpec for DiT (when auto-fit picked + // GPU_LAYER_SPLIT). The spec is consumed by the diffusion_model's + // GGMLRunner ctor when we set g_pending_multi_backend_spec() to it. 
+ MultiBackendSpec dit_spec; + bool dit_spec_active = false; + if (!fit_dit_split_device_names.empty()) { + if (fit_dit_row_split) { + dit_spec_active = prepare_row_split_spec(fit_dit_split_device_names, + fit_dit_split_share_bytes, + fit_dit_extra_backends, + dit_spec); + } else { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } + } + // Lambda to set the pending spec immediately before constructing the + // diffusion model. Caller must invoke this on the same line / right + // before the std::make_shared<...Model>(diffusion_backend, ...) call. + auto prime_dit_spec = [&]() { + if (dit_spec_active) { + g_pending_multi_backend_spec() = &dit_spec; + } + }; + + // Same dance for the conditioner. The conditioner uses clip_backend as + // its main backend; we need to set up the spec BEFORE the cond_stage + // ctor runs (which is BEFORE the DiT ctor). Each cond model wraps one + // or more sub-runners; the spec's tensor_backend_fn handles all of + // them since it's keyed on tensor names with a generic block regex. + // (Some conditioners construct multiple sub-runners — only the FIRST + // ggml runner ctor consumes the pending spec, so we re-prime between + // sub-runners' allocs by leaving cond_spec_active true; the runner's + // multi_backend_mode is per-runner.) + // For LTX-2 specifically: LTXAVEmbedder constructs LLMRunner first + // (consumes spec), then LTXAVTextProjectionRunner (no spec consumed). + // The LLM has block-named tensors so layer-split applies; the + // projector has only 4 tensors and they should ride along on its + // single backend (clip_backend = main). Auto-fit's cond share counts + // both, so the share is over-counted on backend[0] for the projector. + // Acceptable for now — small correction. + ggml_backend_t clip_main_backend_for_spec = nullptr; // resolved below + MultiBackendSpec cond_spec; + bool cond_spec_active = false; + auto prime_cond_spec = [&]() { + if (cond_spec_active) { + g_pending_multi_backend_spec() = &cond_spec; + } + }; { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + clip_backend = init_named_or_null(clip_dev_name); + if (!clip_backend) { + clip_backend = backend; + } else { + LOG_INFO("CLIP: using device %s", clip_dev_name); + } + // Now that clip_backend is resolved, build the conditioner's + // multi-GPU spec if auto-fit picked one (row-split or layer-split). 
+ if (!fit_cond_split_device_names.empty()) { + if (fit_cond_row_split) { + cond_spec_active = prepare_row_split_spec(fit_cond_split_device_names, + fit_cond_split_share_bytes, + fit_cond_extra_backends, + cond_spec); + } else { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", + fit_cond_extra_backends, + cond_spec); + } } if (sd_version_is_sd3(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); } else if (sd_version_is_flux(version)) { @@ -406,12 +952,14 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -419,40 +967,47 @@ class StableDiffusionGGML { "", false); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, true, 0, true); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + prime_dit_spec(); + high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", @@ -472,42 +1027,50 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); } else if 
(sd_version_is_z_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -517,6 +1080,7 @@ class StableDiffusionGGML { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -524,13 +1088,15 @@ class StableDiffusionGGML { version, PM_VERSION_2); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -540,11 +1106,83 @@ class StableDiffusionGGML { } } - cond_stage_model->alloc_params_buffer(); - cond_stage_model->get_param_tensors(tensors); + // Conditioner: publish its tensors to the global map, EXCEPT the + // ones that are about to be configured for lazy load (we want the + // bulk loader to skip them — they have no buffer yet). + std::map cond_only_tensors; + cond_stage_model->get_param_tensors(cond_only_tensors); + std::map llm_lazy_map; + if (auto_lazy_load) { + for (const auto& kv : cond_only_tensors) { + if (kv.first.rfind("text_encoders.llm.", 0) == 0) { + llm_lazy_map[kv.first] = kv.second; + cond_lazy_tensor_names.insert(kv.first); + } + } + } + for (const auto& kv : cond_only_tensors) { + if (cond_lazy_tensor_names.find(kv.first) == cond_lazy_tensor_names.end()) { + tensors[kv.first] = kv.second; // eager — bulk loader will fill + } + } + if (auto_lazy_load && !llm_lazy_map.empty()) { + ModelLoader* loader_ptr = owned_model_loader.get(); + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? 
sd_ctx_params->n_threads : 2, + 2); + bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; + cond_stage_model->set_llm_lazy_load([=]() -> bool { + auto local_map = llm_lazy_map; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture, + quiet_capture); + }); + LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", + llm_lazy_map.size()); + } + cond_stage_model->alloc_params_buffer(); // no-op for the lazy LLM - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + std::map dit_only_tensors; + diffusion_model->get_param_tensors(dit_only_tensors); + if (auto_lazy_load) { + for (const auto& kv : dit_only_tensors) { + dit_lazy_tensor_names.insert(kv.first); + } + ModelLoader* loader_ptr = owned_model_loader.get(); + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? sd_ctx_params->n_threads : 2, + 2); + bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; + diffusion_model->set_lazy_load([=]() -> bool { + auto local_map = dit_only_tensors; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture, + quiet_capture); + }); + LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", + dit_only_tensors.size()); + } else { + for (const auto& kv : dit_only_tensors) { + tensors[kv.first] = kv.second; + } + } + diffusion_model->alloc_params_buffer(); // no-op when lazy_load_fn is set if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -555,11 +1193,13 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { + if (vae_dev_name != nullptr && vae_dev_name[0] != '\0') { + vae_backend = init_named_backend(vae_dev_name); + } + if (!vae_backend) { vae_backend = backend; + } else { + LOG_INFO("VAE: using device %s", vae_dev_name); } auto create_tae = [&]() -> std::shared_ptr { @@ -648,11 +1288,14 @@ class StableDiffusionGGML { if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { + const char* cn_dev_name = sd_ctx_params->control_net_backend_device; + if (cn_dev_name != nullptr && cn_dev_name[0] != '\0') { + controlnet_backend = init_named_backend(cn_dev_name); + } + if (!controlnet_backend) { controlnet_backend = backend; + } else { + LOG_INFO("ControlNet: using device %s", cn_dev_name); } control_net = std::make_shared(controlnet_backend, offload_params_to_cpu, @@ -754,6 +1397,14 @@ class StableDiffusionGGML { std::set ignore_tensors; tensors["alphas_cumprod"] = alphas_cumprod_tensor; + // Lazy-loaded components: skip 
them in the bulk load; their lazy + // callbacks will load them on first compute(). + for (const auto& name : cond_lazy_tensor_names) { + ignore_tensors.insert(name); + } + for (const auto& name : dit_lazy_tensor_names) { + ignore_tensors.insert(name); + } if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } @@ -783,7 +1434,9 @@ class StableDiffusionGGML { ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, + sd_ctx_params->enable_mmap, + sd_ctx_params->quiet_unknown_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -2142,16 +2795,31 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->prediction = PREDICTION_COUNT; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; - sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; - sd_ctx_params->diffusion_flash_attn = false; - sd_ctx_params->circular_x = false; - sd_ctx_params->circular_y = false; - sd_ctx_params->chroma_use_dit_mask = true; - sd_ctx_params->chroma_use_t5_mask = false; - sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->enable_mmap = false; + sd_ctx_params->main_backend_device = nullptr; + sd_ctx_params->diffusion_backend_device = nullptr; + sd_ctx_params->clip_backend_device = nullptr; + sd_ctx_params->vae_backend_device = nullptr; + sd_ctx_params->control_net_backend_device = nullptr; + sd_ctx_params->tae_backend_device = nullptr; + sd_ctx_params->upscaler_backend_device = nullptr; + sd_ctx_params->photomaker_backend_device = nullptr; + sd_ctx_params->vision_backend_device = nullptr; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->circular_x = false; + sd_ctx_params->circular_y = false; + sd_ctx_params->chroma_use_dit_mask = true; + sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; + sd_ctx_params->quiet_unknown_tensors = false; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2183,9 +2851,24 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" + "main_backend_device: %s\n" + "diffusion_backend_device: %s\n" + "clip_backend_device: %s\n" + "vae_backend_device: %s\n" + "control_net_backend_device: %s\n" + "tae_backend_device: %s\n" + "upscaler_backend_device: %s\n" + "photomaker_backend_device: %s\n" + "vision_backend_device: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve_dit_mb: %d\n" + "auto_fit_compute_reserve_vae_mb: %d\n" + "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" + "quiet_unknown_tensors: 
%s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2215,9 +2898,24 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + SAFE_STR(sd_ctx_params->main_backend_device), + SAFE_STR(sd_ctx_params->diffusion_backend_device), + SAFE_STR(sd_ctx_params->clip_backend_device), + SAFE_STR(sd_ctx_params->vae_backend_device), + SAFE_STR(sd_ctx_params->control_net_backend_device), + SAFE_STR(sd_ctx_params->tae_backend_device), + SAFE_STR(sd_ctx_params->upscaler_backend_device), + SAFE_STR(sd_ctx_params->photomaker_backend_device), + SAFE_STR(sd_ctx_params->vision_backend_device), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + sd_ctx_params->auto_fit_compute_reserve_dit_mb, + sd_ctx_params->auto_fit_compute_reserve_vae_mb, + sd_ctx_params->auto_fit_compute_reserve_cond_mb, + BOOL_STR(sd_ctx_params->auto_multi_gpu), + SAFE_STR(sd_ctx_params->multi_gpu_mode), + BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/util.cpp b/src/util.cpp index 0b514bb73..743738813 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -174,12 +174,33 @@ bool is_directory(const std::string& path) { class MmapWrapperImpl : public MmapWrapper { public: - MmapWrapperImpl(void* data, size_t size) - : MmapWrapper(data, size) {} + MmapWrapperImpl(void* data, size_t size, int fd) + : MmapWrapper(data, size), fd_(fd) {} ~MmapWrapperImpl() override { +#ifdef __linux__ + // Drop the kernel pagecache pages for this file. madvise(DONTNEED) + // alone only unmaps from the process address space; pagecache + // entries persist (`free` reports them as buff/cache and the OOM + // killer doesn't touch them, but they ARE counted against + // overcommit and can starve other allocations on tight-RAM + // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented + // way to evict pagecache for a specific fd's pages. + if (data_ != nullptr && size_ > 0) { + madvise(data_, size_, MADV_DONTNEED); + } + if (fd_ >= 0) { + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + } +#endif munmap(data_, size_); + if (fd_ >= 0) { + close(fd_); + } } + +private: + int fd_; }; std::unique_ptr MmapWrapper::create(const std::string& filename) { @@ -191,9 +212,10 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { int mmap_flags = MAP_PRIVATE; #ifdef __linux__ - // performance flags used by llama.cpp - // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); - // mmap_flags |= MAP_POPULATE; + // Sequential access hint helps the kernel read-ahead efficiently and + // also encourages eviction of already-read pages (the kernel keeps + // a smaller working set when this is set). 
+ posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); #endif struct stat sb; @@ -206,9 +228,8 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0); - close(file_descriptor); - if (mapped_data == MAP_FAILED) { + close(file_descriptor); return nullptr; } @@ -217,7 +238,7 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); #endif - return std::make_unique(mapped_data, file_size); + return std::make_unique(mapped_data, file_size, file_descriptor); } #endif diff --git a/src/version.cpp b/src/version.cpp index 97dc8426b..6c266153c 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,3 +1,6 @@ +#include + +#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -18,3 +21,12 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +}
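
For illustration, a minimal caller of the new device listing (a sketch, assuming `sd_list_devices` is declared in stable-diffusion.h as the definition above implies):

    #include "stable-diffusion.h"

    int main(void) {
        // Prints one "name<TAB>description" line per ggml device, e.g.
        // "CUDA0\tNVIDIA GeForce ...". Any printed name is a valid value for
        // --main-backend-device, --diffusion-backend-device, etc.
        sd_list_devices();
        return 0;
    }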