From 0e4c44b0b527923bd4256f38a5c754e3251082c3 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 13:39:29 +0200 Subject: [PATCH 1/9] feat: auto-fit component placement and per-component backend devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an auto-fit planner that picks DiT / VAE / Conditioner device placements from free GPU memory, treating each component as atomic (no intra-tensor row split — equivalent to llama.cpp's LLAMA_SPLIT_MODE_LAYER at component granularity, so views never land on a split buffer and no ggml patch is needed). Also adopt the PR #1184 CLI conventions: - new: --main-backend-device, --diffusion-backend-device, --clip-backend-device, --vae-backend-device, --control-net-backend-device, --tae-backend-device, --upscaler-backend-device, --photomaker-backend-device, --vision-backend-device, --list-devices - removed: --clip-on-cpu, --vae-on-cpu, --control-net-cpu (and the matching keep_*_on_cpu fields on sd_ctx_params_t) Auto-fit knobs: --auto-fit / --no-auto-fit, --no-multi-gpu, --fit-target, --fit-compute-reserve-{dit,vae,cond}, --fit-dry-run. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/common/common.cpp | 131 +++++++++-- examples/common/common.h | 22 +- include/stable-diffusion.h | 41 +++- src/backend_fit.hpp | 434 +++++++++++++++++++++++++++++++++++++ src/model.h | 2 + src/stable-diffusion.cpp | 263 ++++++++++++++++++---- src/version.cpp | 12 + 7 files changed, 838 insertions(+), 67 deletions(-) create mode 100644 src/backend_fit.hpp diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 1a5399b82..d3626fcce 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -380,6 +380,46 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "ggml device name to use as the main backend (see --list-devices). 
" + "When unset, the first GPU device is used.", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "ggml device name for the diffusion / flow model. " + "Falls back to --main-backend-device.", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "ggml device name for the text encoders. " + "Falls back to --main-backend-device.", + &clip_backend_device}, + {"", + "--vae-backend-device", + "ggml device name for the VAE. Falls back to --main-backend-device.", + &vae_backend_device}, + {"", + "--control-net-backend-device", + "ggml device name for the ControlNet. " + "Falls back to --main-backend-device.", + &control_net_backend_device}, + {"", + "--tae-backend-device", + "ggml device name for the TAE (currently routed through main).", + &tae_backend_device}, + {"", + "--upscaler-backend-device", + "ggml device name for the upscaler (currently routed through main).", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "ggml device name for PhotoMaker (currently routed through main).", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "ggml device name for the vision model (currently routed through main).", + &vision_backend_device}, }; options.int_options = { @@ -392,6 +432,23 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, + {"", + "--fit-compute-reserve-dit", + "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " + "(0 keeps the built-in default)", + &auto_fit_compute_reserve_dit_mb}, + {"", + "--fit-compute-reserve-vae", + "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", + &auto_fit_compute_reserve_vae_mb}, + {"", + "--fit-compute-reserve-cond", + "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", + &auto_fit_compute_reserve_cond_mb}, }; 
options.float_options = {}; @@ -409,18 +466,6 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -461,6 +506,24 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit and use the explicit *-backend-device flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + "--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -559,6 +622,15 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, }; return options; @@ -671,9 +743,19 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? 
"true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " main_backend_device: \"" << main_backend_device << "\",\n" + << " diffusion_backend_device: \"" << diffusion_backend_device << "\",\n" + << " clip_backend_device: \"" << clip_backend_device << "\",\n" + << " vae_backend_device: \"" << vae_backend_device << "\",\n" + << " control_net_backend_device: \"" << control_net_backend_device << "\",\n" + << " tae_backend_device: \"" << tae_backend_device << "\",\n" + << " upscaler_backend_device: \"" << upscaler_backend_device << "\",\n" + << " photomaker_backend_device: \"" << photomaker_backend_device << "\",\n" + << " vision_backend_device: \"" << vision_backend_device << "\",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -729,9 +811,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, + main_backend_device.empty() ? nullptr : main_backend_device.c_str(), + diffusion_backend_device.empty() ? nullptr : diffusion_backend_device.c_str(), + clip_backend_device.empty() ? nullptr : clip_backend_device.c_str(), + vae_backend_device.empty() ? nullptr : vae_backend_device.c_str(), + control_net_backend_device.empty() ? 
nullptr : control_net_backend_device.c_str(), + tae_backend_device.empty() ? nullptr : tae_backend_device.c_str(), + upscaler_backend_device.empty() ? nullptr : upscaler_backend_device.c_str(), + photomaker_backend_device.empty() ? nullptr : photomaker_backend_device.c_str(), + vision_backend_device.empty() ? nullptr : vision_backend_device.c_str(), flash_attn, diffusion_flash_attn, taesd_preview, @@ -744,6 +832,13 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, + auto_fit, + auto_fit_target_mb, + auto_fit_dry_run, + auto_fit_compute_reserve_dit_mb, + auto_fit_compute_reserve_vae_mb, + auto_fit_compute_reserve_cond_mb, + auto_multi_gpu, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index c4498c352..8243d6cba 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -110,9 +110,15 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; + std::string main_backend_device; + std::string diffusion_backend_device; + std::string clip_backend_device; + std::string vae_backend_device; + std::string control_net_backend_device; + std::string tae_backend_device; + std::string upscaler_backend_device; + std::string photomaker_backend_device; + std::string vision_backend_device; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -128,6 +134,16 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. 
+ bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + int auto_fit_compute_reserve_dit_mb = 0; + int auto_fit_compute_reserve_vae_mb = 0; + int auto_fit_compute_reserve_cond_mb = 0; + bool auto_multi_gpu = true; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 75027f8f8..ed6336ba1 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -188,9 +188,18 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; + // Per-component backend device names (ggml device names). Empty / NULL + // means "use the main backend device". The strings are only borrowed for + // the duration of the init call. See sd_list_devices() for what to pass. + const char* main_backend_device; + const char* diffusion_backend_device; + const char* clip_backend_device; + const char* vae_backend_device; + const char* control_net_backend_device; + const char* tae_backend_device; + const char* upscaler_backend_device; + const char* photomaker_backend_device; + const char* vision_backend_device; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -203,6 +212,27 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), the *_backend_device strings are + // ignored and the plan is computed automatically. + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). + // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the + // per-component compute-buffer reserve; 0 means use the built-in default. 
+ bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + int auto_fit_compute_reserve_dit_mb; + int auto_fit_compute_reserve_vae_mb; + int auto_fit_compute_reserve_cond_mb; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. Set false to keep all components on a single GPU when + // they fit. Defaults to true. Each component still lives entirely on + // one device — no intra-tensor row split. + bool auto_multi_gpu; } sd_ctx_params_t; typedef struct { @@ -449,6 +479,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used +// directly as the value of --*-backend-device flags. +SD_API void sd_list_devices(void); + #ifdef __cplusplus } #endif diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..52254f0e8 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,434 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is treated as a single atomic unit that lives entirely on +// one device (plus its compute buffer on the same device). There is no +// intra-tensor row split: cross-device parallelism comes from placing +// different components on different GPUs, not from splitting individual +// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER +// at the component granularity. +// +// Placement priority: DiT + compute buffer -> VAE -> Conditioner. +// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). 
+ +#include <algorithm> +#include <cstdint> +#include <limits> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model.h" +#include "util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU +}; + +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; +}; + +struct Plan { + std::vector<Decision> decisions; + std::map<int, int64_t> device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +// --- Classification ------------------------------------------------------- + +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0) { + out = 
ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +inline std::vector<Component> estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector<Component> out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +inline std::vector<Device> enumerate_gpu_devices() { + std::vector<Device> out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- 
Core algorithm ------------------------------------------------------- + +// Peak per device = MAX of any single component's footprint on that device, +// because free_params_immediately frees params between phases so components +// time-share VRAM. +inline int64_t gpu_peak(int gpu_idx, + const std::vector<Placement>& pl, + const std::vector<int>& dev, + const std::vector<Component>& components) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + if (dev[i] != gpu_idx) continue; + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + footprint = components[i].params_bytes + components[i].compute_bytes; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector<Component>& components, + const std::vector<Device>& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + + std::vector<int64_t> cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max<int64_t>(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + auto build_options = [&](const Component& c) { + std::vector<OptionSlot> opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector<std::vector<OptionSlot>> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector<Placement>& pl, const std::vector<int>& dev) { + int64_t s = 0; + std::set<int> gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if 
(pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector<size_t> idx(nC, 0); + std::vector<Placement> best_pl; + std::vector<int> best_dev; + int64_t best_score = std::numeric_limits<int64_t>::min(); + bool found_any = false; + + while (true) { + std::vector<Placement> pl(nC); + std::vector<int> dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. + if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + 
d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + } + return plan; +} + +inline const char* placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector<Component>& components, + const std::vector<Device>& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> 
GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if (d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/model.h b/src/model.h index 65bc6c367..10aaf8512 100644 --- a/src/model.h +++ b/src/model.h @@ -193,6 +193,8 @@ using TensorTypeRules = std::vector>; TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules); +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 88102ff61..dfe2a8873 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1,5 +1,6 @@ #include "ggml_extend.hpp" +#include "backend_fit.hpp" #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -108,10 +109,23 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: - ggml_backend_t backend = nullptr; // general backend + ggml_backend_t backend = nullptr; // general / main backend ggml_backend_t clip_backend = nullptr; 
ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; + + // Auto-fit decisions resolved into device-name strings. When non-empty, + // these win over the user-provided sd_ctx_params->*_backend_device. + // When empty, the explicit param (or `backend` fallback) is used. + std::string fit_diffusion_device; + std::string fit_clip_device; + std::string fit_vae_device; + // Per-component offload-params override coming from auto-fit. Forces + // offload_params_to_cpu for that component even when global flag is off. + bool fit_dit_offload_params = false; + bool fit_cond_offload_params = false; + bool fit_vae_offload_params = false; SDVersion version; bool vae_decode_only = false; @@ -168,11 +182,23 @@ class StableDiffusionGGML { if (vae_backend != backend) { ggml_backend_free(vae_backend); } + if (diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } ggml_backend_free(backend); } - void init_backend() { - backend = sd_get_default_backend(); + void init_backend(const char* main_device_name) { + if (main_device_name != nullptr && main_device_name[0] != '\0') { + backend = init_named_backend(main_device_name); + if (backend == nullptr) { + LOG_WARN("main backend device '%s' init failed; falling back to default", + main_device_name); + } + } + if (backend == nullptr) { + backend = sd_get_default_backend(); + } } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -202,7 +228,7 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + init_backend(sd_ctx_params->main_backend_device); ModelLoader model_loader; @@ -328,6 +354,75 @@ class StableDiffusionGGML { return oss.str(); }; + if (sd_ctx_params->auto_fit) { + backend_fit::ComputeReserves reserves; + if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) { + reserves.dit_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB; + } + if 
(sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) { + reserves.vae_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) { + reserves.conditioner_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB; + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Find the CPU device's ggml name (so we can route "CPU" + // placements through init_named_backend uniformly). 
+ std::string cpu_device_name; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + cpu_device_name = ggml_backend_dev_name(dev); + break; + } + } + auto resolve = [&](const backend_fit::Decision* d, std::string& out_device, + bool& out_offload) { + if (d == nullptr) { + out_device.clear(); + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::CPU) { + out_device = cpu_device_name; + out_offload = false; + return; + } + for (const auto& dev : devices) { + if (dev.id == d->device_id) { + out_device = dev.name; + break; + } + } + out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); + }; + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), + fit_diffusion_device, fit_dit_offload_params); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), + fit_vae_device, fit_vae_offload_params); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), + fit_clip_device, fit_cond_offload_params); + + // CPU placements: leave fit_*_device empty AND remember they're + // CPU so the resolver below picks ggml_backend_cpu_init(). + } + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -373,19 +468,57 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + // If auto-fit decided ANY component must offload params, force the + // global flag on. This is a coarsening: one component needing offload + // forces all to offload (safer, just slower for non-offload ones). 
+ if (fit_dit_offload_params || fit_cond_offload_params || fit_vae_offload_params) { + if (!offload_params_to_cpu) { + LOG_INFO("auto-fit: enabling offload_params_to_cpu (one or more " + "components don't fit without param streaming)"); + offload_params_to_cpu = true; + } + } + + // Pick the effective device name for each component: the auto-fit + // override (if any) wins; otherwise the user-provided string; nullptr + // falls back to `backend` (the main). + auto effective_device = [&](const std::string& fit_str, const char* user_str) -> const char* { + if (!fit_str.empty()) return fit_str.c_str(); + return user_str; + }; + const char* diffusion_dev_name = effective_device(fit_diffusion_device, + sd_ctx_params->diffusion_backend_device); + const char* clip_dev_name = effective_device(fit_clip_device, + sd_ctx_params->clip_backend_device); + const char* vae_dev_name = effective_device(fit_vae_device, + sd_ctx_params->vae_backend_device); + + // Helper: init a named backend if name is non-null/non-empty, + // returns nullptr on missing/failed name (caller falls back to main). 
+ auto init_named_or_null = [](const char* name) -> ggml_backend_t { + if (name == nullptr || name[0] == '\0') return nullptr; + return init_named_backend(name); + }; + + diffusion_backend = init_named_or_null(diffusion_dev_name); + if (!diffusion_backend) { + diffusion_backend = backend; + } else { + LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); + } { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + clip_backend = init_named_or_null(clip_dev_name); + if (!clip_backend) { + clip_backend = backend; + } else { + LOG_INFO("CLIP: using device %s", clip_dev_name); } if (sd_version_is_sd3(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); } else if (sd_version_is_flux(version)) { @@ -423,7 +556,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, @@ -434,7 +567,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, @@ -446,13 +579,13 @@ class StableDiffusionGGML { true, 0, true); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, 
tensor_storage_map, "model.high_noise_diffusion_model", @@ -478,7 +611,7 @@ class StableDiffusionGGML { version, "", enable_vision); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", @@ -488,7 +621,7 @@ class StableDiffusionGGML { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -497,7 +630,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", @@ -507,7 +640,7 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -530,7 +663,7 @@ class StableDiffusionGGML { embbeding_map, version); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -555,11 +688,13 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { + if (vae_dev_name != nullptr && vae_dev_name[0] != '\0') { + vae_backend = init_named_backend(vae_dev_name); + } + if (!vae_backend) { vae_backend = backend; + } else { + LOG_INFO("VAE: using device %s", vae_dev_name); } auto create_tae = [&]() -> std::shared_ptr { @@ -648,11 +783,14 @@ class StableDiffusionGGML { if 
(strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { + const char* cn_dev_name = sd_ctx_params->control_net_backend_device; + if (cn_dev_name != nullptr && cn_dev_name[0] != '\0') { + controlnet_backend = init_named_backend(cn_dev_name); + } + if (!controlnet_backend) { controlnet_backend = backend; + } else { + LOG_INFO("ControlNet: using device %s", cn_dev_name); } control_net = std::make_shared(controlnet_backend, offload_params_to_cpu, @@ -2142,16 +2280,29 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->prediction = PREDICTION_COUNT; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; - sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; - sd_ctx_params->diffusion_flash_attn = false; - sd_ctx_params->circular_x = false; - sd_ctx_params->circular_y = false; - sd_ctx_params->chroma_use_dit_mask = true; - sd_ctx_params->chroma_use_t5_mask = false; - sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->enable_mmap = false; + sd_ctx_params->main_backend_device = nullptr; + sd_ctx_params->diffusion_backend_device = nullptr; + sd_ctx_params->clip_backend_device = nullptr; + sd_ctx_params->vae_backend_device = nullptr; + sd_ctx_params->control_net_backend_device = nullptr; + sd_ctx_params->tae_backend_device = nullptr; + sd_ctx_params->upscaler_backend_device = nullptr; + sd_ctx_params->photomaker_backend_device = nullptr; + sd_ctx_params->vision_backend_device = nullptr; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->circular_x = false; + sd_ctx_params->circular_y = false; + sd_ctx_params->chroma_use_dit_mask = true; + 
sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_multi_gpu = true; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2183,9 +2334,22 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" + "main_backend_device: %s\n" + "diffusion_backend_device: %s\n" + "clip_backend_device: %s\n" + "vae_backend_device: %s\n" + "control_net_backend_device: %s\n" + "tae_backend_device: %s\n" + "upscaler_backend_device: %s\n" + "photomaker_backend_device: %s\n" + "vision_backend_device: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve_dit_mb: %d\n" + "auto_fit_compute_reserve_vae_mb: %d\n" + "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_multi_gpu: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2215,9 +2379,22 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + SAFE_STR(sd_ctx_params->main_backend_device), + SAFE_STR(sd_ctx_params->diffusion_backend_device), + SAFE_STR(sd_ctx_params->clip_backend_device), + SAFE_STR(sd_ctx_params->vae_backend_device), + SAFE_STR(sd_ctx_params->control_net_backend_device), + SAFE_STR(sd_ctx_params->tae_backend_device), + 
SAFE_STR(sd_ctx_params->upscaler_backend_device), + SAFE_STR(sd_ctx_params->photomaker_backend_device), + SAFE_STR(sd_ctx_params->vision_backend_device), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + sd_ctx_params->auto_fit_compute_reserve_dit_mb, + sd_ctx_params->auto_fit_compute_reserve_vae_mb, + sd_ctx_params->auto_fit_compute_reserve_cond_mb, + BOOL_STR(sd_ctx_params->auto_multi_gpu), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/version.cpp b/src/version.cpp index 97dc8426b..6c266153c 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,3 +1,6 @@ +#include + +#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -18,3 +21,12 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +} From 717c79ae738d5bdc920e129e28ac97280ef86ce2 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 16:13:51 +0200 Subject: [PATCH 2/9] wip: layer-split + lazy load (RAM regression) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds runner-level multi-backend (sched-based) layer-split, per-tensor buft callback alloc, GPU_LAYER_SPLIT placement in backend_fit, and auto-fit lazy load when init-time SUM exceeds device cap. Wires the LTX-2 DiT and Conditioner LLM through the new path. Known issue: system RAM OOM-kill during DiT lazy load even with --mmap. 
Per-thread staging buffers in ModelLoader::load_tensors hold a copy of each tensor as it streams from mmap to GPU; with 8 threads × ~600 MB each + cumulative mmap'd page cache, peak RSS exceeds 16 GB. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 129 ++++++++++++- src/conditioner.hpp | 5 + src/diffusion_model.hpp | 4 + src/ggml_extend.hpp | 337 +++++++++++++++++++++++++++++++- src/stable-diffusion.cpp | 402 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 847 insertions(+), 30 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 52254f0e8..7ca789a0b 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -15,9 +15,11 @@ // Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that // support streaming params from RAM at compute time). +#include #include #include #include +#include #include #include #include @@ -42,7 +44,8 @@ enum class ComponentKind { enum class Placement { CPU, GPU, - GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries }; struct Component { @@ -69,6 +72,13 @@ struct Decision { int device_id = DEVICE_ID_CPU; int64_t on_device_bytes = 0; int64_t on_host_bytes = 0; + + // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs + // that share this component (in order) and each device's estimated share + // of the params. The order also defines block-range partitioning: the + // i-th device gets a contiguous range of blocks proportional to share[i]. 
+ std::vector split_device_ids; + std::vector split_share_bytes; }; struct Plan { @@ -105,7 +115,13 @@ inline bool classify_tensor(const std::string& name, ComponentKind& out) { contains("cond_stage_model") || contains("te.text_model.") || contains("conditioner") || - name.rfind("text_encoder.", 0) == 0) { + name.rfind("text_encoder.", 0) == 0 || + // Connector / text projection layers that run on the conditioner + // backend (e.g. LTX-2's text_embedding_projection: video/audio + // aggregate embeds + projection that map LLM hidden states into + // DiT-input space). + name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { out = ComponentKind::CONDITIONER; return true; } @@ -181,19 +197,58 @@ inline std::vector enumerate_gpu_devices() { // --- Core algorithm ------------------------------------------------------- +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs) { + std::vector out(gpu_idxs.size(), 0); + int64_t total_free = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + total_free += std::max(0, devices[gpu_idxs[k]].free_bytes); + } + if (total_free <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(std::max(0, devices[gpu_idxs[k]].free_bytes)) / double(total_free); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + // Peak per device = MAX of any single component's footprint on that device, // because free_params_immediately frees params between phases so components // time-share VRAM. 
inline int64_t gpu_peak(int gpu_idx, const std::vector& pl, const std::vector& dev, - const std::vector& components) { + const std::vector& components, + const std::vector& devices = {}) { int64_t peak = 0; for (size_t i = 0; i < components.size(); i++) { - if (dev[i] != gpu_idx) continue; int64_t footprint = 0; if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. + const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. + int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; } peak = std::max(peak, footprint); } @@ -217,6 +272,13 @@ inline Plan compute_plan(const std::vector& components, int device_idx; }; + // Layer-split is only meaningful for components made up of many similarly + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. + auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + auto build_options = [&](const Component& c) { std::vector opts; for (size_t g = 0; g < nG; g++) { @@ -225,6 +287,15 @@ inline Plan compute_plan(const std::vector& components, opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); } } + // Layer-split: enumerate non-trivial subsets of GPUs (size >= 2). 
+ // Encode the participating set as a bitmask in device_idx. + if (allow_multi_gpu && nG >= 2 && supports_layer_split(c.kind)) { + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (__builtin_popcount(mask) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } opts.push_back({Placement::CPU, -1}); return opts; }; @@ -255,6 +326,13 @@ inline Plan compute_plan(const std::vector& components, } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { s += 5 * pw; gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). + s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } } else { s -= 10 * pw; } @@ -292,7 +370,7 @@ inline Plan compute_plan(const std::vector& components, if (ok) { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -304,7 +382,7 @@ inline Plan compute_plan(const std::vector& components, } else { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -340,6 +418,33 @@ inline Plan compute_plan(const std::vector& components, d.device_id = DEVICE_ID_CPU; d.on_host_bytes = c.params_bytes + c.compute_bytes; plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); 
+ // Sort participating GPUs by descending share so the LARGEST-share + // GPU is listed first. Sub-runners that don't get the layer-split + // spec (e.g. the LTX-2 text projection) follow the "main" backend + // (= first in this list) — putting the biggest one first keeps + // them on the GPU with most headroom. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return shares[a] > shares[b]; }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; } else { d.device_id = devices[best_dev[i]].id; if (best_pl[i] == Placement::GPU) { @@ -355,7 +460,7 @@ inline Plan compute_plan(const std::vector& components, } for (size_t g = 0; g < nG; g++) { - plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); } return plan; } @@ -365,6 +470,7 @@ inline const char* placement_str(Placement p) { case Placement::CPU: return "CPU"; case Placement::GPU: return "GPU"; case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; } return "?"; } @@ -400,6 +506,15 @@ inline void print_plan(const Plan& plan, LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", d.name.c_str(), d.device_id, (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT) { + std::string ids; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + 
std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s", + d.name.c_str(), ids.c_str()); } else { LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", d.name.c_str(), d.device_id, diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 9f4d45524..99e27ae39 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -87,6 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} + // Defer the LLM sub-runner's params alloc + read until first compute(). + // Only conditioners with a heavy LLM (e.g. LTX-2 Gemma) override this; + // others ignore the call. The callback is invoked AFTER the runner's + // alloc_params_buffer succeeds and is responsible for tensor data load. + virtual void set_llm_lazy_load(std::function /*fn*/) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index c0a2a11c0..d7ea6ede7 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -50,6 +50,10 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; + // Defer params alloc + tensor data load until the first compute() call. + // Default: no-op. Subclasses backed by a single GGMLRunner forward to + // its set_lazy_load. 
+ virtual void set_lazy_load(std::function /*fn*/) {} }; struct UNetModel : public DiffusionModel { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 8b748194f..cd1662523 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1705,6 +1705,42 @@ struct GGMLRunnerContext { std::shared_ptr weight_adapter = nullptr; }; +// --------------------------------------------------------------------------- +// Multi-backend (layer-split) support +// --------------------------------------------------------------------------- +// A GGMLRunner can opt into "layer-split" mode where each weight tensor lives +// entirely on one of several backends, picked by a caller-supplied callback +// (typically based on the tensor name's block index). The runner switches +// from gallocr to ggml_backend_sched for graph compute, so cross-backend +// edges are routed automatically. +// +// This is the llama.cpp LLAMA_SPLIT_MODE_LAYER analogue. There is no +// intra-tensor row split, so every tensor lives on a single normal device +// buffer — views work without any ggml-cuda patch. +// +// To enable: populate g_pending_multi_backend_spec() with the additional +// backends + tensor->backend callback, then construct the GGMLRunner. The +// ctor consumes and clears the pending pointer. +struct MultiBackendSpec { + // Extra backends *in addition to* the runner's main runtime_backend. + // The first entry's role is the main backend; we don't list it here. + std::vector additional_backends; + + // Maps a weight tensor name to one of the runner's backends (the main + // runtime_backend, or one of additional_backends). Returning nullptr + // means "use the main runtime_backend". + std::function tensor_backend_fn; + + // Optional CPU backend appended last to the sched for unsupported-op + // fallback. May be nullptr. 
+ ggml_backend_t cpu_fallback = nullptr; +}; + +__STATIC_INLINE__ MultiBackendSpec*& g_pending_multi_backend_spec() { + thread_local MultiBackendSpec* spec = nullptr; + return spec; +} + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1712,6 +1748,25 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; + // --- multi-backend (layer-split) state --- + bool multi_backend_mode = false; + std::vector additional_backends; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; + bool sched_reserved = false; + // Per-backend params buffers when multi_backend_mode is on. + // params_buffer (single-backend) stays nullptr in this mode. + std::vector multi_params_buffers; + + // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual + // alloc + tensor-data load is deferred until the first compute(). The + // callback is invoked AFTER do_alloc_params_buffer succeeds and is + // responsible for populating tensor->data via ModelLoader. Used to keep + // peak VRAM per-component-MAX rather than sum-of-components at init. + std::function lazy_load_fn = nullptr; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; ggml_context* offload_ctx = nullptr; @@ -1859,7 +1914,56 @@ struct GGMLRunner { return gf; } + // Build the multi-backend sched (lazily). + bool ensure_sched() { + if (sched != nullptr) return true; + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + // ggml_backend_sched_new asserts the last backend is a CPU; create + // a CPU fallback if the caller didn't provide one. We own this + // instance and free it in the dtor below. 
+ if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = ggml_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + sched = ggml_backend_sched_new(backends.data(), + /*bufts=*/nullptr, + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + bool alloc_compute_buffer(get_graph_cb_t get_graph) { + if (multi_backend_mode) { + if (sched_reserved) return true; + if (!ensure_sched()) return false; + reset_compute_ctx(); + ggml_cgraph* gf = get_compute_graph(get_graph); + backend_tensor_data_map.clear(); + if (!ggml_backend_sched_reserve(sched, gf)) { + LOG_ERROR("%s: sched reserve failed", get_desc().c_str()); + return false; + } + sched_reserved = true; + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) { + ggml_backend_t b = ggml_backend_sched_get_backend(sched, i); + size_t s = ggml_backend_sched_get_buffer_size(sched, b); + LOG_DEBUG("%s sched buf[%d] %s = %.2f MB", + get_desc().c_str(), i, ggml_backend_name(b), + s / (1024.f * 1024.f)); + } + return true; + } + if (compute_allocr != nullptr) { return true; } @@ -2018,6 +2122,22 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { + // Consume any pending multi-backend (layer-split) spec set by the + // caller via g_pending_multi_backend_spec(). 
+ MultiBackendSpec* pending = g_pending_multi_backend_spec(); + if (pending != nullptr) { + g_pending_multi_backend_spec() = nullptr; + multi_backend_mode = true; + additional_backends = pending->additional_backends; + tensor_backend_fn = pending->tensor_backend_fn; + cpu_fallback_backend = pending->cpu_fallback; + if (offload_params_to_cpu) { + LOG_WARN("multi-backend layer-split is incompatible with " + "offload_params_to_cpu; ignoring offload"); + offload_params_to_cpu = false; + } + } + alloc_params_ctx(); if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { params_backend = ggml_backend_cpu_init(); @@ -2035,6 +2155,16 @@ struct GGMLRunner { ggml_backend_free(params_backend); } free_cache_ctx_and_buffer(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } + // additional_backends are owned by the caller (see the MultiBackendSpec + // setup site in stable-diffusion.cpp); not freed here. } virtual GGMLRunnerContext get_context() { @@ -2054,7 +2184,102 @@ struct GGMLRunner { alloc_compute_ctx(); } - bool alloc_params_buffer() { + // Multi-backend params allocation: walk params_ctx, classify each tensor + // via tensor_backend_fn, allocate one buffer per backend on its default + // buft, bind tensors via ggml_tallocr. + bool alloc_params_buffer_layer_split() { + // Build the backend list (main first, then additional). Index 0 is + // the default for tensors whose callback returns nullptr. 
+ std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + + std::vector bufts; + bufts.reserve(backends.size()); + std::vector aligns(backends.size()); + std::vector sizes(backends.size(), 0); + std::vector counts(backends.size(), 0); + for (size_t i = 0; i < backends.size(); i++) { + bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); + aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + } + + // First pass: assign each tensor to a backend, accumulate sizes. + std::map tensor_backend_idx; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + int idx = 0; + if (tensor_backend_fn) { + ggml_backend_t target = tensor_backend_fn(t->name); + if (target != nullptr) { + for (size_t i = 0; i < backends.size(); i++) { + if (backends[i] == target) { + idx = int(i); + break; + } + } + } + } + tensor_backend_idx[t] = idx; + size_t s = ggml_backend_buft_get_alloc_size(bufts[idx], t); + sizes[idx] += GGML_PAD(s, aligns[idx]); + counts[idx] += 1; + } + + // Allocate one buffer per used backend. + multi_params_buffers.assign(backends.size(), nullptr); + for (size_t i = 0; i < backends.size(); i++) { + if (sizes[i] == 0) continue; + multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); + if (multi_params_buffers[i] == nullptr) { + LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f)); + return false; + } + } + + // Bind tensors via ggml_tallocr. 
+ std::vector tallocs(backends.size()); + for (size_t i = 0; i < backends.size(); i++) { + if (multi_params_buffers[i] != nullptr) { + tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]); + } + } + for (auto& kv : tensor_backend_idx) { + ggml_status st = ggml_tallocr_alloc(&tallocs[kv.second], kv.first); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s tallocr_alloc failed for tensor %s", + get_desc().c_str(), kv.first->name); + return false; + } + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + } + + // Log the breakdown. + for (size_t i = 0; i < backends.size(); i++) { + if (counts[i] == 0) continue; + LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), + counts[i]); + } + return true; + } + + // Internal: always materializes the params buffer. Used by both the + // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` + // path; the latter must bypass the lazy-skip. + bool do_alloc_params_buffer() { + if (multi_backend_mode) { + return alloc_params_buffer_layer_split(); + } size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); if (params_buffer == nullptr) { @@ -2072,18 +2297,66 @@ struct GGMLRunner { return true; } + bool alloc_params_buffer() { + // Lazy mode: skip alloc until first compute() (via ensure_params_loaded). + // The caller still goes through alloc_params_buffer + get_param_tensors + // at init; ModelLoader::load_tensors will silently skip this runner's + // tensors (their data ptrs are null because no buffer is allocated yet) + // and the lazy_load_fn callback re-loads them on demand. 
+ if (lazy_load_fn) return true; + return do_alloc_params_buffer(); + } + + void set_lazy_load(std::function fn) { + lazy_load_fn = std::move(fn); + } + + bool ensure_params_loaded() { + if (params_buffer != nullptr || !multi_params_buffers.empty()) { + return true; + } + if (!lazy_load_fn) { + LOG_ERROR("%s: no params buffer and no lazy_load_fn", get_desc().c_str()); + return false; + } + int64_t t0 = ggml_time_ms(); + if (!do_alloc_params_buffer()) return false; + if (!lazy_load_fn()) { + LOG_ERROR("%s: lazy load callback failed", get_desc().c_str()); + return false; + } + int64_t t1 = ggml_time_ms(); + LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + return true; + } + void free_params_buffer() { if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } } size_t get_params_buffer_size() { + size_t total = 0; if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); + total += ggml_backend_buffer_get_size(params_buffer); } - return 0; + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); + } + return total; } void free_cache_ctx_and_buffer() { @@ -2096,11 +2369,23 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset rather than free: keeping the sched alive across compute() + // calls of a sampling loop avoids the per-step rebuild cost. 
+ ggml_backend_sched_reset(sched); + sched_reserved = false; + } offload_params_to_params_backend(); } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so + // it gets a backend assignment (otherwise tensors with no producers + // and no consumers leave sched at backend_id=-1). + if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -2160,6 +2445,9 @@ struct GGMLRunner { int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { + if (!ensure_params_loaded()) { + return std::nullopt; + } if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return std::nullopt; @@ -2168,18 +2456,41 @@ struct GGMLRunner { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); return std::nullopt; } - reset_compute_ctx(); - ggml_cgraph* gf = get_compute_graph(get_graph); - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { - LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return std::nullopt; + ggml_cgraph* gf = nullptr; + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc graph failed", get_desc().c_str()); + return std::nullopt; + } + } else { + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); + return std::nullopt; + } } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend && + ggml_backend_is_cpu(cpu_fallback_backend)) { + ggml_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } - 
ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); return std::nullopt; @@ -2259,6 +2570,14 @@ class GGMLBlock { prefix = prefix + "."; } init_params(ctx, tensor_storage_map, prefix); + // Tag each param tensor with its full (prefix-qualified) name so the + // multi-backend runner's tensor_backend_fn callback can route it. + // Without this, init_params leaves tensors with empty t->name. + for (auto& pair : params) { + if (pair.second != nullptr) { + ggml_set_name(pair.second, (prefix + pair.first).c_str()); + } + } init_blocks(ctx, tensor_storage_map, prefix); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index dfe2a8873..356038146 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -127,6 +127,26 @@ class StableDiffusionGGML { bool fit_cond_offload_params = false; bool fit_vae_offload_params = false; + // Layer-split state (when auto-fit picks GPU_LAYER_SPLIT). Holds the + // ordered list of device names and per-device share bytes; the actual + // backend handles are init'd at construction time and stored in + // *_extra_backends so the destructor can free them. + std::vector fit_dit_split_device_names; + std::vector fit_dit_split_share_bytes; + std::vector fit_dit_extra_backends; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + std::vector fit_cond_extra_backends; + + // Owned model loader: kept alive across init() so lazy_load callbacks + // can re-read tensor data from disk on demand. Only set when at least + // one component is configured for lazy load. 
+ std::unique_ptr owned_model_loader; + // Auto-fit decided init-time SUM exceeds device cap; defer cond + DiT + // allocation until first compute() so peaks don't pile up. + bool auto_lazy_load = false; + bool enable_mmap_member = false; + SDVersion version; bool vae_decode_only = false; bool external_vae_is_invalid = false; @@ -185,6 +205,18 @@ class StableDiffusionGGML { if (diffusion_backend != backend) { ggml_backend_free(diffusion_backend); } + for (auto* b : fit_dit_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } + for (auto* b : fit_cond_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } ggml_backend_free(backend); } @@ -230,7 +262,12 @@ class StableDiffusionGGML { init_backend(sd_ctx_params->main_backend_device); - ModelLoader model_loader; + // Use a stack-local handle that points into `owned_model_loader` if we + // need lazy callbacks (decided after auto-fit), otherwise a temp local + // is fine. Defer the unique_ptr decision; for now always own it so the + // pointer is stable even if lazy load is enabled later in this init(). 
+ owned_model_loader = std::make_unique(); + ModelLoader& model_loader = *owned_model_loader; if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -392,8 +429,19 @@ class StableDiffusionGGML { break; } } - auto resolve = [&](const backend_fit::Decision* d, std::string& out_device, - bool& out_offload) { + auto device_id_to_name = [&](int dev_id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == dev_id) return dev.name; + } + return {}; + }; + auto resolve = [&](const backend_fit::Decision* d, + std::string& out_device, + bool& out_offload, + std::vector& out_split_devices, + std::vector& out_split_shares) { + out_split_devices.clear(); + out_split_shares.clear(); if (d == nullptr) { out_device.clear(); out_offload = false; @@ -404,23 +452,67 @@ class StableDiffusionGGML { out_offload = false; return; } - for (const auto& dev : devices) { - if (dev.id == d->device_id) { - out_device = dev.name; - break; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + // Primary device drives main_backend choice for the model; + // the rest become additional backends in the spec. 
+ for (size_t k = 0; k < d->split_device_ids.size(); k++) { + out_split_devices.push_back(device_id_to_name(d->split_device_ids[k])); + out_split_shares.push_back(d->split_share_bytes[k]); } + if (!out_split_devices.empty()) out_device = out_split_devices[0]; + out_offload = false; + return; } + out_device = device_id_to_name(d->device_id); out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); }; + std::vector dummy_devs; + std::vector dummy_shares; resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), - fit_diffusion_device, fit_dit_offload_params); + fit_diffusion_device, fit_dit_offload_params, + fit_dit_split_device_names, fit_dit_split_share_bytes); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), - fit_vae_device, fit_vae_offload_params); + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), - fit_clip_device, fit_cond_offload_params); + fit_clip_device, fit_cond_offload_params, + fit_cond_split_device_names, fit_cond_split_share_bytes); // CPU placements: leave fit_*_device empty AND remember they're // CPU so the resolver below picks ggml_backend_cpu_init(). + + // Decide auto-lazy-load: if the per-component MAX-based plan fits + // but the SUM-of-components on any device would exceed cap, defer + // alloc until first compute() so peaks don't pile up. Heuristic: + // sum the per-device on_device_bytes across all GPU decisions + // (excluding VAE which is small) and compare to free_bytes. 
+ std::map sum_per_device; + auto add_sum = [&](const backend_fit::Decision* d) { + if (!d) return; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; + } + } else if (d->placement == backend_fit::Placement::GPU || + d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + sum_per_device[d->device_id] += d->on_device_bytes; + } + }; + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER)); + for (const auto& dev : devices) { + int64_t cap = dev.free_bytes - margin_bytes; + int64_t sum = sum_per_device.count(dev.id) ? sum_per_device[dev.id] : 0; + if (sum > cap) { + LOG_INFO("auto-fit: enabling lazy load (init-time SUM %lld MiB on %s " + "exceeds cap %lld MiB; per-component MAX plan needs lazy alloc)", + (long long)(sum / backend_fit::MiB), + dev.name.c_str(), + (long long)(cap / backend_fit::MiB)); + auto_lazy_load = true; + break; + } + } } LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); @@ -493,6 +585,145 @@ class StableDiffusionGGML { const char* vae_dev_name = effective_device(fit_vae_device, sd_ctx_params->vae_backend_device); + // Build the layer-split MultiBackendSpec for a component. Only used + // when auto-fit picked GPU_LAYER_SPLIT for this component. 
+ // - main_backend: the runner's primary backend (also first in the spec) + // - extra_device_names: additional device names to span + // - share_bytes: per-device share (for proportional block partition) + // - tensor_prefix: the model's weight name prefix (e.g., + // "model.diffusion_model.") — used to locate block-indexed tensors + // Returns true if a spec was prepared and pending_spec_storage was + // populated; the caller must set g_pending_multi_backend_spec() + // immediately before constructing the model. + auto prepare_layer_split_spec = [&](ggml_backend_t main_backend, + const std::vector& extra_device_names, + const std::vector& share_bytes, + const std::string& tensor_prefix, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (extra_device_names.size() < 2) return false; // only [main] -> single GPU + // Init the additional backends (skip [0] which is main_backend). + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < extra_device_names.size(); k++) { + ggml_backend_t b = init_named_backend(extra_device_names[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init extra backend %s; falling back to single backend", + extra_device_names[k].c_str()); + return false; + } + out_extra_backends.push_back(b); + all_backends.push_back(b); + } + + // Walk tensor_storage_map to get per-block byte sizes and the + // total non-block bytes that will land on backend[0]. Then + // greedy-partition blocks by byte budget to balance per-backend + // bytes (accounting for non-block fixed load on backend[0]). 
+ int max_block_idx = -1; + static const std::regex block_re( + R"((?:transformer_blocks|joint_blocks|double_blocks|single_blocks|blocks|layers)\.([0-9]+)\.)"); + std::map block_bytes; // block idx -> bytes + int64_t non_block_bytes = 0; + for (const auto& kv : tensor_storage_map) { + if (!tensor_prefix.empty() && kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + std::smatch m; + if (std::regex_search(kv.first, m, block_re)) { + int idx = std::stoi(m[1]); + if (idx > max_block_idx) max_block_idx = idx; + block_bytes[idx] += bytes; + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no blocks found under prefix '%s'; aborting split", + tensor_prefix.c_str()); + return false; + } + const int n_blocks = max_block_idx + 1; + + // Build per-backend byte budgets from share_bytes (ratios). The + // first backend absorbs `non_block_bytes` as a fixed load, so we + // SHRINK its remaining budget for blocks accordingly. + int64_t total_share = 0; + for (auto s : share_bytes) total_share += s; + int64_t total_block_bytes = 0; + for (const auto& kv : block_bytes) total_block_bytes += kv.second; + std::vector backend_block_budgets(share_bytes.size(), 0); + for (size_t k = 0; k < share_bytes.size(); k++) { + int64_t share = int64_t(double(total_block_bytes + non_block_bytes) * + double(share_bytes[k]) / double(total_share)); + if (k == 0) share = std::max(share - non_block_bytes, 0); + backend_block_budgets[k] = share; + } + // Greedy assign each block (in order) to the current backend + // until its budget is filled, then move to the next. 
+ std::vector boundaries(share_bytes.size(), 0); + size_t cur_backend = 0; + int64_t cur_used = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur_backend + 1 < share_bytes.size() && + cur_used + bb > backend_block_budgets[cur_backend] && + cur_used > 0) { + boundaries[cur_backend] = b; + cur_backend++; + cur_used = 0; + } + cur_used += bb; + } + // The remaining backends get the rest, terminating at n_blocks. + for (size_t k = cur_backend; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + // Safety: ensure each backend has at least one block. + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) boundaries[k] = std::min(min_bound, n_blocks); + } + std::string boundary_log = "layer-split [" + tensor_prefix + "] " + + std::to_string(n_blocks) + " blocks: "; + int prev = 0; + for (size_t k = 0; k < all_backends.size() && k < boundaries.size(); k++) { + if (k > 0) boundary_log += ", "; + boundary_log += std::string(ggml_backend_name(all_backends[k])) + "=[" + + std::to_string(prev) + ".." + std::to_string(boundaries[k]) + ")"; + prev = boundaries[k]; + } + LOG_INFO("%s", boundary_log.c_str()); + + // Build the tensor_backend_fn closure. 
+ std::vector backends_capture = all_backends; + std::vector boundaries_capture = boundaries; + std::string prefix_capture = tensor_prefix; + out_spec.tensor_backend_fn = + [backends_capture, boundaries_capture, prefix_capture](const std::string& name) -> ggml_backend_t { + if (!prefix_capture.empty() && + name.compare(0, prefix_capture.size(), prefix_capture) != 0) { + return backends_capture[0]; + } + std::smatch m; + if (!std::regex_search(name, m, block_re)) { + return backends_capture[0]; + } + int idx = std::stoi(m[1]); + for (size_t k = 0; k < boundaries_capture.size(); k++) { + if (idx < boundaries_capture[k]) { + return backends_capture[std::min(k, backends_capture.size() - 1)]; + } + } + return backends_capture.back(); + }; + // Spec contains the additional backends only (main is implicit). + out_spec.additional_backends.assign(out_extra_backends.begin(), out_extra_backends.end()); + out_spec.cpu_fallback = nullptr; + return true; + }; + // Helper: init a named backend if name is non-null/non-empty, // returns nullptr on missing/failed name (caller falls back to main). auto init_named_or_null = [](const char* name) -> ggml_backend_t { @@ -507,6 +738,59 @@ class StableDiffusionGGML { LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); } + // Tensor name sets for components that are configured for lazy load. + // Populated below right before/after the cond + DiT construction; + // consumed by the bulk-load step's ignore_tensors. + std::set cond_lazy_tensor_names; + std::set dit_lazy_tensor_names; + + // Build the layer-split MultiBackendSpec for DiT (when auto-fit picked + // GPU_LAYER_SPLIT). The spec is consumed by the diffusion_model's + // GGMLRunner ctor when we set g_pending_multi_backend_spec() to it. 
+ MultiBackendSpec dit_spec; + bool dit_spec_active = false; + if (!fit_dit_split_device_names.empty()) { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } + // Lambda to set the pending spec immediately before constructing the + // diffusion model. Caller must invoke this on the same line / right + // before the std::make_shared<...Model>(diffusion_backend, ...) call. + auto prime_dit_spec = [&]() { + if (dit_spec_active) { + g_pending_multi_backend_spec() = &dit_spec; + } + }; + + // Same dance for the conditioner. The conditioner uses clip_backend as + // its main backend; we need to set up the spec BEFORE the cond_stage + // ctor runs (which is BEFORE the DiT ctor). Each cond model wraps one + // or more sub-runners; the spec's tensor_backend_fn handles all of + // them since it's keyed on tensor names with a generic block regex. + // (Some conditioners construct multiple sub-runners — only the FIRST + // ggml runner ctor consumes the pending spec, so we re-prime between + // sub-runners' allocs by leaving cond_spec_active true; the runner's + // multi_backend_mode is per-runner.) + // For LTX-2 specifically: LTXAVEmbedder constructs LLMRunner first + // (consumes spec), then LTXAVTextProjectionRunner (no spec consumed). + // The LLM has block-named tensors so layer-split applies; the + // projector has only 4 tensors and they should ride along on its + // single backend (clip_backend = main). Auto-fit's cond share counts + // both, so the share is over-counted on backend[0] for the projector. + // Acceptable for now — small correction. 
+ ggml_backend_t clip_main_backend_for_spec = nullptr; // resolved below + MultiBackendSpec cond_spec; + bool cond_spec_active = false; + auto prime_cond_spec = [&]() { + if (cond_spec_active) { + g_pending_multi_backend_spec() = &cond_spec; + } + }; + { clip_backend = init_named_or_null(clip_dev_name); if (!clip_backend) { @@ -514,10 +798,22 @@ class StableDiffusionGGML { } else { LOG_INFO("CLIP: using device %s", clip_dev_name); } + // Now that clip_backend is resolved, build the conditioner's + // layer-split spec if auto-fit picked it. + if (!fit_cond_split_device_names.empty()) { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", // covers text_encoders.llm.* and text_encoders.t5xxl.* + fit_cond_extra_backends, + cond_spec); + } if (sd_version_is_sd3(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); @@ -539,12 +835,14 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -552,10 +850,12 @@ class StableDiffusionGGML { "", false); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); } + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -563,28 +863,33 @@ class StableDiffusionGGML { sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = 
false; + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, true, 0, true); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { + prime_dit_spec(); high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -605,12 +910,14 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -618,28 +925,34 @@ class StableDiffusionGGML { version, sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { + prime_cond_spec(); cond_stage_model = 
std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -650,6 +963,7 @@ class StableDiffusionGGML { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -657,12 +971,14 @@ class StableDiffusionGGML { version, PM_VERSION_2); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } + prime_dit_spec(); diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, @@ -673,11 +989,61 @@ class StableDiffusionGGML { } } - cond_stage_model->alloc_params_buffer(); - cond_stage_model->get_param_tensors(tensors); + // Conditioner: publish its tensors to the global map, EXCEPT the + // ones that are about to be configured for lazy load (we want the + // bulk loader to skip them — they have no buffer yet). 
+ std::map cond_only_tensors; + cond_stage_model->get_param_tensors(cond_only_tensors); + std::map llm_lazy_map; + if (auto_lazy_load) { + for (const auto& kv : cond_only_tensors) { + if (kv.first.rfind("text_encoders.llm.", 0) == 0) { + llm_lazy_map[kv.first] = kv.second; + cond_lazy_tensor_names.insert(kv.first); + } + } + } + for (const auto& kv : cond_only_tensors) { + if (cond_lazy_tensor_names.find(kv.first) == cond_lazy_tensor_names.end()) { + tensors[kv.first] = kv.second; // eager — bulk loader will fill + } + } + if (auto_lazy_load && !llm_lazy_map.empty()) { + ModelLoader* loader_ptr = owned_model_loader.get(); + int n_threads_capture = sd_ctx_params->n_threads; + bool mmap_capture = sd_ctx_params->enable_mmap; + cond_stage_model->set_llm_lazy_load([=]() -> bool { + auto local_map = llm_lazy_map; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture); + }); + LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", + llm_lazy_map.size()); + } + cond_stage_model->alloc_params_buffer(); // no-op for the lazy LLM - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + std::map dit_only_tensors; + diffusion_model->get_param_tensors(dit_only_tensors); + if (auto_lazy_load) { + for (const auto& kv : dit_only_tensors) { + dit_lazy_tensor_names.insert(kv.first); + } + ModelLoader* loader_ptr = owned_model_loader.get(); + int n_threads_capture = sd_ctx_params->n_threads; + bool mmap_capture = sd_ctx_params->enable_mmap; + diffusion_model->set_lazy_load([=]() -> bool { + auto local_map = dit_only_tensors; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture); + }); + LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", + dit_only_tensors.size()); + } else { + for (const auto& kv : dit_only_tensors) { + tensors[kv.first] = kv.second; + } + } + 
diffusion_model->alloc_params_buffer(); // no-op when lazy_load_fn is set if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -892,6 +1258,14 @@ class StableDiffusionGGML { std::set ignore_tensors; tensors["alphas_cumprod"] = alphas_cumprod_tensor; + // Lazy-loaded components: skip them in the bulk load; their lazy + // callbacks will load them on first compute(). + for (const auto& name : cond_lazy_tensor_names) { + ignore_tensors.insert(name); + } + for (const auto& name : dit_lazy_tensor_names) { + ignore_tensors.insert(name); + } if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } From b8d1c992c39a89401cd3cc0c38a6b300b523079f Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 16:52:00 +0200 Subject: [PATCH 3/9] fix: drop pagecache after each lazy load to bound RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without an explicit posix_fadvise(POSIX_FADV_DONTNEED), the kernel keeps a model file's pages cached as buff/cache long after we're done with it, so loading the LLM (13.7 GB) followed by the DiT (17 GB) piles up to 30+ GB of cached pages on a 32 GB box and triggers the OOM-killer. - Keep the file descriptor alive in MmapWrapperImpl so we can posix_fadvise(POSIX_FADV_DONTNEED) on it before munmap. madvise alone only unmaps the address range — it does not evict pagecache. - Add POSIX_FADV_SEQUENTIAL on open: nudges the kernel toward a smaller working set during the read. - Make the "using mmap" log line INFO instead of DEBUG so the user can confirm at a glance. - Bound the lazy-load worker count to 2: the per-thread staging buffers grow to the largest tensor seen, so n_threads=8 doubles RAM peak for no measurable read-throughput gain. Result on 32 GB box: peak RSS ~6 GB, peak buff/cache ~12 GB during LLM lazy load — comfortably within budget. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/model.cpp | 9 +++++++-- src/stable-diffusion.cpp | 22 ++++++++++++++++++++-- src/util.cpp | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index 8fdde3b76..32dfbed3c 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -783,11 +783,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::unique_ptr mmapped; if (enable_mmap && !is_zip) { - LOG_DEBUG("using mmap for I/O"); mmapped = MmapWrapper::create(file_path); if (!mmapped) { - LOG_WARN("failed to memory-map '%s'", file_path.c_str()); + LOG_WARN("failed to memory-map '%s' (falling back to read())", + file_path.c_str()); + } else { + LOG_INFO("using mmap for '%s'", file_path.c_str()); } + } else if (!is_zip) { + LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", + file_path.c_str()); } int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 356038146..520d17b48 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1010,7 +1010,16 @@ class StableDiffusionGGML { } if (auto_lazy_load && !llm_lazy_map.empty()) { ModelLoader* loader_ptr = owned_model_loader.get(); - int n_threads_capture = sd_ctx_params->n_threads; + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? 
sd_ctx_params->n_threads : 2, + 2); bool mmap_capture = sd_ctx_params->enable_mmap; cond_stage_model->set_llm_lazy_load([=]() -> bool { auto local_map = llm_lazy_map; @@ -1029,7 +1038,16 @@ class StableDiffusionGGML { dit_lazy_tensor_names.insert(kv.first); } ModelLoader* loader_ptr = owned_model_loader.get(); - int n_threads_capture = sd_ctx_params->n_threads; + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? sd_ctx_params->n_threads : 2, + 2); bool mmap_capture = sd_ctx_params->enable_mmap; diffusion_model->set_lazy_load([=]() -> bool { auto local_map = dit_only_tensors; diff --git a/src/util.cpp b/src/util.cpp index 0b514bb73..743738813 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -174,12 +174,33 @@ bool is_directory(const std::string& path) { class MmapWrapperImpl : public MmapWrapper { public: - MmapWrapperImpl(void* data, size_t size) - : MmapWrapper(data, size) {} + MmapWrapperImpl(void* data, size_t size, int fd) + : MmapWrapper(data, size), fd_(fd) {} ~MmapWrapperImpl() override { +#ifdef __linux__ + // Drop the kernel pagecache pages for this file. madvise(DONTNEED) + // alone only unmaps from the process address space; pagecache + // entries persist (`free` reports them as buff/cache and the OOM + // killer doesn't touch them, but they ARE counted against + // overcommit and can starve other allocations on tight-RAM + // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented + // way to evict pagecache for a specific fd's pages. 
+ if (data_ != nullptr && size_ > 0) { + madvise(data_, size_, MADV_DONTNEED); + } + if (fd_ >= 0) { + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + } +#endif munmap(data_, size_); + if (fd_ >= 0) { + close(fd_); + } } + +private: + int fd_; }; std::unique_ptr MmapWrapper::create(const std::string& filename) { @@ -191,9 +212,10 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { int mmap_flags = MAP_PRIVATE; #ifdef __linux__ - // performance flags used by llama.cpp - // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); - // mmap_flags |= MAP_POPULATE; + // Sequential access hint helps the kernel read-ahead efficiently and + // also encourages eviction of already-read pages (the kernel keeps + // a smaller working set when this is set). + posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); #endif struct stat sb; @@ -206,9 +228,8 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0); - close(file_descriptor); - if (mapped_data == MAP_FAILED) { + close(file_descriptor); return nullptr; } @@ -217,7 +238,7 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); #endif - return std::make_unique(mapped_data, file_size); + return std::make_unique(mapped_data, file_size, file_descriptor); } #endif From f8d4a585ff428f00df7a89eddcce4c5ceedb31d8 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 17:26:44 +0200 Subject: [PATCH 4/9] wip: restrict layer-split to DiT, fall back to single-GPU/CPU for Cond MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The layer-split path for the Conditioner has a bug where some sub-runners (LTX-2 text projection) and possibly some Gemma ops route through host memory despite the planner placing them on GPU bufts — process RSS climbs to ~13 GB worth of supposedly-on-GPU 
tensors and CUDA1 reports only ~150 MB used after the lazy load completes. Until that is root-caused, restrict layer-split to the DiT only. The Cond falls back to whatever auto-fit picks (single GPU when it fits, OFFLOAD or CPU otherwise — slow but correct). DiT layer-split is unaffected and continues to work for the user's primary use case (split a 17 GB Q6_K LTX-2 DiT across two GPUs). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 7ca789a0b..59fa491cb 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -273,10 +273,14 @@ inline Plan compute_plan(const std::vector& components, }; // Layer-split is only meaningful for components made up of many similarly - // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE - // is too structurally heterogeneous for naive block partitioning. + // shaped blocks. Currently restricted to the DiT — the Conditioner's + // layer-split path has a known issue where some sub-runners (e.g. LTX-2 + // text projection) and possibly some Gemma ops route through CPU, + // dragging weights back into RAM and tanking performance. Until that + // is fixed, the planner keeps the Conditioner on a single GPU (or + // OFFLOAD / CPU when it doesn't fit). auto supports_layer_split = [](ComponentKind k) { - return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + return k == ComponentKind::DIT; }; auto build_options = [&](const Component& c) { From 155f5235e937dcee556016d3a18532e790f99be8 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 21:40:06 +0200 Subject: [PATCH 5/9] fix: layer-split alloc actually lands on GPU; was a diagnostic artifact Re-enable Conditioner layer-split and add detailed diagnostics around alloc_params_buffer_layer_split + ensure_params_loaded. 
The added prints query ggml_backend_dev_memory before and after each per-backend buft_alloc_buffer and confirm cudaMalloc DOES reserve the requested GPU memory: gemma3_12b layer-split alloc[0] CUDA1 req=10464 MB dev_free 13495 -> 3029 MB (drop 10466 MB) is_host=0 gemma3_12b layer-split alloc[1] CUDA0 req=5177 MB dev_free 7903 -> 2725 MB (drop 5178 MB) is_host=0 Combined Q6_K LTX-2 DiT (17 GB) + Gemma 3 12B Q8_K_XL (15 GB) + text projection (2.2 GB) + VAE (1.4 GB) end-to-end on a 9.8 GB + 15.7 GB GPU pair via layer-split + lazy-load now completes: generate_video completed in 82.72s The earlier "tensors in RAM" symptom was from a stale binary state during the iterative build cycle, not an actual correctness bug. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backend_fit.hpp | 10 +++----- src/ggml_extend.hpp | 61 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 59fa491cb..7ca789a0b 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -273,14 +273,10 @@ inline Plan compute_plan(const std::vector& components, }; // Layer-split is only meaningful for components made up of many similarly - // shaped blocks. Currently restricted to the DiT — the Conditioner's - // layer-split path has a known issue where some sub-runners (e.g. LTX-2 - // text projection) and possibly some Gemma ops route through CPU, - // dragging weights back into RAM and tanking performance. Until that - // is fixed, the planner keeps the Conditioner on a single GPU (or - // OFFLOAD / CPU when it doesn't fit). + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. 
auto supports_layer_split = [](ComponentKind k) { - return k == ComponentKind::DIT; + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; }; auto build_options = [&](const Component& c) { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index cd1662523..64095764e 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2202,6 +2202,16 @@ struct GGMLRunner { for (size_t i = 0; i < backends.size(); i++) { bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + // Diagnostic: confirm we got a sensible buft from each backend. + const char* buft_name = ggml_backend_buft_name(bufts[i]); + const char* backend_name = ggml_backend_name(backends[i]); + ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); + enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; + const char* dev_name = dev ? ggml_backend_dev_name(dev) : "(none)"; + LOG_INFO("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + get_desc().c_str(), i, backend_name ? backend_name : "(null)", + buft_name ? buft_name : "(null)", dev_name, + (int)dev_type); } // First pass: assign each tensor to a backend, accumulate sizes. @@ -2230,6 +2240,10 @@ struct GGMLRunner { multi_params_buffers.assign(backends.size(), nullptr); for (size_t i = 0; i < backends.size(); i++) { if (sizes[i] == 0) continue; + // Diagnostic: query the device's free memory BEFORE alloc. + ggml_backend_dev_t dev_pre = ggml_backend_buft_get_device(bufts[i]); + size_t free_pre = 0, total_pre = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_pre, &total_pre); multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); if (multi_params_buffers[i] == nullptr) { LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", @@ -2238,6 +2252,22 @@ struct GGMLRunner { sizes[i] / (1024.f * 1024.f)); return false; } + // Diagnostic: query AFTER alloc. 
The drop in free memory tells + // us whether the alloc actually went to GPU device memory or + // to a virtual reservation that's not yet committed. + size_t free_post = 0, total_post = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_post, &total_post); + int64_t actual_drop = (int64_t)free_pre - (int64_t)free_post; + void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); + size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); + bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); + LOG_INFO("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", + get_desc().c_str(), i, ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), + free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f), + actual_drop / (1024.f * 1024.f), + base, (int)is_host); } // Bind tensors via ggml_tallocr. @@ -2255,6 +2285,18 @@ struct GGMLRunner { return false; } } + // Diagnostic: pick a sample tensor per backend and confirm its + // buffer + data pointer. + std::vector sampled(backends.size(), false); + for (auto& kv : tensor_backend_idx) { + int idx = kv.second; + if (sampled[idx]) continue; + sampled[idx] = true; + ggml_tensor* t = kv.first; + LOG_INFO("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, + t->buffer ? (int)ggml_backend_buffer_is_host(t->buffer) : -1); + } for (auto* buf : multi_params_buffers) { if (buf != nullptr) { ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); @@ -2327,6 +2369,25 @@ struct GGMLRunner { } int64_t t1 = ggml_time_ms(); LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + // Diagnostic: report device-memory free per backend AFTER load. 
+ // If the bytes actually went to GPU, free should have decreased + // by ~params_size for each layer-split shard. + if (multi_backend_mode) { + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + for (size_t i = 0; i < backends.size(); i++) { + ggml_backend_dev_t dev = ggml_backend_get_device(backends[i]); + if (!dev) continue; + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + LOG_INFO("%s post-load device %s free=%.1f MB / %.1f MB", + get_desc().c_str(), + ggml_backend_dev_name(dev), + free_b / (1024.f * 1024.f), + total_b / (1024.f * 1024.f)); + } + } return true; } From 3c874965c905fc227f5639f4f5d24fe0882550f0 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 21:46:58 +0200 Subject: [PATCH 6/9] feat: add --quiet-unknown-tensors; demote layer-split diagnostics to DEBUG - New CLI flag --quiet-unknown-tensors (sd_ctx_params_t.quiet_unknown_tensors) suppresses the per-tensor 'unknown tensor X in model file' log lines emitted during model loading. LTX-2 ships ~4600 audio-branch and encoder tensors a video-only pipeline doesn't consume; without this flag the load output is drowned out by them. A single summary line is emitted at the end with the count of skipped tensors. - The flag is plumbed through all three load paths: - bulk loader at init (eager components: VAE, projector) - lazy LLM load callback (Conditioner) - lazy DiT load callback - ModelLoader::load_tensors gains a quiet_unknown_tensors=false default parameter so existing callers keep their current behaviour. - The four layer-split diagnostic LOG_INFO lines (backend, alloc, sample, post-load) are demoted to LOG_DEBUG; they're noisy and only useful when triaging the bug we just chased. The 'layer-split params on CUDA*: X MB' summary line stays at INFO since it shows the user how params were distributed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/common/common.cpp | 8 ++++++++ examples/common/common.h | 4 ++++ include/stable-diffusion.h | 7 +++++++ src/ggml_extend.hpp | 8 ++++---- src/model.cpp | 14 ++++++++++++-- src/model.h | 3 ++- src/stable-diffusion.cpp | 15 ++++++++++++--- 7 files changed, 49 insertions(+), 10 deletions(-) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index d3626fcce..2bbd1d216 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -524,6 +524,12 @@ ArgOptions SDContextParams::get_options() { "--fit-dry-run", "auto-fit: print the computed plan and exit without loading models", true, &auto_fit_dry_run}, + {"", + "--quiet-unknown-tensors", + "suppress per-tensor 'unknown tensor X in model file' log lines " + "(useful for LTX-2 and similar models that ship many unused " + "tensors); a single summary line with the count is logged instead", + true, &quiet_unknown_tensors}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -756,6 +762,7 @@ std::string SDContextParams::to_string() const { << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? 
"true" : "false") << ",\n" @@ -839,6 +846,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f auto_fit_compute_reserve_vae_mb, auto_fit_compute_reserve_cond_mb, auto_multi_gpu, + quiet_unknown_tensors, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index 8243d6cba..e21a68142 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -144,6 +144,10 @@ struct SDContextParams { int auto_fit_compute_reserve_cond_mb = 0; bool auto_multi_gpu = true; + // When set, the model loader skips per-tensor "unknown tensor" log + // lines and instead emits a single summary count at the end of load. + bool quiet_unknown_tensors = false; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index ed6336ba1..2de064288 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -233,6 +233,13 @@ typedef struct { // they fit. Defaults to true. Each component still lives entirely on // one device — no intra-tensor row split. bool auto_multi_gpu; + + // Suppress per-tensor "unknown tensor 'X' in model file" log lines + // emitted during model loading. Useful for models like LTX-2 that + // ship hundreds of audio-branch / encoder tensors a video-only + // pipeline doesn't consume. A single summary line is logged at the + // end with the count of skipped tensors. + bool quiet_unknown_tensors; } sd_ctx_params_t; typedef struct { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 64095764e..54ec2fd1a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2208,7 +2208,7 @@ struct GGMLRunner { ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; const char* dev_name = dev ? 
ggml_backend_dev_name(dev) : "(none)"; - LOG_INFO("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + LOG_DEBUG("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", get_desc().c_str(), i, backend_name ? backend_name : "(null)", buft_name ? buft_name : "(null)", dev_name, (int)dev_type); @@ -2261,7 +2261,7 @@ struct GGMLRunner { void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); - LOG_INFO("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + LOG_DEBUG("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), @@ -2293,7 +2293,7 @@ struct GGMLRunner { if (sampled[idx]) continue; sampled[idx] = true; ggml_tensor* t = kv.first; - LOG_INFO("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + LOG_DEBUG("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, t->buffer ? 
(int)ggml_backend_buffer_is_host(t->buffer) : -1); } @@ -2381,7 +2381,7 @@ struct GGMLRunner { if (!dev) continue; size_t free_b = 0, total_b = 0; ggml_backend_dev_memory(dev, &free_b, &total_b); - LOG_INFO("%s post-load device %s free=%.1f MB / %.1f MB", + LOG_DEBUG("%s post-load device %s free=%.1f MB / %.1f MB", get_desc().c_str(), ggml_backend_dev_name(dev), free_b / (1024.f * 1024.f), diff --git a/src/model.cpp b/src/model.cpp index 32dfbed3c..2f7e2b78f 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1008,9 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, int n_threads, - bool enable_mmap) { + bool enable_mmap, + bool quiet_unknown_tensors) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; + std::atomic unknown_tensor_count{0}; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); @@ -1028,7 +1030,11 @@ bool ModelLoader::load_tensors(std::map& tensors, return true; } } - LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + if (quiet_unknown_tensors) { + unknown_tensor_count.fetch_add(1); + } else { + LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + } return true; } @@ -1077,6 +1083,10 @@ bool ModelLoader::load_tensors(std::map& tensors, if (some_tensor_not_init) { return false; } + if (quiet_unknown_tensors && unknown_tensor_count.load() > 0) { + LOG_INFO("skipped %zu unknown tensors (--quiet-unknown-tensors)", + unknown_tensor_count.load()); + } return true; } diff --git a/src/model.h b/src/model.h index 10aaf8512..03d4e3732 100644 --- a/src/model.h +++ b/src/model.h @@ -226,7 +226,8 @@ class ModelLoader { bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, int n_threads = 0, - bool use_mmap = 
false); + bool use_mmap = false, + bool quiet_unknown_tensors = false); std::vector get_tensor_names() const { std::vector names; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 520d17b48..181dd8350 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1021,10 +1021,12 @@ class StableDiffusionGGML { ? sd_ctx_params->n_threads : 2, 2); bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; cond_stage_model->set_llm_lazy_load([=]() -> bool { auto local_map = llm_lazy_map; return loader_ptr->load_tensors(local_map, /*ignore=*/{}, - n_threads_capture, mmap_capture); + n_threads_capture, mmap_capture, + quiet_capture); }); LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", llm_lazy_map.size()); @@ -1049,10 +1051,12 @@ class StableDiffusionGGML { ? sd_ctx_params->n_threads : 2, 2); bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; diffusion_model->set_lazy_load([=]() -> bool { auto local_map = dit_only_tensors; return loader_ptr->load_tensors(local_map, /*ignore=*/{}, - n_threads_capture, mmap_capture); + n_threads_capture, mmap_capture, + quiet_capture); }); LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", dit_only_tensors.size()); @@ -1313,7 +1317,9 @@ class StableDiffusionGGML { ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, + sd_ctx_params->enable_mmap, + sd_ctx_params->quiet_unknown_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -2695,6 +2701,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { 
sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->quiet_unknown_tensors = false; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2742,6 +2749,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "auto_fit_compute_reserve_vae_mb: %d\n" "auto_fit_compute_reserve_cond_mb: %d\n" "auto_multi_gpu: %s\n" + "quiet_unknown_tensors: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2787,6 +2795,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb, sd_ctx_params->auto_fit_compute_reserve_cond_mb, BOOL_STR(sd_ctx_params->auto_multi_gpu), + BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), From d9d38baa58f0765a82dd2d4c81709a4a68d1b8b9 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 22:48:30 +0200 Subject: [PATCH 7/9] feat: add row-split (cuda_split_buffer_type) alongside layer-split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restores the row-split path that worked on the original LTX-2 branch: matmul weight tensors are split row-wise across CUDA devices via cuda_split_buffer_type and the CUDA backend handles cross-device dispatch internally. Sched still wires the additional CUDA backends so it can route copies between devices, but no per-block buffer doubling — the compute buffer is dramatically smaller than layer- split for cross-backend graphs. Surface: - New CLI: --multi-gpu-mode={row,layer,off} (default: row). - New API field: sd_ctx_params_t.multi_gpu_mode. - New planner placement: GPU_TENSOR_SPLIT, scored slightly above GPU_LAYER_SPLIT so the planner prefers it when both fit. 
Changes: - backend_fit::Placement gains GPU_TENSOR_SPLIT and a MultiGpuMode enum; build_options enumerates only the chosen mode's split options. Decision population sorts split_device_ids by descending TOTAL memory (always use the bigger GPU as main). - gpu_peak handles GPU_TENSOR_SPLIT correctly: per-device share + compute reserve added to the biggest-memory GPU only. - MultiBackendSpec gains a `mode` field. ROW_SPLIT carries tensor_split_ratios + main_device; LAYER_SPLIT carries tensor_backend_fn. - GGMLRunner ctor branches on mode: ROW_SPLIT initializes cuda_split_buffer_type; LAYER_SPLIT consumes the backend callback. - alloc_params_buffer_row_split: walks params_ctx, splits into matmul-eligible (row-split-buft) vs main (default buft) buffers, binds via tallocr. is_row_split_eligible excludes views, so the cuda split buft never sees a view tensor. - free_params_buffer + ensure_params_loaded recognize the new row_split_buffer / row_main_buffer fields so lazy load doesn't re-trigger on subsequent compute() calls. - Spec wiring in stable-diffusion.cpp: prepare_row_split_spec computes per-device ratios from the planner's share_bytes, picks main_device by largest share, and inits the additional CUDA backends so sched can schedule cross-device copies. - CMakeLists.txt: add -DSD_USE_CUDA when SD_CUDA is enabled (had been silently undefined, leaving all #ifdef SD_USE_CUDA blocks inactive — a latent bug that broke row-split alloc). - ggml-cuda.cu: re-add the small view-init early-return in ggml_backend_cuda_split_buffer_init_tensor. Without this, sched's per-tensor init crashes on view tensors of split-tensor weights. The row-split path itself routes views to the main buffer (not the split-buft) via is_row_split_eligible, but sched-managed scratch tensors still hit the split buft for op outputs. - Lazy-load auto-detect now counts GPU_TENSOR_SPLIT shares in its init-time-SUM check (was missing — the planner thought everything fit and lazy-load never triggered). 
Known issue: at LTX-2 Q6_K + Gemma 12B Q8_K_XL scale, the per-tensor cudaMalloc fragmentation inside cuda_split_buffer_type's init_tensor exhausts CUDA0's small-alloc pool during DiT load even though the planner's MAX-based peak fits. Pre-existing limitation of the split buft; needs a separate followup (e.g. coalesce tensors into a single big alloc, or use VMM-backed managed memory). Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 1 + examples/common/common.cpp | 10 +++ examples/common/common.h | 3 + include/stable-diffusion.h | 19 ++++- src/backend_fit.hpp | 133 +++++++++++++++++++++++++---- src/ggml_extend.hpp | 171 ++++++++++++++++++++++++++++++++++--- src/stable-diffusion.cpp | 159 ++++++++++++++++++++++++++++------ 7 files changed, 443 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48ce456ea..32375b163 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF if(SD_CUDA) message("-- Use CUDA as backend stable-diffusion") set(GGML_CUDA ON) + add_definitions(-DSD_USE_CUDA) endif() if(SD_METAL) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 2bbd1d216..36f1c6a86 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -420,6 +420,14 @@ ArgOptions SDContextParams::get_options() { "--vision-backend-device", "ggml device name for the vision model (currently routed through main).", &vision_backend_device}, + {"", + "--multi-gpu-mode", + "auto-fit multi-GPU split mechanism: 'row' (default; CUDA-only " + "row-split via cuda_split_buffer_type, single backend, smaller " + "compute buffer), 'layer' (block-indexed tensors split across " + "per-block backends + sched, generic but ~2x activation cost at " + "boundaries), or 'off' (never split a single component)", + &multi_gpu_mode}, }; options.int_options = { @@ -762,6 +770,7 @@ std::string SDContextParams::to_string() const { << " 
auto_fit_target_mb: " << auto_fit_target_mb << ",\n" << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" @@ -846,6 +855,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f auto_fit_compute_reserve_vae_mb, auto_fit_compute_reserve_cond_mb, auto_multi_gpu, + multi_gpu_mode.empty() ? nullptr : multi_gpu_mode.c_str(), quiet_unknown_tensors, }; return sd_ctx_params; diff --git a/examples/common/common.h b/examples/common/common.h index e21a68142..1df32f9c0 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -143,6 +143,9 @@ struct SDContextParams { int auto_fit_compute_reserve_vae_mb = 0; int auto_fit_compute_reserve_cond_mb = 0; bool auto_multi_gpu = true; + // "row" (default), "layer", or "off". Selects the multi-GPU split + // mechanism the auto-fit planner is allowed to emit. + std::string multi_gpu_mode = "row"; // When set, the model loader skips per-tensor "unknown tensor" log // lines and instead emits a single summary count at the end of load. diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 2de064288..7da5324b4 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -230,10 +230,25 @@ typedef struct { // When more than one GPU device is present, prefer placing different // components on different GPUs to balance load and fit larger total // working sets. Set false to keep all components on a single GPU when - // they fit. Defaults to true. Each component still lives entirely on - // one device — no intra-tensor row split. + // they fit. Defaults to true. 
bool auto_multi_gpu; + // When auto_multi_gpu is true and a single component doesn't fit on + // one GPU, the planner can split it across multiple GPUs. Two + // mechanisms: + // "row": matmul weights row-split across CUDA devices via + // cuda_split_buffer_type. Single CUDA backend; no sched. + // Cheaper compute buffer (no cross-backend doubling) but + // CUDA-only. Default. + // "layer": block-indexed tensors assigned to per-block backends + // and routed via ggml_backend_sched. Generic across + // backends but costs ~2x activation memory at boundaries. + // "off": never split a single component across GPUs. Components + // that don't fit fall back to OFFLOAD or CPU. + // The string is parsed by backend_fit::str_to_multi_gpu_mode; if + // unrecognized, "row" is used. + const char* multi_gpu_mode; + // Suppress per-tensor "unknown tensor 'X' in model file" log lines // emitted during model loading. Useful for models like LTX-2 that // ship hundreds of audio-branch / encoder tensors a video-only diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index 7ca789a0b..b95632750 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -45,7 +45,8 @@ enum class Placement { CPU, GPU, GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU - GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) }; struct Component { @@ -94,6 +95,28 @@ struct ComputeReserves { int64_t conditioner_bytes = int64_t(512) * MiB; }; +enum class MultiGpuMode { + OFF, // never split a single component across GPUs + ROW, // CUDA-only: row-split matmul weights via cuda_split_buffer_type + LAYER, // generic: assign block-indexed tensors to per-block backends + sched +}; + +inline const char* multi_gpu_mode_str(MultiGpuMode m) { + switch (m) { + case MultiGpuMode::OFF: return "off"; + case 
MultiGpuMode::ROW: return "row"; + case MultiGpuMode::LAYER: return "layer"; + } + return "?"; +} + +inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) { + if (s == "off") return MultiGpuMode::OFF; + if (s == "row") return MultiGpuMode::ROW; + if (s == "layer") return MultiGpuMode::LAYER; + return MultiGpuMode::ROW; // default +} + // --- Classification ------------------------------------------------------- inline bool classify_tensor(const std::string& name, ComponentKind& out) { @@ -231,6 +254,34 @@ inline int64_t gpu_peak(int gpu_idx, if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { if (dev[i] != gpu_idx) continue; footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + /*compute_bytes=*/0, + devices, gpu_idxs); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { // dev[i] holds the bitmask of participating GPU indices into the // devices[] vector (encoded by the planner). Look up our slot. 
@@ -258,9 +309,13 @@ inline int64_t gpu_peak(int gpu_idx, inline Plan compute_plan(const std::vector& components, const std::vector& devices, int64_t margin_bytes, - bool allow_multi_gpu = true) { + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { const size_t nC = components.size(); const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } std::vector cap(nG, 0); for (size_t g = 0; g < nG; g++) { @@ -287,9 +342,15 @@ inline Plan compute_plan(const std::vector& components, opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); } } - // Layer-split: enumerate non-trivial subsets of GPUs (size >= 2). - // Encode the participating set as a bitmask in device_idx. - if (allow_multi_gpu && nG >= 2 && supports_layer_split(c.kind)) { + // Multi-GPU split: one option type per mode. Encoded as a bitmask + // of participating GPUs in device_idx. + if (mode == MultiGpuMode::ROW && nG >= 2 && supports_layer_split(c.kind)) { + // Row-split spans all GPUs; single option with all bits set. + int all_mask = (1 << nG) - 1; + opts.push_back({Placement::GPU_TENSOR_SPLIT, all_mask}); + } + if (mode == MultiGpuMode::LAYER && nG >= 2 && supports_layer_split(c.kind)) { + // Layer-split: enumerate non-trivial subsets (size >= 2). const int max_mask = 1 << nG; for (int mask = 1; mask < max_mask; mask++) { if (__builtin_popcount(mask) < 2) continue; @@ -326,6 +387,15 @@ inline Plan compute_plan(const std::vector& components, } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { s += 5 * pw; gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. 
+ s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { // Better than CPU but worse than fitting on a single GPU // (cross-GPU traffic between blocks). @@ -418,6 +488,36 @@ inline Plan compute_plan(const std::vector& components, d.device_id = DEVICE_ID_CPU; d.on_host_bytes = c.params_bytes + c.compute_bytes; plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, /*compute_bytes=*/0, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (gets the row-split's compute + // buffer + sub-runners that don't get their own spec). This + // matches the user's preference: always use the bigger GPU + // as main for splits. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + int64_t share = shares[k]; + if (pos == 0) share += c.compute_bytes; // main (= biggest) gets compute + d.split_share_bytes.push_back(share); + max_share = std::max(max_share, share); + } + d.device_id = d.split_device_ids.empty() ? 
DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { std::vector gpu_idxs; for (size_t k = 0; k < nG; k++) { @@ -425,15 +525,15 @@ inline Plan compute_plan(const std::vector& components, } auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, devices, gpu_idxs); - // Sort participating GPUs by descending share so the LARGEST-share - // GPU is listed first. Sub-runners that don't get the layer-split - // spec (e.g. the LTX-2 text projection) follow the "main" backend - // (= first in this list) — putting the biggest one first keeps - // them on the GPU with most headroom. + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. std::vector order(gpu_idxs.size()); std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), - [&](size_t a, size_t b) { return shares[a] > shares[b]; }); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); int64_t max_share = 0; for (size_t pos = 0; pos < order.size(); pos++) { @@ -471,6 +571,7 @@ inline const char* placement_str(Placement p) { case Placement::GPU: return "GPU"; case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; } return "?"; } @@ -506,15 +607,17 @@ inline void print_plan(const Plan& plan, LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", d.name.c_str(), d.device_id, (long long)(d.on_device_bytes / MiB)); - } else if (d.placement == Placement::GPU_LAYER_SPLIT) { + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { 
std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? "row" : "layer"; for (size_t k = 0; k < d.split_device_ids.size(); k++) { if (k > 0) ids += "+"; ids += "GPU" + std::to_string(d.split_device_ids[k]); ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; } - LOG_INFO(" %-12s -> %s", - d.name.c_str(), ids.c_str()); + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); } else { LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", d.name.c_str(), d.device_id, diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 54ec2fd1a..de2bbcd2b 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -25,6 +25,9 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" +#ifdef SD_USE_CUDA +#include "ggml-cuda.h" +#endif #include "ggml_extend_backend.hpp" #include "model.h" @@ -1721,16 +1724,29 @@ struct GGMLRunnerContext { // To enable: populate g_pending_multi_backend_spec() with the additional // backends + tensor->backend callback, then construct the GGMLRunner. The // ctor consumes and clears the pending pointer. +enum class MultiBackendMode { + LAYER_SPLIT, // assign block-indexed tensors to per-block backends + sched + ROW_SPLIT, // CUDA split-buft: matmul weights row-split across devices +}; + struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + // Extra backends *in addition to* the runner's main runtime_backend. // The first entry's role is the main backend; we don't list it here. std::vector additional_backends; - // Maps a weight tensor name to one of the runner's backends (the main - // runtime_backend, or one of additional_backends). Returning nullptr - // means "use the main runtime_backend". + // LAYER_SPLIT: maps a weight tensor name to one of the runner's + // backends (the main runtime_backend, or one of additional_backends). + // Returning nullptr means "use the main runtime_backend". 
std::function tensor_backend_fn; + // ROW_SPLIT (CUDA-only): per-device row split ratios (length = total + // CUDA device count) and main device. Empty means use CUDA's default + // free-VRAM proportions. + std::vector tensor_split_ratios; + int main_device = 0; + // Optional CPU backend appended last to the sched for unsupported-op // fallback. May be nullptr. ggml_backend_t cpu_fallback = nullptr; @@ -1748,17 +1764,25 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; - // --- multi-backend (layer-split) state --- - bool multi_backend_mode = false; + // --- multi-backend state (layer-split via sched OR row-split via cuda_split_buft) --- + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; std::vector additional_backends; ggml_backend_t cpu_fallback_backend = nullptr; bool owns_cpu_fallback_backend = false; std::function tensor_backend_fn = nullptr; ggml_backend_sched_t sched = nullptr; bool sched_reserved = false; - // Per-backend params buffers when multi_backend_mode is on. - // params_buffer (single-backend) stays nullptr in this mode. + // Per-backend params buffers when LAYER_SPLIT is active. ROW_SPLIT uses + // a CUDA split-buft buffer + a regular buffer for non-split tensors, + // stored in row_split_buffer + row_main_buffer instead. std::vector multi_params_buffers; + // ROW_SPLIT-only state. + std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; + ggml_backend_buffer_t row_main_buffer = nullptr; // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual // alloc + tensor-data load is deferred until the first compute(). 
The @@ -2122,17 +2146,40 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { - // Consume any pending multi-backend (layer-split) spec set by the - // caller via g_pending_multi_backend_spec(). + // Consume any pending multi-backend spec set by the caller via + // g_pending_multi_backend_spec(). MultiBackendSpec* pending = g_pending_multi_backend_spec(); if (pending != nullptr) { g_pending_multi_backend_spec() = nullptr; multi_backend_mode = true; + multi_backend_kind = pending->mode; additional_backends = pending->additional_backends; tensor_backend_fn = pending->tensor_backend_fn; cpu_fallback_backend = pending->cpu_fallback; - if (offload_params_to_cpu) { - LOG_WARN("multi-backend layer-split is incompatible with " + row_split_ratios = pending->tensor_split_ratios; + row_main_device = pending->main_device; +#ifdef SD_USE_CUDA + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = ggml_backend_cuda_split_buffer_type( + row_main_device, + row_split_ratios.empty() ? nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("multi-backend: cuda split buft init failed; " + "falling back to single-backend mode"); + multi_backend_mode = false; + additional_backends.clear(); + cpu_fallback_backend = nullptr; + } + } +#else + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + LOG_WARN("multi-backend: row-split requires CUDA; " + "falling back to single-backend mode"); + multi_backend_mode = false; + } +#endif + if (multi_backend_mode && offload_params_to_cpu) { + LOG_WARN("multi-backend split is incompatible with " "offload_params_to_cpu; ignoring offload"); offload_params_to_cpu = false; } @@ -2315,10 +2362,99 @@ struct GGMLRunner { return true; } + // Heuristic for row-split eligibility: contiguous, rank-2, both dims + // >= 256, and NOT a view. 
1D biases / norms / embeddings / small + // projections / views fall back to the main GPU's regular per-device + // buft. Excluding views avoids the cuda split buft's + // GGML_ASSERT(view_src == nullptr) — sticking to the buft's documented + // contract instead of patching ggml. + static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + bool alloc_params_buffer_row_split() { +#ifdef SD_USE_CUDA + ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); + const size_t main_align = ggml_backend_buft_get_alignment(main_buft); + const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); + + size_t main_size = 0, split_size = 0; + size_t main_count = 0, split_count = 0; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + if (is_row_split_eligible(t)) { + size_t s = ggml_backend_buft_get_alloc_size(row_split_buft, t); + split_size += GGML_PAD(s, split_align); + split_count++; + } else { + size_t s = ggml_backend_buft_get_alloc_size(main_buft, t); + main_size += GGML_PAD(s, main_align); + main_count++; + } + } + + if (main_size > 0) { + row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size); + if (row_main_buffer == nullptr) { + LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", + get_desc().c_str(), main_size / (1024.f * 1024.f)); + return false; + } + } + if (split_size > 0) { + row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size); + if (row_split_buffer == nullptr) { + LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", + get_desc().c_str(), split_size / (1024.f * 1024.f)); + return false; + } + } + + ggml_tallocr main_alloc{}; + ggml_tallocr split_alloc{}; + if (row_main_buffer != 
nullptr) main_alloc = ggml_tallocr_new(row_main_buffer); + if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer); + + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + ggml_status st = is_row_split_eligible(t) + ? ggml_tallocr_alloc(&split_alloc, t) + : ggml_tallocr_alloc(&main_alloc, t); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s row-split tallocr_alloc failed for tensor %s", + get_desc().c_str(), t->name); + return false; + } + } + + if (row_main_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + if (row_split_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)", + get_desc().c_str(), + main_size / (1024.f * 1024.f), main_count, + split_size / (1024.f * 1024.f), split_count); + return true; +#else + LOG_ERROR("alloc_params_buffer_row_split called without CUDA"); + return false; +#endif + } + // Internal: always materializes the params buffer. Used by both the // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` // path; the latter must bypass the lazy-skip. 
bool do_alloc_params_buffer() { + if (multi_backend_mode && multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } if (multi_backend_mode) { return alloc_params_buffer_layer_split(); } @@ -2354,7 +2490,8 @@ struct GGMLRunner { } bool ensure_params_loaded() { - if (params_buffer != nullptr || !multi_params_buffers.empty()) { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { return true; } if (!lazy_load_fn) { @@ -2402,6 +2539,14 @@ struct GGMLRunner { } } multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } if (sched != nullptr) { ggml_backend_sched_free(sched); sched = nullptr; @@ -2417,6 +2562,8 @@ struct GGMLRunner { for (auto* buf : multi_params_buffers) { if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); } + if (row_split_buffer != nullptr) total += ggml_backend_buffer_get_size(row_split_buffer); + if (row_main_buffer != nullptr) total += ggml_backend_buffer_get_size(row_main_buffer); return total; } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 181dd8350..682b2347d 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -127,16 +127,20 @@ class StableDiffusionGGML { bool fit_cond_offload_params = false; bool fit_vae_offload_params = false; - // Layer-split state (when auto-fit picks GPU_LAYER_SPLIT). Holds the - // ordered list of device names and per-device share bytes; the actual - // backend handles are init'd at construction time and stored in - // *_extra_backends so the destructor can free them. + // Multi-GPU split state (LAYER_SPLIT or ROW_SPLIT). 
Holds the ordered + // list of device names and per-device share bytes; the actual backend + // handles are init'd at construction time and stored in *_extra_backends + // so the destructor can free them. fit_*_row_split=true means use the + // CUDA row-split path (matmul weights split row-wise via cuda_split_buft); + // false means layer-split (per-block backend assignment via sched). std::vector fit_dit_split_device_names; std::vector fit_dit_split_share_bytes; std::vector fit_dit_extra_backends; + bool fit_dit_row_split = false; std::vector fit_cond_split_device_names; std::vector fit_cond_split_share_bytes; std::vector fit_cond_extra_backends; + bool fit_cond_row_split = false; // Owned model loader: kept alive across init() so lazy_load callbacks // can re-read tensor data from disk on demand. Only set when at least @@ -410,8 +414,11 @@ class StableDiffusionGGML { auto devices = backend_fit::enumerate_gpu_devices(); int64_t margin_bytes = int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + backend_fit::MultiGpuMode mode = backend_fit::str_to_multi_gpu_mode( + SAFE_STR(sd_ctx_params->multi_gpu_mode)); auto plan = backend_fit::compute_plan( - components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu); + components, devices, margin_bytes, + sd_ctx_params->auto_multi_gpu, mode); backend_fit::print_plan(plan, components, devices, margin_bytes); if (sd_ctx_params->auto_fit_dry_run) { @@ -439,9 +446,11 @@ class StableDiffusionGGML { std::string& out_device, bool& out_offload, std::vector& out_split_devices, - std::vector& out_split_shares) { + std::vector& out_split_shares, + bool& out_row_split) { out_split_devices.clear(); out_split_shares.clear(); + out_row_split = false; if (d == nullptr) { out_device.clear(); out_offload = false; @@ -452,7 +461,8 @@ class StableDiffusionGGML { out_offload = false; return; } - if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + if (d->placement == 
backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { // Primary device drives main_backend choice for the model; // the rest become additional backends in the spec. for (size_t k = 0; k < d->split_device_ids.size(); k++) { @@ -461,6 +471,7 @@ class StableDiffusionGGML { } if (!out_split_devices.empty()) out_device = out_split_devices[0]; out_offload = false; + out_row_split = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT); return; } out_device = device_id_to_name(d->device_id); @@ -468,14 +479,18 @@ class StableDiffusionGGML { }; std::vector dummy_devs; std::vector dummy_shares; + bool dummy_row_split = false; resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), fit_diffusion_device, fit_dit_offload_params, - fit_dit_split_device_names, fit_dit_split_share_bytes); + fit_dit_split_device_names, fit_dit_split_share_bytes, + fit_dit_row_split); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), - fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares); + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares, + dummy_row_split); resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), fit_clip_device, fit_cond_offload_params, - fit_cond_split_device_names, fit_cond_split_share_bytes); + fit_cond_split_device_names, fit_cond_split_share_bytes, + fit_cond_row_split); // CPU placements: leave fit_*_device empty AND remember they're // CPU so the resolver below picks ggml_backend_cpu_init(). 
@@ -488,7 +503,8 @@ class StableDiffusionGGML { std::map sum_per_device; auto add_sum = [&](const backend_fit::Decision* d) { if (!d) return; - if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { for (size_t k = 0; k < d->split_device_ids.size(); k++) { sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; } @@ -585,6 +601,84 @@ class StableDiffusionGGML { const char* vae_dev_name = effective_device(fit_vae_device, sd_ctx_params->vae_backend_device); + // Build the row-split MultiBackendSpec for a component (ROW_SPLIT + // mode). Unlike layer-split, the runner uses a SINGLE CUDA backend; + // matmul weights are row-split across all CUDA devices internally + // by cuda_split_buffer_type. extra_backends stays empty. + // - share_devices/share_bytes: per-device share order from auto-fit + // (largest first by descending share). The first device is the + // "main" CUDA device, where the compute buffer lives. + // Returns true on success; populates out_spec.tensor_split_ratios + // with a vector of length total CUDA device count. + auto prepare_row_split_spec = [&](const std::vector& share_devices, + const std::vector& share_bytes, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { +#ifdef SD_USE_CUDA + const int cuda_count = ggml_backend_cuda_get_device_count(); + if (cuda_count <= 0 || share_devices.size() < 2) return false; + + // Map device names like "CUDA0" -> 0, "CUDA1" -> 1, ... + auto cuda_index_of = [](const std::string& name) -> int { + if (name.rfind("CUDA", 0) != 0) return -1; + try { return std::stoi(name.substr(4)); } catch (...) 
{ return -1; } + }; + + std::vector ratios(cuda_count, 0.0f); + int64_t total = 0; + for (auto b : share_bytes) total += b; + if (total <= 0) return false; + int main_dev = -1; + int64_t max_share = -1; + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = cuda_index_of(share_devices[k]); + if (idx < 0 || idx >= cuda_count) continue; + ratios[idx] = float(double(share_bytes[k]) / double(total)); + if (share_bytes[k] > max_share) { + max_share = share_bytes[k]; + main_dev = idx; + } + } + if (main_dev < 0) return false; + + // Init extra CUDA backends for the non-main devices so sched + // can route ops across them (row-split tensors are dispatched + // by the CUDA backend; ggml-sched still needs all participating + // backends in its list to schedule cross-device copies). + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = cuda_index_of(share_devices[k]); + if (idx == main_dev || idx < 0) continue; + ggml_backend_t b = init_named_backend(share_devices[k]); + if (b != nullptr) { + out_extra_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", + share_devices[k].c_str()); + } + } + out_spec.mode = MultiBackendMode::ROW_SPLIT; + out_spec.tensor_split_ratios = ratios; + out_spec.main_device = main_dev; + out_spec.additional_backends.assign(out_extra_backends.begin(), + out_extra_backends.end()); + out_spec.tensor_backend_fn = nullptr; + out_spec.cpu_fallback = nullptr; + + std::string ratio_str; + for (int i = 0; i < cuda_count; i++) { + if (i > 0) ratio_str += ","; + char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); + ratio_str += buf; + } + LOG_INFO("row-split spec: ratios=[%s] main_device=%d", + ratio_str.c_str(), main_dev); + return true; +#else + (void)share_devices; (void)share_bytes; (void)out_spec; + return false; +#endif + }; + // Build the layer-split MultiBackendSpec for a component. Only used // when auto-fit picked GPU_LAYER_SPLIT for this component. 
// - main_backend: the runner's primary backend (also first in the spec) @@ -750,12 +844,19 @@ class StableDiffusionGGML { MultiBackendSpec dit_spec; bool dit_spec_active = false; if (!fit_dit_split_device_names.empty()) { - dit_spec_active = prepare_layer_split_spec(diffusion_backend, - fit_dit_split_device_names, - fit_dit_split_share_bytes, - "model.diffusion_model.", - fit_dit_extra_backends, - dit_spec); + if (fit_dit_row_split) { + dit_spec_active = prepare_row_split_spec(fit_dit_split_device_names, + fit_dit_split_share_bytes, + fit_dit_extra_backends, + dit_spec); + } else { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } } // Lambda to set the pending spec immediately before constructing the // diffusion model. Caller must invoke this on the same line / right @@ -799,14 +900,21 @@ class StableDiffusionGGML { LOG_INFO("CLIP: using device %s", clip_dev_name); } // Now that clip_backend is resolved, build the conditioner's - // layer-split spec if auto-fit picked it. + // multi-GPU spec if auto-fit picked one (row-split or layer-split). 
if (!fit_cond_split_device_names.empty()) { - cond_spec_active = prepare_layer_split_spec(clip_backend, - fit_cond_split_device_names, - fit_cond_split_share_bytes, - "text_encoders.", // covers text_encoders.llm.* and text_encoders.t5xxl.* - fit_cond_extra_backends, - cond_spec); + if (fit_cond_row_split) { + cond_spec_active = prepare_row_split_spec(fit_cond_split_device_names, + fit_cond_split_share_bytes, + fit_cond_extra_backends, + cond_spec); + } else { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", + fit_cond_extra_backends, + cond_spec); + } } if (sd_version_is_sd3(version)) { prime_cond_spec(); @@ -2701,6 +2809,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; sd_ctx_params->quiet_unknown_tensors = false; } @@ -2749,6 +2858,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "auto_fit_compute_reserve_vae_mb: %d\n" "auto_fit_compute_reserve_cond_mb: %d\n" "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" "quiet_unknown_tensors: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" @@ -2795,6 +2905,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit_compute_reserve_vae_mb, sd_ctx_params->auto_fit_compute_reserve_cond_mb, BOOL_STR(sd_ctx_params->auto_multi_gpu), + SAFE_STR(sd_ctx_params->multi_gpu_mode), BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), From 27b1ed3f34aa47784e12c91d85652f3dcc71ba7e Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 1 May 2026 16:37:48 +0200 Subject: [PATCH 8/9] ggml.patch: cuda_split_buffer_type pool allocator cuda_split_buffer_type::init_tensor previously did one raw cudaMalloc per tensor per 
device. Each call returns memory from CUDA's bucketed reuse pool; when
the buffer is freed and a new split buffer is allocated, the driver
doesn't coalesce returned chunks into a contiguous range. With multiple
sequential row-split loads in the same process (e.g. row-split
conditioner -> free -> row-split DiT), the second load OOMs on the
smaller GPU even when the planner's MAX-based peak says memory should
be available.

This patch:

1. Pre-allocates one contiguous cudaMalloc per device sized by the
   tensor_split ratio + a 16 MiB safety margin in alloc_buffer.
2. Bump-allocates from each per-device pool in init_tensor, falling
   back to per-tensor cudaMalloc only for tail tensors whose per-device
   slice exceeds the margin (rounded sizes diverge from the
   ratio-derived bound).
3. Adds an early-return for view tensors in init_tensor (sched
   routinely calls init_tensor on views of split-backed weights).

The patch lives in this file rather than the vendored ggml submodule so
it survives future ggml syncs. Apply from the repository root with:

    git -C ggml apply ../ggml.patch

(the patch sits in the superproject root, so it must be referenced
relative to the ggml/ submodule directory).

Verified with sd.cpp's auto-fit pipeline (3080 + 5060 Ti):

- Heavy quants (Q6_K DiT 17.7 GB + Q8_K_XL LLM 15.6 GB) at 256x256x9/4
  steps with both components row-split: generation 92s. Was OOM'ing on
  DiT load after Conditioner freed.
- Lighter quants (Q5_K_S DiT + IQ4_XS LLM) at 640x480x25/8: DiT
  row-split, Conditioner single-GPU, 131s. Unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) --- ggml.patch | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 ggml.patch diff --git a/ggml.patch b/ggml.patch new file mode 100644 index 000000000..0515013e1 --- /dev/null +++ b/ggml.patch @@ -0,0 +1,184 @@ +diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu +index cc80eb3f..a73ef0de 100644 +--- a/src/ggml-cuda/ggml-cuda.cu ++++ b/src/ggml-cuda/ggml-cuda.cu +@@ -832,6 +832,19 @@ struct ggml_backend_cuda_split_buffer_type_context { + }; + + struct ggml_backend_cuda_split_buffer_context { ++ // Per-device pool: one contiguous cudaMalloc per device, sub-allocated ++ // by init_tensor. Replaces the previous per-tensor cudaMalloc to avoid ++ // bucketed-free fragmentation when multiple split buffers are loaded ++ // and freed sequentially (e.g. row-split conditioner -> row-split DiT). ++ char * pool_base[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_size[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_used[GGML_CUDA_MAX_DEVICES] = {}; ++ // Side-allocations for tensors whose per-device slice didn't fit in the ++ // pool (row-split rounding skews per-device sizes off the planner's ++ // ratio). These do hit the per-tensor cudaMalloc path but only for the ++ // tail few tensors, not all of them. 
++ std::vector pool_overflow_ptrs[GGML_CUDA_MAX_DEVICES]; ++ + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { +@@ -840,12 +853,22 @@ struct ggml_backend_cuda_split_buffer_context { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } +- if (extra->data_device[id] != nullptr) { +- CUDA_CHECK(cudaFree(extra->data_device[id])); +- } ++ // tensor data lives inside per-device pool or pool_overflow_ptrs; freed below + } + delete extra; + } ++ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { ++ if (pool_base[id] == nullptr && pool_overflow_ptrs[id].empty()) { ++ continue; // never touched this device — skip set_device ++ } ++ ggml_cuda_set_device(id); ++ for (char * p : pool_overflow_ptrs[id]) { ++ if (p != nullptr) CUDA_CHECK(cudaFree(p)); ++ } ++ if (pool_base[id] != nullptr) { ++ CUDA_CHECK(cudaFree(pool_base[id])); ++ } ++ } + } + + std::vector tensor_extras; +@@ -865,7 +888,13 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff + } + + static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ++ // Views: storage comes from view_src, so this split buffer has nothing ++ // to allocate for the view. Sched routes any op that consumes the view ++ // through view_src's backend. Mirrors the non-split buffer init's ++ // early-return for views. 
++ if (tensor->view_src != nullptr) { ++ return GGML_STATUS_SUCCESS; ++ } + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; +@@ -876,6 +905,10 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + ctx->tensor_extras.push_back(extra); + ++ // 256-byte alignment is the CUDA default and matches what plain ++ // cudaMalloc returns; matmul kernels assume at least this. ++ constexpr size_t SPLIT_POOL_ALIGN = 256; ++ + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); +@@ -893,11 +926,34 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + +- // FIXME: do not crash if cudaMalloc fails +- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); +- char * buf; +- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ ++ char * buf = nullptr; ++ if (ctx->pool_base[id] != nullptr) { ++ // Pool path: bump-allocate inside the pre-allocated per-device ++ // slab. Avoids the per-tensor cudaMalloc fragmentation that ++ // breaks sequential row-split loads (Cond -> free -> DiT). ++ size_t off = (ctx->pool_used[id] + SPLIT_POOL_ALIGN - 1) & ~(SPLIT_POOL_ALIGN - 1); ++ if (off + size <= ctx->pool_size[id]) { ++ buf = ctx->pool_base[id] + off; ++ ctx->pool_used[id] = off + size; ++ } else { ++ // Pool exhausted (per-device share computation undershoot ++ // because row-split rounding skews per-device sizes away ++ // from tensor_split ratios). 
Fall back to a side cudaMalloc ++ // for this tensor's slice; freed by the per-tensor branch ++ // in the dtor. Most tensors still hit the pool; only the ++ // tail few that don't fit pay the fragmentation cost. ++ CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id)); ++ ctx->pool_overflow_ptrs[id].push_back(buf); ++ } ++ } else { ++ // Fallback for the legacy path (pool alloc failed in alloc_buffer ++ // or some caller bypassed the pool). Per-tensor cudaMalloc. ++ // FIXME: do not crash if cudaMalloc fails ++ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ++ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ } + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { +@@ -1022,12 +1078,64 @@ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + } + + static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point +- // instead, we allocate them for each tensor separately in init_tensor +- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, +- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. ++ // size is the cumulative max across ALL devices and ALL tensors (sum of ++ // get_alloc_size). Pre-allocate one contiguous slab per device sized by ++ // the tensor_split ratio + a small safety margin for per-tensor padding ++ // rounding. init_tensor then bump-allocates inside these slabs. ++ // ++ // Why: per-tensor cudaMalloc fragments the CUDA driver's free-list when ++ // the buffer is freed (driver keeps bucketed reuse pools). When two ++ // split buffers are loaded sequentially (e.g. 
row-split conditioner -> ++ // free -> row-split DiT), the second load OOMs even when the planner's ++ // MAX-based peak says memory should be available. ++ ggml_backend_cuda_split_buffer_type_context * buft_ctx = ++ (ggml_backend_cuda_split_buffer_type_context *)buft->context; + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + ++ const int dev_count = ggml_backend_cuda_get_device_count(); ++ ++ // tensor_split is cumulative offsets in [0, 1]: device i covers ++ // [tensor_split[i], tensor_split[i+1]). Its share of the total is the ++ // delta. The last device gets up to 1.0. ++ bool pool_alloc_ok = true; ++ for (int id = 0; id < dev_count; ++id) { ++ const float lo = buft_ctx->tensor_split[id]; ++ const float hi = (id == dev_count - 1) ? 1.0f : buft_ctx->tensor_split[id + 1]; ++ const float frac = hi - lo; ++ if (frac <= 0.0f) { ++ continue; ++ } ++ // Safety margin: each tensor's per-device slice can pad up to ++ // (MATRIX_ROW_PADDING - 1) elements * row_size bytes. With many ++ // tensors that adds up; size_t(frac * size) plus 16 MiB cushion ++ // covers it for typical row counts. Plus one pool-alignment quantum ++ // per tensor would be tighter but harder to compute upfront. ++ size_t per_dev = size_t((double)frac * (double)size) + size_t(16) * 1024 * 1024; ++ ggml_cuda_set_device(id); ++ cudaError_t err = ggml_cuda_device_malloc((void **)&ctx->pool_base[id], per_dev, id); ++ if (err != cudaSuccess) { ++ GGML_LOG_WARN("%s: split pool alloc failed on device %d (%zu bytes, frac=%.3f); " ++ "falling back to per-tensor cudaMalloc\n", ++ __func__, id, per_dev, frac); ++ ctx->pool_base[id] = nullptr; ++ pool_alloc_ok = false; ++ // Don't bail — release any pools we've already taken so we don't ++ // hold partial pools while running fragmented anyway. 
++ break; ++ } ++ ctx->pool_size[id] = per_dev; ++ } ++ if (!pool_alloc_ok) { ++ for (int id = 0; id < dev_count; ++id) { ++ if (ctx->pool_base[id] != nullptr) { ++ ggml_cuda_set_device(id); ++ CUDA_CHECK(cudaFree(ctx->pool_base[id])); ++ ctx->pool_base[id] = nullptr; ++ ctx->pool_size[id] = 0; ++ } ++ } ++ } ++ + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); + } + From 04fb57f81f7c199755a3319c7feada4cb955a11d Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sat, 2 May 2026 16:05:25 +0200 Subject: [PATCH 9/9] review: address PR #1470 #ifdef and old-flag-alias feedback Two changes from wbruna's review: 1. Replace `#ifdef SD_USE_CUDA` blocks with runtime backend dispatch. - Add `ggml_backend_split_buffer_type(backend, ...)` helper in `ggml_extend_backend.hpp` that looks up `ggml_backend_split_buffer_type` via `reg_get_proc_address`. Both CUDA and SYCL publish this proc, so row-split is no longer compile-time gated to CUDA. - Drop `#include "ggml-cuda.h"` and the `#ifdef SD_USE_CUDA` blocks in `ggml_extend.hpp` (constructor + `alloc_params_buffer_row_split`). - In `stable-diffusion.cpp::prepare_row_split_spec`, derive the backend registry from the device-name prefix (CUDA0 -> reg "CUDA", SYCL1 -> reg "SYCL") instead of calling `ggml_backend_cuda_get_device_count`. - Drop `add_definitions(-DSD_USE_CUDA)` from CMakeLists.txt; the macro is no longer referenced. 2. Restore the removed CPU-placement flags as soft-deprecated aliases (matching the existing `--qwen2vl` / `--qwen2vl_vision` deprecation pattern). 
Each alias sets the new `--*-backend-device` to "CPU" and disables auto-fit so the placement is honored verbatim: - `--clip-on-cpu` -> `--clip-backend-device CPU` - `--vae-on-cpu` -> `--vae-backend-device CPU` - `--control-net-cpu` -> `--control-net-backend-device CPU` Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 1 - examples/common/common.cpp | 28 +++++++++++++++++++++ src/ggml_extend.hpp | 31 +++++++++-------------- src/ggml_extend_backend.hpp | 18 ++++++++++++++ src/stable-diffusion.cpp | 49 ++++++++++++++++++++++--------------- 5 files changed, 87 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32375b163..48ce456ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,6 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF if(SD_CUDA) message("-- Use CUDA as backend stable-diffusion") set(GGML_CUDA ON) - add_definitions(-DSD_USE_CUDA) endif() if(SD_METAL) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 36f1c6a86..792b580c5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -645,6 +645,34 @@ ArgOptions SDContextParams::get_options() { std::exit(0); return 0; }}, + // Soft-deprecated aliases for the old per-component CPU-placement + // toggles. They map onto the new --*-backend-device strings and also + // disable auto-fit so the placement is honored verbatim (matching + // the pre-auto-fit behavior these flags expressed). + {"", + "--clip-on-cpu", + "alias of --clip-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + clip_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--vae-on-cpu", + "alias of --vae-backend-device CPU (also disables --auto-fit). 
Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + vae_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--control-net-cpu", + "alias of --control-net-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + control_net_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, }; return options; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index de2bbcd2b..ea8a28812 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -25,9 +25,6 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif #include "ggml_extend_backend.hpp" #include "model.h" @@ -2158,26 +2155,21 @@ struct GGMLRunner { cpu_fallback_backend = pending->cpu_fallback; row_split_ratios = pending->tensor_split_ratios; row_main_device = pending->main_device; -#ifdef SD_USE_CUDA if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { - row_split_buft = ggml_backend_cuda_split_buffer_type( + row_split_buft = ggml_backend_split_buffer_type( + runtime_backend, row_main_device, row_split_ratios.empty() ? 
nullptr : row_split_ratios.data()); if (row_split_buft == nullptr) { - LOG_WARN("multi-backend: cuda split buft init failed; " - "falling back to single-backend mode"); + LOG_WARN("multi-backend: row-split buft init failed " + "(backend does not publish " + "ggml_backend_split_buffer_type); falling back " + "to single-backend mode"); multi_backend_mode = false; additional_backends.clear(); cpu_fallback_backend = nullptr; } } -#else - if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { - LOG_WARN("multi-backend: row-split requires CUDA; " - "falling back to single-backend mode"); - multi_backend_mode = false; - } -#endif if (multi_backend_mode && offload_params_to_cpu) { LOG_WARN("multi-backend split is incompatible with " "offload_params_to_cpu; ignoring offload"); @@ -2377,7 +2369,12 @@ struct GGMLRunner { } bool alloc_params_buffer_row_split() { -#ifdef SD_USE_CUDA + if (row_split_buft == nullptr) { + LOG_ERROR("alloc_params_buffer_row_split: row-split buft not " + "initialized (backend lacks " + "ggml_backend_split_buffer_type)"); + return false; + } ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); const size_t main_align = ggml_backend_buft_get_alignment(main_buft); const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); @@ -2442,10 +2439,6 @@ struct GGMLRunner { main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count); return true; -#else - LOG_ERROR("alloc_params_buffer_row_split called without CUDA"); - return false; -#endif } // Internal: always materializes the params buffer. 
Used by both the diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index 50158c883..6d60a73ec 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -121,6 +121,24 @@ __STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backen } } +// Runtime lookup of a backend's row-split buffer type (currently published by +// the CUDA and SYCL backends as `ggml_backend_split_buffer_type` in their +// reg_get_proc_address tables). Returns nullptr when the backend does not +// support row-split, leaving the caller to fall back to a non-split path. +using __ggml_backend_split_buffer_type_t = ggml_backend_buffer_type_t (*)(int main_device, const float* tensor_split); + +__STATIC_INLINE__ ggml_backend_buffer_type_t ggml_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend); + if (reg == nullptr) { + return nullptr; + } + auto fn = reinterpret_cast<__ggml_backend_split_buffer_type_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type")); + if (fn == nullptr) { + return nullptr; + } + return fn(main_device, tensor_split); +} + __STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { if (tensor == nullptr) { return nullptr; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 682b2347d..c389c6242 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -614,25 +614,38 @@ class StableDiffusionGGML { const std::vector& share_bytes, std::vector& out_extra_backends, MultiBackendSpec& out_spec) -> bool { -#ifdef SD_USE_CUDA - const int cuda_count = ggml_backend_cuda_get_device_count(); - if (cuda_count <= 0 || share_devices.size() < 2) return false; - - // Map device names like "CUDA0" -> 0, "CUDA1" -> 1, ... 
- auto cuda_index_of = [](const std::string& name) -> int { - if (name.rfind("CUDA", 0) != 0) return -1; - try { return std::stoi(name.substr(4)); } catch (...) { return -1; } + if (share_devices.size() < 2) return false; + + // Derive the backend registry from the device-name prefix (e.g. + // "CUDA0" -> reg "CUDA", "SYCL1" -> reg "SYCL"). This keeps the + // code backend-agnostic: any backend whose registry publishes + // `ggml_backend_split_buffer_type` via reg_get_proc_address can + // drive row-split, not just CUDA. + auto reg_prefix_of = [](const std::string& name) -> std::string { + size_t i = 0; + while (i < name.size() && (std::isalpha((unsigned char)name[i]) || name[i] == '_')) i++; + return name.substr(0, i); + }; + const std::string reg_name = reg_prefix_of(share_devices[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) return false; + const int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) return false; + + auto reg_index_of = [&](const std::string& name) -> int { + if (name.rfind(reg_name, 0) != 0) return -1; + try { return std::stoi(name.substr(reg_name.size())); } catch (...) 
{ return -1; } }; - std::vector ratios(cuda_count, 0.0f); + std::vector ratios(dev_count, 0.0f); int64_t total = 0; for (auto b : share_bytes) total += b; if (total <= 0) return false; int main_dev = -1; int64_t max_share = -1; for (size_t k = 0; k < share_devices.size(); k++) { - int idx = cuda_index_of(share_devices[k]); - if (idx < 0 || idx >= cuda_count) continue; + int idx = reg_index_of(share_devices[k]); + if (idx < 0 || idx >= dev_count) continue; ratios[idx] = float(double(share_bytes[k]) / double(total)); if (share_bytes[k] > max_share) { max_share = share_bytes[k]; @@ -641,12 +654,12 @@ class StableDiffusionGGML { } if (main_dev < 0) return false; - // Init extra CUDA backends for the non-main devices so sched - // can route ops across them (row-split tensors are dispatched - // by the CUDA backend; ggml-sched still needs all participating + // Init extra backends for the non-main devices so sched can + // route ops across them (row-split tensors are dispatched by the + // primary backend; ggml-sched still needs all participating // backends in its list to schedule cross-device copies). for (size_t k = 0; k < share_devices.size(); k++) { - int idx = cuda_index_of(share_devices[k]); + int idx = reg_index_of(share_devices[k]); if (idx == main_dev || idx < 0) continue; ggml_backend_t b = init_named_backend(share_devices[k]); if (b != nullptr) { @@ -665,7 +678,7 @@ class StableDiffusionGGML { out_spec.cpu_fallback = nullptr; std::string ratio_str; - for (int i = 0; i < cuda_count; i++) { + for (int i = 0; i < dev_count; i++) { if (i > 0) ratio_str += ","; char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); ratio_str += buf; @@ -673,10 +686,6 @@ class StableDiffusionGGML { LOG_INFO("row-split spec: ratios=[%s] main_device=%d", ratio_str.c_str(), main_dev); return true; -#else - (void)share_devices; (void)share_bytes; (void)out_spec; - return false; -#endif }; // Build the layer-split MultiBackendSpec for a component. Only used