diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 1a5399b82..792b580c5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -380,6 +380,54 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "ggml device name to use as the main backend (see --list-devices). " + "When unset, the first GPU device is used.", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "ggml device name for the diffusion / flow model. " + "Falls back to --main-backend-device.", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "ggml device name for the text encoders. " + "Falls back to --main-backend-device.", + &clip_backend_device}, + {"", + "--vae-backend-device", + "ggml device name for the VAE. Falls back to --main-backend-device.", + &vae_backend_device}, + {"", + "--control-net-backend-device", + "ggml device name for the ControlNet. " + "Falls back to --main-backend-device.", + &control_net_backend_device}, + {"", + "--tae-backend-device", + "ggml device name for the TAE (currently routed through main).", + &tae_backend_device}, + {"", + "--upscaler-backend-device", + "ggml device name for the upscaler (currently routed through main).", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "ggml device name for PhotoMaker (currently routed through main).", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "ggml device name for the vision model (currently routed through main).", + &vision_backend_device}, + {"", + "--multi-gpu-mode", + "auto-fit multi-GPU split mechanism: 'row' (default; CUDA-only " + "row-split via cuda_split_buffer_type, single backend, smaller " + "compute buffer), 'layer' (block-indexed tensors split across " + "per-block backends + sched, generic but ~2x activation cost at " + "boundaries), or 'off' (never split a single component)", + &multi_gpu_mode}, }; options.int_options = { @@ -392,6 +440,23 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, + {"", + "--fit-compute-reserve-dit", + "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " + "(0 keeps the built-in default)", + &auto_fit_compute_reserve_dit_mb}, + {"", + "--fit-compute-reserve-vae", + "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", + &auto_fit_compute_reserve_vae_mb}, + {"", + "--fit-compute-reserve-cond", + "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", + &auto_fit_compute_reserve_cond_mb}, }; options.float_options = {}; @@ -409,18 +474,6 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -461,6 +514,30 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit 
and use the explicit *-backend-device flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + "--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, + {"", + "--quiet-unknown-tensors", + "suppress per-tensor 'unknown tensor X in model file' log lines " + "(useful for LTX-2 and similar models that ship many unused " + "tensors); a single summary line with the count is logged instead", + true, &quiet_unknown_tensors}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -559,6 +636,43 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, + // Soft-deprecated aliases for the old per-component CPU-placement + // toggles. They map onto the new --*-backend-device strings and also + // disable auto-fit so the placement is honored verbatim (matching + // the pre-auto-fit behavior these flags expressed). + {"", + "--clip-on-cpu", + "alias of --clip-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + clip_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--vae-on-cpu", + "alias of --vae-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + vae_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, + {"", + "--control-net-cpu", + "alias of --control-net-backend-device CPU (also disables --auto-fit). Deprecated.", + [this](int /*argc*/, const char** /*argv*/, int /*index*/) { + control_net_backend_device = "CPU"; + auto_fit = false; + return 0; + }}, }; return options; @@ -671,9 +785,21 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " main_backend_device: \"" << main_backend_device << "\",\n" + << " diffusion_backend_device: \"" << diffusion_backend_device << "\",\n" + << " clip_backend_device: \"" << clip_backend_device << "\",\n" + << " vae_backend_device: \"" << vae_backend_device << "\",\n" + << " control_net_backend_device: \"" << control_net_backend_device << "\",\n" + << " tae_backend_device: \"" << tae_backend_device << "\",\n" + << " upscaler_backend_device: \"" << upscaler_backend_device << "\",\n" + << " photomaker_backend_device: \"" << photomaker_backend_device << "\",\n" + << " vision_backend_device: \"" << vision_backend_device << "\",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? 
"true" : "false") << ",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" + << " quiet_unknown_tensors: " << (quiet_unknown_tensors ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -729,9 +855,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, + main_backend_device.empty() ? nullptr : main_backend_device.c_str(), + diffusion_backend_device.empty() ? nullptr : diffusion_backend_device.c_str(), + clip_backend_device.empty() ? nullptr : clip_backend_device.c_str(), + vae_backend_device.empty() ? nullptr : vae_backend_device.c_str(), + control_net_backend_device.empty() ? nullptr : control_net_backend_device.c_str(), + tae_backend_device.empty() ? nullptr : tae_backend_device.c_str(), + upscaler_backend_device.empty() ? nullptr : upscaler_backend_device.c_str(), + photomaker_backend_device.empty() ? nullptr : photomaker_backend_device.c_str(), + vision_backend_device.empty() ? nullptr : vision_backend_device.c_str(), flash_attn, diffusion_flash_attn, taesd_preview, @@ -744,6 +876,15 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, + auto_fit, + auto_fit_target_mb, + auto_fit_dry_run, + auto_fit_compute_reserve_dit_mb, + auto_fit_compute_reserve_vae_mb, + auto_fit_compute_reserve_cond_mb, + auto_multi_gpu, + multi_gpu_mode.empty() ? nullptr : multi_gpu_mode.c_str(), + quiet_unknown_tensors, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index c4498c352..1df32f9c0 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -110,9 +110,15 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; + std::string main_backend_device; + std::string diffusion_backend_device; + std::string clip_backend_device; + std::string vae_backend_device; + std::string control_net_backend_device; + std::string tae_backend_device; + std::string upscaler_backend_device; + std::string photomaker_backend_device; + std::string vision_backend_device; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -128,6 +134,23 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. + bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + int auto_fit_compute_reserve_dit_mb = 0; + int auto_fit_compute_reserve_vae_mb = 0; + int auto_fit_compute_reserve_cond_mb = 0; + bool auto_multi_gpu = true; + // "row" (default), "layer", or "off". Selects the multi-GPU split + // mechanism the auto-fit planner is allowed to emit. + std::string multi_gpu_mode = "row"; + + // When set, the model loader skips per-tensor "unknown tensor" log + // lines and instead emits a single summary count at the end of load. 
+ bool quiet_unknown_tensors = false; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/ggml.patch b/ggml.patch new file mode 100644 index 000000000..0515013e1 --- /dev/null +++ b/ggml.patch @@ -0,0 +1,184 @@ +diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu +index cc80eb3f..a73ef0de 100644 +--- a/src/ggml-cuda/ggml-cuda.cu ++++ b/src/ggml-cuda/ggml-cuda.cu +@@ -832,6 +832,19 @@ struct ggml_backend_cuda_split_buffer_type_context { + }; + + struct ggml_backend_cuda_split_buffer_context { ++ // Per-device pool: one contiguous cudaMalloc per device, sub-allocated ++ // by init_tensor. Replaces the previous per-tensor cudaMalloc to avoid ++ // bucketed-free fragmentation when multiple split buffers are loaded ++ // and freed sequentially (e.g. row-split conditioner -> row-split DiT). ++ char * pool_base[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_size[GGML_CUDA_MAX_DEVICES] = {}; ++ size_t pool_used[GGML_CUDA_MAX_DEVICES] = {}; ++ // Side-allocations for tensors whose per-device slice didn't fit in the ++ // pool (row-split rounding skews per-device sizes off the planner's ++ // ratio). These do hit the per-tensor cudaMalloc path but only for the ++ // tail few tensors, not all of them. ++ std::vector pool_overflow_ptrs[GGML_CUDA_MAX_DEVICES]; ++ + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { +@@ -840,12 +853,22 @@ struct ggml_backend_cuda_split_buffer_context { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } +- if (extra->data_device[id] != nullptr) { +- CUDA_CHECK(cudaFree(extra->data_device[id])); +- } ++ // tensor data lives inside per-device pool or pool_overflow_ptrs; freed below + } + delete extra; + } ++ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { ++ if (pool_base[id] == nullptr && pool_overflow_ptrs[id].empty()) { ++ continue; // never touched this device — skip set_device ++ } ++ ggml_cuda_set_device(id); ++ for (char * p : pool_overflow_ptrs[id]) { ++ if (p != nullptr) CUDA_CHECK(cudaFree(p)); ++ } ++ if (pool_base[id] != nullptr) { ++ CUDA_CHECK(cudaFree(pool_base[id])); ++ } ++ } + } + + std::vector tensor_extras; +@@ -865,7 +888,13 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff + } + + static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ++ // Views: storage comes from view_src, so this split buffer has nothing ++ // to allocate for the view. Sched routes any op that consumes the view ++ // through view_src's backend. Mirrors the non-split buffer init's ++ // early-return for views. ++ if (tensor->view_src != nullptr) { ++ return GGML_STATUS_SUCCESS; ++ } + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; +@@ -876,6 +905,10 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + ctx->tensor_extras.push_back(extra); + ++ // 256-byte alignment is the CUDA default and matches what plain ++ // cudaMalloc returns; matmul kernels assume at least this. 
++ constexpr size_t SPLIT_POOL_ALIGN = 256; ++ + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); +@@ -893,11 +926,34 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + +- // FIXME: do not crash if cudaMalloc fails +- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); +- char * buf; +- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ ++ char * buf = nullptr; ++ if (ctx->pool_base[id] != nullptr) { ++ // Pool path: bump-allocate inside the pre-allocated per-device ++ // slab. Avoids the per-tensor cudaMalloc fragmentation that ++ // breaks sequential row-split loads (Cond -> free -> DiT). ++ size_t off = (ctx->pool_used[id] + SPLIT_POOL_ALIGN - 1) & ~(SPLIT_POOL_ALIGN - 1); ++ if (off + size <= ctx->pool_size[id]) { ++ buf = ctx->pool_base[id] + off; ++ ctx->pool_used[id] = off + size; ++ } else { ++ // Pool exhausted (per-device share computation undershoot ++ // because row-split rounding skews per-device sizes away ++ // from tensor_split ratios). Fall back to a side cudaMalloc ++ // for this tensor's slice; freed by the per-tensor branch ++ // in the dtor. Most tensors still hit the pool; only the ++ // tail few that don't fit pay the fragmentation cost. ++ CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id)); ++ ctx->pool_overflow_ptrs[id].push_back(buf); ++ } ++ } else { ++ // Fallback for the legacy path (pool alloc failed in alloc_buffer ++ // or some caller bypassed the pool). Per-tensor cudaMalloc. ++ // FIXME: do not crash if cudaMalloc fails ++ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ++ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); ++ } + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { +@@ -1022,12 +1078,64 @@ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + } + + static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point +- // instead, we allocate them for each tensor separately in init_tensor +- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, +- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. ++ // size is the cumulative max across ALL devices and ALL tensors (sum of ++ // get_alloc_size). Pre-allocate one contiguous slab per device sized by ++ // the tensor_split ratio + a small safety margin for per-tensor padding ++ // rounding. init_tensor then bump-allocates inside these slabs. ++ // ++ // Why: per-tensor cudaMalloc fragments the CUDA driver's free-list when ++ // the buffer is freed (driver keeps bucketed reuse pools). When two ++ // split buffers are loaded sequentially (e.g. row-split conditioner -> ++ // free -> row-split DiT), the second load OOMs even when the planner's ++ // MAX-based peak says memory should be available. 
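The idea is easier to see stripped of the CUDA plumbing: one up-front slab per device that later requests bump-allocate from, and a single free that releases everything at once. A minimal sketch, with std::malloc standing in for ggml_cuda_device_malloc and the overflow fallback reduced to returning nullptr:

```cpp
#include <cstddef>
#include <cstdlib>

// Minimal sketch of the per-device slab + bump-allocation pattern: one big
// allocation up front, sub-allocations are just an aligned offset bump, and
// teardown is a single free instead of one free per tensor.
struct BumpPool {
    char*  base = nullptr;
    size_t size = 0;
    size_t used = 0;

    bool init(size_t bytes) {
        base = static_cast<char*>(std::malloc(bytes));  // stand-in for the device malloc
        size = bytes;
        used = 0;
        return base != nullptr;
    }
    void* alloc(size_t bytes, size_t align) {
        size_t off = (used + align - 1) & ~(align - 1);  // round up to alignment
        if (off + bytes > size) {
            return nullptr;  // exhausted: caller falls back to an individual allocation
        }
        used = off + bytes;
        return base + off;
    }
    void release() {
        std::free(base);  // everything goes back in one piece, no bucketed free-list
        base = nullptr;
        size = used = 0;
    }
};
```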
++ ggml_backend_cuda_split_buffer_type_context * buft_ctx = ++ (ggml_backend_cuda_split_buffer_type_context *)buft->context; + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + ++ const int dev_count = ggml_backend_cuda_get_device_count(); ++ ++ // tensor_split is cumulative offsets in [0, 1]: device i covers ++ // [tensor_split[i], tensor_split[i+1]). Its share of the total is the ++ // delta. The last device gets up to 1.0. ++ bool pool_alloc_ok = true; ++ for (int id = 0; id < dev_count; ++id) { ++ const float lo = buft_ctx->tensor_split[id]; ++ const float hi = (id == dev_count - 1) ? 1.0f : buft_ctx->tensor_split[id + 1]; ++ const float frac = hi - lo; ++ if (frac <= 0.0f) { ++ continue; ++ } ++ // Safety margin: each tensor's per-device slice can pad up to ++ // (MATRIX_ROW_PADDING - 1) elements * row_size bytes. With many ++ // tensors that adds up; size_t(frac * size) plus 16 MiB cushion ++ // covers it for typical row counts. Plus one pool-alignment quantum ++ // per tensor would be tighter but harder to compute upfront. ++ size_t per_dev = size_t((double)frac * (double)size) + size_t(16) * 1024 * 1024; ++ ggml_cuda_set_device(id); ++ cudaError_t err = ggml_cuda_device_malloc((void **)&ctx->pool_base[id], per_dev, id); ++ if (err != cudaSuccess) { ++ GGML_LOG_WARN("%s: split pool alloc failed on device %d (%zu bytes, frac=%.3f); " ++ "falling back to per-tensor cudaMalloc\n", ++ __func__, id, per_dev, frac); ++ ctx->pool_base[id] = nullptr; ++ pool_alloc_ok = false; ++ // Don't bail — release any pools we've already taken so we don't ++ // hold partial pools while running fragmented anyway. ++ break; ++ } ++ ctx->pool_size[id] = per_dev; ++ } ++ if (!pool_alloc_ok) { ++ for (int id = 0; id < dev_count; ++id) { ++ if (ctx->pool_base[id] != nullptr) { ++ ggml_cuda_set_device(id); ++ CUDA_CHECK(cudaFree(ctx->pool_base[id])); ++ ctx->pool_base[id] = nullptr; ++ ctx->pool_size[id] = 0; ++ } ++ } ++ } ++ + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); + } + diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 75027f8f8..7da5324b4 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -188,9 +188,18 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; + // Per-component backend device names (ggml device names). Empty / NULL + // means "use the main backend device". The strings are only borrowed for + // the duration of the init call. See sd_list_devices() for what to pass. + const char* main_backend_device; + const char* diffusion_backend_device; + const char* clip_backend_device; + const char* vae_backend_device; + const char* control_net_backend_device; + const char* tae_backend_device; + const char* upscaler_backend_device; + const char* photomaker_backend_device; + const char* vision_backend_device; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -203,6 +212,49 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), the *_backend_device strings are + // ignored and the plan is computed automatically. + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). 
+ // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the + // per-component compute-buffer reserve; 0 means use the built-in default. + bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + int auto_fit_compute_reserve_dit_mb; + int auto_fit_compute_reserve_vae_mb; + int auto_fit_compute_reserve_cond_mb; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. Set false to keep all components on a single GPU when + // they fit. Defaults to true. + bool auto_multi_gpu; + + // When auto_multi_gpu is true and a single component doesn't fit on + // one GPU, the planner can split it across multiple GPUs. Two + // mechanisms: + // "row": matmul weights row-split across CUDA devices via + // cuda_split_buffer_type. Single CUDA backend; no sched. + // Cheaper compute buffer (no cross-backend doubling) but + // CUDA-only. Default. + // "layer": block-indexed tensors assigned to per-block backends + // and routed via ggml_backend_sched. Generic across + // backends but costs ~2x activation memory at boundaries. + // "off": never split a single component across GPUs. Components + // that don't fit fall back to OFFLOAD or CPU. + // The string is parsed by backend_fit::str_to_multi_gpu_mode; if + // unrecognized, "row" is used. + const char* multi_gpu_mode; + + // Suppress per-tensor "unknown tensor 'X' in model file" log lines + // emitted during model loading. Useful for models like LTX-2 that + // ship hundreds of audio-branch / encoder tensors a video-only + // pipeline doesn't consume. A single summary line is logged at the + // end with the count of skipped tensors. + bool quiet_unknown_tensors; } sd_ctx_params_t; typedef struct { @@ -449,6 +501,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used +// directly as the value of --*-backend-device flags. +SD_API void sd_list_devices(void); + #ifdef __cplusplus } #endif diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..b95632750 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,652 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is treated as a single atomic unit that lives entirely on +// one device (plus its compute buffer on the same device). There is no +// intra-tensor row split: cross-device parallelism comes from placing +// different components on different GPUs, not from splitting individual +// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER +// at the component granularity. +// +// Placement priority: DiT + compute buffer -> VAE -> Conditioner. +// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). 
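Before the implementation, a hedged usage sketch of the planner this header defines: the device and component sizes below are fabricated (the real path fills them from enumerate_gpu_devices() and estimate_components()), but the types and calls are the ones declared further down.

```cpp
#include "backend_fit.hpp"  // assumes the project's include paths

inline void example_dry_run() {
    using namespace backend_fit;

    Device d0, d1;  // fabricated: a 24 GiB and a 12 GiB GPU
    d0.id = 0; d0.name = "CUDA0"; d0.description = "fake 24 GiB GPU";
    d0.free_bytes = 22000 * MiB; d0.total_bytes = 24576 * MiB;
    d1.id = 1; d1.name = "CUDA1"; d1.description = "fake 12 GiB GPU";
    d1.free_bytes = 11000 * MiB; d1.total_bytes = 12288 * MiB;
    std::vector<Device> devices = {d0, d1};

    Component dit, vae, cond;  // fabricated sizes: params + compute reserve
    dit.kind  = ComponentKind::DIT;         dit.name  = "DiT";
    dit.params_bytes  = 26000 * MiB; dit.compute_bytes  = 2048 * MiB; dit.supports_offload  = true;
    vae.kind  = ComponentKind::VAE;         vae.name  = "VAE";
    vae.params_bytes  = 1600 * MiB;  vae.compute_bytes  = 1024 * MiB;
    cond.kind = ComponentKind::CONDITIONER; cond.name = "Conditioner";
    cond.params_bytes = 9000 * MiB;  cond.compute_bytes = 512 * MiB;  cond.supports_offload = true;
    std::vector<Component> components = {dit, vae, cond};

    const int64_t margin = 512 * MiB;  // corresponds to --fit-target
    Plan plan = compute_plan(components, devices, margin,
                             /*allow_multi_gpu=*/true, MultiGpuMode::ROW);
    print_plan(plan, components, devices, margin);
    // With these numbers the DiT does not fit on either GPU alone, so a
    // row-split decision is the expected outcome in ROW mode.
}
```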
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model.h" +#include "util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) +}; + +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; + + // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs + // that share this component (in order) and each device's estimated share + // of the params. The order also defines block-range partitioning: the + // i-th device gets a contiguous range of blocks proportional to share[i]. + std::vector split_device_ids; + std::vector split_share_bytes; +}; + +struct Plan { + std::vector decisions; + std::map device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +enum class MultiGpuMode { + OFF, // never split a single component across GPUs + ROW, // CUDA-only: row-split matmul weights via cuda_split_buffer_type + LAYER, // generic: assign block-indexed tensors to per-block backends + sched +}; + +inline const char* multi_gpu_mode_str(MultiGpuMode m) { + switch (m) { + case MultiGpuMode::OFF: return "off"; + case MultiGpuMode::ROW: return "row"; + case MultiGpuMode::LAYER: return "layer"; + } + return "?"; +} + +inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) { + if (s == "off") return MultiGpuMode::OFF; + if (s == "row") return MultiGpuMode::ROW; + if (s == "layer") return MultiGpuMode::LAYER; + return MultiGpuMode::ROW; // default +} + +// --- Classification ------------------------------------------------------- + +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0 || + // Connector / text projection layers that run on the conditioner + // backend (e.g. LTX-2's text_embedding_projection: video/audio + // aggregate embeds + projection that map LLM hidden states into + // DiT-input space). 
+ name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { + out = ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +inline std::vector estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +inline std::vector enumerate_gpu_devices() { + std::vector out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- Core algorithm ------------------------------------------------------- + +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs) { + std::vector out(gpu_idxs.size(), 0); + int64_t total_free = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + total_free += std::max(0, devices[gpu_idxs[k]].free_bytes); + } + if (total_free <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(std::max(0, devices[gpu_idxs[k]].free_bytes)) / double(total_free); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + +// Peak per device = MAX of any single component's footprint on that device, +// because free_params_immediately frees params between phases so components +// time-share VRAM. 
+inline int64_t gpu_peak(int gpu_idx, + const std::vector& pl, + const std::vector& dev, + const std::vector& components, + const std::vector& devices = {}) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; + footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + /*compute_bytes=*/0, + devices, gpu_idxs); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. + const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. + int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector& components, + const std::vector& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } + + std::vector cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + // Layer-split is only meaningful for components made up of many similarly + // shaped blocks. DiT and Conditioner (LLM transformer) qualify; the VAE + // is too structurally heterogeneous for naive block partitioning. + auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + + auto build_options = [&](const Component& c) { + std::vector opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + // Multi-GPU split: one option type per mode. Encoded as a bitmask + // of participating GPUs in device_idx. + if (mode == MultiGpuMode::ROW && nG >= 2 && supports_layer_split(c.kind)) { + // Row-split spans all GPUs; single option with all bits set. 
+ int all_mask = (1 << nG) - 1; + opts.push_back({Placement::GPU_TENSOR_SPLIT, all_mask}); + } + if (mode == MultiGpuMode::LAYER && nG >= 2 && supports_layer_split(c.kind)) { + // Layer-split: enumerate non-trivial subsets (size >= 2). + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (__builtin_popcount(mask) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } + opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector& pl, const std::vector& dev) { + int64_t s = 0; + std::set gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if (pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. + s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). + s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector idx(nC, 0); + std::vector best_pl; + std::vector best_dev; + int64_t best_score = std::numeric_limits::min(); + bool found_any = false; + + while (true) { + std::vector pl(nC); + std::vector dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. 
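For intuition on the preference ordering the score lambda above encodes (the placement multipliers and the DiT priority weight of 300 are the ones used there; the rest is illustrative): a DiT that fits whole on one GPU always beats any split of the same DiT, and any GPU placement beats CPU by a wide margin.

```cpp
#include <cstdio>

// The per-placement multipliers from the score lambda above, applied to a
// single component in isolation (DiT priority weight 300).
enum class P { GPU, ROW_SPLIT, LAYER_SPLIT, OFFLOAD, CPU };

static long score_one(P placement, int priority_weight) {
    switch (placement) {
        case P::GPU:         return  10 * priority_weight;
        case P::ROW_SPLIT:   return   8 * priority_weight;
        case P::LAYER_SPLIT: return   7 * priority_weight;
        case P::OFFLOAD:     return   5 * priority_weight;
        case P::CPU:         return -10 * priority_weight;
    }
    return 0;
}

int main() {
    const int dit_weight = 300;
    std::printf("GPU %ld, row %ld, layer %ld, offload %ld, CPU %ld\n",
                score_one(P::GPU, dit_weight), score_one(P::ROW_SPLIT, dit_weight),
                score_one(P::LAYER_SPLIT, dit_weight), score_one(P::OFFLOAD, dit_weight),
                score_one(P::CPU, dit_weight));
    // -> GPU 3000, row 2400, layer 2100, offload 1500, CPU -3000
}
```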
+ if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, /*compute_bytes=*/0, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (gets the row-split's compute + // buffer + sub-runners that don't get their own spec). This + // matches the user's preference: always use the bigger GPU + // as main for splits. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + int64_t share = shares[k]; + if (pos == 0) share += c.compute_bytes; // main (= biggest) gets compute + d.split_share_bytes.push_back(share); + max_share = std::max(max_share, share); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. 
+ std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); + } + return plan; +} + +inline const char* placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector& components, + const std::vector& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { + std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? 
"row" : "layer"; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if (d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 9f4d45524..99e27ae39 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -87,6 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} + // Defer the LLM sub-runner's params alloc + read until first compute(). + // Only conditioners with a heavy LLM (e.g. LTX-2 Gemma) override this; + // others ignore the call. The callback is invoked AFTER the runner's + // alloc_params_buffer succeeds and is responsible for tensor data load. + virtual void set_llm_lazy_load(std::function /*fn*/) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index c0a2a11c0..d7ea6ede7 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -50,6 +50,10 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; + // Defer params alloc + tensor data load until the first compute() call. + // Default: no-op. Subclasses backed by a single GGMLRunner forward to + // its set_lazy_load. + virtual void set_lazy_load(std::function /*fn*/) {} }; struct UNetModel : public DiffusionModel { diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 8b748194f..ea8a28812 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1705,6 +1705,55 @@ struct GGMLRunnerContext { std::shared_ptr weight_adapter = nullptr; }; +// --------------------------------------------------------------------------- +// Multi-backend (layer-split) support +// --------------------------------------------------------------------------- +// A GGMLRunner can opt into "layer-split" mode where each weight tensor lives +// entirely on one of several backends, picked by a caller-supplied callback +// (typically based on the tensor name's block index). The runner switches +// from gallocr to ggml_backend_sched for graph compute, so cross-backend +// edges are routed automatically. 
+// +// This is the llama.cpp LLAMA_SPLIT_MODE_LAYER analogue. There is no +// intra-tensor row split, so every tensor lives on a single normal device +// buffer — views work without any ggml-cuda patch. +// +// To enable: populate g_pending_multi_backend_spec() with the additional +// backends + tensor->backend callback, then construct the GGMLRunner. The +// ctor consumes and clears the pending pointer. +enum class MultiBackendMode { + LAYER_SPLIT, // assign block-indexed tensors to per-block backends + sched + ROW_SPLIT, // CUDA split-buft: matmul weights row-split across devices +}; + +struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + + // Extra backends *in addition to* the runner's main runtime_backend. + // The first entry's role is the main backend; we don't list it here. + std::vector additional_backends; + + // LAYER_SPLIT: maps a weight tensor name to one of the runner's + // backends (the main runtime_backend, or one of additional_backends). + // Returning nullptr means "use the main runtime_backend". + std::function tensor_backend_fn; + + // ROW_SPLIT (CUDA-only): per-device row split ratios (length = total + // CUDA device count) and main device. Empty means use CUDA's default + // free-VRAM proportions. + std::vector tensor_split_ratios; + int main_device = 0; + + // Optional CPU backend appended last to the sched for unsupported-op + // fallback. May be nullptr. + ggml_backend_t cpu_fallback = nullptr; +}; + +__STATIC_INLINE__ MultiBackendSpec*& g_pending_multi_backend_spec() { + thread_local MultiBackendSpec* spec = nullptr; + return spec; +} + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1712,6 +1761,33 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; + // --- multi-backend state (layer-split via sched OR row-split via cuda_split_buft) --- + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; + std::vector additional_backends; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; + bool sched_reserved = false; + // Per-backend params buffers when LAYER_SPLIT is active. ROW_SPLIT uses + // a CUDA split-buft buffer + a regular buffer for non-split tensors, + // stored in row_split_buffer + row_main_buffer instead. + std::vector multi_params_buffers; + // ROW_SPLIT-only state. + std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; + ggml_backend_buffer_t row_main_buffer = nullptr; + + // Lazy load: when set, alloc_params_buffer becomes a no-op; the actual + // alloc + tensor-data load is deferred until the first compute(). The + // callback is invoked AFTER do_alloc_params_buffer succeeds and is + // responsible for populating tensor->data via ModelLoader. Used to keep + // peak VRAM per-component-MAX rather than sum-of-components at init. + std::function lazy_load_fn = nullptr; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; ggml_context* offload_ctx = nullptr; @@ -1859,7 +1935,56 @@ struct GGMLRunner { return gf; } + // Build the multi-backend sched (lazily). 
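On the caller side, the handshake described above (fill a MultiBackendSpec, publish it through g_pending_multi_backend_spec(), then construct the runner, whose constructor consumes and clears the pointer) looks roughly like this — a hedged sketch in which parse_block_index() is a placeholder for whatever name-based routing the caller implements:

```cpp
// Hedged caller-side sketch; relies on the MultiBackendSpec declarations above.
int parse_block_index(const std::string& tensor_name);  // placeholder, defined elsewhere

void publish_layer_split_spec(MultiBackendSpec& spec, ggml_backend_t second_backend) {
    spec.mode                = MultiBackendMode::LAYER_SPLIT;
    spec.additional_backends = {second_backend};  // the main backend is implicit
    spec.tensor_backend_fn   = [second_backend](const std::string& name) -> ggml_backend_t {
        // Route the upper blocks to the second GPU; returning nullptr keeps a
        // tensor on the runner's main runtime_backend. The block threshold is
        // illustrative only.
        return parse_block_index(name) >= 20 ? second_backend : nullptr;
    };
    g_pending_multi_backend_spec() = &spec;
    // The next GGMLRunner constructed on this thread consumes the spec (and
    // clears the pointer), so construct the runner right after this call and
    // keep `spec` alive until its constructor returns.
}
```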
+ bool ensure_sched() { + if (sched != nullptr) return true; + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + // ggml_backend_sched_new asserts the last backend is a CPU; create + // a CPU fallback if the caller didn't provide one. We own this + // instance and free it in the dtor below. + if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = ggml_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + sched = ggml_backend_sched_new(backends.data(), + /*bufts=*/nullptr, + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + bool alloc_compute_buffer(get_graph_cb_t get_graph) { + if (multi_backend_mode) { + if (sched_reserved) return true; + if (!ensure_sched()) return false; + reset_compute_ctx(); + ggml_cgraph* gf = get_compute_graph(get_graph); + backend_tensor_data_map.clear(); + if (!ggml_backend_sched_reserve(sched, gf)) { + LOG_ERROR("%s: sched reserve failed", get_desc().c_str()); + return false; + } + sched_reserved = true; + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) { + ggml_backend_t b = ggml_backend_sched_get_backend(sched, i); + size_t s = ggml_backend_sched_get_buffer_size(sched, b); + LOG_DEBUG("%s sched buf[%d] %s = %.2f MB", + get_desc().c_str(), i, ggml_backend_name(b), + s / (1024.f * 1024.f)); + } + return true; + } + if (compute_allocr != nullptr) { return true; } @@ -2018,6 +2143,40 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) : runtime_backend(backend) { + // Consume any pending multi-backend spec set by the caller via + // g_pending_multi_backend_spec(). + MultiBackendSpec* pending = g_pending_multi_backend_spec(); + if (pending != nullptr) { + g_pending_multi_backend_spec() = nullptr; + multi_backend_mode = true; + multi_backend_kind = pending->mode; + additional_backends = pending->additional_backends; + tensor_backend_fn = pending->tensor_backend_fn; + cpu_fallback_backend = pending->cpu_fallback; + row_split_ratios = pending->tensor_split_ratios; + row_main_device = pending->main_device; + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = ggml_backend_split_buffer_type( + runtime_backend, + row_main_device, + row_split_ratios.empty() ? 
nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("multi-backend: row-split buft init failed " + "(backend does not publish " + "ggml_backend_split_buffer_type); falling back " + "to single-backend mode"); + multi_backend_mode = false; + additional_backends.clear(); + cpu_fallback_backend = nullptr; + } + } + if (multi_backend_mode && offload_params_to_cpu) { + LOG_WARN("multi-backend split is incompatible with " + "offload_params_to_cpu; ignoring offload"); + offload_params_to_cpu = false; + } + } + alloc_params_ctx(); if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { params_backend = ggml_backend_cpu_init(); @@ -2035,6 +2194,16 @@ struct GGMLRunner { ggml_backend_free(params_backend); } free_cache_ctx_and_buffer(); + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } + // additional_backends are owned by the caller (see the MultiBackendSpec + // setup site in stable-diffusion.cpp); not freed here. } virtual GGMLRunnerContext get_context() { @@ -2054,7 +2223,234 @@ struct GGMLRunner { alloc_compute_ctx(); } - bool alloc_params_buffer() { + // Multi-backend params allocation: walk params_ctx, classify each tensor + // via tensor_backend_fn, allocate one buffer per backend on its default + // buft, bind tensors via ggml_tallocr. + bool alloc_params_buffer_layer_split() { + // Build the backend list (main first, then additional). Index 0 is + // the default for tensors whose callback returns nullptr. + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + + std::vector bufts; + bufts.reserve(backends.size()); + std::vector aligns(backends.size()); + std::vector sizes(backends.size(), 0); + std::vector counts(backends.size(), 0); + for (size_t i = 0; i < backends.size(); i++) { + bufts.push_back(ggml_backend_get_default_buffer_type(backends[i])); + aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + // Diagnostic: confirm we got a sensible buft from each backend. + const char* buft_name = ggml_backend_buft_name(bufts[i]); + const char* backend_name = ggml_backend_name(backends[i]); + ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); + enum ggml_backend_dev_type dev_type = dev ? ggml_backend_dev_type(dev) : GGML_BACKEND_DEVICE_TYPE_CPU; + const char* dev_name = dev ? ggml_backend_dev_name(dev) : "(none)"; + LOG_DEBUG("%s layer-split backend[%zu]=%s, buft=%s, dev=%s, dev_type=%d", + get_desc().c_str(), i, backend_name ? backend_name : "(null)", + buft_name ? buft_name : "(null)", dev_name, + (int)dev_type); + } + + // First pass: assign each tensor to a backend, accumulate sizes. + std::map tensor_backend_idx; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + int idx = 0; + if (tensor_backend_fn) { + ggml_backend_t target = tensor_backend_fn(t->name); + if (target != nullptr) { + for (size_t i = 0; i < backends.size(); i++) { + if (backends[i] == target) { + idx = int(i); + break; + } + } + } + } + tensor_backend_idx[t] = idx; + size_t s = ggml_backend_buft_get_alloc_size(bufts[idx], t); + sizes[idx] += GGML_PAD(s, aligns[idx]); + counts[idx] += 1; + } + + // Allocate one buffer per used backend. 
+ multi_params_buffers.assign(backends.size(), nullptr); + for (size_t i = 0; i < backends.size(); i++) { + if (sizes[i] == 0) continue; + // Diagnostic: query the device's free memory BEFORE alloc. + ggml_backend_dev_t dev_pre = ggml_backend_buft_get_device(bufts[i]); + size_t free_pre = 0, total_pre = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_pre, &total_pre); + multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); + if (multi_params_buffers[i] == nullptr) { + LOG_ERROR("%s alloc params buffer on backend %s failed (%.1f MB)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f)); + return false; + } + // Diagnostic: query AFTER alloc. The drop in free memory tells + // us whether the alloc actually went to GPU device memory or + // to a virtual reservation that's not yet committed. + size_t free_post = 0, total_post = 0; + if (dev_pre) ggml_backend_dev_memory(dev_pre, &free_post, &total_post); + int64_t actual_drop = (int64_t)free_pre - (int64_t)free_post; + void* base = ggml_backend_buffer_get_base(multi_params_buffers[i]); + size_t actual_sz = ggml_backend_buffer_get_size(multi_params_buffers[i]); + bool is_host = ggml_backend_buffer_is_host(multi_params_buffers[i]); + LOG_DEBUG("%s layer-split alloc[%zu] backend=%s req=%.1f MB actual=%.1f MB " + "dev_free %.1f -> %.1f MB (drop %.1f MB) base=%p is_host=%d", + get_desc().c_str(), i, ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), actual_sz / (1024.f * 1024.f), + free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f), + actual_drop / (1024.f * 1024.f), + base, (int)is_host); + } + + // Bind tensors via ggml_tallocr. + std::vector tallocs(backends.size()); + for (size_t i = 0; i < backends.size(); i++) { + if (multi_params_buffers[i] != nullptr) { + tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]); + } + } + for (auto& kv : tensor_backend_idx) { + ggml_status st = ggml_tallocr_alloc(&tallocs[kv.second], kv.first); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s tallocr_alloc failed for tensor %s", + get_desc().c_str(), kv.first->name); + return false; + } + } + // Diagnostic: pick a sample tensor per backend and confirm its + // buffer + data pointer. + std::vector sampled(backends.size(), false); + for (auto& kv : tensor_backend_idx) { + int idx = kv.second; + if (sampled[idx]) continue; + sampled[idx] = true; + ggml_tensor* t = kv.first; + LOG_DEBUG("%s layer-split sample[%d] tensor=%s buffer=%p data=%p buffer_is_host=%d", + get_desc().c_str(), idx, t->name, (void*)t->buffer, t->data, + t->buffer ? (int)ggml_backend_buffer_is_host(t->buffer) : -1); + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + } + + // Log the breakdown. + for (size_t i = 0; i < backends.size(); i++) { + if (counts[i] == 0) continue; + LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)", + get_desc().c_str(), + ggml_backend_name(backends[i]), + sizes[i] / (1024.f * 1024.f), + counts[i]); + } + return true; + } + + // Heuristic for row-split eligibility: contiguous, rank-2, both dims + // >= 256, and NOT a view. 1D biases / norms / embeddings / small + // projections / views fall back to the main GPU's regular per-device + // buft. Excluding views avoids the cuda split buft's + // GGML_ASSERT(view_src == nullptr) — sticking to the buft's documented + // contract instead of patching ggml. 
+ static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + bool alloc_params_buffer_row_split() { + if (row_split_buft == nullptr) { + LOG_ERROR("alloc_params_buffer_row_split: row-split buft not " + "initialized (backend lacks " + "ggml_backend_split_buffer_type)"); + return false; + } + ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); + const size_t main_align = ggml_backend_buft_get_alignment(main_buft); + const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); + + size_t main_size = 0, split_size = 0; + size_t main_count = 0, split_count = 0; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + if (is_row_split_eligible(t)) { + size_t s = ggml_backend_buft_get_alloc_size(row_split_buft, t); + split_size += GGML_PAD(s, split_align); + split_count++; + } else { + size_t s = ggml_backend_buft_get_alloc_size(main_buft, t); + main_size += GGML_PAD(s, main_align); + main_count++; + } + } + + if (main_size > 0) { + row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size); + if (row_main_buffer == nullptr) { + LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", + get_desc().c_str(), main_size / (1024.f * 1024.f)); + return false; + } + } + if (split_size > 0) { + row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size); + if (row_split_buffer == nullptr) { + LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", + get_desc().c_str(), split_size / (1024.f * 1024.f)); + return false; + } + } + + ggml_tallocr main_alloc{}; + ggml_tallocr split_alloc{}; + if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer); + if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer); + + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; + t = ggml_get_next_tensor(params_ctx, t)) { + ggml_status st = is_row_split_eligible(t) + ? ggml_tallocr_alloc(&split_alloc, t) + : ggml_tallocr_alloc(&main_alloc, t); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s row-split tallocr_alloc failed for tensor %s", + get_desc().c_str(), t->name); + return false; + } + } + + if (row_main_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + if (row_split_buffer != nullptr) { + ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)", + get_desc().c_str(), + main_size / (1024.f * 1024.f), main_count, + split_size / (1024.f * 1024.f), split_count); + return true; + } + + // Internal: always materializes the params buffer. Used by both the + // eager `alloc_params_buffer` path and the lazy `ensure_params_loaded` + // path; the latter must bypass the lazy-skip. 
+ bool do_alloc_params_buffer() { + if (multi_backend_mode && multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } + if (multi_backend_mode) { + return alloc_params_buffer_layer_split(); + } size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); if (params_buffer == nullptr) { @@ -2072,18 +2468,96 @@ struct GGMLRunner { return true; } + bool alloc_params_buffer() { + // Lazy mode: skip alloc until first compute() (via ensure_params_loaded). + // The caller still goes through alloc_params_buffer + get_param_tensors + // at init; ModelLoader::load_tensors will silently skip this runner's + // tensors (their data ptrs are null because no buffer is allocated yet) + // and the lazy_load_fn callback re-loads them on demand. + if (lazy_load_fn) return true; + return do_alloc_params_buffer(); + } + + void set_lazy_load(std::function fn) { + lazy_load_fn = std::move(fn); + } + + bool ensure_params_loaded() { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { + return true; + } + if (!lazy_load_fn) { + LOG_ERROR("%s: no params buffer and no lazy_load_fn", get_desc().c_str()); + return false; + } + int64_t t0 = ggml_time_ms(); + if (!do_alloc_params_buffer()) return false; + if (!lazy_load_fn()) { + LOG_ERROR("%s: lazy load callback failed", get_desc().c_str()); + return false; + } + int64_t t1 = ggml_time_ms(); + LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (t1 - t0) / 1000.f); + // Diagnostic: report device-memory free per backend AFTER load. + // If the bytes actually went to GPU, free should have decreased + // by ~params_size for each layer-split shard. 
+ if (multi_backend_mode) { + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + for (size_t i = 0; i < backends.size(); i++) { + ggml_backend_dev_t dev = ggml_backend_get_device(backends[i]); + if (!dev) continue; + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + LOG_DEBUG("%s post-load device %s free=%.1f MB / %.1f MB", + get_desc().c_str(), + ggml_backend_dev_name(dev), + free_b / (1024.f * 1024.f), + total_b / (1024.f * 1024.f)); + } + } + return true; + } + void free_params_buffer() { if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } } size_t get_params_buffer_size() { + size_t total = 0; if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); + total += ggml_backend_buffer_get_size(params_buffer); } - return 0; + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) total += ggml_backend_buffer_get_size(buf); + } + if (row_split_buffer != nullptr) total += ggml_backend_buffer_get_size(row_split_buffer); + if (row_main_buffer != nullptr) total += ggml_backend_buffer_get_size(row_main_buffer); + return total; } void free_cache_ctx_and_buffer() { @@ -2096,11 +2570,23 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset rather than free: keeping the sched alive across compute() + // calls of a sampling loop avoids the per-step rebuild cost. + ggml_backend_sched_reset(sched); + sched_reserved = false; + } offload_params_to_params_backend(); } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so + // it gets a backend assignment (otherwise tensors with no producers + // and no consumers leave sched at backend_id=-1). 
+ if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -2160,6 +2646,9 @@ struct GGMLRunner { int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { + if (!ensure_params_loaded()) { + return std::nullopt; + } if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return std::nullopt; @@ -2168,18 +2657,41 @@ struct GGMLRunner { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); return std::nullopt; } - reset_compute_ctx(); - ggml_cgraph* gf = get_compute_graph(get_graph); - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { - LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return std::nullopt; + ggml_cgraph* gf = nullptr; + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc graph failed", get_desc().c_str()); + return std::nullopt; + } + } else { + reset_compute_ctx(); + gf = get_compute_graph(get_graph); + if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); + return std::nullopt; + } } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend && + ggml_backend_is_cpu(cpu_fallback_backend)) { + ggml_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } - ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); return std::nullopt; @@ -2259,6 +2771,14 @@ class GGMLBlock { prefix = prefix + "."; } init_params(ctx, tensor_storage_map, prefix); + // Tag each param tensor with its full (prefix-qualified) name so the + // multi-backend runner's tensor_backend_fn callback can route it. + // Without this, init_params leaves tensors with empty t->name. + for (auto& pair : params) { + if (pair.second != nullptr) { + ggml_set_name(pair.second, (prefix + pair.first).c_str()); + } + } init_blocks(ctx, tensor_storage_map, prefix); } diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index 50158c883..6d60a73ec 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -121,6 +121,24 @@ __STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backen } } +// Runtime lookup of a backend's row-split buffer type (currently published by +// the CUDA and SYCL backends as `ggml_backend_split_buffer_type` in their +// reg_get_proc_address tables). Returns nullptr when the backend does not +// support row-split, leaving the caller to fall back to a non-split path. 
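+// For illustration, a caller on a two-device system with a 0.75/0.25 split
+// might do (sketch; falls back to the default buft when row-split is
+// unavailable):
+//   float ratios[2] = {0.75f, 0.25f};
+//   ggml_backend_buffer_type_t buft =
+//       ggml_backend_split_buffer_type(backend, /*main_device=*/0, ratios);
+//   if (buft == nullptr) { buft = ggml_backend_get_default_buffer_type(backend); }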
+using __ggml_backend_split_buffer_type_t = ggml_backend_buffer_type_t (*)(int main_device, const float* tensor_split); + +__STATIC_INLINE__ ggml_backend_buffer_type_t ggml_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend); + if (reg == nullptr) { + return nullptr; + } + auto fn = reinterpret_cast<__ggml_backend_split_buffer_type_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type")); + if (fn == nullptr) { + return nullptr; + } + return fn(main_device, tensor_split); +} + __STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { if (tensor == nullptr) { return nullptr; diff --git a/src/model.cpp b/src/model.cpp index 8fdde3b76..2f7e2b78f 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -783,11 +783,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::unique_ptr mmapped; if (enable_mmap && !is_zip) { - LOG_DEBUG("using mmap for I/O"); mmapped = MmapWrapper::create(file_path); if (!mmapped) { - LOG_WARN("failed to memory-map '%s'", file_path.c_str()); + LOG_WARN("failed to memory-map '%s' (falling back to read())", + file_path.c_str()); + } else { + LOG_INFO("using mmap for '%s'", file_path.c_str()); } + } else if (!is_zip) { + LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", + file_path.c_str()); } int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); @@ -1003,9 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, int n_threads, - bool enable_mmap) { + bool enable_mmap, + bool quiet_unknown_tensors) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; + std::atomic unknown_tensor_count{0}; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); @@ -1023,7 +1030,11 @@ bool ModelLoader::load_tensors(std::map& tensors, return true; } } - LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + if (quiet_unknown_tensors) { + unknown_tensor_count.fetch_add(1); + } else { + LOG_INFO("unknown tensor '%s' in model file", tensor_storage.to_string().c_str()); + } return true; } @@ -1072,6 +1083,10 @@ bool ModelLoader::load_tensors(std::map& tensors, if (some_tensor_not_init) { return false; } + if (quiet_unknown_tensors && unknown_tensor_count.load() > 0) { + LOG_INFO("skipped %zu unknown tensors (--quiet-unknown-tensors)", + unknown_tensor_count.load()); + } return true; } diff --git a/src/model.h b/src/model.h index 65bc6c367..03d4e3732 100644 --- a/src/model.h +++ b/src/model.h @@ -193,6 +193,8 @@ using TensorTypeRules = std::vector>; TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules); +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; @@ -224,7 +226,8 @@ class ModelLoader { bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, int n_threads = 0, - bool use_mmap = false); + bool use_mmap = false, + bool quiet_unknown_tensors = false); std::vector get_tensor_names() const { std::vector names; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 88102ff61..c389c6242 100644 --- a/src/stable-diffusion.cpp +++ 
b/src/stable-diffusion.cpp @@ -1,5 +1,6 @@ #include "ggml_extend.hpp" +#include "backend_fit.hpp" #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -108,10 +109,47 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: - ggml_backend_t backend = nullptr; // general backend + ggml_backend_t backend = nullptr; // general / main backend ggml_backend_t clip_backend = nullptr; ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; + + // Auto-fit decisions resolved into device-name strings. When non-empty, + // these win over the user-provided sd_ctx_params->*_backend_device. + // When empty, the explicit param (or `backend` fallback) is used. + std::string fit_diffusion_device; + std::string fit_clip_device; + std::string fit_vae_device; + // Per-component offload-params override coming from auto-fit. Forces + // offload_params_to_cpu for that component even when global flag is off. + bool fit_dit_offload_params = false; + bool fit_cond_offload_params = false; + bool fit_vae_offload_params = false; + + // Multi-GPU split state (LAYER_SPLIT or ROW_SPLIT). Holds the ordered + // list of device names and per-device share bytes; the actual backend + // handles are init'd at construction time and stored in *_extra_backends + // so the destructor can free them. fit_*_row_split=true means use the + // CUDA row-split path (matmul weights split row-wise via cuda_split_buft); + // false means layer-split (per-block backend assignment via sched). + std::vector fit_dit_split_device_names; + std::vector fit_dit_split_share_bytes; + std::vector fit_dit_extra_backends; + bool fit_dit_row_split = false; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + std::vector fit_cond_extra_backends; + bool fit_cond_row_split = false; + + // Owned model loader: kept alive across init() so lazy_load callbacks + // can re-read tensor data from disk on demand. Only set when at least + // one component is configured for lazy load. + std::unique_ptr owned_model_loader; + // Auto-fit decided init-time SUM exceeds device cap; defer cond + DiT + // allocation until first compute() so peaks don't pile up. 
+ bool auto_lazy_load = false; + bool enable_mmap_member = false; SDVersion version; bool vae_decode_only = false; @@ -168,11 +206,35 @@ class StableDiffusionGGML { if (vae_backend != backend) { ggml_backend_free(vae_backend); } + if (diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } + for (auto* b : fit_dit_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } + for (auto* b : fit_cond_extra_backends) { + if (b != backend && b != diffusion_backend && b != clip_backend && + b != vae_backend && b != control_net_backend) { + ggml_backend_free(b); + } + } ggml_backend_free(backend); } - void init_backend() { - backend = sd_get_default_backend(); + void init_backend(const char* main_device_name) { + if (main_device_name != nullptr && main_device_name[0] != '\0') { + backend = init_named_backend(main_device_name); + if (backend == nullptr) { + LOG_WARN("main backend device '%s' init failed; falling back to default", + main_device_name); + } + } + if (backend == nullptr) { + backend = sd_get_default_backend(); + } } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -202,9 +264,14 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + init_backend(sd_ctx_params->main_backend_device); - ModelLoader model_loader; + // Use a stack-local handle that points into `owned_model_loader` if we + // need lazy callbacks (decided after auto-fit), otherwise a temp local + // is fine. Defer the unique_ptr decision; for now always own it so the + // pointer is stable even if lazy load is enabled later in this init(). + owned_model_loader = std::make_unique(); + ModelLoader& model_loader = *owned_model_loader; if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -328,6 +395,142 @@ class StableDiffusionGGML { return oss.str(); }; + if (sd_ctx_params->auto_fit) { + backend_fit::ComputeReserves reserves; + if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) { + reserves.dit_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) { + reserves.vae_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) { + reserves.conditioner_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB; + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + backend_fit::MultiGpuMode mode = backend_fit::str_to_multi_gpu_mode( + SAFE_STR(sd_ctx_params->multi_gpu_mode)); + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, + sd_ctx_params->auto_multi_gpu, mode); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Find the CPU device's ggml name (so we can route "CPU" + // placements through init_named_backend uniformly). 
+ std::string cpu_device_name; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + cpu_device_name = ggml_backend_dev_name(dev); + break; + } + } + auto device_id_to_name = [&](int dev_id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == dev_id) return dev.name; + } + return {}; + }; + auto resolve = [&](const backend_fit::Decision* d, + std::string& out_device, + bool& out_offload, + std::vector& out_split_devices, + std::vector& out_split_shares, + bool& out_row_split) { + out_split_devices.clear(); + out_split_shares.clear(); + out_row_split = false; + if (d == nullptr) { + out_device.clear(); + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::CPU) { + out_device = cpu_device_name; + out_offload = false; + return; + } + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { + // Primary device drives main_backend choice for the model; + // the rest become additional backends in the spec. + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + out_split_devices.push_back(device_id_to_name(d->split_device_ids[k])); + out_split_shares.push_back(d->split_share_bytes[k]); + } + if (!out_split_devices.empty()) out_device = out_split_devices[0]; + out_offload = false; + out_row_split = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT); + return; + } + out_device = device_id_to_name(d->device_id); + out_offload = (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS); + }; + std::vector dummy_devs; + std::vector dummy_shares; + bool dummy_row_split = false; + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), + fit_diffusion_device, fit_dit_offload_params, + fit_dit_split_device_names, fit_dit_split_share_bytes, + fit_dit_row_split); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), + fit_vae_device, fit_vae_offload_params, dummy_devs, dummy_shares, + dummy_row_split); + resolve(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), + fit_clip_device, fit_cond_offload_params, + fit_cond_split_device_names, fit_cond_split_share_bytes, + fit_cond_row_split); + + // CPU placements: leave fit_*_device empty AND remember they're + // CPU so the resolver below picks ggml_backend_cpu_init(). + + // Decide auto-lazy-load: if the per-component MAX-based plan fits + // but the SUM-of-components on any device would exceed cap, defer + // alloc until first compute() so peaks don't pile up. Heuristic: + // sum the per-device on_device_bytes across all GPU decisions + // (excluding VAE which is small) and compare to free_bytes. 
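+ // For illustration: with 16 GiB usable on CUDA0 after the margin, a
+ // 14 GiB DiT and a 9 GiB conditioner each fit alone (the MAX-based plan),
+ // but their SUM (23 GiB) exceeds the cap, so auto_lazy_load defers each
+ // component's alloc until its first compute().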
+ std::map sum_per_device; + auto add_sum = [&](const backend_fit::Decision* d) { + if (!d) return; + if (d->placement == backend_fit::Placement::GPU_LAYER_SPLIT || + d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) { + for (size_t k = 0; k < d->split_device_ids.size(); k++) { + sum_per_device[d->split_device_ids[k]] += d->split_share_bytes[k]; + } + } else if (d->placement == backend_fit::Placement::GPU || + d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + sum_per_device[d->device_id] += d->on_device_bytes; + } + }; + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE)); + add_sum(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER)); + for (const auto& dev : devices) { + int64_t cap = dev.free_bytes - margin_bytes; + int64_t sum = sum_per_device.count(dev.id) ? sum_per_device[dev.id] : 0; + if (sum > cap) { + LOG_INFO("auto-fit: enabling lazy load (init-time SUM %lld MiB on %s " + "exceeds cap %lld MiB; per-component MAX plan needs lazy alloc)", + (long long)(sum / backend_fit::MiB), + dev.name.c_str(), + (long long)(cap / backend_fit::MiB)); + auto_lazy_load = true; + break; + } + } + } + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -373,19 +576,362 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + // If auto-fit decided ANY component must offload params, force the + // global flag on. This is a coarsening: one component needing offload + // forces all to offload (safer, just slower for non-offload ones). + if (fit_dit_offload_params || fit_cond_offload_params || fit_vae_offload_params) { + if (!offload_params_to_cpu) { + LOG_INFO("auto-fit: enabling offload_params_to_cpu (one or more " + "components don't fit without param streaming)"); + offload_params_to_cpu = true; + } + } + + // Pick the effective device name for each component: the auto-fit + // override (if any) wins; otherwise the user-provided string; nullptr + // falls back to `backend` (the main). + auto effective_device = [&](const std::string& fit_str, const char* user_str) -> const char* { + if (!fit_str.empty()) return fit_str.c_str(); + return user_str; + }; + const char* diffusion_dev_name = effective_device(fit_diffusion_device, + sd_ctx_params->diffusion_backend_device); + const char* clip_dev_name = effective_device(fit_clip_device, + sd_ctx_params->clip_backend_device); + const char* vae_dev_name = effective_device(fit_vae_device, + sd_ctx_params->vae_backend_device); + + // Build the row-split MultiBackendSpec for a component (ROW_SPLIT + // mode). Unlike layer-split, the runner uses a SINGLE CUDA backend; + // matmul weights are row-split across all CUDA devices internally + // by cuda_split_buffer_type. extra_backends stays empty. + // - share_devices/share_bytes: per-device share order from auto-fit + // (largest first by descending share). The first device is the + // "main" CUDA device, where the compute buffer lives. + // Returns true on success; populates out_spec.tensor_split_ratios + // with a vector of length total CUDA device count. 
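+ // For illustration: shares of 12 GiB on CUDA0 and 4 GiB on CUDA1 yield
+ // tensor_split_ratios = [0.75, 0.25] with main_device = 0 (largest share).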
+ auto prepare_row_split_spec = [&](const std::vector& share_devices, + const std::vector& share_bytes, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (share_devices.size() < 2) return false; + + // Derive the backend registry from the device-name prefix (e.g. + // "CUDA0" -> reg "CUDA", "SYCL1" -> reg "SYCL"). This keeps the + // code backend-agnostic: any backend whose registry publishes + // `ggml_backend_split_buffer_type` via reg_get_proc_address can + // drive row-split, not just CUDA. + auto reg_prefix_of = [](const std::string& name) -> std::string { + size_t i = 0; + while (i < name.size() && (std::isalpha((unsigned char)name[i]) || name[i] == '_')) i++; + return name.substr(0, i); + }; + const std::string reg_name = reg_prefix_of(share_devices[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) return false; + const int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) return false; + + auto reg_index_of = [&](const std::string& name) -> int { + if (name.rfind(reg_name, 0) != 0) return -1; + try { return std::stoi(name.substr(reg_name.size())); } catch (...) { return -1; } + }; + + std::vector ratios(dev_count, 0.0f); + int64_t total = 0; + for (auto b : share_bytes) total += b; + if (total <= 0) return false; + int main_dev = -1; + int64_t max_share = -1; + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = reg_index_of(share_devices[k]); + if (idx < 0 || idx >= dev_count) continue; + ratios[idx] = float(double(share_bytes[k]) / double(total)); + if (share_bytes[k] > max_share) { + max_share = share_bytes[k]; + main_dev = idx; + } + } + if (main_dev < 0) return false; + + // Init extra backends for the non-main devices so sched can + // route ops across them (row-split tensors are dispatched by the + // primary backend; ggml-sched still needs all participating + // backends in its list to schedule cross-device copies). + for (size_t k = 0; k < share_devices.size(); k++) { + int idx = reg_index_of(share_devices[k]); + if (idx == main_dev || idx < 0) continue; + ggml_backend_t b = init_named_backend(share_devices[k]); + if (b != nullptr) { + out_extra_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", + share_devices[k].c_str()); + } + } + out_spec.mode = MultiBackendMode::ROW_SPLIT; + out_spec.tensor_split_ratios = ratios; + out_spec.main_device = main_dev; + out_spec.additional_backends.assign(out_extra_backends.begin(), + out_extra_backends.end()); + out_spec.tensor_backend_fn = nullptr; + out_spec.cpu_fallback = nullptr; + + std::string ratio_str; + for (int i = 0; i < dev_count; i++) { + if (i > 0) ratio_str += ","; + char buf[16]; std::snprintf(buf, sizeof(buf), "%.2f", ratios[i]); + ratio_str += buf; + } + LOG_INFO("row-split spec: ratios=[%s] main_device=%d", + ratio_str.c_str(), main_dev); + return true; + }; + + // Build the layer-split MultiBackendSpec for a component. Only used + // when auto-fit picked GPU_LAYER_SPLIT for this component. 
+ // - main_backend: the runner's primary backend (also first in the spec) + // - extra_device_names: additional device names to span + // - share_bytes: per-device share (for proportional block partition) + // - tensor_prefix: the model's weight name prefix (e.g., + // "model.diffusion_model.") — used to locate block-indexed tensors + // Returns true if a spec was prepared and pending_spec_storage was + // populated; the caller must set g_pending_multi_backend_spec() + // immediately before constructing the model. + auto prepare_layer_split_spec = [&](ggml_backend_t main_backend, + const std::vector& extra_device_names, + const std::vector& share_bytes, + const std::string& tensor_prefix, + std::vector& out_extra_backends, + MultiBackendSpec& out_spec) -> bool { + if (extra_device_names.size() < 2) return false; // only [main] -> single GPU + // Init the additional backends (skip [0] which is main_backend). + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < extra_device_names.size(); k++) { + ggml_backend_t b = init_named_backend(extra_device_names[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init extra backend %s; falling back to single backend", + extra_device_names[k].c_str()); + return false; + } + out_extra_backends.push_back(b); + all_backends.push_back(b); + } + + // Walk tensor_storage_map to get per-block byte sizes and the + // total non-block bytes that will land on backend[0]. Then + // greedy-partition blocks by byte budget to balance per-backend + // bytes (accounting for non-block fixed load on backend[0]). + int max_block_idx = -1; + static const std::regex block_re( + R"((?:transformer_blocks|joint_blocks|double_blocks|single_blocks|blocks|layers)\.([0-9]+)\.)"); + std::map block_bytes; // block idx -> bytes + int64_t non_block_bytes = 0; + for (const auto& kv : tensor_storage_map) { + if (!tensor_prefix.empty() && kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + std::smatch m; + if (std::regex_search(kv.first, m, block_re)) { + int idx = std::stoi(m[1]); + if (idx > max_block_idx) max_block_idx = idx; + block_bytes[idx] += bytes; + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no blocks found under prefix '%s'; aborting split", + tensor_prefix.c_str()); + return false; + } + const int n_blocks = max_block_idx + 1; + + // Build per-backend byte budgets from share_bytes (ratios). The + // first backend absorbs `non_block_bytes` as a fixed load, so we + // SHRINK its remaining budget for blocks accordingly. + int64_t total_share = 0; + for (auto s : share_bytes) total_share += s; + int64_t total_block_bytes = 0; + for (const auto& kv : block_bytes) total_block_bytes += kv.second; + std::vector backend_block_budgets(share_bytes.size(), 0); + for (size_t k = 0; k < share_bytes.size(); k++) { + int64_t share = int64_t(double(total_block_bytes + non_block_bytes) * + double(share_bytes[k]) / double(total_share)); + if (k == 0) share = std::max(share - non_block_bytes, 0); + backend_block_budgets[k] = share; + } + // Greedy assign each block (in order) to the current backend + // until its budget is filled, then move to the next. 
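+ // For illustration: 40 equal-sized blocks with block budgets in a 60/40
+ // ratio give boundaries = [24, 40], i.e. backend[0] owns blocks [0..24)
+ // and backend[1] owns blocks [24..40) (half-open, matching boundary_log).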
+ std::vector boundaries(share_bytes.size(), 0); + size_t cur_backend = 0; + int64_t cur_used = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur_backend + 1 < share_bytes.size() && + cur_used + bb > backend_block_budgets[cur_backend] && + cur_used > 0) { + boundaries[cur_backend] = b; + cur_backend++; + cur_used = 0; + } + cur_used += bb; + } + // The remaining backends get the rest, terminating at n_blocks. + for (size_t k = cur_backend; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + // Safety: ensure each backend has at least one block. + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) boundaries[k] = std::min(min_bound, n_blocks); + } + std::string boundary_log = "layer-split [" + tensor_prefix + "] " + + std::to_string(n_blocks) + " blocks: "; + int prev = 0; + for (size_t k = 0; k < all_backends.size() && k < boundaries.size(); k++) { + if (k > 0) boundary_log += ", "; + boundary_log += std::string(ggml_backend_name(all_backends[k])) + "=[" + + std::to_string(prev) + ".." + std::to_string(boundaries[k]) + ")"; + prev = boundaries[k]; + } + LOG_INFO("%s", boundary_log.c_str()); + + // Build the tensor_backend_fn closure. + std::vector backends_capture = all_backends; + std::vector boundaries_capture = boundaries; + std::string prefix_capture = tensor_prefix; + out_spec.tensor_backend_fn = + [backends_capture, boundaries_capture, prefix_capture](const std::string& name) -> ggml_backend_t { + if (!prefix_capture.empty() && + name.compare(0, prefix_capture.size(), prefix_capture) != 0) { + return backends_capture[0]; + } + std::smatch m; + if (!std::regex_search(name, m, block_re)) { + return backends_capture[0]; + } + int idx = std::stoi(m[1]); + for (size_t k = 0; k < boundaries_capture.size(); k++) { + if (idx < boundaries_capture[k]) { + return backends_capture[std::min(k, backends_capture.size() - 1)]; + } + } + return backends_capture.back(); + }; + // Spec contains the additional backends only (main is implicit). + out_spec.additional_backends.assign(out_extra_backends.begin(), out_extra_backends.end()); + out_spec.cpu_fallback = nullptr; + return true; + }; + + // Helper: init a named backend if name is non-null/non-empty, + // returns nullptr on missing/failed name (caller falls back to main). + auto init_named_or_null = [](const char* name) -> ggml_backend_t { + if (name == nullptr || name[0] == '\0') return nullptr; + return init_named_backend(name); + }; + + diffusion_backend = init_named_or_null(diffusion_dev_name); + if (!diffusion_backend) { + diffusion_backend = backend; + } else { + LOG_INFO("Diffusion model: using device %s", diffusion_dev_name); + } + + // Tensor name sets for components that are configured for lazy load. + // Populated below right before/after the cond + DiT construction; + // consumed by the bulk-load step's ignore_tensors. + std::set cond_lazy_tensor_names; + std::set dit_lazy_tensor_names; + + // Build the layer-split MultiBackendSpec for DiT (when auto-fit picked + // GPU_LAYER_SPLIT). The spec is consumed by the diffusion_model's + // GGMLRunner ctor when we set g_pending_multi_backend_spec() to it. 
+ MultiBackendSpec dit_spec; + bool dit_spec_active = false; + if (!fit_dit_split_device_names.empty()) { + if (fit_dit_row_split) { + dit_spec_active = prepare_row_split_spec(fit_dit_split_device_names, + fit_dit_split_share_bytes, + fit_dit_extra_backends, + dit_spec); + } else { + dit_spec_active = prepare_layer_split_spec(diffusion_backend, + fit_dit_split_device_names, + fit_dit_split_share_bytes, + "model.diffusion_model.", + fit_dit_extra_backends, + dit_spec); + } + } + // Lambda to set the pending spec immediately before constructing the + // diffusion model. Caller must invoke this on the same line / right + // before the std::make_shared<...Model>(diffusion_backend, ...) call. + auto prime_dit_spec = [&]() { + if (dit_spec_active) { + g_pending_multi_backend_spec() = &dit_spec; + } + }; + + // Same dance for the conditioner. The conditioner uses clip_backend as + // its main backend; we need to set up the spec BEFORE the cond_stage + // ctor runs (which is BEFORE the DiT ctor). Each cond model wraps one + // or more sub-runners; the spec's tensor_backend_fn handles all of + // them since it's keyed on tensor names with a generic block regex. + // (Some conditioners construct multiple sub-runners — only the FIRST + // ggml runner ctor consumes the pending spec, so we re-prime between + // sub-runners' allocs by leaving cond_spec_active true; the runner's + // multi_backend_mode is per-runner.) + // For LTX-2 specifically: LTXAVEmbedder constructs LLMRunner first + // (consumes spec), then LTXAVTextProjectionRunner (no spec consumed). + // The LLM has block-named tensors so layer-split applies; the + // projector has only 4 tensors and they should ride along on its + // single backend (clip_backend = main). Auto-fit's cond share counts + // both, so the share is over-counted on backend[0] for the projector. + // Acceptable for now — small correction. + ggml_backend_t clip_main_backend_for_spec = nullptr; // resolved below + MultiBackendSpec cond_spec; + bool cond_spec_active = false; + auto prime_cond_spec = [&]() { + if (cond_spec_active) { + g_pending_multi_backend_spec() = &cond_spec; + } + }; { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + clip_backend = init_named_or_null(clip_dev_name); + if (!clip_backend) { + clip_backend = backend; + } else { + LOG_INFO("CLIP: using device %s", clip_dev_name); + } + // Now that clip_backend is resolved, build the conditioner's + // multi-GPU spec if auto-fit picked one (row-split or layer-split). 
+ if (!fit_cond_split_device_names.empty()) { + if (fit_cond_row_split) { + cond_spec_active = prepare_row_split_spec(fit_cond_split_device_names, + fit_cond_split_share_bytes, + fit_cond_extra_backends, + cond_spec); + } else { + cond_spec_active = prepare_layer_split_spec(clip_backend, + fit_cond_split_device_names, + fit_cond_split_share_bytes, + "text_encoders.", + fit_cond_extra_backends, + cond_spec); + } } if (sd_version_is_sd3(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map); } else if (sd_version_is_flux(version)) { @@ -406,12 +952,14 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -419,40 +967,47 @@ class StableDiffusionGGML { "", false); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, true, 0, true); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + prime_dit_spec(); + high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", @@ -472,42 +1027,50 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); } else if 
(sd_version_is_z_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model"); @@ -517,6 +1080,7 @@ class StableDiffusionGGML { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, @@ -524,13 +1088,15 @@ class StableDiffusionGGML { version, PM_VERSION_2); } else { + prime_cond_spec(); cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, + prime_dit_spec(); + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -540,11 +1106,83 @@ class StableDiffusionGGML { } } - cond_stage_model->alloc_params_buffer(); - cond_stage_model->get_param_tensors(tensors); + // Conditioner: publish its tensors to the global map, EXCEPT the + // ones that are about to be configured for lazy load (we want the + // bulk loader to skip them — they have no buffer yet). + std::map cond_only_tensors; + cond_stage_model->get_param_tensors(cond_only_tensors); + std::map llm_lazy_map; + if (auto_lazy_load) { + for (const auto& kv : cond_only_tensors) { + if (kv.first.rfind("text_encoders.llm.", 0) == 0) { + llm_lazy_map[kv.first] = kv.second; + cond_lazy_tensor_names.insert(kv.first); + } + } + } + for (const auto& kv : cond_only_tensors) { + if (cond_lazy_tensor_names.find(kv.first) == cond_lazy_tensor_names.end()) { + tensors[kv.first] = kv.second; // eager — bulk loader will fill + } + } + if (auto_lazy_load && !llm_lazy_map.empty()) { + ModelLoader* loader_ptr = owned_model_loader.get(); + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? 
sd_ctx_params->n_threads : 2, + 2); + bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; + cond_stage_model->set_llm_lazy_load([=]() -> bool { + auto local_map = llm_lazy_map; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture, + quiet_capture); + }); + LOG_INFO("auto-fit: conditioner LLM is lazy (defer alloc until first compute, %zu tensors)", + llm_lazy_map.size()); + } + cond_stage_model->alloc_params_buffer(); // no-op for the lazy LLM - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + std::map dit_only_tensors; + diffusion_model->get_param_tensors(dit_only_tensors); + if (auto_lazy_load) { + for (const auto& kv : dit_only_tensors) { + dit_lazy_tensor_names.insert(kv.first); + } + ModelLoader* loader_ptr = owned_model_loader.get(); + // Bound lazy-load threads to keep the per-thread staging + // buffer footprint small. The default n_threads = nproc gives + // ~nproc × max_tensor_bytes (up to several GB total) of + // CPU-side staging; for RAM-constrained systems running large + // models that's enough to trigger the OOM-killer even with + // mmap enabled. 2 threads still keep the disk-read pipeline + // fed while keeping staging bounded to ~2 × max_tensor_bytes. + int n_threads_capture = std::min(sd_ctx_params->n_threads > 0 + ? sd_ctx_params->n_threads : 2, + 2); + bool mmap_capture = sd_ctx_params->enable_mmap; + bool quiet_capture = sd_ctx_params->quiet_unknown_tensors; + diffusion_model->set_lazy_load([=]() -> bool { + auto local_map = dit_only_tensors; + return loader_ptr->load_tensors(local_map, /*ignore=*/{}, + n_threads_capture, mmap_capture, + quiet_capture); + }); + LOG_INFO("auto-fit: diffusion_model is lazy (defer alloc until first compute, %zu tensors)", + dit_only_tensors.size()); + } else { + for (const auto& kv : dit_only_tensors) { + tensors[kv.first] = kv.second; + } + } + diffusion_model->alloc_params_buffer(); // no-op when lazy_load_fn is set if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -555,11 +1193,13 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { + if (vae_dev_name != nullptr && vae_dev_name[0] != '\0') { + vae_backend = init_named_backend(vae_dev_name); + } + if (!vae_backend) { vae_backend = backend; + } else { + LOG_INFO("VAE: using device %s", vae_dev_name); } auto create_tae = [&]() -> std::shared_ptr { @@ -648,11 +1288,14 @@ class StableDiffusionGGML { if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { + const char* cn_dev_name = sd_ctx_params->control_net_backend_device; + if (cn_dev_name != nullptr && cn_dev_name[0] != '\0') { + controlnet_backend = init_named_backend(cn_dev_name); + } + if (!controlnet_backend) { controlnet_backend = backend; + } else { + LOG_INFO("ControlNet: using device %s", cn_dev_name); } control_net = std::make_shared(controlnet_backend, offload_params_to_cpu, @@ -754,6 +1397,14 @@ class StableDiffusionGGML { std::set ignore_tensors; tensors["alphas_cumprod"] = alphas_cumprod_tensor; + // Lazy-loaded components: skip 
them in the bulk load; their lazy + // callbacks will load them on first compute(). + for (const auto& name : cond_lazy_tensor_names) { + ignore_tensors.insert(name); + } + for (const auto& name : dit_lazy_tensor_names) { + ignore_tensors.insert(name); + } if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } @@ -783,7 +1434,9 @@ class StableDiffusionGGML { ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, + sd_ctx_params->enable_mmap, + sd_ctx_params->quiet_unknown_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -2142,16 +2795,31 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->prediction = PREDICTION_COUNT; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; - sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; - sd_ctx_params->diffusion_flash_attn = false; - sd_ctx_params->circular_x = false; - sd_ctx_params->circular_y = false; - sd_ctx_params->chroma_use_dit_mask = true; - sd_ctx_params->chroma_use_t5_mask = false; - sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->enable_mmap = false; + sd_ctx_params->main_backend_device = nullptr; + sd_ctx_params->diffusion_backend_device = nullptr; + sd_ctx_params->clip_backend_device = nullptr; + sd_ctx_params->vae_backend_device = nullptr; + sd_ctx_params->control_net_backend_device = nullptr; + sd_ctx_params->tae_backend_device = nullptr; + sd_ctx_params->upscaler_backend_device = nullptr; + sd_ctx_params->photomaker_backend_device = nullptr; + sd_ctx_params->vision_backend_device = nullptr; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->circular_x = false; + sd_ctx_params->circular_y = false; + sd_ctx_params->chroma_use_dit_mask = true; + sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; + sd_ctx_params->quiet_unknown_tensors = false; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2183,9 +2851,24 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" + "main_backend_device: %s\n" + "diffusion_backend_device: %s\n" + "clip_backend_device: %s\n" + "vae_backend_device: %s\n" + "control_net_backend_device: %s\n" + "tae_backend_device: %s\n" + "upscaler_backend_device: %s\n" + "photomaker_backend_device: %s\n" + "vision_backend_device: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve_dit_mb: %d\n" + "auto_fit_compute_reserve_vae_mb: %d\n" + "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" + "quiet_unknown_tensors: 
%s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2215,9 +2898,24 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + SAFE_STR(sd_ctx_params->main_backend_device), + SAFE_STR(sd_ctx_params->diffusion_backend_device), + SAFE_STR(sd_ctx_params->clip_backend_device), + SAFE_STR(sd_ctx_params->vae_backend_device), + SAFE_STR(sd_ctx_params->control_net_backend_device), + SAFE_STR(sd_ctx_params->tae_backend_device), + SAFE_STR(sd_ctx_params->upscaler_backend_device), + SAFE_STR(sd_ctx_params->photomaker_backend_device), + SAFE_STR(sd_ctx_params->vision_backend_device), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + sd_ctx_params->auto_fit_compute_reserve_dit_mb, + sd_ctx_params->auto_fit_compute_reserve_vae_mb, + sd_ctx_params->auto_fit_compute_reserve_cond_mb, + BOOL_STR(sd_ctx_params->auto_multi_gpu), + SAFE_STR(sd_ctx_params->multi_gpu_mode), + BOOL_STR(sd_ctx_params->quiet_unknown_tensors), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/util.cpp b/src/util.cpp index 0b514bb73..743738813 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -174,12 +174,33 @@ bool is_directory(const std::string& path) { class MmapWrapperImpl : public MmapWrapper { public: - MmapWrapperImpl(void* data, size_t size) - : MmapWrapper(data, size) {} + MmapWrapperImpl(void* data, size_t size, int fd) + : MmapWrapper(data, size), fd_(fd) {} ~MmapWrapperImpl() override { +#ifdef __linux__ + // Drop the kernel pagecache pages for this file. madvise(DONTNEED) + // alone only unmaps from the process address space; pagecache + // entries persist (`free` reports them as buff/cache and the OOM + // killer doesn't touch them, but they ARE counted against + // overcommit and can starve other allocations on tight-RAM + // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented + // way to evict pagecache for a specific fd's pages. + if (data_ != nullptr && size_ > 0) { + madvise(data_, size_, MADV_DONTNEED); + } + if (fd_ >= 0) { + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + } +#endif munmap(data_, size_); + if (fd_ >= 0) { + close(fd_); + } } + +private: + int fd_; }; std::unique_ptr MmapWrapper::create(const std::string& filename) { @@ -191,9 +212,10 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { int mmap_flags = MAP_PRIVATE; #ifdef __linux__ - // performance flags used by llama.cpp - // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); - // mmap_flags |= MAP_POPULATE; + // Sequential access hint helps the kernel read-ahead efficiently and + // also encourages eviction of already-read pages (the kernel keeps + // a smaller working set when this is set). 
+ posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); #endif struct stat sb; @@ -206,9 +228,8 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0); - close(file_descriptor); - if (mapped_data == MAP_FAILED) { + close(file_descriptor); return nullptr; } @@ -217,7 +238,7 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); #endif - return std::make_unique(mapped_data, file_size); + return std::make_unique(mapped_data, file_size, file_descriptor); } #endif diff --git a/src/version.cpp b/src/version.cpp index 97dc8426b..6c266153c 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,3 +1,6 @@ +#include + +#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -18,3 +21,12 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +}
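
For illustration, a minimal caller of the new device listing (a sketch, assuming `sd_list_devices` is declared in stable-diffusion.h as the definition above implies):

    #include "stable-diffusion.h"

    int main(void) {
        // Prints one "name<TAB>description" line per ggml device, e.g.
        // "CUDA0\tNVIDIA GeForce ...". Any printed name is a valid value for
        // --main-backend-device, --diffusion-backend-device, etc.
        sd_list_devices();
        return 0;
    }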