From 4086c96be1e5521e3f820c49533b1a6ac027b115 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 30 Apr 2026 13:39:29 +0200 Subject: [PATCH 1/2] feat: auto-fit component placement and per-component backend devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an auto-fit planner that picks DiT / VAE / Conditioner device placements from free GPU memory, treating each component as atomic (no intra-tensor row split — equivalent to llama.cpp's LLAMA_SPLIT_MODE_LAYER at component granularity, so views never land on a split buffer and no ggml patch is needed). Also adopt the PR #1184 CLI conventions: - new: --main-backend-device, --diffusion-backend-device, --clip-backend-device, --vae-backend-device, --control-net-backend-device, --tae-backend-device, --upscaler-backend-device, --photomaker-backend-device, --vision-backend-device, --list-devices - removed: --clip-on-cpu, --vae-on-cpu, --control-net-cpu (and the matching keep_*_on_cpu fields on sd_ctx_params_t) Auto-fit knobs: --auto-fit / --no-auto-fit, --no-multi-gpu, --fit-target, --fit-compute-reserve-{dit,vae,cond}, --fit-dry-run. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/common/common.cpp | 73 ++++-- examples/common/common.h | 23 +- include/stable-diffusion.h | 30 ++- src/backend_fit.hpp | 441 +++++++++++++++++++++++++++++++++++++ src/model_loader.h | 2 + src/stable-diffusion.cpp | 173 ++++++++++++--- src/version.cpp | 12 + 7 files changed, 694 insertions(+), 60 deletions(-) create mode 100644 src/backend_fit.hpp diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 3ae5faba7..52f7635e4 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -435,6 +435,23 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, + {"", + "--fit-compute-reserve-dit", + "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " + "(0 keeps the built-in default)", + &auto_fit_compute_reserve_dit_mb}, + {"", + "--fit-compute-reserve-vae", + "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", + &auto_fit_compute_reserve_vae_mb}, + {"", + "--fit-compute-reserve-cond", + "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", + &auto_fit_compute_reserve_cond_mb}, }; options.float_options = { @@ -461,18 +478,6 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -513,6 +518,24 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device 
placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit and use the explicit --backend / --params-backend flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + "--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -611,6 +634,15 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, }; return options; @@ -736,9 +768,10 @@ std::string SDContextParams::to_string() const { << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? 
"true" : "false") << ",\n" @@ -797,9 +830,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, flash_attn, diffusion_flash_attn, taesd_preview, @@ -817,6 +847,13 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f stream_layers, backend.c_str(), params_backend.c_str(), + auto_fit, + auto_fit_target_mb, + auto_fit_dry_run, + auto_fit_compute_reserve_dit_mb, + auto_fit_compute_reserve_vae_mb, + auto_fit_compute_reserve_cond_mb, + auto_multi_gpu, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index a90a33132..2fa798e7e 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -148,14 +148,11 @@ struct SDContextParams { bool stream_layers = false; std::string backend; std::string params_backend; - bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool flash_attn = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; + bool enable_mmap = false; + bool flash_attn = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; bool circular = false; bool circular_x = false; @@ -167,6 +164,16 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. 
+ bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + int auto_fit_compute_reserve_dit_mb = 0; + int auto_fit_compute_reserve_vae_mb = 0; + int auto_fit_compute_reserve_cond_mb = 0; + bool auto_multi_gpu = true; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 17596f849..3c5b59005 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -206,9 +206,6 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -226,6 +223,28 @@ typedef struct { bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), `backend` / `params_backend` are + // ignored and the placement is computed automatically (the plan is fed + // into the same backend assignment that `backend` / `params_backend` use). + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). + // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the + // per-component compute-buffer reserve; 0 means use the built-in default. + bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + int auto_fit_compute_reserve_dit_mb; + int auto_fit_compute_reserve_vae_mb; + int auto_fit_compute_reserve_cond_mb; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. 
Set false to keep all components on a single GPU when + // they fit. Defaults to true. Each component still lives entirely on + // one device — no intra-tensor row split. + bool auto_multi_gpu; } sd_ctx_params_t; typedef struct { @@ -491,6 +510,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used as +// device names in the --backend / --params-backend assignment specs. +SD_API void sd_list_devices(void); + #ifdef __cplusplus } #endif diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..ee23d6418 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,441 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is treated as a single atomic unit that lives entirely on +// one device (plus its compute buffer on the same device). There is no +// intra-tensor row split: cross-device parallelism comes from placing +// different components on different GPUs, not from splitting individual +// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER +// at the component granularity. +// +// Placement priority: DiT + compute buffer -> VAE -> Conditioner. +// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). 
+ +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model_loader.h" +#include "core/util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU +}; + +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; +}; + +struct Plan { + std::vector decisions; + std::map device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +// --- Classification ------------------------------------------------------- + +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0) { + out = 
ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +inline std::vector estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +inline std::vector enumerate_gpu_devices() { + // Make sure the dynamically-loaded backends are registered before we query + // the device list. This runs before SDBackendManager initializes any + // backend, so nothing else has triggered the (file-local) lazy load yet. + // Safe to call once here: the manager's own load-all-once guard short + // circuits afterwards because the device count is already non-zero. 
+ ggml_backend_load_all(); + + std::vector out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- Core algorithm ------------------------------------------------------- + +// Peak per device = MAX of any single component's footprint on that device, +// because free_params_immediately frees params between phases so components +// time-share VRAM. +inline int64_t gpu_peak(int gpu_idx, + const std::vector& pl, + const std::vector& dev, + const std::vector& components) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + if (dev[i] != gpu_idx) continue; + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + footprint = components[i].params_bytes + components[i].compute_bytes; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector& components, + const std::vector& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + + std::vector cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + auto build_options = [&](const Component& c) { + std::vector opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + 
opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector& pl, const std::vector& dev) { + int64_t s = 0; + std::set gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if (pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector idx(nC, 0); + std::vector best_pl; + std::vector best_dev; + int64_t best_score = std::numeric_limits::min(); + bool found_any = false; + + while (true) { + std::vector pl(nC); + std::vector dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. 
+ if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + } + return plan; +} + +inline const char* 
placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector& components, + const std::vector& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host 
RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if (d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/model_loader.h b/src/model_loader.h index 8e0f41981..b77f4d6e2 100644 --- a/src/model_loader.h +++ b/src/model_loader.h @@ -27,6 +27,8 @@ struct MmapTensorStore { std::shared_ptr mmbuffer; }; +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8ba4a463a..640c049ad 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -10,6 +10,8 @@ #include "core/rng_mt19937.hpp" #include "core/rng_philox.hpp" #include "core/util.h" + +#include "backend_fit.hpp" #include "model_loader.h" #include "stable-diffusion.h" @@ -232,14 +234,19 @@ class StableDiffusionGGML { return params_backend_for(module) != nullptr; } - bool init_backend(const sd_ctx_params_t* sd_ctx_params) { + // Initialize the backend manager from backend_spec / params_backend_spec. + // These hold the user's --backend / --params-backend by default, but when + // auto-fit is enabled they are overwritten with the computed plan before + // this runs. The keep_*_on_cpu shortcuts were replaced by the spec + // mechanism (e.g. "vae=cpu"), so they are always false here. 
+ bool init_backend() { std::string error; - if (!backend_manager.init(sd_ctx_params->backend, - sd_ctx_params->params_backend, + if (!backend_manager.init(backend_spec.c_str(), + params_backend_spec.c_str(), offload_params_to_cpu, - sd_ctx_params->keep_clip_on_cpu, - sd_ctx_params->keep_vae_on_cpu, - sd_ctx_params->keep_control_net_on_cpu, + /*keep_clip_on_cpu=*/false, + /*keep_vae_on_cpu=*/false, + /*keep_control_net_on_cpu=*/false, &error)) { LOG_ERROR("backend config failed: %s", error.c_str()); return false; @@ -288,10 +295,10 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - if (!init_backend(sd_ctx_params)) { - return false; - } - max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); + // Backend initialization is deferred until after the model metadata is + // loaded, so auto-fit can size the components and choose device + // placements before the backends are created (see the auto-fit block + // below, which feeds its plan into init_backend()). 
ModelLoader model_loader; @@ -441,6 +448,98 @@ class StableDiffusionGGML { return oss.str(); }; + if (sd_ctx_params->auto_fit) { + if (!backend_spec.empty() || !params_backend_spec.empty()) { + LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend " + "(pass --no-auto-fit to set device placement manually)"); + } + + backend_fit::ComputeReserves reserves; + if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) { + reserves.dit_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) { + reserves.vae_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB; + } + if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) { + reserves.conditioner_bytes = + int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB; + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Translate the plan into the backend-assignment specs consumed by + // SDBackendManager. Each component lives entirely on one device: + // GPU -> runtime= (params follow runtime) + // GPU_OFFLOAD_PARAMS -> runtime=, params=cpu (params streamed from RAM) + // CPU -> runtime=cpu (params follow runtime) + // Modules the planner doesn't cover (clip_vision, control_net, + // photomaker, upscaler) fall back to the default backend. 
+ std::string runtime_spec; + std::string params_spec; + auto append_assignment = [](std::string& spec, const char* key, const std::string& value) { + if (!spec.empty()) { + spec += ","; + } + spec += key; + spec += "="; + spec += value; + }; + auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) { + if (d == nullptr) { + return; + } + if (d->placement == backend_fit::Placement::CPU) { + append_assignment(runtime_spec, module_key, "cpu"); + return; + } + std::string dev_name; + for (const auto& dev : devices) { + if (dev.id == d->device_id) { + dev_name = dev.name; + break; + } + } + if (dev_name.empty()) { + return; // no matching device; fall back to the default backend + } + append_assignment(runtime_spec, module_key, dev_name); + if (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + append_assignment(params_spec, module_key, "cpu"); + } + }; + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), "diffusion"); + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), "te"); + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), "vae"); + + backend_spec = runtime_spec; + params_backend_spec = params_spec; + LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'", + backend_spec.empty() ? "(default)" : backend_spec.c_str(), + params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str()); + } + + // Create the backends now that the placement (manual or auto-fit) is + // settled, then resolve the graph-cut VRAM budget against the DiT's + // runtime backend. 
+ if (!init_backend()) { + return false; + } + max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -2688,21 +2787,25 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->prediction = PREDICTION_COUNT; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; - sd_ctx_params->max_vram = 0.f; - sd_ctx_params->stream_layers = false; - sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; - sd_ctx_params->diffusion_flash_attn = false; - sd_ctx_params->circular_x = false; - sd_ctx_params->circular_y = false; - sd_ctx_params->chroma_use_dit_mask = true; - sd_ctx_params->chroma_use_t5_mask = false; - sd_ctx_params->chroma_t5_mask_pad = 1; - sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO; - sd_ctx_params->backend = nullptr; - sd_ctx_params->params_backend = nullptr; + sd_ctx_params->max_vram = 0.f; + sd_ctx_params->stream_layers = false; + sd_ctx_params->enable_mmap = false; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->circular_x = false; + sd_ctx_params->circular_y = false; + sd_ctx_params->chroma_use_dit_mask = true; + sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO; + sd_ctx_params->backend = nullptr; + sd_ctx_params->params_backend = nullptr; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; + sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; 
+ sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_multi_gpu = true; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2741,9 +2844,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "stream_layers: %s\n" "backend: %s\n" "params_backend: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve_dit_mb: %d\n" + "auto_fit_compute_reserve_vae_mb: %d\n" + "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_multi_gpu: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2781,9 +2888,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->stream_layers), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + sd_ctx_params->auto_fit_compute_reserve_dit_mb, + sd_ctx_params->auto_fit_compute_reserve_vae_mb, + sd_ctx_params->auto_fit_compute_reserve_cond_mb, + BOOL_STR(sd_ctx_params->auto_multi_gpu), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/version.cpp b/src/version.cpp index 97dc8426b..6c266153c 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,3 +1,6 @@ +#include + +#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -18,3 +21,12 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = 
ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +} From 20bfcc40e1d533985f1fb6200a582f6408f444af Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 12 Jun 2026 13:48:23 +0200 Subject: [PATCH 2/2] Finally adapt to the newest master --- examples/common/common.cpp | 68 +++- examples/common/common.h | 23 +- ggml | 2 +- include/stable-diffusion.h | 19 +- src/backend_fit.hpp | 301 ++++++++++++++- src/conditioning/conditioner.hpp | 36 ++ src/core/ggml_extend.hpp | 561 +++++++++++++++++++++++++++- src/core/ggml_extend_backend.cpp | 23 ++ src/core/ggml_extend_backend.h | 14 + src/core/util.cpp | 10 + src/model/diffusion/ltxv.hpp | 9 +- src/stable-diffusion.cpp | 609 +++++++++++++++++++++++++++++-- src/version.cpp | 12 - 13 files changed, 1593 insertions(+), 94 deletions(-) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 52f7635e4..a92e4615d 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -423,6 +423,18 @@ ArgOptions SDContextParams::get_options() { "--params-backend", "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu", ¶ms_backend}, + {"", + "--multi-gpu-mode", + "how to split a too-large DiT across GPUs (auto-fit): " + "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off " + "(default: row)", + &multi_gpu_mode}, + {"", + "--fit-compute-reserve", + "auto-fit: per-component compute-buffer reserve in MiB as a component " + "map, e.g. 
dit=2048,vae=1024,cond=512 (missing keys keep the built-in " + "defaults)", + &fit_compute_reserve}, }; options.int_options = { @@ -439,19 +451,6 @@ ArgOptions SDContextParams::get_options() { "--fit-target", "auto-fit: MiB of free memory to leave on each GPU (default: 512)", &auto_fit_target_mb}, - {"", - "--fit-compute-reserve-dit", - "auto-fit: MiB reserved on the DiT's GPU for its compute buffer " - "(0 keeps the built-in default)", - &auto_fit_compute_reserve_dit_mb}, - {"", - "--fit-compute-reserve-vae", - "auto-fit: MiB reserved on the VAE's GPU for its compute buffer", - &auto_fit_compute_reserve_vae_mb}, - {"", - "--fit-compute-reserve-cond", - "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer", - &auto_fit_compute_reserve_cond_mb}, }; options.float_options = { @@ -518,6 +517,18 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--control-net-cpu", + "keep controlnet in cpu (deprecated alias for --backend control_net=cpu)", + true, &control_net_cpu}, + {"", + "--clip-on-cpu", + "keep clip in cpu (deprecated alias for --backend clip=cpu)", + true, &clip_on_cpu}, + {"", + "--vae-on-cpu", + "keep vae in cpu (deprecated alias for --backend vae=cpu)", + true, &vae_on_cpu}, {"", "--auto-fit", "automatically pick DiT/VAE/Conditioner device placements based on " @@ -771,7 +782,9 @@ std::string SDContextParams::to_string() const { << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" + << " fit_compute_reserve: \"" << fit_compute_reserve << "\",\n" << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? 
"true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -791,6 +804,30 @@ std::string SDContextParams::to_string() const { } sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) { + // Fold the deprecated --*-on-cpu aliases into the generic backend spec. + // They are prepended so explicit --backend entries take precedence. + std::string alias_spec; + if (control_net_cpu) { + alias_spec += "control_net=cpu,"; + } + if (clip_on_cpu) { + alias_spec += "clip=cpu,"; + } + if (vae_on_cpu) { + alias_spec += "vae=cpu,"; + } + if (!alias_spec.empty()) { + backend = alias_spec + backend; + if (backend.back() == ',') { + backend.pop_back(); + } + control_net_cpu = false; + clip_on_cpu = false; + vae_on_cpu = false; + printf("warning: --clip-on-cpu / --vae-on-cpu / --control-net-cpu are deprecated, use --backend instead (folded into --backend \"%s\")\n", + backend.c_str()); + } + embedding_vec.clear(); embedding_vec.reserve(embedding_map.size()); for (const auto& kv : embedding_map) { @@ -850,10 +887,9 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f auto_fit, auto_fit_target_mb, auto_fit_dry_run, - auto_fit_compute_reserve_dit_mb, - auto_fit_compute_reserve_vae_mb, - auto_fit_compute_reserve_cond_mb, + fit_compute_reserve.c_str(), auto_multi_gpu, + multi_gpu_mode.c_str(), }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index 2fa798e7e..784d4fc77 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -165,14 +165,21 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; // Auto-fit defaults — placement is computed automatically based on free - // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device. 
- bool auto_fit = true; - int auto_fit_target_mb = 512; - bool auto_fit_dry_run = false; - int auto_fit_compute_reserve_dit_mb = 0; - int auto_fit_compute_reserve_vae_mb = 0; - int auto_fit_compute_reserve_cond_mb = 0; - bool auto_multi_gpu = true; + // VRAM. Pass --no-auto-fit to disable and use explicit --backend specs. + bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + // Per-component compute-buffer reserve in MiB as a component map, + // e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults. + std::string fit_compute_reserve; + bool auto_multi_gpu = true; + std::string multi_gpu_mode = "row"; + + // Deprecated aliases for --backend =cpu (kept for + // backwards compatibility with the pre-auto-fit CLI). + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/ggml b/ggml index 0ce7ad348..404fcb9d7 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421 +Subproject commit 404fcb9d7c96989569e68c9e7881ee3465a05c50 diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 3c5b59005..b1af537dc 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -230,21 +230,28 @@ typedef struct { // into the same backend assignment that `backend` / `params_backend` use). // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). // `auto_fit_dry_run` prints the plan and aborts init before loading. - // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the - // per-component compute-buffer reserve; 0 means use the built-in default. + // `auto_fit_compute_reserve` tunes the per-component compute-buffer + // reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512" + // (same component-key style as `backend`); missing keys / NULL keep the + // built-in defaults. 
bool auto_fit; int auto_fit_target_mb; bool auto_fit_dry_run; - int auto_fit_compute_reserve_dit_mb; - int auto_fit_compute_reserve_vae_mb; - int auto_fit_compute_reserve_cond_mb; + const char* auto_fit_compute_reserve; // When more than one GPU device is present, prefer placing different // components on different GPUs to balance load and fit larger total // working sets. Set false to keep all components on a single GPU when // they fit. Defaults to true. Each component still lives entirely on - // one device — no intra-tensor row split. + // one device unless multi_gpu_mode splits it (see below). bool auto_multi_gpu; + + // How to split a single component (currently only the DiT) across GPUs + // when it doesn't fit on one but fits across several: "row" (matmul rows + // split via the backend's stock split buffer type, CUDA/SYCL), + // "layer" (whole blocks per GPU, routed by a scheduler, backend-generic), + // or "off" (never split a single component). NULL / empty => "row". + const char* multi_gpu_mode; } sd_ctx_params_t; typedef struct { diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp index ee23d6418..17994cd42 100644 --- a/src/backend_fit.hpp +++ b/src/backend_fit.hpp @@ -15,9 +15,11 @@ // Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that // support streaming params from RAM at compute time). +#include #include #include #include +#include #include #include #include @@ -42,7 +44,9 @@ enum class ComponentKind { enum class Placement { CPU, GPU, - GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) }; struct Component { @@ -69,6 +73,13 @@ struct Decision { int device_id = DEVICE_ID_CPU; int64_t on_device_bytes = 0; int64_t on_host_bytes = 0; + + // Populated when placement == GPU_LAYER_SPLIT. 
// Strategy for spreading ONE component across several GPUs when it does not
// fit on a single device.
enum class MultiGpuMode {
    OFF,    // never split a single component across GPUs
    ROW,    // CUDA-only: row-split matmul weights via cuda_split_buffer_type
    LAYER,  // generic: assign block-indexed tensors to per-block backends + sched
};

// Returns the lowercase spelling of `m` ("off", "row" or "layer"), i.e. the
// same tokens str_to_multi_gpu_mode() accepts. The trailing "?" is only
// reachable for a value outside the enumerators above.
inline const char* multi_gpu_mode_str(MultiGpuMode m) {
    switch (m) {
        case MultiGpuMode::OFF:
            return "off";
        case MultiGpuMode::ROW:
            return "row";
        case MultiGpuMode::LAYER:
            return "layer";
    }
    return "?";
}

// Parses a multi-GPU mode name.
//
// Matching is ASCII case-insensitive so that "OFF" / "Layer" are honoured
// instead of silently falling through to the default (a user who asks for
// "OFF" must not end up with row-split). An empty or unrecognised string
// yields MultiGpuMode::ROW, the default.
inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) {
    // Lower-case by hand: locale-independent and needs no extra header.
    std::string key;
    key.reserve(s.size());
    for (char c : s) {
        key.push_back((c >= 'A' && c <= 'Z') ? char(c - 'A' + 'a') : c);
    }

    if (key == "off") {
        return MultiGpuMode::OFF;
    }
    if (key == "layer") {
        return MultiGpuMode::LAYER;
    }
    return MultiGpuMode::ROW;  // "row", empty, or unrecognised
}
+ name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { out = ComponentKind::CONDITIONER; return true; } @@ -188,19 +227,129 @@ inline std::vector enumerate_gpu_devices() { // --- Core algorithm ------------------------------------------------------- -// Peak per device = MAX of any single component's footprint on that device, -// because free_params_immediately frees params between phases so components -// time-share VRAM. +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs, + int64_t margin_bytes = 0) { + // Every participating device hosts its param share PLUS a full compute + // reserve (the sched allocates a compute buffer per backend), so weight the + // param shares by what remains AFTER compute + margin. This guarantees + // share_k + compute <= free_k - margin whenever the total fits at all; + // weighting by raw free overcommits the smaller GPU and the planner then + // rejects layer-split as infeasible (observed: 22B DiT fell to CPU). + std::vector avail(gpu_idxs.size(), 0); + int64_t total = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + int64_t a = std::max(0, devices[gpu_idxs[k]].free_bytes - compute_bytes - margin_bytes); + avail[k] = a; + total += a; + } + std::vector out(gpu_idxs.size(), 0); + if (total <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(avail[k]) / double(total); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + +// Per-GPU PARAM share for a row (tensor) split. 
Unlike layer-split, the graph +// runs on a single MAIN backend (the biggest GPU at gpu_idxs[main_pos]), so +// ONLY the main device also hosts the compute buffer. We therefore reserve +// `compute_bytes` of the main device's free VRAM before weighting, so the main +// doesn't get so many matmul rows that its compute buffer no longer fits. The +// caller adds compute_bytes back when computing the main device's peak. Returns +// param bytes per device (no compute folded in) — these become the split ratios. +inline std::vector row_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs, + size_t main_pos) { + std::vector avail(gpu_idxs.size(), 0); + int64_t total = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + int64_t a = std::max(0, devices[gpu_idxs[k]].free_bytes); + if (k == main_pos) { + a = std::max(0, a - compute_bytes); + } + avail[k] = a; + total += a; + } + std::vector out(gpu_idxs.size(), 0); + if (total <= 0) return out; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + out[k] = int64_t(double(params_bytes) * double(avail[k]) / double(total)); + } + return out; +} + +// Peak per device = MAX of any single component's footprint on that device. +// Components free their params between phases (free_params_immediately; the +// split runners load lazily and free after each phase too), so they time-share +// VRAM rather than coexisting — hence MAX, not sum. 
inline int64_t gpu_peak(int gpu_idx, const std::vector& pl, const std::vector& dev, - const std::vector& components) { + const std::vector& components, + const std::vector& devices = {}) { int64_t peak = 0; for (size_t i = 0; i < components.size(); i++) { - if (dev[i] != gpu_idx) continue; int64_t footprint = 0; if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + // Row-split: graph runs on the main (= biggest) GPU, which reserves + // its compute buffer; param rows are weighted by the remaining free. + auto shares = row_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs, size_t(biggest_slot)); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. 
+ const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. + int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; } peak = std::max(peak, footprint); } @@ -210,9 +359,13 @@ inline int64_t gpu_peak(int gpu_idx, inline Plan compute_plan(const std::vector& components, const std::vector& devices, int64_t margin_bytes, - bool allow_multi_gpu = true) { + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { const size_t nC = components.size(); const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } std::vector cap(nG, 0); for (size_t g = 0; g < nG; g++) { @@ -224,6 +377,24 @@ inline Plan compute_plan(const std::vector& components, int device_idx; }; + // ROW-split is DiT-exclusive. Keeping a single homogeneous row-split + // component (same tensor sizes every phase/generation) lets the driver + // reuse freed split-buffer chunks, which is what avoids the + // cuda_split_buffer fragmentation a ggml patch would otherwise be needed + // for. The DiT is also the per-step bottleneck, where row-split's small + // compute buffer matters most. + auto supports_tensor_split = [](ComponentKind k) { + return k == ComponentKind::DIT; + }; + // LAYER-split (regular per-device buffers routed by a scheduler) is + // general and fragmentation-free, so any block-structured component can + // use it. The Conditioner (e.g. Gemma) splits this way when it is too big + // for one GPU; its (larger) cross-backend compute buffer is acceptable + // because it runs once at encode time and frees before the DiT loop. 
+ auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + auto build_options = [&](const Component& c) { std::vector opts; for (size_t g = 0; g < nG; g++) { @@ -232,6 +403,25 @@ inline Plan compute_plan(const std::vector& components, opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); } } + if (nG >= 2) { + // ROW-split: DiT only, in row mode. Spans all GPUs (one option). + if (mode == MultiGpuMode::ROW && supports_tensor_split(c.kind)) { + opts.push_back({Placement::GPU_TENSOR_SPLIT, (1 << nG) - 1}); + } + // LAYER-split: the DiT in layer mode, and any OTHER layer-split + // candidate (the Conditioner) regardless of mode — non-DiT + // components never row-split, preserving the single-row invariant. + const bool want_layer = supports_layer_split(c.kind) && + (mode == MultiGpuMode::LAYER || + (mode == MultiGpuMode::ROW && !supports_tensor_split(c.kind))); + if (want_layer) { + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (__builtin_popcount(mask) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } + } opts.push_back({Placement::CPU, -1}); return opts; }; @@ -262,6 +452,22 @@ inline Plan compute_plan(const std::vector& components, } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { s += 5 * pw; gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. + s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). 
+ s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } } else { s -= 10 * pw; } @@ -299,7 +505,7 @@ inline Plan compute_plan(const std::vector& components, if (ok) { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -311,7 +517,7 @@ inline Plan compute_plan(const std::vector& components, } else { bool feasible = true; for (size_t g = 0; g < nG; g++) { - if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; } + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } } if (feasible) { int64_t sc = score(pl, dev); @@ -347,6 +553,66 @@ inline Plan compute_plan(const std::vector& components, d.device_id = DEVICE_ID_CPU; d.on_host_bytes = c.params_bytes + c.compute_bytes; plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (runs the graph + hosts the compute + // buffer + sub-runners that don't get their own spec). This matches + // the user's preference: always use the bigger GPU as main. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + // PARAM shares for the split ratio: the main (order[0]) reserves its + // compute buffer first so it doesn't get over-loaded with rows. 
+ auto shares = row_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs, order[0]); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + // split_share_bytes drives the row ratio in apply_dit -> keep it + // param-only. The main device's peak (params + compute) is folded + // into on_device_bytes for the plan display / feasibility. + d.split_share_bytes.push_back(shares[k]); + int64_t peak = shares[k] + (pos == 0 ? c.compute_bytes : 0); + max_share = std::max(max_share, peak); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. + std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? 
DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; } else { d.device_id = devices[best_dev[i]].id; if (best_pl[i] == Placement::GPU) { @@ -362,7 +628,7 @@ inline Plan compute_plan(const std::vector& components, } for (size_t g = 0; g < nG; g++) { - plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components); + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); } return plan; } @@ -372,6 +638,8 @@ inline const char* placement_str(Placement p) { case Placement::CPU: return "CPU"; case Placement::GPU: return "GPU"; case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; } return "?"; } @@ -407,6 +675,17 @@ inline void print_plan(const Plan& plan, LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", d.name.c_str(), d.device_id, (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { + std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? 
"row" : "layer"; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); } else { LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", d.name.c_str(), d.device_id, diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index 0cb3172b9..b851b80bf 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -119,6 +119,12 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} virtual void set_stream_layers_enabled(bool enabled) {} + // Multi-GPU + lazy-load hooks. Default no-op; LLM-backed conditioners + // forward them to their (heavy) LLM sub-runner so it can be split across + // GPUs (layer-split) and/or have its params alloc+load deferred to the + // first compute so it time-shares VRAM with the DiT. 
+ virtual void set_lazy_load(std::function fn) {} + virtual void set_multi_backend_spec(const MultiBackendSpec& spec) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} }; @@ -1488,6 +1494,14 @@ struct AnimaConditioner : public Conditioner { llm->set_stream_layers_enabled(enabled); } + void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -1642,6 +1656,14 @@ struct LLMEmbedder : public Conditioner { llm->set_stream_layers_enabled(enabled); } + void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -2229,6 +2251,16 @@ struct LTXAVEmbedder : public Conditioner { projector->set_flash_attention_enabled(enabled); } + // Split/lazy apply to the heavy LLM only; the small projector stays on the + // main backend and loads eagerly. 
+ void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { llm->set_max_graph_vram_bytes(max_vram_bytes); projector->set_max_graph_vram_bytes(max_vram_bytes); @@ -2267,6 +2299,7 @@ struct LTXAVEmbedder : public Conditioner { std::vector mask; tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength); + return {tokens, weights, mask}; } @@ -2304,6 +2337,7 @@ struct LTXAVEmbedder : public Conditioner { {}, true); GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), weights); int64_t valid_tokens = 0; @@ -2361,6 +2395,8 @@ struct LTXAVEmbedder : public Conditioner { } hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens}); + + return projector->compute(n_threads, hidden_states); } diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index d0326a192..f59785fdf 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1674,6 +1674,39 @@ struct GGMLRunnerContext { } }; +// Multi-GPU split of a single runner across several GPU backends, on stock +// ggml (no ggml patch needed). Two modes: +// LAYER_SPLIT: whole transformer blocks are assigned to per-block backends +// and a ggml_backend_sched routes cross-device ops. Works on +// any multi-GPU set. +// ROW_SPLIT: matmul weights are split row-wise via the backend's stock +// split buffer type (CUDA/SYCL `ggml_backend_split_buffer_type`), +// non-matmul weights live on the main GPU; sched still wires the +// extra backends so it can route the cross-device reductions. +// The split params are allocated once and kept resident (the runner is not +// freed+realloc'd between generations), which is what lets us avoid the +// split-buffer fragmentation a ggml patch would otherwise be needed for. 
+enum class MultiBackendMode { + LAYER_SPLIT, + ROW_SPLIT, +}; + +struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + // Extra GPU backends beyond the runner's main (runtime) backend. The main + // backend is implicit and is NOT listed here. Borrowed handles — owned by + // the SDBackendManager, never freed by the runner. + std::vector additional_backends; + // LAYER_SPLIT: map a param tensor to the backend that should hold it (the + // main backend, or one of additional_backends). nullptr => main. Keyed by + // tensor POINTER, not name: param tensors are unnamed at alloc time. + std::function tensor_backend_fn; + // ROW_SPLIT: per-device weight ratios (length = the backend registry's + // device count) and the main device index that owns the non-split portion. + std::vector tensor_split_ratios; + int main_device = 0; +}; + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1710,6 +1743,32 @@ struct GGMLRunner { bool stream_layers_enabled = false; size_t observed_max_effective_budget_ = 0; + // --- multi-GPU split state (layer-split via sched OR row-split via the + // stock split buffer type). Inactive unless set_multi_backend_spec() + // was called before alloc_params_buffer(). --- + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; + std::vector additional_backends; // borrowed (manager-owned) + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; // owned + bool sched_reserved = false; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + // LAYER_SPLIT: one resident params buffer per participating backend. + std::vector multi_params_buffers; // owned + // ROW_SPLIT: resident split + main buffers and the split buft (buft is + // backend-cached, not owned). 
+ std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; // owned + ggml_backend_buffer_t row_main_buffer = nullptr; // owned + + // Lazy-load: when set, params alloc + tensor-data load is deferred to the + // first compute() (ensure_params_loaded) and freed after each phase, so + // components time-share VRAM instead of all coexisting at init. + std::function lazy_load_fn = nullptr; + sd::layer_registry::LayerRegistry layer_registry_; std::shared_ptr weight_adapter = nullptr; @@ -1894,7 +1953,167 @@ struct GGMLRunner { return true; } + // Build the multi-backend scheduler (lazily). Backends in priority order: + // main runtime backend, then the additional GPU backends, then a CPU + // fallback last (ggml_backend_sched_new requires the last backend be CPU). + bool ensure_sched() { + if (sched != nullptr) { + return true; + } + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) { + backends.push_back(b); + } + if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = sd_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + // Build an explicit per-backend buffer-type array instead of passing + // nullptr. ggml_backend_sched uses these in buffer_supported() to decide + // whether a cross-backend src needs a copy; with nullptr it synthesizes + // them from default backend types, and CUDA devices can spuriously report + // supporting each other's buffers -> a needed copy is skipped and a node + // (e.g. a cont in attention) reads another device's memory -> illegal + // access. For the trailing CPU slot, use device-0's host buffer type + // (pinned host memory) exactly as llama.cpp does (llama-context.cpp). 
+ std::vector bufts; + bufts.reserve(backends.size()); + ggml_backend_dev_t dev0 = ggml_backend_get_device(runtime_backend); + for (auto* b : backends) { + if (b == cpu_fallback_backend && dev0 != nullptr) { + ggml_backend_buffer_type_t host = ggml_backend_dev_host_buffer_type(dev0); + bufts.push_back(host != nullptr ? host : ggml_backend_get_default_buffer_type(b)); + } else { + bufts.push_back(ggml_backend_get_default_buffer_type(b)); + } + } + sched = ggml_backend_sched_new(backends.data(), + bufts.data(), + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + + // Map a weight tensor to the backend it was allocated on in a layer split. + ggml_backend_t backend_of_weight(ggml_tensor* t) const { + if (t == nullptr || t->buffer == nullptr) { + return nullptr; + } + if (ggml_backend_buffer_get_usage(t->buffer) != GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + return nullptr; + } + for (size_t i = 0; i < multi_params_buffers.size(); i++) { + if (multi_params_buffers[i] == t->buffer) { + if (i == 0) { + return runtime_backend; + } + if (i - 1 < additional_backends.size()) { + return additional_backends[i - 1]; + } + } + } + return nullptr; + } + + // Pin compute nodes to their layer's device for a LAYER split. Stock + // ggml_backend_sched anchors weight-bearing ops (matmuls) to the weight's + // device, but weightless ops (norm, residual add, permute, cont) have no + // anchor and are placed by a heuristic that, for the attention `cont`, can + // land on the wrong device and then read it without a cross-device copy -> + // CUDA illegal access. llama.cpp pins each layer-boundary norm to the + // layer's device for exactly this reason (llama-context.cpp). 
We generalise: + // walk the graph in execution order, track the device of the most recently + // consumed weight (= the current layer's device), and pin every node to it. + // This forces clean per-layer cuts so sched copies only the residual stream + // across the boundary. No-op outside a layer split. + void pin_layer_split_nodes(ggml_cgraph* gf) { + if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::LAYER_SPLIT) { + return; + } + if (sched == nullptr || multi_params_buffers.empty() || gf == nullptr) { + return; + } + ggml_backend_t cur = runtime_backend; + const int n_nodes = ggml_graph_n_nodes(gf); + for (int i = 0; i < n_nodes; i++) { + ggml_tensor* node = ggml_graph_node(gf, i); + for (int s = 0; s < GGML_MAX_SRC; s++) { + ggml_backend_t wb = backend_of_weight(node->src[s]); + if (wb != nullptr) { + cur = wb; + } + } + // NEVER pin view ops (view/reshape/permute/transpose): a view + // assigned to a different backend than its view_src's data makes + // the sched skip the cross-device copy for consumers (the copy + // decision trusts the assigned id), and a kernel then dereferences + // the other device's pointer. The sched places views correctly on + // its own by following view_src. + if (node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) { + continue; + } + if (cur != nullptr && ggml_backend_supports_op(cur, node)) { + ggml_backend_sched_set_tensor_backend(sched, node, cur); + } + } + } + + // Pin un-allocated graph-input leaves (rope pe tables, timesteps, latents…) + // to the MAIN backend before sched alloc. Left to its own heuristics the + // sched places them on the CPU/host slot and emits per-split host->device + // input copies; those copies were observed landing LATE (first pass reads + // zeros / stale pool garbage, second pass reads the first pass's data). 
+    // Pinning them to the main backend makes our copy_data_to_backend_tensor
+    // fill a device-resident tensor directly (synchronous H2D) and removes the
+    // cross-backend input copies entirely.
+    void pin_input_leaves(ggml_cgraph* gf) {
+        // ROW_SPLIT only: the whole graph computes on the main backend, so
+        // graph inputs trivially belong there; pinning them avoids per-split
+        // host->device input copies. (Layer-split graphs span devices and the
+        // sched routes their inputs correctly on its own.)
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::ROW_SPLIT ||
+            sched == nullptr || gf == nullptr || runtime_backend == nullptr) {
+            return;
+        }
+        const int n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            // Scan every source slot and skip the empty ones rather than
+            // stopping at the first nullptr: an op with an optional operand can
+            // leave a gap in front of a later, populated slot, and an input
+            // leaf sitting behind that gap would otherwise never be pinned.
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_tensor* t = node->src[s];
+                if (t == nullptr) {
+                    continue;
+                }
+                // Pin the tensor that owns the storage, not a view of it.
+                while (t->view_src != nullptr) {
+                    t = t->view_src;
+                }
+                // op NONE + no buffer yet = a graph input the sched will
+                // allocate (weights already sit in params buffers).
+                if (t->op == GGML_OP_NONE && t->buffer == nullptr) {
+                    ggml_backend_sched_set_tensor_backend(sched, t, runtime_backend);
+                }
+            }
+        }
+    }
+
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        if (multi_backend_mode) {
+            // Do NOT ggml_backend_sched_reserve(gf) here: reserve runs
+            // split_graph, which REWIRES gf's src pointers to sched-internal
+            // copy tensors. execute_graph then sched_alloc_graph's the SAME gf,
+            // and the second split sees the stale reserve-epoch copies (measure
+            // layout) as valid inputs — silently corrupting every cross-backend
+            // input (garbage rope pe, garbage Gemma stack) or crashing. A graph
+            // must be split at most once; the first sched_alloc_graph in
+            // execute_graph performs the real allocation instead.
+ return ensure_sched(); + } if (compute_allocr != nullptr) { return true; } @@ -2417,13 +2636,15 @@ struct GGMLRunner { max_graph_vram_bytes > 0 && plan.segments.size() > 1 && params_backend != runtime_backend && - !sd_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend) && + !multi_backend_mode; } bool can_attempt_graph_cut_segmented_compute() const { return max_graph_vram_bytes > 0 && params_backend != runtime_backend && - !sd_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend) && + !multi_backend_mode; } bool resolve_graph_cut_plan(ggml_cgraph* gf, @@ -2657,7 +2878,18 @@ struct GGMLRunner { return std::nullopt; } - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + pin_layer_split_nodes(gf); // reset clears pins; re-apply before alloc + pin_input_leaves(gf); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc compute graph failed", get_desc().c_str()); + if (free_compute_buffer_immediately) { + free_compute_buffer(); + } + return std::nullopt; + } + } else if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); if (free_compute_buffer_immediately) { free_compute_buffer(); @@ -2674,9 +2906,20 @@ struct GGMLRunner { if (sd_backend_is_cpu(runtime_backend)) { sd_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend != nullptr && sd_backend_is_cpu(cpu_fallback_backend)) { + sd_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } int64_t t_compute_begin = ggml_time_ms(); - ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } int64_t 
t_compute_end = ggml_time_ms(); if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); @@ -3002,6 +3245,16 @@ struct GGMLRunner { free_params_ctx(); free_compute_ctx(); free_cache_ctx_and_buffer(); + // Multi-GPU split teardown. additional_backends are owned by the + // SDBackendManager (not freed here); row_split_buft is backend-cached. + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } } virtual GGMLRunnerContext get_context() { @@ -3028,7 +3281,207 @@ struct GGMLRunner { alloc_compute_ctx(); } + // Row-split eligibility: contiguous, rank-2, both dims >= 256, not a view. + // 1D biases/norms, embeddings, small projections and views fall back to the + // main GPU's regular per-device buft. Excluding views respects the split + // buft's documented contract (GGML_ASSERT(view_src == nullptr)) so we never + // need to patch ggml. + static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + // ROW_SPLIT: matmul-eligible weights -> row_split_buft (split row-wise + // across GPUs by the CUDA/SYCL backend), everything else -> the main GPU's + // default buft. Each is allocated ONCE into a single resident buffer and + // suballocated via ggml_tallocr — no per-tensor churn, no free->realloc. 
+    bool alloc_params_buffer_row_split() {
+        if (row_split_buft == nullptr) {
+            LOG_ERROR("%s row-split buft not initialized (backend lacks ggml_backend_split_buffer_type)",
+                      get_desc().c_str());
+            return false;
+        }
+        ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend);
+        const size_t main_align  = ggml_backend_buft_get_alignment(main_buft);
+        const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft);
+
+        // Undo a partial allocation. params_loaded() treats a non-null
+        // row_main_buffer / row_split_buffer as "params are resident", so a
+        // failed attempt must not leave either behind: the runner would look
+        // loaded, later calls would skip allocation, and the buffers would leak.
+        // Tensors already placed in these buffers are reset to unallocated so
+        // they do not keep pointers into freed memory.
+        auto release_partial = [this]() {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                if (t->buffer != nullptr && (t->buffer == row_main_buffer || t->buffer == row_split_buffer)) {
+                    t->data   = nullptr;
+                    t->buffer = nullptr;
+                }
+            }
+            if (row_main_buffer != nullptr) {
+                ggml_backend_buffer_free(row_main_buffer);
+                row_main_buffer = nullptr;
+            }
+            if (row_split_buffer != nullptr) {
+                ggml_backend_buffer_free(row_split_buffer);
+                row_split_buffer = nullptr;
+            }
+        };
+
+        // Pass 1: size the two resident buffers (padded per tensor to the
+        // owning buffer type's alignment).
+        size_t main_size = 0, split_size = 0;
+        size_t main_count = 0, split_count = 0;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            if (is_row_split_eligible(t)) {
+                split_size += GGML_PAD(ggml_backend_buft_get_alloc_size(row_split_buft, t), split_align);
+                split_count++;
+            } else {
+                main_size += GGML_PAD(ggml_backend_buft_get_alloc_size(main_buft, t), main_align);
+                main_count++;
+            }
+        }
+
+        if (main_size > 0) {
+            row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size);
+            if (row_main_buffer == nullptr) {
+                LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", get_desc().c_str(), main_size / (1024.f * 1024.f));
+                release_partial();
+                return false;
+            }
+        }
+        if (split_size > 0) {
+            row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size);
+            if (row_split_buffer == nullptr) {
+                LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", get_desc().c_str(), split_size / (1024.f * 1024.f));
+                release_partial();  // also drops the main buffer allocated above
+                return false;
+            }
+        }
+
+        // Pass 2: suballocate every tensor from its buffer, using the same
+        // eligibility test as pass 1 so the sizes computed there are sufficient.
+        ggml_tallocr main_alloc{};
+        ggml_tallocr split_alloc{};
+        if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer);
+        if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer);
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            ggml_status st = is_row_split_eligible(t) ? ggml_tallocr_alloc(&split_alloc, t) : ggml_tallocr_alloc(&main_alloc, t);
+            if (st != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s row-split tallocr_alloc failed", get_desc().c_str());
+                release_partial();
+                return false;
+            }
+        }
+        if (row_main_buffer != nullptr) ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        if (row_split_buffer != nullptr) ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        rebuild_params_tensor_set();
+        LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)",
+                 get_desc().c_str(), main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count);
+        return true;
+    }
+
+    // LAYER_SPLIT: assign each param tensor to a backend (via tensor_backend_fn,
+    // keyed by tensor pointer), allocate one resident buffer per backend on its
+    // default buft, and suballocate via ggml_tallocr.
+    // Returns false on any allocation failure, in which case no buffer is left
+    // allocated and multi_params_buffers is empty again.
+    bool alloc_params_buffer_layer_split() {
+        // Index 0 is the runtime backend; index i > 0 is additional_backends[i - 1].
+        std::vector<ggml_backend_t> backends;
+        backends.push_back(runtime_backend);
+        for (auto* b : additional_backends) backends.push_back(b);
+
+        std::vector<ggml_backend_buffer_type_t> bufts(backends.size());
+        std::vector<size_t> aligns(backends.size());
+        std::vector<size_t> sizes(backends.size(), 0);
+        std::vector<size_t> counts(backends.size(), 0);
+        for (size_t i = 0; i < backends.size(); i++) {
+            bufts[i]  = ggml_backend_get_default_buffer_type(backends[i]);
+            aligns[i] = ggml_backend_buft_get_alignment(bufts[i]);
+        }
+
+        // Undo a partial allocation. params_loaded() treats a non-empty
+        // multi_params_buffers as "params are resident", and the assign() below
+        // makes it non-empty before anything is known to have succeeded, so
+        // every failure path must free what was allocated and clear the vector.
+        // Tensors already placed in these buffers are reset to unallocated so
+        // they do not keep pointers into freed memory.
+        auto release_partial = [this]() {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                for (auto* buf : multi_params_buffers) {
+                    if (buf != nullptr && buf == t->buffer) {
+                        t->data   = nullptr;
+                        t->buffer = nullptr;
+                        break;
+                    }
+                }
+            }
+            for (auto* buf : multi_params_buffers) {
+                if (buf != nullptr) {
+                    ggml_backend_buffer_free(buf);
+                }
+            }
+            multi_params_buffers.clear();
+        };
+
+        // Pass 1: pick a backend index per tensor (0 = runtime backend when
+        // tensor_backend_fn is unset, returns nullptr, or names an unknown
+        // backend) and accumulate the padded size each backend needs.
+        std::map<ggml_tensor*, int> tensor_backend_idx;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            int idx = 0;
+            if (tensor_backend_fn) {
+                ggml_backend_t target = tensor_backend_fn(t);
+                if (target != nullptr) {
+                    for (size_t i = 0; i < backends.size(); i++) {
+                        if (backends[i] == target) { idx = int(i); break; }
+                    }
+                }
+            }
+            tensor_backend_idx[t] = idx;
+            sizes[idx] += GGML_PAD(ggml_backend_buft_get_alloc_size(bufts[idx], t), aligns[idx]);
+            counts[idx] += 1;
+        }
+
+        // Pass 2: one resident buffer per backend that received any bytes.
+        multi_params_buffers.assign(backends.size(), nullptr);
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (sizes[i] == 0) continue;
+            ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]);
+            size_t free_pre = 0, total_pre = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_pre, &total_pre);
+            multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]);
+            if (multi_params_buffers[i] == nullptr) {
+                LOG_ERROR("%s layer-split alloc on %s failed (%.1f MB)", get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f));
+                release_partial();  // frees the buffers of the backends before this one
+                return false;
+            }
+            size_t free_post = 0, total_post = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_post, &total_post);
+            LOG_DEBUG("%s layer-split alloc[%zu] %s req=%.1f MB dev_free %.1f -> %.1f MB is_host=%d",
+                      get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f),
+                      free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f),
+                      (int)ggml_backend_buffer_is_host(multi_params_buffers[i]));
+        }
+
+        // Pass 3: suballocate each tensor from the buffer of its backend.
+        std::vector<ggml_tallocr> tallocs(backends.size());
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (multi_params_buffers[i] != nullptr) tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]);
+        }
+        for (auto& kv : tensor_backend_idx) {
+            if (ggml_tallocr_alloc(&tallocs[kv.second], kv.first) != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s layer-split tallocr_alloc failed", get_desc().c_str());
+                release_partial();
+                return false;
+            }
+        }
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        }
+        rebuild_params_tensor_set();
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (counts[i] == 0) continue;
+            LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)",
+                     get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), counts[i]);
+        }
+        return true;
+    }
+
+    // Lazy mode: defer alloc + tensor-data load until the first compute().
+    // The caller still runs alloc_params_buffer + get_param_tensors at init,
+    // but for a lazy runner alloc_params_buffer is a no-op and the bulk loader
+    // skips this runner's tensors (they have no buffer yet); ensure_params_loaded()
+    // then allocates and invokes lazy_load_fn() on demand, and the params are
+    // freed after the phase (free_params_immediately) so components time-share VRAM.
+    // `fn` must return true when the tensor data was loaded successfully.
+    void set_lazy_load(std::function<bool()> fn) {
+        lazy_load_fn = std::move(fn);
+    }
+
+    // True once a (non-lazy) buffer exists OR a lazy load has materialized one.
+    // Note that this only inspects the buffer handles, so any code path that
+    // fails half-way must release them or the runner keeps reporting "loaded".
+    bool params_loaded() const {
+        return params_buffer != nullptr || !multi_params_buffers.empty() ||
+               row_split_buffer != nullptr || row_main_buffer != nullptr;
+    }
+
+    // Make sure the params are resident before a compute(). Returns true when
+    // they already are, when there is nothing to load, or when the lazy
+    // alloc + load succeeded; returns false when either step failed.
+    bool ensure_params_loaded() {
+        if (params_loaded()) {
+            return true;
+        }
+        if (!lazy_load_fn) {
+            // Non-lazy runner with no buffer: either it had no tensors, or its
+            // params are mmap-resident (data already set). Nothing to do.
+            return true;
+        }
+        int64_t t0 = ggml_time_ms();
+        if (!do_alloc_params_buffer()) {
+            // Drop whatever was allocated before the failure so params_loaded()
+            // stays false and the VRAM is not held by a runner that cannot run.
+            free_params_buffer();
+            return false;
+        }
+        if (!lazy_load_fn()) {
+            LOG_ERROR("%s: lazy params load failed", get_desc().c_str());
+            // The buffers exist but hold no (or partial) weight data. Release
+            // them: otherwise the next compute() would see params_loaded() ==
+            // true, skip the load, and run on uninitialised weights.
+            free_params_buffer();
+            return false;
+        }
+        LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (ggml_time_ms() - t0) / 1000.f);
+        return true;
+    }
+
     bool alloc_params_buffer() {
+        // Defer to first compute() for lazy runners (see set_lazy_load).
+        if (lazy_load_fn) {
+            return true;
+        }
+        return do_alloc_params_buffer();
+    }
+
+    bool do_alloc_params_buffer() {
+        if (multi_backend_mode) {
+            // Split allocation bypasses the mmap fast-path: the params must land
+            // in the GPU split buffers, not stay mmap'd.
+ if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } + return alloc_params_buffer_layer_split(); + } size_t num_tensors = ggml_tensor_num(params_ctx); if (num_tensors > 0) { // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated @@ -3086,14 +3539,53 @@ struct GGMLRunner { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + // Multi-GPU split buffers (layer-split: one per backend; row-split: + // split + main). The split buft itself is backend-cached, not freed. + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } + // Release the multi-backend scheduler as well. Its reserved compute + // buffers can be GBs on each device, and free_compute_buffer only + // sched_reset()s them (kept alive across the sampling loop to avoid a + // per-step rebuild). free_params_buffer is the end-of-phase release, so + // here we actually free the sched so the next component can claim that + // VRAM (time-share). It is recreated lazily on the next compute(). 
+ if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } observed_max_effective_budget_ = 0; } size_t get_params_buffer_size() { + size_t total = 0; if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); + total += ggml_backend_buffer_get_size(params_buffer); + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + total += ggml_backend_buffer_get_size(buf); + } + } + if (row_split_buffer != nullptr) { + total += ggml_backend_buffer_get_size(row_split_buffer); } - return 0; + if (row_main_buffer != nullptr) { + total += ggml_backend_buffer_get_size(row_main_buffer); + } + return total; } void free_cache_ctx_and_buffer() { @@ -3106,12 +3598,25 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset (not free): keeping the sched alive across the sampling + // loop's compute() calls avoids a per-step rebuild. It is freed in + // the destructor. + ggml_backend_sched_reset(sched); + sched_reserved = false; + } restore_partial_params(); restore_all_params(); } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so it + // gets a concrete backend assignment (tensors with no producers and no + // consumers otherwise stay at backend_id = -1 and never get a buffer). + if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -3174,6 +3679,11 @@ struct GGMLRunner { int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { + // Lazy runners allocate + load their params here, on first use of the + // phase; they were skipped at init so components time-share VRAM. 
+ if (!ensure_params_loaded()) { + return std::nullopt; + } ggml_cgraph* gf = nullptr; if (!prepare_compute_graph(get_graph, &gf)) { return std::nullopt; @@ -3240,6 +3750,45 @@ struct GGMLRunner { stream_layers_enabled = enabled; } + // Configure a multi-GPU split for this runner. Must be called AFTER + // construction + get_param_tensors() and BEFORE alloc_params_buffer(). + // For ROW_SPLIT, resolves the backend's stock split buffer type; if the + // backend has none (non-CUDA/SYCL), it cleanly falls back to single-GPU. + void set_multi_backend_spec(const MultiBackendSpec& spec) { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { + LOG_ERROR("%s set_multi_backend_spec called after params were allocated; ignoring", + get_desc().c_str()); + return; + } + multi_backend_mode = true; + multi_backend_kind = spec.mode; + additional_backends = spec.additional_backends; + tensor_backend_fn = spec.tensor_backend_fn; + row_split_ratios = spec.tensor_split_ratios; + row_main_device = spec.main_device; + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = sd_backend_split_buffer_type( + runtime_backend, + row_main_device, + row_split_ratios.empty() ? nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("%s row-split unavailable on this backend; falling back to single-GPU", + get_desc().c_str()); + multi_backend_mode = false; + additional_backends.clear(); + tensor_backend_fn = nullptr; + return; + } + } + // Streaming (graph-cut param offload) is mutually exclusive with split. 
+ stream_layers_enabled = false; + } + + bool is_multi_backend() const { + return multi_backend_mode; + } + sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } ggml_backend_t get_runtime_backend() { diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp index d085129db..d8f43c90d 100644 --- a/src/core/ggml_extend_backend.cpp +++ b/src/core/ggml_extend_backend.cpp @@ -507,6 +507,10 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) { return init_cached_backend(name); } +ggml_backend_t SDBackendManager::ensure_backend(const std::string& device_name) { + return init_cached_backend(device_name); +} + bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) { return sd_backend_is_cpu(runtime_backend(module)); } @@ -654,3 +658,22 @@ const char* sd_backend_module_name(SDBackendModule module) { } return "unknown"; } + +ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + if (backend == nullptr) { + return nullptr; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev == nullptr) { + return nullptr; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (reg == nullptr) { + return nullptr; + } + auto fn = (ggml_backend_split_buffer_type_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); + if (fn == nullptr) { + return nullptr; // backend has no row-split support (non-CUDA/SYCL) + } + return fn(main_device, tensor_split); +} diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h index fc071ffda..24b53c1ad 100644 --- a/src/core/ggml_extend_backend.h +++ b/src/core/ggml_extend_backend.h @@ -61,6 +61,12 @@ class SDBackendManager { ggml_backend_t runtime_backend(SDBackendModule module); ggml_backend_t params_backend(SDBackendModule module); + // Return (creating + caching on first use) the backend for an explicit + // ggml device 
name (e.g. "CUDA1"). Used to obtain the additional GPU + // backends a multi-GPU split needs; the manager owns the handle and frees + // it once at teardown, so callers only borrow it. + ggml_backend_t ensure_backend(const std::string& device_name); + bool runtime_backend_is_cpu(SDBackendModule module); bool params_backend_is_cpu(SDBackendModule module); bool runtime_backend_supports_host_buffer(SDBackendModule module); @@ -76,4 +82,12 @@ ggml_backend_t sd_backend_cpu_init(); bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); const char* sd_backend_module_name(SDBackendModule module); void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); + +// Runtime lookup of a backend's row-split buffer type, published by the CUDA +// and SYCL backends as the "ggml_backend_split_buffer_type" proc. Returns +// nullptr when the backend does not support row-split (the caller then falls +// back to a non-split single-GPU path). `tensor_split` is a per-device weight +// array of length = the backend registry's device count; `main_device` is the +// index of the device that owns the non-split portion. +ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split); #endif // __SD_CORE_GGML_EXTEND_BACKEND_H__ diff --git a/src/core/util.cpp b/src/core/util.cpp index 61101a08b..d50f3770f 100644 --- a/src/core/util.cpp +++ b/src/core/util.cpp @@ -25,6 +25,7 @@ #include #endif +#include "ggml-backend.h" #include "ggml.h" #include "stable-diffusion.h" @@ -972,3 +973,12 @@ std::vector> split_quotation_attention( } return result; } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + printf("%s\t%s\n", name ? name : "", desc ? 
desc : ""); + } +} diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp index a86b4cf50..fd149ce13 100644 --- a/src/model/diffusion/ltxv.hpp +++ b/src/model/diffusion/ltxv.hpp @@ -1606,8 +1606,13 @@ namespace LTXV { if (config.cross_attention_adaln) { auto prompt_adaln_single = std::dynamic_pointer_cast(blocks["prompt_adaln_single"]); auto audio_prompt_adaln_single = std::dynamic_pointer_cast(blocks["audio_prompt_adaln_single"]); - v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first; - a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first; + // The reference feeds modality.sigma (the RAW per-batch sigma) to + // both prompt adalns. effective_audio_timestep is exactly that: + // audio timesteps are never denoise-masked, so it carries the + // unmasked sigma even in i2v. The VIDEO timestep tensor is the + // denoise-masked per-token one and must NOT be used here. + v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first; + a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first; } auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 640c049ad..829260f00 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -197,6 +197,30 @@ class StableDiffusionGGML { std::string backend_spec; std::string params_backend_spec; + // DiT multi-GPU split decision captured from the auto-fit plan and applied + // to the diffusion runner(s) before param load. OFF when the DiT is not + // split. device_ids[0] is the "main" GPU (largest); share_bytes is the + // per-device VRAM share (same order as device_ids). 
+ backend_fit::MultiGpuMode fit_dit_split_mode = backend_fit::MultiGpuMode::OFF; + std::vector fit_dit_split_device_names; // ggml device names, [0] = main + std::vector fit_dit_split_share_bytes; + // Conditioner (LLM) split decision — always layer-split when it splits + // (only the DiT ever row-splits; see backend_fit::supports_tensor_split). + backend_fit::MultiGpuMode fit_cond_split_mode = backend_fit::MultiGpuMode::OFF; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + + // Kept alive past init() so lazy-load callbacks can re-read tensors from the + // model files on demand. Populated only when auto_lazy_load is on. + std::unique_ptr owned_model_loader; + // Auto-fit decided the components can't all be resident at once (the + // per-component MAX plan only fits if they time-share), so defer the heavy + // components' param alloc+load to their compute phase and free after. + bool auto_lazy_load = false; + // auto-fit is on: when a VAE decode OOMs we may auto-enable tiling and retry + // (temporal for LTX video, spatial otherwise) instead of failing. + bool auto_fit_enabled = false; + bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -254,6 +278,338 @@ class StableDiffusionGGML { return ensure_backend_pair(SDBackendModule::DIFFUSION); } + // Parse a transformer block index out of a weight name, or -1 if none. 
+ static int dit_block_index_of(const std::string& name) { + static const char* kw[] = {"transformer_blocks.", "joint_blocks.", "double_blocks.", + "single_blocks.", "blocks.", "layers."}; + for (const char* k : kw) { + size_t p = name.find(k); + if (p == std::string::npos) { + continue; + } + p += strlen(k); + size_t e = p; + while (e < name.size() && name[e] >= '0' && name[e] <= '9') { + e++; + } + if (e > p) { + return atoi(name.substr(p, e - p).c_str()); + } + } + return -1; + } + + // Build a MultiBackendSpec from the auto-fit DiT split decision and apply it + // to a diffusion runner BEFORE its params are allocated. No-op when the DiT + // is not split. Always returns true (any failure falls back to single-GPU). + bool apply_dit_multi_gpu_split(const std::shared_ptr& runner, + ModelLoader& model_loader) { + if (!runner || fit_dit_split_mode == backend_fit::MultiGpuMode::OFF || + fit_dit_split_device_names.size() < 2) { + return true; + } + const auto& devnames = fit_dit_split_device_names; + const auto& shares = fit_dit_split_share_bytes; + ggml_backend_t main_backend = runner->get_runtime_backend(); + MultiBackendSpec spec; + + if (fit_dit_split_mode == backend_fit::MultiGpuMode::ROW) { + // ROW: one main backend; matmul rows are split across the devices by + // the stock split buft. sched still needs the extra backends so it + // can route the cross-device reductions. 
+ auto reg_prefix_of = [](const std::string& n) -> std::string { + size_t i = 0; + while (i < n.size() && !(n[i] >= '0' && n[i] <= '9')) { + i++; + } + return n.substr(0, i); + }; + std::string reg_name = reg_prefix_of(devnames[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) { + LOG_WARN("row-split: backend registry '%s' not found; using single GPU", reg_name.c_str()); + return true; + } + int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) { + return true; + } + auto reg_index_of = [&](const std::string& n) -> int { + if (n.rfind(reg_name, 0) != 0) { + return -1; + } + try { + return std::stoi(n.substr(reg_name.size())); + } catch (...) { + return -1; + } + }; + int64_t total = 0; + for (auto b : shares) { + total += b; + } + if (total <= 0) { + return true; + } + std::vector ratios(dev_count, 0.f); + for (size_t k = 0; k < devnames.size(); k++) { + int idx = reg_index_of(devnames[k]); + if (idx < 0 || idx >= dev_count) { + continue; + } + ratios[idx] = float(double(shares[k]) / double(total)); + } + // The main device must be the runner's runtime backend, which the + // planner set to devnames[0] (the largest-VRAM GPU, listed first). + // Keeping these aligned ensures the split buft's non-split portion + // and the runner's compute buffer live on the same device. 
+ int main_dev = reg_index_of(devnames[0]); + if (main_dev < 0 || main_dev >= dev_count) { + return true; + } + for (size_t k = 0; k < devnames.size(); k++) { + int idx = reg_index_of(devnames[k]); + if (idx == main_dev || idx < 0) { + continue; + } + ggml_backend_t b = backend_manager.ensure_backend(devnames[k]); + if (b != nullptr) { + spec.additional_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", devnames[k].c_str()); + } + } + spec.mode = MultiBackendMode::ROW_SPLIT; + spec.tensor_split_ratios = ratios; + spec.main_device = main_dev; + LOG_INFO("DiT row-split across %zu devices (main reg-index %d)", devnames.size(), main_dev); + } else { + // LAYER: assign contiguous block ranges to per-device backends. + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < devnames.size(); k++) { + ggml_backend_t b = backend_manager.ensure_backend(devnames[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init backend %s; using single GPU", devnames[k].c_str()); + return true; + } + spec.additional_backends.push_back(b); + all_backends.push_back(b); + } + const std::string tensor_prefix = "model.diffusion_model."; + std::map block_bytes; + int64_t non_block_bytes = 0; + int max_block_idx = -1; + for (const auto& kv : model_loader.get_tensor_storage_map()) { + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + block_bytes[idx] += bytes; + if (idx > max_block_idx) { + max_block_idx = idx; + } + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no transformer blocks found; using single GPU"); + return true; + } + const int n_blocks = max_block_idx + 1; + int64_t total_share = 0, total_block = 0; + for (auto s : shares) { + total_share += s; + } + for (const auto& kv : block_bytes) { + total_block += 
kv.second; + } + if (total_share <= 0) { + return true; + } + std::vector budgets(shares.size(), 0); + for (size_t k = 0; k < shares.size(); k++) { + int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share)); + if (k == 0) { + b = std::max(b - non_block_bytes, 0); // backend 0 also holds non-block weights + } + budgets[k] = b; + } + std::vector boundaries(shares.size(), 0); + size_t cur = 0; + int64_t cur_use = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) { + boundaries[cur] = b; + cur++; + cur_use = 0; + } + cur_use += bb; + } + for (size_t k = cur; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) { + boundaries[k] = std::min(min_bound, n_blocks); + } + } + // Map each param tensor pointer to its backend (block range -> device). + auto ptr_backend = std::make_shared>(); + std::map dit_map; + runner->get_param_tensors(dit_map); + for (const auto& kv : dit_map) { + ggml_backend_t target = all_backends[0]; + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) == 0) { + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + for (size_t k = 0; k < boundaries.size(); k++) { + if (idx < boundaries[k]) { + target = all_backends[std::min(k, all_backends.size() - 1)]; + break; + } + } + } + } + (*ptr_backend)[kv.second] = target; + } + spec.mode = MultiBackendMode::LAYER_SPLIT; + spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t { + auto it = ptr_backend->find(t); + return it != ptr_backend->end() ? 
it->second : main_backend;
+            };
+            LOG_INFO("DiT layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        }
+
+        runner->set_multi_backend_spec(spec);
+        return true;
+    }
+
+    // Conditioner (LLM) layer-split: same block-partition approach as the DiT
+    // layer-split, but applied to the conditioner's LLM sub-runner (tensors
+    // under "text_encoders.llm."). LAYER only — the conditioner never row-splits
+    // (only the DiT does, preserving the single-row-component invariant). The
+    // conditioner's small projector stays on the main backend.
+    //
+    // cond:         conditioner to configure; a null pointer is a no-op.
+    // model_loader: source of the per-tensor byte sizes used to size each
+    //               device's block range.
+    // Returns true on every path: each fallback leaves the conditioner without
+    // a multi-backend spec (single backend) and, apart from the "split not
+    // requested" guards, logs why.
+    bool apply_cond_multi_gpu_split(const std::shared_ptr<Conditioner>& cond, ModelLoader& model_loader) {
+        if (!cond || fit_cond_split_mode == backend_fit::MultiGpuMode::OFF ||
+            fit_cond_split_device_names.size() < 2) {
+            return true;
+        }
+        ggml_backend_t main_backend = backend_for(SDBackendModule::TE);
+        if (main_backend == nullptr) {
+            return true;
+        }
+        const auto& devnames = fit_cond_split_device_names;
+        const auto& shares   = fit_cond_split_share_bytes;
+
+        // Check the recorded shares before initializing any extra backend, so a
+        // plan that cannot be partitioned does not bring up devices it will
+        // never use.
+        int64_t total_share = 0;
+        for (auto s : shares) {
+            total_share += s;
+        }
+        if (total_share <= 0) {
+            LOG_WARN("cond layer-split: no positive per-device share recorded; using single GPU");
+            return true;
+        }
+        if (shares.size() != devnames.size()) {
+            // Not fatal: the range lookup below clamps to the last backend.
+            LOG_WARN("cond layer-split: %zu shares for %zu devices; the partition may be uneven",
+                     shares.size(), devnames.size());
+        }
+
+        std::vector<ggml_backend_t> all_backends;
+        all_backends.push_back(main_backend);
+        MultiBackendSpec spec;
+        for (size_t k = 1; k < devnames.size(); k++) {
+            ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+            if (b == nullptr) {
+                LOG_WARN("cond layer-split: failed to init backend %s; using single GPU", devnames[k].c_str());
+                return true;
+            }
+            spec.additional_backends.push_back(b);
+            all_backends.push_back(b);
+        }
+
+        // Sum the stored bytes per transformer block; everything else under the
+        // prefix counts as non-block weight, which stays on backend 0.
+        const std::string tensor_prefix = "text_encoders.llm.";
+        std::map<int, int64_t> block_bytes;
+        int64_t non_block_bytes = 0;
+        int max_block_idx       = -1;
+        for (const auto& kv : model_loader.get_tensor_storage_map()) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;
+            }
+            int64_t bytes = (int64_t)kv.second.nbytes();
+            int idx       = dit_block_index_of(kv.first);
+            if (idx >= 0) {
+                block_bytes[idx] += bytes;
+                max_block_idx = std::max(max_block_idx, idx);
+            } else {
+                non_block_bytes += bytes;
+            }
+        }
+        if (max_block_idx < 0) {
+            LOG_WARN("cond layer-split: no transformer blocks under '%s'; using single GPU", tensor_prefix.c_str());
+            return true;
+        }
+        const int n_blocks  = max_block_idx + 1;
+        int64_t total_block = 0;
+        for (const auto& kv : block_bytes) {
+            total_block += kv.second;
+        }
+
+        // Byte budget per device, proportional to its share of the total.
+        std::vector<int64_t> budgets(shares.size(), 0);
+        for (size_t k = 0; k < shares.size(); k++) {
+            int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share));
+            if (k == 0) {
+                b = std::max<int64_t>(b - non_block_bytes, 0);  // backend 0 also holds non-block weights
+            }
+            budgets[k] = b;
+        }
+
+        // boundaries[k] is the exclusive end block of device k's contiguous range.
+        std::vector<int> boundaries(shares.size(), 0);
+        size_t cur      = 0;
+        int64_t cur_use = 0;
+        for (int b = 0; b < n_blocks; b++) {
+            // Read-only lookup: indices with no tensors count as zero bytes
+            // without inserting entries into the map.
+            const auto found = block_bytes.find(b);
+            const int64_t bb = found != block_bytes.end() ? found->second : 0;
+            if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) {
+                boundaries[cur] = b;
+                cur++;
+                cur_use = 0;
+            }
+            cur_use += bb;
+        }
+        for (size_t k = cur; k < boundaries.size(); k++) {
+            boundaries[k] = n_blocks;
+        }
+        // Make the boundaries strictly increasing (capped at n_blocks).
+        for (size_t k = 0; k < boundaries.size(); k++) {
+            int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1;
+            if (boundaries[k] < min_bound) {
+                boundaries[k] = std::min(min_bound, n_blocks);
+            }
+        }
+
+        // Map each LLM param tensor pointer to its backend (block range -> device).
+        auto ptr_backend = std::make_shared<std::map<ggml_tensor*, ggml_backend_t>>();
+        std::map<std::string, ggml_tensor*> cond_map;
+        cond->get_param_tensors(cond_map);
+        for (const auto& kv : cond_map) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;  // only the LLM tensors are split; projector stays on main
+            }
+            ggml_backend_t target = all_backends[0];
+            int idx               = dit_block_index_of(kv.first);
+            if (idx >= 0) {
+                for (size_t k = 0; k < boundaries.size(); k++) {
+                    if (idx < boundaries[k]) {
+                        target = all_backends[std::min(k, all_backends.size() - 1)];
+                        break;
+                    }
+                }
+            }
+            (*ptr_backend)[kv.second] = target;
+        }
+        spec.mode              = MultiBackendMode::LAYER_SPLIT;
+        spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t {
+            // Tensors that were never mapped fall back to the main backend.
+            auto it = ptr_backend->find(t);
+            return it != ptr_backend->end() ? it->second : main_backend;
+        };
+        cond->set_multi_backend_spec(spec);
+        LOG_INFO("Conditioner LLM layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        return true;
+    }
+
     std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
         if (rng_type == STD_DEFAULT_RNG) {
             return std::make_shared<STDDefaultRNG>();
@@ -300,7 +656,11 @@ class StableDiffusionGGML {
         // placements before the backends are created (see the auto-fit block
         // below, which feeds its plan into init_backend()).

-        ModelLoader model_loader;
+        // Owned by the SD object so lazy-load callbacks can re-read tensors
+        // after init() returns. `model_loader` aliases it, so all the existing
+        // model_loader.* uses below are unchanged.
+        owned_model_loader = std::make_unique<ModelLoader>();
+        ModelLoader& model_loader = *owned_model_loader;

         if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
             LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
@@ -448,6 +808,7 @@ class StableDiffusionGGML {
             return oss.str();
         };

+        auto_fit_enabled = sd_ctx_params->auto_fit;
         if (sd_ctx_params->auto_fit) {
             if (!backend_spec.empty() || !params_backend_spec.empty()) {
                 LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend "
@@ -455,25 +816,71 @@ class StableDiffusionGGML {
             }

             backend_fit::ComputeReserves reserves;
-            if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) {
-                reserves.dit_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB;
-            }
-            if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) {
-                reserves.vae_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB;
-            }
-            if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) {
-                reserves.conditioner_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB;
+            // Parse the per-component reserve map ("dit=2048,vae=1024,cond=512").
+            // Missing keys keep the built-in defaults. Each value must be a plain
+            // positive decimal MiB count; anything else is skipped with a warning
+            // instead of being silently truncated ("2GB" must not become 2 MiB).
+            if (sd_ctx_params->auto_fit_compute_reserve != nullptr) {
+                std::string spec(sd_ctx_params->auto_fit_compute_reserve);
+                size_t pos = 0;
+                while (pos < spec.size()) {
+                    size_t comma = spec.find(',', pos);
+                    std::string entry = spec.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+                    pos = comma == std::string::npos ? spec.size() : comma + 1;
+                    size_t eq = entry.find('=');
+                    if (eq == std::string::npos) {
+                        LOG_WARN("auto-fit: ignoring malformed compute-reserve entry '%s' (expected component=MiB)", entry.c_str());
+                        continue;
+                    }
+                    std::string key = entry.substr(0, eq);
+                    // strtoll reports where parsing stopped (atoll cannot) and
+                    // saturates on overflow instead of invoking undefined behaviour.
+                    const char* value_begin = entry.c_str() + eq + 1;
+                    char* value_end = nullptr;
+                    int64_t mib = std::strtoll(value_begin, &value_end, 10);
+                    const bool has_digits = value_end != value_begin;
+                    while (*value_end == ' ' || *value_end == '\t') {
+                        value_end++;  // tolerate trailing blanks such as "dit=2048 ,vae=1024"
+                    }
+                    // The upper bound keeps mib * MiB below from overflowing int64_t.
+                    if (!has_digits || *value_end != '\0' || mib <= 0 ||
+                        mib > INT64_MAX / backend_fit::MiB) {
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (value must be a positive MiB count)", entry.c_str());
+                        continue;
+                    }
+                    backend_fit::ComponentKind kind;
+                    if (key == "dit" || key == "diffusion" || key == "model" || key == "unet") {
+                        kind = backend_fit::ComponentKind::DIT;
+                    } else if (key == "vae") {
+                        kind = backend_fit::ComponentKind::VAE;
+                    } else if (key == "cond" || key == "conditioner" || key == "te" || key == "clip") {
+                        kind = backend_fit::ComponentKind::CONDITIONER;
+                    } else {
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (unknown component, expected dit/vae/cond)", entry.c_str());
+                        continue;
+                    }
+                    switch (kind) {
+                        case backend_fit::ComponentKind::DIT:
+                            reserves.dit_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::VAE:
+                            reserves.vae_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::CONDITIONER:
+                            reserves.conditioner_bytes = mib * backend_fit::MiB;
+                            break;
+                    }
+                }
             }

             auto components = backend_fit::estimate_components(
                 model_loader, wtype, /*alignment=*/64, reserves);
             auto devices = backend_fit::enumerate_gpu_devices();
             int64_t margin_bytes = int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB;
+            backend_fit::MultiGpuMode multi_gpu_mode =
+                backend_fit::str_to_multi_gpu_mode(SAFE_STR(sd_ctx_params->multi_gpu_mode));
             auto plan = backend_fit::compute_plan(
-                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu);
+                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu, multi_gpu_mode);
             backend_fit::print_plan(plan, components, devices, margin_bytes);

             if (sd_ctx_params->auto_fit_dry_run) {
@@ -498,6 +893,14 @@ class
StableDiffusionGGML { spec += "="; spec += value; }; + auto dev_name_by_id = [&](int id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == id) { + return dev.name; + } + } + return ""; + }; auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) { if (d == nullptr) { return; @@ -506,13 +909,36 @@ class StableDiffusionGGML { append_assignment(runtime_spec, module_key, "cpu"); return; } - std::string dev_name; - for (const auto& dev : devices) { - if (dev.id == d->device_id) { - dev_name = dev.name; - break; + // Multi-GPU split (DiT only): the runner's main backend is the + // largest participating GPU (split_device_ids[0]); the actual + // per-tensor distribution is applied later via a MultiBackendSpec + // (see prepare_*_split_spec). Record the decision for that step. + if (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT || + d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + std::string main_dev = d->split_device_ids.empty() ? "" : dev_name_by_id(d->split_device_ids[0]); + if (main_dev.empty()) { + return; // fall back to default backend + } + append_assignment(runtime_spec, module_key, main_dev); + backend_fit::MultiGpuMode m = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) + ? 
backend_fit::MultiGpuMode::ROW + : backend_fit::MultiGpuMode::LAYER; + std::vector names; + for (int id : d->split_device_ids) { + names.push_back(dev_name_by_id(id)); } + if (std::string(module_key) == "diffusion") { + fit_dit_split_mode = m; + fit_dit_split_device_names = names; + fit_dit_split_share_bytes = d->split_share_bytes; + } else if (std::string(module_key) == "te") { + fit_cond_split_mode = m; + fit_cond_split_device_names = names; + fit_cond_split_share_bytes = d->split_share_bytes; + } + return; } + std::string dev_name = dev_name_by_id(d->device_id); if (dev_name.empty()) { return; // no matching device; fall back to the default backend } @@ -530,6 +956,17 @@ class StableDiffusionGGML { LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'", backend_spec.empty() ? "(default)" : backend_spec.c_str(), params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str()); + + // When a component is split across GPUs the working set is tight: + // the split component (and the others sharing those GPUs) cannot all + // be resident at once. Enable lazy-load so the DiT / conditioner / + // VAE defer their param alloc+load to their compute phase and free + // after, time-sharing VRAM (the per-component MAX plan assumes this). + if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF || + fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) { + auto_lazy_load = true; + LOG_INFO("auto-fit: enabling lazy-load (components time-share VRAM across phases)"); + } } // Create the backends now that the placement (manual or auto-fit) is @@ -859,9 +1296,20 @@ class StableDiffusionGGML { cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); + // When the DiT is split across GPUs its params live resident in the + // (per-device) split buffers, so it must not be mmap'd and must not + // use the RAM-streaming path (mutually exclusive with split). 
+ const bool dit_split = fit_dit_split_mode != backend_fit::MultiGpuMode::OFF && + fit_dit_split_device_names.size() >= 2; + if (dit_split && stream_layers) { + LOG_WARN("--stream-layers is ignored for the diffusion model when it is " + "split across GPUs (--multi-gpu-mode=%s)", + backend_fit::multi_gpu_mode_str(fit_dit_split_mode)); + } + diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - diffusion_model->set_stream_layers_enabled(stream_layers); - get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); + diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers); + get_param_tensors(diffusion_model, dit_split ? false : module_can_mmap(SDBackendModule::DIFFUSION)); if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -869,8 +1317,8 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); - get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); + high_noise_diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers); + get_param_tensors(high_noise_diffusion_model, dit_split ? false : module_can_mmap(SDBackendModule::DIFFUSION)); } if (!ensure_backend_pair(SDBackendModule::VAE)) { @@ -1129,6 +1577,78 @@ class StableDiffusionGGML { ignore_tensors.insert("model.visual.deepstack_merger_list."); } + // --- Multi-GPU split + lazy-load (auto-fit) ------------------------ + // Apply the split specs to the heavy runners BEFORE any params alloc, + // then (when the plan needs time-sharing) mark them lazy: drop their + // tensors from the bulk load + mmap here and load them on the first + // compute() of their phase instead, freeing after, so the DiT / + // conditioner / VAE share VRAM rather than coexisting. 
+ apply_dit_multi_gpu_split(diffusion_model, model_loader); + apply_dit_multi_gpu_split(high_noise_diffusion_model, model_loader); + apply_cond_multi_gpu_split(cond_stage_model, model_loader); + + if (auto_lazy_load) { + const int lazy_threads = std::min(n_threads > 0 ? n_threads : 2, 2); + const bool lazy_mmap = sd_ctx_params->enable_mmap; + ModelLoader* loader_ptr = owned_model_loader.get(); + // Defer a component's params: drop its tensors from the bulk load + + // mmap set, and register a callback that loads just those tensors on + // first compute. `only_prefix` restricts to a sub-runner (the + // conditioner only lazy-loads its LLM; the small projector stays + // eager). set_lazy_load makes the runner's alloc a no-op at init. + // `collect` lets each component gather its own param tensors with the + // right call arity: DiffusionModel/Conditioner expose a 1-arg + // get_param_tensors() that bakes in their prefix, while VAE only has + // the 2-arg form (prefix is caller-supplied, no default). A single + // templated call can't cover both, so the caller passes a closure. 
+            using LazyTensorMap = std::map<std::string, ggml_tensor*>;
+            // make_lazy(component, collect, only_prefix):
+            //   component   - runner to defer; a null pointer is skipped.
+            //   collect     - fills a name -> tensor map with the component's params.
+            //   only_prefix - when non-empty, only tensors whose name starts with it
+            //                 are deferred; the rest stay in the bulk load.
+            // Deferred names are removed from `tensors` / `mmap_able_tensors` and
+            // added to `ignore_tensors`; the registered callback loads them through
+            // the owned ModelLoader when the component invokes it.
+            auto make_lazy = [&](auto&& component,
+                                 const std::function<void(LazyTensorMap&)>& collect,
+                                 const std::string& only_prefix) {
+                if (!component) {
+                    return;
+                }
+                LazyTensorMap collected;
+                collect(collected);
+                auto deferred = std::make_shared<LazyTensorMap>();
+                for (const auto& entry : collected) {
+                    const std::string& tensor_name = entry.first;
+                    const bool in_scope = only_prefix.empty() ||
+                                          tensor_name.compare(0, only_prefix.size(), only_prefix) == 0;
+                    if (in_scope) {
+                        deferred->emplace(tensor_name, entry.second);
+                        tensors.erase(tensor_name);
+                        mmap_able_tensors.erase(tensor_name);
+                        ignore_tensors.insert(tensor_name);
+                    }
+                }
+                if (deferred->empty()) {
+                    return;
+                }
+                component->set_lazy_load([loader_ptr, deferred, lazy_threads, lazy_mmap]() -> bool {
+                    // Each invocation hands the loader a fresh copy of the map.
+                    auto working_copy = *deferred;
+                    return loader_ptr->load_tensors(working_copy, {}, lazy_threads, lazy_mmap);
+                });
+                LOG_INFO("auto-fit: deferring %zu tensors to first compute (lazy-load)", deferred->size());
+            };
+            make_lazy(
+                diffusion_model,
+                [&](LazyTensorMap& out) { diffusion_model->get_param_tensors(out); },
+                "");
+            make_lazy(
+                high_noise_diffusion_model,
+                [&](LazyTensorMap& out) { high_noise_diffusion_model->get_param_tensors(out); },
+                "");
+            make_lazy(
+                cond_stage_model,
+                [&](LazyTensorMap& out) { cond_stage_model->get_param_tensors(out); },
+                "text_encoders.llm.");
+            // The VAE must also time-share: left eager it squats its ~1.4 GB on
+            // its placed GPU through the conditioner and DiT phases, which on a
+            // tight card is exactly enough to OOM the layer-split conditioner's
+            // compute buffer. Defer it like the rest (prefix "first_stage_model"
+            // matches its loader tensor names).
+ make_lazy(first_stage_model, + [&](std::map& m) { first_stage_model->get_param_tensors(m, "first_stage_model"); }, + ""); + } + // ------------------------------------------------------------------ + if (enable_mmap_tensors) { if (mmap_able_tensors.empty()) { LOG_DEBUG("no tensors could be memory-mapped"); @@ -2446,7 +2966,35 @@ class StableDiffusionGGML { } auto latents = first_stage_model->diffusion_to_vae_latents(x); first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); - return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + auto decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + // Auto-fit tiling fallback: a full-frame video decode can need ~10 GB of + // compute buffer and OOM (a graceful failure -> empty result, not an + // abort). Under auto-fit, enable tiling and retry once instead of failing. + // Temporal tiling is LTX-only (its 3D VAE supports temporal_tile_frames); + // every other architecture falls back to ordinary spatial tiling. + if (decoded.empty() && auto_fit_enabled) { + bool changed = false; + if (version == VERSION_LTXAV) { + if (!vae_tiling_params.temporal_tiling) { + vae_tiling_params.temporal_tiling = true; + changed = true; + } + } else if (!vae_tiling_params.enabled) { + vae_tiling_params.enabled = true; + // Reasonable default tile if the user didn't set one. + if (vae_tiling_params.tile_size_x <= 0) vae_tiling_params.tile_size_x = 256; + if (vae_tiling_params.tile_size_y <= 0) vae_tiling_params.tile_size_y = 256; + changed = true; + } + if (changed) { + LOG_WARN("auto-fit: VAE decode failed (likely OOM); retrying with %s tiling", + version == VERSION_LTXAV ? 
"temporal" : "spatial"); + first_stage_model->free_compute_buffer(); + first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); + decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + } + } + return decoded; } sd::Tensor normalize_ltx_video_latents(const sd::Tensor& x) { @@ -2802,10 +3350,9 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->auto_fit = true; sd_ctx_params->auto_fit_target_mb = 512; sd_ctx_params->auto_fit_dry_run = false; - sd_ctx_params->auto_fit_compute_reserve_dit_mb = 0; - sd_ctx_params->auto_fit_compute_reserve_vae_mb = 0; - sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0; + sd_ctx_params->auto_fit_compute_reserve = nullptr; sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -2847,10 +3394,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "auto_fit: %s\n" "auto_fit_target_mb: %d\n" "auto_fit_dry_run: %s\n" - "auto_fit_compute_reserve_dit_mb: %d\n" - "auto_fit_compute_reserve_vae_mb: %d\n" - "auto_fit_compute_reserve_cond_mb: %d\n" + "auto_fit_compute_reserve: %s\n" "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2891,10 +3437,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->auto_fit), sd_ctx_params->auto_fit_target_mb, BOOL_STR(sd_ctx_params->auto_fit_dry_run), - sd_ctx_params->auto_fit_compute_reserve_dit_mb, - sd_ctx_params->auto_fit_compute_reserve_vae_mb, - sd_ctx_params->auto_fit_compute_reserve_cond_mb, + SAFE_STR(sd_ctx_params->auto_fit_compute_reserve), BOOL_STR(sd_ctx_params->auto_multi_gpu), + SAFE_STR(sd_ctx_params->multi_gpu_mode), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), diff --git a/src/version.cpp 
b/src/version.cpp index 6c266153c..97dc8426b 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -1,6 +1,3 @@ -#include - -#include "ggml-backend.h" #include "stable-diffusion.h" #ifndef SDCPP_BUILD_COMMIT @@ -21,12 +18,3 @@ const char* sd_commit(void) { const char* sd_version(void) { return STRINGIZE(SDCPP_BUILD_VERSION); } - -void sd_list_devices(void) { - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - const char* name = ggml_backend_dev_name(dev); - const char* desc = ggml_backend_dev_description(dev); - std::printf("%s\t%s\n", name ? name : "", desc ? desc : ""); - } -}