From 4086c96be1e5521e3f820c49533b1a6ac027b115 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Thu, 30 Apr 2026 13:39:29 +0200
Subject: [PATCH 1/2] feat: auto-fit component placement and per-component
 backend devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an auto-fit planner that picks DiT / VAE / Conditioner device
placements from free GPU memory, treating each component as atomic
(no intra-tensor row split — equivalent to llama.cpp's
LLAMA_SPLIT_MODE_LAYER at component granularity, so views never land
on a split buffer and no ggml patch is needed).

Also adopt the PR #1184 CLI conventions:
- new: --main-backend-device, --diffusion-backend-device,
  --clip-backend-device, --vae-backend-device,
  --control-net-backend-device, --tae-backend-device,
  --upscaler-backend-device, --photomaker-backend-device,
  --vision-backend-device, --list-devices
- removed: --clip-on-cpu, --vae-on-cpu, --control-net-cpu
  (and the matching keep_*_on_cpu fields on sd_ctx_params_t)

Auto-fit knobs: --auto-fit / --no-auto-fit, --no-multi-gpu,
--fit-target, --fit-compute-reserve-{dit,vae,cond}, --fit-dry-run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/common/common.cpp |  73 ++++--
 examples/common/common.h   |  23 +-
 include/stable-diffusion.h |  30 ++-
 src/backend_fit.hpp        | 441 +++++++++++++++++++++++++++++++++++++
 src/model_loader.h         |   2 +
 src/stable-diffusion.cpp   | 173 ++++++++++++---
 src/version.cpp            |  12 +
 7 files changed, 694 insertions(+), 60 deletions(-)
 create mode 100644 src/backend_fit.hpp

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 3ae5faba7..52f7635e4 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -435,6 +435,23 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-t5-mask-pad",
          "t5 mask pad size of chroma",
          &chroma_t5_mask_pad},
+        {"",
+         "--fit-target",
+         "auto-fit: MiB of free memory to leave on each GPU (default: 512)",
+         &auto_fit_target_mb},
+        {"",
+         "--fit-compute-reserve-dit",
+         "auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
+         "(0 keeps the built-in default)",
+         &auto_fit_compute_reserve_dit_mb},
+        {"",
+         "--fit-compute-reserve-vae",
+         "auto-fit: MiB reserved on the VAE's GPU for its compute buffer",
+         &auto_fit_compute_reserve_vae_mb},
+        {"",
+         "--fit-compute-reserve-cond",
+         "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer",
+         &auto_fit_compute_reserve_cond_mb},
     };
 
     options.float_options = {
@@ -461,18 +478,6 @@ ArgOptions SDContextParams::get_options() {
          "--mmap",
          "whether to memory-map model",
          true, &enable_mmap},
-        {"",
-         "--control-net-cpu",
-         "keep controlnet in cpu (for low vram)",
-         true, &control_net_cpu},
-        {"",
-         "--clip-on-cpu",
-         "keep clip in cpu (for low vram)",
-         true, &clip_on_cpu},
-        {"",
-         "--vae-on-cpu",
-         "keep vae in cpu (for low vram)",
-         true, &vae_on_cpu},
         {"",
          "--fa",
          "use flash attention",
@@ -513,6 +518,24 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-enable-t5-mask",
          "enable t5 mask for chroma",
          true, &chroma_use_t5_mask},
+        {"",
+         "--auto-fit",
+         "automatically pick DiT/VAE/Conditioner device placements based on "
+         "free GPU memory (default ON)",
+         true, &auto_fit},
+        {"",
+         "--no-auto-fit",
+         "disable auto-fit and use the explicit --backend / --params-backend flags",
+         false, &auto_fit},
+        {"",
+         "--no-multi-gpu",
+         "auto-fit: keep all components on a single GPU when they fit "
+         "(by default, multi-GPU placements are preferred to balance load)",
+         false, &auto_multi_gpu},
+        {"",
+         "--fit-dry-run",
+         "auto-fit: print the computed plan and exit without loading models",
+         true, &auto_fit_dry_run},
     };
 
     auto on_type_arg = [&](int argc, const char** argv, int index) {
@@ -611,6 +634,15 @@ ArgOptions SDContextParams::get_options() {
          "but it usually offers faster inference speed and, in some cases, lower memory usage. "
          "The at_runtime mode, on the other hand, is exactly the opposite.",
          on_lora_apply_mode_arg},
+        {"",
+         "--list-devices",
+         "list available ggml backend devices (one per line, "
+         "name<TAB>description) and exit",
+         [](int /*argc*/, const char** /*argv*/, int /*index*/) {
+             sd_list_devices();
+             std::exit(0);
+             return 0;
+         }},
     };
 
     return options;
@@ -736,9 +768,10 @@ std::string SDContextParams::to_string() const {
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
-        << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
-        << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
-        << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+        << "  auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
+        << "  auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
+        << "  auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
+        << "  auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
         << "  flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
@@ -797,9 +830,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         lora_apply_mode,
         offload_params_to_cpu,
         enable_mmap,
-        clip_on_cpu,
-        control_net_cpu,
-        vae_on_cpu,
         flash_attn,
         diffusion_flash_attn,
         taesd_preview,
@@ -817,6 +847,13 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         stream_layers,
         backend.c_str(),
         params_backend.c_str(),
+        auto_fit,
+        auto_fit_target_mb,
+        auto_fit_dry_run,
+        auto_fit_compute_reserve_dit_mb,
+        auto_fit_compute_reserve_vae_mb,
+        auto_fit_compute_reserve_cond_mb,
+        auto_multi_gpu,
     };
     return sd_ctx_params;
 }
diff --git a/examples/common/common.h b/examples/common/common.h
index a90a33132..2fa798e7e 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -148,14 +148,11 @@ struct SDContextParams {
     bool stream_layers          = false;
     std::string backend;
     std::string params_backend;
-    bool enable_mmap           = false;
-    bool control_net_cpu       = false;
-    bool clip_on_cpu           = false;
-    bool vae_on_cpu            = false;
-    bool flash_attn            = false;
-    bool diffusion_flash_attn  = false;
-    bool diffusion_conv_direct = false;
-    bool vae_conv_direct       = false;
+    bool enable_mmap            = false;
+    bool flash_attn             = false;
+    bool diffusion_flash_attn   = false;
+    bool diffusion_conv_direct  = false;
+    bool vae_conv_direct        = false;
 
     bool circular   = false;
     bool circular_x = false;
@@ -167,6 +164,16 @@ struct SDContextParams {
 
     bool qwen_image_zero_cond_t = false;
 
+    // Auto-fit defaults — placement is computed automatically based on free
+    // VRAM. Pass --no-auto-fit to use explicit --backend / --params-backend.
+    bool auto_fit                         = true;
+    int  auto_fit_target_mb               = 512;
+    bool auto_fit_dry_run                 = false;
+    int  auto_fit_compute_reserve_dit_mb  = 0;
+    int  auto_fit_compute_reserve_vae_mb  = 0;
+    int  auto_fit_compute_reserve_cond_mb = 0;
+    bool auto_multi_gpu                   = true;
+
     prediction_t prediction           = PREDICTION_COUNT;
     lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 17596f849..3c5b59005 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -206,9 +206,6 @@ typedef struct {
     enum lora_apply_mode_t lora_apply_mode;
     bool offload_params_to_cpu;
     bool enable_mmap;
-    bool keep_clip_on_cpu;
-    bool keep_control_net_on_cpu;
-    bool keep_vae_on_cpu;
     bool flash_attn;
     bool diffusion_flash_attn;
     bool tae_preview_only;
@@ -226,6 +223,28 @@ typedef struct {
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
     const char* backend;
     const char* params_backend;
+
+    // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
+    // When `auto_fit` is true (default), `backend` / `params_backend` are
+    // ignored and the placement is computed automatically (the plan is fed
+    // into the same backend assignment that `backend` / `params_backend` use).
+    // `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
+    // `auto_fit_dry_run` prints the plan and aborts init before loading.
+    // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
+    // per-component compute-buffer reserve; 0 means use the built-in default.
+    bool auto_fit;
+    int  auto_fit_target_mb;
+    bool auto_fit_dry_run;
+    int  auto_fit_compute_reserve_dit_mb;
+    int  auto_fit_compute_reserve_vae_mb;
+    int  auto_fit_compute_reserve_cond_mb;
+
+    // When more than one GPU device is present, prefer placing different
+    // components on different GPUs to balance load and fit larger total
+    // working sets. Set false to keep all components on a single GPU when
+    // they fit. Defaults to true. Each component still lives entirely on
+    // one device — no intra-tensor row split.
+    bool auto_multi_gpu;
 } sd_ctx_params_t;
 
 typedef struct {
@@ -491,6 +510,11 @@ SD_API bool preprocess_canny(sd_image_t image,
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);
 
+// List available ggml backend devices to stdout, in `name<TAB>description<NL>`
+// per-line format. The output is intended to be parsed by tools and used as
+// device names in the --backend / --params-backend assignment specs.
+SD_API void sd_list_devices(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp
new file mode 100644
index 000000000..ee23d6418
--- /dev/null
+++ b/src/backend_fit.hpp
@@ -0,0 +1,441 @@
+#ifndef __SD_BACKEND_FIT_HPP__
+#define __SD_BACKEND_FIT_HPP__
+
+// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the
+// available GPU devices and system RAM.
+//
+// Each component is treated as a single atomic unit that lives entirely on
+// one device (plus its compute buffer on the same device). There is no
+// intra-tensor row split: cross-device parallelism comes from placing
+// different components on different GPUs, not from splitting individual
+// matmul weights — the equivalent of llama.cpp's LLAMA_SPLIT_MODE_LAYER
+// at the component granularity.
+//
+// Placement priority: DiT + compute buffer -> Conditioner -> VAE.
+// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that
+// support streaming params from RAM at compute time).
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "model_loader.h"
+#include "core/util.h"
+
+namespace backend_fit {
+
+constexpr int64_t MiB           = 1024 * 1024;
+constexpr int     DEVICE_ID_CPU = -1;
+
+enum class ComponentKind {
+    DIT,          // value 0 — estimate_components() indexes its byte totals by int(kind)
+    VAE,          // value 1
+    CONDITIONER,  // value 2
+};
+
+enum class Placement {
+    CPU,                 // not assigned to any GPU (compute_plan pairs it with device index -1)
+    GPU,                 // assigned to a single GPU
+    GPU_OFFLOAD_PARAMS,  // params in RAM, compute on GPU
+};
+
+struct Component {
+    ComponentKind kind;
+    std::string   name;                      // label used when the plan is logged
+    int64_t       params_bytes     = 0;      // estimated weight bytes (see estimate_components)
+    int64_t       compute_bytes    = 0;      // compute-buffer reserve counted with the weights
+    bool          supports_offload = false;  // if true, GPU_OFFLOAD_PARAMS is a candidate placement
+};
+
+struct Device {
+    int                id = DEVICE_ID_CPU;     // GPU ordinal assigned by enumerate_gpu_devices()
+    std::string        name;                   // from ggml_backend_dev_name()
+    std::string        description;            // from ggml_backend_dev_description()
+    int64_t            free_bytes  = 0;        // free memory from ggml_backend_dev_memory()
+    int64_t            total_bytes = 0;        // total memory from ggml_backend_dev_memory()
+    ggml_backend_dev_t dev         = nullptr;  // backing ggml device handle (GPU only)
+};
+
+struct Decision {
+    ComponentKind kind;
+    std::string   name;
+    Placement     placement       = Placement::CPU;
+    int           device_id       = DEVICE_ID_CPU;  // Device::id of the chosen GPU, or DEVICE_ID_CPU
+    int64_t       on_device_bytes = 0;              // bytes compute_plan() attributes to the GPU
+    int64_t       on_host_bytes   = 0;              // bytes compute_plan() attributes to host RAM
+};
+
+struct Plan {
+    std::vector<Decision>  decisions;            // one entry per input component, in input order
+    std::map<int, int64_t> device_bytes;         // Device::id -> projected peak bytes (gpu_peak)
+    int64_t                host_bytes  = 0;      // sum of Decision::on_host_bytes
+    bool                   any_changes = false;  // set when any component is on CPU or offloaded
+};
+
+struct ComputeReserves {
+    int64_t dit_bytes         = int64_t(2048) * MiB;  // built-in default: 2048 MiB
+    int64_t vae_bytes         = int64_t(1024) * MiB;  // built-in default: 1024 MiB
+    int64_t conditioner_bytes = int64_t(512) * MiB;   // built-in default: 512 MiB
+};
+
+// --- Classification -------------------------------------------------------
+
+// Maps a tensor name to the component it belongs to; returns false (leaving
+// `out` untouched) when the name matches none of the known patterns.
+inline bool classify_tensor(const std::string& name, ComponentKind& out) {
+    const auto has        = [&name](const char* s) { return name.find(s) != std::string::npos; };
+    const auto has_prefix = [&name](const char* s) { return name.rfind(s, 0) == 0; };
+
+    // Checked in order: diffusion model, then VAE, then conditioner.
+    const bool is_dit = has("model.diffusion_model.") || has("unet.");
+    const bool is_vae = !is_dit &&
+                        (has("first_stage_model.") || has_prefix("vae.") || has_prefix("tae."));
+    const bool is_cond = !is_dit && !is_vae &&
+                         (has("text_encoders") || has("cond_stage_model") ||
+                          has("te.text_model.") || has("conditioner") ||
+                          has_prefix("text_encoder."));
+
+    if (is_dit) {
+        out = ComponentKind::DIT;
+    } else if (is_vae) {
+        out = ComponentKind::VAE;
+    } else if (is_cond) {
+        out = ComponentKind::CONDITIONER;
+    } else {
+        return false;
+    }
+    return true;
+}
+
+// --- Memory estimation ----------------------------------------------------
+
+// Sums the (possibly type-converted) size of every classified, in-use tensor
+// per component and pairs each total with its compute-buffer reserve.
+// `alignment` is added once per tensor. Returns DiT, VAE, Conditioner.
+inline std::vector<Component> estimate_components(ModelLoader&           loader,
+                                                  ggml_type              override_wtype,
+                                                  int64_t                alignment,
+                                                  const ComputeReserves& reserves) {
+    int64_t dit_total  = 0;
+    int64_t vae_total  = 0;
+    int64_t cond_total = 0;
+
+    auto& tensors = loader.get_tensor_storage_map();
+    for (auto& [key, stored] : tensors) {
+        TensorStorage ts = stored;  // work on a copy; the type may be rewritten below
+        ComponentKind kind;
+        if (is_unused_tensor(ts.name) || !classify_tensor(ts.name, kind)) {
+            continue;
+        }
+
+        // Size the tensor in the type it is expected to have once loaded.
+        const bool has_override = override_wtype != GGML_TYPE_COUNT;
+        if (has_override && loader.tensor_should_be_converted(ts, override_wtype)) {
+            ts.type = override_wtype;
+        } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) {
+            ts.type = ts.expected_type;
+        }
+
+        const int64_t padded = ts.nbytes() + alignment;
+        switch (kind) {
+            case ComponentKind::DIT:         dit_total += padded; break;
+            case ComponentKind::VAE:         vae_total += padded; break;
+            case ComponentKind::CONDITIONER: cond_total += padded; break;
+        }
+    }
+
+    return {{ComponentKind::DIT, "DiT", dit_total, reserves.dit_bytes, true},
+            {ComponentKind::VAE, "VAE", vae_total, reserves.vae_bytes, false},
+            {ComponentKind::CONDITIONER, "Conditioner", cond_total, reserves.conditioner_bytes, true}};
+}
+
+// --- Device enumeration ---------------------------------------------------
+
+// Lists every ggml device of type GPU with its current free/total memory.
+inline std::vector<Device> enumerate_gpu_devices() {
+    // Register the dynamically-loaded backends before querying the device
+    // list. This runs before SDBackendManager initializes any backend, so
+    // nothing else has triggered the (file-local) lazy load yet; the
+    // manager's own load-all-once guard short circuits afterwards because
+    // the device count is already non-zero.
+    ggml_backend_load_all();
+
+    std::vector<Device> gpus;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t handle = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(handle) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            size_t mem_free  = 0;
+            size_t mem_total = 0;
+            ggml_backend_dev_memory(handle, &mem_free, &mem_total);
+            Device gpu;
+            gpu.id          = int(gpus.size());  // GPUs are numbered in enumeration order
+            gpu.dev         = handle;
+            gpu.name        = ggml_backend_dev_name(handle);
+            gpu.description = ggml_backend_dev_description(handle);
+            gpu.free_bytes  = int64_t(mem_free);
+            gpu.total_bytes = int64_t(mem_total);
+            gpus.push_back(gpu);
+        }
+    }
+    return gpus;
+}
+
+// --- Core algorithm -------------------------------------------------------
+
+// Projected peak bytes on GPU `gpu_idx`: the largest params+compute footprint
+// among the components assigned to it (a MAX, not a sum). This assumes that
+// free_params_immediately frees params between phases, so components
+// time-share VRAM.
+inline int64_t gpu_peak(int                           gpu_idx,
+                        const std::vector<Placement>& pl,
+                        const std::vector<int>&       dev,
+                        const std::vector<Component>& components) {
+    int64_t largest = 0;
+    for (size_t c = 0; c < components.size(); ++c) {
+        const bool on_this_gpu = dev[c] == gpu_idx && pl[c] != Placement::CPU;
+        if (on_this_gpu) {
+            const Component& comp = components[c];
+            largest               = std::max(largest, comp.params_bytes + comp.compute_bytes);
+        }
+    }
+    return largest;
+}
+
+inline Plan compute_plan(const std::vector<Component>& components,
+                         const std::vector<Device>&    devices,
+                         int64_t                       margin_bytes,  // bytes to leave free on every GPU
+                         bool                          allow_multi_gpu = true) {
+    const size_t nC = components.size();
+    const size_t nG = devices.size();
+    // Exhaustive search over every combination of per-component options; cap[g] is GPU g's budget.
+    std::vector<int64_t> cap(nG, 0);
+    for (size_t g = 0; g < nG; g++) {
+        cap[g] = std::max<int64_t>(0, devices[g].free_bytes - margin_bytes);
+    }
+
+    struct OptionSlot {
+        Placement placement;
+        int       device_idx;
+    };
+    // Candidate slots for one component: each GPU (plus offload where supported), then CPU.
+    auto build_options = [&](const Component& c) {
+        std::vector<OptionSlot> opts;
+        for (size_t g = 0; g < nG; g++) {
+            opts.push_back({Placement::GPU, int(g)});
+            if (c.supports_offload) {
+                opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)});
+            }
+        }
+        opts.push_back({Placement::CPU, -1});
+        return opts;
+    };
+
+    std::vector<std::vector<OptionSlot>> options;
+    options.reserve(nC);
+    for (const Component& c : components) {
+        options.push_back(build_options(c));
+    }
+    // A larger weight makes a GPU placement count for more: DiT > Conditioner > VAE.
+    auto priority_weight = [](ComponentKind k) -> int {
+        switch (k) {
+            case ComponentKind::DIT:         return 300;
+            case ComponentKind::CONDITIONER: return 120;
+            case ComponentKind::VAE:         return 60;
+        }
+        return 1;
+    };
+    // Score: +10*w on GPU, +5*w offloaded, -10*w on CPU; +2 per distinct GPU if multi-GPU is allowed.
+    auto score = [&](const std::vector<Placement>& pl, const std::vector<int>& dev) {
+        int64_t       s = 0;
+        std::set<int> gpus_used;
+        for (size_t i = 0; i < nC; i++) {
+            const int pw = priority_weight(components[i].kind);
+            if (pl[i] == Placement::GPU) {
+                s += 10 * pw;
+                gpus_used.insert(dev[i]);
+            } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+                s += 5 * pw;
+                gpus_used.insert(dev[i]);
+            } else {
+                s -= 10 * pw;
+            }
+        }
+        if (allow_multi_gpu) {
+            s += 2 * int64_t(gpus_used.size());
+        }
+        return s;
+    };
+    // idx is a mixed-radix counter over options[i]; on equal scores the first combination found wins.
+    std::vector<size_t>    idx(nC, 0);
+    std::vector<Placement> best_pl;
+    std::vector<int>       best_dev;
+    int64_t                best_score = std::numeric_limits<int64_t>::min();
+    bool                   found_any  = false;
+
+    while (true) {
+        std::vector<Placement> pl(nC);
+        std::vector<int>       dev(nC);
+        for (size_t i = 0; i < nC; i++) {
+            pl[i]  = options[i][idx[i]].placement;
+            dev[i] = options[i][idx[i]].device_idx;
+        }
+        // Constraint: when multi-GPU is disabled, all GPU placements must
+        // share the same device index.
+        if (!allow_multi_gpu) {
+            int common = -1;
+            bool ok = true;
+            for (size_t i = 0; i < nC; i++) {
+                if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+                    if (common < 0) common = dev[i];
+                    else if (dev[i] != common) { ok = false; break; }
+                }
+            }
+            if (ok) {
+                bool feasible = true;
+                for (size_t g = 0; g < nG; g++) {
+                    if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; }
+                }
+                if (feasible) {
+                    int64_t sc = score(pl, dev);
+                    if (sc > best_score) {
+                        best_score = sc; best_pl = pl; best_dev = dev; found_any = true;
+                    }
+                }
+            }
+        } else {  // NOTE(review): same feasibility + scoring steps as the single-GPU branch above
+            bool feasible = true;
+            for (size_t g = 0; g < nG; g++) {
+                if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; }
+            }
+            if (feasible) {
+                int64_t sc = score(pl, dev);
+                if (sc > best_score) {
+                    best_score = sc; best_pl = pl; best_dev = dev; found_any = true;
+                }
+            }
+        }
+        // Advance the counter: bump digit 0 and carry into higher digits on wrap-around.
+        size_t pos = 0;
+        while (pos < nC) {
+            idx[pos]++;
+            if (idx[pos] < options[pos].size()) break;
+            idx[pos] = 0;
+            pos++;
+        }
+        if (pos >= nC) break;  // every digit wrapped: all combinations have been visited
+    }
+
+    Plan plan;
+    if (!found_any) {  // NOTE(review): the all-CPU combination has peak 0, so this looks unreachable
+        best_pl.assign(nC, Placement::CPU);
+        best_dev.assign(nC, -1);
+    }
+
+    for (size_t i = 0; i < nC; i++) {
+        const Component& c = components[i];
+        Decision         d;
+        d.kind      = c.kind;
+        d.name      = c.name;
+        d.placement = best_pl[i];
+        if (best_pl[i] == Placement::CPU) {
+            d.device_id      = DEVICE_ID_CPU;
+            d.on_host_bytes  = c.params_bytes + c.compute_bytes;
+            plan.any_changes = true;
+        } else {
+            d.device_id = devices[best_dev[i]].id;  // translate vector index -> Device::id
+            if (best_pl[i] == Placement::GPU) {
+                d.on_device_bytes = c.params_bytes + c.compute_bytes;
+            } else {
+                d.on_device_bytes = c.params_bytes + c.compute_bytes;  // NOTE(review): same figure as plain GPU
+                d.on_host_bytes   = c.params_bytes;
+                plan.any_changes  = true;
+            }
+        }
+        plan.decisions.push_back(d);
+        plan.host_bytes += d.on_host_bytes;
+    }
+
+    for (size_t g = 0; g < nG; g++) {
+        plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components);
+    }
+    return plan;
+}
+
+// Short text label for a placement value; falls back to "?" for a value
+// outside the enum.
+inline const char* placement_str(Placement p) {
+    if (p == Placement::CPU) return "CPU";
+    if (p == Placement::GPU) return "GPU";
+    if (p == Placement::GPU_OFFLOAD_PARAMS) return "GPU(params->RAM)";
+    return "?";
+}
+
+// Logs the auto-fit plan at INFO level: the detected GPUs, each component's
+// estimated size, the placement chosen per component, and the projected
+// per-device peak against the free memory that was reported.
+inline void print_plan(const Plan&                   plan,
+                       const std::vector<Component>& components,
+                       const std::vector<Device>&    devices,
+                       int64_t                       margin_bytes) {
+    // All sizes are logged in whole MiB (integer division).
+    const auto mib = [](int64_t bytes) { return (long long)(bytes / MiB); };
+
+    LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", mib(margin_bytes));
+    LOG_INFO("  available devices:");
+    if (devices.empty()) {
+        LOG_INFO("    (no GPU devices detected — all components will run on CPU)");
+    }
+    for (const Device& gpu : devices) {
+        LOG_INFO("    %-12s %-32s free %6lld / %6lld MiB",
+                 gpu.name.c_str(), gpu.description.c_str(), mib(gpu.free_bytes), mib(gpu.total_bytes));
+    }
+    LOG_INFO("  components:");
+    for (const Component& comp : components) {
+        LOG_INFO("    %-12s params %6lld MiB, compute reserve %6lld MiB",
+                 comp.name.c_str(), mib(comp.params_bytes), mib(comp.compute_bytes));
+    }
+    LOG_INFO("  decisions:");
+    for (const Decision& dec : plan.decisions) {
+        switch (dec.placement) {
+            case Placement::CPU:
+                LOG_INFO("    %-12s -> CPU                (RAM %lld MiB)",
+                         dec.name.c_str(), mib(dec.on_host_bytes));
+                break;
+            case Placement::GPU:
+                LOG_INFO("    %-12s -> GPU %d              (VRAM %lld MiB)",
+                         dec.name.c_str(), dec.device_id, mib(dec.on_device_bytes));
+                break;
+            default:
+                LOG_INFO("    %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)",
+                         dec.name.c_str(), dec.device_id, mib(dec.on_device_bytes),
+                         mib(dec.on_host_bytes));
+                break;
+        }
+    }
+    LOG_INFO("  projected per-device peak:");
+    for (const Device& gpu : devices) {
+        const auto    entry = plan.device_bytes.find(gpu.id);
+        const int64_t peak  = entry != plan.device_bytes.end() ? entry->second : 0;
+        LOG_INFO("    %-12s peak %6lld / %6lld MiB free  (remaining %lld MiB)",
+                 gpu.name.c_str(), mib(peak), mib(gpu.free_bytes), mib(gpu.free_bytes - peak));
+    }
+    LOG_INFO("    %-12s host RAM additional %lld MiB", "CPU",
+             mib(plan.host_bytes));
+}
+
+// Returns the first decision for `kind`, or nullptr if the plan has none.
+inline const Decision* find_decision(const Plan& plan, ComponentKind kind) {
+    auto it = plan.decisions.begin();
+    while (it != plan.decisions.end() && it->kind != kind) ++it;
+    return it == plan.decisions.end() ? nullptr : &*it;
+}
+
+}  // namespace backend_fit
+
+#endif  // __SD_BACKEND_FIT_HPP__
diff --git a/src/model_loader.h b/src/model_loader.h
index 8e0f41981..b77f4d6e2 100644
--- a/src/model_loader.h
+++ b/src/model_loader.h
@@ -27,6 +27,8 @@ struct MmapTensorStore {
     std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
 };
 
+bool is_unused_tensor(const std::string& name);
+
 class ModelLoader {
 protected:
     SDVersion version_ = VERSION_COUNT;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 8ba4a463a..640c049ad 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -10,6 +10,8 @@
 #include "core/rng_mt19937.hpp"
 #include "core/rng_philox.hpp"
 #include "core/util.h"
+
+#include "backend_fit.hpp"
 #include "model_loader.h"
 #include "stable-diffusion.h"
 
@@ -232,14 +234,19 @@ class StableDiffusionGGML {
         return params_backend_for(module) != nullptr;
     }
 
-    bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
+    // Initialize the backend manager from backend_spec / params_backend_spec.
+    // These hold the user's --backend / --params-backend by default, but when
+    // auto-fit is enabled they are overwritten with the computed plan before
+    // this runs. The keep_*_on_cpu shortcuts were replaced by the spec
+    // mechanism (e.g. "vae=cpu"), so they are always false here.
+    bool init_backend() {
         std::string error;
-        if (!backend_manager.init(sd_ctx_params->backend,
-                                  sd_ctx_params->params_backend,
+        if (!backend_manager.init(backend_spec.c_str(),
+                                  params_backend_spec.c_str(),
                                   offload_params_to_cpu,
-                                  sd_ctx_params->keep_clip_on_cpu,
-                                  sd_ctx_params->keep_vae_on_cpu,
-                                  sd_ctx_params->keep_control_net_on_cpu,
+                                  /*keep_clip_on_cpu=*/false,
+                                  /*keep_vae_on_cpu=*/false,
+                                  /*keep_control_net_on_cpu=*/false,
                                   &error)) {
             LOG_ERROR("backend config failed: %s", error.c_str());
             return false;
@@ -288,10 +295,10 @@ class StableDiffusionGGML {
 
         ggml_log_set(ggml_log_callback_default, nullptr);
 
-        if (!init_backend(sd_ctx_params)) {
-            return false;
-        }
-        max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION));
+        // Backend initialization is deferred until after the model metadata is
+        // loaded, so auto-fit can size the components and choose device
+        // placements before the backends are created (see the auto-fit block
+        // below, which feeds its plan into init_backend()).
 
         ModelLoader model_loader;
 
@@ -441,6 +448,98 @@ class StableDiffusionGGML {
             return oss.str();
         };
 
+        if (sd_ctx_params->auto_fit) {
+            if (!backend_spec.empty() || !params_backend_spec.empty()) {
+                LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend "
+                         "(pass --no-auto-fit to set device placement manually)");
+            }
+
+            backend_fit::ComputeReserves reserves;
+            if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) {
+                reserves.dit_bytes =
+                    int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB;
+            }
+            if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) {
+                reserves.vae_bytes =
+                    int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB;
+            }
+            if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) {
+                reserves.conditioner_bytes =
+                    int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB;
+            }
+            auto components = backend_fit::estimate_components(
+                model_loader, wtype, /*alignment=*/64, reserves);
+            auto    devices = backend_fit::enumerate_gpu_devices();
+            int64_t margin_bytes =
+                int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB;
+            auto plan = backend_fit::compute_plan(
+                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu);
+            backend_fit::print_plan(plan, components, devices, margin_bytes);
+
+            if (sd_ctx_params->auto_fit_dry_run) {
+                LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models");
+                return false;
+            }
+
+            // Translate the plan into the backend-assignment specs consumed by
+            // SDBackendManager. Each component lives entirely on one device:
+            //   GPU                -> runtime=<dev>             (params follow runtime)
+            //   GPU_OFFLOAD_PARAMS -> runtime=<dev>, params=cpu (params streamed from RAM)
+            //   CPU                -> runtime=cpu               (params follow runtime)
+            // Modules the planner doesn't cover (clip_vision, control_net,
+            // photomaker, upscaler) fall back to the default backend.
+            std::string runtime_spec;
+            std::string params_spec;
+            auto append_assignment = [](std::string& spec, const char* key, const std::string& value) {
+                if (!spec.empty()) {
+                    spec += ",";
+                }
+                spec += key;
+                spec += "=";
+                spec += value;
+            };
+            auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) {
+                if (d == nullptr) {
+                    return;
+                }
+                if (d->placement == backend_fit::Placement::CPU) {
+                    append_assignment(runtime_spec, module_key, "cpu");
+                    return;
+                }
+                std::string dev_name;
+                for (const auto& dev : devices) {
+                    if (dev.id == d->device_id) {
+                        dev_name = dev.name;
+                        break;
+                    }
+                }
+                if (dev_name.empty()) {
+                    return;  // no matching device; fall back to the default backend
+                }
+                append_assignment(runtime_spec, module_key, dev_name);
+                if (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) {
+                    append_assignment(params_spec, module_key, "cpu");
+                }
+            };
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), "diffusion");
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), "te");
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), "vae");
+
+            backend_spec        = runtime_spec;
+            params_backend_spec = params_spec;
+            LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'",
+                     backend_spec.empty() ? "(default)" : backend_spec.c_str(),
+                     params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str());
+        }
+
+        // Create the backends now that the placement (manual or auto-fit) is
+        // settled, then resolve the graph-cut VRAM budget against the DiT's
+        // runtime backend.
+        if (!init_backend()) {
+            return false;
+        }
+        max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION));
+
         LOG_INFO("Weight type stat:                 %s", wtype_stat_to_str(wtype_stat).c_str());
         LOG_INFO("Conditioner weight type stat:     %s", wtype_stat_to_str(conditioner_wtype_stat).c_str());
         LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str());
@@ -2688,21 +2787,25 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->prediction              = PREDICTION_COUNT;
     sd_ctx_params->lora_apply_mode         = LORA_APPLY_AUTO;
     sd_ctx_params->offload_params_to_cpu   = false;
-    sd_ctx_params->max_vram                = 0.f;
-    sd_ctx_params->stream_layers           = false;
-    sd_ctx_params->enable_mmap             = false;
-    sd_ctx_params->keep_clip_on_cpu        = false;
-    sd_ctx_params->keep_control_net_on_cpu = false;
-    sd_ctx_params->keep_vae_on_cpu         = false;
-    sd_ctx_params->diffusion_flash_attn    = false;
-    sd_ctx_params->circular_x              = false;
-    sd_ctx_params->circular_y              = false;
-    sd_ctx_params->chroma_use_dit_mask     = true;
-    sd_ctx_params->chroma_use_t5_mask      = false;
-    sd_ctx_params->chroma_t5_mask_pad      = 1;
-    sd_ctx_params->vae_format              = SD_VAE_FORMAT_AUTO;
-    sd_ctx_params->backend                 = nullptr;
-    sd_ctx_params->params_backend          = nullptr;
+    sd_ctx_params->max_vram                         = 0.f;
+    sd_ctx_params->stream_layers                    = false;
+    sd_ctx_params->enable_mmap                      = false;
+    sd_ctx_params->diffusion_flash_attn             = false;
+    sd_ctx_params->circular_x                       = false;
+    sd_ctx_params->circular_y                       = false;
+    sd_ctx_params->chroma_use_dit_mask              = true;
+    sd_ctx_params->chroma_use_t5_mask               = false;
+    sd_ctx_params->chroma_t5_mask_pad               = 1;
+    sd_ctx_params->vae_format                       = SD_VAE_FORMAT_AUTO;
+    sd_ctx_params->backend                          = nullptr;
+    sd_ctx_params->params_backend                   = nullptr;
+    sd_ctx_params->auto_fit                         = true;
+    sd_ctx_params->auto_fit_target_mb               = 512;
+    sd_ctx_params->auto_fit_dry_run                 = false;
+    sd_ctx_params->auto_fit_compute_reserve_dit_mb  = 0;
+    sd_ctx_params->auto_fit_compute_reserve_vae_mb  = 0;
+    sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0;
+    sd_ctx_params->auto_multi_gpu                   = true;
 }
 
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@@ -2741,9 +2844,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "stream_layers: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
-             "keep_clip_on_cpu: %s\n"
-             "keep_control_net_on_cpu: %s\n"
-             "keep_vae_on_cpu: %s\n"
+             "auto_fit: %s\n"
+             "auto_fit_target_mb: %d\n"
+             "auto_fit_dry_run: %s\n"
+             "auto_fit_compute_reserve_dit_mb: %d\n"
+             "auto_fit_compute_reserve_vae_mb: %d\n"
+             "auto_fit_compute_reserve_cond_mb: %d\n"
+             "auto_multi_gpu: %s\n"
              "flash_attn: %s\n"
              "diffusion_flash_attn: %s\n"
              "circular_x: %s\n"
@@ -2781,9 +2888,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->stream_layers),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
-             BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
-             BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
-             BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
+             BOOL_STR(sd_ctx_params->auto_fit),
+             sd_ctx_params->auto_fit_target_mb,
+             BOOL_STR(sd_ctx_params->auto_fit_dry_run),
+             sd_ctx_params->auto_fit_compute_reserve_dit_mb,
+             sd_ctx_params->auto_fit_compute_reserve_vae_mb,
+             sd_ctx_params->auto_fit_compute_reserve_cond_mb,
+             BOOL_STR(sd_ctx_params->auto_multi_gpu),
              BOOL_STR(sd_ctx_params->flash_attn),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
              BOOL_STR(sd_ctx_params->circular_x),
diff --git a/src/version.cpp b/src/version.cpp
index 97dc8426b..6c266153c 100644
--- a/src/version.cpp
+++ b/src/version.cpp
@@ -1,3 +1,6 @@
+#include <cstdio>
+
+#include "ggml-backend.h"
 #include "stable-diffusion.h"
 
 #ifndef SDCPP_BUILD_COMMIT
@@ -18,3 +21,12 @@ const char* sd_commit(void) {
 const char* sd_version(void) {
     return STRINGIZE(SDCPP_BUILD_VERSION);
 }
+
+void sd_list_devices(void) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        const char* name       = ggml_backend_dev_name(dev);
+        const char* desc       = ggml_backend_dev_description(dev);
+        std::printf("%s\t%s\n", name ? name : "", desc ? desc : "");
+    }
+}

From 20bfcc40e1d533985f1fb6200a582f6408f444af Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Fri, 12 Jun 2026 13:48:23 +0200
Subject: [PATCH 2/2] Finally adapt to the newest master

---
 examples/common/common.cpp       |  68 +++-
 examples/common/common.h         |  23 +-
 ggml                             |   2 +-
 include/stable-diffusion.h       |  19 +-
 src/backend_fit.hpp              | 301 ++++++++++++++-
 src/conditioning/conditioner.hpp |  36 ++
 src/core/ggml_extend.hpp         | 561 +++++++++++++++++++++++++++-
 src/core/ggml_extend_backend.cpp |  23 ++
 src/core/ggml_extend_backend.h   |  14 +
 src/core/util.cpp                |  10 +
 src/model/diffusion/ltxv.hpp     |   9 +-
 src/stable-diffusion.cpp         | 609 +++++++++++++++++++++++++++++--
 src/version.cpp                  |  12 -
 13 files changed, 1593 insertions(+), 94 deletions(-)

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 52f7635e4..a92e4615d 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -423,6 +423,18 @@ ArgOptions SDContextParams::get_options() {
          "--params-backend",
          "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
          &params_backend},
+        {"",
+         "--multi-gpu-mode",
+         "how to split a too-large DiT across GPUs (auto-fit): "
+         "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off "
+         "(default: row)",
+         &multi_gpu_mode},
+        {"",
+         "--fit-compute-reserve",
+         "auto-fit: per-component compute-buffer reserve in MiB as a component "
+         "map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in "
+         "defaults)",
+         &fit_compute_reserve},
     };
 
     options.int_options = {
@@ -439,19 +451,6 @@ ArgOptions SDContextParams::get_options() {
          "--fit-target",
          "auto-fit: MiB of free memory to leave on each GPU (default: 512)",
          &auto_fit_target_mb},
-        {"",
-         "--fit-compute-reserve-dit",
-         "auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
-         "(0 keeps the built-in default)",
-         &auto_fit_compute_reserve_dit_mb},
-        {"",
-         "--fit-compute-reserve-vae",
-         "auto-fit: MiB reserved on the VAE's GPU for its compute buffer",
-         &auto_fit_compute_reserve_vae_mb},
-        {"",
-         "--fit-compute-reserve-cond",
-         "auto-fit: MiB reserved on the conditioner's GPU for its compute buffer",
-         &auto_fit_compute_reserve_cond_mb},
     };
 
     options.float_options = {
@@ -518,6 +517,18 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-enable-t5-mask",
          "enable t5 mask for chroma",
          true, &chroma_use_t5_mask},
+        {"",
+         "--control-net-cpu",
+         "keep controlnet in cpu (deprecated alias for --backend control_net=cpu)",
+         true, &control_net_cpu},
+        {"",
+         "--clip-on-cpu",
+         "keep clip in cpu (deprecated alias for --backend clip=cpu)",
+         true, &clip_on_cpu},
+        {"",
+         "--vae-on-cpu",
+         "keep vae in cpu (deprecated alias for --backend vae=cpu)",
+         true, &vae_on_cpu},
         {"",
          "--auto-fit",
          "automatically pick DiT/VAE/Conditioner device placements based on "
@@ -771,7 +782,9 @@ std::string SDContextParams::to_string() const {
         << "  auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
         << "  auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
         << "  auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
+        << "  fit_compute_reserve: \"" << fit_compute_reserve << "\",\n"
         << "  auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
+        << "  multi_gpu_mode: \"" << multi_gpu_mode << "\",\n"
         << "  flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
@@ -791,6 +804,30 @@ std::string SDContextParams::to_string() const {
 }
 
 sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
+    // Fold the deprecated --*-on-cpu aliases into the generic backend spec.
+    // They are prepended so explicit --backend entries take precedence.
+    std::string alias_spec;
+    if (control_net_cpu) {
+        alias_spec += "control_net=cpu,";
+    }
+    if (clip_on_cpu) {
+        alias_spec += "clip=cpu,";
+    }
+    if (vae_on_cpu) {
+        alias_spec += "vae=cpu,";
+    }
+    if (!alias_spec.empty()) {
+        backend = alias_spec + backend;
+        if (backend.back() == ',') {
+            backend.pop_back();
+        }
+        control_net_cpu = false;
+        clip_on_cpu     = false;
+        vae_on_cpu      = false;
+        printf("warning: --clip-on-cpu / --vae-on-cpu / --control-net-cpu are deprecated, use --backend instead (folded into --backend \"%s\")\n",
+               backend.c_str());
+    }
+
     embedding_vec.clear();
     embedding_vec.reserve(embedding_map.size());
     for (const auto& kv : embedding_map) {
@@ -850,10 +887,9 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         auto_fit,
         auto_fit_target_mb,
         auto_fit_dry_run,
-        auto_fit_compute_reserve_dit_mb,
-        auto_fit_compute_reserve_vae_mb,
-        auto_fit_compute_reserve_cond_mb,
+        fit_compute_reserve.c_str(),
         auto_multi_gpu,
+        multi_gpu_mode.c_str(),
     };
     return sd_ctx_params;
 }
diff --git a/examples/common/common.h b/examples/common/common.h
index 2fa798e7e..784d4fc77 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -165,14 +165,21 @@ struct SDContextParams {
     bool qwen_image_zero_cond_t = false;
 
     // Auto-fit defaults — placement is computed automatically based on free
-    // VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device.
-    bool auto_fit                         = true;
-    int  auto_fit_target_mb               = 512;
-    bool auto_fit_dry_run                 = false;
-    int  auto_fit_compute_reserve_dit_mb  = 0;
-    int  auto_fit_compute_reserve_vae_mb  = 0;
-    int  auto_fit_compute_reserve_cond_mb = 0;
-    bool auto_multi_gpu                   = true;
+    // VRAM. Pass --no-auto-fit to disable and use explicit --backend specs.
+    bool auto_fit           = true;
+    int  auto_fit_target_mb = 512;
+    bool auto_fit_dry_run   = false;
+    // Per-component compute-buffer reserve in MiB as a component map,
+    // e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults.
+    std::string fit_compute_reserve;
+    bool auto_multi_gpu = true;
+    std::string multi_gpu_mode = "row";
+
+    // Deprecated aliases for --backend <component>=cpu (kept for
+    // backwards compatibility with the pre-auto-fit CLI).
+    bool control_net_cpu = false;
+    bool clip_on_cpu     = false;
+    bool vae_on_cpu      = false;
 
     prediction_t prediction           = PREDICTION_COUNT;
     lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
diff --git a/ggml b/ggml
index 0ce7ad348..404fcb9d7 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
+Subproject commit 404fcb9d7c96989569e68c9e7881ee3465a05c50
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 3c5b59005..b1af537dc 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -230,21 +230,28 @@ typedef struct {
     // into the same backend assignment that `backend` / `params_backend` use).
     // `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
     // `auto_fit_dry_run` prints the plan and aborts init before loading.
-    // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
-    // per-component compute-buffer reserve; 0 means use the built-in default.
+    // `auto_fit_compute_reserve` tunes the per-component compute-buffer
+    // reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512"
+    // (same component-key style as `backend`); missing keys / NULL keep the
+    // built-in defaults.
     bool auto_fit;
     int  auto_fit_target_mb;
     bool auto_fit_dry_run;
-    int  auto_fit_compute_reserve_dit_mb;
-    int  auto_fit_compute_reserve_vae_mb;
-    int  auto_fit_compute_reserve_cond_mb;
+    const char* auto_fit_compute_reserve;
 
     // When more than one GPU device is present, prefer placing different
     // components on different GPUs to balance load and fit larger total
     // working sets. Set false to keep all components on a single GPU when
     // they fit. Defaults to true. Each component still lives entirely on
-    // one device — no intra-tensor row split.
+    // one device unless multi_gpu_mode splits it (see below).
     bool auto_multi_gpu;
+
+    // How to split a single component (currently only the DiT) across GPUs
+    // when it doesn't fit on one but fits across several: "row" (matmul rows
+    // split via the backend's stock split buffer type, CUDA/SYCL),
+    // "layer" (whole blocks per GPU, routed by a scheduler, backend-generic),
+    // or "off" (never split a single component). NULL / empty => "row".
+    const char* multi_gpu_mode;
 } sd_ctx_params_t;
 
 typedef struct {
diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp
index ee23d6418..17994cd42 100644
--- a/src/backend_fit.hpp
+++ b/src/backend_fit.hpp
@@ -15,9 +15,11 @@
 // Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that
 // support streaming params from RAM at compute time).
 
+#include <algorithm>
 #include <cstdint>
 #include <limits>
 #include <map>
+#include <numeric>
 #include <set>
 #include <string>
 #include <vector>
@@ -42,7 +44,9 @@ enum class ComponentKind {
 enum class Placement {
     CPU,
     GPU,
-    GPU_OFFLOAD_PARAMS,  // params in RAM, compute on GPU
+    GPU_OFFLOAD_PARAMS,    // params in RAM, compute on GPU
+    GPU_LAYER_SPLIT,       // params split across multiple GPUs at block boundaries (sched-based)
+    GPU_TENSOR_SPLIT,      // matmul weights row-split across GPUs (CUDA split-buft, single backend)
 };
 
 struct Component {
@@ -69,6 +73,13 @@ struct Decision {
     int           device_id       = DEVICE_ID_CPU;
     int64_t       on_device_bytes = 0;
     int64_t       on_host_bytes   = 0;
+
+    // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs
+    // that share this component (in order) and each device's estimated share
+    // of the params. The order also defines block-range partitioning: the
+    // i-th device gets a contiguous range of blocks proportional to share[i].
+    std::vector<int>     split_device_ids;
+    std::vector<int64_t> split_share_bytes;
 };
 
 struct Plan {
@@ -84,6 +95,28 @@ struct ComputeReserves {
     int64_t conditioner_bytes = int64_t(512) * MiB;
 };
 
+enum class MultiGpuMode {
+    OFF,    // never split a single component across GPUs
+    ROW,    // row-split matmul weights via the backend's split buffer type (CLI help lists CUDA/SYCL — TODO confirm SYCL)
+    LAYER,  // generic: assign block-indexed tensors to per-block backends + sched
+};
+
+// Spelling of a MultiGpuMode as accepted by str_to_multi_gpu_mode: "off" / "row" / "layer".
+inline const char* multi_gpu_mode_str(MultiGpuMode m) {
+    if (m == MultiGpuMode::OFF) return "off";
+    if (m == MultiGpuMode::ROW) return "row";
+    if (m == MultiGpuMode::LAYER) return "layer";
+    // Value outside the enumerators (e.g. produced by a bad cast).
+    return "?";
+}
+
+// Parses a multi-GPU mode string; "row", "" and any unrecognized text all yield ROW.
+inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) {
+    if (s == "layer") return MultiGpuMode::LAYER;
+    if (s == "off") return MultiGpuMode::OFF;
+    return MultiGpuMode::ROW;  // default for everything else
+}
+
 // --- Classification -------------------------------------------------------
 
 inline bool classify_tensor(const std::string& name, ComponentKind& out) {
@@ -105,7 +138,13 @@ inline bool classify_tensor(const std::string& name, ComponentKind& out) {
         contains("cond_stage_model") ||
         contains("te.text_model.") ||
         contains("conditioner") ||
-        name.rfind("text_encoder.", 0) == 0) {
+        name.rfind("text_encoder.", 0) == 0 ||
+        // Connector / text projection layers that run on the conditioner
+        // backend (e.g. LTX-2's text_embedding_projection: video/audio
+        // aggregate embeds + projection that map LLM hidden states into
+        // DiT-input space).
+        name.rfind("text_embedding_projection.", 0) == 0 ||
+        contains(".aggregate_embed.")) {
         out = ComponentKind::CONDITIONER;
         return true;
     }
@@ -188,19 +227,129 @@ inline std::vector<Device> enumerate_gpu_devices() {
 
 // --- Core algorithm -------------------------------------------------------
 
-// Peak per device = MAX of any single component's footprint on that device,
-// because free_params_immediately frees params between phases so components
-// time-share VRAM.
+// Splits a layer-split component's footprint over the GPUs in `gpu_idxs`
+// (indices into `devices`). Returns one entry per participant, in the same
+// order: its headroom-weighted slice of `params_bytes` plus the whole
+// `compute_bytes`. All entries are zero when no participant has headroom.
+inline std::vector<int64_t> layer_split_shares(int64_t                    params_bytes,
+                                               int64_t                    compute_bytes,
+                                               const std::vector<Device>& devices,
+                                               const std::vector<size_t>& gpu_idxs,
+                                               int64_t                    margin_bytes = 0) {
+    // Headroom per device = free VRAM minus the compute reserve and the margin,
+    // clamped at zero. Slices are weighted by headroom rather than by raw free
+    // memory because every participant also hosts a full compute reserve;
+    // weighting by raw free overcommits the smaller GPU.
+    const size_t         count = gpu_idxs.size();
+    std::vector<int64_t> headroom;
+    headroom.reserve(count);
+    for (const size_t gpu : gpu_idxs) {
+        headroom.push_back(std::max<int64_t>(0, devices[gpu].free_bytes - compute_bytes - margin_bytes));
+    }
+    const int64_t headroom_sum = std::accumulate(headroom.begin(), headroom.end(), int64_t(0));
+    std::vector<int64_t> shares(count, 0);
+    if (headroom_sum <= 0) return shares;  // no headroom anywhere: all-zero shares
+    for (size_t slot = 0; slot < count; ++slot) {
+        // Ratio first, then scale, then truncate toward zero.
+        const double ratio = double(headroom[slot]) / double(headroom_sum);
+        shares[slot]       = int64_t(double(params_bytes) * ratio) + compute_bytes;
+    }
+    return shares;
+}
+
+// Per-GPU PARAM bytes for a row (tensor) split over the GPUs in `gpu_idxs`
+// (indices into `devices`); these become the split ratios. Weights are each
+// device's free VRAM, except that the device at `gpu_idxs[main_pos]` first
+// gives up `compute_bytes`, so it is not handed so many rows that its compute
+// buffer no longer fits. Compute is NOT folded into the result: the caller
+// adds `compute_bytes` back for the main device when computing its peak.
+// Returns all zeros when the combined weight is zero.
+inline std::vector<int64_t> row_split_shares(int64_t                    params_bytes,
+                                             int64_t                    compute_bytes,
+                                             const std::vector<Device>& devices,
+                                             const std::vector<size_t>& gpu_idxs,
+                                             size_t                     main_pos) {
+    const size_t         count = gpu_idxs.size();
+    std::vector<int64_t> weight(count, 0);
+    int64_t              weight_sum = 0;
+    for (size_t slot = 0; slot < count; ++slot) {
+        // Only the main device gives up compute_bytes before weighting.
+        const int64_t free_now = std::max<int64_t>(0, devices[gpu_idxs[slot]].free_bytes);
+        const int64_t reserved = (slot == main_pos) ? compute_bytes : 0;
+        weight[slot]           = std::max<int64_t>(0, free_now - reserved);
+        weight_sum += weight[slot];
+    }
+    std::vector<int64_t> shares(count, 0);
+    if (weight_sum <= 0) return shares;  // nothing to weight by: all-zero shares
+    for (size_t slot = 0; slot < count; ++slot) {
+        shares[slot] = int64_t(double(params_bytes) * double(weight[slot]) / double(weight_sum));
+    }
+    return shares;
+}
+
+// Peak per device = MAX of any single component's footprint on that device.
+// Components free their params between phases (free_params_immediately; the
+// split runners load lazily and free after each phase too), so they time-share
+// VRAM rather than coexisting — hence MAX, not sum.
 inline int64_t gpu_peak(int                           gpu_idx,
                         const std::vector<Placement>& pl,
                         const std::vector<int>&       dev,
-                        const std::vector<Component>& components) {
+                        const std::vector<Component>& components,
+                        const std::vector<Device>&    devices = {}) {
     int64_t peak = 0;
     for (size_t i = 0; i < components.size(); i++) {
-        if (dev[i] != gpu_idx) continue;
         int64_t footprint = 0;
         if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+            if (dev[i] != gpu_idx) continue;
             footprint = components[i].params_bytes + components[i].compute_bytes;
+        } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) {
+            // Row-split: every GPU in the mask gets a free-VRAM-weighted
+            // share of params; the compute reserve lands on the BIGGEST
+            // GPU (which becomes the runner's main backend).
+            const int mask = dev[i];
+            if (!(mask & (1 << gpu_idx))) continue;
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < devices.size(); k++) {
+                if (mask & (1 << k)) gpu_idxs.push_back(k);
+            }
+            int slot = -1;
+            int biggest_slot = 0;
+            int64_t biggest_mem = -1;
+            for (size_t k = 0; k < gpu_idxs.size(); k++) {
+                if (int(gpu_idxs[k]) == gpu_idx) slot = int(k);
+                if (devices[gpu_idxs[k]].total_bytes > biggest_mem) {
+                    biggest_mem  = devices[gpu_idxs[k]].total_bytes;
+                    biggest_slot = int(k);
+                }
+            }
+            if (slot < 0) continue;
+            // Row-split: graph runs on the main (= biggest) GPU, which reserves
+            // its compute buffer; param rows are weighted by the remaining free.
+            auto shares = row_split_shares(components[i].params_bytes,
+                                           components[i].compute_bytes,
+                                           devices, gpu_idxs, size_t(biggest_slot));
+            footprint = shares[slot];
+            if (slot == biggest_slot) {
+                footprint += components[i].compute_bytes;
+            }
+        } else if (pl[i] == Placement::GPU_LAYER_SPLIT) {
+            // dev[i] holds the bitmask of participating GPU indices into the
+            // devices[] vector (encoded by the planner). Look up our slot.
+            const int mask = dev[i];
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < devices.size(); k++) {
+                if (mask & (1 << k)) gpu_idxs.push_back(k);
+            }
+            // Find this gpu's slot in gpu_idxs.
+            int slot = -1;
+            for (size_t k = 0; k < gpu_idxs.size(); k++) {
+                if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; }
+            }
+            if (slot < 0) continue;
+            auto shares = layer_split_shares(components[i].params_bytes,
+                                             components[i].compute_bytes,
+                                             devices, gpu_idxs);
+            footprint = shares[slot];
         }
         peak = std::max(peak, footprint);
     }
@@ -210,9 +359,13 @@ inline int64_t gpu_peak(int                           gpu_idx,
 inline Plan compute_plan(const std::vector<Component>& components,
                          const std::vector<Device>&    devices,
                          int64_t                       margin_bytes,
-                         bool                          allow_multi_gpu = true) {
+                         bool                          allow_multi_gpu = true,
+                         MultiGpuMode                  mode = MultiGpuMode::ROW) {
     const size_t nC = components.size();
     const size_t nG = devices.size();
+    if (!allow_multi_gpu) {
+        mode = MultiGpuMode::OFF;
+    }
 
     std::vector<int64_t> cap(nG, 0);
     for (size_t g = 0; g < nG; g++) {
@@ -224,6 +377,24 @@ inline Plan compute_plan(const std::vector<Component>& components,
         int       device_idx;
     };
 
+    // ROW-split is DiT-exclusive. Keeping a single homogeneous row-split
+    // component (same tensor sizes every phase/generation) lets the driver
+    // reuse freed split-buffer chunks, which is what avoids the
+    // cuda_split_buffer fragmentation a ggml patch would otherwise be needed
+    // for. The DiT is also the per-step bottleneck, where row-split's small
+    // compute buffer matters most.
+    auto supports_tensor_split = [](ComponentKind k) {
+        return k == ComponentKind::DIT;
+    };
+    // LAYER-split (regular per-device buffers routed by a scheduler) is
+    // general and fragmentation-free, so any block-structured component can
+    // use it. The Conditioner (e.g. Gemma) splits this way when it is too big
+    // for one GPU; its (larger) cross-backend compute buffer is acceptable
+    // because it runs once at encode time and frees before the DiT loop.
+    auto supports_layer_split = [](ComponentKind k) {
+        return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER;
+    };
+
     auto build_options = [&](const Component& c) {
         std::vector<OptionSlot> opts;
         for (size_t g = 0; g < nG; g++) {
@@ -232,6 +403,25 @@ inline Plan compute_plan(const std::vector<Component>& components,
                 opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)});
             }
         }
+        if (nG >= 2) {
+            // ROW-split: DiT only, in row mode. Spans all GPUs (one option).
+            if (mode == MultiGpuMode::ROW && supports_tensor_split(c.kind)) {
+                opts.push_back({Placement::GPU_TENSOR_SPLIT, (1 << nG) - 1});
+            }
+            // LAYER-split: the DiT in layer mode, and any OTHER layer-split
+            // candidate (the Conditioner) in ROW or LAYER mode, never in OFF.
+            // Non-DiT components never row-split (single-row invariant).
+            const bool want_layer = supports_layer_split(c.kind) &&
+                                    (mode == MultiGpuMode::LAYER ||
+                                     (mode == MultiGpuMode::ROW && !supports_tensor_split(c.kind)));
+            if (want_layer) {
+                const int max_mask = 1 << nG;
+                for (int mask = 1; mask < max_mask; mask++) {
+                    if ((mask & (mask - 1)) == 0) continue;  // <2 GPUs set; portable (no __builtin_popcount)
+                    opts.push_back({Placement::GPU_LAYER_SPLIT, mask});
+                }
+            }
+        }
         opts.push_back({Placement::CPU, -1});
         return opts;
     };
@@ -262,6 +452,22 @@ inline Plan compute_plan(const std::vector<Component>& components,
             } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
                 s += 5 * pw;
                 gpus_used.insert(dev[i]);
+            } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) {
+                // Row-split: cheaper than layer-split (no sched cross-
+                // backend doubling) but pays per-matmul cross-device
+                // reductions. Score it slightly above LAYER_SPLIT so the
+                // planner prefers it when both fit.
+                s += 8 * pw;
+                for (size_t g = 0; g < nG; g++) {
+                    if (dev[i] & (1 << g)) gpus_used.insert(int(g));
+                }
+            } else if (pl[i] == Placement::GPU_LAYER_SPLIT) {
+                // Better than CPU but worse than fitting on a single GPU
+                // (cross-GPU traffic between blocks).
+                s += 7 * pw;
+                for (size_t g = 0; g < nG; g++) {
+                    if (dev[i] & (1 << g)) gpus_used.insert(int(g));
+                }
             } else {
                 s -= 10 * pw;
             }
@@ -299,7 +505,7 @@ inline Plan compute_plan(const std::vector<Component>& components,
             if (ok) {
                 bool feasible = true;
                 for (size_t g = 0; g < nG; g++) {
-                    if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; }
+                    if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; }
                 }
                 if (feasible) {
                     int64_t sc = score(pl, dev);
@@ -311,7 +517,7 @@ inline Plan compute_plan(const std::vector<Component>& components,
         } else {
             bool feasible = true;
             for (size_t g = 0; g < nG; g++) {
-                if (gpu_peak(int(g), pl, dev, components) > cap[g]) { feasible = false; break; }
+                if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; }
             }
             if (feasible) {
                 int64_t sc = score(pl, dev);
@@ -347,6 +553,66 @@ inline Plan compute_plan(const std::vector<Component>& components,
             d.device_id      = DEVICE_ID_CPU;
             d.on_host_bytes  = c.params_bytes + c.compute_bytes;
             plan.any_changes = true;
+        } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) {
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < nG; k++) {
+                if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k);
+            }
+            // Sort participating GPUs by descending TOTAL memory so the
+            // largest device is the "main" (runs the graph + hosts the compute
+            // buffer + sub-runners that don't get their own spec). This matches
+            // the user's preference: always use the bigger GPU as main.
+            std::vector<size_t> order(gpu_idxs.size());
+            std::iota(order.begin(), order.end(), 0);
+            std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+                return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes;
+            });
+            // PARAM shares for the split ratio: the main (order[0]) reserves its
+            // compute buffer first so it doesn't get over-loaded with rows.
+            auto shares = row_split_shares(c.params_bytes, c.compute_bytes,
+                                           devices, gpu_idxs, order[0]);
+
+            int64_t max_share = 0;
+            for (size_t pos = 0; pos < order.size(); pos++) {
+                size_t k = order[pos];
+                d.split_device_ids.push_back(devices[gpu_idxs[k]].id);
+                // split_share_bytes drives the row ratio in apply_dit -> keep it
+                // param-only. The main device's peak (params + compute) is folded
+                // into on_device_bytes for the plan display / feasibility.
+                d.split_share_bytes.push_back(shares[k]);
+                int64_t peak = shares[k] + (pos == 0 ? c.compute_bytes : 0);
+                max_share    = std::max(max_share, peak);
+            }
+            d.device_id        = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0];
+            d.on_device_bytes  = max_share;
+            plan.any_changes   = true;
+        } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) {
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < nG; k++) {
+                if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k);
+            }
+            auto shares = layer_split_shares(c.params_bytes, c.compute_bytes,
+                                             devices, gpu_idxs);
+            // Sort participating GPUs by descending TOTAL memory so the
+            // physically bigger GPU is listed first (and becomes the runner's
+            // main backend). Sub-runners that don't get the layer-split spec
+            // (e.g. the LTX-2 text projection) follow the main backend.
+            std::vector<size_t> order(gpu_idxs.size());
+            std::iota(order.begin(), order.end(), 0);
+            std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+                return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes;
+            });
+
+            int64_t max_share = 0;
+            for (size_t pos = 0; pos < order.size(); pos++) {
+                size_t k = order[pos];
+                d.split_device_ids.push_back(devices[gpu_idxs[k]].id);
+                d.split_share_bytes.push_back(shares[k]);
+                max_share = std::max(max_share, shares[k]);
+            }
+            d.device_id        = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0];
+            d.on_device_bytes  = max_share;
+            plan.any_changes   = true;
         } else {
             d.device_id = devices[best_dev[i]].id;
             if (best_pl[i] == Placement::GPU) {
@@ -362,7 +628,7 @@ inline Plan compute_plan(const std::vector<Component>& components,
     }
 
     for (size_t g = 0; g < nG; g++) {
-        plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components);
+        plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices);
     }
     return plan;
 }
@@ -372,6 +638,8 @@ inline const char* placement_str(Placement p) {
         case Placement::CPU: return "CPU";
         case Placement::GPU: return "GPU";
         case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)";
+        case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)";
+        case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)";
     }
     return "?";
 }
@@ -407,6 +675,17 @@ inline void print_plan(const Plan&                   plan,
             LOG_INFO("    %-12s -> GPU %d              (VRAM %lld MiB)",
                      d.name.c_str(), d.device_id,
                      (long long)(d.on_device_bytes / MiB));
+        } else if (d.placement == Placement::GPU_LAYER_SPLIT ||
+                   d.placement == Placement::GPU_TENSOR_SPLIT) {
+            std::string ids;
+            const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? "row" : "layer";
+            for (size_t k = 0; k < d.split_device_ids.size(); k++) {
+                if (k > 0) ids += "+";
+                ids += "GPU" + std::to_string(d.split_device_ids[k]);
+                ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)";
+            }
+            LOG_INFO("    %-12s -> %s-split %s",
+                     d.name.c_str(), tag, ids.c_str());
         } else {
             LOG_INFO("    %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)",
                      d.name.c_str(), d.device_id,
diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
index 0cb3172b9..b851b80bf 100644
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@@ -119,6 +119,12 @@ struct Conditioner {
     virtual size_t get_params_buffer_size()                                                = 0;
     virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
     virtual void set_stream_layers_enabled(bool enabled) {}
+    // Multi-GPU + lazy-load hooks. Default no-op; LLM-backed conditioners
+    // forward them to their (heavy) LLM sub-runner so it can be split across
+    // GPUs (layer-split) and/or have its params alloc+load deferred to the
+    // first compute so it time-shares VRAM with the DiT.
+    virtual void set_lazy_load(std::function<bool()> fn) {}
+    virtual void set_multi_backend_spec(const MultiBackendSpec& spec) {}
     virtual void set_flash_attention_enabled(bool enabled) = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
 };
@@ -1488,6 +1494,14 @@ struct AnimaConditioner : public Conditioner {
         llm->set_stream_layers_enabled(enabled);
     }
 
+    void set_lazy_load(std::function<bool()> fn) override {
+        llm->set_lazy_load(std::move(fn));
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {
+        llm->set_multi_backend_spec(spec);
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@@ -1642,6 +1656,14 @@ struct LLMEmbedder : public Conditioner {
         llm->set_stream_layers_enabled(enabled);
     }
 
+    void set_lazy_load(std::function<bool()> fn) override {
+        llm->set_lazy_load(std::move(fn));
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {
+        llm->set_multi_backend_spec(spec);
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@@ -2229,6 +2251,16 @@ struct LTXAVEmbedder : public Conditioner {
         projector->set_flash_attention_enabled(enabled);
     }
 
+    // Split/lazy apply to the heavy LLM only; the small projector stays on the
+    // main backend and loads eagerly.
+    void set_lazy_load(std::function<bool()> fn) override {
+        llm->set_lazy_load(std::move(fn));
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {
+        llm->set_multi_backend_spec(spec);
+    }
+
     void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
         llm->set_max_graph_vram_bytes(max_vram_bytes);
         projector->set_max_graph_vram_bytes(max_vram_bytes);
@@ -2267,6 +2299,7 @@ struct LTXAVEmbedder : public Conditioner {
 
         std::vector<float> mask;
         tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength);
+
         return {tokens, weights, mask};
     }
 
@@ -2304,6 +2337,7 @@ struct LTXAVEmbedder : public Conditioner {
                                           {},
                                           true);
         GGML_ASSERT(!hidden_states.empty());
+
         hidden_states = apply_token_weights(std::move(hidden_states), weights);
 
         int64_t valid_tokens = 0;
@@ -2361,6 +2395,8 @@ struct LTXAVEmbedder : public Conditioner {
         }
 
         hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens});
+
+
         return projector->compute(n_threads, hidden_states);
     }
 
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index d0326a192..f59785fdf 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1674,6 +1674,39 @@ struct GGMLRunnerContext {
     }
 };
 
+// Multi-GPU split of a single runner across several GPU backends, on stock
+// ggml (no ggml patch needed). Two modes:
+//   LAYER_SPLIT: whole transformer blocks are assigned to per-block backends
+//                and a ggml_backend_sched routes cross-device ops. Works on
+//                any multi-GPU set.
+//   ROW_SPLIT:   matmul weights are split row-wise via the backend's stock
+//                split buffer type (CUDA/SYCL `ggml_backend_split_buffer_type`),
+//                non-matmul weights live on the main GPU; sched still wires the
+//                extra backends so it can route the cross-device reductions.
+// The split params are allocated once and kept resident (the runner is not
+// freed+realloc'd between generations), which is what lets us avoid the
+// split-buffer fragmentation a ggml patch would otherwise be needed for.
+enum class MultiBackendMode {
+    LAYER_SPLIT,
+    ROW_SPLIT,
+};
+
+struct MultiBackendSpec {
+    MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT;
+    // Extra GPU backends beyond the runner's main (runtime) backend. The main
+    // backend is implicit and is NOT listed here. Borrowed handles — owned by
+    // the SDBackendManager, never freed by the runner.
+    std::vector<ggml_backend_t> additional_backends;
+    // LAYER_SPLIT: map a param tensor to the backend that should hold it (the
+    // main backend, or one of additional_backends). nullptr => main. Keyed by
+    // tensor POINTER, not name: param tensors are unnamed at alloc time.
+    std::function<ggml_backend_t(ggml_tensor*)> tensor_backend_fn;
+    // ROW_SPLIT: per-device weight ratios (length = the backend registry's
+    // device count) and the main device index that owns the non-split portion.
+    std::vector<float> tensor_split_ratios;
+    int                main_device = 0;
+};
+
 struct GGMLRunner {
 protected:
     typedef std::function<ggml_cgraph*()> get_graph_cb_t;
@@ -1710,6 +1743,32 @@ struct GGMLRunner {
     bool stream_layers_enabled            = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // --- multi-GPU split state (layer-split via sched OR row-split via the
+    //     stock split buffer type). Inactive unless set_multi_backend_spec()
+    //     was called before alloc_params_buffer(). ---
+    bool                                        multi_backend_mode = false;
+    MultiBackendMode                            multi_backend_kind = MultiBackendMode::LAYER_SPLIT;
+    std::vector<ggml_backend_t>                 additional_backends;  // borrowed (manager-owned)
+    std::function<ggml_backend_t(ggml_tensor*)> tensor_backend_fn    = nullptr;
+    ggml_backend_sched_t                        sched                = nullptr;  // owned
+    bool                                        sched_reserved       = false;
+    ggml_backend_t                              cpu_fallback_backend = nullptr;
+    bool                                        owns_cpu_fallback_backend = false;
+    // LAYER_SPLIT: one resident params buffer per participating backend.
+    std::vector<ggml_backend_buffer_t>          multi_params_buffers;  // owned
+    // ROW_SPLIT: resident split + main buffers and the split buft (buft is
+    // backend-cached, not owned).
+    std::vector<float>                          row_split_ratios;
+    int                                         row_main_device  = 0;
+    ggml_backend_buffer_type_t                  row_split_buft   = nullptr;
+    ggml_backend_buffer_t                       row_split_buffer = nullptr;  // owned
+    ggml_backend_buffer_t                       row_main_buffer  = nullptr;  // owned
+
+    // Lazy-load: when set, params alloc + tensor-data load is deferred to the
+    // first compute() (ensure_params_loaded) and freed after each phase, so
+    // components time-share VRAM instead of all coexisting at init.
+    std::function<bool()> lazy_load_fn = nullptr;
+
     sd::layer_registry::LayerRegistry layer_registry_;
 
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
@@ -1894,7 +1953,167 @@ struct GGMLRunner {
         return true;
     }
 
+    // Build the multi-backend scheduler (lazily). Backends in priority order:
+    // main runtime backend, then the additional GPU backends, then a CPU
+    // fallback last (ggml_backend_sched_new requires the last backend be CPU).
+    bool ensure_sched() {
+        if (sched != nullptr) {
+            return true;
+        }
+        if (cpu_fallback_backend == nullptr) {
+            cpu_fallback_backend = sd_backend_cpu_init();
+            if (cpu_fallback_backend == nullptr) {
+                LOG_ERROR("%s: failed to init CPU fallback backend", get_desc().c_str());
+                return false;
+            }
+            owns_cpu_fallback_backend = true;
+        }
+        std::vector<ggml_backend_t> backends;
+        backends.reserve(1 + additional_backends.size() + 1);
+        backends.push_back(runtime_backend);
+        backends.insert(backends.end(), additional_backends.begin(), additional_backends.end());
+        backends.push_back(cpu_fallback_backend);
+        // Pass an explicit per-backend buffer-type array instead of nullptr.
+        // The sched consults these to decide whether a cross-backend src needs
+        // a copy; with the defaults it synthesizes, CUDA devices were seen to
+        // report supporting each other's buffers, so a needed copy was skipped
+        // and a node read another device's memory (illegal access). The CPU
+        // slot uses the main device's pinned host buft, as llama.cpp does.
+        std::vector<ggml_backend_buffer_type_t> bufts;
+        bufts.reserve(backends.size());
+        ggml_backend_dev_t dev0 = ggml_backend_get_device(runtime_backend);
+        for (auto* b : backends) {
+            if (b == cpu_fallback_backend && dev0 != nullptr) {
+                ggml_backend_buffer_type_t host = ggml_backend_dev_host_buffer_type(dev0);
+                bufts.push_back(host != nullptr ? host : ggml_backend_get_default_buffer_type(b));
+            } else {
+                bufts.push_back(ggml_backend_get_default_buffer_type(b));
+            }
+        }
+        sched = ggml_backend_sched_new(backends.data(),
+                                       bufts.data(),
+                                       (int)backends.size(),
+                                       MAX_GRAPH_SIZE,
+                                       /*parallel=*/false,
+                                       /*op_offload=*/false);
+        if (sched == nullptr) {
+            LOG_ERROR("%s: failed to create backend sched", get_desc().c_str());
+            return false;
+        }
+        return true;
+    }
+
+    // Resolve which backend holds weight tensor `t` in a layer split: slot 0 of
+    // multi_params_buffers maps to the runtime backend, slot i to the (i-1)-th
+    // additional backend. nullptr for null, unallocated or non-weight tensors.
+    ggml_backend_t backend_of_weight(ggml_tensor* t) const {
+        if (t == nullptr || t->buffer == nullptr ||
+            ggml_backend_buffer_get_usage(t->buffer) != GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            return nullptr;
+        }
+        const size_t n_extra = additional_backends.size();
+        size_t       slot    = 0;
+        for (ggml_backend_buffer_t params_buf : multi_params_buffers) {
+            const size_t idx = slot++;
+            if (params_buf != t->buffer) {
+                continue;
+            }
+            if (idx == 0) return runtime_backend;
+            if (idx - 1 < n_extra) return additional_backends[idx - 1];
+        }
+        return nullptr;
+    }
+
+    // Pin compute nodes to their layer's device for a LAYER split. Stock
+    // ggml_backend_sched anchors weight-bearing ops (matmuls) to the weight's
+    // device, but weightless ops (norm, residual add, permute, cont) have no
+    // anchor and are placed by a heuristic that, for the attention `cont`, can
+    // land on the wrong device and then read it without a cross-device copy ->
+    // CUDA illegal access. llama.cpp pins each layer-boundary norm to the
+    // layer's device for exactly this reason (llama-context.cpp). We generalise:
+    // walk the graph in execution order, track the device of the most recently
+    // consumed weight (= the current layer's device), and pin to it every
+    // non-view node that backend supports, giving clean per-layer cuts so sched
+    // copies only the residual stream across. No-op outside a layer split.
+    void pin_layer_split_nodes(ggml_cgraph* gf) {
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::LAYER_SPLIT) {
+            return;
+        }
+        if (sched == nullptr || multi_params_buffers.empty() || gf == nullptr) {
+            return;
+        }
+        ggml_backend_t cur     = runtime_backend;
+        const int      n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_backend_t wb = backend_of_weight(node->src[s]);
+                if (wb != nullptr) {
+                    cur = wb;
+                }
+            }
+            // NEVER pin view ops (view/reshape/permute/transpose): a view
+            // assigned to a different backend than its view_src's data makes
+            // the sched skip the cross-device copy for consumers (the copy
+            // decision trusts the assigned id), and a kernel then dereferences
+            // the other device's pointer. The sched places views correctly on
+            // its own by following view_src.
+            if (node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE ||
+                node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+                continue;
+            }
+            if (cur != nullptr && ggml_backend_supports_op(cur, node)) {
+                ggml_backend_sched_set_tensor_backend(sched, node, cur);
+            }
+        }
+    }
+
+    // Pin un-allocated graph-input leaves (rope pe tables, timesteps, latents…)
+    // to the MAIN backend before sched alloc. Left to its own heuristics the
+    // sched places them on the CPU/host slot and emits per-split host->device
+    // input copies; those copies were observed landing LATE (first pass reads
+    // zeros / stale pool garbage, second pass reads the first pass's data).
+    // Pinning them to the main backend makes our copy_data_to_backend_tensor
+    // fill a device-resident tensor directly (synchronous H2D) and removes the
+    // cross-backend input copies entirely.
+    void pin_input_leaves(ggml_cgraph* gf) {
+        // ROW_SPLIT only: the whole graph computes on the main backend, so its
+        // inputs belong there and pinning avoids per-split host->device copies.
+        // (Layer-split graphs span devices; the sched routes their inputs.)
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::ROW_SPLIT ||
+            sched == nullptr || gf == nullptr || runtime_backend == nullptr) {
+            return;
+        }
+        const int n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_tensor* t = node->src[s];
+                if (t == nullptr) continue;  // optional srcs may leave gaps; scan every slot
+                while (t->view_src != nullptr) {
+                    t = t->view_src;
+                }
+                // op NONE + no buffer yet = a graph input the sched will
+                // allocate (weights already sit in params buffers).
+                if (t->op == GGML_OP_NONE && t->buffer == nullptr) {
+                    ggml_backend_sched_set_tensor_backend(sched, t, runtime_backend);
+                }
+            }
+        }
+    }
+
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        if (multi_backend_mode) {
+            // Do NOT ggml_backend_sched_reserve(gf) here: reserve runs
+            // split_graph, which REWIRES gf's src pointers to sched-internal
+            // copy tensors. execute_graph then sched_alloc_graph's the SAME gf,
+            // and the second split sees the stale reserve-epoch copies (measure
+            // layout) as valid inputs — silently corrupting every cross-backend
+            // input (garbage rope pe, garbage Gemma stack) or crashing. A graph
+            // must be split at most once; the first sched_alloc_graph in
+            // execute_graph performs the real allocation instead.
+            return ensure_sched();
+        }
         if (compute_allocr != nullptr) {
             return true;
         }
@@ -2417,13 +2636,15 @@ struct GGMLRunner {
                max_graph_vram_bytes > 0 &&
                plan.segments.size() > 1 &&
                params_backend != runtime_backend &&
-               !sd_backend_is_cpu(runtime_backend);
+               !sd_backend_is_cpu(runtime_backend) &&
+               !multi_backend_mode;
     }
 
     bool can_attempt_graph_cut_segmented_compute() const {
         return max_graph_vram_bytes > 0 &&
                params_backend != runtime_backend &&
-               !sd_backend_is_cpu(runtime_backend);
+               !sd_backend_is_cpu(runtime_backend) &&
+               !multi_backend_mode;
     }
 
     bool resolve_graph_cut_plan(ggml_cgraph* gf,
@@ -2657,7 +2878,18 @@ struct GGMLRunner {
             return std::nullopt;
         }
 
-        if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
+        if (multi_backend_mode) {
+            ggml_backend_sched_reset(sched);
+            pin_layer_split_nodes(gf);  // reset clears pins; re-apply before alloc
+            pin_input_leaves(gf);
+            if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+                LOG_ERROR("%s sched alloc compute graph failed", get_desc().c_str());
+                if (free_compute_buffer_immediately) {
+                    free_compute_buffer();
+                }
+                return std::nullopt;
+            }
+        } else if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
             LOG_ERROR("%s alloc compute graph failed", get_desc().c_str());
             if (free_compute_buffer_immediately) {
                 free_compute_buffer();
@@ -2674,9 +2906,20 @@ struct GGMLRunner {
         if (sd_backend_is_cpu(runtime_backend)) {
             sd_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
+        if (multi_backend_mode && cpu_fallback_backend != nullptr && sd_backend_is_cpu(cpu_fallback_backend)) {
+            sd_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads);
+        }
 
         int64_t t_compute_begin = ggml_time_ms();
-        ggml_status status      = ggml_backend_graph_compute(runtime_backend, gf);
+        ggml_status status;
+        if (multi_backend_mode) {
+            status = ggml_backend_sched_graph_compute(sched, gf);
+            if (status == GGML_STATUS_SUCCESS) {
+                ggml_backend_sched_synchronize(sched);
+            }
+        } else {
+            status = ggml_backend_graph_compute(runtime_backend, gf);
+        }
         int64_t t_compute_end   = ggml_time_ms();
         if (status != GGML_STATUS_SUCCESS) {
             LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
@@ -3002,6 +3245,16 @@ struct GGMLRunner {
         free_params_ctx();
         free_compute_ctx();
         free_cache_ctx_and_buffer();
+        // Multi-GPU split teardown. additional_backends are owned by the
+        // SDBackendManager (not freed here); row_split_buft is backend-cached.
+        if (sched != nullptr) {
+            ggml_backend_sched_free(sched);
+            sched = nullptr;
+        }
+        if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) {
+            ggml_backend_free(cpu_fallback_backend);
+            cpu_fallback_backend = nullptr;
+        }
     }
 
     virtual GGMLRunnerContext get_context() {
@@ -3028,7 +3281,207 @@ struct GGMLRunner {
         alloc_compute_ctx();
     }
 
+    // Row-split eligibility: contiguous, rank-2, both dims >= 256, not a view.
+    // 1D biases/norms, embeddings, small projections and views fall back to the
+    // main GPU's regular per-device buft. Excluding views respects the split
+    // buft's documented contract (GGML_ASSERT(view_src == nullptr)) so we never
+    // need to patch ggml.
+    static bool is_row_split_eligible(const ggml_tensor* t) {
+        // Views must stay off the split buft; only dense rank-2 tensors qualify.
+        const bool is_plain_matrix = t->view_src == nullptr && ggml_is_contiguous(t) && ggml_n_dims(t) == 2;
+        // Both dimensions must reach the 256 minimum.
+        const bool is_large_enough = t->ne[0] >= 256 && t->ne[1] >= 256;
+        return is_plain_matrix && is_large_enough;
+    }
+
+    // ROW_SPLIT: matmul-eligible weights -> row_split_buft (split row-wise
+    // across GPUs by the CUDA/SYCL backend), everything else -> the main GPU's
+    // default buft. Each is allocated ONCE into a single resident buffer and
+    // suballocated via ggml_tallocr — no per-tensor churn, no free->realloc.
+    bool alloc_params_buffer_row_split() {
+        if (row_split_buft == nullptr) {
+            LOG_ERROR("%s row-split buft not initialized (backend lacks ggml_backend_split_buffer_type)",
+                      get_desc().c_str());
+            return false;
+        }
+        ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend);
+        const size_t main_align  = ggml_backend_buft_get_alignment(main_buft);
+        const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft);
+
+        size_t main_size = 0, split_size = 0;
+        size_t main_count = 0, split_count = 0;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            if (is_row_split_eligible(t)) {
+                split_size += GGML_PAD(ggml_backend_buft_get_alloc_size(row_split_buft, t), split_align);
+                split_count++;
+            } else {
+                main_size += GGML_PAD(ggml_backend_buft_get_alloc_size(main_buft, t), main_align);
+                main_count++;
+            }
+        }
+
+        if (main_size > 0) {
+            row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size);
+            if (row_main_buffer == nullptr) {
+                LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", get_desc().c_str(), main_size / (1024.f * 1024.f));
+                return false;
+            }
+        }
+        if (split_size > 0) {
+            row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size);
+            if (row_split_buffer == nullptr) {
+                LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", get_desc().c_str(), split_size / (1024.f * 1024.f));
+                return false;
+            }
+        }
+
+        ggml_tallocr main_alloc{};
+        ggml_tallocr split_alloc{};
+        if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer);
+        if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer);
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            ggml_status st = is_row_split_eligible(t) ? ggml_tallocr_alloc(&split_alloc, t) : ggml_tallocr_alloc(&main_alloc, t);
+            if (st != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s row-split tallocr_alloc failed", get_desc().c_str());
+                return false;
+            }
+        }
+        if (row_main_buffer != nullptr) ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        if (row_split_buffer != nullptr) ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        rebuild_params_tensor_set();
+        LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)",
+                 get_desc().c_str(), main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count);
+        return true;
+    }
+
+    // LAYER_SPLIT: assign each param tensor to a backend (via tensor_backend_fn,
+    // keyed by tensor pointer), allocate one resident buffer per backend on its
+    // default buft, and suballocate via ggml_tallocr.
+    bool alloc_params_buffer_layer_split() {
+        std::vector<ggml_backend_t> backends;
+        backends.push_back(runtime_backend);
+        for (auto* b : additional_backends) backends.push_back(b);
+
+        std::vector<ggml_backend_buffer_type_t> bufts(backends.size());
+        std::vector<size_t> aligns(backends.size());
+        std::vector<size_t> sizes(backends.size(), 0);
+        std::vector<size_t> counts(backends.size(), 0);
+        for (size_t i = 0; i < backends.size(); i++) {
+            bufts[i]  = ggml_backend_get_default_buffer_type(backends[i]);
+            aligns[i] = ggml_backend_buft_get_alignment(bufts[i]);
+        }
+
+        std::map<ggml_tensor*, int> tensor_backend_idx;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            int idx = 0;
+            if (tensor_backend_fn) {
+                ggml_backend_t target = tensor_backend_fn(t);
+                if (target != nullptr) {
+                    for (size_t i = 0; i < backends.size(); i++) {
+                        if (backends[i] == target) { idx = int(i); break; }
+                    }
+                }
+            }
+            tensor_backend_idx[t] = idx;
+            sizes[idx] += GGML_PAD(ggml_backend_buft_get_alloc_size(bufts[idx], t), aligns[idx]);
+            counts[idx] += 1;
+        }
+
+        multi_params_buffers.assign(backends.size(), nullptr);
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (sizes[i] == 0) continue;
+            ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]);
+            size_t free_pre = 0, total_pre = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_pre, &total_pre);
+            multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]);
+            if (multi_params_buffers[i] == nullptr) {
+                LOG_ERROR("%s layer-split alloc on %s failed (%.1f MB)", get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f));
+                return false;
+            }
+            size_t free_post = 0, total_post = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_post, &total_post);
+            LOG_DEBUG("%s layer-split alloc[%zu] %s req=%.1f MB dev_free %.1f -> %.1f MB is_host=%d",
+                      get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f),
+                      free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f),
+                      (int)ggml_backend_buffer_is_host(multi_params_buffers[i]));
+        }
+
+        std::vector<ggml_tallocr> tallocs(backends.size());
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (multi_params_buffers[i] != nullptr) tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]);
+        }
+        for (auto& kv : tensor_backend_idx) {
+            if (ggml_tallocr_alloc(&tallocs[kv.second], kv.first) != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s layer-split tallocr_alloc failed", get_desc().c_str());
+                return false;
+            }
+        }
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        }
+        rebuild_params_tensor_set();
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (counts[i] == 0) continue;
+            LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)",
+                     get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), counts[i]);
+        }
+        return true;
+    }
+
+    // Lazy mode: defer alloc + tensor-data load until the first compute().
+    // The caller still runs alloc_params_buffer + get_param_tensors at init,
+    // but for a lazy runner alloc_params_buffer is a no-op and the bulk loader
+    // skips this runner's tensors (they have no buffer yet); ensure_params_loaded()
+    // then allocates and invokes lazy_load_fn() on demand, and the params are
+    // freed after the phase (free_params_immediately) so components time-share VRAM.
+    void set_lazy_load(std::function<bool()> fn) {
+        lazy_load_fn = std::move(fn);
+    }
+
+    // True once a (non-lazy) buffer exists OR a lazy load has materialized one.
+    bool params_loaded() const {
+        return params_buffer != nullptr || !multi_params_buffers.empty() ||
+               row_split_buffer != nullptr || row_main_buffer != nullptr;
+    }
+
+    bool ensure_params_loaded() {
+        // Already resident, or a non-lazy runner (no tensors / mmap-resident params): nothing to do.
+        if (params_loaded() || !lazy_load_fn) {
+            return true;
+        }
+        int64_t t0 = ggml_time_ms();
+        if (!do_alloc_params_buffer()) {
+            // Drop partial buffers so params_loaded() is not true on a retry.
+            free_params_buffer();
+            return false;
+        }
+        if (!lazy_load_fn()) {
+            LOG_ERROR("%s: lazy params load failed", get_desc().c_str());
+            // Buffers exist but hold no valid weights: release them for a retry.
+            free_params_buffer();
+            return false;
+        }
+        LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (ggml_time_ms() - t0) / 1000.f);
+        return true;
+    }
+
     bool alloc_params_buffer() {
+        // Defer to first compute() for lazy runners (see set_lazy_load).
+        if (lazy_load_fn) {
+            return true;
+        }
+        return do_alloc_params_buffer();
+    }
+
+    bool do_alloc_params_buffer() {
+        if (multi_backend_mode) {
+            // Split allocation bypasses the mmap fast-path: the params must land
+            // in the GPU split buffers, not stay mmap'd.
+            if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) {
+                return alloc_params_buffer_row_split();
+            }
+            return alloc_params_buffer_layer_split();
+        }
         size_t num_tensors = ggml_tensor_num(params_ctx);
         if (num_tensors > 0) {
             // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
@@ -3086,14 +3539,53 @@ struct GGMLRunner {
             ggml_backend_buffer_free(params_buffer);
             params_buffer = nullptr;
         }
+        // Multi-GPU split buffers (layer-split: one per backend; row-split:
+        // split + main). The split buft itself is backend-cached, not freed.
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) {
+                ggml_backend_buffer_free(buf);
+            }
+        }
+        multi_params_buffers.clear();
+        if (row_split_buffer != nullptr) {
+            ggml_backend_buffer_free(row_split_buffer);
+            row_split_buffer = nullptr;
+        }
+        if (row_main_buffer != nullptr) {
+            ggml_backend_buffer_free(row_main_buffer);
+            row_main_buffer = nullptr;
+        }
+        // Release the multi-backend scheduler as well. Its reserved compute
+        // buffers can be GBs on each device, and free_compute_buffer only
+        // sched_reset()s them (kept alive across the sampling loop to avoid a
+        // per-step rebuild). free_params_buffer is the end-of-phase release, so
+        // here we actually free the sched so the next component can claim that
+        // VRAM (time-share). It is recreated lazily on the next compute().
+        if (sched != nullptr) {
+            ggml_backend_sched_free(sched);
+            sched          = nullptr;
+            sched_reserved = false;
+        }
         observed_max_effective_budget_ = 0;
     }
 
     size_t get_params_buffer_size() {
+        size_t total = 0;
         if (params_buffer != nullptr) {
-            return ggml_backend_buffer_get_size(params_buffer);
+            total += ggml_backend_buffer_get_size(params_buffer);
+        }
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) {
+                total += ggml_backend_buffer_get_size(buf);
+            }
+        }
+        if (row_split_buffer != nullptr) {
+            total += ggml_backend_buffer_get_size(row_split_buffer);
         }
-        return 0;
+        if (row_main_buffer != nullptr) {
+            total += ggml_backend_buffer_get_size(row_main_buffer);
+        }
+        return total;
     }
 
     void free_cache_ctx_and_buffer() {
@@ -3106,12 +3598,25 @@ struct GGMLRunner {
             ggml_gallocr_free(compute_allocr);
             compute_allocr = nullptr;
         }
+        if (sched != nullptr) {
+            // Reset (not free): keeping the sched alive across the sampling
+            // loop's compute() calls avoids a per-step rebuild. It is freed by
+            // the end-of-phase params release and at runner teardown.
+            ggml_backend_sched_reset(sched);
+            sched_reserved = false;
+        }
         restore_partial_params();
         restore_all_params();
     }
 
     // do copy after alloc graph
     void set_backend_tensor_data(ggml_tensor* tensor, const void* data) {
+        // In multi-backend mode, sched needs the tensor flagged as input so it
+        // gets a concrete backend assignment (tensors with no producers and no
+        // consumers otherwise stay at backend_id = -1 and never get a buffer).
+        if (multi_backend_mode) {
+            ggml_set_input(tensor);
+        }
         backend_tensor_data_map[tensor] = data;
     }
 
@@ -3174,6 +3679,11 @@ struct GGMLRunner {
                                          int n_threads,
                                          bool free_compute_buffer_immediately,
                                          bool no_return = false) {
+        // Lazy runners allocate + load their params here, on first use of the
+        // phase; they were skipped at init so components time-share VRAM.
+        if (!ensure_params_loaded()) {
+            return std::nullopt;
+        }
         ggml_cgraph* gf = nullptr;
         if (!prepare_compute_graph(get_graph, &gf)) {
             return std::nullopt;
@@ -3240,6 +3750,45 @@ struct GGMLRunner {
         stream_layers_enabled = enabled;
     }
 
+    // Configure a multi-GPU split for this runner. Must be called AFTER
+    // construction + get_param_tensors() and BEFORE alloc_params_buffer().
+    // For ROW_SPLIT, resolves the backend's stock split buffer type; if the
+    // backend has none (non-CUDA/SYCL), it cleanly falls back to single-GPU.
+    void set_multi_backend_spec(const MultiBackendSpec& spec) {
+        // Reuse params_loaded() so "already allocated" has one definition in this class.
+        if (params_loaded()) {
+            LOG_ERROR("%s set_multi_backend_spec called after params were allocated; ignoring",
+                      get_desc().c_str());
+            return;
+        }
+        multi_backend_mode  = true;
+        multi_backend_kind  = spec.mode;
+        additional_backends = spec.additional_backends;
+        tensor_backend_fn   = spec.tensor_backend_fn;
+        row_split_ratios    = spec.tensor_split_ratios;
+        row_main_device     = spec.main_device;
+        if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) {
+            row_split_buft = sd_backend_split_buffer_type(
+                runtime_backend,
+                row_main_device,
+                row_split_ratios.empty() ? nullptr : row_split_ratios.data());
+            if (row_split_buft == nullptr) {
+                LOG_WARN("%s row-split unavailable on this backend; falling back to single-GPU",
+                         get_desc().c_str());
+                multi_backend_mode = false;
+                additional_backends.clear();
+                tensor_backend_fn = nullptr;
+                return;
+            }
+        }
+        // Streaming (graph-cut param offload) is mutually exclusive with split.
+        stream_layers_enabled = false;
+    }
+
+    bool is_multi_backend() const {
+        return multi_backend_mode;
+    }
+
     sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; }
 
     ggml_backend_t get_runtime_backend() {
diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp
index d085129db..d8f43c90d 100644
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@@ -507,6 +507,10 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
     return init_cached_backend(name);
 }
 
+ggml_backend_t SDBackendManager::ensure_backend(const std::string& device_name) {
+    return init_cached_backend(device_name);
+}
+
 bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
     return sd_backend_is_cpu(runtime_backend(module));
 }
@@ -654,3 +658,22 @@ const char* sd_backend_module_name(SDBackendModule module) {
     }
     return "unknown";
 }
+
+ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) {
+    // Walk backend -> device -> registry; a missing link at any step means
+    // there is no split buffer type to hand out.
+    ggml_backend_dev_t device   = backend != nullptr ? ggml_backend_get_device(backend) : nullptr;
+    ggml_backend_reg_t registry = device != nullptr ? ggml_backend_dev_backend_reg(device) : nullptr;
+    if (registry == nullptr) {
+        return nullptr;
+    }
+    // The split buffer type is an optional entry point looked up by name at
+    // runtime, so the registry may simply not provide it.
+    void* proc           = ggml_backend_reg_get_proc_address(registry, "ggml_backend_split_buffer_type");
+    auto make_split_buft = (ggml_backend_split_buffer_type_t)proc;
+    if (make_split_buft == nullptr) {
+        // backend has no row-split support (non-CUDA/SYCL)
+        return nullptr;
+    }
+    return make_split_buft(main_device, tensor_split);
+}
diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h
index fc071ffda..24b53c1ad 100644
--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@@ -61,6 +61,12 @@ class SDBackendManager {
     ggml_backend_t runtime_backend(SDBackendModule module);
     ggml_backend_t params_backend(SDBackendModule module);
 
+    // Return (creating + caching on first use) the backend for an explicit
+    // ggml device name (e.g. "CUDA1"). Used to obtain the additional GPU
+    // backends a multi-GPU split needs; the manager owns the handle and frees
+    // it once at teardown, so callers only borrow it.
+    ggml_backend_t ensure_backend(const std::string& device_name);
+
     bool runtime_backend_is_cpu(SDBackendModule module);
     bool params_backend_is_cpu(SDBackendModule module);
     bool runtime_backend_supports_host_buffer(SDBackendModule module);
@@ -76,4 +82,12 @@ ggml_backend_t sd_backend_cpu_init();
 bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 const char* sd_backend_module_name(SDBackendModule module);
 void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
+
+// Runtime lookup of a backend's row-split buffer type, published by the CUDA
+// and SYCL backends as the "ggml_backend_split_buffer_type" proc. Returns
+// nullptr when the backend does not support row-split (the caller then falls
+// back to a non-split single-GPU path). `tensor_split` is a per-device weight
+// array of length = the backend registry's device count; `main_device` is the
+// index of the device that owns the non-split portion.
+ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split);
 #endif  // __SD_CORE_GGML_EXTEND_BACKEND_H__
diff --git a/src/core/util.cpp b/src/core/util.cpp
index 61101a08b..d50f3770f 100644
--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@@ -25,6 +25,7 @@
 #include <unistd.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml.h"
 #include "stable-diffusion.h"
 
@@ -972,3 +973,12 @@ std::vector<std::pair<std::string, float>> split_quotation_attention(
     }
     return result;
 }
+
+void sd_list_devices(void) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        const char* name       = ggml_backend_dev_name(dev);
+        const char* desc       = ggml_backend_dev_description(dev);
+        printf("%s\t%s\n", name ? name : "", desc ? desc : "");
+    }
+}
diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp
index a86b4cf50..fd149ce13 100644
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@@ -1606,8 +1606,13 @@ namespace LTXV {
             if (config.cross_attention_adaln) {
                 auto prompt_adaln_single       = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["prompt_adaln_single"]);
                 auto audio_prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["audio_prompt_adaln_single"]);
-                v_prompt_timestep_mod          = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
-                a_prompt_timestep_mod          = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
+                // The reference feeds modality.sigma (the RAW per-batch sigma) to
+                // both prompt adalns. effective_audio_timestep is exactly that:
+                // audio timesteps are never denoise-masked, so it carries the
+                // unmasked sigma even in i2v. The VIDEO timestep tensor is the
+                // denoise-masked per-token one and must NOT be used here.
+                v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
+                a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
             }
 
             auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep);
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 640c049ad..829260f00 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -197,6 +197,30 @@ class StableDiffusionGGML {
     std::string backend_spec;
     std::string params_backend_spec;
 
+    // DiT multi-GPU split decision captured from the auto-fit plan and applied
+    // to the diffusion runner(s) before param load. OFF when the DiT is not
+    // split. device_names[0] is the "main" GPU (largest); share_bytes is the
+    // per-device VRAM share (same order as device_names).
+    backend_fit::MultiGpuMode fit_dit_split_mode = backend_fit::MultiGpuMode::OFF;
+    std::vector<std::string>  fit_dit_split_device_names;  // ggml device names, [0] = main
+    std::vector<int64_t>      fit_dit_split_share_bytes;
+    // Conditioner (LLM) split decision — always layer-split when it splits
+    // (only the DiT ever row-splits; see backend_fit::supports_tensor_split).
+    backend_fit::MultiGpuMode fit_cond_split_mode = backend_fit::MultiGpuMode::OFF;
+    std::vector<std::string>  fit_cond_split_device_names;
+    std::vector<int64_t>      fit_cond_split_share_bytes;
+
+    // Kept alive past init() so lazy-load callbacks can re-read tensors from the
+    // model files on demand. Populated only when auto_lazy_load is on.
+    std::unique_ptr<ModelLoader> owned_model_loader;
+    // Auto-fit decided the components can't all be resident at once (the
+    // per-component MAX plan only fits if they time-share), so defer the heavy
+    // components' param alloc+load to their compute phase and free after.
+    bool auto_lazy_load = false;
+    // auto-fit is on: when a VAE decode OOMs we may auto-enable tiling and retry
+    // (temporal for LTX video, spatial otherwise) instead of failing.
+    bool auto_fit_enabled = false;
+
     bool is_using_v_parameterization     = false;
     bool is_using_edm_v_parameterization = false;
 
@@ -254,6 +278,338 @@ class StableDiffusionGGML {
         return ensure_backend_pair(SDBackendModule::DIFFUSION);
     }
 
+    // Parse a transformer block index out of a weight name, or -1 if none.
+    static int dit_block_index_of(const std::string& name) {
+        // Prefixes are tried in this order; only the first occurrence of each one is inspected.
+        static const char* prefixes[] = {"transformer_blocks.", "joint_blocks.", "double_blocks.",
+                                         "single_blocks.", "blocks.", "layers."};
+        for (const char* prefix : prefixes) {
+            const size_t hit = name.find(prefix);
+            if (hit != std::string::npos) {
+                // Measure the run of decimal digits directly after the prefix.
+                const size_t digits_begin = hit + strlen(prefix);
+                size_t       digits_end   = digits_begin;
+                for (; digits_end < name.size() && name[digits_end] >= '0' && name[digits_end] <= '9'; digits_end++) {
+                }
+                if (digits_end > digits_begin) {
+                    return atoi(name.substr(digits_begin, digits_end - digits_begin).c_str());
+                }
+            }
+        }
+        return -1;
+    }
+
+    // Build a MultiBackendSpec from the auto-fit DiT split decision and apply it
+    // to a diffusion runner BEFORE its params are allocated. No-op when the DiT
+    // is not split. Always returns true (any failure falls back to single-GPU).
+    bool apply_dit_multi_gpu_split(const std::shared_ptr<DiffusionModelRunner>& runner,
+                                   ModelLoader& model_loader) {
+        if (!runner || fit_dit_split_mode == backend_fit::MultiGpuMode::OFF ||
+            fit_dit_split_device_names.size() < 2) {
+            return true;
+        }
+        const auto& devnames = fit_dit_split_device_names;
+        const auto& shares   = fit_dit_split_share_bytes;
+        ggml_backend_t main_backend = runner->get_runtime_backend();
+        MultiBackendSpec spec;
+
+        if (fit_dit_split_mode == backend_fit::MultiGpuMode::ROW) {
+            // ROW: one main backend; matmul rows are split across the devices by
+            // the stock split buft. sched still needs the extra backends so it
+            // can route the cross-device reductions.
+            auto reg_prefix_of = [](const std::string& n) -> std::string {
+                size_t i = 0;
+                while (i < n.size() && !(n[i] >= '0' && n[i] <= '9')) {
+                    i++;
+                }
+                return n.substr(0, i);
+            };
+            std::string        reg_name = reg_prefix_of(devnames[0]);
+            ggml_backend_reg_t reg      = ggml_backend_reg_by_name(reg_name.c_str());
+            if (reg == nullptr) {
+                LOG_WARN("row-split: backend registry '%s' not found; using single GPU", reg_name.c_str());
+                return true;
+            }
+            int dev_count = (int)ggml_backend_reg_dev_count(reg);
+            if (dev_count <= 0) {
+                return true;
+            }
+            auto reg_index_of = [&](const std::string& n) -> int {
+                if (n.rfind(reg_name, 0) != 0) {
+                    return -1;
+                }
+                try {
+                    return std::stoi(n.substr(reg_name.size()));
+                } catch (...) {
+                    return -1;
+                }
+            };
+            int64_t total = 0;
+            for (auto b : shares) {
+                total += b;
+            }
+            if (total <= 0 || shares.size() < devnames.size()) {  // shares[k] is read for every devnames[k] below
+                return true;
+            }
+            std::vector<float> ratios(dev_count, 0.f);
+            for (size_t k = 0; k < devnames.size(); k++) {
+                int idx = reg_index_of(devnames[k]);
+                if (idx < 0 || idx >= dev_count) {
+                    continue;
+                }
+                ratios[idx] = float(double(shares[k]) / double(total));
+            }
+            // The main device must be the runner's runtime backend, which the
+            // planner set to devnames[0] (the largest-VRAM GPU, listed first).
+            // Keeping these aligned ensures the split buft's non-split portion
+            // and the runner's compute buffer live on the same device.
+            int main_dev = reg_index_of(devnames[0]);
+            if (main_dev < 0 || main_dev >= dev_count) {
+                return true;
+            }
+            for (size_t k = 0; k < devnames.size(); k++) {
+                int idx = reg_index_of(devnames[k]);
+                if (idx == main_dev || idx < 0) {
+                    continue;
+                }
+                ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+                if (b != nullptr) {
+                    spec.additional_backends.push_back(b);
+                } else {
+                    LOG_WARN("row-split: failed to init backend %s", devnames[k].c_str());
+                }
+            }
+            spec.mode                = MultiBackendMode::ROW_SPLIT;
+            spec.tensor_split_ratios = ratios;
+            spec.main_device         = main_dev;
+            LOG_INFO("DiT row-split across %zu devices (main reg-index %d)", devnames.size(), main_dev);
+        } else {
+            // LAYER: assign contiguous block ranges to per-device backends.
+            std::vector<ggml_backend_t> all_backends;
+            all_backends.push_back(main_backend);
+            for (size_t k = 1; k < devnames.size(); k++) {
+                ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+                if (b == nullptr) {
+                    LOG_WARN("layer-split: failed to init backend %s; using single GPU", devnames[k].c_str());
+                    return true;
+                }
+                spec.additional_backends.push_back(b);
+                all_backends.push_back(b);
+            }
+            const std::string tensor_prefix = "model.diffusion_model.";
+            std::map<int, int64_t> block_bytes;
+            int64_t                non_block_bytes = 0;
+            int                    max_block_idx   = -1;
+            for (const auto& kv : model_loader.get_tensor_storage_map()) {
+                if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                    continue;
+                }
+                int64_t bytes = (int64_t)kv.second.nbytes();
+                int     idx   = dit_block_index_of(kv.first);
+                if (idx >= 0) {
+                    block_bytes[idx] += bytes;
+                    if (idx > max_block_idx) {
+                        max_block_idx = idx;
+                    }
+                } else {
+                    non_block_bytes += bytes;
+                }
+            }
+            if (max_block_idx < 0) {
+                LOG_WARN("layer-split: no transformer blocks found; using single GPU");
+                return true;
+            }
+            const int n_blocks    = max_block_idx + 1;
+            int64_t   total_share = 0, total_block = 0;
+            for (auto s : shares) {
+                total_share += s;
+            }
+            for (const auto& kv : block_bytes) {
+                total_block += kv.second;
+            }
+            if (total_share <= 0) {
+                return true;
+            }
+            std::vector<int64_t> budgets(shares.size(), 0);
+            for (size_t k = 0; k < shares.size(); k++) {
+                int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share));
+                if (k == 0) {
+                    b = std::max<int64_t>(b - non_block_bytes, 0);  // backend 0 also holds non-block weights
+                }
+                budgets[k] = b;
+            }
+            std::vector<int> boundaries(shares.size(), 0);
+            size_t           cur     = 0;
+            int64_t          cur_use = 0;
+            for (int b = 0; b < n_blocks; b++) {
+                int64_t bb = block_bytes[b];
+                if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) {
+                    boundaries[cur] = b;
+                    cur++;
+                    cur_use = 0;
+                }
+                cur_use += bb;
+            }
+            for (size_t k = cur; k < boundaries.size(); k++) {
+                boundaries[k] = n_blocks;
+            }
+            for (size_t k = 0; k < boundaries.size(); k++) {
+                int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1;
+                if (boundaries[k] < min_bound) {
+                    boundaries[k] = std::min(min_bound, n_blocks);
+                }
+            }
+            // Map each param tensor pointer to its backend (block range -> device).
+            auto ptr_backend = std::make_shared<std::map<ggml_tensor*, ggml_backend_t>>();
+            std::map<std::string, ggml_tensor*> dit_map;
+            runner->get_param_tensors(dit_map);
+            for (const auto& kv : dit_map) {
+                ggml_backend_t target = all_backends[0];
+                if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) == 0) {
+                    int idx = dit_block_index_of(kv.first);
+                    if (idx >= 0) {
+                        for (size_t k = 0; k < boundaries.size(); k++) {
+                            if (idx < boundaries[k]) {
+                                target = all_backends[std::min(k, all_backends.size() - 1)];
+                                break;
+                            }
+                        }
+                    }
+                }
+                (*ptr_backend)[kv.second] = target;
+            }
+            spec.mode              = MultiBackendMode::LAYER_SPLIT;
+            spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t {
+                auto it = ptr_backend->find(t);
+                return it != ptr_backend->end() ? it->second : main_backend;
+            };
+            LOG_INFO("DiT layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        }
+
+        runner->set_multi_backend_spec(spec);
+        return true;
+    }
+
+    // Conditioner LLM layer-split (mirrors the DiT layer-split): partitions the
+    // "text_encoders.llm." transformer blocks across fit_cond_split_device_names,
+    // budgeted by fit_cond_split_share_bytes. Always LAYER_SPLIT: the conditioner
+    // never row-splits (only the DiT does). Tensors outside that prefix (e.g. the
+    // projector) stay on the TE main backend. Always returns true; every early
+    bool apply_cond_multi_gpu_split(const std::shared_ptr<Conditioner>& cond, ModelLoader& model_loader) {  // exit leaves `cond` without a split spec
+        if (!cond || fit_cond_split_mode == backend_fit::MultiGpuMode::OFF ||
+            fit_cond_split_device_names.size() < 2) {
+            return true;  // no conditioner, split disabled, or fewer than two devices
+        }
+        ggml_backend_t main_backend = backend_for(SDBackendModule::TE);
+        if (main_backend == nullptr) {
+            return true;  // nothing to anchor the split on
+        }
+        const auto& devnames = fit_cond_split_device_names;
+        const auto& shares   = fit_cond_split_share_bytes;
+        std::vector<ggml_backend_t> all_backends;
+        all_backends.push_back(main_backend);  // slot 0 = TE main backend; devnames[0] is not looked up here
+        MultiBackendSpec spec;
+        for (size_t k = 1; k < devnames.size(); k++) {
+            ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+            if (b == nullptr) {
+                LOG_WARN("cond layer-split: failed to init backend %s; using single GPU", devnames[k].c_str());
+                return true;  // spec is discarded, so the conditioner stays on main_backend
+            }
+            spec.additional_backends.push_back(b);
+            all_backends.push_back(b);
+        }
+        const std::string tensor_prefix = "text_encoders.llm.";
+        std::map<int, int64_t> block_bytes;          // bytes per transformer block index
+        int64_t                non_block_bytes = 0;  // prefixed tensors with no block index
+        int                    max_block_idx   = -1;
+        for (const auto& kv : model_loader.get_tensor_storage_map()) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;  // not an LLM tensor
+            }
+            int64_t bytes = (int64_t)kv.second.nbytes();
+            int     idx   = dit_block_index_of(kv.first);  // idx < 0 is treated as a non-block tensor
+            if (idx >= 0) {
+                block_bytes[idx] += bytes;
+                if (idx > max_block_idx) {
+                    max_block_idx = idx;
+                }
+            } else {
+                non_block_bytes += bytes;
+            }
+        }
+        if (max_block_idx < 0) {
+            LOG_WARN("cond layer-split: no transformer blocks under '%s'; using single GPU", tensor_prefix.c_str());
+            return true;
+        }
+        const int n_blocks    = max_block_idx + 1;  // block indices are assumed to run 0..max_block_idx
+        int64_t   total_share = 0, total_block = 0;
+        for (auto s : shares) {
+            total_share += s;
+        }
+        for (const auto& kv : block_bytes) {
+            total_block += kv.second;
+        }
+        if (total_share <= 0) {
+            return true;  // no usable shares (also covers an empty share list)
+        }
+        std::vector<int64_t> budgets(shares.size(), 0);  // per-device byte budget for block weights
+        for (size_t k = 0; k < shares.size(); k++) {
+            int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share));
+            if (k == 0) {
+                b = std::max<int64_t>(b - non_block_bytes, 0);  // backend 0 also holds the non-block weights
+            }
+            budgets[k] = b;
+        }
+        std::vector<int> boundaries(shares.size(), 0);  // boundaries[k] = exclusive end block of device k
+        size_t           cur     = 0;                   // device currently being filled
+        int64_t          cur_use = 0;                   // block bytes assigned to `cur` so far
+        for (int b = 0; b < n_blocks; b++) {
+            int64_t bb = block_bytes[b];  // operator[] yields 0 for an index that was never seen
+            if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) {
+                boundaries[cur] = b;  // budget would overflow: close this device (it holds >= 1 block)
+                cur++;
+                cur_use = 0;
+            }
+            cur_use += bb;
+        }
+        for (size_t k = cur; k < boundaries.size(); k++) {
+            boundaries[k] = n_blocks;  // the last used device, and any unused ones, end at the final block
+        }
+        for (size_t k = 0; k < boundaries.size(); k++) {
+            int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1;  // one past the previous boundary
+            if (boundaries[k] < min_bound) {
+                boundaries[k] = std::min(min_bound, n_blocks);  // keep boundaries increasing, capped at n_blocks
+            }
+        }
+        auto ptr_backend = std::make_shared<std::map<ggml_tensor*, ggml_backend_t>>();  // shared with the lambda below
+        std::map<std::string, ggml_tensor*> cond_map;
+        cond->get_param_tensors(cond_map);
+        for (const auto& kv : cond_map) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;  // only the LLM tensors are split; projector stays on main
+            }
+            ggml_backend_t target = all_backends[0];  // default for non-block LLM tensors
+            int            idx    = dit_block_index_of(kv.first);
+            if (idx >= 0) {
+                for (size_t k = 0; k < boundaries.size(); k++) {
+                    if (idx < boundaries[k]) {
+                        target = all_backends[std::min(k, all_backends.size() - 1)];  // clamp: shares may outnumber backends
+                        break;
+                    }
+                }
+            }
+            (*ptr_backend)[kv.second] = target;
+        }
+        spec.mode              = MultiBackendMode::LAYER_SPLIT;
+        spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t {
+            auto it = ptr_backend->find(t);
+            return it != ptr_backend->end() ? it->second : main_backend;  // unmapped tensors fall back to main
+        };
+        cond->set_multi_backend_spec(spec);
+        LOG_INFO("Conditioner LLM layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        return true;
+    }
+
     std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
         if (rng_type == STD_DEFAULT_RNG) {
             return std::make_shared<STDDefaultRNG>();
@@ -300,7 +656,11 @@ class StableDiffusionGGML {
         // placements before the backends are created (see the auto-fit block
         // below, which feeds its plan into init_backend()).
 
-        ModelLoader model_loader;
+        // Owned by the SD object so lazy-load callbacks can re-read tensors
+        // after init() returns. `model_loader` aliases it, so all the existing
+        // model_loader.* uses below are unchanged.
+        owned_model_loader        = std::make_unique<ModelLoader>();
+        ModelLoader& model_loader = *owned_model_loader;
 
         if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
             LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
@@ -448,6 +808,7 @@ class StableDiffusionGGML {
             return oss.str();
         };
 
+        auto_fit_enabled = sd_ctx_params->auto_fit;
         if (sd_ctx_params->auto_fit) {
             if (!backend_spec.empty() || !params_backend_spec.empty()) {
                 LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend "
@@ -455,25 +816,59 @@ class StableDiffusionGGML {
             }
 
             backend_fit::ComputeReserves reserves;
-            if (sd_ctx_params->auto_fit_compute_reserve_dit_mb > 0) {
-                reserves.dit_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_dit_mb) * backend_fit::MiB;
-            }
-            if (sd_ctx_params->auto_fit_compute_reserve_vae_mb > 0) {
-                reserves.vae_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_vae_mb) * backend_fit::MiB;
-            }
-            if (sd_ctx_params->auto_fit_compute_reserve_cond_mb > 0) {
-                reserves.conditioner_bytes =
-                    int64_t(sd_ctx_params->auto_fit_compute_reserve_cond_mb) * backend_fit::MiB;
+            // Parse the per-component reserve map ("dit=2048,vae=1024,cond=512").
+            // Missing keys keep the built-in defaults.
+            if (sd_ctx_params->auto_fit_compute_reserve != nullptr) {
+                std::string spec(sd_ctx_params->auto_fit_compute_reserve);
+                size_t      pos = 0;
+                while (pos < spec.size()) {
+                    size_t      comma = spec.find(',', pos);
+                    std::string entry = spec.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+                    pos               = comma == std::string::npos ? spec.size() : comma + 1;
+                    size_t eq         = entry.find('=');
+                    if (eq == std::string::npos) {
+                        LOG_WARN("auto-fit: ignoring malformed compute-reserve entry '%s' (expected component=MiB)", entry.c_str());
+                        continue;
+                    }
+                    std::string key = entry.substr(0, eq);
+                    int64_t     mib = std::atoll(entry.c_str() + eq + 1);
+                    if (mib <= 0) {
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (value must be a positive MiB count)", entry.c_str());
+                        continue;
+                    }
+                    backend_fit::ComponentKind kind;
+                    if (key == "dit" || key == "diffusion" || key == "model" || key == "unet") {
+                        kind = backend_fit::ComponentKind::DIT;
+                    } else if (key == "vae") {
+                        kind = backend_fit::ComponentKind::VAE;
+                    } else if (key == "cond" || key == "conditioner" || key == "te" || key == "clip") {
+                        kind = backend_fit::ComponentKind::CONDITIONER;
+                    } else {
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (unknown component, expected dit/vae/cond)", entry.c_str());
+                        continue;
+                    }
+                    switch (kind) {
+                        case backend_fit::ComponentKind::DIT:
+                            reserves.dit_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::VAE:
+                            reserves.vae_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::CONDITIONER:
+                            reserves.conditioner_bytes = mib * backend_fit::MiB;
+                            break;
+                    }
+                }
             }
             auto components = backend_fit::estimate_components(
                 model_loader, wtype, /*alignment=*/64, reserves);
             auto    devices = backend_fit::enumerate_gpu_devices();
             int64_t margin_bytes =
                 int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB;
+            backend_fit::MultiGpuMode multi_gpu_mode =
+                backend_fit::str_to_multi_gpu_mode(SAFE_STR(sd_ctx_params->multi_gpu_mode));
             auto plan = backend_fit::compute_plan(
-                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu);
+                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu, multi_gpu_mode);
             backend_fit::print_plan(plan, components, devices, margin_bytes);
 
             if (sd_ctx_params->auto_fit_dry_run) {
@@ -498,6 +893,14 @@ class StableDiffusionGGML {
                 spec += "=";
                 spec += value;
             };
+            auto dev_name_by_id = [&](int id) -> std::string {
+                for (const auto& dev : devices) {
+                    if (dev.id == id) {
+                        return dev.name;
+                    }
+                }
+                return "";
+            };
             auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) {
                 if (d == nullptr) {
                     return;
@@ -506,13 +909,36 @@ class StableDiffusionGGML {
                     append_assignment(runtime_spec, module_key, "cpu");
                     return;
                 }
-                std::string dev_name;
-                for (const auto& dev : devices) {
-                    if (dev.id == d->device_id) {
-                        dev_name = dev.name;
-                        break;
+                // Multi-GPU split (DiT or conditioner): the runner's main backend is
+                // the largest participating GPU (split_device_ids[0]); the per-tensor
+                // distribution is applied later via a MultiBackendSpec (see
+                // apply_dit_multi_gpu_split / apply_cond_multi_gpu_split). Record it here.
+                if (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT ||
+                    d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) {
+                    std::string main_dev = d->split_device_ids.empty() ? "" : dev_name_by_id(d->split_device_ids[0]);
+                    if (main_dev.empty()) {
+                        return;  // fall back to default backend
+                    }
+                    append_assignment(runtime_spec, module_key, main_dev);
+                    backend_fit::MultiGpuMode m = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT)
+                                                      ? backend_fit::MultiGpuMode::ROW
+                                                      : backend_fit::MultiGpuMode::LAYER;
+                    std::vector<std::string> names;
+                    for (int id : d->split_device_ids) {
+                        names.push_back(dev_name_by_id(id));
                     }
+                    if (std::string(module_key) == "diffusion") {
+                        fit_dit_split_mode         = m;
+                        fit_dit_split_device_names = names;
+                        fit_dit_split_share_bytes  = d->split_share_bytes;
+                    } else if (std::string(module_key) == "te") {
+                        fit_cond_split_mode         = m;
+                        fit_cond_split_device_names = names;
+                        fit_cond_split_share_bytes  = d->split_share_bytes;
+                    }
+                    return;
                 }
+                std::string dev_name = dev_name_by_id(d->device_id);
                 if (dev_name.empty()) {
                     return;  // no matching device; fall back to the default backend
                 }
@@ -530,6 +956,17 @@ class StableDiffusionGGML {
             LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'",
                      backend_spec.empty() ? "(default)" : backend_spec.c_str(),
                      params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str());
+
+            // When a component is split across GPUs the working set is tight:
+            // the split component (and the others sharing those GPUs) cannot all
+            // be resident at once. Enable lazy-load so the DiT / conditioner /
+            // VAE defer their param alloc+load to their compute phase and free
+            // after, time-sharing VRAM (the per-component MAX plan assumes this).
+            if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF ||
+                fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) {
+                auto_lazy_load = true;
+                LOG_INFO("auto-fit: enabling lazy-load (components time-share VRAM across phases)");
+            }
         }
 
         // Create the backends now that the placement (manual or auto-fit) is
@@ -859,9 +1296,20 @@ class StableDiffusionGGML {
             cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
             get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE));
 
+            // When the DiT is split across GPUs its params live resident in the
+            // (per-device) split buffers, so it must not be mmap'd and must not
+            // use the RAM-streaming path (mutually exclusive with split).
+            const bool dit_split = fit_dit_split_mode != backend_fit::MultiGpuMode::OFF &&
+                                   fit_dit_split_device_names.size() >= 2;
+            if (dit_split && stream_layers) {
+                LOG_WARN("--stream-layers is ignored for the diffusion model when it is "
+                         "split across GPUs (--multi-gpu-mode=%s)",
+                         backend_fit::multi_gpu_mode_str(fit_dit_split_mode));
+            }
+
             diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-            diffusion_model->set_stream_layers_enabled(stream_layers);
-            get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
+            diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers);
+            get_param_tensors(diffusion_model, dit_split ? false : module_can_mmap(SDBackendModule::DIFFUSION));
 
             if (sd_version_is_unet_edit(version)) {
                 vae_decode_only = false;
@@ -869,8 +1317,8 @@ class StableDiffusionGGML {
 
             if (high_noise_diffusion_model) {
                 high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                high_noise_diffusion_model->set_stream_layers_enabled(stream_layers);
-                get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
+                high_noise_diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers);
+                get_param_tensors(high_noise_diffusion_model, dit_split ? false : module_can_mmap(SDBackendModule::DIFFUSION));
             }
 
             if (!ensure_backend_pair(SDBackendModule::VAE)) {
@@ -1129,6 +1577,78 @@ class StableDiffusionGGML {
             ignore_tensors.insert("model.visual.deepstack_merger_list.");
         }
 
+        // --- Multi-GPU split + lazy-load (auto-fit) ------------------------
+        // Apply the split specs to the heavy runners BEFORE any params alloc,
+        // then (when the plan needs time-sharing) mark them lazy: drop their
+        // tensors from the bulk load + mmap here and load them on the first
+        // compute() of their phase instead, freeing after, so the DiT /
+        // conditioner / VAE share VRAM rather than coexisting.
+        apply_dit_multi_gpu_split(diffusion_model, model_loader);
+        apply_dit_multi_gpu_split(high_noise_diffusion_model, model_loader);
+        apply_cond_multi_gpu_split(cond_stage_model, model_loader);
+
+        if (auto_lazy_load) {
+            const int    lazy_threads = std::min(n_threads > 0 ? n_threads : 2, 2);
+            const bool   lazy_mmap    = sd_ctx_params->enable_mmap;
+            ModelLoader* loader_ptr   = owned_model_loader.get();
+            // Defer a component's params: drop its tensors from the bulk load +
+            // mmap set, and register a callback that loads just those tensors on
+            // first compute. `only_prefix` restricts to a sub-runner (the
+            // conditioner only lazy-loads its LLM; the small projector stays
+            // eager). set_lazy_load makes the runner's alloc a no-op at init.
+            // `collect` lets each component gather its own param tensors with the
+            // right call arity: DiffusionModel/Conditioner expose a 1-arg
+            // get_param_tensors() that bakes in their prefix, while VAE only has
+            // the 2-arg form (prefix is caller-supplied, no default). A single
+            // templated call can't cover both, so the caller passes a closure.
+            auto make_lazy = [&](auto&& component,
+                                 const std::function<void(std::map<std::string, ggml_tensor*>&)>& collect,
+                                 const std::string& only_prefix) {
+                if (!component) {
+                    return;
+                }
+                std::map<std::string, ggml_tensor*> all;
+                collect(all);
+                auto sub = std::make_shared<std::map<std::string, ggml_tensor*>>();
+                for (const auto& kv : all) {
+                    if (!only_prefix.empty() &&
+                        kv.first.compare(0, only_prefix.size(), only_prefix) != 0) {
+                        continue;
+                    }
+                    (*sub)[kv.first] = kv.second;
+                    tensors.erase(kv.first);
+                    mmap_able_tensors.erase(kv.first);
+                    ignore_tensors.insert(kv.first);
+                }
+                if (sub->empty()) {
+                    return;
+                }
+                component->set_lazy_load([loader_ptr, sub, lazy_threads, lazy_mmap]() -> bool {
+                    auto local = *sub;
+                    return loader_ptr->load_tensors(local, {}, lazy_threads, lazy_mmap);
+                });
+                LOG_INFO("auto-fit: deferring %zu tensors to first compute (lazy-load)", sub->size());
+            };
+            make_lazy(diffusion_model,
+                      [&](std::map<std::string, ggml_tensor*>& m) { diffusion_model->get_param_tensors(m); },
+                      "");
+            make_lazy(high_noise_diffusion_model,
+                      [&](std::map<std::string, ggml_tensor*>& m) { high_noise_diffusion_model->get_param_tensors(m); },
+                      "");
+            make_lazy(cond_stage_model,
+                      [&](std::map<std::string, ggml_tensor*>& m) { cond_stage_model->get_param_tensors(m); },
+                      "text_encoders.llm.");
+            // The VAE must also time-share: left eager it squats its ~1.4 GB on
+            // its placed GPU through the conditioner and DiT phases, which on a
+            // tight card is exactly enough to OOM the layer-split conditioner's
+            // compute buffer. Defer it like the rest (prefix "first_stage_model"
+            // matches its loader tensor names).
+            make_lazy(first_stage_model,
+                      [&](std::map<std::string, ggml_tensor*>& m) { first_stage_model->get_param_tensors(m, "first_stage_model"); },
+                      "");
+        }
+        // ------------------------------------------------------------------
+
         if (enable_mmap_tensors) {
             if (mmap_able_tensors.empty()) {
                 LOG_DEBUG("no tensors could be memory-mapped");
@@ -2446,7 +2966,35 @@ class StableDiffusionGGML {
         }
         auto latents = first_stage_model->diffusion_to_vae_latents(x);
         first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
-        return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+        auto decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+        // Auto-fit tiling fallback: a full-frame video decode can need ~10 GB of
+        // compute buffer and OOM (a graceful failure -> empty result, not an
+        // abort). Under auto-fit, enable tiling and retry once instead of failing.
+        // Temporal tiling is LTX-only (its 3D VAE supports temporal_tile_frames);
+        // every other architecture falls back to ordinary spatial tiling.
+        if (decoded.empty() && auto_fit_enabled) {
+            bool changed = false;
+            if (version == VERSION_LTXAV) {
+                if (!vae_tiling_params.temporal_tiling) {
+                    vae_tiling_params.temporal_tiling = true;
+                    changed                           = true;
+                }
+            } else if (!vae_tiling_params.enabled) {
+                vae_tiling_params.enabled = true;
+                // Reasonable default tile if the user didn't set one.
+                if (vae_tiling_params.tile_size_x <= 0) vae_tiling_params.tile_size_x = 256;
+                if (vae_tiling_params.tile_size_y <= 0) vae_tiling_params.tile_size_y = 256;
+                changed = true;
+            }
+            if (changed) {
+                LOG_WARN("auto-fit: VAE decode failed (likely OOM); retrying with %s tiling",
+                         version == VERSION_LTXAV ? "temporal" : "spatial");
+                first_stage_model->free_compute_buffer();
+                first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
+                decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+            }
+        }
+        return decoded;
     }
 
     sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
@@ -2802,10 +3350,9 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->auto_fit                         = true;
     sd_ctx_params->auto_fit_target_mb               = 512;
     sd_ctx_params->auto_fit_dry_run                 = false;
-    sd_ctx_params->auto_fit_compute_reserve_dit_mb  = 0;
-    sd_ctx_params->auto_fit_compute_reserve_vae_mb  = 0;
-    sd_ctx_params->auto_fit_compute_reserve_cond_mb = 0;
+    sd_ctx_params->auto_fit_compute_reserve         = nullptr;
     sd_ctx_params->auto_multi_gpu                   = true;
+    sd_ctx_params->multi_gpu_mode                   = "row";
 }
 
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@@ -2847,10 +3394,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "auto_fit: %s\n"
              "auto_fit_target_mb: %d\n"
              "auto_fit_dry_run: %s\n"
-             "auto_fit_compute_reserve_dit_mb: %d\n"
-             "auto_fit_compute_reserve_vae_mb: %d\n"
-             "auto_fit_compute_reserve_cond_mb: %d\n"
+             "auto_fit_compute_reserve: %s\n"
              "auto_multi_gpu: %s\n"
+             "multi_gpu_mode: %s\n"
              "flash_attn: %s\n"
              "diffusion_flash_attn: %s\n"
              "circular_x: %s\n"
@@ -2891,10 +3437,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->auto_fit),
              sd_ctx_params->auto_fit_target_mb,
              BOOL_STR(sd_ctx_params->auto_fit_dry_run),
-             sd_ctx_params->auto_fit_compute_reserve_dit_mb,
-             sd_ctx_params->auto_fit_compute_reserve_vae_mb,
-             sd_ctx_params->auto_fit_compute_reserve_cond_mb,
+             SAFE_STR(sd_ctx_params->auto_fit_compute_reserve),
              BOOL_STR(sd_ctx_params->auto_multi_gpu),
+             SAFE_STR(sd_ctx_params->multi_gpu_mode),
              BOOL_STR(sd_ctx_params->flash_attn),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
              BOOL_STR(sd_ctx_params->circular_x),
diff --git a/src/version.cpp b/src/version.cpp
index 6c266153c..97dc8426b 100644
--- a/src/version.cpp
+++ b/src/version.cpp
@@ -1,6 +1,3 @@
-#include <cstdio>
-
-#include "ggml-backend.h"
 #include "stable-diffusion.h"
 
 #ifndef SDCPP_BUILD_COMMIT
@@ -21,12 +18,3 @@ const char* sd_commit(void) {
 const char* sd_version(void) {
     return STRINGIZE(SDCPP_BUILD_VERSION);
 }
-
-void sd_list_devices(void) {
-    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        const char* name       = ggml_backend_dev_name(dev);
-        const char* desc       = ggml_backend_dev_description(dev);
-        std::printf("%s\t%s\n", name ? name : "", desc ? desc : "");
-    }
-}