From ce85c6b959a406c3f4ebd9342098f0ed4925d71c Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 3 May 2026 08:16:12 -0300 Subject: [PATCH 1/3] sd: generalize internal interfaces to place generation on CPU --- expose.h | 6 +++--- koboldcpp.py | 12 ++++++------ otherarch/sdcpp/sdtype_adapter.cpp | 7 +++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/expose.h b/expose.h index e4ee7ff6c98..d7d1ed1ee77 100644 --- a/expose.h +++ b/expose.h @@ -177,13 +177,13 @@ struct sd_load_model_inputs { const char * model_filename = nullptr; const char * executable_path = nullptr; - const int kcpp_main_gpu = -1; + const int kcpp_main_device = -1; const int threads = 0; const int quant = 0; const bool flash_attention = false; const bool offload_cpu = false; - const bool vae_cpu = false; - const bool clip_cpu = false; + const int kcpp_vae_device = -1; + const int kcpp_clip_device = -1; const bool diffusion_conv_direct = false; const bool vae_conv_direct = false; const bool taesd = false; diff --git a/koboldcpp.py b/koboldcpp.py index d39cba4dff3..b0a0f375b94 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -358,13 +358,13 @@ class generation_outputs(ctypes.Structure): class sd_load_model_inputs(ctypes.Structure): _fields_ = [("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), - ("kcpp_main_gpu", ctypes.c_int), + ("kcpp_main_device", ctypes.c_int), ("threads", ctypes.c_int), ("quant", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("offload_cpu", ctypes.c_bool), - ("vae_cpu", ctypes.c_bool), - ("clip_cpu", ctypes.c_bool), + ("kcpp_vae_device", ctypes.c_int), + ("kcpp_clip_device", ctypes.c_int), ("diffusion_conv_direct", ctypes.c_bool), ("vae_conv_direct", ctypes.c_bool), ("taesd", ctypes.c_bool), @@ -2401,8 +2401,8 @@ def sd_load_model(model_filename,vae_filename,t5xxl_filename,clip1_filename,clip inputs.quant = args.sdquant inputs.flash_attention = args.sdflashattention inputs.offload_cpu = args.sdoffloadcpu - inputs.vae_cpu = 
args.sdvaecpu - inputs.clip_cpu = False if args.sdclipgpu else True + inputs.kcpp_vae_device = -2 if args.sdvaecpu else -1 + inputs.kcpp_clip_device = -1 if args.sdclipgpu else -2 sdconvdirect = sd_convdirect_option(args.sdconvdirect) inputs.diffusion_conv_direct = sdconvdirect == 'full' inputs.vae_conv_direct = sdconvdirect in ['vaeonly', 'full'] @@ -2430,7 +2430,7 @@ def sd_load_model(model_filename,vae_filename,t5xxl_filename,clip1_filename,clip inputs.img_hard_limit = args.sdclamped inputs.img_soft_limit = args.sdclampedsoft inputs = set_backend_props(inputs) - inputs.kcpp_main_gpu = args.sdmaingpu + inputs.kcpp_main_device = args.sdmaingpu ret = handle.sd_load_model(inputs) return ret diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index f622a4354b3..00df0d7ed85 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -120,7 +120,6 @@ struct SDParams { float eta = -1.0f; float strength = 0.75f; int64_t seed = 42; - bool clip_on_cpu = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; @@ -285,7 +284,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename); //kcpp allow gpu id override - config_main_gpu(inputs.kcpp_main_gpu); + config_main_gpu(inputs.kcpp_main_device); int lora_apply_mode = LORA_APPLY_AT_RUNTIME; bool lora_dynamic = false; @@ -424,8 +423,8 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { params.vae_conv_direct = sd_params->vae_conv_direct; params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask; params.offload_params_to_cpu = inputs.offload_cpu; - params.keep_vae_on_cpu = inputs.vae_cpu; - params.keep_clip_on_cpu = inputs.clip_cpu; + params.keep_vae_on_cpu = (inputs.kcpp_vae_device <= -2); + params.keep_clip_on_cpu = (inputs.kcpp_clip_device <= -2); params.lora_apply_mode = (lora_apply_mode_t)lora_apply_mode; // also switches 
flash attn for the vae and conditioner From 861ec24b091feeec2c136f35522701f2fa93da45 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 3 May 2026 12:55:58 -0300 Subject: [PATCH 2/3] sd: backend support for multi-device selection --- otherarch/sdcpp/kcpp_sd_extensions.h | 2 +- otherarch/sdcpp/sdtype_adapter.cpp | 9 ++++-- otherarch/sdcpp/stable-diffusion.cpp | 39 +++++++++++++++++++++-- otherarch/sdcpp/upscaler.cpp | 2 +- otherarch/sdcpp/util.cpp | 46 ++++++++++++++++++---------- otherarch/sdcpp/util.h | 1 + 6 files changed, 74 insertions(+), 25 deletions(-) diff --git a/otherarch/sdcpp/kcpp_sd_extensions.h b/otherarch/sdcpp/kcpp_sd_extensions.h index 3e7be4cb65c..1bee9f107f3 100644 --- a/otherarch/sdcpp/kcpp_sd_extensions.h +++ b/otherarch/sdcpp/kcpp_sd_extensions.h @@ -32,7 +32,7 @@ namespace kcpp_sd { void set_sd_log_level(int log); - void config_main_gpu(int value); + void config_device(int value, const std::string& mod = ""); } diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 00df0d7ed85..c1d16fc842c 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -284,7 +284,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename); //kcpp allow gpu id override - config_main_gpu(inputs.kcpp_main_device); + config_device(inputs.kcpp_main_device); + config_device(inputs.kcpp_clip_device, "clip"); + config_device(inputs.kcpp_vae_device, "vae"); + config_device(inputs.kcpp_main_device, "upscaler"); int lora_apply_mode = LORA_APPLY_AT_RUNTIME; bool lora_dynamic = false; @@ -423,8 +426,8 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { params.vae_conv_direct = sd_params->vae_conv_direct; params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask; params.offload_params_to_cpu = inputs.offload_cpu; - params.keep_vae_on_cpu = (inputs.kcpp_vae_device <= -2); - params.keep_clip_on_cpu = 
(inputs.kcpp_clip_device <= -2); + //params.keep_vae_on_cpu = (inputs.kcpp_vae_device <= -2); + //params.keep_clip_on_cpu = (inputs.kcpp_clip_device <= -2); params.lora_apply_mode = (lora_apply_mode_t)lora_apply_mode; // also switches flash attn for the vae and conditioner diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 9077f061854..aef7612ae7c 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -163,7 +163,24 @@ class StableDiffusionGGML { StableDiffusionGGML() = default; + std::map > kcpp_backends; + ggml_backend_t kcpp_get_backend(std::string mod = "", bool on_cpu = false) { + if (on_cpu) { + mod = "CPU"; + } + std::string device_name = kcpp_sd_get_device_name(mod); + auto it = kcpp_backends.find(device_name); + if (it == kcpp_backends.end()) { + auto backend = std::shared_ptr(init_named_backend(device_name), ggml_backend_free); + kcpp_backends[device_name] = backend; + return backend.get(); + } else { + return it->second.get(); + } + } + ~StableDiffusionGGML() { + #if 0 // kcpp if (clip_backend != backend) { ggml_backend_free(clip_backend); } @@ -174,6 +191,9 @@ class StableDiffusionGGML { ggml_backend_free(vae_backend); } ggml_backend_free(backend); + # else // kcpp + // handled by the shared_ptr destructor + #endif // kcpp } std::string toLowerCase(const std::string& str) { @@ -188,7 +208,7 @@ class StableDiffusionGGML { } void init_backend() { - backend = sd_get_default_backend(); + backend = kcpp_get_backend(); } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -582,11 +602,17 @@ class StableDiffusionGGML { : static_cast(static_cast(max_vram) * 1024.0 * 1024.0 * 1024.0); { - clip_backend = backend; + #if 0 // kcpp if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("CLIP: Using CPU backend"); clip_backend = ggml_backend_cpu_init(); } + #else // kcpp + clip_backend = kcpp_get_backend("clip", clip_on_cpu); + if (ggml_backend_is_cpu(clip_backend) && 
!ggml_backend_is_cpu(backend)) { + LOG_INFO("CLIP: Using CPU backend"); + } + #endif if (sd_version_is_sd3(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -703,7 +729,7 @@ class StableDiffusionGGML { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, - version); +version); diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, @@ -765,12 +791,19 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } + #if 1 // kcpp + vae_backend = kcpp_get_backend("vae", sd_ctx_params->keep_vae_on_cpu); + if (ggml_backend_is_cpu(vae_backend) && !ggml_backend_is_cpu(backend)) { + LOG_INFO("VAE Autoencoder: Using CPU backend"); + } + #else // kcpp if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend = ggml_backend_cpu_init(); } else { vae_backend = backend; } + #endif // kcpp auto create_tae = [&]() -> std::shared_ptr { if (sd_version_is_wan(version) || diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp index 25fc0c5df86..07d12bf8d9f 100644 --- a/otherarch/sdcpp/upscaler.cpp +++ b/otherarch/sdcpp/upscaler.cpp @@ -24,7 +24,7 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); - backend = sd_get_default_backend(); + backend = init_named_backend(kcpp_sd_get_device_name("upscaler")); ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp index 17a41043d59..d26d4a93fd8 100644 --- a/otherarch/sdcpp/util.cpp +++ b/otherarch/sdcpp/util.cpp @@ -747,36 +747,48 @@ void kcpp_sd::set_sd_log_level(int log) sdloglevel = log; } -static int kcpp_main_gpu = -1; -void kcpp_sd::config_main_gpu(int value) { +static std::string kcpp_main_gpu = ""; +static std::map 
kcpp_backend_config; +void kcpp_sd::config_device(int value, const std::string& mod) { + ggml_backend_load_all_once(); + std::string dev_name = ""; if (value >= 0) { size_t dev_count = ggml_backend_dev_count(); size_t dev_index = static_cast(value); if (dev_index >= dev_count) { LOG_WARN("device %d not found, falling back to default", value); - value = -1; + } else { + dev_name = ggml_backend_dev_name(ggml_backend_dev_get(dev_index)); } } else if (value <= -2) { - value = -2; + dev_name = "CPU"; + } + if (mod == "" || mod == "main") { + kcpp_main_gpu = dev_name; + } else if (dev_name == "") { + kcpp_backend_config.erase(mod); + } else { + kcpp_backend_config[mod] = dev_name; + } +} +std::string kcpp_sd_get_device_name(const std::string& mod) { + auto it = kcpp_backend_config.find(mod); + if (it != kcpp_backend_config.end()) { + return it->second; + } else if (kcpp_main_gpu == "") { + return get_default_backend_name(); + } else { + return kcpp_main_gpu; } - kcpp_main_gpu = value; } static ggml_backend_t kcpp_get_main_gpu() { ggml_backend_t backend = nullptr; - if (kcpp_main_gpu != -1) { - std::string dev_name; - if (kcpp_main_gpu <= -2) { - dev_name = "CPU"; - } else { - auto dev = ggml_backend_dev_get(static_cast(kcpp_main_gpu)); - dev_name = ggml_backend_dev_name(dev); - } - backend = init_named_backend(dev_name); - if (backend) { + if (kcpp_main_gpu != "") { + std::string dev_name = kcpp_sd_get_device_name(); + if (dev_name != "") { + backend = init_named_backend(dev_name); LOG_INFO("Setting %s as main device (#%d)", dev_name.c_str(), kcpp_main_gpu); - } else { - LOG_WARN("Couldn't initialize device #%d; falling back to the default device", kcpp_main_gpu); } } return backend; diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h index 36d168e2d59..096cdb473a2 100644 --- a/otherarch/sdcpp/util.h +++ b/otherarch/sdcpp/util.h @@ -88,6 +88,7 @@ bool sd_should_preview_noisy(); // test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. 
# Device choices offered by the GUI comboboxes (1-based GPU indices).
sd_device_choices = ['CPU', 'main', '1', '2', '3', '4']

def sd_get_device_number(name, offset=0):
    """Map a device selection string to the numeric convention used by the
    image-gen backend: -2 = CPU, -1 = main/default device, otherwise a
    device index (*offset* converts the 1-based UI number to 0-based)."""
    if not name:
        return -1
    name = name.lower()
    # "" is already handled by the falsy guard above, so no alias is needed.
    aliases = {"cpu": -2, "gpu": -1, "main": -1, "default": -1}
    if name in aliases:
        return aliases[name]
    try:
        return int(name) + offset
    except ValueError:
        # BUG FIX: the original returned tryparseint(name, -1) + offset, which
        # applied the offset to the -1 failure sentinel too — an unrecognized
        # string with offset=-1 silently selected the CPU (-2) instead of the
        # main device.  Fall back to -1 without the offset.
        return -1

def sd_get_device_name(value, offset=0):
    """Inverse of sd_get_device_number: map a device number back to its UI
    string.  Always returns str (the value is fed to a Tk StringVar /
    combobox); the original returned an int on the index branch."""
    if value <= -2:
        return "CPU"
    if value == -1:
        return "main"
    return str(value + offset)
sd_upscaler_var = ctk.StringVar() sd_flash_attention_var = ctk.IntVar(value=0) sd_offload_cpu_var = ctk.IntVar(value=0) - sd_vae_cpu_var = ctk.IntVar(value=0) - sd_clip_gpu_var = ctk.IntVar(value=0) + sd_vae_device_var = ctk.StringVar(value="main") + sd_clip_device_var = ctk.StringVar(value="CPU") sd_runtime_loras_var = ctk.IntVar(value=0) sd_vaeauto_var = ctk.IntVar(value=0) sd_tiled_vae_var = ctk.StringVar(value=str(default_vae_tile_threshold)) @@ -8470,9 +8488,9 @@ def toggletaesd(a,b,c): makelabelcombobox(images_tab, "Conv2D Direct:", sd_convdirect_var, row=42, labelpadx=(220), padx=(310), width=90, tooltiptxt="Use Conv2D Direct operation. May save memory or improve performance.\nMight crash if not supported by the backend.\n", values=sd_convdirect_choices) makelabelentry(images_tab, "VAE Tiling Threshold:", sd_tiled_vae_var, 44, 50, padx=(144),singleline=True,tooltip="Enable VAE Tiling for images above this size, to save memory.\nSet to 0 to disable VAE tiling.") makecheckbox(images_tab, "SD Flash Attention", sd_flash_attention_var, 44,padx=(230), tooltiptxt="Enable Flash Attention for image diffusion. May save memory or improve performance.") - makecheckbox(images_tab, "Model CPU Offload", sd_offload_cpu_var, 50,padx=8, tooltiptxt="Offload image weights in RAM to save VRAM, swap into VRAM when needed.") - makecheckbox(images_tab, "VAE on CPU", sd_vae_cpu_var, 50,padx=(160), tooltiptxt="Force VAE to CPU only for image generation.") - makecheckbox(images_tab, "CLIP on GPU", sd_clip_gpu_var, 50,padx=(280), tooltiptxt="Put CLIP and T5 to GPU for image generation. 
Otherwise, CLIP will use CPU.") + makecheckbox(images_tab, "Model Offload", sd_offload_cpu_var, 50,padx=8, tooltiptxt="Offload image weights in RAM to save VRAM, swap into VRAM when needed.") + makelabelcombobox(images_tab, "VAE dev:", sd_vae_device_var, 50,labelpadx=(140),padx=(200), width=70, tooltiptxt="Change VAE device for image generation.", values=sd_device_choices) + makelabelcombobox(images_tab, "CLIP dev:", sd_clip_device_var, 50,labelpadx=(280),padx=340, width=70, tooltiptxt="Change CLIP / T5 / LLM device for image generation.", values=sd_device_choices) # audio tab audio_tab = tabcontent["Audio"] @@ -8785,8 +8803,8 @@ def export_vars(): args.sdmodel = sd_model_var.get() if sd_model_var.get() != "" else "" args.sdflashattention = True if sd_flash_attention_var.get()==1 else False args.sdoffloadcpu = True if sd_offload_cpu_var.get()==1 else False - args.sdvaecpu = True if sd_vae_cpu_var.get()==1 else False - args.sdclipgpu = True if sd_clip_gpu_var.get()==1 else False + args.sdvaedevice = sd_get_device_number(sd_vae_device_var.get(), -1) + args.sdclipdevice = sd_get_device_number(sd_clip_device_var.get(), -1) args.sdthreads = (0 if sd_threads_var.get()=="" else int(sd_threads_var.get())) args.sdclamped = (0 if int(sd_clamped_var.get())<=0 else int(sd_clamped_var.get())) args.sdclampedsoft = (0 if int(sd_clamped_soft_var.get())<=0 else int(sd_clamped_soft_var.get())) @@ -9074,8 +9092,8 @@ def import_vars(mydict): sd_quant_var.set(sd_quant_choices[(mydict["sdquant"] if ("sdquant" in mydict and mydict["sdquant"]>=0 and mydict["sdquant"]