From 5ea29e33dd04f3800b53e88a0358439adf2f6584 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 20 Apr 2026 16:59:09 -0300 Subject: [PATCH 1/3] feat: transition from compile-time to runtime backend discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applies the following parts from #1184 and #1368: - Introduce ggml_extend_backend.hpp for dynamic backend management. - Convert backend-specific SD_USE_* preprocessor tests to runtime tests, propagating the backend handle when needed. Additionally, to make this work with minimal changes: - A new function sd_get_default_backend replaces the default backend selection in src/stable-diffusion.cpp and src/upscaler.cpp, preserving the SD_VK_DEVICE env var support. - Remove the SD_USE_* defines from CMakeLists.txt (no other build changes). This is not just a refactor, because it improves device selection a bit: - Previously, Vulkan selected device 0 by default, but this was the wrong choice on my system, which has the iGPU as 0 and the discrete card as 1. The new selection algorithm correctly prioritizes the discrete GPU. - The upscaler now follows SD_VK_DEVICE too. 
Co-authored-by: Stéphane du Hamel Co-authored-by: Cyberhan123 <255542417@qq.com> --- CMakeLists.txt | 7 - src/common_block.hpp | 8 +- src/ggml_extend.hpp | 183 +++++++++++++--------- src/ggml_extend_backend.hpp | 293 ++++++++++++++++++++++++++++++++++++ src/lora.hpp | 74 ++++----- src/model.cpp | 15 +- src/qwen_image.hpp | 8 +- src/stable-diffusion.cpp | 55 +------ src/upscaler.cpp | 23 +-- src/util.cpp | 75 ++++++++- src/util.h | 5 + src/z_image.hpp | 16 +- 12 files changed, 543 insertions(+), 219 deletions(-) create mode 100644 src/ggml_extend_backend.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a9fb1041..48ce456ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,37 +72,31 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF if(SD_CUDA) message("-- Use CUDA as backend stable-diffusion") set(GGML_CUDA ON) - add_definitions(-DSD_USE_CUDA) endif() if(SD_METAL) message("-- Use Metal as backend stable-diffusion") set(GGML_METAL ON) - add_definitions(-DSD_USE_METAL) endif() if (SD_VULKAN) message("-- Use Vulkan as backend stable-diffusion") set(GGML_VULKAN ON) - add_definitions(-DSD_USE_VULKAN) endif () if (SD_OPENCL) message("-- Use OpenCL as backend stable-diffusion") set(GGML_OPENCL ON) - add_definitions(-DSD_USE_OPENCL) endif () if (SD_HIPBLAS) message("-- Use HIPBLAS as backend stable-diffusion") set(GGML_HIP ON) - add_definitions(-DSD_USE_CUDA) endif () if(SD_MUSA) message("-- Use MUSA as backend stable-diffusion") set(GGML_MUSA ON) - add_definitions(-DSD_USE_CUDA) endif() if(SD_WEBP) @@ -222,7 +216,6 @@ if(SD_SYCL) message("-- Use SYCL as backend stable-diffusion") set(GGML_SYCL ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl") - add_definitions(-DSD_USE_SYCL) # disable fast-math on host, see: # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html if (WIN32) diff --git a/src/common_block.hpp b/src/common_block.hpp index 112a4d7a1..e6c0b06bd 100644 --- 
a/src/common_block.hpp +++ b/src/common_block.hpp @@ -1,7 +1,9 @@ #ifndef __COMMON_BLOCK_HPP__ #define __COMMON_BLOCK_HPP__ +#include "ggml-backend.h" #include "ggml_extend.hpp" +#include "util.h" class DownSampleBlock : public GGMLBlock { protected: @@ -248,9 +250,6 @@ class FeedForward : public GGMLBlock { float scale = 1.f; if (precision_fix) { scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif } // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using Vulkan without enabling force_prec_f32, @@ -264,6 +263,9 @@ class FeedForward : public GGMLBlock { auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]); auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + net_2->set_force_prec_f32(true); + } x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 859270cbd..3eaf97c54 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -24,32 +24,12 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "model.h" #include "tensor.hpp" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef SD_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "rng.hpp" #include "tensor_ggml.hpp" #include "util.h" @@ -91,6 +71,45 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } +__STATIC_INLINE__ bool backend_name_exists(std::string name) { + ggml_backend_load_all_once(); + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { + return true; + 
} + } + return false; +} + +__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { + if (name == "" || backend_name_exists(name)) { + return name; + } else { + LOG_WARN("Backend %s not found, using default backend", name.c_str()); + return ""; + } +} + +__STATIC_INLINE__ std::string get_default_backend_name() { + ggml_backend_load_all_once(); + // should pick the same backend as ggml_backend_init_best + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + return ggml_backend_dev_name(dev); +} + +__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { + ggml_backend_load_all_once(); + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (name.empty()) { + return ggml_backend_init_best(); + } else { + return ggml_backend_init_by_name(name.c_str(), nullptr); + } +} + static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -1286,25 +1305,25 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx, return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); } -__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { -#ifdef SD_USE_VULKAN - auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); - auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); - out = ggml_get_rows(ctx, out, zero_index); - out = ggml_reshape(ctx, out, a); - // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); - return out; -#else - auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); - ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] - if (ggml_is_transposed(out)) { - out = ggml_mul_mat(ctx, one, out); +__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* a) { + if (sd_backend_is(backend, 
"Vulkan")) { + auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); + auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); + out = ggml_get_rows(ctx, out, zero_index); + out = ggml_reshape(ctx, out, a); + // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); + return out; } else { - out = ggml_mul_mat(ctx, out, one); + auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); + ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] + if (ggml_is_transposed(out)) { + out = ggml_mul_mat(ctx, one, out); + } else { + out = ggml_mul_mat(ctx, out, one); + } + out = ggml_reshape(ctx, out, a); + return out; } - out = ggml_reshape(ctx, out, a); -#endif - return out; } // q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head] @@ -1496,16 +1515,14 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx, } __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { -#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) - if (!ggml_backend_is_cpu(backend)) { + if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) && + !ggml_backend_is_cpu(backend)) { ggml_backend_tensor_get_async(backend, tensor, data, offset, size); ggml_backend_synchronize(backend); - } else { - ggml_backend_tensor_get(tensor, data, offset, size); + return; } -#else + ggml_backend_tensor_get(tensor, data, offset, size); -#endif } __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) { @@ -1664,14 +1681,15 @@ struct WeightAdapter { float scale = 1.f; } conv2d; }; - virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; + virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) = 0; virtual ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, 
ggml_tensor* w, ggml_tensor* b, const std::string& prefix, - ForwardParams forward_params) = 0; - virtual size_t get_extra_graph_size() = 0; + ForwardParams forward_params) = 0; + virtual size_t get_extra_graph_size() = 0; }; struct GGMLRunnerContext { @@ -2192,6 +2210,14 @@ struct GGMLRunner { void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } + + ggml_backend_t get_runtime_backend() { + return runtime_backend; + } + + ggml_backend_t get_params_backend() { + return params_backend; + } }; class GGMLBlock { @@ -2336,6 +2362,14 @@ class Linear : public UnaryBlock { force_prec_f32(force_prec_f32), scale(scale) {} + void set_scale(float scale_) { + scale = scale_; + } + + void set_force_prec_f32(bool force_prec_f32_) { + force_prec_f32 = force_prec_f32_; + } + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; ggml_tensor* b = nullptr; @@ -2347,7 +2381,7 @@ class Linear : public UnaryBlock { forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; forward_params.linear.force_prec_f32 = force_prec_f32; forward_params.linear.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); } @@ -2463,7 +2497,7 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.circular_x = ctx->circular_x_enabled; forward_params.conv2d.circular_y = ctx->circular_y_enabled; forward_params.conv2d.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_conv_2d(ctx->ggml_ctx, x, @@ -2527,7 +2561,7 @@ class Conv3d : public UnaryBlock { ggml_tensor* w = params["weight"]; 
ggml_tensor* b = nullptr; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); if (w->type != GGML_TYPE_F16) { w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16); } @@ -2535,7 +2569,7 @@ class Conv3d : public UnaryBlock { if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, @@ -2582,12 +2616,12 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } } @@ -2630,8 +2664,8 @@ class GroupNorm : public GGMLBlock { w = params["weight"]; b = params["bias"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups); @@ -2665,7 +2699,7 @@ class RMSNorm : public UnaryBlock { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = 
ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } x = ggml_rms_norm(ctx->ggml_ctx, x, eps); x = ggml_mul_inplace(ctx->ggml_ctx, x, w); @@ -2748,6 +2782,7 @@ class MultiheadAttention : public GGMLBlock { __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] ggml_tensor* w1, // Outer C (Full rank) ggml_tensor* w1a, // Outer A (Low rank part 1) @@ -2778,29 +2813,29 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( int merge_batch_uq = batch; int merge_batch_vp = batch; -#if SD_USE_VULKAN - if (batch > 1) { - // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend - int max_batch = 65535; - int max_batch_uq = max_batch / uq; - merge_batch_uq = 1; - for (int i = max_batch_uq; i > 0; i--) { - if (batch % i == 0) { - merge_batch_uq = i; - break; + if (sd_backend_is(backend, "Vulkan")) { + if (batch > 1) { + // Vulkan only (checked at runtime): merge batches using the largest divisor of batch that keeps the merged dimension within max_batch + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } } - } - int max_batch_vp = max_batch / vp; - merge_batch_vp = 1; - for (int i = max_batch_vp; i > 0; i--) { - if (batch % i == 0) { - merge_batch_vp = i; - break; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } } } } -#endif ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp new file mode 100644 index 000000000..19cf56f2e --- /dev/null +++ b/src/ggml_extend_backend.hpp @@ -0,0 +1,293 @@ +#ifndef 
__GGML_EXTEND_BACKEND_HPP__ +#define __GGML_EXTEND_BACKEND_HPP__ + +#include +#include + +#include "ggml-backend.h" +#include "ggml.h" + +#ifndef __STATIC_INLINE__ +#define __STATIC_INLINE__ static inline +#endif + +inline void ggml_backend_load_all_once() { +#if defined(GGML_BACKEND_DL) + // If the host process already preloaded backends explicitly + // (for example via ggml_backend_load / ggml_backend_load_all_from_path), + // do not rescan the default paths again. + if (ggml_backend_dev_count() > 0) { + return; + } + // In dynamic-backend mode the backend modules are discovered at runtime, + // so we must load them before asking for the CPU backend or its proc table. + static std::once_flag once; + std::call_once(once, []() { + if (ggml_backend_dev_count() > 0) { + return; + } + ggml_backend_load_all(); + }); +#endif +} + +#if defined(GGML_BACKEND_DL) + +// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: +// those are CMake options used to configure ggml itself, but they are not +// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. +// In practice, this target can reliably see GGML_BACKEND_DL, but not whether +// the CPU backend was compiled as a loadable module. We therefore use runtime +// backend discovery instead of compile-time assumptions. 
+ +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() { + ggml_backend_load_all_once(); + return ggml_backend_reg_by_name("CPU"); +} + +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) { + if (backend != nullptr) { + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_backend_reg(device); + } + } + + return ggml_backend_cpu_reg(); +} + +__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() { + ggml_backend_load_all_once(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); +} + +__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) { + if (backend == nullptr) { + return false; + } + + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; + } + + const char* backend_name = ggml_backend_name(backend); + return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0; +} + +__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")); + if (fn != nullptr) { + fn(backend_cpu, n_threads); + } +} + +using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + +__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool")); + if (fn != nullptr) { + fn(backend_cpu, threadpool); + } +} + +__STATIC_INLINE__ void 
ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback")); + if (fn != nullptr) { + fn(backend_cpu, abort_callback, abort_callback_data); + } +} + +__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + + return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; +} + +__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); + return buffer == nullptr || ggml_backend_buffer_is_host(buffer); +} + +__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); +} + +template +__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { + const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); + + if (ggml_backend_tensor_is_host_accessible(tensor)) { + auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); + *dst = value; + return; + } + + ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); +} + +__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I16: + 
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = {0, 0, 0, 0}; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) { + (void)ctx; + + // The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but + // the backend proc table does not expose it in GGML_BACKEND_DL mode. 
+ // Recreate the old behavior by initializing the CPU backend explicitly and + // executing the graph through the generic backend API. + ggml_backend_t backend = ggml_backend_cpu_init(); + if (backend == nullptr) { + return GGML_STATUS_ALLOC_FAILED; + } + + ggml_backend_cpu_set_n_threads(backend, n_threads); + + const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph); + ggml_backend_free(backend); + + return status; +} + +__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) { + GGML_ASSERT(tensor != nullptr); + + if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) { + const int64_t nelements = ggml_nelements(tensor); + + switch (tensor->type) { + case GGML_TYPE_I8: { + auto* data = reinterpret_cast(tensor->data); + const int8_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I16: { + auto* data = reinterpret_cast(tensor->data); + const int16_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I32: { + auto* data = reinterpret_cast(tensor->data); + const int32_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_fp16_t v = ggml_fp32_to_fp16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_BF16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_bf16_t v = ggml_fp32_to_bf16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F32: { + auto* data = reinterpret_cast(tensor->data); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = value; + } + } break; + default: + GGML_ABORT("fatal error"); + } + + return tensor; + } + + const int64_t nelements = ggml_nelements(tensor); + for (int64_t i = 0; i < nelements; ++i) { + 
ggml_set_f32_1d(tensor, static_cast(i), value); + } + + return tensor; +} + +#else +#include "ggml-cpu.h" +#endif +#endif diff --git a/src/lora.hpp b/src/lora.hpp index d4a749ef9..b57bc4226 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -129,7 +129,7 @@ struct LoraModel : public GGMLRunner { } } - ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -152,17 +152,17 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { - lora_up = ggml_ext_cast_f32(ctx, iter->second); + lora_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_mid_name); if (iter != lora_tensors.end()) { - lora_mid = ggml_ext_cast_f32(ctx, iter->second); + lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_down_name); if (iter != lora_tensors.end()) { - lora_down = ggml_ext_cast_f32(ctx, iter->second); + lora_down = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lora_up == nullptr || lora_down == nullptr) { @@ -208,7 +208,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -225,7 +225,7 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(diff_name); if (iter != lora_tensors.end()) { - curr_updown = ggml_ext_cast_f32(ctx, iter->second); + curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second); } else { break; } @@ -248,7 +248,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_loha_weight_diff(const 
std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -276,33 +276,33 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(hada_1_down_name); if (iter != lora_tensors.end()) { - hada_1_down = ggml_ext_cast_f32(ctx, iter->second); + hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_up_name); if (iter != lora_tensors.end()) { - hada_1_up = ggml_ext_cast_f32(ctx, iter->second); + hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_mid_name); if (iter != lora_tensors.end()) { - hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); } iter = lora_tensors.find(hada_2_down_name); if (iter != lora_tensors.end()) { - hada_2_down = ggml_ext_cast_f32(ctx, iter->second); + hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_up_name); if (iter != lora_tensors.end()) { - hada_2_up = ggml_ext_cast_f32(ctx, iter->second); + hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_mid_name); if (iter != lora_tensors.end()) { - hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); } @@ -351,7 +351,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -378,24 +378,24 @@ struct LoraModel : public GGMLRunner { auto 
iter = lora_tensors.find(lokr_w1_name); if (iter != lora_tensors.end()) { - lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_name); if (iter != lora_tensors.end()) { - lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second); } int64_t rank = 1; if (lokr_w1 == nullptr) { iter = lora_tensors.find(lokr_w1_a_name); if (iter != lora_tensors.end()) { - lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { - lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { @@ -410,12 +410,12 @@ struct LoraModel : public GGMLRunner { if (lokr_w2 == nullptr) { iter = lora_tensors.find(lokr_w2_a_name); if (iter != lora_tensors.end()) { - lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_b_name); if (iter != lora_tensors.end()) { - lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { @@ -468,23 +468,23 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { + ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { // lora ggml_tensor* diff = nullptr; if (with_lora_and_lokr) { - diff = get_lora_weight_diff(model_tensor_name, ctx); + diff = get_lora_weight_diff(model_tensor_name, ctx, backend); } // diff if (diff == nullptr) { - diff = 
get_raw_weight_diff(model_tensor_name, ctx); + diff = get_raw_weight_diff(model_tensor_name, ctx, backend); } // loha if (diff == nullptr) { - diff = get_loha_weight_diff(model_tensor_name, ctx); + diff = get_loha_weight_diff(model_tensor_name, ctx, backend); } // lokr if (diff == nullptr && with_lora_and_lokr) { - diff = get_lokr_weight_diff(model_tensor_name, ctx); + diff = get_lokr_weight_diff(model_tensor_name, ctx, backend); } if (diff != nullptr) { if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { @@ -502,6 +502,7 @@ struct LoraModel : public GGMLRunner { } ggml_tensor* get_out_diff(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, WeightAdapter::ForwardParams forward_params, const std::string& model_tensor_name) { @@ -590,7 +591,7 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); if (out_diff == nullptr) { out_diff = curr_out_diff; } else { @@ -761,7 +762,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* model_tensor = it.second; // lora - ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); + ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor); if (diff == nullptr) { continue; } @@ -774,7 +775,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* final_tensor; if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { - final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); + final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor); final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_cpy(compute_ctx, final_tensor, 
model_tensor); } else { @@ -841,34 +842,35 @@ struct MultiLoraAdapter : public WeightAdapter { : lora_models(lora_models) { } - ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { for (auto& lora_model : lora_models) { - ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr); + ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr); if (diff == nullptr) { continue; } if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { - weight = ggml_ext_cast_f32(ctx, weight); + weight = ggml_ext_cast_f32(ctx, backend, weight); } weight = ggml_add(ctx, weight, diff); } return weight; } - ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { - return patch_weight(ctx, weight, weight_name, true); + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override { + return patch_weight(ctx, backend, weight, weight_name, true); } ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, const std::string& prefix, WeightAdapter::ForwardParams forward_params) override { - w = patch_weight(ctx, w, prefix + "weight", false); + w = patch_weight(ctx, backend, w, prefix + "weight", false); if (b) { - b = patch_weight(ctx, b, prefix + "bias", false); + b = patch_weight(ctx, backend, b, prefix + "bias", false); } ggml_tensor* out; if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) { @@ -890,7 +892,7 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { - ggml_tensor* out_diff = 
lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); + ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight"); if (out_diff == nullptr) { continue; } diff --git a/src/model.cpp b/src/model.cpp index 3479a0bea..8fdde3b76 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -23,24 +23,11 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "zip.h" #include "name_conversion.h" -#include "stable-diffusion.h" - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif /*================================================= Preprocess ==================================================*/ diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 83c8cec66..1cbeb71d5 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -95,9 +95,7 @@ namespace Qwen { float scale = 1.f / 32.f; bool force_prec_f32 = false; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example when using CUDA but the weights are k-quants (not all prompts). 
blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale)); @@ -124,6 +122,10 @@ namespace Qwen { auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + to_out_0->set_force_prec_f32(true); + } + auto norm_added_q = std::dynamic_pointer_cast(blocks["norm_added_q"]); auto norm_added_k = std::dynamic_pointer_cast(blocks["norm_added_k"]); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8ae6bb504..0fb6f497d 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -172,60 +172,7 @@ class StableDiffusionGGML { } void init_backend() { -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - size_t device = 0; - const int device_count = ggml_backend_vk_get_device_count(); - if (device_count) { - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - device = std::stoull(sd_vk_device_str); - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } - if (device >= device_count) { - LOG_WARN("Cannot find targeted vulkan device (%llu). 
Falling back to device 0.", device); - device = 0; - } - } - LOG_INFO("Vulkan: Using device %llu", device); - backend = ggml_backend_vk_init(device); - } - if (!backend) { - LOG_WARN("Failed to initialize Vulkan backend"); - } -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs - backend = ggml_backend_opencl_init(); - if (!backend) { - LOG_WARN("Failed to initialize OpenCL backend"); - } -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif - - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } + backend = sd_get_default_backend(); } std::shared_ptr get_rng(rng_type_t rng_type) { diff --git a/src/upscaler.cpp b/src/upscaler.cpp index ed7bb89a0..80e68c947 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -16,26 +16,9 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - backend = ggml_backend_vk_init(0); -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - backend = ggml_backend_opencl_init(); -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif + + backend = sd_get_default_backend(); + ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); diff --git a/src/util.cpp b/src/util.cpp index e01876268..c1b98a09c 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -23,8 +23,9 @@ #include #endif -#include "ggml-cpu.h" +#include 
"ggml-backend.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -718,3 +719,75 @@ std::vector> parse_prompt_attention(const std::str return res; } + +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name) { + if (!backend) { + return false; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) + return false; + std::string dev_name = ggml_backend_dev_name(dev); + return dev_name.find(name) != std::string::npos; +} + +ggml_backend_t sd_get_default_backend() { + ggml_backend_load_all_once(); + static std::once_flag once; + std::call_once(once, []() { + int dev_count = ggml_backend_dev_count(); + if (dev_count == 0) { + LOG_ERROR("No devices found!"); + } else { + LOG_DEBUG("Found %d backend devices:", dev_count); + for (int i = 0; i < dev_count; i++) { + auto dev = ggml_backend_dev_get(i); + LOG_DEBUG("#%d: %s", i, ggml_backend_dev_name(dev)); + } + } + }); + std::string dev_name = get_default_backend_name(); + ggml_backend_t backend = nullptr; + // apply SD_VK_DEVICE only if the main device is Vulkan + if (dev_name.rfind("Vulkan", 0) == 0) { + const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); + if (SD_VK_DEVICE != nullptr) { + std::string sd_vk_device_str = SD_VK_DEVICE; + try { + unsigned long long device = std::stoull(sd_vk_device_str); + std::string vk_device_name = "Vulkan" + std::to_string(device); + if (vk_device_name != dev_name) { + LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE)", vk_device_name.c_str()); + backend = init_named_backend(vk_device_name); + if (!backend) { + LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str()); + } + } + } catch (const std::invalid_argument&) { + LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). 
Falling back to the default device.", SD_VK_DEVICE); + } catch (const std::out_of_range&) { + LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); + } + } + } + + if (!backend) { + backend = init_named_backend(dev_name); + if (!backend) { + LOG_WARN("device %s failed to init", dev_name.c_str()); + } + } + + if (!backend) { + LOG_WARN("loading CPU backend"); + backend = ggml_backend_cpu_init(); + } + + if (ggml_backend_is_cpu(backend)) { + LOG_DEBUG("Using CPU backend"); + } + + return backend; +} + diff --git a/src/util.h b/src/util.h index 2468cb93d..72c8a815d 100644 --- a/src/util.h +++ b/src/util.h @@ -6,6 +6,7 @@ #include #include +#include "ggml-backend.h" #include "stable-diffusion.h" #include "tensor.hpp" @@ -82,6 +83,10 @@ int sd_get_preview_interval(); bool sd_should_preview_denoised(); bool sd_should_preview_noisy(); +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name); +ggml_backend_t sd_get_default_backend(); + #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) 
log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) diff --git a/src/z_image.hpp b/src/z_image.hpp index 363ce5f4f..6bb44b791 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -31,10 +31,6 @@ namespace ZImage { : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) { blocks["qkv"] = std::make_shared(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false); float scale = 1.f; -#if GGML_USE_HIP - // Prevent NaN issues with certain ROCm setups - scale = 1.f / 16.f; -#endif blocks["out"] = std::make_shared(num_heads * head_dim, hidden_size, false, false, false, scale); if (qk_norm) { blocks["q_norm"] = std::make_shared(head_dim); @@ -52,6 +48,10 @@ namespace ZImage { auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto out_proj = std::dynamic_pointer_cast(blocks["out"]); + if (sd_backend_is(ctx->backend, "ROCm")) { + out_proj->set_scale(1.f / 16.f); + } + auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim] qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim] @@ -115,9 +115,7 @@ namespace ZImage { bool force_prec_f32 = false; float scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using CUDA but the weights are k-quants. 
blocks["w2"] = std::make_shared(hidden_dim, dim, false, false, force_prec_f32, scale); @@ -129,6 +127,10 @@ namespace ZImage { auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + w2->set_force_prec_f32(true); + } + auto x1 = w1->forward(ctx, x); auto x3 = w3->forward(ctx, x); x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3); From 576ede6ea2efd585d45715510d58045fbf6d19dd Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 20 Apr 2026 22:55:28 -0300 Subject: [PATCH 2/3] remove ifdef guard to prevent ODR violation If one translation unit includes the header with GGML_BACKEND_DL defined, and another includes it without, we'll hit an ODR violation, which is undefined behavior. For the current static-backend build, `ggml_backend_dev_count() > 0` will skip the `ggml_backend_load_all` call anyway. --- src/ggml_extend_backend.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index 19cf56f2e..b1966c6a1 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -12,10 +12,11 @@ #endif inline void ggml_backend_load_all_once() { -#if defined(GGML_BACKEND_DL) // If the host process already preloaded backends explicitly // (for example via ggml_backend_load / ggml_backend_load_all_from_path), // do not rescan the default paths again. 
+ // For static-backend mode, the registry is initialized by a singleton + // pattern, so any enabled backend will also cause the scan to be skipped if (ggml_backend_dev_count() > 0) { return; } @@ -28,7 +29,6 @@ inline void ggml_backend_load_all_once() { } ggml_backend_load_all(); }); -#endif } #if defined(GGML_BACKEND_DL) From 0fd3c74d6bdf58db942b34081aa7d006298b5640 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Tue, 21 Apr 2026 11:01:03 -0300 Subject: [PATCH 3/3] resolve build issues caused by GGML_BACKEND_DL The `#include "ggml-cpu.h"` removal triggered a build error in util.cpp. To avoid this kind of compatibility issue, it is preferable to build the same source code regardless of the GGML_BACKEND_DL configuration. The problem is that util.cpp requires ggml-cpu.h, which cannot be included alongside ggml_extend_backend.hpp. But since this is only required for a single function, I am using an include-inside-namespace approach to move the definitions out of the way, and keep the diff minimal. Other ways to avoid this issue could be: * removing sd_get_system_info entirely (which may not be a bad idea, as dynamic backends could have been built with completely different flags); * moving the sd_get_system_info definition to a separate source file; * putting ggml_extend_backend.hpp functions into their own namespace; * renaming all ggml_extend_backend.hpp symbols to ggml_ext_* .
--- src/ggml_extend_backend.hpp | 5 ---- src/util.cpp | 47 +++++++++++++++++++++---------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp index b1966c6a1..8787a8b61 100644 --- a/src/ggml_extend_backend.hpp +++ b/src/ggml_extend_backend.hpp @@ -31,8 +31,6 @@ inline void ggml_backend_load_all_once() { }); } -#if defined(GGML_BACKEND_DL) - // Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: // those are CMake options used to configure ggml itself, but they are not // exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. @@ -287,7 +285,4 @@ __STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float va return tensor; } -#else -#include "ggml-cpu.h" -#endif #endif diff --git a/src/util.cpp b/src/util.cpp index c1b98a09c..f29be38ae 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -496,26 +496,6 @@ sd_progress_cb_t sd_get_progress_callback() { void* sd_get_progress_callback_data() { return sd_progress_cb_data; } -const char* sd_get_system_info() { - static char buffer[1024]; - std::stringstream ss; - ss << "System Info: \n"; - ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; - ss << " AVX = " << ggml_cpu_has_avx() << " | "; - ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; - ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; - ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; - ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; - ss << " FMA = " << ggml_cpu_has_fma() << " | "; - ss << " NEON = " << ggml_cpu_has_neon() << " | "; - ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; - ss << " F16C = " << ggml_cpu_has_f16c() << " | "; - ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; - ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; - ss << " VSX = " << ggml_cpu_has_vsx() << " | "; - snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); - return buffer; -} sd_image_t 
tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) { const auto& shape = tensor.shape(); @@ -791,3 +771,30 @@ ggml_backend_t sd_get_default_backend() { return backend; } +// namespace is needed to avoid conflicts with ggml_extend_backend.hpp +namespace ggml_cpu { +#include "ggml-cpu.h" +} + +const char* sd_get_system_info() { + using namespace ggml_cpu; + static char buffer[1024]; + std::stringstream ss; + ss << "System Info: \n"; + ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; + ss << " AVX = " << ggml_cpu_has_avx() << " | "; + ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; + ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; + ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; + ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; + ss << " FMA = " << ggml_cpu_has_fma() << " | "; + ss << " NEON = " << ggml_cpu_has_neon() << " | "; + ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; + ss << " F16C = " << ggml_cpu_has_f16c() << " | "; + ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; + ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; + ss << " VSX = " << ggml_cpu_has_vsx() << " | "; + snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); + return buffer; +} +