Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,37 +72,31 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif()

if(SD_METAL)
message("-- Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif()

if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif ()

if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()

if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
endif ()

if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
endif()

if(SD_WEBP)
Expand Down Expand Up @@ -222,7 +216,6 @@ if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part (which this PR does not touch) is odd: setting the flag for ggml itself could make sense, but why for sd.cpp? And if it is in fact needed, what happens once we add support for GGML_BACKEND_DL?

# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32)
Expand Down
8 changes: 5 additions & 3 deletions src/common_block.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#ifndef __COMMON_BLOCK_HPP__
#define __COMMON_BLOCK_HPP__

#include "ggml-backend.h"
#include "ggml_extend.hpp"
#include "util.h"

class DownSampleBlock : public GGMLBlock {
protected:
Expand Down Expand Up @@ -248,9 +250,6 @@ class FeedForward : public GGMLBlock {
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
Expand All @@ -264,6 +263,9 @@ class FeedForward : public GGMLBlock {

auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
if (sd_backend_is(ctx->backend, "Vulkan")) {
net_2->set_force_prec_f32(true);
}

x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
Expand Down
183 changes: 109 additions & 74 deletions src/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,12 @@

#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "ggml_extend_backend.hpp"

#include "model.h"
#include "tensor.hpp"

#ifdef SD_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif

#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif

#ifdef SD_USE_OPENCL
#include "ggml-opencl.h"
#endif

#ifdef SD_USE_SYCL
#include "ggml-sycl.h"
#endif

#include "rng.hpp"
#include "tensor_ggml.hpp"
#include "util.h"
Expand Down Expand Up @@ -91,6 +71,45 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha
}
}

// Returns true if a ggml backend device with exactly this name is registered.
// Triggers (one-time) loading of all available backends before enumerating.
__STATIC_INLINE__ bool backend_name_exists(const std::string& name) {
    ggml_backend_load_all_once();
    const size_t device_count = ggml_backend_dev_count();
    for (size_t i = 0; i < device_count; i++) {
        if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) {
            return true;
        }
    }
    return false;
}

// Validates a user-supplied backend name.
// An empty name means "use the default backend" and is passed through.
// A name matching a registered device is returned unchanged; anything else
// logs a warning and falls back to the default by returning "".
__STATIC_INLINE__ std::string sanitize_backend_name(const std::string& name) {
    if (name.empty() || backend_name_exists(name)) {
        return name;
    }
    LOG_WARN("Backend %s not found, using default backend", name.c_str());
    return "";
}

// Returns the name of the device the default backend would use.
// Mirrors the selection order of ggml_backend_init_best: discrete GPU,
// then integrated GPU, then CPU.
__STATIC_INLINE__ std::string get_default_backend_name() {
    ggml_backend_load_all_once();
    // should pick the same backend as ggml_backend_init_best
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
    dev                    = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
    dev                    = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (dev == nullptr) {
        // No device registered at all; avoid handing a null device to
        // ggml_backend_dev_name.
        return "";
    }
    return ggml_backend_dev_name(dev);
}

// Initializes and returns a backend by name.
// An empty name selects the best available backend (same policy as
// get_default_backend_name); otherwise the named backend is initialized
// with default parameters.
__STATIC_INLINE__ ggml_backend_t init_named_backend(const std::string& name = "") {
    ggml_backend_load_all_once();
    LOG_DEBUG("Initializing backend: %s", name.c_str());
    if (name.empty()) {
        return ggml_backend_init_best();
    }
    return ggml_backend_init_by_name(name.c_str(), nullptr);
}

static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");

// n-mode tensor-matrix product
Expand Down Expand Up @@ -1286,25 +1305,25 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx,
return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
}

__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
#ifdef SD_USE_VULKAN
auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
out = ggml_get_rows(ctx, out, zero_index);
out = ggml_reshape(ctx, out, a);
// auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
return out;
#else
auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,]
if (ggml_is_transposed(out)) {
out = ggml_mul_mat(ctx, one, out);
// Casts tensor `a` to F32.
// On Vulkan, a direct ggml_cast is avoided (see the commented-out call):
// the tensor is instead routed through ggml_get_rows with a prebuilt
// zero-index tensor, whose output is F32. On all other backends the
// conversion is done via a mat-mul against a [1]-element ones tensor,
// which likewise produces an F32 result with the original shape restored.
// NOTE(review): this span contained stray residue lines from the pre-change
// version of the function (assignment to `out` before its declaration and a
// dangling #endif); this is the coherent post-change implementation.
__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* a) {
    if (sd_backend_is(backend, "Vulkan")) {
        auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
        auto out        = ggml_reshape_1d(ctx, a, ggml_nelements(a));
        out             = ggml_get_rows(ctx, out, zero_index);
        out             = ggml_reshape(ctx, out, a);
        // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
        return out;
    } else {
        auto out         = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
        ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1);  // [1,]
        if (ggml_is_transposed(out)) {
            out = ggml_mul_mat(ctx, one, out);
        } else {
            out = ggml_mul_mat(ctx, out, one);
        }
        out = ggml_reshape(ctx, out, a);
        return out;
    }
}

// q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head]
Expand Down Expand Up @@ -1496,16 +1515,14 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx,
}

// Copies tensor data from backend memory into `data`, guaranteeing the copy
// has completed before returning.
// On ROCm/CUDA/SYCL device backends the get is asynchronous, so the async
// variant is issued and the backend is synchronized; in every other case
// (including the CPU backend of those stacks) the synchronous get suffices.
// NOTE(review): this span contained stranded #if/#else/#endif fragments from
// the removed preprocessor-based version; this is the coherent runtime check.
__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) {
    if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) &&
        !ggml_backend_is_cpu(backend)) {
        ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
        ggml_backend_synchronize(backend);
        return;
    }

    ggml_backend_tensor_get(tensor, data, offset, size);
}

__STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
Expand Down Expand Up @@ -1664,14 +1681,15 @@ struct WeightAdapter {
float scale = 1.f;
} conv2d;
};
virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0;
virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) = 0;
virtual ggml_tensor* forward_with_lora(ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* x,
ggml_tensor* w,
ggml_tensor* b,
const std::string& prefix,
ForwardParams forward_params) = 0;
virtual size_t get_extra_graph_size() = 0;
ForwardParams forward_params) = 0;
virtual size_t get_extra_graph_size() = 0;
};

struct GGMLRunnerContext {
Expand Down Expand Up @@ -2192,6 +2210,14 @@ struct GGMLRunner {
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
weight_adapter = adapter;
}

// Read-only accessor for the backend used to execute compute graphs.
ggml_backend_t get_runtime_backend() const {
    return runtime_backend;
}

// Read-only accessor for the backend that holds the model parameters.
ggml_backend_t get_params_backend() const {
    return params_backend;
}
};

class GGMLBlock {
Expand Down Expand Up @@ -2336,6 +2362,14 @@ class Linear : public UnaryBlock {
force_prec_f32(force_prec_f32),
scale(scale) {}

// Overrides the output scale applied by forward() (see `scale` member).
void set_scale(float scale_) {
scale = scale_;
}

// Forces the linear op in forward() to run at F32 precision — used e.g. as
// a workaround for precision issues on some backends such as Vulkan.
void set_force_prec_f32(bool force_prec_f32_) {
force_prec_f32 = force_prec_f32_;
}

ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
ggml_tensor* w = params["weight"];
ggml_tensor* b = nullptr;
Expand All @@ -2347,7 +2381,7 @@ class Linear : public UnaryBlock {
forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
forward_params.linear.force_prec_f32 = force_prec_f32;
forward_params.linear.scale = scale;
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params);
}
return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
}
Expand Down Expand Up @@ -2463,7 +2497,7 @@ class Conv2d : public UnaryBlock {
forward_params.conv2d.circular_x = ctx->circular_x_enabled;
forward_params.conv2d.circular_y = ctx->circular_y_enabled;
forward_params.conv2d.scale = scale;
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params);
}
return ggml_ext_conv_2d(ctx->ggml_ctx,
x,
Expand Down Expand Up @@ -2527,15 +2561,15 @@ class Conv3d : public UnaryBlock {
ggml_tensor* w = params["weight"];
ggml_tensor* b = nullptr;
if (ctx->weight_adapter) {
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight");
if (w->type != GGML_TYPE_F16) {
w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16);
}
}
if (bias) {
b = params["bias"];
if (ctx->weight_adapter) {
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias");
}
}
return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
Expand Down Expand Up @@ -2582,12 +2616,12 @@ class LayerNorm : public UnaryBlock {
if (elementwise_affine) {
w = params["weight"];
if (ctx->weight_adapter) {
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight");
}
if (bias) {
b = params["bias"];
if (ctx->weight_adapter) {
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias");
}
}
}
Expand Down Expand Up @@ -2630,8 +2664,8 @@ class GroupNorm : public GGMLBlock {
w = params["weight"];
b = params["bias"];
if (ctx->weight_adapter) {
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight");
b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias");
}
}
return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups);
Expand Down Expand Up @@ -2665,7 +2699,7 @@ class RMSNorm : public UnaryBlock {
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
ggml_tensor* w = params["weight"];
if (ctx->weight_adapter) {
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight");
}
x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
x = ggml_mul_inplace(ctx->ggml_ctx, x, w);
Expand Down Expand Up @@ -2748,6 +2782,7 @@ class MultiheadAttention : public GGMLBlock {

__STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch]
ggml_tensor* w1, // Outer C (Full rank)
ggml_tensor* w1a, // Outer A (Low rank part 1)
Expand Down Expand Up @@ -2778,29 +2813,29 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
int merge_batch_uq = batch;
int merge_batch_vp = batch;

#if SD_USE_VULKAN
if (batch > 1) {
// no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend
int max_batch = 65535;
int max_batch_uq = max_batch / uq;
merge_batch_uq = 1;
for (int i = max_batch_uq; i > 0; i--) {
if (batch % i == 0) {
merge_batch_uq = i;
break;
if (sd_backend_is(backend, "Vulkan")) {
if (batch > 1) {
// no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend
int max_batch = 65535;
int max_batch_uq = max_batch / uq;
merge_batch_uq = 1;
for (int i = max_batch_uq; i > 0; i--) {
if (batch % i == 0) {
merge_batch_uq = i;
break;
}
}
}

int max_batch_vp = max_batch / vp;
merge_batch_vp = 1;
for (int i = max_batch_vp; i > 0; i--) {
if (batch % i == 0) {
merge_batch_vp = i;
break;
int max_batch_vp = max_batch / vp;
merge_batch_vp = 1;
for (int i = max_batch_vp; i > 0; i--) {
if (batch % i == 0) {
merge_batch_vp = i;
break;
}
}
}
}
#endif

ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq);
if (w2 != NULL) {
Expand Down
Loading
Loading