2 changes: 1 addition & 1 deletion src/convert.cpp
@@ -103,7 +103,7 @@ bool convert(const char* input_path,
bool output_is_safetensors = ends_with(output_path, ".safetensors");
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);

auto backend = ggml_backend_cpu_init();
auto backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
mem_size += model_loader.get_params_mem_size(backend, type);
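Note: this hunk (and the matching ones in the other files below) swaps the old direct `ggml_backend_cpu_init()` call for the registry-based `ggml_backend_init_by_type()`. A minimal sketch of the new initialization path, assuming a ggml version that ships the backend registry; the null check and the fallback call are illustrative assumptions, not part of this hunk:

```cpp
// Minimal sketch (assumes the ggml backend registry API):
// ask the registry for the first CPU device and initialize it.
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend == nullptr) {
    // With dynamically loaded backends, ggml_backend_load_all() may need to
    // run first (see the stable-diffusion.cpp hunk further down).
    ggml_backend_load_all();
    backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
```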
2 changes: 1 addition & 1 deletion src/flux.hpp
@@ -1539,7 +1539,7 @@ namespace Flux {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_COUNT;

ModelLoader model_loader;
30 changes: 21 additions & 9 deletions src/ggml_extend.hpp
@@ -64,6 +64,10 @@
#define SD_UNUSED(x) (void)(x)
#endif

inline bool sd_ggml_backend_is_cpu(ggml_backend_t backend) noexcept {
return std::string_view{"CPU"} == ggml_backend_name(backend);
}

__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
return (multiple - n % multiple) % multiple;
}
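Note: the new helper compares the backend's registered name instead of calling `ggml_backend_is_cpu()`, which is tied to the statically linked CPU backend and is presumably not usable for backends obtained through the registry. A usage sketch, assuming the registry names the CPU backend "CPU" (which is the assumption the helper itself encodes):

```cpp
// Usage sketch, not part of this hunk: detect whether a registry-created
// backend is the CPU backend purely via its public name.
ggml_backend_t be = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (be != nullptr && sd_ggml_backend_is_cpu(be)) {
    // parameters and compute buffers for this backend live in RAM, not VRAM
}
```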
@@ -1497,7 +1501,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx,

__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) {
#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
if (!ggml_backend_is_cpu(backend)) {
if (!sd_ggml_backend_is_cpu(backend)) {
ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
ggml_backend_synchronize(backend);
} else {
@@ -1859,7 +1863,7 @@ struct GGMLRunner {
LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
get_desc().c_str(),
compute_buffer_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
sd_ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
return true;
}

@@ -1895,7 +1899,7 @@ struct GGMLRunner {
LOG_DEBUG("%s cache backend buffer size = % 6.2f MB(%s) (%i tensors)",
get_desc().c_str(),
cache_buffer_size / (1024.f * 1024.f),
ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
num_tensors);
}

@@ -1998,8 +2002,8 @@ struct GGMLRunner {
GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
: runtime_backend(backend) {
alloc_params_ctx();
if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
params_backend = ggml_backend_cpu_init();
if (!sd_ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
params_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
} else {
params_backend = runtime_backend;
}
@@ -2046,7 +2050,7 @@ struct GGMLRunner {
LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
get_desc().c_str(),
params_buffer_size / (1024.f * 1024.f),
ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
num_tensors);
return true;
}
@@ -2112,7 +2116,7 @@ struct GGMLRunner {
return nullptr;
}
// it's performing a compute, check if backend isn't cpu
if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) {
if (!sd_ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

@@ -2154,8 +2158,16 @@ struct GGMLRunner {
return std::nullopt;
}
copy_data_to_backend_tensor();
if (ggml_backend_is_cpu(runtime_backend)) {
ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
if (sd_ggml_backend_is_cpu(runtime_backend)) {
if (auto reg = ggml_backend_reg_by_name("CPU")) {
if (auto fn = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")) {
fn(runtime_backend, n_threads);
} else {
LOG_ERROR("ggml_backend_reg_get_proc_address(\"ggml_backend_set_n_threads\") == nullptr");
}
} else {
LOG_ERROR("ggml_backend_reg_by_name(\"CPU\") == nullptr");
}
}

ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
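Note: the thread count is now set through the registry's proc-address mechanism instead of `ggml_backend_cpu_set_n_threads()`. The same lookup, factored into a standalone helper as a sketch (the helper name is hypothetical and not part of this PR):

```cpp
// Hypothetical helper, not part of this PR: resolve and call
// "ggml_backend_set_n_threads" through the CPU backend registration.
static void sd_set_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
    if (reg == nullptr) {
        return;  // CPU backend not registered (e.g. backends not loaded yet)
    }
    auto set_n_threads_fn = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads_fn != nullptr) {
        set_n_threads_fn(backend, n_threads);
    }
}
```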
2 changes: 1 addition & 1 deletion src/llm.hpp
@@ -1214,7 +1214,7 @@ namespace LLM {
static void load_from_file_and_test(const std::string& file_path) {
// cpu f16: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_COUNT;

ModelLoader model_loader;
4 changes: 2 additions & 2 deletions src/lora.hpp
@@ -767,7 +767,7 @@ struct LoraModel : public GGMLRunner {
}

ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
if (!sd_ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data);
}
@@ -781,7 +781,7 @@ struct LoraModel : public GGMLRunner {
final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff);
}
ggml_build_forward_expand(gf, final_tensor);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
if (!sd_ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
original_tensor_to_final_tensor[original_tensor] = final_tensor;
}
}
2 changes: 1 addition & 1 deletion src/mmdit.hpp
@@ -925,7 +925,7 @@ struct MMDiTRunner : public GGMLRunner {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
{
2 changes: 1 addition & 1 deletion src/qwen_image.hpp
@@ -662,7 +662,7 @@ namespace Qwen {
// cuda q8: pass
// cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_Q8_0;

ModelLoader model_loader;
41 changes: 25 additions & 16 deletions src/stable-diffusion.cpp
@@ -222,8 +222,17 @@ class StableDiffusionGGML {
#endif

if (!backend) {
static bool need_load = true;
if (need_load) {
ggml_backend_load_all();
need_load = false;
}
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend) {
LOG_ERROR("CPU backend is nullptr!");
std::terminate();
}
}
}
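Note: `ggml_backend_load_all()` is what populates the registry when ggml backends are built as dynamically loaded modules; without it, `ggml_backend_init_by_type()` has no CPU device to find and returns nullptr, which is why this hunk adds the one-shot `need_load` guard and the hard failure path. A small sketch of how the registry can be inspected after loading; the enumeration is illustrative only and not part of this change:

```cpp
// Illustrative only, not part of this PR: after ggml_backend_load_all()
// the registry can be enumerated to see which devices are available.
ggml_backend_load_all();
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
    LOG_DEBUG("device %zu: %s (%s)", i,
              ggml_backend_dev_name(dev),
              ggml_backend_dev_description(dev));
}
```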

@@ -429,9 +438,9 @@ class StableDiffusionGGML {

{
clip_backend = backend;
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
if (clip_on_cpu && !sd_ggml_backend_is_cpu(backend)) {
LOG_INFO("CLIP: Using CPU backend");
clip_backend = ggml_backend_cpu_init();
clip_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
if (sd_version_is_sd3(version)) {
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
@@ -607,9 +616,9 @@ class StableDiffusionGGML {
high_noise_diffusion_model->get_param_tensors(tensors);
}

if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
if (sd_ctx_params->keep_vae_on_cpu && !sd_ggml_backend_is_cpu(backend)) {
LOG_INFO("VAE Autoencoder: Using CPU backend");
vae_backend = ggml_backend_cpu_init();
vae_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
} else {
vae_backend = backend;
}
@@ -700,9 +709,9 @@ class StableDiffusionGGML {

if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
ggml_backend_t controlnet_backend = nullptr;
if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
if (sd_ctx_params->keep_control_net_on_cpu && !sd_ggml_backend_is_cpu(backend)) {
LOG_DEBUG("ControlNet: Using CPU backend");
controlnet_backend = ggml_backend_cpu_init();
controlnet_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
} else {
controlnet_backend = backend;
}
@@ -869,25 +878,25 @@ class StableDiffusionGGML {

size_t total_params_ram_size = 0;
size_t total_params_vram_size = 0;
if (ggml_backend_is_cpu(clip_backend)) {
if (sd_ggml_backend_is_cpu(clip_backend)) {
total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
} else {
total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
}

if (ggml_backend_is_cpu(backend)) {
if (sd_ggml_backend_is_cpu(backend)) {
total_params_ram_size += unet_params_mem_size;
} else {
total_params_vram_size += unet_params_mem_size;
}

if (ggml_backend_is_cpu(vae_backend)) {
if (sd_ggml_backend_is_cpu(vae_backend)) {
total_params_ram_size += vae_params_mem_size;
} else {
total_params_vram_size += vae_params_mem_size;
}

if (ggml_backend_is_cpu(control_net_backend)) {
if (sd_ggml_backend_is_cpu(control_net_backend)) {
total_params_ram_size += control_net_params_mem_size;
} else {
total_params_vram_size += control_net_params_mem_size;
@@ -901,15 +910,15 @@ class StableDiffusionGGML {
total_params_vram_size / 1024.0 / 1024.0,
total_params_ram_size / 1024.0 / 1024.0,
clip_params_mem_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
unet_params_mem_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
vae_params_mem_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
control_net_params_mem_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
sd_ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
pmid_params_mem_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
sd_ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
}

// init denoiser
2 changes: 1 addition & 1 deletion src/t5.hpp
@@ -555,7 +555,7 @@ struct T5Embedder {
// cuda f32: pass
// cuda q8_0: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_F16;

ModelLoader model_loader;
2 changes: 1 addition & 1 deletion src/upscaler.cpp
@@ -52,7 +52,7 @@ struct UpscalerGGML {
model_loader.set_wtype_override(model_data_type);
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
30 changes: 17 additions & 13 deletions src/util.cpp
@@ -499,19 +499,23 @@ const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
ss << " AVX = " << ggml_cpu_has_avx() << " | ";
ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
ss << " FMA = " << ggml_cpu_has_fma() << " | ";
ss << " NEON = " << ggml_cpu_has_neon() << " | ";
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
if (auto reg = ggml_backend_reg_by_name("CPU")) {
ggml_backend_get_features_t fn = (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
if (fn) {
auto ptr = fn(reg);
if (!ptr || !ptr->name) {
ss << " [None]";
} else {
for (; ptr->name; ++ptr) {
ss << ptr->name << " = " << ptr->value << " | ";
}
}
} else {
LOG_ERROR("ggml_backend_reg_get_proc_address() failed on \"ggml_backend_get_features\"");
}
} else {
LOG_ERROR("ggml_backend_reg_by_name(\"CPU\") == nullptr");
}
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer;
}
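Note: the hard-coded `ggml_cpu_has_*()` calls are replaced by querying the CPU registration's `ggml_backend_get_features` entry point, so the reported feature set follows whatever the loaded CPU backend was actually built with. A sketch of the same pattern generalized to any registered backend; the helper name and its use are illustrative, not part of this PR:

```cpp
// Illustrative helper, not part of this PR: dump the feature list of any
// registered backend ("CPU", "CUDA", ...) via ggml_backend_get_features.
static void sd_log_backend_features(const char* reg_name) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name);
    if (reg == nullptr) {
        return;  // backend not registered
    }
    auto get_features = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (get_features == nullptr) {
        return;  // backend does not expose a feature list
    }
    for (auto f = get_features(reg); f != nullptr && f->name != nullptr; ++f) {
        LOG_DEBUG("%s feature: %s = %s", reg_name, f->name, f->value);
    }
}
```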
4 changes: 2 additions & 2 deletions src/wan.hpp
@@ -1315,7 +1315,7 @@ namespace WAN {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, false, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V);
{
@@ -2305,7 +2305,7 @@ namespace WAN {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_F16;
LOG_INFO("loading from '%s'", file_path.c_str());

2 changes: 1 addition & 1 deletion src/z_image.hpp
@@ -592,7 +592,7 @@ namespace ZImage {
// cuda q8: pass
// cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
ggml_type model_data_type = GGML_TYPE_Q8_0;

ModelLoader model_loader;