diff --git a/examples/cli/README.md b/examples/cli/README.md index 78f8821a9..c79d2ab63 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -44,7 +44,6 @@ Context Options: CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --vae-tiling process vae in tiles to reduce memory usage --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed @@ -109,6 +108,7 @@ Generation Options: --skip-layer-start SLG enabling point (default: 0.01) --skip-layer-end SLG disabling point (default: 0.2) --eta eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0) + --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 50f35aed8..369c1f07f 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -581,10 +581,6 @@ struct SDContextParams { "--vae-tile-overlap", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", &vae_tiling_params.target_overlap}, - {"", - "--flow-shift", - "shift value for Flow models like SD3.x or WAN (default: auto)", - &flow_shift}, }; options.bool_options = { @@ -903,7 +899,6 @@ struct SDContextParams { << " photo_maker_path: \"" << photo_maker_path << "\",\n" << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" - << " flow_shift: " << 
(std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" @@ -986,7 +981,6 @@ struct SDContextParams { chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, - flow_shift, }; return sd_ctx_params; } @@ -1206,6 +1200,10 @@ struct SDGenerationParams { "--eta", "eta in DDIM, only for DDIM and TCD (default: 0)", &sample_params.eta}, + {"", + "--flow-shift", + "shift value for Flow models like SD3.x or WAN (default: auto)", + &sample_params.flow_shift}, {"", "--high-noise-cfg-scale", "(high noise) unconditional guidance scale: (default: 7.0)", @@ -1606,6 +1604,7 @@ struct SDGenerationParams { load_if_exists("cfg_scale", sample_params.guidance.txt_cfg); load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg); load_if_exists("guidance", sample_params.guidance.distilled_guidance); + load_if_exists("flow_shift", sample_params.flow_shift); auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) { if (j.contains(key) && j[key].is_string()) { diff --git a/examples/server/README.md b/examples/server/README.md index 8a2f2e915..386729009 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -36,7 +36,6 @@ Context Options: CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --vae-tiling process vae in tiles to reduce memory usage --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed @@ -101,6 +100,7 @@ Default Generation Options: --skip-layer-start SLG enabling point (default: 0.01) --skip-layer-end 
SLG disabling point (default: 0.2) --eta eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0) + --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index cb966d7e8..51b2b3291 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -201,7 +201,6 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; - float flow_shift; } sd_ctx_params_t; typedef struct { @@ -235,6 +234,7 @@ typedef struct { int shifted_timestep; float* custom_sigmas; int custom_sigmas_count; + float flow_shift; } sd_sample_params_t; typedef struct { diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 7e99b84a8..40bd7cb7f 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -657,9 +657,8 @@ struct DiscreteFlowDenoiser : public Denoiser { float sigma_data = 1.0f; - DiscreteFlowDenoiser(float shift = 3.0f) - : shift(shift) { - set_parameters(); + DiscreteFlowDenoiser(float shift = 3.0f) { + set_shift(shift); } void set_parameters() { @@ -668,6 +667,11 @@ struct DiscreteFlowDenoiser : public Denoiser { } } + void set_shift(float shift) { + this->shift = shift; + set_parameters(); + } + float sigma_min() override { return sigmas[0]; } @@ -710,34 +714,8 @@ float flux_time_shift(float mu, float sigma, float t) { return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma)); } -struct FluxFlowDenoiser : public Denoiser { - float sigmas[TIMESTEPS]; - float shift = 1.15f; - - float sigma_data = 1.0f; - - FluxFlowDenoiser(float shift = 1.15f) { - set_parameters(shift); - } - - void set_shift(float shift) { - this->shift = 
shift; - } - - void set_parameters(float shift) { - set_shift(shift); - for (int i = 0; i < TIMESTEPS; i++) { - sigmas[i] = t_to_sigma(static_cast<float>(i)); - } - } - - float sigma_min() override { - return sigmas[0]; - } - - float sigma_max() override { - return sigmas[TIMESTEPS - 1]; - } +struct FluxFlowDenoiser : public DiscreteFlowDenoiser { + FluxFlowDenoiser() = default; float sigma_to_t(float sigma) override { return sigma; @@ -747,26 +725,6 @@ struct FluxFlowDenoiser : public Denoiser { t = t + 1; return flux_time_shift(shift, 1.0f, t / TIMESTEPS); } - - std::vector<float> get_scalings(float sigma) override { - float c_skip = 1.0f; - float c_out = -sigma; - float c_in = 1.0f; - return {c_skip, c_out, c_in}; - } - - // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(noise, sigma); - ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma); - ggml_ext_tensor_add_inplace(latent, noise); - return latent; - } - - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma)); - return latent; - } }; struct Flux2FlowDenoiser : public FluxFlowDenoiser { diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 94221d8e6..e704922a1 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -115,6 +115,7 @@ class StableDiffusionGGML { int n_threads = -1; float scale_factor = 0.18215f; float shift_factor = 0.f; + float default_flow_shift = INFINITY; std::shared_ptr cond_stage_model; std::shared_ptr clip_vision; // for svd or wan2.1 i2v @@ -881,7 +882,6 @@ class StableDiffusionGGML { // init denoiser { prediction_t pred_type = sd_ctx_params->prediction; - float flow_shift = sd_ctx_params->flow_shift; if (pred_type == PREDICTION_COUNT) { if (sd_version_is_sd2(version)) { @@ -906,22 +906,19 @@ class StableDiffusionGGML { sd_version_is_qwen_image(version) || 
sd_version_is_z_image(version)) { pred_type = FLOW_PRED; - if (flow_shift == INFINITY) { - if (sd_version_is_wan(version)) { - flow_shift = 5.f; - } else { - flow_shift = 3.f; - } + if (sd_version_is_wan(version)) { + default_flow_shift = 5.f; + } else { + default_flow_shift = 3.f; } } else if (sd_version_is_flux(version)) { pred_type = FLUX_FLOW_PRED; - if (flow_shift == INFINITY) { - flow_shift = 1.0f; // TODO: validate - for (const auto& [name, tensor_storage] : tensor_storage_map) { - if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) { - flow_shift = 1.15f; - } + default_flow_shift = 1.0f; // TODO: validate + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) { + default_flow_shift = 1.15f; + break; } } } else if (sd_version_is_flux2(version)) { @@ -945,12 +942,12 @@ class StableDiffusionGGML { break; case FLOW_PRED: { LOG_INFO("running in FLOW mode"); - denoiser = std::make_shared<DiscreteFlowDenoiser>(flow_shift); + denoiser = std::make_shared<DiscreteFlowDenoiser>(); break; } case FLUX_FLOW_PRED: { LOG_INFO("running in Flux FLOW mode"); - denoiser = std::make_shared<FluxFlowDenoiser>(flow_shift); + denoiser = std::make_shared<FluxFlowDenoiser>(); break; } case FLUX2_FLOW_PRED: { @@ -2711,6 +2708,16 @@ class StableDiffusionGGML { ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); return result; } + + void set_flow_shift(float flow_shift = INFINITY) { + auto flow_denoiser = std::dynamic_pointer_cast<DiscreteFlowDenoiser>(denoiser); + if (flow_denoiser) { + if (flow_shift == INFINITY) { + flow_shift = default_flow_shift; + } + flow_denoiser->set_shift(flow_shift); + } + } }; /*================================================= SD API ==================================================*/ @@ -2931,7 +2938,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->chroma_use_dit_mask = true; sd_ctx_params->chroma_use_t5_mask = false; sd_ctx_params->chroma_t5_mask_pad = 1; - sd_ctx_params->flow_shift = INFINITY; } char* 
sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { @@ -3023,6 +3029,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) { sample_params->sample_steps = 20; sample_params->custom_sigmas = nullptr; sample_params->custom_sigmas_count = 0; + sample_params->flow_shift = INFINITY; } char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { @@ -3043,7 +3050,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { "sample_method: %s, " "sample_steps: %d, " "eta: %.2f, " - "shifted_timestep: %d)", + "shifted_timestep: %d, " + "flow_shift: %.2f)", sample_params->guidance.txt_cfg, std::isfinite(sample_params->guidance.img_cfg) ? sample_params->guidance.img_cfg @@ -3057,7 +3065,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { sd_sample_method_name(sample_params->sample_method), sample_params->sample_steps, sample_params->eta, - sample_params->shifted_timestep); + sample_params->shifted_timestep, + sample_params->flow_shift); return buf; } @@ -3528,6 +3537,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g size_t t0 = ggml_time_ms(); + sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift); + // Apply lora sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); @@ -3803,6 +3814,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } LOG_INFO("generate_video %dx%dx%d", width, height, frames); + sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); + enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method; if (sample_method == SAMPLE_METHOD_COUNT) { sample_method = sd_get_default_sample_method(sd_ctx);