diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index a5b0037b6..35dd0413a 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -278,7 +278,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP bool valid = cli_params.resolve_and_validate(); if (valid && cli_params.mode != METADATA) { valid = ctx_params.resolve_and_validate(cli_params.mode) && - gen_params.resolve_and_validate(cli_params.mode, ctx_params.lora_model_dir); + gen_params.resolve_and_validate(cli_params.mode, + ctx_params.lora_model_dir, + ctx_params.hires_upscalers_dir); } if (!valid) { @@ -688,6 +690,10 @@ int main(int argc, const char* argv[]) { vae_decode_only = false; } + if (gen_params.hires_enabled && !gen_params.hires_upscaler_model_path.empty()) { + vae_decode_only = false; + } + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview); SDImageVec results; diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 0235c53de..ab770a0b5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -351,7 +351,10 @@ ArgOptions SDContextParams::get_options() { "--lora-model-dir", "lora model directory", &lora_model_dir}, - + {"", + "--hires-upscalers-dir", + "highres fix upscaler model directory", + &hires_upscalers_dir}, {"", "--tensor-type-rules", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", @@ -649,6 +652,7 @@ std::string SDContextParams::to_string() const { << " wtype: " << sd_type_name(wtype) << ",\n" << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" << " lora_model_dir: \"" << lora_model_dir << "\",\n" + << " hires_upscalers_dir: \"" << hires_upscalers_dir << "\",\n" << " photo_maker_path: \"" << photo_maker_path << "\",\n" << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" @@ -777,6 +781,10 @@ ArgOptions SDGenerationParams::get_options() { 
"--pm-id-embed-path", "path to PHOTOMAKER v2 id embed", &pm_id_embed_path}, + {"", + "--hires-upscaler", + "highres fix upscaler, Latent (nearest) or a model name/path under --hires-upscalers-dir (default: Latent (nearest))", + &hires_upscaler}, }; options.int_options = { @@ -826,6 +834,22 @@ ArgOptions SDGenerationParams::get_options() { "--upscale-tile-size", "tile size for ESRGAN upscaling (default: 128)", &upscale_tile_size}, + {"", + "--hires-width", + "highres fix target width, 0 to use --hires-scale (default: 0)", + &hires_width}, + {"", + "--hires-height", + "highres fix target height, 0 to use --hires-scale (default: 0)", + &hires_height}, + {"", + "--hires-steps", + "highres fix second pass sample steps, 0 to reuse --steps (default: 0)", + &hires_steps}, + {"", + "--hires-upscale-tile-size", + "highres fix upscaler tile size, reserved for model-backed upscalers (default: 128)", + &hires_upscale_tile_size}, }; options.float_options = { @@ -913,6 +937,14 @@ ArgOptions SDGenerationParams::get_options() { "--vae-tile-overlap", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", &vae_tiling_params.target_overlap}, + {"", + "--hires-scale", + "highres fix scale when target size is not set (default: 2.0)", + &hires_scale}, + {"", + "--hires-denoising-strength", + "highres fix second pass denoising strength (default: 0.7)", + &hires_denoising_strength}, }; options.bool_options = { @@ -936,6 +968,11 @@ ArgOptions SDGenerationParams::get_options() { "process vae in tiles to reduce memory usage", true, &vae_tiling_params.enabled}, + {"", + "--hires", + "enable highres fix", + true, + &hires_enabled}, }; auto on_seed_arg = [&](int argc, const char** argv, int index) { @@ -1424,6 +1461,37 @@ static bool parse_lora_json_field(const json& parent, return true; } +static bool resolve_model_file_from_dir(const std::string& model_name, + const std::string& model_dir, + const std::vector& valid_ext, + const char* label, + std::string& resolved_path) { + 
if (model_dir.empty()) { + LOG_ERROR("%s directory is empty", label); + return false; + } + if (model_name.empty() || + model_name.find('/') != std::string::npos || + model_name.find('\\') != std::string::npos || + fs::path(model_name).has_root_path() || + fs::path(model_name).has_extension()) { + LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str()); + return false; + } + + fs::path model_dir_path = model_dir; + for (const auto& ext : valid_ext) { + fs::path try_path = model_dir_path / (model_name + ext); + if (fs::exists(try_path) && fs::is_regular_file(try_path)) { + resolved_path = try_path.lexically_normal().string(); + return true; + } + } + + LOG_ERROR("can not find %s %s in %s", label, model_name.c_str(), model_dir_path.lexically_normal().string().c_str()); + return false; +} + bool SDGenerationParams::from_json_str( const std::string& json_str, const std::function& lora_path_resolver) { @@ -1487,6 +1555,34 @@ bool SDGenerationParams::from_json_str( load_if_exists("increase_ref_index", increase_ref_index); load_if_exists("embed_image_metadata", embed_image_metadata); + if (j.contains("hires") && j["hires"].is_object()) { + const json& hires_json = j["hires"]; + if (hires_json.contains("enabled") && hires_json["enabled"].is_boolean()) { + hires_enabled = hires_json["enabled"]; + } + if (hires_json.contains("upscaler") && hires_json["upscaler"].is_string()) { + hires_upscaler = hires_json["upscaler"]; + } + if (hires_json.contains("scale") && hires_json["scale"].is_number()) { + hires_scale = hires_json["scale"]; + } + if (hires_json.contains("target_width") && hires_json["target_width"].is_number_integer()) { + hires_width = hires_json["target_width"]; + } + if (hires_json.contains("target_height") && hires_json["target_height"].is_number_integer()) { + hires_height = hires_json["target_height"]; + } + if (hires_json.contains("steps") && hires_json["steps"].is_number_integer()) { + hires_steps = hires_json["steps"]; + 
} + if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) { + hires_denoising_strength = hires_json["denoising_strength"]; + } + if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) { + hires_upscale_tile_size = hires_json["upscale_tile_size"]; + } + } + auto parse_sample_params_json = [&](const json& sample_json, sd_sample_params_t& target_params, std::vector& target_skip_layers, @@ -1800,7 +1896,7 @@ bool SDGenerationParams::initialize_cache_params() { return true; } -bool SDGenerationParams::resolve(const std::string& lora_model_dir, bool strict) { +bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict) { if (high_noise_sample_params.sample_steps <= 0) { high_noise_sample_params.sample_steps = -1; } @@ -1819,6 +1915,27 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, bool strict) sample_params.sample_steps = std::clamp(sample_params.sample_steps, 1, 100); } + hires_upscaler_model_path.clear(); + if (hires_enabled) { + if (hires_upscaler.empty()) { + hires_upscaler = "Latent (nearest)"; + } + resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str()); + if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) { + hires_enabled = false; + } else if (resolved_hires_upscaler == SD_HIRES_UPSCALER_COUNT || resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL) { /* the literal name "Model" parses as SD_HIRES_UPSCALER_MODEL; resolve it as a file too so MODEL is never emitted without a model path */ + static const std::vector valid_ext = {".gguf", ".safetensors", ".pt", ".pth"}; + if (!resolve_model_file_from_dir(hires_upscaler, + hires_upscalers_dir, + valid_ext, + "hires upscaler", + hires_upscaler_model_path)) { + return false; + } + resolved_hires_upscaler = SD_HIRES_UPSCALER_MODEL; + } + } + prompt_with_lora = prompt; if (!lora_model_dir.empty()) { extract_and_remove_lora(lora_model_dir); @@ -1883,6 +2000,29 @@ bool SDGenerationParams::validate(SDMode mode) { return false; } + if (hires_enabled) { + if (hires_width < 0 || hires_height < 0) { + LOG_ERROR("error: hires 
target width and height must be >= 0"); + return false; + } + if (hires_scale <= 0.f && hires_width <= 0 && hires_height <= 0) { + LOG_ERROR("error: hires scale must be positive when target size is not set"); + return false; + } + if (hires_steps < 0) { + LOG_ERROR("error: hires steps must be >= 0"); + return false; + } + if (hires_denoising_strength <= 0.f || hires_denoising_strength > 1.f) { + LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]"); + return false; + } + if (hires_upscale_tile_size < 1) { + LOG_ERROR("error: hires upscale tile size must be positive"); + return false; + } + } + if (mode == UPSCALE) { if (init_image_path.length() == 0) { LOG_ERROR("error: upscale mode needs an init image (--init-img)\n"); @@ -1893,8 +2033,11 @@ bool SDGenerationParams::validate(SDMode mode) { return true; } -bool SDGenerationParams::resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict) { - if (!resolve(lora_model_dir, strict)) { +bool SDGenerationParams::resolve_and_validate(SDMode mode, + const std::string& lora_model_dir, + const std::string& hires_upscalers_dir, + bool strict) { + if (!resolve(lora_model_dir, hires_upscalers_dir, strict)) { return false; } if (!validate(mode)) { @@ -1965,6 +2108,16 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() { params.pm_params = pm_params; params.vae_tiling_params = vae_tiling_params; params.cache = cache_params; + + params.hires.enabled = hires_enabled; + params.hires.upscaler = resolved_hires_upscaler; + params.hires.model_path = hires_upscaler_model_path.empty() ? 
nullptr : hires_upscaler_model_path.c_str(); + params.hires.scale = hires_scale; + params.hires.target_width = hires_width; + params.hires.target_height = hires_height; + params.hires.steps = hires_steps; + params.hires.denoising_strength = hires_denoising_strength; + params.hires.upscale_tile_size = hires_upscale_tile_size; return params; } @@ -2089,6 +2242,15 @@ std::string SDGenerationParams::to_string() const { << " seed: " << seed << ",\n" << " upscale_repeats: " << upscale_repeats << ",\n" << " upscale_tile_size: " << upscale_tile_size << ",\n" + << " hires: { enabled: " << (hires_enabled ? "true" : "false") + << ", upscaler: \"" << hires_upscaler << "\"" + << ", model_path: \"" << hires_upscaler_model_path << "\"" + << ", scale: " << hires_scale + << ", target_width: " << hires_width + << ", target_height: " << hires_height + << ", steps: " << hires_steps + << ", denoising_strength: " << hires_denoising_strength + << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n" << " vae_tiling_params: { " << vae_tiling_params.enabled << ", " << vae_tiling_params.tile_size_x << ", " @@ -2162,6 +2324,13 @@ std::string get_image_params(const SDContextParams& ctx_params, const SDGenerati if (gen_params.clip_skip != -1) { parameter_string += "Clip skip: " + std::to_string(gen_params.clip_skip) + ", "; } + if (gen_params.hires_enabled) { + parameter_string += "Hires upscale: " + gen_params.hires_upscaler + ", "; + parameter_string += "Hires scale: " + std::to_string(gen_params.hires_scale) + ", "; + parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", "; + parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", "; + parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", "; + } parameter_string += "Version: stable-diffusion.cpp"; return parameter_string; } diff --git a/examples/common/common.h 
b/examples/common/common.h index 5afe89b34..a754f42be 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -101,6 +101,7 @@ struct SDContextParams { sd_type_t wtype = SD_TYPE_COUNT; std::string tensor_type_rules; std::string lora_model_dir = "."; + std::string hires_upscalers_dir; std::map embedding_map; std::vector embedding_vec; @@ -190,12 +191,23 @@ struct SDGenerationParams { int upscale_repeats = 1; int upscale_tile_size = 128; + bool hires_enabled = false; + std::string hires_upscaler = "Latent (nearest)"; + std::string hires_upscaler_model_path; + float hires_scale = 2.f; + int hires_width = 0; + int hires_height = 0; + int hires_steps = 0; + float hires_denoising_strength = 0.7f; + int hires_upscale_tile_size = 128; + std::map lora_map; std::map high_noise_lora_map; // Derived and normalized fields. std::string prompt_with_lora; // for metadata record only std::vector lora_vec; + sd_hires_upscaler_t resolved_hires_upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST; /* assigned by resolve() only while hires is enabled; defaulted so to_sd_img_gen_params_t() never copies an indeterminate value */ // Owned execution payload. SDImageOwner init_image; @@ -225,9 +237,12 @@ struct SDGenerationParams { void set_width_and_height_if_unset(int w, int h); int get_resolved_width() const; int get_resolved_height() const; - bool resolve(const std::string& lora_model_dir, bool strict = false); + bool resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict = false); bool validate(SDMode mode); - bool resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict = false); + bool resolve_and_validate(SDMode mode, + const std::string& lora_model_dir, + const std::string& hires_upscalers_dir, + bool strict = false); sd_img_gen_params_t to_sd_img_gen_params_t(); sd_vid_gen_params_t to_sd_vid_gen_params_t(); std::string to_string() const; diff --git a/examples/server/api.md b/examples/server/api.md index 39744dbed..03df0908b 100644 --- a/examples/server/api.md +++ b/examples/server/api.md @@ -38,6 +38,8 @@ Current generation-related endpoints include: - `POST 
/sdapi/v1/txt2img` - `POST /sdapi/v1/img2img` - `GET /sdapi/v1/loras` +- `GET /sdapi/v1/upscalers` +- `GET /sdapi/v1/latent-upscale-modes` - `GET /sdapi/v1/samplers` - `GET /sdapi/v1/schedulers` - `GET /sdapi/v1/sd-models` @@ -216,6 +218,13 @@ Currently supported request fields: | `scheduler` | `string` | Scheduler name | | `lora` | `array` | Structured LoRA list | | `extra_images` | `array` | Base64 or data URL images | +| `enable_hr` | `boolean` | Enable highres fix for `txt2img` | +| `hr_upscaler` | `string` | `Latent (nearest)` or an upscaler model name from `/sdapi/v1/upscalers` | +| `hr_scale` | `number` | Highres scale when resize target is not set | +| `hr_resize_x` | `integer` | Highres target width, `0` to use scale | +| `hr_resize_y` | `integer` | Highres target height, `0` to use scale | +| `hr_steps` | `integer` | Highres second-pass sample steps, `0` to reuse `steps` | +| `denoising_strength` | `number` | Highres denoising strength for `txt2img` | Native extension fields: @@ -241,6 +250,8 @@ Currently supported request fields: | `inpainting_mask_invert` | `integer` or `boolean` | Treated as invert flag | | `denoising_strength` | `number` | Clamped to `0.0..1.0` | +Highres fix fields are currently handled for `txt2img`; `img2img` uses `denoising_strength` as image-to-image strength. 
+ Native extension fields: - any `sdcpp API` fields embedded through `sd_cpp_extra_args` inside `prompt` @@ -258,6 +269,8 @@ Response fields: Currently exposed: - `GET /sdapi/v1/loras` +- `GET /sdapi/v1/upscalers` +- `GET /sdapi/v1/latent-upscale-modes` - `GET /sdapi/v1/samplers` - `GET /sdapi/v1/schedulers` - `GET /sdapi/v1/sd-models` @@ -272,6 +285,24 @@ Response fields: | `[].name` | `string` | Display name derived from file stem | | `[].path` | `string` | Relative path under the configured LoRA directory | +`GET /sdapi/v1/upscalers` + +| Field | Type | Notes | +| --- | --- | --- | +| `[].name` | `string` | Built-in name or model stem | +| `[].model_name` | `string \| null` | Model family label for model-backed upscalers | +| `[].model_path` | `string \| null` | Absolute model path for model-backed upscalers | +| `[].model_url` | `string \| null` | Currently always null | +| `[].scale` | `integer` | Currently `4` | + +The only built-in entry is `None`; latent modes such as `Latent (nearest)` are listed by `GET /sdapi/v1/latent-upscale-modes`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. + +`GET /sdapi/v1/latent-upscale-modes` + +| Field | Type | Notes | +| --- | --- | --- | +| `[].name` | `string` | WebUI-compatible latent upscale mode name | + `GET /sdapi/v1/samplers` | Field | Type | Notes | @@ -388,6 +419,7 @@ Top-level fields: | `samplers` | `array` | Available sampling methods | | `schedulers` | `array` | Available schedulers | | `loras` | `array` | Available LoRA entries | +| `upscalers` | `array` | Available highres upscalers (built-in and model-backed) | | `limits` | `object` | Shared queue and size limits | `model` @@ -424,6 +456,14 @@ Shared nested fields: | `loras[].name` | `string` | | `loras[].path` | `string` | +`upscalers` + +| Field | Type | Notes | +| --- | --- | --- | +| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` | + +Built-in entries include `None` and `Latent (nearest)`. 
Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. + `limits` | Field | Type | @@ -482,6 +522,15 @@ Shared default fields used by both `img_gen` and `vid_gen`: | `auto_resize_ref_image` | `boolean` | | `increase_ref_index` | `boolean` | | `control_strength` | `number` | +| `hires` | `object` | +| `hires.enabled` | `boolean` | +| `hires.upscaler` | `string` | +| `hires.scale` | `number` | +| `hires.target_width` | `integer` | +| `hires.target_height` | `integer` | +| `hires.steps` | `integer` | +| `hires.denoising_strength` | `number` | +| `hires.upscale_tile_size` | `integer` | `vid_gen`-specific default fields: @@ -514,6 +563,7 @@ Fields returned in `features_by_mode.img_gen`: - `ref_images` - `lora` - `vae_tiling` +- `hires` - `cache` - `cancel_queued` - `cancel_generating` @@ -625,6 +675,16 @@ Example: }, "lora": [], + "hires": { + "enabled": false, + "upscaler": "Latent (nearest)", + "scale": 2.0, + "target_width": 0, + "target_height": 0, + "steps": 0, + "denoising_strength": 0.7, + "upscale_tile_size": 128 + }, "vae_tiling_params": { "enabled": false, @@ -729,12 +789,23 @@ Other native fields: | Field | Type | | --- | --- | +| `hires` | `object` | +| `hires.enabled` | `boolean` | +| `hires.upscaler` | `string` | +| `hires.scale` | `number` | +| `hires.target_width` | `integer` | +| `hires.target_height` | `integer` | +| `hires.steps` | `integer` | +| `hires.denoising_strength` | `number` | +| `hires.upscale_tile_size` | `integer` | | `vae_tiling_params` | `object` | | `cache_mode` | `string` | | `cache_option` | `string` | | `scm_mask` | `string` | | `scm_policy_dynamic` | `boolean` | +For `hires.upscaler`, use `Latent (nearest)` for latent upscale or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. 
+ HTTP-only output fields: | Field | Type | diff --git a/examples/server/main.cpp b/examples/server/main.cpp index 11a334d5f..114d526a8 100644 --- a/examples/server/main.cpp +++ b/examples/server/main.cpp @@ -48,7 +48,9 @@ static void parse_args(int argc, if (!svr_params.resolve_and_validate() || !ctx_params.resolve_and_validate(IMG_GEN) || - !default_gen_params.resolve_and_validate(IMG_GEN, ctx_params.lora_model_dir)) { + !default_gen_params.resolve_and_validate(IMG_GEN, + ctx_params.lora_model_dir, + ctx_params.hires_upscalers_dir)) { print_usage(argv[0], options_vec); exit(1); } @@ -95,6 +97,8 @@ int main(int argc, const char** argv) { std::vector lora_cache; std::mutex lora_mutex; + std::vector upscaler_cache; + std::mutex upscaler_mutex; AsyncJobManager async_job_manager; ServerRuntime runtime = { sd_ctx.get(), @@ -104,6 +108,8 @@ int main(int argc, const char** argv) { &default_gen_params, &lora_cache, &lora_mutex, + &upscaler_cache, + &upscaler_mutex, &async_job_manager, }; diff --git a/examples/server/routes_openai.cpp b/examples/server/routes_openai.cpp index ce6215d1e..a24383d67 100644 --- a/examples/server/routes_openai.cpp +++ b/examples/server/routes_openai.cpp @@ -70,7 +70,7 @@ static bool build_openai_generation_request(const httplib::Request& req, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. - if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } @@ -212,7 +212,7 @@ static bool build_openai_edit_request(const httplib::Request& req, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. 
- if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } diff --git a/examples/server/routes_sdapi.cpp b/examples/server/routes_sdapi.cpp index 63c89ec8b..74a6b3219 100644 --- a/examples/server/routes_sdapi.cpp +++ b/examples/server/routes_sdapi.cpp @@ -1,6 +1,7 @@ #include "routes.h" #include +#include #include #include #include @@ -35,14 +36,20 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) { return {}; } +static std::string lower_ascii(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + static enum sample_method_t get_sdapi_sample_method(std::string name) { enum sample_method_t result = str_to_sample_method(name.c_str()); if (result != SAMPLE_METHOD_COUNT) { return result; } - std::transform(name.begin(), name.end(), name.begin(), - [](unsigned char c) { return static_cast(std::tolower(c)); }); + name = lower_ascii(name); static const std::unordered_map hardcoded{ {"euler a", EULER_A_SAMPLE_METHOD}, {"k_euler_a", EULER_A_SAMPLE_METHOD}, @@ -114,6 +121,18 @@ static bool build_sdapi_img_gen_request(const json& j, request.gen_params.width = j.value("width", -1); request.gen_params.height = j.value("height", -1); + if (!img2img && j.value("enable_hr", false)) { + request.gen_params.hires_enabled = true; + request.gen_params.hires_scale = j.value("hr_scale", request.gen_params.hires_scale); + request.gen_params.hires_width = j.value("hr_resize_x", request.gen_params.hires_width); + request.gen_params.hires_height = j.value("hr_resize_y", request.gen_params.hires_height); + request.gen_params.hires_steps = j.value("hr_steps", request.gen_params.hires_steps); + request.gen_params.hires_denoising_strength = + j.value("denoising_strength", 
request.gen_params.hires_denoising_strength); + + request.gen_params.hires_upscaler = j.value("hr_upscaler", request.gen_params.hires_upscaler); + } + std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt); if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) { error_message = "invalid sd_cpp_extra_args"; @@ -228,7 +247,7 @@ static bool build_sdapi_img_gen_request(const json& j, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. - if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } @@ -347,6 +366,45 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) { res.set_content(result.dump(), "application/json"); }); + svr.Get("/sdapi/v1/upscalers", [runtime](const httplib::Request&, httplib::Response& res) { + refresh_upscaler_cache(*runtime); + + auto make_builtin = [](const char* name) { + json item; + item["name"] = name; + item["model_name"] = nullptr; + item["model_path"] = nullptr; + item["model_url"] = nullptr; + item["scale"] = 4; + return item; + }; + + json result = json::array(); + result.push_back(make_builtin("None")); + + { + std::lock_guard lock(*runtime->upscaler_mutex); + for (const auto& e : *runtime->upscaler_cache) { + json item; + item["name"] = e.name; + item["model_name"] = e.model_name; + item["model_path"] = e.fullpath; + item["model_url"] = nullptr; + item["scale"] = e.scale; + result.push_back(item); + } + } + + res.set_content(result.dump(), "application/json"); + }); + + svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) { + json result = json::array({ + {{"name", "Latent (nearest)"}}, + }); + res.set_content(result.dump(), "application/json"); + }); + svr.Get("/sdapi/v1/samplers", [runtime](const 
httplib::Request&, httplib::Response& res) { std::vector sampler_names; sampler_names.push_back("default"); diff --git a/examples/server/routes_sdcpp.cpp b/examples/server/routes_sdcpp.cpp index 8119136a4..c314eb0fa 100644 --- a/examples/server/routes_sdcpp.cpp +++ b/examples/server/routes_sdcpp.cpp @@ -114,6 +114,17 @@ static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const {"increase_ref_index", defaults.increase_ref_index}, {"control_strength", defaults.control_strength}, {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)}, + {"hires", + { + {"enabled", defaults.hires_enabled}, + {"upscaler", defaults.hires_upscaler}, + {"scale", defaults.hires_scale}, + {"target_width", defaults.hires_width}, + {"target_height", defaults.hires_height}, + {"steps", defaults.hires_steps}, + {"denoising_strength", defaults.hires_denoising_strength}, + {"upscale_tile_size", defaults.hires_upscale_tile_size}, + }}, {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, {"cache_mode", defaults.cache_mode}, {"cache_option", defaults.cache_option}, @@ -157,6 +168,7 @@ static json make_img_gen_features_json() { {"ref_images", true}, {"lora", true}, {"vae_tiling", true}, + {"hires", true}, {"cache", true}, {"cancel_queued", true}, {"cancel_generating", false}, @@ -179,6 +191,7 @@ static json make_vid_gen_features_json() { static json make_capabilities_json(ServerRuntime& runtime) { refresh_lora_cache(runtime); + refresh_upscaler_cache(runtime); AsyncJobManager& manager = *runtime.async_job_manager; const auto& defaults = *runtime.default_gen_params; @@ -190,6 +203,7 @@ static json make_capabilities_json(ServerRuntime& runtime) { json image_output_formats = supported_img_output_formats(); json video_output_formats = supported_vid_output_formats(); json available_loras = json::array(); + json available_upscalers = json::array(); json supported_modes = json::array(); for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) 
{ @@ -210,6 +224,21 @@ static json make_capabilities_json(ServerRuntime& runtime) { } } + available_upscalers.push_back({ + {"name", "None"}, + }); + available_upscalers.push_back({ + {"name", "Latent (nearest)"}, + }); + { + std::lock_guard lock(*runtime.upscaler_mutex); + for (const auto& entry : *runtime.upscaler_cache) { + available_upscalers.push_back({ + {"name", entry.name}, + }); + } + } + if (supports_img) { supported_modes.push_back("img_gen"); } @@ -284,6 +313,7 @@ static json make_capabilities_json(ServerRuntime& runtime) { result["features"] = top_level_features; result["features_by_mode"] = features_by_mode; result["loras"] = available_loras; + result["upscalers"] = available_upscalers; return result; } @@ -307,7 +337,7 @@ static bool parse_img_gen_request(const json& body, return false; } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. - if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid generation parameters"; return false; } @@ -334,7 +364,7 @@ static bool parse_vid_gen_request(const json& body, return false; } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. 
- if (!request.gen_params.resolve_and_validate(VID_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(VID_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid generation parameters"; return false; } diff --git a/examples/server/runtime.cpp b/examples/server/runtime.cpp index 39880a182..afadb62ae 100644 --- a/examples/server/runtime.cpp +++ b/examples/server/runtime.cpp @@ -1,6 +1,7 @@ #include "runtime.h" #include +#include #include #include #include @@ -13,6 +14,18 @@ namespace fs = std::filesystem; +static std::string lower_ascii(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + +static bool is_supported_model_ext(const fs::path& p) { + auto ext = lower_ascii(p.extension().string()); + return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors"; +} + static const std::string k_base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -241,20 +254,12 @@ void refresh_lora_cache(ServerRuntime& rt) { fs::path lora_dir = rt.ctx_params->lora_model_dir; if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) { - auto is_lora_ext = [](const fs::path& p) { - auto ext = p.extension().string(); - std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { - return static_cast(std::tolower(c)); - }); - return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors"; - }; - for (auto& entry : fs::recursive_directory_iterator(lora_dir)) { if (!entry.is_regular_file()) { continue; } const fs::path& p = entry.path(); - if (!is_lora_ext(p)) { + if (!is_supported_model_ext(p)) { continue; } @@ -286,6 +291,40 @@ std::string get_lora_full_path(ServerRuntime& rt, const std::string& path) { return it != rt.lora_cache->end() ? 
it->fullpath : ""; } +void refresh_upscaler_cache(ServerRuntime& rt) { + std::vector new_cache; + + fs::path upscaler_dir = rt.ctx_params->hires_upscalers_dir; + if (fs::exists(upscaler_dir) && fs::is_directory(upscaler_dir)) { + for (auto& entry : fs::directory_iterator(upscaler_dir)) { + if (!entry.is_regular_file()) { + continue; + } + const fs::path& p = entry.path(); + if (!is_supported_model_ext(p)) { + continue; + } + + UpscalerEntry upscaler_entry; + upscaler_entry.name = p.stem().u8string(); + upscaler_entry.fullpath = fs::absolute(p).lexically_normal().u8string(); + upscaler_entry.model_name = "ESRGAN_4x"; + upscaler_entry.path = p.filename().u8string(); + + new_cache.push_back(std::move(upscaler_entry)); + } + } + + std::sort(new_cache.begin(), new_cache.end(), [](const UpscalerEntry& a, const UpscalerEntry& b) { + return a.name < b.name; + }); + + { + std::lock_guard lock(*rt.upscaler_mutex); + *rt.upscaler_cache = std::move(new_cache); + } +} + int64_t unix_timestamp_now() { return std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) diff --git a/examples/server/runtime.h b/examples/server/runtime.h index 1970e7dbc..5c5f2d480 100644 --- a/examples/server/runtime.h +++ b/examples/server/runtime.h @@ -37,6 +37,14 @@ struct LoraEntry { std::string fullpath; }; +struct UpscalerEntry { + std::string name; + std::string path; + std::string fullpath; + std::string model_name; + int scale = 4; +}; + struct ServerRuntime { sd_ctx_t* sd_ctx; std::mutex* sd_ctx_mutex; @@ -45,6 +53,8 @@ struct ServerRuntime { const SDGenerationParams* default_gen_params; std::vector* lora_cache; std::mutex* lora_mutex; + std::vector* upscaler_cache; + std::mutex* upscaler_mutex; AsyncJobManager* async_job_manager; }; @@ -86,4 +96,5 @@ bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) std::string unsupported_generation_mode_error(SDMode mode); void refresh_lora_cache(ServerRuntime& rt); std::string 
get_lora_full_path(ServerRuntime& rt, const std::string& path); +void refresh_upscaler_cache(ServerRuntime& rt); int64_t unix_timestamp_now(); diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index a99b10450..fba5c1b77 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -289,6 +289,25 @@ typedef struct { const char* path; } sd_lora_t; +enum sd_hires_upscaler_t { + SD_HIRES_UPSCALER_NONE, + SD_HIRES_UPSCALER_LATENT_NEAREST, + SD_HIRES_UPSCALER_MODEL, + SD_HIRES_UPSCALER_COUNT, +}; + +typedef struct { + bool enabled; + enum sd_hires_upscaler_t upscaler; + const char* model_path; + float scale; + int target_width; + int target_height; + int steps; + float denoising_strength; + int upscale_tile_size; +} sd_hires_params_t; + typedef struct { const sd_lora_t* loras; uint32_t lora_count; @@ -312,6 +331,7 @@ typedef struct { sd_pm_params_t pm_params; sd_tiling_params_t vae_tiling_params; sd_cache_params_t cache; + sd_hires_params_t hires; } sd_img_gen_params_t; typedef struct { @@ -365,8 +385,11 @@ SD_API const char* sd_preview_name(enum preview_t preview); SD_API enum preview_t str_to_preview(const char* str); SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); +SD_API const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler); +SD_API enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str); SD_API void sd_cache_params_init(sd_cache_params_t* cache_params); +SD_API void sd_hires_params_init(sd_hires_params_t* hires_params); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index b9d3e9af1..8ae6bb504 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -17,6 +17,7 @@ #include "pmid.hpp" #include "sample-cache.h" #include "tae.hpp" +#include 
"upscaler.h" #include "vae.hpp" #include "latent-preview.h" @@ -2113,6 +2114,28 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) { return LORA_APPLY_MODE_COUNT; } +const char* hires_upscaler_to_str[] = { + "None", + "Latent (nearest)", + "Model", +}; + +const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) { + if (upscaler < SD_HIRES_UPSCALER_COUNT) { + return hires_upscaler_to_str[upscaler]; + } + return NONE_STR; +} + +enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str) { + for (int i = 0; i < SD_HIRES_UPSCALER_COUNT; i++) { + if (!strcmp(str, hires_upscaler_to_str[i])) { + return (enum sd_hires_upscaler_t)i; + } + } + return SD_HIRES_UPSCALER_COUNT; +} + void sd_cache_params_init(sd_cache_params_t* cache_params) { *cache_params = {}; cache_params->mode = SD_CACHE_DISABLED; @@ -2141,6 +2164,19 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) { cache_params->spectrum_stop_percent = 0.9f; } +void sd_hires_params_init(sd_hires_params_t* hires_params) { + *hires_params = {}; + hires_params->enabled = false; + hires_params->upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST; + hires_params->model_path = nullptr; + hires_params->scale = 2.0f; + hires_params->target_width = 0; + hires_params->target_height = 0; + hires_params->steps = 0; + hires_params->denoising_strength = 0.7f; + hires_params->upscale_tile_size = 128; +} + void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { *sd_ctx_params = {}; sd_ctx_params->vae_decode_only = true; @@ -2310,6 +2346,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f}; sd_img_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; sd_cache_params_init(&sd_img_gen_params->cache); + sd_hires_params_init(&sd_img_gen_params->hires); } char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { @@ -2336,7 +2373,8 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* 
sd_img_gen_params) { "increase_ref_index: %s\n" "control_strength: %.2f\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" - "VAE tiling: %s\n", + "VAE tiling: %s\n" + "hires: {enabled=%s, upscaler=%s, model_path=%s, scale=%.2f, target=%dx%d, steps=%d, denoising_strength=%.2f}\n", SAFE_STR(sd_img_gen_params->prompt), SAFE_STR(sd_img_gen_params->negative_prompt), sd_img_gen_params->clip_skip, @@ -2353,7 +2391,15 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->pm_params.style_strength, sd_img_gen_params->pm_params.id_images_count, SAFE_STR(sd_img_gen_params->pm_params.id_embed_path), - BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled)); + BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled), + BOOL_STR(sd_img_gen_params->hires.enabled), + sd_hires_upscaler_name(sd_img_gen_params->hires.upscaler), + SAFE_STR(sd_img_gen_params->hires.model_path), + sd_img_gen_params->hires.scale, + sd_img_gen_params->hires.target_width, + sd_img_gen_params->hires.target_height, + sd_img_gen_params->hires.steps, + sd_img_gen_params->hires.denoising_strength); const char* cache_mode_str = "disabled"; if (sd_img_gen_params->cache.mode == SD_CACHE_EASYCACHE) { cache_mode_str = "easycache"; @@ -2534,6 +2580,7 @@ struct GenerationRequest { sd_guidance_params_t guidance = {}; sd_guidance_params_t high_noise_guidance = {}; sd_pm_params_t pm_params = {}; + sd_hires_params_t hires = {}; int frames = -1; float vace_strength = 1.f; @@ -2555,6 +2602,7 @@ struct GenerationRequest { auto_resize_ref_image = sd_img_gen_params->auto_resize_ref_image; guidance = sd_img_gen_params->sample_params.guidance; pm_params = sd_img_gen_params->pm_params; + hires = sd_img_gen_params->hires; cache_params = &sd_img_gen_params->cache; resolve(sd_ctx); } @@ -2577,26 +2625,76 @@ struct GenerationRequest { } void align_generation_request_size() { + align_image_size(&width, &height, "generation request"); + } + + void 
align_image_size(int* target_width, int* target_height, const char* label) { int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; - int width_offset = align_up_offset(width, spatial_multiple); - int height_offset = align_up_offset(height, spatial_multiple); + int width_offset = align_up_offset(*target_width, spatial_multiple); + int height_offset = align_up_offset(*target_height, spatial_multiple); if (width_offset <= 0 && height_offset <= 0) { return; } - int original_width = width; - int original_height = height; + int original_width = *target_width; + int original_height = *target_height; - width += width_offset; - height += height_offset; - LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", + *target_width += width_offset; + *target_height += height_offset; + LOG_WARN("align %s up %dx%d to %dx%d (multiple=%d)", + label, original_width, original_height, - width, - height, + *target_width, + *target_height, spatial_multiple); } + void resolve_hires() { + if (!hires.enabled) { + return; + } + if (hires.upscaler == SD_HIRES_UPSCALER_NONE) { + hires.enabled = false; + return; + } + if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) { /* out of range on either side, e.g. the COUNT sentinel that str_to_sd_hires_upscaler returns for an unknown name */ + LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler); + hires.enabled = false; + return; + } + if (hires.upscaler == SD_HIRES_UPSCALER_MODEL && strlen(SAFE_STR(hires.model_path)) == 0) { + LOG_WARN("hires model upscaler requires a model path, disabling hires"); + hires.enabled = false; + return; + } + if (hires.scale <= 0.f && hires.target_width <= 0 && hires.target_height <= 0) { + LOG_WARN("hires scale must be positive when no target size is set, disabling hires"); + hires.enabled = false; + return; + } + hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f); + hires.steps = std::max(0, hires.steps); + + if (hires.target_width > 0 && hires.target_height > 0) { + // pass + } else if (hires.target_width > 0) { + hires.target_height =
hires.target_width; + } else if (hires.target_height > 0) { + hires.target_width = hires.target_height; + } else { + hires.target_width = static_cast(std::round(width * hires.scale)); + hires.target_height = static_cast(std::round(height * hires.scale)); + } + + if (hires.target_width <= 0 || hires.target_height <= 0) { + LOG_WARN("hires target size is not positive, disabling hires"); + hires.enabled = false; + return; + } + align_image_size(&hires.target_width, &hires.target_height, "hires target"); + } + static void resolve_guidance(sd_ctx_t* sd_ctx, sd_guidance_params_t* guidance, bool* use_uncond, @@ -2637,6 +2735,7 @@ struct GenerationRequest { void resolve(sd_ctx_t* sd_ctx) { align_generation_request_size(); + resolve_hires(); seed = resolve_seed(seed); resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond); @@ -3149,6 +3248,67 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, return result_images; } +static sd::Tensor upscale_hires_latent(sd_ctx_t* sd_ctx, + const sd::Tensor& latent, + const GenerationRequest& request, + UpscalerGGML* upscaler) { + if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST) { + std::vector target_shape = latent.shape(); + if (target_shape.size() < 2) { + LOG_ERROR("latent has invalid shape for hires upscale"); + return {}; + } + target_shape[0] = request.hires.target_width / request.vae_scale_factor; + target_shape[1] = request.hires.target_height / request.vae_scale_factor; + + LOG_INFO("hires latent upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64, + latent.shape()[0], + latent.shape()[1], + target_shape[0], + target_shape[1]); + return sd::ops::interpolate(latent, target_shape, sd::ops::InterpolateMode::Nearest); + } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) { + if (upscaler == nullptr) { + LOG_ERROR("hires model upscaler context is null"); + return {}; + } + if (sd_ctx->sd->vae_decode_only) { + LOG_ERROR("hires model upscaler requires VAE encoder weights; create the 
context with vae_decode_only=false"); + return {}; + } + + sd::Tensor decoded = sd_ctx->sd->decode_first_stage(latent); + if (decoded.empty()) { + LOG_ERROR("decode_first_stage failed before hires model upscale"); + return {}; + } + + sd::Tensor upscaled_tensor = upscaler->upscale_tensor(decoded); + if (upscaled_tensor.empty()) { + LOG_ERROR("hires model upscale failed"); + return {}; + } + + if (upscaled_tensor.shape()[0] != request.hires.target_width || + upscaled_tensor.shape()[1] != request.hires.target_height) { + upscaled_tensor = sd::ops::interpolate(upscaled_tensor, + {request.hires.target_width, + request.hires.target_height, + upscaled_tensor.shape()[2], + upscaled_tensor.shape()[3]}); + } + + sd::Tensor upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor); + if (upscaled_latent.empty()) { + LOG_ERROR("encode_first_stage failed after hires model upscale"); + } + return upscaled_latent; + } + + LOG_ERROR("unsupported hires upscaler '%s'", sd_hires_upscaler_name(request.hires.upscaler)); + return {}; +} + SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; @@ -3236,7 +3396,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s } return nullptr; } - if (sd_ctx->sd->free_params_immediately) { + if (sd_ctx->sd->free_params_immediately && !request.hires.enabled) { sd_ctx->sd->diffusion_model->free_params_buffer(); } int64_t denoise_end = ggml_time_ms(); @@ -3244,6 +3404,131 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s final_latents.size(), (denoise_end - denoise_start) * 1.0f / 1000); + if (request.hires.enabled && request.hires.target_width > 0) { + LOG_INFO("hires fix: upscaling to %dx%d", request.hires.target_width, request.hires.target_height); + + std::unique_ptr hires_upscaler; + if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) { + LOG_INFO("hires 
fix: loading model upscaler from '%s'", request.hires.model_path); + hires_upscaler = std::make_unique(sd_ctx->sd->n_threads, + false, + request.hires.upscale_tile_size); + if (!hires_upscaler->load_from_file(request.hires.model_path, + sd_ctx->sd->offload_params_to_cpu, + sd_ctx->sd->n_threads)) { + LOG_ERROR("load hires model upscaler failed"); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return nullptr; + } + } + + int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps; + + // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps, + // unlike img2img which trims from a fixed step count + hires_steps = static_cast(hires_steps / request.hires.denoising_strength); + + std::vector hires_sigmas = sd_ctx->sd->denoiser->get_sigmas( + hires_steps, + sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width), + sd_img_gen_params->sample_params.scheduler, + sd_ctx->sd->version); + + size_t t_enc = static_cast(hires_steps * request.hires.denoising_strength); + if (t_enc >= static_cast(hires_steps)) { + t_enc = static_cast(hires_steps) - 1; + } + std::vector hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast(t_enc) - 1, + hires_sigmas.end()); + LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu", + hires_steps, + request.hires.denoising_strength, + hires_sigma_sched.size()); + + std::vector> hires_final_latents; + int64_t hires_denoise_start = ggml_time_ms(); + for (int b = 0; b < (int)final_latents.size(); b++) { + int64_t cur_seed = request.seed + b; + sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); + + sd::Tensor upscaled = upscale_hires_latent(sd_ctx, + final_latents[b], + request, + hires_upscaler.get()); + if (upscaled.empty()) { + if (sd_ctx->sd->free_params_immediately) { + 
sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return nullptr; + } + + sd::Tensor noise = sd::randn_like(upscaled, sd_ctx->sd->rng); + + sd::Tensor hires_denoise_mask; + if (!latents.denoise_mask.empty()) { + std::vector mask_shape = latents.denoise_mask.shape(); + mask_shape[0] = upscaled.shape()[0]; + mask_shape[1] = upscaled.shape()[1]; + hires_denoise_mask = sd::ops::interpolate(latents.denoise_mask, + mask_shape, + sd::ops::InterpolateMode::NearestMax); + } + + int64_t hires_sample_start = ggml_time_ms(); + sd::Tensor x_0 = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + upscaled, + std::move(noise), + embeds.cond, + embeds.uncond, + embeds.img_cond, + embeds.id_cond, + latents.control_image, + request.control_strength, + request.guidance, + plan.eta, + request.shifted_timestep, + plan.sample_method, + sd_ctx->sd->is_flow_denoiser(), + hires_sigma_sched, + plan.start_merge_step, + latents.ref_latents, + request.increase_ref_index, + hires_denoise_mask, + sd::Tensor(), + 1.f, + request.cache_params); + int64_t hires_sample_end = ggml_time_ms(); + if (!x_0.empty()) { + LOG_INFO("hires sampling %d/%d completed, taking %.2fs", + b + 1, + (int)final_latents.size(), + (hires_sample_end - hires_sample_start) * 1.0f / 1000); + hires_final_latents.push_back(std::move(x_0)); + continue; + } + + LOG_ERROR("hires sampling for image %d/%d failed after %.2fs", + b + 1, + (int)final_latents.size(), + (hires_sample_end - hires_sample_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return nullptr; + } + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + int64_t hires_denoise_end = ggml_time_ms(); + LOG_INFO("hires fix completed, taking %.2fs", (hires_denoise_end - hires_denoise_start) * 1.0f / 1000); + + final_latents = std::move(hires_final_latents); + } + auto result = decode_image_outputs(sd_ctx, request, final_latents); if 
(result == nullptr) { return nullptr; diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 03f7714e5..ed7bb89a0 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -1,125 +1,115 @@ -#include "esrgan.hpp" +#include "upscaler.h" #include "ggml_extend.hpp" #include "model.h" #include "stable-diffusion.h" #include "util.h" -struct UpscalerGGML { - ggml_backend_t backend = nullptr; // general backend - ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr esrgan_upscaler; - std::string esrgan_path; - int n_threads; - bool direct = false; - int tile_size = 128; - - UpscalerGGML(int n_threads, - bool direct = false, - int tile_size = 128) - : n_threads(n_threads), - direct(direct), - tile_size(tile_size) { - } +UpscalerGGML::UpscalerGGML(int n_threads, + bool direct, + int tile_size) + : n_threads(n_threads), + direct(direct), + tile_size(tile_size) { +} - bool load_from_file(const std::string& esrgan_path, - bool offload_params_to_cpu, - int n_threads) { - ggml_log_set(ggml_log_callback_default, nullptr); +bool UpscalerGGML::load_from_file(const std::string& esrgan_path, + bool offload_params_to_cpu, + int n_threads) { + ggml_log_set(ggml_log_callback_default, nullptr); #ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); + LOG_DEBUG("Using CUDA backend"); + backend = ggml_backend_cuda_init(0); #endif #ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); + LOG_DEBUG("Using Metal backend"); + backend = ggml_backend_metal_init(); #endif #ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - backend = ggml_backend_vk_init(0); + LOG_DEBUG("Using Vulkan backend"); + backend = ggml_backend_vk_init(0); #endif #ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - backend = ggml_backend_opencl_init(); + LOG_DEBUG("Using OpenCL backend"); + backend = ggml_backend_opencl_init(); #endif #ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); + 
LOG_DEBUG("Using SYCL backend"); + backend = ggml_backend_sycl_init(0); #endif - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { - LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); - } - model_loader.set_wtype_override(model_data_type); - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } - LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); - if (direct) { - esrgan_upscaler->set_conv2d_direct_enabled(true); - } - if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) { - return false; - } - return true; + ModelLoader model_loader; + if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { + LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); + } + model_loader.set_wtype_override(model_data_type); + if (!backend) { + LOG_DEBUG("Using CPU backend"); + backend = ggml_backend_cpu_init(); + } + LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); + if (direct) { + esrgan_upscaler->set_conv2d_direct_enabled(true); + } + if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) { + return false; } + return true; +} - sd::Tensor upscale_tensor(const sd::Tensor& input_tensor) { - sd::Tensor upscaled; - if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) { - upscaled = esrgan_upscaler->compute(n_threads, input_tensor); - } else { - auto on_processing = [&](const sd::Tensor& input_tile) -> sd::Tensor { - auto output_tile = esrgan_upscaler->compute(n_threads, input_tile); - if (output_tile.empty()) { - LOG_ERROR("esrgan compute failed while processing a tile"); - return {}; - } - return output_tile; - 
}; +sd::Tensor UpscalerGGML::upscale_tensor(const sd::Tensor& input_tensor) { + sd::Tensor upscaled; + if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) { + upscaled = esrgan_upscaler->compute(n_threads, input_tensor); + } else { + auto on_processing = [&](const sd::Tensor& input_tile) -> sd::Tensor { + auto output_tile = esrgan_upscaler->compute(n_threads, input_tile); + if (output_tile.empty()) { + LOG_ERROR("esrgan compute failed while processing a tile"); + return {}; + } + return output_tile; + }; - upscaled = process_tiles_2d(input_tensor, - static_cast(input_tensor.shape()[0] * esrgan_upscaler->scale), - static_cast(input_tensor.shape()[1] * esrgan_upscaler->scale), - esrgan_upscaler->scale, - tile_size, - tile_size, - 0.25f, - false, - false, - on_processing); - } - esrgan_upscaler->free_compute_buffer(); - if (upscaled.empty()) { - LOG_ERROR("esrgan compute failed"); - return {}; - } - return upscaled; + upscaled = process_tiles_2d(input_tensor, + static_cast(input_tensor.shape()[0] * esrgan_upscaler->scale), + static_cast(input_tensor.shape()[1] * esrgan_upscaler->scale), + esrgan_upscaler->scale, + tile_size, + tile_size, + 0.25f, + false, + false, + on_processing); + } + esrgan_upscaler->free_compute_buffer(); + if (upscaled.empty()) { + LOG_ERROR("esrgan compute failed"); + return {}; } + return upscaled; +} - sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) { - // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth - sd_image_t upscaled_image = {0, 0, 0, nullptr}; - int output_width = (int)input_image.width * esrgan_upscaler->scale; - int output_height = (int)input_image.height * esrgan_upscaler->scale; - LOG_INFO("upscaling from (%i x %i) to (%i x %i)", - input_image.width, input_image.height, output_width, output_height); +sd_image_t UpscalerGGML::upscale(sd_image_t input_image, uint32_t upscale_factor) { + // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth + 
sd_image_t upscaled_image = {0, 0, 0, nullptr}; + int output_width = (int)input_image.width * esrgan_upscaler->scale; + int output_height = (int)input_image.height * esrgan_upscaler->scale; + LOG_INFO("upscaling from (%i x %i) to (%i x %i)", + input_image.width, input_image.height, output_width, output_height); - sd::Tensor input_tensor = sd_image_to_tensor(input_image); - sd::Tensor upscaled; - int64_t t0 = ggml_time_ms(); - upscaled = upscale_tensor(input_tensor); - if (upscaled.empty()) { - return upscaled_image; - } - sd_image_t upscaled_data = tensor_to_sd_image(upscaled); - int64_t t3 = ggml_time_ms(); - LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f); - upscaled_image = upscaled_data; + sd::Tensor input_tensor = sd_image_to_tensor(input_image); + sd::Tensor upscaled; + int64_t t0 = ggml_time_ms(); + upscaled = upscale_tensor(input_tensor); + if (upscaled.empty()) { return upscaled_image; } -}; + sd_image_t upscaled_data = tensor_to_sd_image(upscaled); + int64_t t3 = ggml_time_ms(); + LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f); + upscaled_image = upscaled_data; + return upscaled_image; +} struct upscaler_ctx_t { UpscalerGGML* upscaler = nullptr; diff --git a/src/upscaler.h b/src/upscaler.h new file mode 100644 index 000000000..b11f004a6 --- /dev/null +++ b/src/upscaler.h @@ -0,0 +1,31 @@ +#ifndef __SD_UPSCALER_H__ +#define __SD_UPSCALER_H__ + +#include "esrgan.hpp" +#include "stable-diffusion.h" +#include "tensor.hpp" + +#include +#include + +struct UpscalerGGML { + ggml_backend_t backend = nullptr; // general backend + ggml_type model_data_type = GGML_TYPE_F16; + std::shared_ptr esrgan_upscaler; + std::string esrgan_path; + int n_threads; + bool direct = false; + int tile_size = 128; + + UpscalerGGML(int n_threads, + bool direct = false, + int tile_size = 128); + + bool load_from_file(const std::string& esrgan_path, + bool offload_params_to_cpu, + int n_threads); + sd::Tensor upscale_tensor(const 
sd::Tensor& input_tensor); + sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor); +}; + +#endif // __SD_UPSCALER_H__