From 516b85b8da51ca05d5842031ddf0d9cefbfa4288 Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 25 Feb 2026 15:49:48 +0100 Subject: [PATCH] add anima --- src/anima.hpp | 774 +++++++++++++++++++++++++++++++++++++++ src/conditioner.hpp | 136 +++++++ src/diffusion_model.hpp | 67 ++++ src/model.cpp | 3 + src/model.h | 9 + src/name_conversion.cpp | 8 + src/rope.hpp | 19 +- src/stable-diffusion.cpp | 33 +- 8 files changed, 1037 insertions(+), 12 deletions(-) create mode 100644 src/anima.hpp diff --git a/src/anima.hpp b/src/anima.hpp new file mode 100644 index 000000000..c27901334 --- /dev/null +++ b/src/anima.hpp @@ -0,0 +1,774 @@ +#ifndef __ANIMA_HPP__ +#define __ANIMA_HPP__ + +#include +#include +#include +#include + +#include "common.hpp" +#include "flux.hpp" +#include "ggml_extend.hpp" +#include "rope.hpp" + +namespace Anima { + constexpr int ANIMA_GRAPH_SIZE = 65536; + + __STATIC_INLINE__ struct ggml_tensor* patchify_2d(struct ggml_context* ctx, + struct ggml_tensor* x, + int64_t patch_size) { + // x: [W*r, H*q, T, C] + // return: [W, H, T, C*q*r] + if (patch_size == 1) { + return x; + } + GGML_ASSERT(x->ne[2] == 1); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t T = x->ne[2]; + int64_t C = x->ne[3]; + int64_t p = patch_size; + int64_t h = H / p; + int64_t w = W / p; + + GGML_ASSERT(T == 1); + GGML_ASSERT(h * p == H && w * p == W); + + // Reuse Flux patchify layout on a [W, H, C, N] view. + x = ggml_reshape_4d(ctx, x, W, H, C, T); // [W, H, C, N] + + // Flux patchify: [N, C, H, W] -> [N, h*w, C*p*p] + x = ggml_reshape_4d(ctx, x, p, w, p, h * C * T); // [p, w, p, h*C*N] + x = ggml_ext_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [p, p, w, h*C*N] + x = ggml_reshape_4d(ctx, x, p * p, w * h, C, T); // [p*p, h*w, C, N] + x = ggml_ext_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [p*p, C, h*w, N] + x = ggml_reshape_3d(ctx, x, p * p * C, w * h, T); // [C*p*p, h*w, N] + + // Return [w, h, T, C*p*p] + x = ggml_reshape_4d(ctx, x, p * p * C, w, h, T); // [C*p*p, w, h, N] + x = ggml_ext_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2)); // [w, h, N, C*p*p] + return x; + } + + __STATIC_INLINE__ struct ggml_tensor* unpatchify_2d(struct ggml_context* ctx, + struct ggml_tensor* x, + int64_t patch_size) { + // x: [W, H, T, C*q*r] + // return: [W*r, H*q, T, C] + if (patch_size == 1) { + return x; + } + GGML_ASSERT(x->ne[2] == 1); + + int64_t w = x->ne[0]; + int64_t h = x->ne[1]; + int64_t T = x->ne[2]; + int64_t p = patch_size; + int64_t nm = p * p; + int64_t Cp = x->ne[3]; + int64_t C = Cp / nm; + int64_t W = w * p; + int64_t H = h * p; + + GGML_ASSERT(T == 1); + GGML_ASSERT(C * nm == Cp); + + // [w, h, 1, C*p*p] -> [W, H, 1, C] + x = ggml_reshape_4d(ctx, x, w, h * C, p, p); // [w, h*C, p2, p1] + x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 3, 1)); // [p2, w, p1, h*C] + x = ggml_reshape_4d(ctx, x, W, H, T, C); // [W, H, 1, C] + return x; + } + + __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* gate) { + gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C] + return ggml_mul(ctx, x, gate); + } + + struct XEmbedder : public GGMLBlock { + public: + XEmbedder(int64_t in_dim, int64_t out_dim) { + blocks["proj.1"] = std::make_shared(in_dim, out_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto proj = std::dynamic_pointer_cast(blocks["proj.1"]); + return proj->forward(ctx, x); + } + }; + + struct TimestepEmbedder : public GGMLBlock { + public: + TimestepEmbedder(int64_t in_dim, int64_t out_dim) { + blocks["1.linear_1"] = std::make_shared(in_dim, in_dim, false); + blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]); + + x = linear_1->forward(ctx, x); + x = ggml_silu_inplace(ctx->ggml_ctx, x); + x = linear_2->forward(ctx, x); + return x; + } + }; + + struct AdaLayerNormZero : public GGMLBlock { + protected: + int64_t in_features; + + public: + AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256) + : in_features(in_features) { + blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false); + blocks["1"] = std::make_shared(in_features, hidden_features, false); + blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false); + } + + std::pair forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb = nullptr) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); + + auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep); + emb = linear_1->forward(ctx, emb); + emb = linear_2->forward(ctx, emb); // [N, 3*C] + + if (temb != nullptr) { + emb = ggml_add(ctx->ggml_ctx, emb, temb); + } + + auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0); + auto shift = emb_chunks[0]; + auto scale = emb_chunks[1]; + auto gate = emb_chunks[2]; + + auto x = norm->forward(ctx, hidden_states); + x = Flux::modulate(ctx->ggml_ctx, x, shift, scale); + + return {x, gate}; + } + }; + + struct AdaLayerNorm : public GGMLBlock { + protected: + int64_t embedding_dim; + + public: + AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256) + : embedding_dim(in_features) { + blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false); + blocks["1"] = std::make_shared(in_features, hidden_features, false); + blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb = nullptr) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); + + auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep); + emb = linear_1->forward(ctx, emb); + emb = linear_2->forward(ctx, emb); // [N, 2*C] + + if (temb != nullptr) { + auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0); + emb = ggml_add(ctx->ggml_ctx, emb, temb_2c); + } + + auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0); + auto shift = emb_chunks[0]; + auto scale = emb_chunks[1]; + + auto x = norm->forward(ctx, hidden_states); + x = Flux::modulate(ctx->ggml_ctx, x, shift, scale); + return x; + } + }; + + struct AnimaAttention : public GGMLBlock { + protected: + int64_t num_heads; + int64_t head_dim; + std::string out_proj_name; + + public: + AnimaAttention(int64_t query_dim, + int64_t context_dim, + int64_t num_heads, + int64_t head_dim, + const std::string& out_proj_name = "output_proj") + : num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) { + int64_t inner_dim = num_heads * head_dim; + + blocks["q_proj"] = std::make_shared(query_dim, inner_dim, false); + blocks["k_proj"] = std::make_shared(context_dim, inner_dim, false); + blocks["v_proj"] = std::make_shared(context_dim, inner_dim, false); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* encoder_hidden_states = nullptr, + struct ggml_tensor* pe_q = nullptr, + struct ggml_tensor* pe_k = nullptr) { + if (encoder_hidden_states == nullptr) { + encoder_hidden_states = hidden_states; + } + + auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + auto out_proj = std::dynamic_pointer_cast(blocks[out_proj_name]); + + auto q = q_proj->forward(ctx, hidden_states); + auto k = k_proj->forward(ctx, encoder_hidden_states); + auto v = v_proj->forward(ctx, encoder_hidden_states); + + int64_t N = q->ne[2]; + int64_t L_q = q->ne[1]; + int64_t L_k = k->ne[1]; + + auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D] + auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D] + auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D] + + q4 = q_norm->forward(ctx, q4); + k4 = k_norm->forward(ctx, k4); + + struct ggml_tensor* attn_out = nullptr; + if (pe_q != nullptr || pe_k != nullptr) { + if (pe_q == nullptr) { + pe_q = pe_k; + } + if (pe_k == nullptr) { + pe_k = pe_q; + } + auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false); + auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false); + attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q_rope, + k_rope, + v4, + num_heads, + nullptr, + true, + ctx->flash_attn_enabled); + } else { + auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N); + auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N); + attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q_flat, + k_flat, + v, + num_heads, + nullptr, + false, + ctx->flash_attn_enabled); + } + + return out_proj->forward(ctx, attn_out); + } + }; + + struct AnimaMLP : public GGMLBlock { + public: + AnimaMLP(int64_t dim, int64_t hidden_dim) { + blocks["layer1"] = std::make_shared(dim, hidden_dim, false); + blocks["layer2"] = std::make_shared(hidden_dim, dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]); + auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]); + + x = layer1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + x = layer2->forward(ctx, x); + return x; + } + }; + + struct AdapterMLP : public GGMLBlock { + public: + AdapterMLP(int64_t dim, int64_t hidden_dim) { + blocks["0"] = std::make_shared(dim, hidden_dim, true); + blocks["2"] = std::make_shared(hidden_dim, dim, true); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto layer0 = std::dynamic_pointer_cast(blocks["0"]); + auto layer2 = std::dynamic_pointer_cast(blocks["2"]); + + x = layer0->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + x = layer2->forward(ctx, x); + return x; + } + }; + + struct LLMAdapterBlock : public GGMLBlock { + public: + LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) { + blocks["norm_self_attn"] = std::make_shared(model_dim, 1e-6f); + blocks["self_attn"] = std::make_shared(model_dim, model_dim, num_heads, head_dim, "o_proj"); + blocks["norm_cross_attn"] = std::make_shared(model_dim, 1e-6f); + blocks["cross_attn"] = std::make_shared(model_dim, source_dim, num_heads, head_dim, "o_proj"); + blocks["norm_mlp"] = std::make_shared(model_dim, 1e-6f); + blocks["mlp"] = std::make_shared(model_dim, model_dim * 4); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* context, + struct ggml_tensor* target_pe, + struct ggml_tensor* context_pe) { + auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]); + auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); + auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]); + auto cross_attn = std::dynamic_pointer_cast(blocks["cross_attn"]); + auto norm_mlp = std::dynamic_pointer_cast(blocks["norm_mlp"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto h = norm_self_attn->forward(ctx, x); + h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe); + x = ggml_add(ctx->ggml_ctx, x, h); + + h = norm_cross_attn->forward(ctx, x); + h = cross_attn->forward(ctx, h, context, target_pe, context_pe); + x = ggml_add(ctx->ggml_ctx, x, h); + + h = norm_mlp->forward(ctx, x); + h = mlp->forward(ctx, h); + x = ggml_add(ctx->ggml_ctx, x, h); + + return x; + } + }; + + struct LLMAdapter : public GGMLBlock { + protected: + int num_layers; + + public: + LLMAdapter(int64_t source_dim = 1024, + int64_t target_dim = 1024, + int64_t model_dim = 1024, + int num_layers = 6, + int num_heads = 16) + : num_layers(num_layers) { + int64_t head_dim = model_dim / num_heads; + + blocks["embed"] = std::make_shared(32128, target_dim); + for (int i = 0; i < num_layers; i++) { + blocks["blocks." + std::to_string(i)] = + std::make_shared(model_dim, source_dim, num_heads, head_dim); + } + blocks["out_proj"] = std::make_shared(model_dim, target_dim, true); + blocks["norm"] = std::make_shared(target_dim, 1e-6f); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* source_hidden_states, + struct ggml_tensor* target_input_ids, + struct ggml_tensor* target_pe, + struct ggml_tensor* source_pe) { + GGML_ASSERT(target_input_ids != nullptr); + if (ggml_n_dims(target_input_ids) == 1) { + target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1); + } + + auto embed = std::dynamic_pointer_cast(blocks["embed"]); + auto out_proj = std::dynamic_pointer_cast(blocks["out_proj"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim] + + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe); + } + + x = out_proj->forward(ctx, x); + x = norm->forward(ctx, x); + return x; + } + }; + + struct TransformerBlock : public GGMLBlock { + public: + TransformerBlock(int64_t hidden_size, + int64_t text_embed_dim, + int64_t num_heads, + int64_t head_dim, + int64_t mlp_ratio = 4, + int64_t adaln_lora_dim = 256) { + blocks["adaln_modulation_self_attn"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["self_attn"] = std::make_shared(hidden_size, hidden_size, num_heads, head_dim); + blocks["adaln_modulation_cross_attn"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["cross_attn"] = std::make_shared(hidden_size, text_embed_dim, num_heads, head_dim); + blocks["adaln_modulation_mlp"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* encoder_hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb, + struct ggml_tensor* image_pe) { + auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]); + auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]); + auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]); + auto attn2 = std::dynamic_pointer_cast(blocks["cross_attn"]); + auto norm3 = std::dynamic_pointer_cast(blocks["adaln_modulation_mlp"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb); + auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1)); + + auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb); + h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2)); + + auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb); + h = mlp->forward(ctx, normed3); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3)); + + return hidden_states; + } + }; + + struct FinalLayer : public GGMLBlock { + protected: + int64_t hidden_size; + int64_t patch_size; + int64_t out_channels; + + public: + FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) + : hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) { + blocks["adaln_modulation"] = std::make_shared(hidden_size, 256); + blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb) { + auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]); + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + + hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb); + hidden_states = linear->forward(ctx, hidden_states); + return hidden_states; + } + }; + + struct AnimaNet : public GGMLBlock { + public: + int64_t in_channels = 16; + int64_t out_channels = 16; + int64_t hidden_size = 2048; + int64_t text_embed_dim = 1024; + int64_t num_heads = 16; + int64_t head_dim = 128; + int64_t patch_size = 2; + int64_t num_layers = 28; + std::vector axes_dim = {44, 42, 42}; + int theta = 10000; + + public: + AnimaNet() = default; + explicit AnimaNet(int64_t num_layers) + : num_layers(num_layers) { + blocks["x_embedder"] = std::make_shared((in_channels + 1) * patch_size * patch_size, hidden_size); + blocks["t_embedder"] = std::make_shared(hidden_size, hidden_size * 3); + blocks["t_embedding_norm"] = std::make_shared(hidden_size, 1e-6f); + for (int i = 0; i < num_layers; i++) { + blocks["blocks." + std::to_string(i)] = std::make_shared(hidden_size, + text_embed_dim, + num_heads, + head_dim); + } + blocks["final_layer"] = std::make_shared(hidden_size, patch_size, out_channels); + blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* timestep, + struct ggml_tensor* encoder_hidden_states, + struct ggml_tensor* image_pe, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr, + struct ggml_tensor* adapter_q_pe = nullptr, + struct ggml_tensor* adapter_k_pe = nullptr) { + GGML_ASSERT(x->ne[3] == 1); + + auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); + auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]); + auto t_embedding_norm = std::dynamic_pointer_cast(blocks["t_embedding_norm"]); + auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); + auto llm_adapter = std::dynamic_pointer_cast(blocks["llm_adapter"]); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + + x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); // [N*C, T, H, W] style + + int64_t pad_h = (patch_size - H % patch_size) % patch_size; + int64_t pad_w = (patch_size - W % patch_size) % patch_size; + if (pad_h > 0 || pad_w > 0) { + x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); + } + + auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], x->ne[2], 1); + x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 3); // concat mask channel + + x = patchify_2d(ctx->ggml_ctx, x, patch_size); // [C*4, T, H/2, W/2] + + int64_t w_len = x->ne[0]; + int64_t h_len = x->ne[1]; + int64_t t_len = x->ne[2]; + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1] * x->ne[2], x->ne[3], 1); + x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, n_token, C] + + x = x_embedder->forward(ctx, x); + + auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, hidden_size); + auto temb = t_embedder->forward(ctx, timestep_proj); + auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj); + + if (t5_ids != nullptr) { + auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe); + if (t5_weights != nullptr) { + auto w = t5_weights; + if (ggml_n_dims(w) == 1) { + w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1); + } + w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1); + adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w); + } + if (adapted_context->ne[1] < 512) { + auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx, + adapted_context->ne[0], + 512 - adapted_context->ne[1], + adapted_context->ne[2], + 1); + adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1); + } else if (adapted_context->ne[1] > 512) { + adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512); + } + encoder_hidden_states = adapted_context; + } + + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe); + } + + x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, n_token, C*4] + + x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [n_token, C*4, N] + x = ggml_reshape_4d(ctx->ggml_ctx, x, w_len, h_len, t_len, x->ne[1]); // [C*4, T, H/2, W/2] + x = unpatchify_2d(ctx->ggml_ctx, x, patch_size); // [C, T, H, W] + + x = ggml_ext_slice(ctx->ggml_ctx, x, 1, 0, H); // [C, T, H, W + pad] + x = ggml_ext_slice(ctx->ggml_ctx, x, 0, 0, W); // [C, T, H, W] + x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[3], x->ne[2]); // [N, C, H, W] + + return x; + } + }; + + struct AnimaRunner : public GGMLRunner { + public: + std::vector image_pe_vec; + std::vector adapter_q_pe_vec; + std::vector adapter_k_pe_vec; + AnimaNet net; + + AnimaRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : GGMLRunner(backend, offload_params_to_cpu) { + int64_t num_layers = 0; + std::string layer_tag = prefix + ".net.blocks."; + for (const auto& kv : tensor_storage_map) { + const std::string& tensor_name = kv.first; + size_t pos = tensor_name.find(layer_tag); + if (pos == std::string::npos) { + continue; + } + size_t start = pos + layer_tag.size(); + size_t end = tensor_name.find('.', start); + if (end == std::string::npos) { + continue; + } + int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str()); + num_layers = std::max(num_layers, layer_id + 1); + } + if (num_layers <= 0) { + num_layers = 28; + } + LOG_INFO("anima net layers: %" PRId64, num_layers); + + net = AnimaNet(num_layers); + net.init(params_ctx, tensor_storage_map, prefix + ".net"); + } + + std::string get_desc() override { + return "anima"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + net.get_param_tensors(tensors, prefix + ".net"); + } + + static std::vector gen_1d_rope_pe_vec(int64_t seq_len, int dim, int theta = 10000) { + std::vector pos(seq_len); + for (int64_t i = 0; i < seq_len; i++) { + pos[i] = static_cast(i); + } + auto rope_emb = Rope::rope(pos, dim, theta); + return Rope::flatten(rope_emb); + } + + static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) { + if (extrapolation_ratio == 1.0f || axis_dim <= 2) { + return 1.0f; + } + return std::pow(extrapolation_ratio, static_cast(axis_dim) / static_cast(axis_dim - 2)); + } + + static std::vector gen_anima_image_pe_vec(int bs, + int h, + int w, + int patch_size, + int theta, + const std::vector& axes_dim, + float h_extrapolation_ratio, + float w_extrapolation_ratio, + float t_extrapolation_ratio) { + static const std::vector empty_ref_latents; + auto ids = Rope::gen_flux_ids(h, + w, + patch_size, + bs, + static_cast(axes_dim.size()), + 0, + {}, + empty_ref_latents, + false, + 1.0f); + + std::vector axis_thetas = { + static_cast(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]), + static_cast(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]), + static_cast(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]), + }; + return Rope::embed_nd(ids, bs, axis_thetas, axes_dim); + } + + struct ggml_cgraph* build_graph(struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr) { + GGML_ASSERT(x->ne[3] == 1); + struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); + + x = to_backend(x); + timesteps = to_backend(timesteps); + context = to_backend(context); + t5_ids = to_backend(t5_ids); + t5_weights = to_backend(t5_weights); + + int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size; + int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size; + int64_t h_pad = x->ne[1] + pad_h; + int64_t w_pad = x->ne[0] + pad_w; + + image_pe_vec = gen_anima_image_pe_vec(1, + static_cast(h_pad), + static_cast(w_pad), + static_cast(net.patch_size), + net.theta, + net.axes_dim, + 4.0f, + 4.0f, + 1.0f); + int64_t image_pos_len = static_cast(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2)); + auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len); + set_backend_tensor_data(image_pe, image_pe_vec.data()); + + ggml_tensor* adapter_q_pe = nullptr; + ggml_tensor* adapter_k_pe = nullptr; + if (t5_ids != nullptr) { + int64_t target_len = t5_ids->ne[0]; + int64_t source_len = context->ne[1]; + + adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000); + adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000); + + int64_t target_pos_len = static_cast(adapter_q_pe_vec.size()) / (2 * 2 * 32); + int64_t source_pos_len = static_cast(adapter_k_pe_vec.size()) / (2 * 2 * 32); + + adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len); + adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len); + set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data()); + set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data()); + } + + auto runner_ctx = get_context(); + auto out = net.forward(&runner_ctx, + x, + timesteps, + context, + image_pe, + t5_ids, + t5_weights, + adapter_q_pe, + adapter_k_pe); + + ggml_build_forward_expand(gf, out); + return gf; + } + + bool compute(int n_threads, + struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph(x, timesteps, context, t5_ids, t5_weights); + }; + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } + }; +} // namespace Anima + +#endif // __ANIMA_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 4317ed18a..879d68c7e 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -1641,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner { } }; +struct AnimaConditioner : public Conditioner { + std::shared_ptr qwen_tokenizer; + T5UniGramTokenizer t5_tokenizer; + std::shared_ptr llm; + + AnimaConditioner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}) { + qwen_tokenizer = std::make_shared(); + llm = std::make_shared(LLM::LLMArch::QWEN3, + backend, + offload_params_to_cpu, + tensor_storage_map, + "text_encoders.llm", + false); + } + + void get_param_tensors(std::map& tensors) override { + llm->get_param_tensors(tensors, "text_encoders.llm"); + } + + void alloc_params_buffer() override { + llm->alloc_params_buffer(); + } + + void free_params_buffer() override { + llm->free_params_buffer(); + } + + size_t get_params_buffer_size() override { + return llm->get_params_buffer_size(); + } + + void set_flash_attention_enabled(bool enabled) override { + llm->set_flash_attention_enabled(enabled); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + llm->set_weight_adapter(adapter); + } + + std::tuple, std::vector, std::vector, std::vector> tokenize(std::string text) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector qwen_tokens; + std::vector qwen_weights; + std::vector t5_tokens; + std::vector t5_weights; + + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + std::vector curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr); + qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + // Anima uses uniform Qwen token weights. + qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f); + } + if (qwen_tokens.empty()) { + qwen_tokens.push_back(151643); // qwen3 pad token + qwen_weights.push_back(1.f); + } + + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + } + + return {qwen_tokens, qwen_weights, t5_tokens, t5_weights}; + } + + SDCondition get_learned_condition(ggml_context* work_ctx, + int n_threads, + const ConditionerParams& conditioner_params) override { + int64_t t0 = ggml_time_ms(); + + auto tokenized = tokenize(conditioner_params.text); + auto& qwen_tokens = std::get<0>(tokenized); + auto& qwen_weights = std::get<1>(tokenized); + auto& t5_tokens = std::get<2>(tokenized); + auto& t5_weights = std::get<3>(tokenized); + + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens); + + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] + llm->compute(n_threads, + input_ids, + nullptr, + {}, + {}, + &hidden_states, + work_ctx); + + { + auto tensor = hidden_states; + float original_mean = ggml_ext_tensor_mean(tensor); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); + value *= qwen_weights[i1]; + ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); + } + } + } + float new_mean = ggml_ext_tensor_mean(tensor); + if (new_mean != 0.f) { + ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + } + } + + struct ggml_tensor* t5_ids_tensor = nullptr; + struct ggml_tensor* t5_weight_tensor = nullptr; + if (!t5_tokens.empty()) { + t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens); + t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights); + } + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); + + return {hidden_states, t5_weight_tensor, t5_ids_tensor}; + } +}; + struct LLMEmbedder : public Conditioner { SDVersion version; std::shared_ptr tokenizer; diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 3293ba9b7..329bb9d9a 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -1,6 +1,7 @@ #ifndef __DIFFUSION_MODEL_H__ #define __DIFFUSION_MODEL_H__ +#include "anima.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" @@ -242,6 +243,72 @@ struct FluxModel : public DiffusionModel { } }; +struct AnimaModel : public DiffusionModel { + std::string prefix; + Anima::AnimaRunner anima; + + AnimaModel(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return anima.get_desc(); + } + + void alloc_params_buffer() override { + anima.alloc_params_buffer(); + } + + void free_params_buffer() override { + anima.free_params_buffer(); + } + + void free_compute_buffer() override { + anima.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + anima.get_param_tensors(tensors, prefix); + } + + size_t get_params_buffer_size() override { + return anima.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + anima.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 768; + } + + void set_flash_attention_enabled(bool enabled) { + anima.set_flash_attention_enabled(enabled); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + anima.set_circular_axes(circular_x, circular_y); + } + + bool compute(int n_threads, + DiffusionParams diffusion_params, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { + return anima.compute(n_threads, + diffusion_params.x, + diffusion_params.timesteps, + diffusion_params.context, + diffusion_params.c_concat, + diffusion_params.y, + output, + output_ctx); + } +}; + struct WanModel : public DiffusionModel { std::string prefix; WAN::WanRunner wan; diff --git a/src/model.cpp b/src/model.cpp index 58d71d9e4..24e3c5668 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1057,6 +1057,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } + if (tensor_storage.name.find("model.diffusion_model.net.llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) { + return VERSION_ANIMA; + } if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { is_flux2 = true; } diff --git a/src/model.h b/src/model.h index 0d7a85c6e..5b9ce18ab 100644 --- a/src/model.h +++ b/src/model.h @@ -45,6 +45,7 @@ enum SDVersion { VERSION_WAN2_2_I2V, VERSION_WAN2_2_TI2V, VERSION_QWEN_IMAGE, + VERSION_ANIMA, VERSION_FLUX2, VERSION_FLUX2_KLEIN, VERSION_Z_IMAGE, @@ -122,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) { return false; } +static inline bool sd_version_is_anima(SDVersion version) { + if (version == VERSION_ANIMA) { + return true; + } + return false; +} + static inline bool sd_version_is_z_image(SDVersion version) { if (version == VERSION_Z_IMAGE) { return true; @@ -146,6 +154,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_z_image(version)) { return true; } diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index d3e863b8a..a7839ab9e 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -1094,6 +1094,14 @@ std::string convert_tensor_name(std::string name, SDVersion version) { } } + if (is_lora && sd_version_is_anima(version)) { + static const std::string anima_diffusion_prefix = "model.diffusion_model."; + static const std::string anima_net_prefix = "model.diffusion_model.net."; + if (starts_with(name, anima_diffusion_prefix) && !starts_with(name, anima_net_prefix)) { + name = anima_net_prefix + name.substr(anima_diffusion_prefix.size()); + } + } + // cond_stage_model { for (const auto& prefix : cond_stage_model_prefix_vec) { diff --git a/src/rope.hpp b/src/rope.hpp index 45e88c831..8b55c9e20 100644 --- a/src/rope.hpp +++ b/src/rope.hpp @@ -43,7 +43,7 @@ namespace Rope { __STATIC_INLINE__ std::vector> rope(const std::vector& pos, int dim, - int theta, + float theta, const std::vector& axis_wrap_dims = {}) { assert(dim % 2 == 0); int half_dim = dim / 2; @@ -167,7 +167,7 @@ namespace Rope { __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, int bs, - int theta, + const std::vector& axis_thetas, const std::vector& axes_dim, const std::vector>& wrap_dims = {}) { std::vector> trans_ids = transpose(ids); @@ -188,8 +188,12 @@ namespace Rope { if (!wrap_dims.empty() && i < (int)wrap_dims.size()) { axis_wrap_dims = wrap_dims[i]; } + float axis_theta = 10000.0f; + if (!axis_thetas.empty()) { + axis_theta = axis_thetas[std::min(i, axis_thetas.size() - 1)]; + } std::vector> rope_emb = - rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] + rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] for (int b = 0; b < bs; ++b) { for (int j = 0; j < pos_len; ++j) { for (int k = 0; k < rope_emb[0].size(); ++k) { @@ -203,6 +207,15 @@ namespace Rope { return flatten(emb); } + __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, + int bs, + float theta, + const std::vector& axes_dim, + const std::vector>& wrap_dims = {}) { + std::vector axis_thetas(axes_dim.size(), theta); + return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims); + } + __STATIC_INLINE__ std::vector> gen_refs_ids(int patch_size, int bs, int axes_dim_num, diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 94221d8e6..60397473b 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -48,6 +48,7 @@ const char* model_version_to_str[] = { "Wan 2.2 I2V", "Wan 2.2 TI2V", "Qwen Image", + "Anima", "Flux.2", "Flux.2 klein", "Z-Image", @@ -404,6 +405,7 @@ class StableDiffusionGGML { shift_factor = 0.1159f; } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_flux2(version)) { scale_factor = 1.0f; shift_factor = 0.f; @@ -534,6 +536,14 @@ class StableDiffusionGGML { "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); + } else if (sd_version_is_anima(version)) { + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -596,7 +606,7 @@ class StableDiffusionGGML { } if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { first_stage_model = std::make_shared(vae_backend, offload_params_to_cpu, tensor_storage_map, @@ -634,7 +644,7 @@ class StableDiffusionGGML { } } if (use_tiny_autoencoder || version == VERSION_SDXS) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { tae_first_stage = std::make_shared(vae_backend, offload_params_to_cpu, tensor_storage_map, @@ -904,6 +914,7 @@ class StableDiffusionGGML { } else if (sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_z_image(version)) { pred_type = FLOW_PRED; if (flow_shift == INFINITY) { @@ -1506,7 +1517,7 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; - } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { latent_rgb_proj = wan_21_latent_rgb_proj; latent_rgb_bias = wan_21_latent_rgb_bias; } else { @@ -1987,6 +1998,9 @@ class StableDiffusionGGML { shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); timesteps_vec.assign(1, (float)shifted_t); + } else if (sd_version_is_anima(version)) { + // Anima uses normalized flow timesteps. + timesteps_vec.assign(1, t / static_cast(TIMESTEPS)); } else if (sd_version_is_z_image(version)) { timesteps_vec.assign(1, 1000.f - t); } else { @@ -2398,7 +2412,7 @@ class StableDiffusionGGML { } void process_latent_in(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; std::vector latents_std_vec; @@ -2437,7 +2451,7 @@ class StableDiffusionGGML { } void process_latent_out(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; std::vector latents_std_vec; @@ -2515,7 +2529,7 @@ class StableDiffusionGGML { // TODO wan2.2 vae support? int64_t ne2; int64_t ne3; - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { ne2 = 1; ne3 = C * x->ne[3]; } else { @@ -2533,7 +2547,7 @@ class StableDiffusionGGML { result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); } - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); } @@ -2606,6 +2620,7 @@ class StableDiffusionGGML { ggml_tensor* latent; if (use_tiny_autoencoder || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_wan(version) || sd_version_is_flux2(version) || version == VERSION_CHROMA_RADIANCE) { @@ -2625,7 +2640,7 @@ class StableDiffusionGGML { if (!use_tiny_autoencoder) { process_latent_in(latent); } - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1); } return latent; @@ -2663,7 +2678,7 @@ class StableDiffusionGGML { } int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); } process_latent_out(x);