Skip to content

Commit 52a97b3

Browse files
authored
feat: add vace support (#819)
* add wan vace t2v support * add --vace-strength option * add vace i2v support * fix the processing of vace_context * add vace v2v support * update docs
1 parent 2c9b1e2 commit 52a97b3

16 files changed

+652
-286
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,9 @@ arguments:
313313
-i, --end-img [IMAGE] path to the end image, required by flf2v
314314
--control-image [IMAGE] path to image condition, control net
315315
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
316+
--control-video [PATH] path to the control video frames. It must be a directory path.
317+
The video frames inside should be stored as images in lexicographical (character) order
318+
For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, … etc.
316319
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
317320
-o, --output OUTPUT path to write result image to (default: ./output.png)
318321
-p, --prompt [PROMPT] the prompt to render
@@ -379,6 +382,7 @@ arguments:
379382
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
380383
only enabled if `--high-noise-steps` is set to -1
381384
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
385+
--vace-strength STRENGTH strength of the Wan VACE context
382386
-v, --verbose print extra info
383387
```
384388
158 KB
Binary file not shown.
297 KB
Binary file not shown.
287 KB
Binary file not shown.

assets/wan/Wan2.1_14B_vace_r2v.mp4

152 KB
Binary file not shown.

assets/wan/Wan2.1_14B_vace_t2v.mp4

176 KB
Binary file not shown.

assets/wan/Wan2.1_14B_vace_v2v.mp4

347 KB
Binary file not shown.

diffusion_model.hpp

Lines changed: 72 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,29 @@
66
#include "unet.hpp"
77
#include "wan.hpp"
88

9+
struct DiffusionParams {
10+
struct ggml_tensor* x = NULL;
11+
struct ggml_tensor* timesteps = NULL;
12+
struct ggml_tensor* context = NULL;
13+
struct ggml_tensor* c_concat = NULL;
14+
struct ggml_tensor* y = NULL;
15+
struct ggml_tensor* guidance = NULL;
16+
std::vector<ggml_tensor*> ref_latents = {};
17+
bool increase_ref_index = false;
18+
int num_video_frames = -1;
19+
std::vector<struct ggml_tensor*> controls = {};
20+
float control_strength = 0.f;
21+
struct ggml_tensor* vace_context = NULL;
22+
float vace_strength = 1.f;
23+
std::vector<int> skip_layers = {};
24+
};
25+
926
struct DiffusionModel {
1027
virtual std::string get_desc() = 0;
1128
virtual void compute(int n_threads,
12-
struct ggml_tensor* x,
13-
struct ggml_tensor* timesteps,
14-
struct ggml_tensor* context,
15-
struct ggml_tensor* c_concat,
16-
struct ggml_tensor* y,
17-
struct ggml_tensor* guidance,
18-
std::vector<ggml_tensor*> ref_latents = {},
19-
bool increase_ref_index = false,
20-
int num_video_frames = -1,
21-
std::vector<struct ggml_tensor*> controls = {},
22-
float control_strength = 0.f,
23-
struct ggml_tensor** output = NULL,
24-
struct ggml_context* output_ctx = NULL,
25-
std::vector<int> skip_layers = std::vector<int>()) = 0;
29+
DiffusionParams diffusion_params,
30+
struct ggml_tensor** output = NULL,
31+
struct ggml_context* output_ctx = NULL) = 0;
2632
virtual void alloc_params_buffer() = 0;
2733
virtual void free_params_buffer() = 0;
2834
virtual void free_compute_buffer() = 0;
@@ -71,22 +77,18 @@ struct UNetModel : public DiffusionModel {
7177
}
7278

7379
void compute(int n_threads,
74-
struct ggml_tensor* x,
75-
struct ggml_tensor* timesteps,
76-
struct ggml_tensor* context,
77-
struct ggml_tensor* c_concat,
78-
struct ggml_tensor* y,
79-
struct ggml_tensor* guidance,
80-
std::vector<ggml_tensor*> ref_latents = {},
81-
bool increase_ref_index = false,
82-
int num_video_frames = -1,
83-
std::vector<struct ggml_tensor*> controls = {},
84-
float control_strength = 0.f,
85-
struct ggml_tensor** output = NULL,
86-
struct ggml_context* output_ctx = NULL,
87-
std::vector<int> skip_layers = std::vector<int>()) {
88-
(void)skip_layers; // SLG doesn't work with UNet models
89-
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
80+
DiffusionParams diffusion_params,
81+
struct ggml_tensor** output = NULL,
82+
struct ggml_context* output_ctx = NULL) {
83+
return unet.compute(n_threads,
84+
diffusion_params.x,
85+
diffusion_params.timesteps,
86+
diffusion_params.context,
87+
diffusion_params.c_concat,
88+
diffusion_params.y,
89+
diffusion_params.num_video_frames,
90+
diffusion_params.controls,
91+
diffusion_params.control_strength, output, output_ctx);
9092
}
9193
};
9294

@@ -129,21 +131,17 @@ struct MMDiTModel : public DiffusionModel {
129131
}
130132

131133
void compute(int n_threads,
132-
struct ggml_tensor* x,
133-
struct ggml_tensor* timesteps,
134-
struct ggml_tensor* context,
135-
struct ggml_tensor* c_concat,
136-
struct ggml_tensor* y,
137-
struct ggml_tensor* guidance,
138-
std::vector<ggml_tensor*> ref_latents = {},
139-
bool increase_ref_index = false,
140-
int num_video_frames = -1,
141-
std::vector<struct ggml_tensor*> controls = {},
142-
float control_strength = 0.f,
143-
struct ggml_tensor** output = NULL,
144-
struct ggml_context* output_ctx = NULL,
145-
std::vector<int> skip_layers = std::vector<int>()) {
146-
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
134+
DiffusionParams diffusion_params,
135+
struct ggml_tensor** output = NULL,
136+
struct ggml_context* output_ctx = NULL) {
137+
return mmdit.compute(n_threads,
138+
diffusion_params.x,
139+
diffusion_params.timesteps,
140+
diffusion_params.context,
141+
diffusion_params.y,
142+
output,
143+
output_ctx,
144+
diffusion_params.skip_layers);
147145
}
148146
};
149147

@@ -188,21 +186,21 @@ struct FluxModel : public DiffusionModel {
188186
}
189187

190188
void compute(int n_threads,
191-
struct ggml_tensor* x,
192-
struct ggml_tensor* timesteps,
193-
struct ggml_tensor* context,
194-
struct ggml_tensor* c_concat,
195-
struct ggml_tensor* y,
196-
struct ggml_tensor* guidance,
197-
std::vector<ggml_tensor*> ref_latents = {},
198-
bool increase_ref_index = false,
199-
int num_video_frames = -1,
200-
std::vector<struct ggml_tensor*> controls = {},
201-
float control_strength = 0.f,
202-
struct ggml_tensor** output = NULL,
203-
struct ggml_context* output_ctx = NULL,
204-
std::vector<int> skip_layers = std::vector<int>()) {
205-
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers);
189+
DiffusionParams diffusion_params,
190+
struct ggml_tensor** output = NULL,
191+
struct ggml_context* output_ctx = NULL) {
192+
return flux.compute(n_threads,
193+
diffusion_params.x,
194+
diffusion_params.timesteps,
195+
diffusion_params.context,
196+
diffusion_params.c_concat,
197+
diffusion_params.y,
198+
diffusion_params.guidance,
199+
diffusion_params.ref_latents,
200+
diffusion_params.increase_ref_index,
201+
output,
202+
output_ctx,
203+
diffusion_params.skip_layers);
206204
}
207205
};
208206

@@ -248,21 +246,20 @@ struct WanModel : public DiffusionModel {
248246
}
249247

250248
void compute(int n_threads,
251-
struct ggml_tensor* x,
252-
struct ggml_tensor* timesteps,
253-
struct ggml_tensor* context,
254-
struct ggml_tensor* c_concat,
255-
struct ggml_tensor* y,
256-
struct ggml_tensor* guidance,
257-
std::vector<ggml_tensor*> ref_latents = {},
258-
bool increase_ref_index = false,
259-
int num_video_frames = -1,
260-
std::vector<struct ggml_tensor*> controls = {},
261-
float control_strength = 0.f,
262-
struct ggml_tensor** output = NULL,
263-
struct ggml_context* output_ctx = NULL,
264-
std::vector<int> skip_layers = std::vector<int>()) {
265-
return wan.compute(n_threads, x, timesteps, context, y, c_concat, NULL, output, output_ctx);
249+
DiffusionParams diffusion_params,
250+
struct ggml_tensor** output = NULL,
251+
struct ggml_context* output_ctx = NULL) {
252+
return wan.compute(n_threads,
253+
diffusion_params.x,
254+
diffusion_params.timesteps,
255+
diffusion_params.context,
256+
diffusion_params.y,
257+
diffusion_params.c_concat,
258+
NULL,
259+
diffusion_params.vace_context,
260+
diffusion_params.vace_strength,
261+
output,
262+
output_ctx);
266263
}
267264
};
268265

docs/wan.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
- Wan2.1 FLF2V 14B 720P
1919
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
2020
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
21+
- Wan2.1 VACE 1.3B
22+
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
23+
- gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
24+
- Wan2.1 VACE 14B
25+
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
26+
- gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
2127
- Wan2.2
2228
- Wan2.2 TI2V 5B
2329
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
@@ -137,3 +143,62 @@
137143
```
138144

139145
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
146+
147+
### Wan2.1 VACE 1.3B
148+
149+
#### T2V
150+
151+
```
152+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
153+
```
154+
155+
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
156+
157+
158+
#### R2V
159+
160+
```
161+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
162+
```
163+
164+
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
165+
166+
167+
#### V2V
168+
169+
```
170+
mkdir post+depth
171+
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
172+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
173+
```
174+
175+
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
176+
177+
### Wan2.1 VACE 14B
178+
179+
#### T2V
180+
181+
```
182+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
183+
```
184+
185+
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
186+
187+
188+
#### R2V
189+
190+
```
191+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
192+
```
193+
194+
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
195+
196+
197+
198+
#### V2V
199+
200+
```
201+
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
202+
```
203+
204+
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>

0 commit comments

Comments
 (0)