
Commit 19fb249

Address comments
Signed-off-by: Kunjan Patel <kunjanp@google.com>
1 parent 5498223 commit 19fb249

File tree

3 files changed, +15 -4 lines changed


src/maxdiffusion/configs/base_2_base.yml

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,9 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash
+mask_padding_tokens: True # Whether to mask padding tokens in the attention computation.
+attention_sharding_uniform: True # Apply the same sequence sharding rules to q in both self-attention and cross-attention.
+
 flash_block_sizes: {}
 # to override default block sizes for flash attention
 # flash_block_sizes:
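
To make the new flag concrete, here is a minimal JAX sketch of padding-token masking via segment ids. This illustrates the idea only, not maxdiffusion's splash-attention code; the helper names and the (batch, seq, dim) shapes are assumptions. Real tokens get segment id 1 and padding gets 0, and attention scores are kept only where the query and key are both real tokens of the same segment.

import jax
import jax.numpy as jnp

def segment_ids_from_lengths(lengths, max_len):
    # lengths: (batch,) count of real tokens per example; the rest is padding.
    positions = jnp.arange(max_len)[None, :]                  # (1, max_len)
    return (positions < lengths[:, None]).astype(jnp.int32)   # (batch, max_len): 1 = real, 0 = pad

def masked_attention(q, k, v, q_seg, kv_seg):
    # q: (batch, q_len, dim), k/v: (batch, kv_len, dim)
    scores = jnp.einsum("bqd,bkd->bqk", q, k) / jnp.sqrt(q.shape[-1])
    # Keep a score only when query and key share a nonzero segment id.
    keep = (q_seg[:, :, None] == kv_seg[:, None, :]) & (kv_seg[:, None, :] > 0)
    scores = jnp.where(keep, scores, -1e9)
    return jnp.einsum("bqk,bkd->bqd", jax.nn.softmax(scores, axis=-1), v)

With mask_padding_tokens set to False, the kernel skips building this mask entirely, which is the faster but potentially lower-quality option described in the base_wan_14b.yml comments below.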

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 11 additions & 3 deletions

@@ -61,8 +61,16 @@ from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
 flash_min_seq_length: 0
-mask_padding_tokens: True # Whether to mask padding tokens in attention computation.
-attention_sharding_uniform: True # same sequence sharding rules applied for q in both (self and cross attention)
+
+# If mask_padding_tokens is True, we pass segment ids to splash attention so that queries do not attend to padding tokens.
+# Otherwise we do not pass segment ids, which is faster on VPU-bound hardware such as Ironwood.
+# However, when padding tokens make up a significant share of the sequence, skipping the mask hurts quality, so this should stay True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self-attention and cross-attention.
+# 2. attention_sharding_uniform = False : heads are sharded across devices for self-attention, while the sequence is sharded
+#    for the cross-attention q.
+attention_sharding_uniform: True
 dropout: 0.1

 flash_block_sizes: {
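
As a rough illustration of the two strategies described above (a sketch with assumed mesh-axis names and an assumed (batch, heads, seq, head_dim) layout for q, not maxdiffusion's actual sharding annotations), the difference is which axis of q carries the sharded mesh dimension:

from jax.sharding import PartitionSpec as P

# q laid out as (batch, heads, seq, head_dim); 'data' and 'fsdp' are assumed mesh axes.

# attention_sharding_uniform = True: the sequence axis of q is sharded the same
# way for self-attention and cross-attention.
uniform_self_q  = P("data", None, "fsdp", None)
uniform_cross_q = P("data", None, "fsdp", None)

# attention_sharding_uniform = False: shard heads for self-attention, but keep
# sequence sharding for the cross-attention q.
nonuniform_self_q  = P("data", "fsdp", None, None)
nonuniform_cross_q = P("data", None, "fsdp", None)
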
@@ -168,7 +176,7 @@ logical_axis_rules: [
 ['norm', 'tensor'],
 ['conv_batch', ['data','fsdp']],
 ['out_channels', 'tensor'],
-['conv_in', 'fsdp'],
+#['conv_in', 'fsdp'],
 ['conv_out', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'tensor']]
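
For reference, each entry in logical_axis_rules maps a logical axis name to mesh axes, and a name with no matching rule is left unsharded, i.e. replicated. A toy version of that lookup (not flax's actual rule-resolution API; the function and variable names are illustrative) shows the effect of commenting out ['conv_in', 'fsdp']:

from jax.sharding import PartitionSpec as P

def spec_from_rules(logical_axes, rules):
    # Toy rule resolution: look up each logical axis name; unmatched names stay None (replicated).
    lookup = {name: mesh_axes for name, mesh_axes in rules}
    return P(*(lookup.get(name) for name in logical_axes))

rules = [
    ("conv_batch", ("data", "fsdp")),
    ("out_channels", "tensor"),
    # ("conv_in", "fsdp"),   # commented out, as in this commit
    ("conv_out", "fsdp"),
]

print(spec_from_rules(("conv_in", "conv_out"), rules))
# -> PartitionSpec(None, 'fsdp'): conv_in weights are now replicated rather than fsdp-sharded.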

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 1 addition & 1 deletion

@@ -587,7 +587,7 @@ def __call__(
 width=width,
 num_frames=num_frames,
 num_channels_latents=num_channel_latents,
-) # # fusion.18
+)

 data_sharding = NamedSharding(self.mesh, P())
 # Using global_batch_size_to_train_on so not to create more config variables
