Fix recompilation caused by a sharding mismatch in the Qwen2.5-vl-7b integration test (#1117)

kwang3939 · web-flow · commit 292d310a4334 · 2025-11-17T13:24:46.000-08:00
Signed-off-by: Kewei Wang <keweiwang@google.com>
diff --git a/tpu_inference/models/common/model_loader.py b/tpu_inference/models/common/model_loader.py
@@ -242,10 +242,11 @@ def run_get_multimodal_embeddings(graphdef, state, image_grid_thw,
         model = nnx.merge(graphdef, state)
         return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
 
+    embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
     @functools.partial(
         jax.jit,
-        out_shardings=(logits_sharding),
+        out_shardings=(embed_sharding),
     )
     def run_get_input_embeddings(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
diff --git a/tpu_inference/runner/compilation_manager.py b/tpu_inference/runner/compilation_manager.py
@@ -332,13 +332,15 @@ def _precompile_select_from_array(self) -> None:
             index_paddings = self.runner.num_reqs_paddings
         dp_sharding = NamedSharding(self.runner.mesh,
                                     PartitionSpec(ShardingAxisName.ATTN_DATA))
+        hidden_states_sharding = NamedSharding(
+            self.runner.mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, None))
         dp_size = self.runner.vllm_config.sharding_config.total_dp_size
         self._precompile_select_from_array_helper(
             name="select all logits",
             source_paddings=self.runner.num_tokens_paddings,
             indices_paddings=index_paddings,
             hidden_dim=hsize,
-            input_sharding=dp_sharding,
+            input_sharding=hidden_states_sharding,
             indices_sharding=dp_sharding if dp_size > 1 else None,
         )