Commit a5d52a7

[Misc] Change default device for vllm_get_model (#1116)
Parent: 74f70aa
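
This commit moves the jax.default_device scope out of the FusedMoE weight-processing path and up to the model-load call site, so arrays materialized while vllm_get_model builds the model already default to the first device of the mesh. Below is a minimal, self-contained sketch of the JAX mechanism being relied on (illustrative only; the device choice and array shape are not from this repository):

import jax
import jax.numpy as jnp

# Pick a device explicitly; in this commit it is self.mesh.devices.flatten()[0].
first_device = jax.devices()[0]

with jax.default_device(first_device):
    # Arrays created in this scope default to first_device instead of the
    # process-wide default; they can still be resharded afterwards with
    # jax.device_put, as process_weights_after_loading does for the MoE weights.
    w = jnp.zeros((8, 128))

print(w.devices())  # {first_device}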

2 files changed: +95 -106 lines

tpu_inference/layers/vllm/quantization/unquantized.py

Lines changed: 93 additions & 105 deletions
@@ -191,131 +191,119 @@ def select_gemm_impl(

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        assert isinstance(layer, FusedMoE)
-        available_devices = self.mesh.devices.flatten()
-        with jax.default_device(available_devices[0]):
-            w13_weight = t2j(layer.w13_weight, use_dlpack=False)
-            w2_weight = t2j(layer.w2_weight, use_dlpack=False)
+        w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+        w2_weight = t2j(layer.w2_weight, use_dlpack=False)

-            if self.moe.has_bias:
-                w13_bias = t2j(layer.w13_bias, use_dlpack=False)
-                w2_bias = t2j(layer.w2_bias, use_dlpack=False)
-
-            if layer.activation == "swigluoai":
-                # When using swigluoai, vLLM splits gmm output in a interleaved way.
-                # However, interleaved split is not performant on TPU. Therefore,
-                # we preprocess the weight so that splitting gmm output by middle
-                # can still get the same result.
-                w1_weight = w13_weight[:, ::2, :]
-                w3_weight = w13_weight[:, 1::2, :]
-                w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+        if self.moe.has_bias:
+            w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+            w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+        if layer.activation == "swigluoai":
+            # When using swigluoai, vLLM splits gmm output in a interleaved way.
+            # However, interleaved split is not performant on TPU. Therefore,
+            # we preprocess the weight so that splitting gmm output by middle
+            # can still get the same result.
+            w1_weight = w13_weight[:, ::2, :]
+            w3_weight = w13_weight[:, 1::2, :]
+            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)

-                if self.moe.has_bias:
-                    w1_bias = w13_bias[:, ::2]
-                    w3_bias = w13_bias[:, 1::2]
-                    w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
-
-            if self.use_kernel and layer.use_ep:
-                # Kernel expects:
-                # w13: (num_experts, 2, hidden_size, intermediate_size)
-                # w2: (num_experts, intermediate_size, hidden_size)
-                # Current format:
-                # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
-                # w2_weight: (num_experts, hidden_size, intermediate_size)
-                num_experts = w13_weight.shape[0]
-                intermediate_size = w13_weight.shape[1] // 2
-                hidden_size = w13_weight.shape[2]
+            if self.moe.has_bias:
+                w1_bias = w13_bias[:, ::2]
+                w3_bias = w13_bias[:, 1::2]
+                w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)

-                # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
-                w13_reshaped = w13_weight.reshape(num_experts, 2,
-                                                  intermediate_size,
-                                                  hidden_size)
-                w13_weight_transposed = jnp.transpose(w13_reshaped,
-                                                      (0, 1, 3, 2))
+        if self.use_kernel and layer.use_ep:
+            # Kernel expects:
+            # w13: (num_experts, 2, hidden_size, intermediate_size)
+            # w2: (num_experts, intermediate_size, hidden_size)
+            # Current format:
+            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+            # w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
+
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
+            w13_reshaped = w13_weight.reshape(num_experts, 2,
+                                              intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+
+            # Apply EP sharding
+            w13_weight = jax.device_put(
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
+            w2_weight = jax.device_put(
+                w2_weight_transposed,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))

-                # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
-                w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+            if self.moe.has_bias:
+                w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)

                # Apply EP sharding
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+
+        else:
+            # Original logic for non-kernel path
+            if layer.use_ep:
                w13_weight = jax.device_put(
-                    w13_weight_transposed,
-                    Format(
-                        Layout((0, 1, 2, 3)),
-                        NamedSharding(self.mesh, P("model", None, None,
-                                                   None))))
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
                w2_weight = jax.device_put(
-                    w2_weight_transposed,
+                    w2_weight,
                    Format(Layout((0, 1, 2)),
                           NamedSharding(self.mesh, P("model", None, None))))

                if self.moe.has_bias:
-                    w13_bias = w13_bias.reshape(num_experts, 2,
-                                                intermediate_size)
-
-                    # Apply EP sharding
                    w13_bias = jax.device_put(
                        w13_bias,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
                    w2_bias = jax.device_put(
                        w2_bias,
                        Format(Layout((0, 1)),
                               NamedSharding(self.mesh, P("model", None))))

            else:
-                # Original logic for non-kernel path
-                if layer.use_ep:
-                    w13_weight = jax.device_put(
-                        w13_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
-                    w2_weight = jax.device_put(
-                        w2_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
-
-                    if self.moe.has_bias:
-                        w13_bias = jax.device_put(
-                            w13_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P("model", None))))
-                        w2_bias = jax.device_put(
-                            w2_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P("model", None))))
-
-                else:
-                    intermediate_size = w13_weight.shape[1] // 2
-                    assert intermediate_size == w2_weight.shape[-1]
-                    output_sizes = [intermediate_size, intermediate_size]
-                    n_shards = self.mesh.shape["model"]
-                    assert intermediate_size % n_shards == 0
-                    w13_weight = reorder_concatenated_tensor_for_sharding(
-                        w13_weight, output_sizes, n_shards, dim=1)
-                    w13_weight = jax.device_put(
-                        w13_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P(None, "model", None))))
-                    w2_weight = jax.device_put(
-                        w2_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P(None, None, "model"))))
-
-                    if self.moe.has_bias:
-                        w13_bias = reorder_concatenated_tensor_for_sharding(
-                            w13_bias, output_sizes, n_shards, dim=1)
-                        w13_bias = jax.device_put(
-                            w13_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P(None, "model"))))
-                        w2_bias = jax.device_put(
-                            w2_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P(None, None))))
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
+                output_sizes = [intermediate_size, intermediate_size]
+                n_shards = self.mesh.shape["model"]
+                assert intermediate_size % n_shards == 0
+                w13_weight = reorder_concatenated_tensor_for_sharding(
+                    w13_weight, output_sizes, n_shards, dim=1)
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, "model", None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, None, "model"))))
+
+                if self.moe.has_bias:
+                    w13_bias = reorder_concatenated_tensor_for_sharding(
+                        w13_bias, output_sizes, n_shards, dim=1)
+                    w13_bias = jax.device_put(
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, "model"))))
+                    w2_bias = jax.device_put(
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, None))))

        layer.w13_weight = Parameter(torch_view(w13_weight),
                                     requires_grad=False)
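
Most of the hunk above is re-indentation: with the jax.default_device wrapper removed, the body of process_weights_after_loading moves out one level while the bias handling and sharding logic stay the same. A condensed view of the net change (context elided in comments):

# Before: weight conversion ran inside a per-call default_device scope.
available_devices = self.mesh.devices.flatten()
with jax.default_device(available_devices[0]):
    w13_weight = t2j(layer.w13_weight, use_dlpack=False)
    w2_weight = t2j(layer.w2_weight, use_dlpack=False)
    # ... bias handling and sharding, one level deeper

# After: the caller sets the default device (see vllm_model_wrapper.py below),
# and the same body runs one indentation level shallower.
w13_weight = t2j(layer.w13_weight, use_dlpack=False)
w2_weight = t2j(layer.w2_weight, use_dlpack=False)
# ... identical bias handling and sharding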

tpu_inference/models/vllm/vllm_model_wrapper.py

Lines changed: 2 additions & 1 deletion
@@ -120,7 +120,8 @@ def load_weights(self):

        # Load the vLLM model and wrap it into a new model whose forward
        # function can calculate the hidden_state and logits.
-        with load_context:
+        available_devices = self.mesh.devices.flatten()
+        with load_context, jax.default_device(available_devices[0]):
            vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
        lora_manager = None
        if vllm_config_for_load.lora_config is not None:
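
The combined with statement added here enters both context managers left to right, so vllm_get_model runs with the load context active and with the first mesh device as the default placement for newly created arrays. An equivalent nested form, shown only to spell out the Python semantics (not code from the repository):

available_devices = self.mesh.devices.flatten()
with load_context:
    with jax.default_device(available_devices[0]):
        vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)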

0 commit comments
