
Commit 910f6e3

bzgoogle authored and committed
local change to support 2d TP for DeepSeek
1 parent aed9b57 commit 910f6e3

8 files changed: +99 −83 lines changed


tpu_commons/models/jax/common/moe/deepseek_moe.py

Lines changed: 62 additions & 48 deletions
@@ -1,22 +1,18 @@
 import enum
 from dataclasses import InitVar, dataclass
 from functools import partial
-from typing import Optional, Tuple
+from typing import Tuple

 import jax
 import jax.numpy as jnp
 from flax import nnx
 from flax.typing import Sharding
 from jax.sharding import PartitionSpec
 from jaxtyping import Float
-from qwix._src.core.ragged_dot import ragged_dot as qwix_ragged_dot
-from qwix._src.providers import ptq

 from tpu_commons.models.jax.common.base import create_param
 from tpu_commons.models.jax.common.layers import FlaxUtils
 from tpu_commons.models.jax.common.moe.moe import MoE
-from tpu_commons.models.jax.utils.quantization.quantization_utils import (
-    manually_quantize_qwix_activation, manually_quantize_qwix_weight)

 modeling_flax_utils = FlaxUtils()

@@ -140,19 +136,43 @@ class SparseMoE(MoE):
     # TODO: determine if we get it from external or extrat it in MoE class
     is_batch_sharded_by_expert: True if batch is sharded over 'expert' dim.
     """
+    def_sharding: Sharding
+    fed_sharding: Sharding
     num_experts_per_tok: int
     #TODO: tile size is (tile_batch_seq, tile_activation_dim, tile_weight_dim,) from MaxText
     tile_size: tuple[int, int, int] = (128, 64, 128)
     use_megablox: bool = False
     mesh: jax.sharding.Mesh
-    # This should be set if and only if you have quantized your model (via Qwix)
-    quantized_dtype: Optional[jnp.dtype] = None

     def __post_init__(self, rngs: nnx.Rngs):
-        super().__post_init__(rngs)
+
+        D = self.hidden_size
+        F = self.intermediate_size_moe
+        # shape_gating = (D, self.num_local_experts, F)
+        # shape_up = (D, self.num_local_experts, F)
+        # shape_down = (F, self.num_local_experts,D)
+        shape_gating = (self.num_local_experts, D, F)
+        shape_up = (self.num_local_experts, D, F)
+        shape_down = (self.num_local_experts, F, D)
+
+        self.kernel_gating_DEF = create_param(rngs,
+                                              shape=shape_gating,
+                                              dtype=self.dtype,
+                                              sharding=self.def_sharding,
+                                              random_init=self.random_init)
+        self.kernel_up_proj_DEF = create_param(rngs,
+                                               shape=shape_up,
+                                               dtype=self.dtype,
+                                               sharding=self.def_sharding,
+                                               random_init=self.random_init)
+        self.kernel_down_proj_FED = create_param(rngs,
+                                                 shape=shape_down,
+                                                 dtype=self.dtype,
+                                                 sharding=self.fed_sharding,
+                                                 random_init=self.random_init)

         # Derive the expert sharding
-        self.expert_axis_name = self.edf_sharding[0]
+        self.expert_axis_name = self.def_sharding[0]
         if self.expert_axis_name is None:
             self.num_expert_parallelism = 1
         else:
@@ -329,20 +349,29 @@ def _unpermute(self, processed_tokens: jax.Array, sort_indices: jax.Array,
         with jax.named_scope("unpermute"):
             unsorted_tokens_tD = self._sort_activations(
                 processed_tokens, jnp.argsort(sort_indices))
+            D = unsorted_tokens_tD.shape[1]
             reshaped_tokens_TXD = unsorted_tokens_tD.reshape(
-                -1, self.num_experts_per_tok, self.hidden_size)
+                -1, self.num_experts_per_tok, D)
+            # jax.debug.print(
+            #     "✅ reshaped_tokens_TXD on device: reshaped_tokens_TXD[5]={t}",
+            #     t=reshaped_tokens_TXD[5, 0,:5]
+            # )
+            # jax.debug.print(
+            #     "✅ router_weights_TX on device: router_weights_TX={t}",
+            #     t=router_weights_TX[5, :]
+            # )
         with jax.named_scope("combine_weights"):
             output_TD = jnp.einsum(
                 "TXD,TX -> TD",
-                reshaped_tokens_TXD.astype(jnp.float32),
-                router_weights_TX.astype(jnp.float32),
-                precision='float32',
+                reshaped_tokens_TXD.astype(self.dtype),
+                router_weights_TX.astype(self.dtype),
             )

         return output_TD.astype(self.dtype)

     def _gmm(self, inputs, kernel, group_sizes):
         """Performs Grouped Matrix Multiply."""
+        jax.config.update("jax_ragged_dot_use_ragged_dot_instruction", True)
         num_rows = inputs.shape[0]
         pad_amount = (self.tile_size[0] -
                       num_rows % self.tile_size[0]) % self.tile_size[0]
@@ -354,11 +383,8 @@ def _gmm(self, inputs, kernel, group_sizes):
             raise NotImplementedError(
                 "MegaBlox kernel call is not implemented.")
         else:
-            inputs = manually_quantize_qwix_activation(
-                inputs, "ragged_dot", jnp.float8_e4m3fn, [0], {},
-                "absmax") if self.quantized_dtype else inputs
-            ragged_dot_func = qwix_ragged_dot if self.quantized_dtype else jax.lax.ragged_dot
-            output = ragged_dot_func(
+
+            output = jax.lax.ragged_dot(
                 lhs=inputs,
                 rhs=kernel,
                 group_sizes=group_sizes,
@@ -394,10 +420,12 @@ def _distributed_sparse_moe_fwd(

         # TODO: update to 'expert' after we enable expert parallelism, currently experts are sharded along model axis
         # or we sould derive it from the model init
-        expert_shard_id = jax.lax.axis_index(self.expert_axis_name)
+
         local_expert_size = self.num_local_experts // self.num_expert_parallelism

-        if self.num_expert_parallelism > 1:
+        #if self.num_expert_parallelism > 1:
+        if self.expert_axis_name:
+            expert_shard_id = jax.lax.axis_index(self.expert_axis_name)
             if self.is_batch_sharded_by_expert:
                 # When token sharded in devices
                 # In this path, we assume the data(tokens) are fully sharded on expert, namely data_axis_name == expert_axis_name
@@ -508,8 +536,9 @@ def _distributed_sparse_moe_fwd(
         # 5. Return Results (All-to-All)
         if self.num_expert_parallelism > 1:
             local_total_assignments = x_TD.shape[0] * self.num_experts_per_tok
+            D = x_TD.shape[1]
             output_shape = jnp.zeros(
-                (local_total_assignments, self.hidden_size),
+                (local_total_assignments, D),
                 dtype=intermediate_output.dtype)

             if self.is_batch_sharded_by_expert:
@@ -568,10 +597,10 @@ def __call__(self, x_TD: Float):
             PartitionSpec(*self.activation_ffw_td),  # Sharded x_TD
             PartitionSpec(),  # Replicated router_weights_TX
             PartitionSpec(),  # Replicated selected_experts_TX
-            PartitionSpec(*self.edf_sharding),  # Sharded gating kernel
-            PartitionSpec(*self.edf_sharding),  # Sharded up-projection kernel
+            PartitionSpec(*self.def_sharding),  # Sharded gating kernel
+            PartitionSpec(*self.def_sharding),  # Sharded up-projection kernel
             PartitionSpec(
-                *self.efd_sharding),  # Sharded down-projection kernel
+                *self.fed_sharding),  # Sharded down-projection kernel
         )
         out_specs = PartitionSpec(*self.activation_ffw_td)

@@ -582,27 +611,12 @@ def __call__(self, x_TD: Float):
             check_rep=False)(
                 SparseMoE._distributed_sparse_moe_fwd)

-        kernel_gating_EDF = self.kernel_gating_EDF.value
-        kernel_up_proj_EDF = self.kernel_up_proj_EDF.value
-        kernel_down_proj_EFD = self.kernel_down_proj_EFD.value
-
-        if self.quantized_dtype:
-            if not isinstance(kernel_gating_EDF, ptq.WithAux):
-                kernel_gating_EDF = manually_quantize_qwix_weight(
-                    kernel_gating_EDF, self.quantized_dtype, [0, 2], {},
-                    "absmax")
-            if not isinstance(kernel_up_proj_EDF, ptq.WithAux):
-                kernel_up_proj_EDF = manually_quantize_qwix_weight(
-                    kernel_up_proj_EDF, self.quantized_dtype, [0, 2], {},
-                    "absmax")
-            if not isinstance(kernel_down_proj_EFD, ptq.WithAux):
-                kernel_down_proj_EFD = manually_quantize_qwix_weight(
-                    kernel_down_proj_EFD, self.quantized_dtype, [0, 1], {},
-                    "absmax")
-            kernel_gating_EDF = kernel_gating_EDF.array
-            kernel_up_proj_EDF = kernel_up_proj_EDF.array
-            kernel_down_proj_EFD = kernel_down_proj_EFD.array
-
-        return mapped_moe_fwd(self, x_TD, router_weights_TX,
-                              selected_experts_TX, kernel_gating_EDF,
-                              kernel_up_proj_EDF, kernel_down_proj_EFD)
+        return mapped_moe_fwd(
+            self,
+            x_TD,
+            router_weights_TX,
+            selected_experts_TX,
+            self.kernel_gating_DEF.value,
+            self.kernel_up_proj_DEF.value,
+            self.kernel_down_proj_FED.value,
+        )
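
Note: the non-MegaBlox path above now calls jax.lax.ragged_dot directly with the expert-major kernels created in __post_init__. A minimal, self-contained sketch of that grouped-matmul contract, with toy shapes and data that are not taken from this repository:

import jax
import jax.numpy as jnp

# Assumed toy sizes: E experts, hidden dim D, intermediate dim F.
E, D, F = 4, 8, 16
num_assignments = 12  # routed token-expert pairs, already sorted by expert

lhs = jnp.ones((num_assignments, D), dtype=jnp.bfloat16)  # permuted activations
rhs = jnp.ones((E, D, F), dtype=jnp.bfloat16)             # expert-major kernel, shape (E, D, F)
group_sizes = jnp.array([3, 5, 0, 4], dtype=jnp.int32)    # rows per expert, sums to 12

# Each contiguous group of lhs rows is multiplied by its expert's (D, F) slice.
out = jax.lax.ragged_dot(lhs=lhs, rhs=rhs, group_sizes=group_sizes)
print(out.shape)  # (12, 16)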

tpu_inference/layers/jax/attention/deepseek_v3_attention.py

Lines changed: 2 additions & 2 deletions
@@ -317,13 +317,13 @@ def attention(
             self.query_tnh,  # q
             self.keyvalue_skh,  # k
             self.keyvalue_skh,  # v
-            P(None, None, "model"),  # kv_cache
+            P(None, None, ('model', 'expert')),  # kv_cache
             P(),  # md.seq_lens: Replicated
             P(),  # page_indices_flat: Replicated
             P(),  # query_start_loc: Replicated
             P(),  # distribution: Replicated
         )
-        out_specs = (self.attn_o_tnh, P(None, None, "model"))
+        out_specs = (self.attn_o_tnh, P(None, None, ('model', 'expert')))

         def _ragged_paged_attention(*args):
             return ragged_paged_attention(
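
Note: the specs above are the heart of the 2D TP change. Putting a tuple of mesh axis names in a single PartitionSpec position shards that one tensor dimension over the product of those axes; the same pattern recurs in model_loader.py, compilation_manager.py, and kv_cache.py below. A minimal sketch under an assumed (data=1, model=2, expert=2) mesh, which is not necessarily the repo's real topology (on CPU you can emulate 4 devices with XLA_FLAGS=--xla_force_host_platform_device_count=4):

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()[:4]).reshape(1, 2, 2),
            ("data", "model", "expert"))

# KV-cache-like tensor: (pages, page_size, combined head dim); the last axis is
# split across model * expert = 4 shards.
kv = jnp.zeros((16, 8, 64))
kv = jax.device_put(kv, NamedSharding(mesh, P(None, None, ("model", "expert"))))
print(kv.addressable_shards[0].data.shape)  # (16, 8, 16)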

tpu_inference/layers/jax/moe/moe.py

Lines changed: 2 additions & 2 deletions
@@ -84,8 +84,8 @@ class MoE(nnx.Module):
     router: nnx.Module
     activation_ffw_td: Sharding
     activation_ffw_ted: Sharding
-    edf_sharding: Sharding
-    efd_sharding: Sharding
+    edf_sharding: Sharding = ()
+    efd_sharding: Sharding = ()
     random_init: bool = False

     def __call__(self, x_TD: Float):

tpu_inference/models/common/model_loader.py

Lines changed: 2 additions & 2 deletions
@@ -199,7 +199,7 @@ def get_flax_model(
         vllm_config.model_config.hf_config)
     jit_model = _get_nnx_model(model_class, vllm_config, rng, mesh)
     kv_cache_sharding = NamedSharding(
-        mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, None, "model"))
+        mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, None, ("model", "expert")))
     hidden_states_sharding = NamedSharding(mesh,
                                            PartitionSpec(
                                                ShardingAxisName.ATTN_DATA,
@@ -224,7 +224,7 @@ def run_model(graphdef, state, *args):
         return model(*args)

     logits_sharding = NamedSharding(
-        mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, "model"))
+        mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, ("model", "expert")))

     @functools.partial(
         jax.jit,

tpu_inference/models/jax/deepseek_v3.py

Lines changed: 20 additions & 18 deletions
@@ -148,14 +148,14 @@ def _create_mla() -> MLA:
                 rngs=self.rng,
                 activation_attention_td=(None, None),
                 activation_q_td=(None, None),
-                query_tnh=P(None, 'model', None),
-                keyvalue_skh=P(None, 'model', None),
+                query_tnh=P(None, ('model', 'expert'), None),
+                keyvalue_skh=P(None, ('model', 'expert'), None),
                 activation_attention_out_td=(None, None),
-                attn_o_tnh=P(None, 'model', None),
-                q_da_sharding=(None, 'model'),
-                anh_sharding=(None, 'model', None),
-                kv_da_sharding=(None, 'model'),
-                nhd_sharding=('model', None, None))
+                attn_o_tnh=P(None, ('model', 'expert'), None),
+                q_da_sharding=(None, ('model', 'expert')),
+                anh_sharding=(None, ('model', 'expert'), None),
+                kv_da_sharding=(None, ('model', 'expert')),
+                nhd_sharding=(('model', 'expert'), None, None))

         for i in range(first_k_dense_replace):
             block = TransformerBlock(
@@ -201,8 +201,8 @@ def _create_mla() -> MLA:
                 routed_scaling_factor=2.5,
                 dtype=dtype,
                 activation_ffw_td=('data', None),
-                ed_sharding=('model', None),
-                e_sharding=('model', ))
+                ed_sharding=(None, None),
+                e_sharding=(None, ))
             if self.sparse_matmul:
                 # TODO: orginize the SparseMoE and DenseMoE better given they share most interfaces
                 custom_module = SparseMoE(
@@ -216,12 +216,10 @@ def _create_mla() -> MLA:
                     hidden_act=hidden_act,
                     rngs=self.rng,
                     random_init=self.random_init,
-                    activation_ffw_td=('data', None),
-                    activation_ffw_ted=('data', None, None),
-                    edf_sharding=('model', None, None),
-                    efd_sharding=('model', None, None),
-                    quantized_dtype=self.weight_loader.quant_dtype
-                    if self.weight_loader.is_model_quantized else None,
+                    activation_ffw_td=('data', 'model'),
+                    activation_ffw_ted=('data', None, 'model'),
+                    def_sharding=('expert', 'model', None),
+                    fed_sharding=('expert', None, 'model'),
                     router=router) if is_moe_layer else DenseFFW(
                         dtype=dtype,
                         hidden_act=hidden_act,
@@ -241,10 +239,10 @@ def _create_mla() -> MLA:
                     hidden_act=hidden_act,
                     rngs=self.rng,
                     random_init=self.random_init,
-                    activation_ffw_td=('data', None),
+                    activation_ffw_td=('data', 'model'),
                     activation_ffw_ted=('data', None, None),
-                    edf_sharding=('model', None, None),
-                    efd_sharding=('model', None, None),
+                    edf_sharding=('expert', 'model', None),
+                    efd_sharding=('expert', None, 'model'),
                     router=router) if is_moe_layer else DenseFFW(
                         dtype=dtype,
                         hidden_act=hidden_act,
@@ -865,4 +863,8 @@ def weights_dequant_cpu(x: torch.Tensor,
             scale = s[M // block_size, j // block_size]
             y[M_main:M, j:j + block_size] = block * scale

+<<<<<<< HEAD:tpu_inference/models/jax/deepseek_v3.py
     return y.to(j2t_dtype(jnp.dtype(output_dtype)))
+=======
+    return y.to(torch.get_default_dtype())
+>>>>>>> 307bbd62 (local change to support 2d TP for DeepSeek):tpu_commons/models/jax/deepseek_v3.py
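
Note: for the SparseMoE layers configured above, def_sharding=('expert', 'model', None) and fed_sharding=('expert', None, 'model') place the expert dimension of each (E, D, F) or (E, F, D) kernel on the 'expert' mesh axis and one matmul dimension on 'model'. A rough sketch of the resulting per-device shard shapes, using toy sizes and the same assumed (1, 2, 2) mesh as earlier:

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()[:4]).reshape(1, 2, 2),
            ("data", "model", "expert"))

E, D, F = 8, 16, 32  # toy sizes, not DeepSeek's real dimensions
gating_DEF = jax.device_put(jnp.zeros((E, D, F)),
                            NamedSharding(mesh, P("expert", "model", None)))
down_FED = jax.device_put(jnp.zeros((E, F, D)),
                          NamedSharding(mesh, P("expert", None, "model")))
# Experts split 2-way over 'expert'; hidden/intermediate split 2-way over 'model'.
print(gating_DEF.addressable_shards[0].data.shape)  # (4, 8, 32)
print(down_FED.addressable_shards[0].data.shape)    # (4, 32, 8)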

tpu_inference/runner/compilation_manager.py

Lines changed: 4 additions & 4 deletions
@@ -350,15 +350,15 @@ def _precompile_select_from_array(self) -> None:
             indices_paddings=self.runner.num_reqs_paddings,
             hidden_dim=vocab_size,
             input_sharding=NamedSharding(self.runner.mesh,
-                                         PartitionSpec(None, "model")),
+                                         PartitionSpec(None, ('model', 'expert')),
         )
         self._precompile_select_from_array_helper(
             name="select target tokens for spec decoding",
             source_paddings=self.runner.num_logits_paddings,
             indices_paddings=self.runner.num_logits_paddings,
             hidden_dim=vocab_size,
             input_sharding=NamedSharding(self.runner.mesh,
-                                         PartitionSpec(None, "model")),
+                                         PartitionSpec(None, ('model', 'expert')),
             only_equal_paddings=True,
         )

@@ -390,7 +390,7 @@ def _precompile_sampling(self) -> None:
         for num_reqs in self.runner.num_reqs_paddings:
             logits_sharding = NamedSharding(
                 self.runner.mesh,
-                PartitionSpec(ShardingAxisName.ATTN_DATA, "model"))
+                PartitionSpec(ShardingAxisName.ATTN_DATA, ('model', 'expert'))
             dp_size = self.runner.vllm_config.sharding_config.total_dp_size
             sampling_metadata_sharding = NamedSharding(
                 self.runner.mesh, PartitionSpec(
@@ -480,7 +480,7 @@ def _precompile_rejection_sampler(self) -> None:
         for num_logits in self.runner.num_logits_paddings:
             for num_reqs in self.runner.num_reqs_paddings:
                 sharding = NamedSharding(self.runner.mesh,
-                                         PartitionSpec(None, "model"))
+                                         PartitionSpec(None, ('model', 'expert')))
                 target_probs = self._create_dummy_tensor(
                     (num_logits, vocab_size), jnp.bfloat16, sharding)
                 draft_token_ids = self._create_dummy_tensor((num_logits, ),

tpu_inference/runner/kv_cache.py

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ def get_kv_cache_shape_with_mesh(mesh: Mesh, total_num_pages: int,
                                  actual_head_dim: int, kv_dtype: any):
     """Gets the KV cache shape based on the mesh configuration."""

-    model_cnt = mesh.shape["model"]
+    model_cnt = mesh.shape["model"] * mesh.shape["expert"]
     assert actual_num_kv_heads % model_cnt == 0
     # NOTE(chengjiyao): Currently, the attention kernel is tailored to the
     # specific model, rather than being determined by the head_dim. If new
@@ -79,7 +79,7 @@ def create_kv_caches(
     sharding = NamedSharding(
         mesh,
         PartitionSpec(ShardingAxisName.ATTN_DATA, None,
-                      ShardingAxisName.ATTN_HEAD))
+                      ('model', 'expert'))

     def _allocate() -> jax.Array:
         return jnp.empty(
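
Note: mesh.shape maps axis names to sizes, so the change above makes the KV-head partition count the product of the 'model' and 'expert' axes. A small sketch of that check under the same assumed (1, 2, 2) mesh and an illustrative head count:

import jax
import numpy as np
from jax.sharding import Mesh

mesh = Mesh(np.array(jax.devices()[:4]).reshape(1, 2, 2),
            ("data", "model", "expert"))

actual_num_kv_heads = 16  # illustrative head count, not a DeepSeek value
model_cnt = mesh.shape["model"] * mesh.shape["expert"]  # 2 * 2 = 4
assert actual_num_kv_heads % model_cnt == 0
print(actual_num_kv_heads // model_cnt)  # 4 KV heads per tensor-parallel shard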
