 from tpu_inference.core.disagg_utils import is_disagg_enabled
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.jax.pool.pooling import pool
+from tpu_inference.layers.jax.pool.pooling_metadata import (
+    TPUSupportedPoolingMetadata,
+)
 from tpu_inference.layers.jax.sample.sampling import sample
-from tpu_inference.layers.jax.sample.sampling_metadata import \
-    TPUSupportedSamplingMetadata
+from tpu_inference.layers.jax.sample.sampling_metadata import (
+    TPUSupportedSamplingMetadata,
+)
 from tpu_inference.logger import init_logger
 from tpu_inference.utils import device_array
 
@@ -79,6 +84,9 @@ def capture_model(self) -> None:
             self._run_compilation, )
         self._precompile_input_embeddings_merger()
         self._precompile_backbone_with_inputs_embeds()
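+        # Pooling models never sample, so compile the pooling kernels and
+        # return early, skipping the sampling/speculative precompilation.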
+        if self.runner.is_pooling_model:
+            self._precompile_pooling()
+            return
         if self.runner.scheduler_config.async_scheduling:
             self._precompile_substitute_placeholder_token()
             self._precompile_select_from_array()
@@ -90,6 +98,68 @@ def capture_model(self) -> None:
         if self.runner.speculative_config:
             self._precompile_speculative_decoding()
 
+    def _precompile_pooling(self) -> None:
+        pooler = getattr(self.runner, "pooler", None)
+        if pooler is None:
+            logger.warning(
+                "Pooling precompile skipped because model has no pooler attribute.")
+            return
+
+        logger.info("Precompiling pooling kernels for pooling models.")
+
+        hidden_size = self.runner.model_config.get_hidden_size()
+        dtype = self.runner.model_config.dtype
+        hidden_sharding = NamedSharding(
+            self.runner.mesh, PartitionSpec(None, None))
+
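+        # Compile one pooling call per (num_tokens, num_reqs) padding bucket
+        # so no input shape triggers a recompilation at serving time.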
+        for num_tokens in self.runner.num_tokens_paddings:
+            hidden_states = self._create_dummy_tensor(
+                (num_tokens, hidden_size), dtype, sharding=hidden_sharding)
+
+            for num_reqs in self.runner.num_reqs_paddings:
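+                # Every request contributes at least one token, so paddings
+                # with num_reqs > num_tokens can never occur.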
+                if num_reqs == 0 or num_reqs > num_tokens:
+                    continue
+
+                prompt_lens = np.ones(num_reqs, dtype=np.int32)
+                first_token_indices = np.arange(num_reqs, dtype=np.int32)
+                last_token_indices = first_token_indices.copy()
+                normalize = np.ones(num_reqs, dtype=np.int8)
+
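+                # Move the host-side dummy metadata onto the TPU mesh; only
+                # the shapes matter for compilation, not the values.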
+                (
+                    prompt_lens,
+                    normalize,
+                    first_token_indices,
+                    last_token_indices,
+                ) = device_array(
+                    self.runner.mesh,
+                    (
+                        prompt_lens,
+                        normalize,
+                        first_token_indices,
+                        last_token_indices,
+                    ),
+                )
+
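+                # Build pooling metadata for the fully padded batch;
+                # precompilation traces the "embed" task.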
+                pooling_metadata = TPUSupportedPoolingMetadata(
+                    prompt_lens=prompt_lens,
+                    first_token_indices=first_token_indices,
+                    last_token_indices=last_token_indices,
+                    normalize=normalize,
+                    num_reqs=num_reqs,
+                    padded_num_reqs=num_reqs,
+                    task="embed",
+                )
+
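+                # Trace and compile pool() for this (num_tokens, num_reqs)
+                # shape.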
+                self._run_compilation(
+                    "pool",
+                    pool,
+                    hidden_states,
+                    pooling_metadata,
+                    pooler,
+                    num_tokens=num_tokens,
+                    num_reqs=num_reqs,
+                )
+
     def _precompile_input_embeddings_merger(self) -> None:
         for num_tokens in self.runner.num_tokens_paddings:
             hidden_size = self.runner.vllm_config.model_config.get_hidden_size(