
Commit 359da68

[Misc] Fix dtype not being configured correctly
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
1 parent eb90df0 commit 359da68

5 files changed: +52 additions, −67 deletions

tests/test_utils.py (1 addition, 2 deletions)

@@ -231,6 +231,5 @@ def test_get_jax_dtype_from_str_dtype():
     assert get_jax_dtype_from_str_dtype("int8") == jnp.int8
     assert get_jax_dtype_from_str_dtype("bfloat16") == jnp.bfloat16
     assert get_jax_dtype_from_str_dtype("fp8") == jnp.float8_e4m3fn
-    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.float8_e4m3
+    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.float8_e4m3fn
     assert get_jax_dtype_from_str_dtype("fp8_e5m2") == jnp.float8_e5m2
-    assert get_jax_dtype_from_str_dtype("auto") is None
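The dropped "auto" assertion reflects the new behavior: get_jax_dtype_from_str_dtype now delegates to to_jax_dtype (see the tpu_inference/utils.py hunk below), which has no special case for "auto", so callers branch on "auto" before converting. A minimal sketch of the updated expectation, assuming pytest is available and the helper is importable from tpu_inference.utils:

import jax.numpy as jnp
import pytest

from tpu_inference.utils import get_jax_dtype_from_str_dtype


def test_fp8_e4m3_maps_to_the_fn_variant():
    # After this commit, the vLLM-style "fp8_e4m3" string resolves to the
    # float8_e4m3fn variant rather than jnp.float8_e4m3.
    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.float8_e4m3fn


def test_auto_is_no_longer_special_cased():
    # "auto" is not a real dtype name, so conversion now raises instead of
    # returning None; the exact exception type comes from jnp.dtype, hence
    # the deliberately broad expectation here.
    with pytest.raises((TypeError, ValueError)):
        get_jax_dtype_from_str_dtype("auto")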

tpu_inference/models/jax/utils/quantization/quantization_utils.py (3 additions, 6 deletions)

@@ -154,12 +154,9 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
     logger.info(f"Memory usage before applying quantization of params: "
                 f"hbm={utils.hbm_usage_gb(jax.local_devices())}Gb")
 
-    # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
-    kv_cache_jnp_dtype = utils.get_jax_dtype_from_str_dtype(kv_cache_dtype)
-
-    # Handle the case where kv_cache_dtype is "auto"
-    if kv_cache_jnp_dtype is None:
-        assert kv_cache_dtype == "auto", "kv_cache_dtype must be 'auto' if kv_cache_jnp_dtype is None"
+    if kv_cache_dtype != "auto":
+        kv_cache_jnp_dtype = utils.to_jax_dtype(kv_cache_dtype)
+    else:
         kv_cache_jnp_dtype = DEFAULT_KV_CACHE_DTYPE
 
     kv_caches = create_kv_caches(
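For illustration, the selection above in isolation. This is a sketch only: pick_kv_cache_jnp_dtype is a hypothetical name, utils is assumed to refer to tpu_inference.utils as in the surrounding file, and DEFAULT_KV_CACHE_DTYPE is assumed to be jnp.bfloat16 purely for this example (the real default is defined in quantization_utils.py).

import jax.numpy as jnp

from tpu_inference import utils

DEFAULT_KV_CACHE_DTYPE = jnp.bfloat16  # assumed value, for this sketch only


def pick_kv_cache_jnp_dtype(kv_cache_dtype: str):
    # Non-"auto" strings ("fp8", "fp8_e4m3", "fp8_e5m2", "bfloat16", ...) are
    # resolved by the new utils.to_jax_dtype helper.
    if kv_cache_dtype != "auto":
        return utils.to_jax_dtype(kv_cache_dtype)
    # "auto" keeps the module-level default KV-cache dtype.
    return DEFAULT_KV_CACHE_DTYPE


print(pick_kv_cache_jnp_dtype("fp8_e4m3"))  # float8_e4m3fn
print(pick_kv_cache_jnp_dtype("auto"))      # the default (bfloat16 in this sketch)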

tpu_inference/platforms/tpu_platform.py (12 additions, 20 deletions)

@@ -5,7 +5,6 @@
 import jax.numpy as jnp
 import torch
 import vllm.envs as vllm_envs
-from torchax.ops.mappings import j2t_dtype
 from tpu_info import device
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.platforms.interface import Platform, PlatformEnum
@@ -14,6 +13,7 @@
 from tpu_inference import envs
 from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
+from tpu_inference.utils import to_jax_dtype, to_torch_dtype
 
 if TYPE_CHECKING:
     from vllm.attention.backends.registry import _Backend
@@ -28,12 +28,6 @@
 
 logger = init_logger(__name__)
 
-_DTYPE: dict[str, jnp.dtype] = {
-    "bfloat16": jnp.bfloat16,
-    "float": jnp.float32,
-    "float32": jnp.float32,
-}
-
 
 class TpuPlatform(Platform):
     _enum = PlatformEnum.TPU
@@ -158,20 +152,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # NOTE(xiang): convert dtype to jnp.dtype
         # NOTE(wenlong): skip this logic for mm model preprocessing
         # For mm model preprocessors, it may need the output dtype to be torch.
-        # In order to avoid a PR to vLLM, we postpone the dtype checking during tpu_worker initialization
+        # In order to avoid a PR to vLLM, we postpone the dtype checking during
+        # tpu_worker initialization
         if not vllm_config.scheduler_config.is_multimodal_model or impl == "vllm":
-            if not isinstance(vllm_config.model_config.dtype, str):
-                logger.warning(
-                    "The model dtype is not properly set for JAX backend. "
-                    "Overwriting it to jnp.bfloat16")
-                vllm_config.model_config.dtype = jnp.bfloat16
-            else:
-                vllm_config.model_config.dtype = _DTYPE.get(
-                    vllm_config.model_config.dtype, jnp.bfloat16)
-
-            if impl == "vllm":
-                vllm_config.model_config.dtype = j2t_dtype(
-                    vllm_config.model_config.dtype.dtype)
+            try:
+                dtype = to_jax_dtype(vllm_config.model_config.dtype)
+            except ValueError:
+                logger.warning("The model dtype is not set properly."
+                               "Falling back to jnp.bfloat16")
+                dtype = jnp.bfloat16
+            if impl == "vllm":
+                dtype = to_torch_dtype(dtype)
+            vllm_config.model_config.dtype = dtype
 
         # TODO(cuiq): remove this dependency.
         from vllm.v1.attention.backends.pallas import PallasAttentionBackend
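The net effect of this hunk is easier to see in isolation. A sketch under the following assumptions: normalize_model_dtype is a hypothetical helper that mirrors the new in-place logic, and any impl value other than "vllm" stands in for the JAX model implementations.

import jax.numpy as jnp

from tpu_inference.utils import to_jax_dtype, to_torch_dtype


def normalize_model_dtype(configured_dtype, impl):
    # Mirrors the try/except fallback in check_and_update_config above.
    try:
        dtype = to_jax_dtype(configured_dtype)
    except ValueError:
        dtype = jnp.bfloat16  # fallback when the configured dtype is unusable
    # The torch-native ("vllm") model implementation expects a torch dtype;
    # everything else keeps the JAX dtype.
    if impl == "vllm":
        dtype = to_torch_dtype(dtype)
    return dtype


print(normalize_model_dtype("float32", impl="jax"))    # a JAX float32 dtype
print(normalize_model_dtype("bfloat16", impl="vllm"))  # torch.bfloat16

Compared to the removed _DTYPE table, valid dtype strings it did not list (for example "float16") now resolve through jnp.dtype instead of silently falling back to bfloat16.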

tpu_inference/runner/tpu_runner.py (5 additions, 30 deletions)

@@ -9,12 +9,10 @@
 import jax.numpy as jnp
 import jaxtyping
 import numpy as np
-import torch
 import vllm.envs as vllm_envs
 from flax import nnx
 from jax.experimental import mesh_utils
 from jax.sharding import NamedSharding, PartitionSpec
-from torchax.ops.mappings import j2t_dtype
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
@@ -64,7 +62,7 @@
     StructuredDecodingManager
 from tpu_inference.spec_decode.jax.eagle3 import Eagle3Proposer
 from tpu_inference.utils import (device_array, make_optimized_mesh,
-                                 time_function)
+                                 time_function, to_torch_dtype)
 
 logger = init_logger(__name__)
 
@@ -78,17 +76,6 @@
     request_distribution=[0, 0, 0],
 )
 
-TPU_STR_DTYPE_TO_TORCH_DTYPE = {
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-    "float": torch.float,
-    "fp8": torch.float8_e4m3fn,
-    "fp8_e4m3": torch.float8_e4m3fn,
-    "fp8_e5m2": torch.float8_e5m2,
-    "int8": torch.int8,
-    "uint8": torch.uint8,
-}
-
 
 class AsyncTPUModelRunnerOutput(AsyncModelRunnerOutput):
     """Holds asynchronous model output specifically from a TPU runner.
@@ -262,22 +249,10 @@ def __init__(
             self.uses_mrope, self.model_config)
         self.lora_utils = LoraUtils(self)
 
-        cache_config = self.cache_config
-        if cache_config.cache_dtype == "auto":
-            model_dtype = self.dtype
-            if isinstance(model_dtype, str):
-                self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
-            elif isinstance(getattr(model_dtype, 'dtype', None), jnp.dtype):
-                self.kv_cache_dtype = j2t_dtype(model_dtype.dtype)
-            elif isinstance(model_dtype, torch.dtype):
-                self.kv_cache_dtype = model_dtype
-            else:
-                raise ValueError(
-                    "KV cache is unsupported for model_dtype of %s",
-                    model_dtype)
-        else:
-            self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[
-                cache_config.cache_dtype]
+        cache_dtype = self.cache_config.cache_dtype
+        if cache_dtype == "auto":
+            cache_dtype = self.dtype
+        self.kv_cache_dtype = to_torch_dtype(cache_dtype)
 
         self._pre_async_results: AsyncPreResults | None = None
         self._substitute_placeholder_token_fn = _substitute_placeholder_token
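As above, a standalone sketch of the simplified KV-cache dtype resolution; resolve_kv_cache_dtype is an illustrative name, not a function in the repository, and the printed results are the expected outcomes rather than verified output.

import jax.numpy as jnp

from tpu_inference.utils import to_torch_dtype


def resolve_kv_cache_dtype(cache_dtype, model_dtype):
    # "auto" means: reuse the model dtype for the KV cache.
    if cache_dtype == "auto":
        cache_dtype = model_dtype
    # to_torch_dtype accepts strings, JAX dtypes, and torch dtypes, which is
    # what makes the old isinstance ladder and the local
    # TPU_STR_DTYPE_TO_TORCH_DTYPE table unnecessary.
    return to_torch_dtype(cache_dtype)


print(resolve_kv_cache_dtype("auto", jnp.bfloat16))      # torch.bfloat16
print(resolve_kv_cache_dtype("fp8_e5m2", jnp.bfloat16))  # expected: torch.float8_e5m2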

tpu_inference/utils.py (31 additions, 9 deletions)

@@ -8,11 +8,14 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+import torch
 from jax._src import dtypes
 from jax._src import mesh as mesh_lib
 from jax._src import xla_bridge as xb
 from jax._src.lib import xla_client as xc
+from jax._src.numpy.scalar_types import _ScalarMeta
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torchax.ops.mappings import j2t_dtype, t2j_dtype
 from vllm import envs as vllm_envs
 from vllm import utils
 
@@ -23,17 +26,36 @@
 TPU_HEAD_SIZE_ALIGNMENT = 128
 TPU_SECOND_LAST_MINOR = 8
 
-# This is used to translate from a string name for a dtype
-# to formal jax.numpy DType. One use case for this is
-# converting the `--kv_cache_dtype` flag to a dtype.
-TPU_STR_DTYPE_TO_JAX_DTYPE = {
-    "bfloat16": jnp.bfloat16,
+# Map vllm dtype string that doesn't exactly match jax dtype string name.
+_VLLM_DTYPE_STR_TO_JAX_DTYPE = {
     "fp8": jnp.float8_e4m3fn,
-    "fp8_e4m3": jnp.float8_e4m3,
+    "fp8_e4m3": jnp.float8_e4m3fn,
     "fp8_e5m2": jnp.float8_e5m2,
-    "int8": jnp.int8,
 }
 
+
+def to_jax_dtype(dtype: str | jnp.dtype | torch.dtype) -> jnp.dtype:
+    if isinstance(dtype, str):
+        if dict_dtype := _VLLM_DTYPE_STR_TO_JAX_DTYPE.get(dtype, None):
+            return dict_dtype
+        return jnp.dtype(dtype)
+    elif isinstance(dtype, torch.dtype):
+        return t2j_dtype(dtype)
+    elif isinstance(dtype, jnp.dtype):
+        return dtype
+    elif isinstance(dtype, _ScalarMeta):
+        return dtype.dtype
+    else:
+        raise ValueError(f"Argument is unsupported data type {type(dtype)}")
+
+
+def to_torch_dtype(dtype: str | jnp.dtype | torch.dtype) -> torch.dtype:
+    # Use jax dtype as an intermediate dtype which we'll be used to convert it
+    # into torch dtype.
+    dtype = to_jax_dtype(dtype)
+    return j2t_dtype(dtype)
+
+
 _megacore = False
 logger = init_logger(__name__)
 
@@ -295,8 +317,8 @@ def get_jax_dtype_from_str_dtype(str_dtype: str) -> jnp.dtype:
     Returns:
         jnp.dtype: The JAX dtype.
     """
-    str_dtype = str_dtype.lower().strip()
-    return TPU_STR_DTYPE_TO_JAX_DTYPE.get(str_dtype)
+    # TODO(kyuyeunk): Replace all reference of this function into TpuDtype.
+    return to_jax_dtype(str_dtype)
 
 
 def time_function(func):
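Taken together, the two helpers give a single conversion path for every dtype spelling used elsewhere in this commit. A usage sketch of the expected behavior, assuming a JAX install with float8 support and torchax available:

import jax.numpy as jnp
import torch

from tpu_inference.utils import to_jax_dtype, to_torch_dtype

# vLLM-style strings that do not match a JAX dtype name go through
# _VLLM_DTYPE_STR_TO_JAX_DTYPE; note that "fp8_e4m3" now resolves to the
# float8_e4m3fn variant.
assert to_jax_dtype("fp8_e4m3") == jnp.float8_e4m3fn

# Plain dtype names fall through to jnp.dtype(...), so entries such as
# "bfloat16" and "int8" no longer need to live in the mapping.
assert to_jax_dtype("bfloat16") == jnp.bfloat16
assert to_jax_dtype("int8") == jnp.int8

# torch dtypes are converted via torchax's t2j_dtype, and jnp scalar types
# (e.g. jnp.bfloat16 itself) are unwrapped through their .dtype attribute.
assert to_jax_dtype(torch.bfloat16) == jnp.bfloat16
assert to_jax_dtype(jnp.bfloat16) == jnp.bfloat16

# to_torch_dtype normalizes to a JAX dtype first, then maps with j2t_dtype.
assert to_torch_dtype("bfloat16") == torch.bfloat16
assert to_torch_dtype(jnp.float32) == torch.float32

Note that "auto" is deliberately not handled here; both call sites in this commit (quantization_utils.py and tpu_runner.py) resolve "auto" before calling the helpers.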
