
Commit 15ee8d3

fixes

1 parent 0e4dc9c commit 15ee8d3

File tree

tpu_inference/core/disagg_utils.py
tpu_inference/distributed/tpu_connector.py
tpu_inference/layers/vllm/sharding.py
tpu_inference/models/jax/utils/weight_utils.py

4 files changed: +3 -7 lines changed

tpu_inference/core/disagg_utils.py

Lines changed: 0 additions & 3 deletions

@@ -4,9 +4,6 @@
 
 from tpu_inference import envs
 
-PREFILL_SLICES = 'PREFILL_SLICES'
-DECODE_SLICES = 'DECODE_SLICES'
-
 
 def is_disagg_enabled() -> bool:
     # We triggrer our code path as long as prefill slices are set. This

tpu_inference/distributed/tpu_connector.py

Lines changed: 1 addition & 1 deletion

@@ -85,6 +85,7 @@
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.request import Request
 
+from tpu_inference import envs
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_ips,
                                              get_kv_ports,
                                              get_kv_transfer_port, get_node_id,
@@ -440,7 +441,6 @@ def __init__(self, vllm_config: VllmConfig):
 
         self.runner: TPUModelRunner = None
         self.mesh: Mesh = None
-        from tpu_inference import envs
         self.multi_host = envs.TPU_MULTIHOST_BACKEND == "ray"
         # NOTE(xiang): This can not be the worker rank set in RayDistributedExecutor.
         # The worker rank is assigned with vLLM's sorting logic, which does not work

tpu_inference/layers/vllm/sharding.py

Lines changed: 1 addition & 1 deletion

@@ -19,6 +19,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -211,7 +212,6 @@ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
 def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
     if isinstance(tensor, tuple):
         return tuple(_sharded_device_put(t, sharding) for t in tensor)
-    from tpu_inference import envs
     multihost_backend = envs.TPU_MULTIHOST_BACKEND
     if multihost_backend != "ray":
         return jax.device_put(tensor, sharding)

tpu_inference/models/jax/utils/weight_utils.py

Lines changed: 1 addition & 2 deletions

@@ -18,7 +18,7 @@
 from jax.sharding import PartitionSpec as P
 from safetensors import safe_open
 
-from tpu_inference import utils
+from tpu_inference import envs, utils
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils import file_utils
 
@@ -421,7 +421,6 @@ def load_hf_weights(vllm_config,
     # NOTE(xiang): Disable multi-threading mode if running on multi-host.
     # Because multi-threading would cause different JAX processes to load
     # different weights at the same time.
-    from tpu_inference import envs
    if envs.TPU_MULTIHOST_BACKEND == "ray":
        max_workers = 1
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
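
The change repeated across tpu_connector.py, sharding.py, and weight_utils.py is the same pattern: a function-local `from tpu_inference import envs` import is hoisted to a single module-level import, while the call sites that read `envs.TPU_MULTIHOST_BACKEND` stay unchanged. A minimal sketch of that before/after pattern in isolation (the helper function name here is illustrative, not from the repository):

# Before: the import is repeated inside the function body.
def multihost_backend_is_ray() -> bool:
    from tpu_inference import envs  # local import, re-executed on every call
    return envs.TPU_MULTIHOST_BACKEND == "ray"

# After: one module-level import, matching what this commit applies.
from tpu_inference import envs

def multihost_backend_is_ray() -> bool:
    return envs.TPU_MULTIHOST_BACKEND == "ray"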
