Commit 74f33ef

[Intel HPU] fix bugs caused by other commits (#5074)

* [Intel HPU] fix bugs caused by other commits
* update code by copilot

1 parent 33f96ff · commit 74f33ef

2 files changed: +13 −3 lines changed

fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py

Lines changed: 12 additions & 2 deletions
@@ -186,7 +186,15 @@ class HPUAttentionBackend(AttentionBackend_HPU):
     HPUAttentionBackend backend implementation.
     """
 
-    def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int, head_dim: int):
+    def __init__(
+        self,
+        llm_config: FDConfig,
+        kv_num_heads: int,
+        num_heads: int,
+        head_dim: int,
+        encoder_block_shape_q: int = -1,
+        decoder_block_shape_q: int = -1,
+    ):
         """
         HPUAttentionBackend __init__
         """
@@ -239,11 +247,13 @@ def init_attention_metadata(self, forward_meta):
     def get_kv_cache_shape(
         self,
         max_num_blocks: int,
+        kv_cache_quant_type: Optional[str] = None,
     ):
         """
         Caculate kv cache shape
         """
-        return (max_num_blocks, self.block_size, self.kv_num_heads, self.head_dim)
+        key_cache_shape = value_cache_shape = [max_num_blocks, self.block_size, self.kv_num_heads, self.head_dim]
+        return key_cache_shape, value_cache_shape
 
     def forward_extend(
         self, src, qkv_proj: QKVParallelLinear, o_proj: RowParallelLinear, layer: Attention, forward_meta
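get_kv_cache_shape now accepts an optional kv_cache_quant_type and returns a (key_cache_shape, value_cache_shape) pair instead of a single tuple, presumably so callers can treat backends whose key and value cache layouts differ uniformly; on HPU the two shapes stay identical. A standalone sketch of the assumed caller pattern (names and sizes are illustrative, not FastDeploy code):

from typing import Optional

# Standalone sketch of the new contract: one shape per cache,
# returned as a pair even when the two shapes match.
def get_kv_cache_shape(max_num_blocks: int,
                       kv_cache_quant_type: Optional[str] = None,
                       block_size: int = 128,
                       kv_num_heads: int = 8,
                       head_dim: int = 128):
    key_cache_shape = value_cache_shape = [max_num_blocks, block_size, kv_num_heads, head_dim]
    return key_cache_shape, value_cache_shape

# Assumed caller: unpack both shapes instead of reusing a single one.
key_shape, value_shape = get_kv_cache_shape(max_num_blocks=1024)
assert key_shape == value_shape == [1024, 128, 8, 128]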

fastdeploy/worker/hpu_model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -328,7 +328,7 @@ def __init__(
 
         # Sampler
         if not self.speculative_decoding:
-            self.sampler = Sampler()
+            self.sampler = Sampler(fd_config)
         else:
             self.sampler = SpeculativeSampler(fd_config)
 
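The one-line fix passes fd_config to Sampler, matching the SpeculativeSampler branch below it; presumably Sampler's constructor was changed by another commit to require the config, which is the kind of cross-commit breakage this PR's title describes. A standalone sketch of the pattern (DummySampler and DummyConfig are illustrative, not FastDeploy classes):

# Standalone sketch: both sampler variants now take the config,
# so the runner passes it on every branch.
class DummyConfig:
    temperature: float = 1.0

class DummySampler:
    def __init__(self, fd_config: DummyConfig):
        # Would raise TypeError if constructed as DummySampler(),
        # mirroring the bug this hunk fixes.
        self.temperature = fd_config.temperature

fd_config = DummyConfig()
sampler = DummySampler(fd_config)  # mirrors self.sampler = Sampler(fd_config)
print(sampler.temperature)  # 1.0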
