
Commit 0a27ad6

Merge branch 'bugfix/fix_id_list' of https://github.com/kxz2002/FastDeploy into bugfix/fix_id_list
2 parents: 8fed8b4 + e531d73

File tree: 14 files changed, +1953 −115 lines


custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc

Lines changed: 1 addition & 1 deletion
@@ -441,7 +441,7 @@ std::vector<paddle::Tensor> MoeExpertFFN(
     const std::string& quant_method,
     const int hadamard_blocksize,
     const int valid_token_num) {
-  if (ffn_in.numel() == 0) {
+  if (ffn_in.numel() == 0 || valid_token_num == 0) {
     paddle::Tensor ffn2_out =
         paddle::empty_like(ffn_in, paddle::DataType::BFLOAT16);
     return {ffn2_out};
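
The guard now also short-circuits when the scheduler reports zero valid tokens, so the XPU kernel never launches on an all-padding batch. A minimal Python sketch of the same pattern; `moe_expert_ffn_ref` is a hypothetical name, not a FastDeploy API:

# A minimal sketch of the early-return guard this hunk adds, in Python;
# `moe_expert_ffn_ref` is a hypothetical name, not a FastDeploy API.
import paddle

def moe_expert_ffn_ref(ffn_in: paddle.Tensor, valid_token_num: int) -> paddle.Tensor:
    # No work to do: the input is empty or the scheduler counted zero valid
    # tokens, so return an uninitialized bfloat16 tensor with the input's
    # shape, matching the C++ early return above.
    if ffn_in.numel() == 0 or valid_token_num == 0:
        return paddle.empty_like(ffn_in, dtype=paddle.bfloat16)
    raise NotImplementedError("real expert FFN dispatch happens here")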

fastdeploy/config.py

Lines changed: 0 additions & 4 deletions
@@ -1628,10 +1628,6 @@ def postprocess(self):
         else:
             self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len

-        self.scheduler_config.max_chunk_len = (
-            self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_extra_num_batched_tokens
-        )
-
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)
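
With the cached `max_chunk_len` gone, consumers derive the token budget where they need it. A hedged sketch of the equivalent inline computation, using the defaults from fastdeploy/scheduler/config.py and a stand-in config object:

# Equivalent inline computation of the removed max_chunk_len field;
# `cfg` is a stand-in for scheduler_config with its default values.
from types import SimpleNamespace

cfg = SimpleNamespace(max_num_batched_tokens=2048, max_extra_num_batched_tokens=16384)
max_chunk_tokens = cfg.max_num_batched_tokens + cfg.max_extra_num_batched_tokens
assert max_chunk_tokens == 18432  # matches the comment removed from scheduler/config.py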

fastdeploy/engine/common_engine.py

Lines changed: 5 additions & 1 deletion
@@ -1125,7 +1125,7 @@ def _process_prefilled_requests():
                     # received the request sent by the client
                     waiting_request_outputs.append(req_output)
                     continue
-
+                req_output.finished = False
                 ready_request_outputs.append(req_output)
                 self.llm_logger.debug(f"there are enough resource for prefilled request: {req_output.request_id}")

@@ -1145,6 +1145,8 @@ def _process_prefilled_requests():
                     self.resource_manager.pre_recycle_resource(request_id)
                     if request_id in self.token_processor.tokens_counter:
                         del self.token_processor.tokens_counter[request_id]
+                    req_output.finished = True
+                    self.scheduler.put_results([req_output])
                     continue
                 if req_output.error_code != 200:
                     self.llm_logger.warning(

@@ -1156,6 +1158,8 @@ def _process_prefilled_requests():
                     self.scheduler.put_results([req_output])
                     continue
                 self.token_processor.tokens_counter[request_id] = 1
+                if envs.FD_ENABLE_INTERNAL_ADAPTER:  # first token sent by D instance
+                    self.scheduler.put_results([req_output])
                 self.resource_manager.add_prefilled_request(req_output)
                 self.llm_logger.debug(f"add prefilled request success, {request_id}")
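
Condensed, the routing these hunks produce looks like the sketch below. Method names come from the diff; the collaborators are passed in explicitly so the control flow is visible, and `recycle` is a hypothetical stand-in for the branch condition in the surrounding code:

# Condensed sketch of the result routing after this change (not the actual
# FastDeploy method): recycled requests are now marked finished and their
# results shipped back, and under FD_ENABLE_INTERNAL_ADAPTER the first token
# from the D instance is forwarded immediately.
def route_prefilled(req_output, recycle, scheduler, resource_manager,
                    tokens_counter, internal_adapter_enabled):
    request_id = req_output.request_id
    if recycle:  # the recycle branch shown above
        resource_manager.pre_recycle_resource(request_id)
        tokens_counter.pop(request_id, None)
        req_output.finished = True           # new: mark the recycled request terminal
        scheduler.put_results([req_output])  # new: unblock the waiting client
        return
    tokens_counter[request_id] = 1
    if internal_adapter_enabled:             # new: first token already sent by the D instance
        scheduler.put_results([req_output])
    resource_manager.add_prefilled_request(req_output)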

fastdeploy/scheduler/config.py

Lines changed: 0 additions & 1 deletion
@@ -270,7 +270,6 @@ def __init__(self, args):
         self.name = "local"  # "local" for LocalScheduler or "global" for GlobalScheduler
         self.max_num_batched_tokens = 2048  # base token_num for text inputs
         self.max_extra_num_batched_tokens = 16384  # extra token_num for multimodal inputs
-        self.max_chunk_len = 18432  # max supported token_num = max_num_batched_tokens + max_extra_num_batched_tokens
         self.max_num_seqs = 34
         self.splitwise_role = "mixed"
         self.config = None

fastdeploy/spec_decode/mtp.py

Lines changed: 7 additions & 1 deletion
@@ -355,7 +355,13 @@ def _init_model_inputs(self):
             self.target_model_inputs["decoder_tile_ids_per_batch"]
         )
         self.model_inputs["target_hidden_states"] = paddle.full(
-            [self.fd_config.scheduler_config.max_chunk_len, self.model_config.hidden_size], 0, dtype="bfloat16"
+            [
+                self.fd_config.scheduler_config.max_num_batched_tokens
+                + self.fd_config.scheduler_config.max_extra_num_batched_tokens,
+                self.model_config.hidden_size,
+            ],
+            0,
+            dtype="bfloat16",
         )

         tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
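
The buffer is sized for the full schedulable budget: one row per token (base plus multimodal extra), one column per hidden unit. A standalone sketch of the allocation with placeholder sizes:

# Standalone sketch of the allocation above; the sizes are the scheduler
# defaults and a placeholder hidden_size, not values read from a real config.
import paddle

max_tokens = 2048 + 16384  # max_num_batched_tokens + max_extra_num_batched_tokens
hidden_size = 4096         # placeholder; the real value comes from model_config
target_hidden_states = paddle.full([max_tokens, hidden_size], 0, dtype="bfloat16")
assert tuple(target_hidden_states.shape) == (18432, 4096)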

scripts/extract_mtp_weight_from_safetensor.py

Lines changed: 36 additions & 0 deletions
@@ -17,7 +17,9 @@
 import argparse
 import json
 import os
+import re

+import numpy as np
 import paddle
 from paddleformers.transformers.model_utils import shard_checkpoint
 from paddleformers.utils.env import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME

@@ -46,6 +48,28 @@ def parse_args():
     return parser.parse_args()


+def dtype_byte_size(dtype):
+    """
+    Returns the size (in bytes) occupied by one parameter of type `dtype`.
+
+    Example:
+
+    ```py
+    >>> dtype_byte_size(paddle.float32)
+    4
+    ```
+    """
+    if str(dtype) in {"paddle.bool", "bool"}:
+        return 1 / 8
+    if str(dtype) in {"paddle.float8_e4m3fn", "paddle.float8_e5m2", "float8_e4m3fn", "float8_e5m2"}:
+        return 1
+    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
+    if bit_search is None:
+        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
+    bit_size = int(bit_search.groups()[0])
+    return bit_size // 8
+
+
 def extract_mtp_weights(input_dir: str) -> dict:
     """
     Load all MTP-related weights from safetensors files in input_dir.

@@ -103,6 +127,18 @@ def save_safetensors(state_dict: dict, output_dir: str):
         logger.info(f"Saving shard: {save_path}")
         safe_save_file(shard, save_path, metadata={"format": "np"})

+    # If only one shard is returned, SAFE_WEIGHTS_INDEX_NAME will be null
+    if len(shards) == 1:
+        logger.info("Generate index file for single shard")
+        weight_size = 0
+        for key, weight in shards["model.safetensors"].items():
+            weight_size += np.prod(weight.shape) * dtype_byte_size(weight.dtype)
+
+        index = {
+            "metadata": {"total_size": int(weight_size)},
+            "weight_map": {k: "model.safetensors" for k in shards["model.safetensors"].keys()},
+        }
+
     index_path = os.path.join(output_dir, SAFE_WEIGHTS_INDEX_NAME)
     with open(index_path, "w", encoding="utf-8") as f:
         json.dump(index, f, indent=2)
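
A quick sanity check of the new `dtype_byte_size` helper, assuming it is importable from the script above; the results follow directly from its bit-width parsing:

# Spot-checking dtype_byte_size; results follow from the regex on the dtype
# string (e.g. "paddle.float32" -> 32 bits -> 4 bytes).
import paddle

assert dtype_byte_size(paddle.float32) == 4
assert dtype_byte_size(paddle.bfloat16) == 2
assert dtype_byte_size(paddle.bool) == 1 / 8  # booleans are counted as one bit each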

scripts/run_ci_xpu.sh

Lines changed: 3 additions & 2 deletions
@@ -44,8 +44,9 @@ echo "uninstall org"
 python -m pip uninstall paddlepaddle-xpu -y
 python -m pip uninstall fastdeploy-xpu -y

-python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
-
+# python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
+# Temporarily pin the paddle version because EP parallelism errors out on the nightly build
+python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.3.0.dev20251123-cp310-cp310-linux_x86_64.whl
 echo "build whl"
 bash custom_ops/xpu_ops/download_dependencies.sh develop
 export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import unittest
+
+from partial_json_parser.core.options import Allow
+
+from fastdeploy.entrypoints.openai.tool_parsers import utils
+
+
+class TestPartialJsonUtils(unittest.TestCase):
+    """Unit test suite for partial JSON utility functions."""
+
+    def test_find_common_prefix(self):
+        """Test common prefix detection between two strings."""
+        string1 = '{"fruit": "ap"}'
+        string2 = '{"fruit": "apple"}'
+        self.assertEqual(utils.find_common_prefix(string1, string2), '{"fruit": "ap')
+
+    def test_find_common_suffix(self):
+        """Test common suffix detection between two strings."""
+        string1 = '{"fruit": "ap"}'
+        string2 = '{"fruit": "apple"}'
+        self.assertEqual(utils.find_common_suffix(string1, string2), '"}')
+
+    def test_extract_intermediate_diff(self):
+        """Test extraction of intermediate difference between current and old strings."""
+        old_string = '{"fruit": "ap"}'
+        current_string = '{"fruit": "apple"}'
+        self.assertEqual(utils.extract_intermediate_diff(current_string, old_string), "ple")
+
+    def test_find_all_indices(self):
+        """Test finding all occurrence indices of a substring in a string."""
+        target_string = "banana"
+        substring = "an"
+        self.assertEqual(utils.find_all_indices(target_string, substring), [1, 3])
+
+    def test_partial_json_loads_complete(self):
+        """Test partial_json_loads with a complete JSON string."""
+        input_json = '{"a": 1, "b": 2}'
+        parse_flags = Allow.ALL
+        parsed_obj, parsed_length = utils.partial_json_loads(input_json, parse_flags)
+        self.assertEqual(parsed_obj, {"a": 1, "b": 2})
+        self.assertEqual(parsed_length, len(input_json))
+
+    def test_is_complete_json(self):
+        """Test JSON completeness check."""
+        self.assertTrue(utils.is_complete_json('{"a": 1}'))
+        self.assertFalse(utils.is_complete_json('{"a": 1'))
+
+    def test_consume_space(self):
+        """Test whitespace consumption from the start of a string."""
+        input_string = "   \t\nabc"
+        # 3 spaces + 1 tab + 1 newline = 5 whitespace characters
+        first_non_whitespace_idx = utils.consume_space(0, input_string)
+        self.assertEqual(first_non_whitespace_idx, 5)
+
+
+if __name__ == "__main__":
+    unittest.main()
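
Taken together, these helpers support streaming tool-call parsing: diff a newly extended JSON snapshot against the previous one and emit only the delta. A hedged usage sketch built only from the behaviors the tests above pin down; the streaming framing is illustrative, not FastDeploy's actual parser loop:

# Usage sketch combining the helpers exercised above; values mirror the test
# fixtures.
from fastdeploy.entrypoints.openai.tool_parsers import utils

previous_snapshot = '{"fruit": "ap"}'
current_snapshot = '{"fruit": "apple"}'

if utils.is_complete_json(current_snapshot):
    # Emit only the newly streamed characters: "ple"
    delta = utils.extract_intermediate_diff(current_snapshot, previous_snapshot)
    print(delta)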

0 commit comments
