From 800a83c3391a943111a657e9627f7e6476e2a963 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Wed, 19 Nov 2025 11:41:33 +0800
Subject: [PATCH 01/11] support prompt_token_ids + messages

---
 fastdeploy/entrypoints/openai/protocol.py   |   5 +-
 .../ernie4_5_vl_processor.py                |  12 +-
 .../input/ernie4_5_vl_processor/process.py  | 260 +++++++++++++++++-
 3 files changed, 263 insertions(+), 14 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 4a1e4ef647f..cdcc5cb9d87 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -671,10 +671,7 @@ def to_dict_for_infer(self, request_id=None):
         if request_id is not None:
             req_dict["request_id"] = request_id

-        if "prompt_token_ids" in req_dict:
-            if "messages" in req_dict:
-                del req_dict["messages"]
-        else:
+        if "prompt_token_ids" not in req_dict:
             # If disable_chat_template is set, then the first message in messages will be used as the prompt.
             assert (
                 len(req_dict["messages"]) > 0

diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 77c62125c7a..d9eec5275c2 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -219,7 +219,13 @@ def process_request_dict(self, request, max_model_len=None):
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids

-        if request.get("prompt"):
+        if request.get("prompt_token_ids"):
+            messages = request.get("messages")
+            if messages:
+                self._check_mm_limits(messages)
+            request.setdefault("enable_thinking", True)
+            outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
+        elif request.get("prompt"):
             multimodal_data = request.get("multimodal_data")
             if multimodal_data is None:
                 multimodal_data = {}
@@ -256,7 +262,9 @@ def process_request_dict(self, request, max_model_len=None):
             self.append_completion_tokens(outputs, request["completion_token_ids"])

         outputs = self.pack_outputs(outputs)
-        request["prompt_token_ids"] = outputs["input_ids"].tolist()
+        request["prompt_token_ids"] = (
+            outputs["input_ids"].tolist() if "prompt_token_ids" not in request else request["prompt_token_ids"]
+        )
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
         request["multimodal_inputs"] = outputs

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index 4ccdf287f20..c8816532a8b 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -136,7 +136,9 @@ def __init__(
         self.video_end = self.VID_END
         self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
         self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
+        self.image_end_id = self.tokenizer.convert_tokens_to_ids(self.image_end)
         self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
+        self.video_end_id = self.tokenizer.convert_tokens_to_ids(self.video_end)
         self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token)
         self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token)
@@ -243,14 +245,7 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):

         return outputs

-    def request2ids(
-        self, request: Dict[str, Any], tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat messages into model inputs.
-        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
-        """
-
+    def extract_mm_items(self, request: Dict[str, Any]):
         messages = parse_chat_messages(request.get("messages"))
         mm_items = []
         for msg in messages:
@@ -273,6 +268,7 @@ def request2ids(
         if len(missing_hashes) > 0 and not self.enable_processor_cache:
             raise ValueError("Missing items cannot be retrieved without processor cache.")

+        dealer = None
         if self.enable_processor_cache:
             context = zmq.Context()
             dealer = context.socket(zmq.DEALER)
@@ -295,6 +291,16 @@ def request2ids(
                 video_uuid.append(item["uuid"])
             else:
                 raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
+        return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
+
+    def request2ids(
+        self, request: Dict[str, Any], tgts: List[str] = None
+    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
+        """
+        Convert chat messages into model inputs.
+        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
+        """
+        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)

         if self.tokenizer.chat_template is None:
             raise ValueError("This model does not support chat template.")
@@ -329,6 +335,123 @@ def request2ids(

         return outputs

+    def prompt_token_ids2outputs(
+        self, request: Dict[str, Any], tgts: List[str] = None
+    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
+        outputs = {
+            "input_ids": [],
+            "token_type_ids": [],
+            "position_ids": [],
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "labels": [],
+            "cur_position": 0,
+            "video_cnt": 0,
+            "num_input_image_tokens": 0,
+            "num_input_video_tokens": 0,
+            "mm_positions": [],
+            "mm_hashes": [],
+        }
+        prompt_token_ids = request.get("prompt_token_ids", [])
+        prompt_token_ids_len = len(prompt_token_ids)
+        if not request.get("messages"):
+            outputs["input_ids"].append(prompt_token_ids)
+            outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
+            for i in range(prompt_token_ids_len):
+                outputs["position_ids"].append([i] * 3)
+            outputs["cur_position"] += prompt_token_ids_len
+            return outputs
+        images, videos, image_uuid, video_uuid, dealer = self.extract_mm_items(request)
+        st, image_idx, video_idx = 0, 0, 0
+        mm_id_set = {
+            self.image_start_id,
+            self.image_end_id,
+            self.video_start_id,
+            self.video_end_id,
+            self.image_patch_id,
+        }
+        while st < prompt_token_ids_len:
+            cur_token_id = prompt_token_ids[st]
+            if cur_token_id not in mm_id_set:
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                continue
+            if cur_token_id == self.image_start_id:
+                if image_idx >= len(images):
+                    raise ValueError("prompt token ids has more image placeholder than in messages")
+                # append image_start_id
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                # process placeholder token ids
+                cur_idx = st
+                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_end_id:
+                    cur_idx += 1
+                if cur_idx >= prompt_token_ids_len:
+                    raise ValueError("image token ids not complete")
+                image = images[image_idx]
+                uuid = image_uuid[image_idx] if image_uuid else None
+                if not isinstance(image, tuple):
+                    self._add_image_from_token_ids(image, outputs, uuid, cur_idx - st)
+                else:
+                    self._add_processed_image_from_token_ids(image, outputs, uuid, cur_idx - st)
+                image_idx += 1
+                # append image_end_id
+                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st = cur_idx + 1
+            elif cur_token_id == self.video_start_id:
+                if video_idx >= len(videos):
+                    raise ValueError("prompt token ids has more video placeholder than in messages")
+                # append video_start_id
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                # process placeholder token ids
+                cur_idx = st
+                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.video_end_id:
+                    cur_idx += 1
+                if cur_idx >= prompt_token_ids_len:
+                    raise ValueError("video token ids not complete")
+                video = videos[video_idx]
+                uuid = video_uuid[video_idx] if video_uuid else None
+                if not isinstance(video, tuple):
+                    if isinstance(video, dict):
+                        frames = self._load_and_process_video(video["video"], video)
+                    else:
+                        frames = self._load_and_process_video(video, {})
+                    self._add_video_from_token_ids(frames, outputs, uuid, cur_idx - st)
+                else:
+                    self._add_processed_video_from_token_ids(video, outputs, uuid, cur_idx - st)
+                video_idx += 1
+                # append video_end_id
+                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st = cur_idx + 1
+        if image_idx != len(images):
+            raise ValueError("number of images does not match")
+        if video_idx != len(videos):
+            raise ValueError("number of videos does not match")
+        # for test cases
+        if len(outputs["input_ids"]) != prompt_token_ids_len:
+            raise ValueError("number of token ids does not match")
+        for idx in range(prompt_token_ids_len):
+            if outputs["input_ids"][idx] != prompt_token_ids[idx]:
+                raise ValueError("token ids does not match")
+        return outputs
+
     def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
         token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
         outputs["input_ids"].append(token_id)
@@ -348,6 +471,82 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

+    def _preprocess_raw_image(self, img=None, frames=None):
+        if img is None and frames is None:
+            raise ValueError("image and frames cannot be None at the same time")
+        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
+            img.height if img else frames[0].height,
+            img.width if img else frames[0].width,
+            min_pixels=self.image_min_pixels,
+            max_pixels=self.image_max_pixels,
+        )[1]
+
+        if img:
+            ret = self.image_preprocessor.preprocess(
+                images=[img.convert("RGB")],
+                do_normalize=False,
+                do_rescale=False,
+                predetermined_grid_thw=np.array([[patches_h, patches_w]]),
+                do_convert_rgb=True,
+                input_data_format=ChannelDimension.LAST,
+            )
+        else:
+            pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
+            ret = self.image_preprocessor.preprocess(
+                images=None,
+                videos=pixel_stack,
+                do_normalize=False,
+                do_rescale=False,
+                predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
+                do_convert_rgb=True,
+                input_data_format=ChannelDimension.LAST,
+            )
+        return patches_h, patches_w, ret
+
+    def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
+        patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
+        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
+        if num_tokens != token_len:
+            raise ValueError("image tokens num not match the size")
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
+        outputs["num_input_image_tokens"] += num_tokens
+
+        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
+        outputs["images"].append(ret["pixel_values"])
+        if not uuid:
+            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
+        else:
+            outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(ret["image_grid_thw"])
+        outputs["image_type_ids"].append(0)
+
+    def _add_processed_image_from_token_ids(
+        self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
+    ):
+        img, meta = img_cache
+        num_tokens = img.shape[0] // (self.spatial_conv_size**2)
+        if num_tokens != token_len:
+            raise ValueError("image tokens num not match the size")
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
+
+        _, h, w = meta["thw"]
+        pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
+        outputs["images"].append(img)
+        outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(np.array([[1, h, w]]))
+        outputs["image_type_ids"].append(0)
+
     def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -401,6 +600,29 @@ def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

+    def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
+        patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
+        num_frames = len(frames)
+        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
+        if num_tokens != token_len:
+            raise ValueError("video tokens num not match the size")
+        outputs["images"].append(ret["pixel_values_videos"])
+        if not uuid:
+            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
+        else:
+            outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(ret["video_grid_thw"])
+        outputs["image_type_ids"].extend([1] * num_frames)
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
+        outputs["num_input_video_tokens"] += num_tokens
+
+        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
     def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -438,6 +660,28 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None:
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

+    def _add_processed_video_from_token_ids(
+        self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
+    ):
+        frames, meta = frames_cache
+        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
+        if num_tokens != token_len:
+            raise ValueError("video tokens num not match the size")
+
+        t, h, w = meta["thw"]
+        outputs["images"].append(frames)
+        outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(np.array([[t, h, w]]))
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
+        outputs["image_type_ids"].extend([1] * t)
+
+        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
     def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
         frames, meta = frames_cache
         num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st = cur_idx + 1 elif cur_token_id == self.video_start_id: @@ -414,7 +414,7 @@ def prompt_token_ids2outputs( # append video_start_id outputs["input_ids"].extend([cur_token_id]) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].extend([outputs["cur_position"]] * 3) + outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st += 1 # process placeholder token ids @@ -437,7 +437,7 @@ def prompt_token_ids2outputs( # append video_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].extend([outputs["cur_position"]] * 3) + outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st = cur_idx + 1 if image_idx != len(images): @@ -477,8 +477,8 @@ def _preprocess_raw_image(self, img=None, frames=None): patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height if img else frames[0].height, img.width if img else frames[0].width, - min_pixels=self.image_min_pixels, - max_pixels=self.image_max_pixels, + min_pixels=self.image_min_pixels if img else self.video_min_pixels, + max_pixels=self.image_max_pixels if img else self.video_max_pixels, )[1] if img: From 702b35e446281a8f8c27f80202108c705f81aefd Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Thu, 20 Nov 2025 10:30:45 +0800 Subject: [PATCH 03/11] refact code structure --- .../input/ernie4_5_vl_processor/process.py | 270 +++++++++--------- 1 file changed, 142 insertions(+), 128 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index a92b7b50882..7eb1075bcfa 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -397,10 +397,11 @@ def prompt_token_ids2outputs( raise ValueError("image token ids not complete") image = images[image_idx] uuid = image_uuid[image_idx] if image_uuid else None + token_len = cur_idx - st if not isinstance(image, tuple): - self._add_image_from_token_ids(image, outputs, uuid, cur_idx - st) + self._add_image(image, outputs, uuid, token_len) else: - self._add_processed_image_from_token_ids(image, outputs, uuid, cur_idx - st) + self._add_processed_image(image, outputs, uuid, token_len) image_idx += 1 # append image_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) @@ -425,14 +426,15 @@ def prompt_token_ids2outputs( raise ValueError("video token ids not complete") video = videos[video_idx] uuid = video_uuid[video_idx] if video_uuid else None + token_len = cur_idx - st if not isinstance(video, tuple): if isinstance(video, dict): frames = self._load_and_process_video(video["video"], video) else: frames = self._load_and_process_video(video, {}) - self._add_video_from_token_ids(frames, outputs, uuid, cur_idx - st) + self._add_video(frames, outputs, uuid, token_len) else: - self._add_processed_video_from_token_ids(video, outputs, uuid, cur_idx - st) + self._add_processed_video(video, outputs, uuid, token_len) video_idx += 1 # append video_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) @@ -471,83 +473,83 @@ def _add_text(self, tokens, outputs: Dict) -> None: outputs["position_ids"].append([start + i] * 3) outputs["cur_position"] += len(tokens) - def _preprocess_raw_image(self, img=None, frames=None): - if img is None and frames is None: - raise ValueError("image and 
frames cannot be None at the same time") - patches_h, patches_w = self.image_preprocessor.get_smarted_resize( - img.height if img else frames[0].height, - img.width if img else frames[0].width, - min_pixels=self.image_min_pixels if img else self.video_min_pixels, - max_pixels=self.image_max_pixels if img else self.video_max_pixels, - )[1] - - if img: - ret = self.image_preprocessor.preprocess( - images=[img.convert("RGB")], - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]]), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - else: - pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - ret = self.image_preprocessor.preprocess( - images=None, - videos=pixel_stack, - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - return patches_h, patches_w, ret - - def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int): - patches_h, patches_w, ret = self._preprocess_raw_image(img=img) - num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) - if num_tokens != token_len: - raise ValueError("image tokens num not match the size") - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["image_grid_thw"]) - outputs["image_type_ids"].append(0) - - def _add_processed_image_from_token_ids( - self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int - ): - img, meta = img_cache - num_tokens = img.shape[0] // (self.spatial_conv_size**2) - if num_tokens != token_len: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: + # def _preprocess_raw_image(self, img=None, frames=None): + # if img is None and frames is None: + # raise ValueError("image and frames cannot be None at the same time") + # patches_h, patches_w = self.image_preprocessor.get_smarted_resize( + # img.height if img else frames[0].height, + # img.width if img else frames[0].width, + # min_pixels=self.image_min_pixels if img else self.video_min_pixels, + # max_pixels=self.image_max_pixels if img else self.video_max_pixels, + # )[1] + + # if img: + # ret = 
self.image_preprocessor.preprocess( + # images=[img.convert("RGB")], + # do_normalize=False, + # do_rescale=False, + # predetermined_grid_thw=np.array([[patches_h, patches_w]]), + # do_convert_rgb=True, + # input_data_format=ChannelDimension.LAST, + # ) + # else: + # pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) + # ret = self.image_preprocessor.preprocess( + # images=None, + # videos=pixel_stack, + # do_normalize=False, + # do_rescale=False, + # predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)), + # do_convert_rgb=True, + # input_data_format=ChannelDimension.LAST, + # ) + # return patches_h, patches_w, ret + + # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int): + # patches_h, patches_w, ret = self._preprocess_raw_image(img=img) + # num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) + # if num_tokens != token_len: + # raise ValueError("image tokens num not match the size") + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) + # outputs["num_input_image_tokens"] += num_tokens + + # pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + # outputs["images"].append(ret["pixel_values"]) + # if not uuid: + # outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) + # else: + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(ret["image_grid_thw"]) + # outputs["image_type_ids"].append(0) + + # def _add_processed_image_from_token_ids( + # self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int + # ): + # img, meta = img_cache + # num_tokens = img.shape[0] // (self.spatial_conv_size**2) + # if num_tokens != token_len: + # raise ValueError("image tokens num not match the size") + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) + + # _, h, w = meta["thw"] + # pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + # outputs["images"].append(img) + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(np.array([[1, h, w]])) + # outputs["image_type_ids"].append(0) + + def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None: patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height, img.width, @@ -555,6 +557,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: max_pixels=self.image_max_pixels, )[1] num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) + if token_len and token_len != num_tokens: + raise ValueError("image tokens num not match the size") outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) outputs["input_ids"].extend([self.image_patch_id] * num_tokens) @@ -582,9 +586,13 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: outputs["grid_thw"].append(ret["image_grid_thw"]) outputs["image_type_ids"].append(0) - def _add_processed_image(self, img_cache: 
Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: + def _add_processed_image( + self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None + ) -> None: img, meta = img_cache num_tokens = img.shape[0] // (self.spatial_conv_size**2) + if token_len and num_tokens != token_len: + raise ValueError("image tokens num not match the size") outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) outputs["input_ids"].extend([self.image_patch_id] * num_tokens) @@ -600,30 +608,30 @@ def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict outputs["grid_thw"].append(np.array([[1, h, w]])) outputs["image_type_ids"].append(0) - def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int): - patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames) - num_frames = len(frames) - num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) - if num_tokens != token_len: - raise ValueError("video tokens num not match the size") - outputs["images"].append(ret["pixel_values_videos"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["video_grid_thw"]) - outputs["image_type_ids"].extend([1] * num_frames) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: + # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int): + # patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames) + # num_frames = len(frames) + # num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) + # if num_tokens != token_len: + # raise ValueError("video tokens num not match the size") + # outputs["images"].append(ret["pixel_values_videos"]) + # if not uuid: + # outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"])) + # else: + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(ret["video_grid_thw"]) + # outputs["image_type_ids"].extend([1] * num_frames) + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) + # outputs["num_input_video_tokens"] += num_tokens + + # pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None: patches_h, patches_w = self.image_preprocessor.get_smarted_resize( frames[0].height, frames[0].width, @@ -632,6 +640,8 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: )[1] num_frames = len(frames) num_tokens = (num_frames * patches_h * patches_w) // 
(self.spatial_conv_size**2 * self.temporal_conv_size) + if token_len and num_tokens != token_len: + raise ValueError("video tokens num not match the size") pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) ret = self.image_preprocessor.preprocess( @@ -660,12 +670,34 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: outputs["position_ids"].extend(pos_ids) outputs["cur_position"] = np.max(pos_ids) + 1 - def _add_processed_video_from_token_ids( - self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int - ): + # def _add_processed_video_from_token_ids( + # self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int + # ): + # frames, meta = frames_cache + # num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) + # if num_tokens != token_len: + # raise ValueError("video tokens num not match the size") + + # t, h, w = meta["thw"] + # outputs["images"].append(frames) + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(np.array([[t, h, w]])) + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) + # outputs["image_type_ids"].extend([1] * t) + + # pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + def _add_processed_video( + self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None + ) -> None: frames, meta = frames_cache num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) - if num_tokens != token_len: + if token_len and num_tokens != token_len: raise ValueError("video tokens num not match the size") t, h, w = meta["thw"] @@ -682,24 +714,6 @@ def _add_processed_video_from_token_ids( outputs["position_ids"].extend(pos_ids) outputs["cur_position"] = np.max(pos_ids) + 1 - def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None: input_ids = copy.deepcopy(outputs["input_ids"]) labels = [self.tokenizer.ignored_index] * len(input_ids) From f9de470be75cfe438993d1ebe33cde0548707cac Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Thu, 20 Nov 2025 10:39:40 +0800 Subject: [PATCH 04/11] support cache mm items --- .../input/ernie4_5_vl_processor/process.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index 7eb1075bcfa..cb5a5835ffd 100644 --- 
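PATCH 03 folds the *_from_token_ids helpers back into the regular _add_image /
_add_processed_image / _add_video / _add_processed_video methods (the commented-out copies are
kept here and deleted again in PATCH 05): each method now takes an optional token_len and
validates only when it is given. The check compares the placeholder run found between the
start/end markers (token_len = cur_idx - st) with the count implied by the preprocessor grid.
A rough worked example, assuming spatial_conv_size = 2 and temporal_conv_size = 2 — both are
model-config values, not fixed by this diff:

    patches_h, patches_w = 16, 16
    spatial_conv_size = 2
    image_tokens = (patches_h * patches_w) // spatial_conv_size**2
    # 256 // 4 = 64 placeholder tokens expected for this image

    num_frames, temporal_conv_size = 8, 2
    video_tokens = (num_frames * patches_h * patches_w) // (spatial_conv_size**2 * temporal_conv_size)
    # 2048 // 8 = 256 placeholder tokens expected for this clip

Any other run length now raises instead of silently misaligning token ids and visual features.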
From f9de470be75cfe438993d1ebe33cde0548707cac Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 10:39:40 +0800
Subject: [PATCH 04/11] support cache mm items

---
 .../input/ernie4_5_vl_processor/process.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index 7eb1075bcfa..cb5a5835ffd 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -362,7 +362,7 @@ def prompt_token_ids2outputs(
             outputs["position_ids"].append([i] * 3)
             outputs["cur_position"] += prompt_token_ids_len
             return outputs
-        images, videos, image_uuid, video_uuid, dealer, _, _ = self.extract_mm_items(request)
+        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
         mm_id_set = {
             self.image_start_id,
@@ -452,6 +452,20 @@ def prompt_token_ids2outputs(
         for idx in range(prompt_token_ids_len):
             if outputs["input_ids"][idx] != prompt_token_ids[idx]:
                 raise ValueError("token ids does not match")
+
+        if self.enable_processor_cache:
+            missing_idx = set(missing_idx)
+            hashes_to_cache, items_to_cache = [], []
+            for idx in range(len(mm_items)):
+                if idx in missing_idx:
+                    continue
+                meta = {}
+                t, h, w = outputs["grid_thw"][idx][0]
+                meta["thw"] = (t, h, w)
+                hashes_to_cache.append(outputs["mm_hashes"][idx])
+                items_to_cache.append((outputs["images"][idx], meta))
+            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
+
         return outputs

     def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:

From bd67abae1e04fdb08f9e1c4cf3a9537767cd20ab Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 16:42:07 +0800
Subject: [PATCH 05/11] refactor code structure

---
 .../input/ernie4_5_vl_processor/process.py | 149 +-----------------
 1 file changed, 5 insertions(+), 144 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index cb5a5835ffd..c0038a9fbde 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -364,22 +364,8 @@ def prompt_token_ids2outputs(
             return outputs
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
-        mm_id_set = {
-            self.image_start_id,
-            self.image_end_id,
-            self.video_start_id,
-            self.video_end_id,
-            self.image_patch_id,
-        }
         while st < prompt_token_ids_len:
             cur_token_id = prompt_token_ids[st]
-            if cur_token_id not in mm_id_set:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                continue
             if cur_token_id == self.image_start_id:
                 if image_idx >= len(images):
                     raise ValueError("prompt token ids has more image placeholder than in messages")
@@ -403,12 +389,7 @@ def prompt_token_ids2outputs(
                     self._add_processed_image(image, outputs, uuid, token_len)
                 image_idx += 1
-                # append image_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st = cur_idx
             elif cur_token_id == self.video_start_id:
                 if video_idx >= len(videos):
                     raise ValueError("prompt token ids has more video placeholder than in messages")
@@ -436,12 +417,13 @@ def prompt_token_ids2outputs(
                     self._add_processed_video(video, outputs, uuid, token_len)
                 video_idx += 1
-                # append video_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                st = cur_idx
+            else:
+                outputs["input_ids"].extend([cur_token_id])
                 outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
                 outputs["position_ids"].append([outputs["cur_position"]] * 3)
                 outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st += 1
         if image_idx != len(images):
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
             raise ValueError("number of videos does not match")
@@ -487,82 +469,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

-    # def _preprocess_raw_image(self, img=None, frames=None):
-    #     if img is None and frames is None:
-    #         raise ValueError("image and frames cannot be None at the same time")
-    #     patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-    #         img.height if img else frames[0].height,
-    #         img.width if img else frames[0].width,
-    #         min_pixels=self.image_min_pixels if img else self.video_min_pixels,
-    #         max_pixels=self.image_max_pixels if img else self.video_max_pixels,
-    #     )[1]

-    #     if img:
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=[img.convert("RGB")],
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     else:
-    #         pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=None,
-    #             videos=pixel_stack,
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     return patches_h, patches_w, ret

-    # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
-    #     num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-    #     outputs["num_input_image_tokens"] += num_tokens

-    #     pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

-    #     outputs["images"].append(ret["pixel_values"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["image_grid_thw"])
-    #     outputs["image_type_ids"].append(0)

-    # def _add_processed_image_from_token_ids(
-    #     self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
-    # ):
-    #     img, meta = img_cache
-    #     num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)

-    #     _, h, w = meta["thw"]
-    #     pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

-    #     outputs["images"].append(img)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[1, h, w]]))
-    #     outputs["image_type_ids"].append(0)

     def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -528,29 +434,6 @@ def _add_processed_image(
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

-    # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
-    #     num_frames = len(frames)
-    #     num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-    #     outputs["images"].append(ret["pixel_values_videos"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["video_grid_thw"])
-    #     outputs["image_type_ids"].extend([1] * num_frames)

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["num_input_video_tokens"] += num_tokens

-    #     pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

     def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -567,28 +450,6 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None)
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    # def _add_processed_video_from_token_ids(
-    #     self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
-    # ):
-    #     frames, meta = frames_cache
-    #     num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")

-    #     t, h, w = meta["thw"]
-    #     outputs["images"].append(frames)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[t, h, w]]))

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["image_type_ids"].extend([1] * t)

-    #     pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

     def _add_processed_video(
         self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
     ) -> None:
         frames, meta = frames_cache
         num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
@@ -714,24 +575,6 @@ def _add_processed_video(
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None:
         input_ids = copy.deepcopy(outputs["input_ids"])
         labels = [self.tokenizer.ignored_index] * len(input_ids)
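Besides deleting the commented-out copies and the duplicated _add_processed_video, PATCH 05
simplifies the scan loop: the explicit mm_id_set membership test and the separate end-token
handling are gone. After consuming a placeholder run the loop sets st = cur_idx, so the end
marker is re-examined on the next iteration and falls through to the generic else branch,
which now also covers plain text. A walkthrough with hypothetical ids (1002/1003 standing in
for the image start/end ids, 1001 for the patch placeholder):

    ids = [101, 1002, 1001, 1001, 1003, 102]
    # st=0: 101  -> else branch, emitted as text
    # st=1: 1002 -> image branch: emit the start id, consume the 1001 run (token_len=2), st = 4
    # st=4: 1003 -> no longer special-cased, else branch emits it as text
    # st=5: 102  -> else branch

The per-token bookkeeping (token_type_ids, one position triplet, cur_position) is identical on
both paths, which is what makes the deduplication safe.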
From 0337379be4393fac0160830f8e12531d930f5333 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 17:34:47 +0800
Subject: [PATCH 06/11] delete test cases

---
 fastdeploy/input/ernie4_5_vl_processor/process.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index c0038a9fbde..efbb3452607 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -428,12 +428,6 @@ def prompt_token_ids2outputs(
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
             raise ValueError("number of videos does not match")
-        # for test cases
-        if len(outputs["input_ids"]) != prompt_token_ids_len:
-            raise ValueError("number of token ids does not match")
-        for idx in range(prompt_token_ids_len):
-            if outputs["input_ids"][idx] != prompt_token_ids[idx]:
-                raise ValueError("token ids does not match")

         if self.enable_processor_cache:
             missing_idx = set(missing_idx)

From bcd67a4c618b20cf7abd0f2f2e3b6c792e1ea7f1 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 19:22:09 +0800
Subject: [PATCH 07/11] modify unit test

---
 tests/input/test_ernie_vl_processor.py | 27 +++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index 92d24d5b96f..b9bc22d4cc2 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -77,7 +77,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -93,7 +93,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -101,7 +101,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -111,6 +111,27 @@ def test_process_request_dict_with_options(self):
         self.processor.process_request_dict(request_dict, 100)
         self.assertEqual(request_dict["enable_thinking"], True)

+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"options": {"thinking_mode": "close"}},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+
+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"options": {"thinking_mode": "false"}},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+
+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"enable_thinking": False},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+

 if __name__ == "__main__":
     unittest.main()
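PATCH 07 tracks the behavioral change from PATCH 01: when prompt_token_ids is present,
process_request_dict now calls request.setdefault("enable_thinking", True) before any template
options are consulted, so the expected default in these tests flips from False to True.
setdefault only fills a missing key:

    request = {"prompt_token_ids": [1, 1, 1]}
    request.setdefault("enable_thinking", True)   # key absent -> set to True

    request = {"prompt_token_ids": [1, 1, 1], "enable_thinking": False}
    request.setdefault("enable_thinking", True)   # key present -> stays False

The three appended cases pin down the template-driven path (no prompt_token_ids), where
thinking_mode options and chat_template_kwargs can still disable thinking.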
From 7c0d5c2834a85035759e5540fbc035c387fc857a Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Mon, 24 Nov 2025 14:42:45 +0800
Subject: [PATCH 08/11] add unit test

---
 tests/input/test_ernie_vl_processor.py | 304 +++++++++++++++++++++++++
 1 file changed, 304 insertions(+)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index b9bc22d4cc2..3e0aa670bea 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -1,7 +1,15 @@
 import unittest
 from unittest.mock import MagicMock, patch

+import numpy as np
+
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
+from fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
+    AdaptiveImageProcessor,
+)
+from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
+from fastdeploy.input.utils import IDS_TYPE_FLAG


 class TestErnie4_5_vl_ProcessorProcessResponseDictStreaming(unittest.TestCase):
@@ -133,5 +141,301 @@ def test_process_request_dict_with_options(self):
         self.assertEqual(request_dict["enable_thinking"], False)


+class TestDataProcessorTargetMethods(unittest.TestCase):
+    def setUp(self):
+        self.mock_tokenizer = MagicMock(spec=Ernie4_5Tokenizer)
+        self.mock_tokenizer.ignored_index = -100
+        self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
+        self.mock_tokenizer.chat_template = "mock_template"
+        self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
+
+        def mock_load_tokenizer(dp_instance):
+            dp_instance.tokenizer = self.mock_tokenizer
+
+        with patch.object(DataProcessor, "_load_tokenizer", side_effect=mock_load_tokenizer, autospec=True):
+            with patch.object(AdaptiveImageProcessor, "from_pretrained") as mock_image_preprocessor:
+                mock_image_preprocessor.return_value = MagicMock()
+                self.data_processor = DataProcessor(
+                    tokenizer_name="mock_tokenizer",
+                    image_preprocessor_name="mock_image_preprocessor",
+                    enable_processor_cache=False,
+                )
+        self.data_processor.image_patch_id = 1001
+        self.data_processor.image_start_id = 1002
+        self.data_processor.image_end_id = 1003
+        self.data_processor.video_start_id = 1004
+        self.data_processor.video_end_id = 1005
+        self.data_processor.role_prefixes = {"user": "User: ", "assistant": "Assistant: "}
+        self.data_processor.enable_processor_cache = False
+        self.data_processor.extract_mm_items = MagicMock(return_value=([], [], [], [], None, [], []))
+
+    def _mock_convert_tokens_to_ids(self, token):
+        token_id_map = {
+            "<|begin_of_sentence|>": 101,
+            "<|end_of_sentence|>": 102,
+            "": 103,
+            "<|IMAGE_PLACEHOLDER|>": 1001,
+            "<|IMAGE_START|>": 1002,
+            "<|IMAGE_END|>": 1003,
+            "<|VIDEO_START|>": 1004,
+            "<|VIDEO_END|>": 1005,
+        }
+        return token_id_map.get(token, 999)
+
+    def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
+        test_prompt_token_ids = [101, 999, 998, 997, 102]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+        }
+
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+
+        prompt_len = len(test_prompt_token_ids)
+
+        self.assertEqual(
+            outputs["input_ids"],
+            [test_prompt_token_ids],
+            f"input_ids mismatch: got {outputs['input_ids']}, expected [{test_prompt_token_ids}]",
+        )
+
+        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
+
+        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
+        self.assertEqual(outputs["position_ids"], expected_position_ids)
+
+        self.assertEqual(outputs["cur_position"], prompt_len)
+
+        self.assertEqual(len(outputs["images"]), 0)
+        self.assertEqual(len(outputs["grid_thw"]), 0)
+        self.assertEqual(len(outputs["mm_positions"]), 0)
+        self.assertEqual(len(outputs["mm_hashes"]), 0)
+        self.assertEqual(outputs["video_cnt"], 0)
+        self.assertEqual(outputs["num_input_image_tokens"], 0)
+        self.assertEqual(outputs["num_input_video_tokens"], 0)
+
+    def test_prompt_token_ids2outputs_with_messages_no_mm(self):
+        test_prompt_token_ids = [101, 999, 998, 997, 102]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [{"role": "user", "content": "Hello World"}],
+        }
+
+        self.data_processor.extract_mm_items.return_value = ([], [], [], [], None, [], [])
+
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+
+        prompt_len = len(test_prompt_token_ids)
+
+        self.assertEqual(outputs["input_ids"], test_prompt_token_ids)
+
+        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
+
+        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
+        self.assertEqual(outputs["position_ids"], expected_position_ids)
+
+        self.assertEqual(outputs["cur_position"], prompt_len)
+
+        self.assertEqual(len(outputs["images"]), 0)
+        self.assertEqual(outputs["video_cnt"], 0)
+        self.assertEqual(outputs["num_input_image_tokens"], 0)
+
+    def test_prompt_token_ids2outputs_add_image(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        mock_img = MagicMock()
+        mock_img.height = 224
+        mock_img.width = 224
+        mock_img.convert.return_value = mock_img
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img}],
+        )
+        mock_resize = (None, (2, 4))
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = mock_resize
+        mock_preprocess = {"pixel_values": np.random.randn(1, 16, 16, 3), "image_grid_thw": np.array([[2, 4]])}
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        # self.data_processor._compute_3d_positions = MagicMock(return_value=[[i]*3 for i in range(4)])
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 6)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(outputs["num_input_image_tokens"], 2)
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(len(outputs["mm_hashes"]), 1)
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 1)
+
+    def test_prompt_token_ids2outputs_add_processed_image(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        mock_img_data = np.random.randn(8, 28, 28)
+        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img_cache],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img_cache}],
+        )
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 20)
+        self.assertEqual(outputs["cur_position"], 8)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "img_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 1)
+
+    def test_prompt_token_ids2outputs_add_video(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
+        mock_frame1 = MagicMock()
+        mock_frame1.height = 224
+        mock_frame1.width = 224
+        mock_frame1.convert.return_value = mock_frame1
+        mock_frame2 = MagicMock()
+        mock_frame2.height = 224
+        mock_frame2.width = 224
+        mock_frame2.convert.return_value = mock_frame2
+        frames = [mock_frame1, mock_frame2]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [],
+            [frames],
+            [],
+            ["vid_uuid"],
+            None,
+            [],
+            [{"type": "video", "data": frames}],
+        )
+        self.data_processor._load_and_process_video = MagicMock(return_value=frames)
+        patches_h, patches_w = 4, 4
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
+        mock_preprocess = {
+            "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
+            "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
+        }
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 8)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 2)
+        self.assertEqual(outputs["num_input_video_tokens"], 4)
+
+    def test_prompt_token_ids2outputs_add_processed_video(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
+        t, h, w = 2, 4, 4
+        spatial_conv_size = self.data_processor.spatial_conv_size
+        temporal_conv_size = self.data_processor.temporal_conv_size
+        token_per_frame = (h // spatial_conv_size) * (w // spatial_conv_size)
+        num_tokens = (t // temporal_conv_size) * token_per_frame
+        mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
+        mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [],
+            [mock_frames_cache],
+            [],
+            ["vid_uuid"],
+            None,
+            [],
+            [{"type": "video", "data": mock_frames_cache}],
+        )
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 8)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 2)
+

 if __name__ == "__main__":
     unittest.main()

From e653596b041f0818e1e40e0808c77a4c5ab70b55 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Mon, 24 Nov 2025 16:11:03 +0800
Subject: [PATCH 09/11] add unit test

---
 tests/input/test_ernie_vl_processor.py | 123 +++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index 3e0aa670bea..4b2f43e4e2a 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -436,6 +436,129 @@ def test_prompt_token_ids2outputs_add_processed_video(self):
         self.assertEqual(len(outputs["grid_thw"]), 1)
         self.assertEqual(len(outputs["image_type_ids"]), 2)

+    def test_prompt_token_ids2outputs_add_image_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1001, 1003, 102]
+        mock_img = MagicMock()
+        mock_img.height = 224
+        mock_img.width = 224
+        mock_img.convert.return_value = mock_img
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img}],
+        )
+        patches_h, patches_w = 8, 8
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
+        mock_preprocess = {
+            "pixel_values": np.random.randn(1, patches_h, patches_w, 3),
+            "image_grid_thw": np.array([[patches_h, patches_w]]),
+        }
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        with self.assertRaises(ValueError) as ctx:
+            self.data_processor.prompt_token_ids2outputs(request)
+        self.assertIn("image tokens num not match the size", str(ctx.exception))
+
+    def test_prompt_token_ids2outputs_add_processed_image_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        spatial_conv_size = self.data_processor.spatial_conv_size
+        num_tokens = 4
+        mock_img_data = np.random.randn(num_tokens * (spatial_conv_size**2), 28, 28)
+        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img_cache],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img_cache}],
+        )
+        with self.assertRaises(ValueError) as ctx:
+            self.data_processor.prompt_token_ids2outputs(request)
+        self.assertIn("image tokens num not match the size", str(ctx.exception))
+
+    def test_prompt_token_ids2outputs_add_video_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001,
1005, 102] + mock_frame1 = MagicMock() + mock_frame1.height = 224 + mock_frame1.width = 224 + mock_frame1.convert.return_value = mock_frame1 + mock_frame2 = MagicMock() + mock_frame2.height = 224 + mock_frame2.width = 224 + mock_frame2.convert.return_value = mock_frame2 + frames = [mock_frame1, mock_frame2] + request = { + "prompt_token_ids": test_prompt_token_ids, + "messages": [ + {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]} + ], + } + self.data_processor.extract_mm_items.return_value = ( + [], + [frames], + [], + ["vid_uuid"], + None, + [], + [{"type": "video", "data": frames}], + ) + self.data_processor._load_and_process_video = MagicMock(return_value=frames) + patches_h, patches_w = 8, 8 + self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w)) + mock_preprocess = { + "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3), + "video_grid_thw": np.array([[patches_h, patches_w]] * 2), + } + self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess + with self.assertRaises(ValueError) as ctx: + self.data_processor.prompt_token_ids2outputs(request) + self.assertIn("video tokens num not match the size", str(ctx.exception)) + + def test_prompt_token_ids2outputs_add_processed_video_token_len_mismatch(self): + test_prompt_token_ids = [101, 1004, 1001, 1005, 102] + t, h, w = 2, 8, 8 + spatial_conv_size = self.data_processor.spatial_conv_size + temporal_conv_size = self.data_processor.temporal_conv_size + + num_tokens = 4 + mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28) + mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)}) + request = { + "prompt_token_ids": test_prompt_token_ids, + "messages": [ + {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]} + ], + } + self.data_processor.extract_mm_items.return_value = ( + [], + [mock_frames_cache], + [], + ["vid_uuid"], + None, + [], + [{"type": "video", "data": mock_frames_cache}], + ) + with self.assertRaises(ValueError) as ctx: + self.data_processor.prompt_token_ids2outputs(request) + self.assertIn("video tokens num not match the size", str(ctx.exception)) + if __name__ == "__main__": unittest.main() From d93b21281ae71bffbada85f2a5958e0fb5be267f Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Tue, 25 Nov 2025 11:44:14 +0800 Subject: [PATCH 10/11] fix append --- fastdeploy/input/ernie4_5_vl_processor/process.py | 2 +- tests/input/test_ernie_vl_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index efbb3452607..3da2bfcb97f 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -356,7 +356,7 @@ def prompt_token_ids2outputs( prompt_token_ids = request.get("prompt_token_ids", []) prompt_token_ids_len = len(prompt_token_ids) if not request.get("messages"): - outputs["input_ids"].append(prompt_token_ids) + outputs["input_ids"].extend(prompt_token_ids) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len) for i in range(prompt_token_ids_len): outputs["position_ids"].append([i] * 3) diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py index 4b2f43e4e2a..ee4d0f195f8 100644 --- a/tests/input/test_ernie_vl_processor.py +++ b/tests/input/test_ernie_vl_processor.py @@ 
-194,7 +194,7 @@ def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
 
         self.assertEqual(
             outputs["input_ids"],
-            [test_prompt_token_ids],
+            test_prompt_token_ids,
             f"input_ids mismatch: actual {outputs['input_ids']}, expected {test_prompt_token_ids}",
         )
 

From 8fed8b4e2a4c2c8b1e7f0fae4a85b805a22b88ff Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Tue, 25 Nov 2025 20:51:07 +0800
Subject: [PATCH 11/11] add check for messages

---
 fastdeploy/entrypoints/openai/protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index cdcc5cb9d87..339164fca1d 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -671,7 +671,7 @@ def to_dict_for_infer(self, request_id=None):
         if request_id is not None:
             req_dict["request_id"] = request_id
 
-        if "prompt_token_ids" not in req_dict:
+        if "prompt_token_ids" not in req_dict or not req_dict["prompt_token_ids"]:
             # If disable_chat_template is set, then the first message in messages will be used as the prompt.
             assert (
                 len(req_dict["messages"]) > 0