From 800a83c3391a943111a657e9627f7e6476e2a963 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Wed, 19 Nov 2025 11:41:33 +0800
Subject: [PATCH 01/11] support prompt_token_ids + messages

---
 fastdeploy/entrypoints/openai/protocol.py   |   5 +-
 .../ernie4_5_vl_processor.py                |  12 +-
 .../input/ernie4_5_vl_processor/process.py  | 260 +++++++++++++++++-
 3 files changed, 263 insertions(+), 14 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 4a1e4ef647f..cdcc5cb9d87 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -671,10 +671,7 @@ def to_dict_for_infer(self, request_id=None):
         if request_id is not None:
             req_dict["request_id"] = request_id

-        if "prompt_token_ids" in req_dict:
-            if "messages" in req_dict:
-                del req_dict["messages"]
-        else:
+        if "prompt_token_ids" not in req_dict:
             # If disable_chat_template is set, then the first message in messages will be used as the prompt.
             assert (
                 len(req_dict["messages"]) > 0

diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 77c62125c7a..d9eec5275c2 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -219,7 +219,13 @@ def process_request_dict(self, request, max_model_len=None):
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids

-        if request.get("prompt"):
+        if request.get("prompt_token_ids"):
+            messages = request.get("messages")
+            if messages:
+                self._check_mm_limits(messages)
+            request.setdefault("enable_thinking", True)
+            outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
+        elif request.get("prompt"):
             multimodal_data = request.get("multimodal_data")
             if multimodal_data is None:
                 multimodal_data = {}
@@ -256,7 +262,9 @@ def process_request_dict(self, request, max_model_len=None):
             self.append_completion_tokens(outputs, request["completion_token_ids"])

         outputs = self.pack_outputs(outputs)
-        request["prompt_token_ids"] = outputs["input_ids"].tolist()
+        request["prompt_token_ids"] = (
+            outputs["input_ids"].tolist() if "prompt_token_ids" not in request else request["prompt_token_ids"]
+        )
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
         request["multimodal_inputs"] = outputs

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index 4ccdf287f20..c8816532a8b 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -136,7 +136,9 @@ def __init__(
         self.video_end = self.VID_END
         self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
         self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
+        self.image_end_id = self.tokenizer.convert_tokens_to_ids(self.image_end)
         self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
+        self.video_end_id = self.tokenizer.convert_tokens_to_ids(self.video_end)
         self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token)
         self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token)
@@ -243,14 +245,7 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):

         return outputs

-    def request2ids(
-        self, request: Dict[str, Any], tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat messages into model inputs.
-        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
-        """
-
+    def extract_mm_items(self, request: Dict[str, Any]):
         messages = parse_chat_messages(request.get("messages"))
         mm_items = []
         for msg in messages:
@@ -273,6 +268,7 @@ def request2ids(
         if len(missing_hashes) > 0 and not self.enable_processor_cache:
             raise ValueError("Missing items cannot be retrieved without processor cache.")

+        dealer = None
         if self.enable_processor_cache:
             context = zmq.Context()
             dealer = context.socket(zmq.DEALER)
@@ -295,6 +291,16 @@ def request2ids(
                 video_uuid.append(item["uuid"])
             else:
                 raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
+        return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
+
+    def request2ids(
+        self, request: Dict[str, Any], tgts: List[str] = None
+    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
+        """
+        Convert chat messages into model inputs.
+        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
+        """
+        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)

         if self.tokenizer.chat_template is None:
             raise ValueError("This model does not support chat template.")
@@ -329,6 +335,123 @@ def request2ids(

         return outputs

+    def prompt_token_ids2outputs(
+        self, request: Dict[str, Any], tgts: List[str] = None
+    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
+        outputs = {
+            "input_ids": [],
+            "token_type_ids": [],
+            "position_ids": [],
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "labels": [],
+            "cur_position": 0,
+            "video_cnt": 0,
+            "num_input_image_tokens": 0,
+            "num_input_video_tokens": 0,
+            "mm_positions": [],
+            "mm_hashes": [],
+        }
+        prompt_token_ids = request.get("prompt_token_ids", [])
+        prompt_token_ids_len = len(prompt_token_ids)
+        if not request.get("messages"):
+            outputs["input_ids"].append(prompt_token_ids)
+            outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
+            for i in range(prompt_token_ids_len):
+                outputs["position_ids"].append([i] * 3)
+            outputs["cur_position"] += prompt_token_ids_len
+            return outputs
+        images, videos, image_uuid, video_uuid, dealer = self.extract_mm_items(request)
+        st, image_idx, video_idx = 0, 0, 0
+        mm_id_set = {
+            self.image_start_id,
+            self.image_end_id,
+            self.video_start_id,
+            self.video_end_id,
+            self.image_patch_id,
+        }
+        while st < prompt_token_ids_len:
+            cur_token_id = prompt_token_ids[st]
+            if cur_token_id not in mm_id_set:
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                continue
+            if cur_token_id == self.image_start_id:
+                if image_idx >= len(images):
+                    raise ValueError("prompt token ids has more image placeholder than in messages")
+                # append image_start_id
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                # process placeholder token ids
+                cur_idx = st
+                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_end_id:
+                    cur_idx += 1
+                if cur_idx >= prompt_token_ids_len:
+                    raise ValueError("image token ids not complete")
+                image = images[image_idx]
+                uuid = image_uuid[image_idx] if image_uuid else None
+                if not isinstance(image, tuple):
+                    self._add_image_from_token_ids(image, outputs, uuid, cur_idx - st)
+                else:
+                    self._add_processed_image_from_token_ids(image, outputs, uuid, cur_idx - st)
+                image_idx += 1
+                # append image_end_id
+                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st = cur_idx + 1
+            elif cur_token_id == self.video_start_id:
+                if video_idx >= len(videos):
+                    raise ValueError("prompt token ids has more video placeholder than in messages")
+                # append video_start_id
+                outputs["input_ids"].extend([cur_token_id])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st += 1
+                # process placeholder token ids
+                cur_idx = st
+                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.video_end_id:
+                    cur_idx += 1
+                if cur_idx >= prompt_token_ids_len:
+                    raise ValueError("video token ids not complete")
+                video = videos[video_idx]
+                uuid = video_uuid[video_idx] if video_uuid else None
+                if not isinstance(video, tuple):
+                    if isinstance(video, dict):
+                        frames = self._load_and_process_video(video["video"], video)
+                    else:
+                        frames = self._load_and_process_video(video, {})
+                    self._add_video_from_token_ids(frames, outputs, uuid, cur_idx - st)
+                else:
+                    self._add_processed_video_from_token_ids(video, outputs, uuid, cur_idx - st)
+                video_idx += 1
+                # append video_end_id
+                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
+                outputs["position_ids"].extend([outputs["cur_position"]] * 3)
+                outputs["cur_position"] += 1
+                st = cur_idx + 1
+        if image_idx != len(images):
+            raise ValueError("number of images does not match")
+        if video_idx != len(videos):
+            raise ValueError("number of videos does not match")
+        # for test cases
+        if len(outputs["input_ids"]) != prompt_token_ids_len:
+            raise ValueError("number of token ids does not match")
+        for idx in range(prompt_token_ids_len):
+            if outputs["input_ids"][idx] != prompt_token_ids[idx]:
+                raise ValueError("token ids does not match")
+        return outputs
+
     def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
         token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
         outputs["input_ids"].append(token_id)
@@ -348,6 +471,82 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

+    def _preprocess_raw_image(self, img=None, frames=None):
+        if img is None and frames is None:
+            raise ValueError("image and frames cannot be None at the same time")
+        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
+            img.height if img else frames[0].height,
+            img.width if img else frames[0].width,
+            min_pixels=self.image_min_pixels,
+            max_pixels=self.image_max_pixels,
+        )[1]
+
+        if img:
+            ret = self.image_preprocessor.preprocess(
+                images=[img.convert("RGB")],
+                do_normalize=False,
+                do_rescale=False,
+                predetermined_grid_thw=np.array([[patches_h, patches_w]]),
+                do_convert_rgb=True,
+                input_data_format=ChannelDimension.LAST,
+            )
+        else:
+            pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
+            ret = self.image_preprocessor.preprocess(
+                images=None,
+                videos=pixel_stack,
+                do_normalize=False,
+                do_rescale=False,
+                predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
+                do_convert_rgb=True,
+                input_data_format=ChannelDimension.LAST,
+            )
+        return patches_h, patches_w, ret
+
+    def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
+        patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
+        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
+        if num_tokens != token_len:
+            raise ValueError("image tokens num not match the size")
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
+        outputs["num_input_image_tokens"] += num_tokens
+
+        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
+        outputs["images"].append(ret["pixel_values"])
+        if not uuid:
+            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
+        else:
+            outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(ret["image_grid_thw"])
+        outputs["image_type_ids"].append(0)
+
+    def _add_processed_image_from_token_ids(
+        self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
+    ):
+        img, meta = img_cache
+        num_tokens = img.shape[0] // (self.spatial_conv_size**2)
+        if num_tokens != token_len:
+            raise ValueError("image tokens num not match the size")
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
+
+        _, h, w = meta["thw"]
+        pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
+        outputs["images"].append(img)
+        outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(np.array([[1, h, w]]))
+        outputs["image_type_ids"].append(0)
+
     def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -401,6 +600,29 @@ def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

+    def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
+        patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
+        num_frames = len(frames)
+        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
+        if num_tokens != token_len:
+            raise ValueError("video tokens num not match the size")
+        outputs["images"].append(ret["pixel_values_videos"])
+        if not uuid:
+            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
+        else:
+            outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(ret["video_grid_thw"])
+        outputs["image_type_ids"].extend([1] * num_frames)
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
+        outputs["num_input_video_tokens"] += num_tokens
+
+        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
     def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -438,6 +660,28 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None:
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

+    def _add_processed_video_from_token_ids(
+        self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
+    ):
+        frames, meta = frames_cache
+        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
+        if num_tokens != token_len:
+            raise ValueError("video tokens num not match the size")
+
+        t, h, w = meta["thw"]
+        outputs["images"].append(frames)
+        outputs["mm_hashes"].append(uuid)
+        outputs["grid_thw"].append(np.array([[t, h, w]]))
+
+        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
+        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
+        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
+        outputs["image_type_ids"].extend([1] * t)
+
+        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
+        outputs["position_ids"].extend(pos_ids)
+        outputs["cur_position"] = np.max(pos_ids) + 1
+
     def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
         frames, meta = frames_cache
         num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st = cur_idx + 1 elif cur_token_id == self.video_start_id: @@ -414,7 +414,7 @@ def prompt_token_ids2outputs( # append video_start_id outputs["input_ids"].extend([cur_token_id]) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].extend([outputs["cur_position"]] * 3) + outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st += 1 # process placeholder token ids @@ -437,7 +437,7 @@ def prompt_token_ids2outputs( # append video_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].extend([outputs["cur_position"]] * 3) + outputs["position_ids"].append([outputs["cur_position"]] * 3) outputs["cur_position"] += 1 st = cur_idx + 1 if image_idx != len(images): @@ -477,8 +477,8 @@ def _preprocess_raw_image(self, img=None, frames=None): patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height if img else frames[0].height, img.width if img else frames[0].width, - min_pixels=self.image_min_pixels, - max_pixels=self.image_max_pixels, + min_pixels=self.image_min_pixels if img else self.video_min_pixels, + max_pixels=self.image_max_pixels if img else self.video_max_pixels, )[1] if img: From 702b35e446281a8f8c27f80202108c705f81aefd Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Thu, 20 Nov 2025 10:30:45 +0800 Subject: [PATCH 03/11] refact code structure --- .../input/ernie4_5_vl_processor/process.py | 270 +++++++++--------- 1 file changed, 142 insertions(+), 128 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index a92b7b50882..7eb1075bcfa 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -397,10 +397,11 @@ def prompt_token_ids2outputs( raise ValueError("image token ids not complete") image = images[image_idx] uuid = image_uuid[image_idx] if image_uuid else None + token_len = cur_idx - st if not isinstance(image, tuple): - self._add_image_from_token_ids(image, outputs, uuid, cur_idx - st) + self._add_image(image, outputs, uuid, token_len) else: - self._add_processed_image_from_token_ids(image, outputs, uuid, cur_idx - st) + self._add_processed_image(image, outputs, uuid, token_len) image_idx += 1 # append image_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) @@ -425,14 +426,15 @@ def prompt_token_ids2outputs( raise ValueError("video token ids not complete") video = videos[video_idx] uuid = video_uuid[video_idx] if video_uuid else None + token_len = cur_idx - st if not isinstance(video, tuple): if isinstance(video, dict): frames = self._load_and_process_video(video["video"], video) else: frames = self._load_and_process_video(video, {}) - self._add_video_from_token_ids(frames, outputs, uuid, cur_idx - st) + self._add_video(frames, outputs, uuid, token_len) else: - self._add_processed_video_from_token_ids(video, outputs, uuid, cur_idx - st) + self._add_processed_video(video, outputs, uuid, token_len) video_idx += 1 # append video_end_id outputs["input_ids"].extend([prompt_token_ids[cur_idx]]) @@ -471,83 +473,83 @@ def _add_text(self, tokens, outputs: Dict) -> None: outputs["position_ids"].append([start + i] * 3) outputs["cur_position"] += len(tokens) - def _preprocess_raw_image(self, img=None, frames=None): - if img is None and frames is None: - raise ValueError("image and 
frames cannot be None at the same time") - patches_h, patches_w = self.image_preprocessor.get_smarted_resize( - img.height if img else frames[0].height, - img.width if img else frames[0].width, - min_pixels=self.image_min_pixels if img else self.video_min_pixels, - max_pixels=self.image_max_pixels if img else self.video_max_pixels, - )[1] - - if img: - ret = self.image_preprocessor.preprocess( - images=[img.convert("RGB")], - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]]), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - else: - pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - ret = self.image_preprocessor.preprocess( - images=None, - videos=pixel_stack, - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - return patches_h, patches_w, ret - - def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int): - patches_h, patches_w, ret = self._preprocess_raw_image(img=img) - num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) - if num_tokens != token_len: - raise ValueError("image tokens num not match the size") - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["image_grid_thw"]) - outputs["image_type_ids"].append(0) - - def _add_processed_image_from_token_ids( - self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int - ): - img, meta = img_cache - num_tokens = img.shape[0] // (self.spatial_conv_size**2) - if num_tokens != token_len: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: + # def _preprocess_raw_image(self, img=None, frames=None): + # if img is None and frames is None: + # raise ValueError("image and frames cannot be None at the same time") + # patches_h, patches_w = self.image_preprocessor.get_smarted_resize( + # img.height if img else frames[0].height, + # img.width if img else frames[0].width, + # min_pixels=self.image_min_pixels if img else self.video_min_pixels, + # max_pixels=self.image_max_pixels if img else self.video_max_pixels, + # )[1] + + # if img: + # ret = 
self.image_preprocessor.preprocess( + # images=[img.convert("RGB")], + # do_normalize=False, + # do_rescale=False, + # predetermined_grid_thw=np.array([[patches_h, patches_w]]), + # do_convert_rgb=True, + # input_data_format=ChannelDimension.LAST, + # ) + # else: + # pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) + # ret = self.image_preprocessor.preprocess( + # images=None, + # videos=pixel_stack, + # do_normalize=False, + # do_rescale=False, + # predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)), + # do_convert_rgb=True, + # input_data_format=ChannelDimension.LAST, + # ) + # return patches_h, patches_w, ret + + # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int): + # patches_h, patches_w, ret = self._preprocess_raw_image(img=img) + # num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) + # if num_tokens != token_len: + # raise ValueError("image tokens num not match the size") + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) + # outputs["num_input_image_tokens"] += num_tokens + + # pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + # outputs["images"].append(ret["pixel_values"]) + # if not uuid: + # outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) + # else: + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(ret["image_grid_thw"]) + # outputs["image_type_ids"].append(0) + + # def _add_processed_image_from_token_ids( + # self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int + # ): + # img, meta = img_cache + # num_tokens = img.shape[0] // (self.spatial_conv_size**2) + # if num_tokens != token_len: + # raise ValueError("image tokens num not match the size") + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) + + # _, h, w = meta["thw"] + # pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + # outputs["images"].append(img) + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(np.array([[1, h, w]])) + # outputs["image_type_ids"].append(0) + + def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None: patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height, img.width, @@ -555,6 +557,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: max_pixels=self.image_max_pixels, )[1] num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) + if token_len and token_len != num_tokens: + raise ValueError("image tokens num not match the size") outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) outputs["input_ids"].extend([self.image_patch_id] * num_tokens) @@ -582,9 +586,13 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: outputs["grid_thw"].append(ret["image_grid_thw"]) outputs["image_type_ids"].append(0) - def _add_processed_image(self, img_cache: 
Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: + def _add_processed_image( + self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None + ) -> None: img, meta = img_cache num_tokens = img.shape[0] // (self.spatial_conv_size**2) + if token_len and num_tokens != token_len: + raise ValueError("image tokens num not match the size") outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) outputs["input_ids"].extend([self.image_patch_id] * num_tokens) @@ -600,30 +608,30 @@ def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict outputs["grid_thw"].append(np.array([[1, h, w]])) outputs["image_type_ids"].append(0) - def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int): - patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames) - num_frames = len(frames) - num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) - if num_tokens != token_len: - raise ValueError("video tokens num not match the size") - outputs["images"].append(ret["pixel_values_videos"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["video_grid_thw"]) - outputs["image_type_ids"].extend([1] * num_frames) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: + # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int): + # patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames) + # num_frames = len(frames) + # num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) + # if num_tokens != token_len: + # raise ValueError("video tokens num not match the size") + # outputs["images"].append(ret["pixel_values_videos"]) + # if not uuid: + # outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"])) + # else: + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(ret["video_grid_thw"]) + # outputs["image_type_ids"].extend([1] * num_frames) + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) + # outputs["num_input_video_tokens"] += num_tokens + + # pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None: patches_h, patches_w = self.image_preprocessor.get_smarted_resize( frames[0].height, frames[0].width, @@ -632,6 +640,8 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: )[1] num_frames = len(frames) num_tokens = (num_frames * patches_h * patches_w) // 
(self.spatial_conv_size**2 * self.temporal_conv_size) + if token_len and num_tokens != token_len: + raise ValueError("video tokens num not match the size") pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) ret = self.image_preprocessor.preprocess( @@ -660,12 +670,34 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str]) -> None: outputs["position_ids"].extend(pos_ids) outputs["cur_position"] = np.max(pos_ids) + 1 - def _add_processed_video_from_token_ids( - self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int - ): + # def _add_processed_video_from_token_ids( + # self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int + # ): + # frames, meta = frames_cache + # num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) + # if num_tokens != token_len: + # raise ValueError("video tokens num not match the size") + + # t, h, w = meta["thw"] + # outputs["images"].append(frames) + # outputs["mm_hashes"].append(uuid) + # outputs["grid_thw"].append(np.array([[t, h, w]])) + + # outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) + # outputs["input_ids"].extend([self.image_patch_id] * num_tokens) + # outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) + # outputs["image_type_ids"].extend([1] * t) + + # pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"]) + # outputs["position_ids"].extend(pos_ids) + # outputs["cur_position"] = np.max(pos_ids) + 1 + + def _add_processed_video( + self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None + ) -> None: frames, meta = frames_cache num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) - if num_tokens != token_len: + if token_len and num_tokens != token_len: raise ValueError("video tokens num not match the size") t, h, w = meta["thw"] @@ -682,24 +714,6 @@ def _add_processed_video_from_token_ids( outputs["position_ids"].extend(pos_ids) outputs["cur_position"] = np.max(pos_ids) + 1 - def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None: input_ids = copy.deepcopy(outputs["input_ids"]) labels = [self.tokenizer.ignored_index] * len(input_ids) From f9de470be75cfe438993d1ebe33cde0548707cac Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Thu, 20 Nov 2025 10:39:40 +0800 Subject: [PATCH 04/11] support cache mm items --- .../input/ernie4_5_vl_processor/process.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index 7eb1075bcfa..cb5a5835ffd 100644 --- 
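PATCH 03 folds the *_from_token_ids helpers back into the regular _add_image /
_add_processed_image / _add_video / _add_processed_video methods (the commented-out copies are
kept here and deleted again in PATCH 05): each method now takes an optional token_len and
validates only when it is given. The check compares the placeholder run found between the
start/end markers (token_len = cur_idx - st) with the count implied by the preprocessor grid.
A rough worked example, assuming spatial_conv_size = 2 and temporal_conv_size = 2 — both are
model-config values, not fixed by this diff:

    patches_h, patches_w = 16, 16
    spatial_conv_size = 2
    image_tokens = (patches_h * patches_w) // spatial_conv_size**2
    # 256 // 4 = 64 placeholder tokens expected for this image

    num_frames, temporal_conv_size = 8, 2
    video_tokens = (num_frames * patches_h * patches_w) // (spatial_conv_size**2 * temporal_conv_size)
    # 2048 // 8 = 256 placeholder tokens expected for this clip

Any other run length now raises instead of silently misaligning token ids and visual features.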
From f9de470be75cfe438993d1ebe33cde0548707cac Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 10:39:40 +0800
Subject: [PATCH 04/11] support cache mm items

---
 .../input/ernie4_5_vl_processor/process.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index 7eb1075bcfa..cb5a5835ffd 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -362,7 +362,7 @@ def prompt_token_ids2outputs(
             outputs["position_ids"].append([i] * 3)
             outputs["cur_position"] += prompt_token_ids_len
             return outputs
-        images, videos, image_uuid, video_uuid, dealer, _, _ = self.extract_mm_items(request)
+        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
         mm_id_set = {
             self.image_start_id,
@@ -452,6 +452,20 @@ def prompt_token_ids2outputs(
         for idx in range(prompt_token_ids_len):
             if outputs["input_ids"][idx] != prompt_token_ids[idx]:
                 raise ValueError("token ids does not match")
+
+        if self.enable_processor_cache:
+            missing_idx = set(missing_idx)
+            hashes_to_cache, items_to_cache = [], []
+            for idx in range(len(mm_items)):
+                if idx in missing_idx:
+                    continue
+                meta = {}
+                t, h, w = outputs["grid_thw"][idx][0]
+                meta["thw"] = (t, h, w)
+                hashes_to_cache.append(outputs["mm_hashes"][idx])
+                items_to_cache.append((outputs["images"][idx], meta))
+            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
+
         return outputs

     def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:

From bd67abae1e04fdb08f9e1c4cf3a9537767cd20ab Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 16:42:07 +0800
Subject: [PATCH 05/11] refactor code structure

---
 .../input/ernie4_5_vl_processor/process.py | 149 +-----------------
 1 file changed, 5 insertions(+), 144 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index cb5a5835ffd..c0038a9fbde 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -364,22 +364,8 @@ def prompt_token_ids2outputs(
             return outputs
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
-        mm_id_set = {
-            self.image_start_id,
-            self.image_end_id,
-            self.video_start_id,
-            self.video_end_id,
-            self.image_patch_id,
-        }
         while st < prompt_token_ids_len:
             cur_token_id = prompt_token_ids[st]
-            if cur_token_id not in mm_id_set:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                continue
             if cur_token_id == self.image_start_id:
                 if image_idx >= len(images):
                     raise ValueError("prompt token ids has more image placeholder than in messages")
@@ -403,12 +389,7 @@ def prompt_token_ids2outputs(
                     self._add_processed_image(image, outputs, uuid, token_len)
                 image_idx += 1
-                # append image_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st = cur_idx
             elif cur_token_id == self.video_start_id:
                 if video_idx >= len(videos):
                     raise ValueError("prompt token ids has more video placeholder than in messages")
@@ -436,12 +417,13 @@ def prompt_token_ids2outputs(
                     self._add_processed_video(video, outputs, uuid, token_len)
                 video_idx += 1
-                # append video_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                st = cur_idx
+            else:
+                outputs["input_ids"].extend([cur_token_id])
                 outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
                 outputs["position_ids"].append([outputs["cur_position"]] * 3)
                 outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st += 1
         if image_idx != len(images):
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
             raise ValueError("number of videos does not match")
@@ -487,82 +469,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

-    # def _preprocess_raw_image(self, img=None, frames=None):
-    #     if img is None and frames is None:
-    #         raise ValueError("image and frames cannot be None at the same time")
-    #     patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-    #         img.height if img else frames[0].height,
-    #         img.width if img else frames[0].width,
-    #         min_pixels=self.image_min_pixels if img else self.video_min_pixels,
-    #         max_pixels=self.image_max_pixels if img else self.video_max_pixels,
-    #     )[1]

-    #     if img:
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=[img.convert("RGB")],
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     else:
-    #         pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=None,
-    #             videos=pixel_stack,
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     return patches_h, patches_w, ret

-    # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
-    #     num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-    #     outputs["num_input_image_tokens"] += num_tokens

-    #     pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

-    #     outputs["images"].append(ret["pixel_values"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["image_grid_thw"])
-    #     outputs["image_type_ids"].append(0)

-    # def _add_processed_image_from_token_ids(
-    #     self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
-    # ):
-    #     img, meta = img_cache
-    #     num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)

-    #     _, h, w = meta["thw"]
-    #     pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

-    #     outputs["images"].append(img)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[1, h, w]]))
-    #     outputs["image_type_ids"].append(0)

     def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -528,29 +434,6 @@ def _add_processed_image(
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

-    # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
-    #     num_frames = len(frames)
-    #     num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-    #     outputs["images"].append(ret["pixel_values_videos"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["video_grid_thw"])
-    #     outputs["image_type_ids"].extend([1] * num_frames)

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["num_input_video_tokens"] += num_tokens

-    #     pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

     def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -567,28 +450,6 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None)
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    # def _add_processed_video_from_token_ids(
-    #     self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
-    # ):
-    #     frames, meta = frames_cache
-    #     num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")

-    #     t, h, w = meta["thw"]
-    #     outputs["images"].append(frames)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[t, h, w]]))

-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["image_type_ids"].extend([1] * t)

-    #     pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1

     def _add_processed_video(
         self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
     ) -> None:
         frames, meta = frames_cache
         num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
@@ -714,24 +575,6 @@ def _add_processed_video(
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None:
         input_ids = copy.deepcopy(outputs["input_ids"])
         labels = [self.tokenizer.ignored_index] * len(input_ids)
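Besides deleting the commented-out copies and the duplicated _add_processed_video, PATCH 05
simplifies the scan loop: the explicit mm_id_set membership test and the separate end-token
handling are gone. After consuming a placeholder run the loop sets st = cur_idx, so the end
marker is re-examined on the next iteration and falls through to the generic else branch,
which now also covers plain text. A walkthrough with hypothetical ids (1002/1003 standing in
for the image start/end ids, 1001 for the patch placeholder):

    ids = [101, 1002, 1001, 1001, 1003, 102]
    # st=0: 101  -> else branch, emitted as text
    # st=1: 1002 -> image branch: emit the start id, consume the 1001 run (token_len=2), st = 4
    # st=4: 1003 -> no longer special-cased, else branch emits it as text
    # st=5: 102  -> else branch

The per-token bookkeeping (token_type_ids, one position triplet, cur_position) is identical on
both paths, which is what makes the deduplication safe.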
From 0337379be4393fac0160830f8e12531d930f5333 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 17:34:47 +0800
Subject: [PATCH 06/11] delete test cases

---
 fastdeploy/input/ernie4_5_vl_processor/process.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index c0038a9fbde..efbb3452607 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -428,12 +428,6 @@ def prompt_token_ids2outputs(
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
             raise ValueError("number of videos does not match")
-        # for test cases
-        if len(outputs["input_ids"]) != prompt_token_ids_len:
-            raise ValueError("number of token ids does not match")
-        for idx in range(prompt_token_ids_len):
-            if outputs["input_ids"][idx] != prompt_token_ids[idx]:
-                raise ValueError("token ids does not match")

         if self.enable_processor_cache:
             missing_idx = set(missing_idx)

From bcd67a4c618b20cf7abd0f2f2e3b6c792e1ea7f1 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Thu, 20 Nov 2025 19:22:09 +0800
Subject: [PATCH 07/11] modify unit test

---
 tests/input/test_ernie_vl_processor.py | 27 +++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index 92d24d5b96f..b9bc22d4cc2 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -77,7 +77,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -93,7 +93,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -101,7 +101,7 @@ def test_process_request_dict_with_options(self):
             "prompt_token_ids": [1, 1, 1],
         }
         self.processor.process_request_dict(request_dict, 100)
-        self.assertEqual(request_dict["enable_thinking"], False)
+        self.assertEqual(request_dict["enable_thinking"], True)

         request_dict = {
             "messages": [{"role": "user", "content": "Hello"}],
@@ -111,6 +111,27 @@ def test_process_request_dict_with_options(self):
         self.processor.process_request_dict(request_dict, 100)
         self.assertEqual(request_dict["enable_thinking"], True)

+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"options": {"thinking_mode": "close"}},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+
+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"options": {"thinking_mode": "false"}},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+
+        request_dict = {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {"enable_thinking": False},
+        }
+        self.processor.process_request_dict(request_dict, 100)
+        self.assertEqual(request_dict["enable_thinking"], False)
+

 if __name__ == "__main__":
     unittest.main()
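PATCH 07 tracks the behavioral change from PATCH 01: when prompt_token_ids is present,
process_request_dict now calls request.setdefault("enable_thinking", True) before any template
options are consulted, so the expected default in these tests flips from False to True.
setdefault only fills a missing key:

    request = {"prompt_token_ids": [1, 1, 1]}
    request.setdefault("enable_thinking", True)   # key absent -> set to True

    request = {"prompt_token_ids": [1, 1, 1], "enable_thinking": False}
    request.setdefault("enable_thinking", True)   # key present -> stays False

The three appended cases pin down the template-driven path (no prompt_token_ids), where
thinking_mode options and chat_template_kwargs can still disable thinking.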
From 7c0d5c2834a85035759e5540fbc035c387fc857a Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Mon, 24 Nov 2025 14:42:45 +0800
Subject: [PATCH 08/11] add unit test

---
 tests/input/test_ernie_vl_processor.py | 304 +++++++++++++++++++++++++
 1 file changed, 304 insertions(+)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index b9bc22d4cc2..3e0aa670bea 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -1,7 +1,15 @@
 import unittest
 from unittest.mock import MagicMock, patch

+import numpy as np
+
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
+from fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
+    AdaptiveImageProcessor,
+)
+from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
+from fastdeploy.input.utils import IDS_TYPE_FLAG


 class TestErnie4_5_vl_ProcessorProcessResponseDictStreaming(unittest.TestCase):
@@ -133,5 +141,301 @@ def test_process_request_dict_with_options(self):
         self.assertEqual(request_dict["enable_thinking"], False)


+class TestDataProcessorTargetMethods(unittest.TestCase):
+    def setUp(self):
+        self.mock_tokenizer = MagicMock(spec=Ernie4_5Tokenizer)
+        self.mock_tokenizer.ignored_index = -100
+        self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
+        self.mock_tokenizer.chat_template = "mock_template"
+        self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
+
+        def mock_load_tokenizer(dp_instance):
+            dp_instance.tokenizer = self.mock_tokenizer
+
+        with patch.object(DataProcessor, "_load_tokenizer", side_effect=mock_load_tokenizer, autospec=True):
+            with patch.object(AdaptiveImageProcessor, "from_pretrained") as mock_image_preprocessor:
+                mock_image_preprocessor.return_value = MagicMock()
+                self.data_processor = DataProcessor(
+                    tokenizer_name="mock_tokenizer",
+                    image_preprocessor_name="mock_image_preprocessor",
+                    enable_processor_cache=False,
+                )
+        self.data_processor.image_patch_id = 1001
+        self.data_processor.image_start_id = 1002
+        self.data_processor.image_end_id = 1003
+        self.data_processor.video_start_id = 1004
+        self.data_processor.video_end_id = 1005
+        self.data_processor.role_prefixes = {"user": "User: ", "assistant": "Assistant: "}
+        self.data_processor.enable_processor_cache = False
+        self.data_processor.extract_mm_items = MagicMock(return_value=([], [], [], [], None, [], []))
+
+    def _mock_convert_tokens_to_ids(self, token):
+        token_id_map = {
+            "<|begin_of_sentence|>": 101,
+            "<|end_of_sentence|>": 102,
+            "": 103,
+            "<|IMAGE_PLACEHOLDER|>": 1001,
+            "<|IMAGE_START|>": 1002,
+            "<|IMAGE_END|>": 1003,
+            "<|VIDEO_START|>": 1004,
+            "<|VIDEO_END|>": 1005,
+        }
+        return token_id_map.get(token, 999)
+
+    def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
+        test_prompt_token_ids = [101, 999, 998, 997, 102]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+        }
+
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+
+        prompt_len = len(test_prompt_token_ids)
+
+        self.assertEqual(
+            outputs["input_ids"],
+            [test_prompt_token_ids],
+            f"input_ids mismatch: got {outputs['input_ids']}, expected [{test_prompt_token_ids}]",
+        )
+
+        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
+
+        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
+        self.assertEqual(outputs["position_ids"], expected_position_ids)
+
+        self.assertEqual(outputs["cur_position"], prompt_len)
+
+        self.assertEqual(len(outputs["images"]), 0)
+        self.assertEqual(len(outputs["grid_thw"]), 0)
+        self.assertEqual(len(outputs["mm_positions"]), 0)
+        self.assertEqual(len(outputs["mm_hashes"]), 0)
+        self.assertEqual(outputs["video_cnt"], 0)
+        self.assertEqual(outputs["num_input_image_tokens"], 0)
+        self.assertEqual(outputs["num_input_video_tokens"], 0)
+
+    def test_prompt_token_ids2outputs_with_messages_no_mm(self):
+        test_prompt_token_ids = [101, 999, 998, 997, 102]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [{"role": "user", "content": "Hello World"}],
+        }
+
+        self.data_processor.extract_mm_items.return_value = ([], [], [], [], None, [], [])
+
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+
+        prompt_len = len(test_prompt_token_ids)
+
+        self.assertEqual(outputs["input_ids"], test_prompt_token_ids)
+
+        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
+
+        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
+        self.assertEqual(outputs["position_ids"], expected_position_ids)
+
+        self.assertEqual(outputs["cur_position"], prompt_len)
+
+        self.assertEqual(len(outputs["images"]), 0)
+        self.assertEqual(outputs["video_cnt"], 0)
+        self.assertEqual(outputs["num_input_image_tokens"], 0)
+
+    def test_prompt_token_ids2outputs_add_image(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        mock_img = MagicMock()
+        mock_img.height = 224
+        mock_img.width = 224
+        mock_img.convert.return_value = mock_img
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img}],
+        )
+        mock_resize = (None, (2, 4))
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = mock_resize
+        mock_preprocess = {"pixel_values": np.random.randn(1, 16, 16, 3), "image_grid_thw": np.array([[2, 4]])}
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        # self.data_processor._compute_3d_positions = MagicMock(return_value=[[i]*3 for i in range(4)])
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 6)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(outputs["num_input_image_tokens"], 2)
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(len(outputs["mm_hashes"]), 1)
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 1)
+
+    def test_prompt_token_ids2outputs_add_processed_image(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        mock_img_data = np.random.randn(8, 28, 28)
+        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img_cache],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img_cache}],
+        )
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["image"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 20)
+        self.assertEqual(outputs["cur_position"], 8)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "img_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 1)
+
+    def test_prompt_token_ids2outputs_add_video(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
+        mock_frame1 = MagicMock()
+        mock_frame1.height = 224
+        mock_frame1.width = 224
+        mock_frame1.convert.return_value = mock_frame1
+        mock_frame2 = MagicMock()
+        mock_frame2.height = 224
+        mock_frame2.width = 224
+        mock_frame2.convert.return_value = mock_frame2
+        frames = [mock_frame1, mock_frame2]
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [],
+            [frames],
+            [],
+            ["vid_uuid"],
+            None,
+            [],
+            [{"type": "video", "data": frames}],
+        )
+        self.data_processor._load_and_process_video = MagicMock(return_value=frames)
+        patches_h, patches_w = 4, 4
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
+        mock_preprocess = {
+            "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
+            "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
+        }
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 8)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 2)
+        self.assertEqual(outputs["num_input_video_tokens"], 4)
+
+    def test_prompt_token_ids2outputs_add_processed_video(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
+        t, h, w = 2, 4, 4
+        spatial_conv_size = self.data_processor.spatial_conv_size
+        temporal_conv_size = self.data_processor.temporal_conv_size
+        token_per_frame = (h // spatial_conv_size) * (w // spatial_conv_size)
+        num_tokens = (t // temporal_conv_size) * token_per_frame
+        mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
+        mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [],
+            [mock_frames_cache],
+            [],
+            ["vid_uuid"],
+            None,
+            [],
+            [{"type": "video", "data": mock_frames_cache}],
+        )
+        outputs = self.data_processor.prompt_token_ids2outputs(request)
+        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
+        self.assertEqual(
+            outputs["token_type_ids"],
+            [
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["video"],
+                IDS_TYPE_FLAG["text"],
+                IDS_TYPE_FLAG["text"],
+            ],
+        )
+        self.assertEqual(len(outputs["position_ids"]), 8)
+        self.assertEqual(outputs["cur_position"], 6)
+        self.assertEqual(len(outputs["images"]), 1)
+        self.assertIsNotNone(outputs["images"][0])
+        self.assertEqual(len(outputs["mm_positions"]), 1)
+        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
+        self.assertEqual(len(outputs["grid_thw"]), 1)
+        self.assertEqual(len(outputs["image_type_ids"]), 2)
+

 if __name__ == "__main__":
     unittest.main()

From e653596b041f0818e1e40e0808c77a4c5ab70b55 Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Mon, 24 Nov 2025 16:11:03 +0800
Subject: [PATCH 09/11] add unit test

---
 tests/input/test_ernie_vl_processor.py | 123 +++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index 3e0aa670bea..4b2f43e4e2a 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -436,6 +436,129 @@ def test_prompt_token_ids2outputs_add_processed_video(self):
         self.assertEqual(len(outputs["grid_thw"]), 1)
         self.assertEqual(len(outputs["image_type_ids"]), 2)

+    def test_prompt_token_ids2outputs_add_image_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1001, 1003, 102]
+        mock_img = MagicMock()
+        mock_img.height = 224
+        mock_img.width = 224
+        mock_img.convert.return_value = mock_img
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img}],
+        )
+        patches_h, patches_w = 8, 8
+        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
+        mock_preprocess = {
+            "pixel_values": np.random.randn(1, patches_h, patches_w, 3),
+            "image_grid_thw": np.array([[patches_h, patches_w]]),
+        }
+        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
+        with self.assertRaises(ValueError) as ctx:
+            self.data_processor.prompt_token_ids2outputs(request)
+        self.assertIn("image tokens num not match the size", str(ctx.exception))
+
+    def test_prompt_token_ids2outputs_add_processed_image_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
+        spatial_conv_size = self.data_processor.spatial_conv_size
+        num_tokens = 4
+        mock_img_data = np.random.randn(num_tokens * (spatial_conv_size**2), 28, 28)
+        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
+        request = {
+            "prompt_token_ids": test_prompt_token_ids,
+            "messages": [
+                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
+            ],
+        }
+        self.data_processor.extract_mm_items.return_value = (
+            [mock_img_cache],
+            [],
+            ["img_uuid"],
+            [],
+            None,
+            [],
+            [{"type": "image", "data": mock_img_cache}],
+        )
+        with self.assertRaises(ValueError) as ctx:
+            self.data_processor.prompt_token_ids2outputs(request)
+        self.assertIn("image tokens num not match the size", str(ctx.exception))
+
+    def test_prompt_token_ids2outputs_add_video_token_len_mismatch(self):
+        test_prompt_token_ids = [101, 1004, 1001, 1001,
1005, 102] + mock_frame1 = MagicMock() + mock_frame1.height = 224 + mock_frame1.width = 224 + mock_frame1.convert.return_value = mock_frame1 + mock_frame2 = MagicMock() + mock_frame2.height = 224 + mock_frame2.width = 224 + mock_frame2.convert.return_value = mock_frame2 + frames = [mock_frame1, mock_frame2] + request = { + "prompt_token_ids": test_prompt_token_ids, + "messages": [ + {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]} + ], + } + self.data_processor.extract_mm_items.return_value = ( + [], + [frames], + [], + ["vid_uuid"], + None, + [], + [{"type": "video", "data": frames}], + ) + self.data_processor._load_and_process_video = MagicMock(return_value=frames) + patches_h, patches_w = 8, 8 + self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w)) + mock_preprocess = { + "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3), + "video_grid_thw": np.array([[patches_h, patches_w]] * 2), + } + self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess + with self.assertRaises(ValueError) as ctx: + self.data_processor.prompt_token_ids2outputs(request) + self.assertIn("video tokens num not match the size", str(ctx.exception)) + + def test_prompt_token_ids2outputs_add_processed_video_token_len_mismatch(self): + test_prompt_token_ids = [101, 1004, 1001, 1005, 102] + t, h, w = 2, 8, 8 + spatial_conv_size = self.data_processor.spatial_conv_size + temporal_conv_size = self.data_processor.temporal_conv_size + + num_tokens = 4 + mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28) + mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)}) + request = { + "prompt_token_ids": test_prompt_token_ids, + "messages": [ + {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]} + ], + } + self.data_processor.extract_mm_items.return_value = ( + [], + [mock_frames_cache], + [], + ["vid_uuid"], + None, + [], + [{"type": "video", "data": mock_frames_cache}], + ) + with self.assertRaises(ValueError) as ctx: + self.data_processor.prompt_token_ids2outputs(request) + self.assertIn("video tokens num not match the size", str(ctx.exception)) + if __name__ == "__main__": unittest.main() From d93b21281ae71bffbada85f2a5958e0fb5be267f Mon Sep 17 00:00:00 2001 From: kxz2002 Date: Tue, 25 Nov 2025 11:44:14 +0800 Subject: [PATCH 10/11] fix append --- fastdeploy/input/ernie4_5_vl_processor/process.py | 2 +- tests/input/test_ernie_vl_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index efbb3452607..3da2bfcb97f 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -356,7 +356,7 @@ def prompt_token_ids2outputs( prompt_token_ids = request.get("prompt_token_ids", []) prompt_token_ids_len = len(prompt_token_ids) if not request.get("messages"): - outputs["input_ids"].append(prompt_token_ids) + outputs["input_ids"].extend(prompt_token_ids) outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len) for i in range(prompt_token_ids_len): outputs["position_ids"].append([i] * 3) diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py index 4b2f43e4e2a..ee4d0f195f8 100644 --- a/tests/input/test_ernie_vl_processor.py +++ b/tests/input/test_ernie_vl_processor.py @@ 
-194,7 +194,7 @@ def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
 
         self.assertEqual(
             outputs["input_ids"],
-            [test_prompt_token_ids],
+            test_prompt_token_ids,
             f"input_ids mismatch: actual {outputs['input_ids']}, expected {test_prompt_token_ids}",
         )
 

From 8fed8b4e2a4c2c8b1e7f0fae4a85b805a22b88ff Mon Sep 17 00:00:00 2001
From: kxz2002
Date: Tue, 25 Nov 2025 20:51:07 +0800
Subject: [PATCH 11/11] add check for messages

---
 fastdeploy/entrypoints/openai/protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index cdcc5cb9d87..339164fca1d 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -671,7 +671,7 @@ def to_dict_for_infer(self, request_id=None):
         if request_id is not None:
             req_dict["request_id"] = request_id
 
-        if "prompt_token_ids" not in req_dict:
+        if "prompt_token_ids" not in req_dict or not req_dict["prompt_token_ids"]:
             # If disable_chat_template is set, then the first message in messages will be used as the prompt.
             assert (
                 len(req_dict["messages"]) > 0