@@ -364,22 +364,8 @@ def prompt_token_ids2outputs(
             return outputs
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
-        mm_id_set = {
-            self.image_start_id,
-            self.image_end_id,
-            self.video_start_id,
-            self.video_end_id,
-            self.image_patch_id,
-        }
         while st < prompt_token_ids_len:
             cur_token_id = prompt_token_ids[st]
-            if cur_token_id not in mm_id_set:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                continue
             if cur_token_id == self.image_start_id:
                 if image_idx >= len(images):
                     raise ValueError("prompt token ids has more image placeholder than in messages")
@@ -403,12 +389,7 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_image(image, outputs, uuid, token_len)
                 image_idx += 1
-                # append image_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st = cur_idx
             elif cur_token_id == self.video_start_id:
                 if video_idx >= len(videos):
                     raise ValueError("prompt token ids has more video placeholder than in messages")
@@ -436,12 +417,13 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_video(video, outputs, uuid, token_len)
                 video_idx += 1
-                # append video_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                st = cur_idx
+            else:
+                outputs["input_ids"].extend([cur_token_id])
                 outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
                 outputs["position_ids"].append([outputs["cur_position"]] * 3)
                 outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st += 1
         if image_idx != len(images):
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
@@ -487,82 +469,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

-    # def _preprocess_raw_image(self, img=None, frames=None):
-    #     if img is None and frames is None:
-    #         raise ValueError("image and frames cannot be None at the same time")
-    #     patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-    #         img.height if img else frames[0].height,
-    #         img.width if img else frames[0].width,
-    #         min_pixels=self.image_min_pixels if img else self.video_min_pixels,
-    #         max_pixels=self.image_max_pixels if img else self.video_max_pixels,
-    #     )[1]
-
-    #     if img:
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=[img.convert("RGB")],
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     else:
-    #         pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=None,
-    #             videos=pixel_stack,
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     return patches_h, patches_w, ret
-
-    # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
-    #     num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-    #     outputs["num_input_image_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(ret["pixel_values"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["image_grid_thw"])
-    #     outputs["image_type_ids"].append(0)
-
-    # def _add_processed_image_from_token_ids(
-    #     self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
-    # ):
-    #     img, meta = img_cache
-    #     num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-    #     _, h, w = meta["thw"]
-    #     pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(img)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[1, h, w]]))
-    #     outputs["image_type_ids"].append(0)
-
     def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -622,29 +528,6 @@ def _add_processed_image(
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

-    # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
-    #     num_frames = len(frames)
-    #     num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-    #     outputs["images"].append(ret["pixel_values_videos"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["video_grid_thw"])
-    #     outputs["image_type_ids"].extend([1] * num_frames)
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["num_input_video_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -684,28 +567,6 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None)
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    # def _add_processed_video_from_token_ids(
-    #     self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
-    # ):
-    #     frames, meta = frames_cache
-    #     num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-
-    #     t, h, w = meta["thw"]
-    #     outputs["images"].append(frames)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["image_type_ids"].extend([1] * t)
-
-    #     pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_processed_video(
         self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
     ) -> None:
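
Note on the change above: the dedicated mm_id_set pre-check and the explicit image_end_id / video_end_id appends are removed; any token that is not an image or video start marker (including the end markers) now falls through to a single text branch, which is why st = cur_idx replaces st = cur_idx + 1. A minimal sketch of that control flow, using hypothetical handle_image / handle_video / emit_text helpers in place of the real _add_image / _add_video / _add_processed_* bookkeeping:

# Hypothetical sketch (not the project's API): illustrates only the simplified loop.
def walk_prompt_token_ids(prompt_token_ids, image_start_id, video_start_id,
                          handle_image, handle_video, emit_text):
    st = 0
    while st < len(prompt_token_ids):
        cur_token_id = prompt_token_ids[st]
        if cur_token_id == image_start_id:
            # handle_image consumes the placeholder span and returns the index of
            # the closing end marker; the marker itself is emitted as text on the
            # next iteration (hence st = cur_idx rather than cur_idx + 1).
            st = handle_image(st)
        elif cur_token_id == video_start_id:
            st = handle_video(st)
        else:
            # Plain text tokens and the image/video end markers share this branch.
            emit_text(cur_token_id)
            st += 1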