
Commit bd67aba

refact code structure
1 parent f9de470 commit bd67aba

1 file changed: +5 -144 lines changed


fastdeploy/input/ernie4_5_vl_processor/process.py

Lines changed: 5 additions & 144 deletions
@@ -364,22 +364,8 @@ def prompt_token_ids2outputs(
             return outputs
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
-        mm_id_set = {
-            self.image_start_id,
-            self.image_end_id,
-            self.video_start_id,
-            self.video_end_id,
-            self.image_patch_id,
-        }
         while st < prompt_token_ids_len:
             cur_token_id = prompt_token_ids[st]
-            if cur_token_id not in mm_id_set:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                continue
             if cur_token_id == self.image_start_id:
                 if image_idx >= len(images):
                     raise ValueError("prompt token ids has more image placeholder than in messages")
@@ -403,12 +389,7 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_image(image, outputs, uuid, token_len)
                 image_idx += 1
-                # append image_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st = cur_idx
             elif cur_token_id == self.video_start_id:
                 if video_idx >= len(videos):
                     raise ValueError("prompt token ids has more video placeholder than in messages")
@@ -436,12 +417,13 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_video(video, outputs, uuid, token_len)
                 video_idx += 1
-                # append video_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                st = cur_idx
+            else:
+                outputs["input_ids"].extend([cur_token_id])
                 outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
                 outputs["position_ids"].append([outputs["cur_position"]] * 3)
                 outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st += 1
         if image_idx != len(images):
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
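
The three hunks above restructure the token walk in prompt_token_ids2outputs: the mm_id_set fast path and the per-branch "append image/video end id" bookkeeping are removed, the image and video branches now stop at cur_idx instead of consuming the end token themselves, and a single trailing else emits every remaining token (plain text as well as the image/video end ids) as a text-typed token. The following is a small, self-contained toy of that control flow only; the token ids, IDS_TYPE_FLAG values, and the index() scan are invented for the demo and stand in for the real placeholder expansion done by _add_image / _add_video.

# Toy model of the restructured loop (hunks 1-3). Everything below is illustrative:
# the real code expands the placeholder tokens between the start token and cur_idx
# via _add_image/_add_video; the toy simply jumps to cur_idx and lets the final
# else branch emit the end token as ordinary text, mirroring the diff.
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2}
IMAGE_START, IMAGE_END = 1001, 1002
VIDEO_START, VIDEO_END = 1003, 1004

def walk_prompt(prompt_token_ids):
    outputs = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": 0}
    st = 0
    while st < len(prompt_token_ids):
        cur_token_id = prompt_token_ids[st]
        if cur_token_id in (IMAGE_START, VIDEO_START):
            end_id = IMAGE_END if cur_token_id == IMAGE_START else VIDEO_END
            cur_idx = prompt_token_ids.index(end_id, st)  # stand-in for the placeholder scan
            st = cur_idx  # end token is handled by the else branch on the next iteration
        else:
            outputs["input_ids"].append(cur_token_id)
            outputs["token_type_ids"].append(IDS_TYPE_FLAG["text"])
            outputs["position_ids"].append([outputs["cur_position"]] * 3)
            outputs["cur_position"] += 1
            st += 1
    return outputs

print(walk_prompt([5, IMAGE_START, 42, 42, IMAGE_END, 7]))

The net effect is that the end tokens no longer need the duplicated append/position bookkeeping inside each media branch; they simply fall through to the generic text path.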
@@ -487,82 +469,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)
 
-    # def _preprocess_raw_image(self, img=None, frames=None):
-    #     if img is None and frames is None:
-    #         raise ValueError("image and frames cannot be None at the same time")
-    #     patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-    #         img.height if img else frames[0].height,
-    #         img.width if img else frames[0].width,
-    #         min_pixels=self.image_min_pixels if img else self.video_min_pixels,
-    #         max_pixels=self.image_max_pixels if img else self.video_max_pixels,
-    #     )[1]
-
-    #     if img:
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=[img.convert("RGB")],
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     else:
-    #         pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=None,
-    #             videos=pixel_stack,
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     return patches_h, patches_w, ret
-
-    # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
-    #     num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-    #     outputs["num_input_image_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(ret["pixel_values"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["image_grid_thw"])
-    #     outputs["image_type_ids"].append(0)
-
-    # def _add_processed_image_from_token_ids(
-    #     self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
-    # ):
-    #     img, meta = img_cache
-    #     num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-    #     _, h, w = meta["thw"]
-    #     pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(img)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[1, h, w]]))
-    #     outputs["image_type_ids"].append(0)
-
     def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -622,29 +528,6 @@ def _add_processed_image(
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)
 
-    # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
-    #     num_frames = len(frames)
-    #     num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-    #     outputs["images"].append(ret["pixel_values_videos"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["video_grid_thw"])
-    #     outputs["image_type_ids"].extend([1] * num_frames)
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["num_input_video_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -684,28 +567,6 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None)
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1
 
-    # def _add_processed_video_from_token_ids(
-    #     self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
-    # ):
-    #     frames, meta = frames_cache
-    #     num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-
-    #     t, h, w = meta["thw"]
-    #     outputs["images"].append(frames)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["image_type_ids"].extend([1] * t)
-
-    #     pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_processed_video(
         self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
     ) -> None:
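
All of the commented-out *_from_token_ids helpers deleted in the last three hunks guarded their work with the same check the surviving _add_image / _add_video / _add_processed_* paths rely on: the token count implied by the patch grid and the spatial/temporal merge factors must equal the token_len observed in the prompt, otherwise a ValueError is raised. Below is a rough, runnable illustration of that arithmetic; the concrete sizes are invented for the example and are not taken from this diff.

# Illustrative sizes only; spatial_conv_size / temporal_conv_size and the patch grid
# are assumed example values, not values read from process.py.
spatial_conv_size = 2
temporal_conv_size = 2
patches_h, patches_w, num_frames = 32, 48, 8

# Image: one patches_h x patches_w grid, merged spatially (cf. _add_image_from_token_ids).
image_tokens = (patches_h * patches_w) // (spatial_conv_size**2)

# Video: num_frames grids, merged spatially and temporally (cf. _add_video_from_token_ids).
video_tokens = (num_frames * patches_h * patches_w) // (spatial_conv_size**2 * temporal_conv_size)

assert image_tokens == 384 and video_tokens == 1536
# The deleted helpers raised ValueError("... tokens num not match the size")
# whenever these counts disagreed with token_len from the prompt token ids.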
