@@ -364,22 +364,8 @@ def prompt_token_ids2outputs(
             return outputs
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
         st, image_idx, video_idx = 0, 0, 0
-        mm_id_set = {
-            self.image_start_id,
-            self.image_end_id,
-            self.video_start_id,
-            self.video_end_id,
-            self.image_patch_id,
-        }
         while st < prompt_token_ids_len:
             cur_token_id = prompt_token_ids[st]
-            if cur_token_id not in mm_id_set:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                continue
             if cur_token_id == self.image_start_id:
                 if image_idx >= len(images):
                     raise ValueError("prompt token ids has more image placeholder than in messages")
@@ -403,12 +389,7 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_image(image, outputs, uuid, token_len)
                 image_idx += 1
-                # append image_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st = cur_idx
             elif cur_token_id == self.video_start_id:
                 if video_idx >= len(videos):
                     raise ValueError("prompt token ids has more video placeholder than in messages")
@@ -436,12 +417,13 @@ def prompt_token_ids2outputs(
                 else:
                     self._add_processed_video(video, outputs, uuid, token_len)
                 video_idx += 1
-                # append video_end_id
-                outputs["input_ids"].extend([prompt_token_ids[cur_idx]])
+                st = cur_idx
+            else:
+                outputs["input_ids"].extend([cur_token_id])
                 outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
                 outputs["position_ids"].append([outputs["cur_position"]] * 3)
                 outputs["cur_position"] += 1
-                st = cur_idx + 1
+                st += 1
         if image_idx != len(images):
             raise ValueError("number of images does not match")
         if video_idx != len(videos):
@@ -487,82 +469,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
             outputs["position_ids"].append([start + i] * 3)
         outputs["cur_position"] += len(tokens)

-    # def _preprocess_raw_image(self, img=None, frames=None):
-    #     if img is None and frames is None:
-    #         raise ValueError("image and frames cannot be None at the same time")
-    #     patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-    #         img.height if img else frames[0].height,
-    #         img.width if img else frames[0].width,
-    #         min_pixels=self.image_min_pixels if img else self.video_min_pixels,
-    #         max_pixels=self.image_max_pixels if img else self.video_max_pixels,
-    #     )[1]
-
-    #     if img:
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=[img.convert("RGB")],
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     else:
-    #         pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-    #         ret = self.image_preprocessor.preprocess(
-    #             images=None,
-    #             videos=pixel_stack,
-    #             do_normalize=False,
-    #             do_rescale=False,
-    #             predetermined_grid_thw=np.array([[patches_h, patches_w]] * len(frames)),
-    #             do_convert_rgb=True,
-    #             input_data_format=ChannelDimension.LAST,
-    #         )
-    #     return patches_h, patches_w, ret
-
-    # def _add_image_from_token_ids(self, img, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(img=img)
-    #     num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-    #     outputs["num_input_image_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(ret["pixel_values"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["image_grid_thw"])
-    #     outputs["image_type_ids"].append(0)
-
-    # def _add_processed_image_from_token_ids(
-    #     self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: Optional[str], token_len: int
-    # ):
-    #     img, meta = img_cache
-    #     num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-    #     if num_tokens != token_len:
-    #         raise ValueError("image tokens num not match the size")
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-    #     _, h, w = meta["thw"]
-    #     pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
-    #     outputs["images"].append(img)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[1, h, w]]))
-    #     outputs["image_type_ids"].append(0)
-
     def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
@@ -622,29 +528,6 @@ def _add_processed_image(
         outputs["grid_thw"].append(np.array([[1, h, w]]))
         outputs["image_type_ids"].append(0)

-    # def _add_video_from_token_ids(self, frames, outputs: Dict, uuid: Optional[str], token_len: int):
-    #     patches_h, patches_w, ret = self._preprocess_raw_image(frames=frames)
-    #     num_frames = len(frames)
-    #     num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-    #     outputs["images"].append(ret["pixel_values_videos"])
-    #     if not uuid:
-    #         outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-    #     else:
-    #         outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(ret["video_grid_thw"])
-    #     outputs["image_type_ids"].extend([1] * num_frames)
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["num_input_video_tokens"] += num_tokens
-
-    #     pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             frames[0].height,
@@ -684,28 +567,6 @@ def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None)
         outputs["position_ids"].extend(pos_ids)
         outputs["cur_position"] = np.max(pos_ids) + 1

-    # def _add_processed_video_from_token_ids(
-    #     self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: int
-    # ):
-    #     frames, meta = frames_cache
-    #     num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-    #     if num_tokens != token_len:
-    #         raise ValueError("video tokens num not match the size")
-
-    #     t, h, w = meta["thw"]
-    #     outputs["images"].append(frames)
-    #     outputs["mm_hashes"].append(uuid)
-    #     outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-    #     outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-    #     outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-    #     outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-    #     outputs["image_type_ids"].extend([1] * t)
-
-    #     pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-    #     outputs["position_ids"].extend(pos_ids)
-    #     outputs["cur_position"] = np.max(pos_ids) + 1
-
     def _add_processed_video(
         self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
     ) -> None:
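
Note on the change above: the dedicated mm_id_set pre-check and the explicit image_end_id / video_end_id appends are removed; any token that is not an image or video start marker (including the end markers) now falls through to a single text branch, which is why st = cur_idx replaces st = cur_idx + 1. A minimal sketch of that control flow, using hypothetical handle_image / handle_video / emit_text helpers in place of the real _add_image / _add_video / _add_processed_* bookkeeping:

# Hypothetical sketch (not the project's API): illustrates only the simplified loop.
def walk_prompt_token_ids(prompt_token_ids, image_start_id, video_start_id,
                          handle_image, handle_video, emit_text):
    st = 0
    while st < len(prompt_token_ids):
        cur_token_id = prompt_token_ids[st]
        if cur_token_id == image_start_id:
            # handle_image consumes the placeholder span and returns the index of
            # the closing end marker; the marker itself is emitted as text on the
            # next iteration (hence st = cur_idx rather than cur_idx + 1).
            st = handle_image(st)
        elif cur_token_id == video_start_id:
            st = handle_video(st)
        else:
            # Plain text tokens and the image/video end markers share this branch.
            emit_text(cur_token_id)
            st += 1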