diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 42ab04e4f..bdfb4d721 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -278,6 +278,12 @@ PD 分离模式参数 当任意图片超过该阈值时,请求会被拒绝。 +.. option:: --max_image_pixels + + 单张图片在预处理缩放前允许的最大像素数量(宽 × 高),默认为 ``8294400``(即 3840 × 2160,相当于一张 4K UHD 图片的像素总量)。 + + 当输入图片超过该阈值时,LightLLM 会先自动将其缩放到该像素预算内,再继续后续流程。 + .. option:: --visual_infer_batch_size 每次推理批次中处理的图像数量,默认为 ``1`` diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 7f3f8f208..d7b798100 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -276,6 +276,12 @@ Multimodal Parameters Requests are rejected when any image exceeds this limit. +.. option:: --max_image_pixels + + Maximum number of pixels (width × height) allowed for a single input image before it is downscaled during preprocessing; the default is ``8294400`` (3840 × 2160, i.e. one 4K UHD frame). + + If an input image exceeds this threshold, LightLLM automatically resizes it down to this pixel budget before continuing. + ..
option:: --visual_infer_batch_size Number of images processed in each inference batch, default is ``1`` diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 26c651b15..fe6236cbe 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -445,9 +445,15 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--max_image_token_count", type=int, - default=6128, + default=8192, help="maximum allowed token count for one image after tokenization", ) + parser.add_argument( + "--max_image_pixels", + type=int, + default=8294400, + help="maximum allowed pixel count for one image before resize preprocessing", + ) parser.add_argument( "--embed_cache_storage_size", type=float, diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index b02094eed..fe9cb6161 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -95,7 +95,8 @@ class StartArgs: enable_decode_microbatch_overlap: bool = field(default=False) enable_prefill_microbatch_overlap: bool = field(default=False) cache_capacity: int = field(default=200) - max_image_token_count: int = field(default=6128) + max_image_token_count: int = field(default=8192) + max_image_pixels: int = field(default=8294400) embed_cache_storage_size: float = field(default=4) data_type: Optional[str] = field( default=None, metadata={"choices": ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]} diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 633d1de4b..9541e434c 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -4,7 +4,7 @@ import librosa import base64 import numpy as np -from typing import List, Tuple +from typing import List, Tuple, Optional from io import BytesIO from concurrent.futures import ThreadPoolExecutor from PIL import Image, ImageFile @@ -13,6 +13,7 @@ from 
lightllm.utils.error_utils import ClientDisconnected from lightllm.utils.multimodal_utils import fetch_resource from lightllm.utils.log_utils import init_logger +from lightllm.utils.envs_utils import get_env_start_args logger = init_logger(__name__) @@ -131,6 +132,9 @@ def __init__(self, **kwargs): self.extra_params = {} async def preload(self, request: Request): + + max_image_pixels = get_env_start_args().max_image_pixels + try: if self._type == "url": timeout = int(os.getenv("REQUEST_TIMEOUT", "5")) @@ -141,8 +145,14 @@ async def preload(self, request: Request): elif self._type == "image_size": # image_size 代表直接传入图片的 width,height,主要是用于一些场景 # 的 token 计数判断, 所以只需要图片长宽信息,不需要具体图片的内容信息 - self.image_w = self._data[0] - self.image_h = self._data[1] + src_w = self._data[0] + src_h = self._data[1] + self.image_w, self.image_h = _resize_image_dimensions_if_needed(src_w, src_h, max_image_pixels) + if (self.image_w, self.image_h) != (src_w, src_h): + logger.warning( + f"image_size pixels {src_w * src_h} exceed max_image_pixels={max_image_pixels}, " + f"resized to {self.image_w}x{self.image_h}" + ) return else: raise ValueError(f"cannot read image which type is {self._type}!") @@ -151,7 +161,24 @@ async def preload(self, request: Request): # Decoding is mainly done in the C libraries (libjpeg/libpng/libwebp), which releases the GIL, # and multiple threads can achieve true parallelism. loop = asyncio.get_running_loop() - self.image_w, self.image_h = await loop.run_in_executor(_IMAGE_VERIFY_POOL, _verify_image_bytes, img_data) + # 1) Verify original input bytes first. + src_w, src_h = await loop.run_in_executor(_IMAGE_VERIFY_POOL, _verify_image_bytes, img_data) + # 2) Resize (or no-op) after verification. 
def _resize_image_bytes_if_needed(
    img_data: bytes, src_w: int, src_h: int, max_image_pixels: int
) -> Tuple[bytes, int, int]:
    """
    Downscale encoded image bytes so the image fits within a pixel budget.

    Args:
        img_data: Encoded image bytes in a format Pillow can open.
        src_w: Width of the image in pixels, as reported by the caller.
        src_h: Height of the image in pixels, as reported by the caller.
        max_image_pixels: Largest allowed ``width * height``.

    Returns:
        ``(data, width, height)``. When the image already fits, ``img_data`` is
        returned untouched together with ``src_w`` and ``src_h``. Otherwise
        ``data`` is the downscaled image re-encoded as an RGB JPEG and
        ``width``/``height`` are its new dimensions.

    Raises:
        ValueError: If the image has to shrink but ``max_image_pixels`` is
            smaller than 1.
    """
    new_w, new_h = _resize_image_dimensions_if_needed(src_w, src_h, max_image_pixels)
    if (new_w, new_h) == (src_w, src_h):
        return img_data, src_w, src_h

    # Newer Pillow releases expose the filters on Image.Resampling; older ones
    # only provide the module-level constant.
    resampling = Image.Resampling.LANCZOS if hasattr(Image, "Resampling") else Image.LANCZOS

    with Image.open(BytesIO(img_data)) as image:
        # Pillow ignores the requested filter for bilevel ("1") and palette
        # ("P") images and falls back to NEAREST, so expand those to RGB first
        # to actually get LANCZOS quality.
        source = image.convert("RGB") if image.mode in ("1", "P") else image
        resized_image = source.resize((new_w, new_h), resampling).convert("RGB")

    buffer = BytesIO()
    resized_image.save(buffer, format="JPEG", quality=96, optimize=True)
    return buffer.getvalue(), new_w, new_h


def _resize_image_dimensions_if_needed(src_w: int, src_h: int, max_image_pixels: int) -> Tuple[int, int]:
    """
    Compute resized (w, h) under a max pixel budget while preserving aspect ratio.

    Args:
        src_w: Source width in pixels.
        src_h: Source height in pixels.
        max_image_pixels: Largest allowed ``width * height``.

    Returns:
        ``(src_w, src_h)`` unchanged when ``src_w * src_h`` already fits the
        budget; otherwise a pair whose product does not exceed
        ``max_image_pixels`` and whose sides are each at least 1.

    Raises:
        ValueError: If the image has to shrink but ``max_image_pixels`` is
            smaller than 1 (no positive size can satisfy such a budget).
    """
    old_pixels = src_w * src_h
    if old_pixels <= max_image_pixels:
        return src_w, src_h

    # A budget below one pixel can never be met because both sides are clamped
    # to >= 1; fail loudly instead of spinning forever in the loop below.
    if max_image_pixels < 1:
        raise ValueError(f"max_image_pixels must be a positive integer, got {max_image_pixels}")

    scale = (max_image_pixels / old_pixels) ** 0.5
    new_w = max(1, int(src_w * scale))
    new_h = max(1, int(src_h * scale))

    # The product can still exceed the budget when a side was clamped up to 1
    # (extreme aspect ratios) or through floating-point rounding. Shrink the
    # longer side, ties going to the width. Each step jumps directly to the
    # value a one-pixel-at-a-time walk would stop at -- either the largest
    # length that fits the budget or the point where the other side becomes
    # the longer one -- so the result matches the step-by-step walk without
    # its potentially enormous iteration count.
    while new_w * new_h > max_image_pixels:
        if new_w >= new_h:
            stop_at = max(max_image_pixels // new_h, new_h - 1)
            new_w = max(1, min(new_w - 1, stop_at))
        else:
            stop_at = max(max_image_pixels // new_w, new_w)
            new_h = max(1, min(new_h - 1, stop_at))

    return new_w, new_h