Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/CN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,12 @@ PD 分离模式参数

多模态资源的缓存服务器容量,默认为 ``200``

.. option:: --max_image_token_count

单张图片在转换为 token 后允许的最大 token 数量,默认为 ``6128``

当任意图片超过该阈值时,请求会被拒绝。

.. option:: --visual_infer_batch_size

每次推理批次中处理的图像数量,默认为 ``1``
Expand Down
6 changes: 6 additions & 0 deletions docs/EN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,12 @@ Multimodal Parameters

Cache server capacity for multimodal resources, default is ``200``

.. option:: --max_image_token_count

Maximum allowed token count for a single image after tokenization, default is ``6128``

Requests are rejected when any image exceeds this limit. Increasing the limit raises the risk of runtime OOM.

.. option:: --visual_infer_batch_size

Number of images processed in each inference batch, default is ``1``
Expand Down
6 changes: 6 additions & 0 deletions lightllm/server/api_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--cache_capacity", type=int, default=200, help="cache server capacity for multimodal resources"
)
parser.add_argument(
"--max_image_token_count",
type=int,
default=6128,
help="maximum allowed token count for one image after tokenization",
)
parser.add_argument(
"--embed_cache_storage_size",
type=float,
Expand Down
1 change: 1 addition & 0 deletions lightllm/server/core/objs/start_args_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class StartArgs:
enable_decode_microbatch_overlap: bool = field(default=False)
enable_prefill_microbatch_overlap: bool = field(default=False)
cache_capacity: int = field(default=200)
max_image_token_count: int = field(default=6128)
embed_cache_storage_size: float = field(default=4)
data_type: Optional[str] = field(
default=None, metadata={"choices": ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]}
Expand Down
16 changes: 15 additions & 1 deletion lightllm/server/httpserver/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,17 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
self.cache_client.root.set_items_data(update_data_ids)
return

def _assert_image_token_count(self, token_num: int):
    """Reject a single image whose token count exceeds the configured limit.

    Args:
        token_num: Token count computed for one image.

    Raises:
        ValueError: If ``token_num`` is greater than
            ``self.args.max_image_token_count``. The same message is logged
            at warning level before the exception is raised.
    """
    max_count = self.args.max_image_token_count
    if token_num <= max_count:
        return

    # Every fragment except the last ends with a space so the implicitly
    # concatenated sentences do not run together in the final message.
    err_msg = (
        f"single image token count {token_num} exceeds max_image_token_count {max_count}. "
        f"You can increase this limit by setting --max_image_token_count to a larger value when starting "
        f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
    )
    logger.warning(err_msg)
    raise ValueError(err_msg)
Comment on lines +184 to +193
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Avoid using assert for runtime validation of user input or critical safety guards. Python's assert statements are removed when the interpreter is run with optimizations (-O), which would disable this check and potentially lead to OOM. Use an explicit if check and raise a ValueError instead. I've also renamed the method to better reflect its behavior.

Suggested change
def _assert_image_token_count(self, token_num: int):
assert token_num <= self.args.max_image_token_count, (
f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}. "
f"You can increase this limit by setting --max_image_token_count to a larger value when starting "
f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
)
return
def _check_image_token_count(self, token_num: int):
if token_num > self.args.max_image_token_count:
raise ValueError(
f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}. "
f"You can increase this limit by setting --max_image_token_count to a larger value when starting "
f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
)
return


async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
if self.pd_mode.is_P_or_NORMAL():
Expand All @@ -190,6 +201,7 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
data = img.read()
# must after init_imageitem_extral_params
token_num = self.tokenizer.get_image_token_length(img)
self._assert_image_token_count(token_num)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Update the call to the renamed method.

Suggested change
self._assert_image_token_count(token_num)
self._check_image_token_count(token_num)

md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
md5sums.append(md5sum)
img.md5 = md5sum
Expand Down Expand Up @@ -245,7 +257,9 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
for img in multimodal_params.images:
img_count += 1
self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
image_tokens += self.tokenizer.get_image_token_length(img)
token_num = self.tokenizer.get_image_token_length(img)
self._assert_image_token_count(token_num)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Update the call to the renamed method.

Suggested change
self._assert_image_token_count(token_num)
self._check_image_token_count(token_num)

image_tokens += token_num
for audio in multimodal_params.audios:
audio_count += 1
self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)
Expand Down
11 changes: 10 additions & 1 deletion lightllm/server/httpserver_for_pd_master/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,16 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
for img in multimodal_params.images:
img_count += 1
self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
image_tokens += self.tokenizer.get_image_token_length(img)
token_num = self.tokenizer.get_image_token_length(img)
if token_num > self.args.max_image_token_count:
err_msg = (
f"the image token count {token_num} > max_image_token_count {self.args.max_image_token_count}. "
f"You can increase this limit by setting --max_image_token_count to a larger value when starting "
f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
)
logger.warning(err_msg)
raise ValueError(err_msg)
image_tokens += token_num
for audio in multimodal_params.audios:
audio_count += 1
self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)
Expand Down
6 changes: 4 additions & 2 deletions lightllm/server/router/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,9 +436,11 @@ def _generate_new_batch(self):
new_batch = self.req_queue.generate_new_batch(
Batch.merge_two_batch(self.running_batch, self.schedule_new_batch)
)

if new_batch is not None and len(new_batch.reqs) > 0:
logger.info(f"generate new batch, {new_batch.simple_log()}")

self.schedule_new_batch = Batch.merge_two_batch(self.schedule_new_batch, new_batch)
if self.schedule_new_batch is not None:
logger.info(f"gen new batch, {self.schedule_new_batch.simple_log()}")
return

def _multinode_tp_generate_new_batch(self):
Expand Down
Loading