From 7cd7c2edd21e0b51e481d5d3937cecd3c1c29bef Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Dec 2025 09:35:14 +0000
Subject: [PATCH 1/3] Initial plan

From b94b31802dbac9aebaf41e5e1873723aa5733676 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Dec 2025 09:45:52 +0000
Subject: [PATCH 2/3] feat: extract hard-coded token estimation values into
 config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add estimated_output_tokens, vlm_estimated_output_tokens, vlm_max_tokens,
  and vlm_image_tokens_estimate to LLMConfig
- Add image_tokens_estimate to DeepSeekOCRConfig
- Update multi_tenant.py to use the config values instead of hard-coded ones
- Update deepseek_ocr_client.py to use the config values instead of hard-coded ones
- Update env.example to document the new environment variables
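A minimal sketch of the override mechanism these fields rely on (the class and
field below are trimmed-down, hypothetical versions of the src/config.py
additions; pydantic v1-style BaseSettings is assumed, as the inner `class
Config` in the diff suggests — on pydantic v2 the import moves to
pydantic_settings):

    import os
    from pydantic import BaseSettings, Field  # pydantic v1 API

    class LLMConfig(BaseSettings):
        # Default applies when LLM_ESTIMATED_OUTPUT_TOKENS is not set.
        estimated_output_tokens: int = Field(
            default=3000,
            description="Estimated output tokens for LLM calls"
        )

        class Config:
            env_prefix = "LLM_"  # fields resolve from LLM_<FIELD_NAME> env vars
            env_file = ".env"

    os.environ["LLM_ESTIMATED_OUTPUT_TOKENS"] = "4000"  # e.g. set via .env
    print(LLMConfig().estimated_output_tokens)  # -> 4000, env overrides default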
per minute") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") + # Token estimation for rate limiting + image_tokens_estimate: int = Field( + default=1000, + description="Estimated tokens for image input in OCR calls" + ) + class Config: env_prefix = "DS_OCR_" env_file = ".env" diff --git a/src/deepseek_ocr_client.py b/src/deepseek_ocr_client.py index 1cfcc76..ef13ec1 100644 --- a/src/deepseek_ocr_client.py +++ b/src/deepseek_ocr_client.py @@ -57,6 +57,9 @@ class DSSeekConfig: fallback_mode: str = field(default_factory=lambda: config.ds_ocr.fallback_mode) min_output_threshold: int = field(default_factory=lambda: config.ds_ocr.min_output_threshold) + # Token 估算配置 + image_tokens_estimate: int = field(default_factory=lambda: config.ds_ocr.image_tokens_estimate) + def __post_init__(self): """验证配置""" if not self.api_key: @@ -281,8 +284,8 @@ async def _call_api(self, img_base64: str, prompt: str) -> str: Raises: Exception: API 调用失败时抛出异常 """ - # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) - estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens + # 估算 tokens(提示词 + 图片 + 输出) + estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens # 获取速率限制许可 await self.rate_limiter.rate_limiter.acquire(estimated_tokens) @@ -348,8 +351,8 @@ def _call_api_sync(self, img_base64: str, prompt: str) -> str: """ import asyncio - # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) - estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens + # 估算 tokens(提示词 + 图片 + 输出) + estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens # 在同步函数中调用异步速率限制器 try: diff --git a/src/multi_tenant.py b/src/multi_tenant.py index d4cb0b1..5f47604 100644 --- a/src/multi_tenant.py +++ b/src/multi_tenant.py @@ -64,6 +64,12 @@ def __init__( self.max_async = config.llm.max_async self.vlm_timeout = config.llm.vlm_timeout + # Token 估算配置 + self.llm_estimated_output_tokens = config.llm.estimated_output_tokens + self.vlm_estimated_output_tokens = config.llm.vlm_estimated_output_tokens + self.vlm_max_tokens = config.llm.vlm_max_tokens + self.vlm_image_tokens_estimate = config.llm.vlm_image_tokens_estimate + # 存储配置 self.use_external_storage = config.storage.use_external self.kv_storage = config.storage.kv_storage @@ -106,11 +112,14 @@ def _create_llm_func(self, llm_config: Dict): # 获取 rate_limiter 实际使用的并发数(将用于 LightRAG) actual_max_concurrent = rate_limiter.max_concurrent + # 获取 token 估算配置(支持租户覆盖) + llm_estimated_output = llm_config.get("estimated_output_tokens", self.llm_estimated_output_tokens) + def llm_model_func(prompt, **kwargs): # 精确计算输入 tokens(使用 tiktoken) input_tokens = count_tokens(prompt, model="cl100k_base") # 保守估算输出 tokens(实体提取通常输出较长) - estimated_output = 3000 # 50 entities + 46 relations ≈ 3000 tokens + estimated_output = llm_estimated_output # 从配置读取 estimated_tokens = input_tokens + estimated_output # Debug: 输出 token 计数 @@ -295,6 +304,11 @@ def _create_vision_model_func(self, llm_config: Dict): tokens_per_minute=tokens_per_minute ) + # 获取 VLM token 估算配置(支持租户覆盖) + vlm_image_tokens = llm_config.get("vlm_image_tokens_estimate", self.vlm_image_tokens_estimate) + vlm_estimated_output = llm_config.get("vlm_estimated_output_tokens", self.vlm_estimated_output_tokens) + vlm_max_tokens = llm_config.get("vlm_max_tokens", self.vlm_max_tokens) + async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: str) -> str: """ 使用 VLM 理解图片内容(带速率限制) @@ 
-309,8 +323,8 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st """ # 精确计算 tokens(使用 tiktoken) prompt_tokens = count_tokens(prompt, model="cl100k_base") - image_tokens = 200 # 图片约 200 tokens(固定估算) - estimated_output = 500 # VLM 输出通常较短 + image_tokens = vlm_image_tokens # 从配置读取 + estimated_output = vlm_estimated_output # 从配置读取 estimated_tokens = prompt_tokens + image_tokens + estimated_output # Debug: 输出 token 计数 @@ -336,7 +350,7 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st ] } ], - "max_tokens": 500, + "max_tokens": vlm_max_tokens, # 从配置读取 "temperature": 0.1 } From e2cb6e7a0fdf265cbf15d12adde484264b995b7a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:03:40 +0000 Subject: [PATCH 3/3] =?UTF-8?q?revert:=20=E6=92=A4=E9=94=80=20token=20?= =?UTF-8?q?=E4=BC=B0=E7=AE=97=E5=80=BC=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根据用户反馈,这些估算值应该通过 LLM_REQUESTS_PER_MINUTE 和 LLM_TOKENS_PER_MINUTE 自动计算,不需要额外的配置项。 rate_limiter.py 中已有 avg_tokens_map 用于自动计算并发数。 Co-authored-by: BukeLy <19304666+BukeLy@users.noreply.github.com> --- env.example | 11 ----------- src/config.py | 26 -------------------------- src/deepseek_ocr_client.py | 11 ++++------- src/multi_tenant.py | 22 ++++------------------ 4 files changed, 8 insertions(+), 62 deletions(-) diff --git a/env.example b/env.example index d068b1b..db7a4f2 100644 --- a/env.example +++ b/env.example @@ -31,14 +31,6 @@ LLM_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(包含输入+输 # # 推荐:不设置此项,让系统自动计算以确保不超过 TPM/RPM 限制 # # 计算示例:min(800, 40000/3500) = min(800, 11) = 11 并发 -# --- LLM Token 估算配置(用于速率限制) --- -# 估算输出 tokens 数量,用于速率限制计算 -# 如果估算过高,并发会受限;如果估算不足,可能触发 429 错误 -# LLM_ESTIMATED_OUTPUT_TOKENS=3000 # LLM 输出估算(实体提取约 3000 tokens,默认 3000) -# LLM_VLM_ESTIMATED_OUTPUT_TOKENS=500 # VLM 输出估算(图片描述较短,默认 500) -# LLM_VLM_MAX_TOKENS=500 # VLM API 最大输出 tokens(默认 500) -# LLM_VLM_IMAGE_TOKENS_ESTIMATE=200 # VLM 图片输入估算 tokens(默认 200) - # ====== Embedding 配置 ====== # 用于向量化文本,支持语义检索 EMBEDDING_BASE_URL="https://api.siliconflow.cn/v1" @@ -150,9 +142,6 @@ DS_OCR_REQUESTS_PER_MINUTE=800 # 每分钟最大请求数(默认 800) DS_OCR_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(默认 40000) # DS_OCR_MAX_ASYNC=8 # 【可选】全局默认并发数(未设置时使用硬编码默认值 8) -# --- DeepSeek-OCR Token 估算配置(用于速率限制) --- -# DS_OCR_IMAGE_TOKENS_ESTIMATE=1000 # 图片输入估算 tokens(默认 1000) - # ====== 智能 Parser 选择器配置(v2.0) ====== # 基于文档复杂度自动选择最优 Parser 和模式 diff --git a/src/config.py b/src/config.py index 130f312..0d1f8f5 100644 --- a/src/config.py +++ b/src/config.py @@ -30,26 +30,6 @@ class LLMConfig(BaseSettings): tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute (input + output)") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") - # Token estimation for rate limiting (LLM) - estimated_output_tokens: int = Field( - default=3000, - description="Estimated output tokens for LLM calls (entity extraction typically outputs ~3000 tokens)" - ) - - # Token estimation for rate limiting (VLM) - vlm_estimated_output_tokens: int = Field( - default=500, - description="Estimated output tokens for VLM calls (image descriptions are typically shorter)" - ) - vlm_max_tokens: int = Field( - default=500, - description="Maximum output tokens for VLM API calls" - ) - vlm_image_tokens_estimate: int = Field( - default=200, - description="Estimated tokens for image input in VLM 
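Illustratively, that auto-calculation amounts to capping concurrency by both
budgets (a hypothetical sketch — the real avg_tokens_map lives in
rate_limiter.py and is not part of this diff; the figures mirror the worked
example in env.example):

    def auto_max_concurrent(requests_per_minute: int,
                            tokens_per_minute: int,
                            avg_tokens_per_request: int) -> int:
        # RPM bounds the request rate directly; TPM divided by the average
        # tokens per request bounds how many requests the token budget allows.
        return min(requests_per_minute, tokens_per_minute // avg_tokens_per_request)

    # env.example's example: min(800, 40000/3500) = min(800, 11) = 11 concurrent
    print(auto_max_concurrent(800, 40000, 3500))  # -> 11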
calls" - ) - class Config: env_prefix = "LLM_" env_file = ".env" @@ -169,12 +149,6 @@ class DeepSeekOCRConfig(BaseSettings): tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") - # Token estimation for rate limiting - image_tokens_estimate: int = Field( - default=1000, - description="Estimated tokens for image input in OCR calls" - ) - class Config: env_prefix = "DS_OCR_" env_file = ".env" diff --git a/src/deepseek_ocr_client.py b/src/deepseek_ocr_client.py index ef13ec1..1cfcc76 100644 --- a/src/deepseek_ocr_client.py +++ b/src/deepseek_ocr_client.py @@ -57,9 +57,6 @@ class DSSeekConfig: fallback_mode: str = field(default_factory=lambda: config.ds_ocr.fallback_mode) min_output_threshold: int = field(default_factory=lambda: config.ds_ocr.min_output_threshold) - # Token 估算配置 - image_tokens_estimate: int = field(default_factory=lambda: config.ds_ocr.image_tokens_estimate) - def __post_init__(self): """验证配置""" if not self.api_key: @@ -284,8 +281,8 @@ async def _call_api(self, img_base64: str, prompt: str) -> str: Raises: Exception: API 调用失败时抛出异常 """ - # 估算 tokens(提示词 + 图片 + 输出) - estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens + # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) + estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens # 获取速率限制许可 await self.rate_limiter.rate_limiter.acquire(estimated_tokens) @@ -351,8 +348,8 @@ def _call_api_sync(self, img_base64: str, prompt: str) -> str: """ import asyncio - # 估算 tokens(提示词 + 图片 + 输出) - estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens + # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) + estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens # 在同步函数中调用异步速率限制器 try: diff --git a/src/multi_tenant.py b/src/multi_tenant.py index 5f47604..d4cb0b1 100644 --- a/src/multi_tenant.py +++ b/src/multi_tenant.py @@ -64,12 +64,6 @@ def __init__( self.max_async = config.llm.max_async self.vlm_timeout = config.llm.vlm_timeout - # Token 估算配置 - self.llm_estimated_output_tokens = config.llm.estimated_output_tokens - self.vlm_estimated_output_tokens = config.llm.vlm_estimated_output_tokens - self.vlm_max_tokens = config.llm.vlm_max_tokens - self.vlm_image_tokens_estimate = config.llm.vlm_image_tokens_estimate - # 存储配置 self.use_external_storage = config.storage.use_external self.kv_storage = config.storage.kv_storage @@ -112,14 +106,11 @@ def _create_llm_func(self, llm_config: Dict): # 获取 rate_limiter 实际使用的并发数(将用于 LightRAG) actual_max_concurrent = rate_limiter.max_concurrent - # 获取 token 估算配置(支持租户覆盖) - llm_estimated_output = llm_config.get("estimated_output_tokens", self.llm_estimated_output_tokens) - def llm_model_func(prompt, **kwargs): # 精确计算输入 tokens(使用 tiktoken) input_tokens = count_tokens(prompt, model="cl100k_base") # 保守估算输出 tokens(实体提取通常输出较长) - estimated_output = llm_estimated_output # 从配置读取 + estimated_output = 3000 # 50 entities + 46 relations ≈ 3000 tokens estimated_tokens = input_tokens + estimated_output # Debug: 输出 token 计数 @@ -304,11 +295,6 @@ def _create_vision_model_func(self, llm_config: Dict): tokens_per_minute=tokens_per_minute ) - # 获取 VLM token 估算配置(支持租户覆盖) - vlm_image_tokens = llm_config.get("vlm_image_tokens_estimate", self.vlm_image_tokens_estimate) - vlm_estimated_output = llm_config.get("vlm_estimated_output_tokens", self.vlm_estimated_output_tokens) 
- vlm_max_tokens = llm_config.get("vlm_max_tokens", self.vlm_max_tokens) - async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: str) -> str: """ 使用 VLM 理解图片内容(带速率限制) @@ -323,8 +309,8 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st """ # 精确计算 tokens(使用 tiktoken) prompt_tokens = count_tokens(prompt, model="cl100k_base") - image_tokens = vlm_image_tokens # 从配置读取 - estimated_output = vlm_estimated_output # 从配置读取 + image_tokens = 200 # 图片约 200 tokens(固定估算) + estimated_output = 500 # VLM 输出通常较短 estimated_tokens = prompt_tokens + image_tokens + estimated_output # Debug: 输出 token 计数 @@ -350,7 +336,7 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st ] } ], - "max_tokens": vlm_max_tokens, # 从配置读取 + "max_tokens": 500, "temperature": 0.1 }
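For reference, the estimate this revert restores in deepseek_ocr_client.py
reduces to a one-liner (a hypothetical standalone form of the logic in
_call_api/_call_api_sync, using the ~3-characters-per-token heuristic and the
fixed ~1000-token image cost from the restored comments):

    def estimate_ocr_tokens(prompt: str, max_tokens: int) -> int:
        # prompt heuristic + fixed image cost + configured output cap
        return len(prompt) // 3 + 1000 + max_tokens

    # e.g. a 300-character prompt with max_tokens=2000 reserves
    # 100 + 1000 + 2000 = 3100 tokens from the rate limiter
    print(estimate_ocr_tokens("x" * 300, 2000))  # -> 3100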