From 7cd7c2edd21e0b51e481d5d3937cecd3c1c29bef Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Dec 2025 09:35:14 +0000
Subject: [PATCH 1/3] Initial plan

From b94b31802dbac9aebaf41e5e1873723aa5733676 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Dec 2025 09:45:52 +0000
Subject: [PATCH 2/3] feat: extract hard-coded token estimation values into
 config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add estimated_output_tokens, vlm_estimated_output_tokens, vlm_max_tokens,
  and vlm_image_tokens_estimate to LLMConfig
- Add image_tokens_estimate to DeepSeekOCRConfig
- Update multi_tenant.py to use the config values instead of hard-coded ones
- Update deepseek_ocr_client.py to use the config values instead of hard-coded ones
- Update env.example to document the new environment variables
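A minimal sketch of the override mechanism these fields rely on (the class and
field below are trimmed-down, hypothetical versions of the src/config.py
additions; pydantic v1-style BaseSettings is assumed, as the inner `class
Config` in the diff suggests — on pydantic v2 the import moves to
pydantic_settings):

    import os
    from pydantic import BaseSettings, Field  # pydantic v1 API

    class LLMConfig(BaseSettings):
        # Default applies when LLM_ESTIMATED_OUTPUT_TOKENS is not set.
        estimated_output_tokens: int = Field(
            default=3000,
            description="Estimated output tokens for LLM calls"
        )

        class Config:
            env_prefix = "LLM_"  # fields resolve from LLM_<FIELD_NAME> env vars
            env_file = ".env"

    os.environ["LLM_ESTIMATED_OUTPUT_TOKENS"] = "4000"  # e.g. set via .env
    print(LLMConfig().estimated_output_tokens)  # -> 4000, env overrides default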
per minute") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") + # Token estimation for rate limiting + image_tokens_estimate: int = Field( + default=1000, + description="Estimated tokens for image input in OCR calls" + ) + class Config: env_prefix = "DS_OCR_" env_file = ".env" diff --git a/src/deepseek_ocr_client.py b/src/deepseek_ocr_client.py index 1cfcc76..ef13ec1 100644 --- a/src/deepseek_ocr_client.py +++ b/src/deepseek_ocr_client.py @@ -57,6 +57,9 @@ class DSSeekConfig: fallback_mode: str = field(default_factory=lambda: config.ds_ocr.fallback_mode) min_output_threshold: int = field(default_factory=lambda: config.ds_ocr.min_output_threshold) + # Token 估算配置 + image_tokens_estimate: int = field(default_factory=lambda: config.ds_ocr.image_tokens_estimate) + def __post_init__(self): """验证配置""" if not self.api_key: @@ -281,8 +284,8 @@ async def _call_api(self, img_base64: str, prompt: str) -> str: Raises: Exception: API 调用失败时抛出异常 """ - # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) - estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens + # 估算 tokens(提示词 + 图片 + 输出) + estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens # 获取速率限制许可 await self.rate_limiter.rate_limiter.acquire(estimated_tokens) @@ -348,8 +351,8 @@ def _call_api_sync(self, img_base64: str, prompt: str) -> str: """ import asyncio - # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) - estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens + # 估算 tokens(提示词 + 图片 + 输出) + estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens # 在同步函数中调用异步速率限制器 try: diff --git a/src/multi_tenant.py b/src/multi_tenant.py index d4cb0b1..5f47604 100644 --- a/src/multi_tenant.py +++ b/src/multi_tenant.py @@ -64,6 +64,12 @@ def __init__( self.max_async = config.llm.max_async self.vlm_timeout = config.llm.vlm_timeout + # Token 估算配置 + self.llm_estimated_output_tokens = config.llm.estimated_output_tokens + self.vlm_estimated_output_tokens = config.llm.vlm_estimated_output_tokens + self.vlm_max_tokens = config.llm.vlm_max_tokens + self.vlm_image_tokens_estimate = config.llm.vlm_image_tokens_estimate + # 存储配置 self.use_external_storage = config.storage.use_external self.kv_storage = config.storage.kv_storage @@ -106,11 +112,14 @@ def _create_llm_func(self, llm_config: Dict): # 获取 rate_limiter 实际使用的并发数(将用于 LightRAG) actual_max_concurrent = rate_limiter.max_concurrent + # 获取 token 估算配置(支持租户覆盖) + llm_estimated_output = llm_config.get("estimated_output_tokens", self.llm_estimated_output_tokens) + def llm_model_func(prompt, **kwargs): # 精确计算输入 tokens(使用 tiktoken) input_tokens = count_tokens(prompt, model="cl100k_base") # 保守估算输出 tokens(实体提取通常输出较长) - estimated_output = 3000 # 50 entities + 46 relations ≈ 3000 tokens + estimated_output = llm_estimated_output # 从配置读取 estimated_tokens = input_tokens + estimated_output # Debug: 输出 token 计数 @@ -295,6 +304,11 @@ def _create_vision_model_func(self, llm_config: Dict): tokens_per_minute=tokens_per_minute ) + # 获取 VLM token 估算配置(支持租户覆盖) + vlm_image_tokens = llm_config.get("vlm_image_tokens_estimate", self.vlm_image_tokens_estimate) + vlm_estimated_output = llm_config.get("vlm_estimated_output_tokens", self.vlm_estimated_output_tokens) + vlm_max_tokens = llm_config.get("vlm_max_tokens", self.vlm_max_tokens) + async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: str) -> str: """ 使用 VLM 理解图片内容(带速率限制) @@ 
-309,8 +323,8 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st """ # 精确计算 tokens(使用 tiktoken) prompt_tokens = count_tokens(prompt, model="cl100k_base") - image_tokens = 200 # 图片约 200 tokens(固定估算) - estimated_output = 500 # VLM 输出通常较短 + image_tokens = vlm_image_tokens # 从配置读取 + estimated_output = vlm_estimated_output # 从配置读取 estimated_tokens = prompt_tokens + image_tokens + estimated_output # Debug: 输出 token 计数 @@ -336,7 +350,7 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st ] } ], - "max_tokens": 500, + "max_tokens": vlm_max_tokens, # 从配置读取 "temperature": 0.1 } From e2cb6e7a0fdf265cbf15d12adde484264b995b7a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:03:40 +0000 Subject: [PATCH 3/3] =?UTF-8?q?revert:=20=E6=92=A4=E9=94=80=20token=20?= =?UTF-8?q?=E4=BC=B0=E7=AE=97=E5=80=BC=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根据用户反馈,这些估算值应该通过 LLM_REQUESTS_PER_MINUTE 和 LLM_TOKENS_PER_MINUTE 自动计算,不需要额外的配置项。 rate_limiter.py 中已有 avg_tokens_map 用于自动计算并发数。 Co-authored-by: BukeLy <19304666+BukeLy@users.noreply.github.com> --- env.example | 11 ----------- src/config.py | 26 -------------------------- src/deepseek_ocr_client.py | 11 ++++------- src/multi_tenant.py | 22 ++++------------------ 4 files changed, 8 insertions(+), 62 deletions(-) diff --git a/env.example b/env.example index d068b1b..db7a4f2 100644 --- a/env.example +++ b/env.example @@ -31,14 +31,6 @@ LLM_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(包含输入+输 # # 推荐:不设置此项,让系统自动计算以确保不超过 TPM/RPM 限制 # # 计算示例:min(800, 40000/3500) = min(800, 11) = 11 并发 -# --- LLM Token 估算配置(用于速率限制) --- -# 估算输出 tokens 数量,用于速率限制计算 -# 如果估算过高,并发会受限;如果估算不足,可能触发 429 错误 -# LLM_ESTIMATED_OUTPUT_TOKENS=3000 # LLM 输出估算(实体提取约 3000 tokens,默认 3000) -# LLM_VLM_ESTIMATED_OUTPUT_TOKENS=500 # VLM 输出估算(图片描述较短,默认 500) -# LLM_VLM_MAX_TOKENS=500 # VLM API 最大输出 tokens(默认 500) -# LLM_VLM_IMAGE_TOKENS_ESTIMATE=200 # VLM 图片输入估算 tokens(默认 200) - # ====== Embedding 配置 ====== # 用于向量化文本,支持语义检索 EMBEDDING_BASE_URL="https://api.siliconflow.cn/v1" @@ -150,9 +142,6 @@ DS_OCR_REQUESTS_PER_MINUTE=800 # 每分钟最大请求数(默认 800) DS_OCR_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(默认 40000) # DS_OCR_MAX_ASYNC=8 # 【可选】全局默认并发数(未设置时使用硬编码默认值 8) -# --- DeepSeek-OCR Token 估算配置(用于速率限制) --- -# DS_OCR_IMAGE_TOKENS_ESTIMATE=1000 # 图片输入估算 tokens(默认 1000) - # ====== 智能 Parser 选择器配置(v2.0) ====== # 基于文档复杂度自动选择最优 Parser 和模式 diff --git a/src/config.py b/src/config.py index 130f312..0d1f8f5 100644 --- a/src/config.py +++ b/src/config.py @@ -30,26 +30,6 @@ class LLMConfig(BaseSettings): tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute (input + output)") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") - # Token estimation for rate limiting (LLM) - estimated_output_tokens: int = Field( - default=3000, - description="Estimated output tokens for LLM calls (entity extraction typically outputs ~3000 tokens)" - ) - - # Token estimation for rate limiting (VLM) - vlm_estimated_output_tokens: int = Field( - default=500, - description="Estimated output tokens for VLM calls (image descriptions are typically shorter)" - ) - vlm_max_tokens: int = Field( - default=500, - description="Maximum output tokens for VLM API calls" - ) - vlm_image_tokens_estimate: int = Field( - default=200, - description="Estimated tokens for image input in VLM 
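Illustratively, that auto-calculation amounts to capping concurrency by both
budgets (a hypothetical sketch — the real avg_tokens_map lives in
rate_limiter.py and is not part of this diff; the figures mirror the worked
example in env.example):

    def auto_max_concurrent(requests_per_minute: int,
                            tokens_per_minute: int,
                            avg_tokens_per_request: int) -> int:
        # RPM bounds the request rate directly; TPM divided by the average
        # tokens per request bounds how many requests the token budget allows.
        return min(requests_per_minute, tokens_per_minute // avg_tokens_per_request)

    # env.example's example: min(800, 40000/3500) = min(800, 11) = 11 concurrent
    print(auto_max_concurrent(800, 40000, 3500))  # -> 11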
calls" - ) - class Config: env_prefix = "LLM_" env_file = ".env" @@ -169,12 +149,6 @@ class DeepSeekOCRConfig(BaseSettings): tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute") max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)") - # Token estimation for rate limiting - image_tokens_estimate: int = Field( - default=1000, - description="Estimated tokens for image input in OCR calls" - ) - class Config: env_prefix = "DS_OCR_" env_file = ".env" diff --git a/src/deepseek_ocr_client.py b/src/deepseek_ocr_client.py index ef13ec1..1cfcc76 100644 --- a/src/deepseek_ocr_client.py +++ b/src/deepseek_ocr_client.py @@ -57,9 +57,6 @@ class DSSeekConfig: fallback_mode: str = field(default_factory=lambda: config.ds_ocr.fallback_mode) min_output_threshold: int = field(default_factory=lambda: config.ds_ocr.min_output_threshold) - # Token 估算配置 - image_tokens_estimate: int = field(default_factory=lambda: config.ds_ocr.image_tokens_estimate) - def __post_init__(self): """验证配置""" if not self.api_key: @@ -284,8 +281,8 @@ async def _call_api(self, img_base64: str, prompt: str) -> str: Raises: Exception: API 调用失败时抛出异常 """ - # 估算 tokens(提示词 + 图片 + 输出) - estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens + # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) + estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens # 获取速率限制许可 await self.rate_limiter.rate_limiter.acquire(estimated_tokens) @@ -351,8 +348,8 @@ def _call_api_sync(self, img_base64: str, prompt: str) -> str: """ import asyncio - # 估算 tokens(提示词 + 图片 + 输出) - estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens + # 估算 tokens(提示词 + 图片约 1000 tokens + 输出约 2000 tokens) + estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens # 在同步函数中调用异步速率限制器 try: diff --git a/src/multi_tenant.py b/src/multi_tenant.py index 5f47604..d4cb0b1 100644 --- a/src/multi_tenant.py +++ b/src/multi_tenant.py @@ -64,12 +64,6 @@ def __init__( self.max_async = config.llm.max_async self.vlm_timeout = config.llm.vlm_timeout - # Token 估算配置 - self.llm_estimated_output_tokens = config.llm.estimated_output_tokens - self.vlm_estimated_output_tokens = config.llm.vlm_estimated_output_tokens - self.vlm_max_tokens = config.llm.vlm_max_tokens - self.vlm_image_tokens_estimate = config.llm.vlm_image_tokens_estimate - # 存储配置 self.use_external_storage = config.storage.use_external self.kv_storage = config.storage.kv_storage @@ -112,14 +106,11 @@ def _create_llm_func(self, llm_config: Dict): # 获取 rate_limiter 实际使用的并发数(将用于 LightRAG) actual_max_concurrent = rate_limiter.max_concurrent - # 获取 token 估算配置(支持租户覆盖) - llm_estimated_output = llm_config.get("estimated_output_tokens", self.llm_estimated_output_tokens) - def llm_model_func(prompt, **kwargs): # 精确计算输入 tokens(使用 tiktoken) input_tokens = count_tokens(prompt, model="cl100k_base") # 保守估算输出 tokens(实体提取通常输出较长) - estimated_output = llm_estimated_output # 从配置读取 + estimated_output = 3000 # 50 entities + 46 relations ≈ 3000 tokens estimated_tokens = input_tokens + estimated_output # Debug: 输出 token 计数 @@ -304,11 +295,6 @@ def _create_vision_model_func(self, llm_config: Dict): tokens_per_minute=tokens_per_minute ) - # 获取 VLM token 估算配置(支持租户覆盖) - vlm_image_tokens = llm_config.get("vlm_image_tokens_estimate", self.vlm_image_tokens_estimate) - vlm_estimated_output = llm_config.get("vlm_estimated_output_tokens", self.vlm_estimated_output_tokens) 
- vlm_max_tokens = llm_config.get("vlm_max_tokens", self.vlm_max_tokens) - async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: str) -> str: """ 使用 VLM 理解图片内容(带速率限制) @@ -323,8 +309,8 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st """ # 精确计算 tokens(使用 tiktoken) prompt_tokens = count_tokens(prompt, model="cl100k_base") - image_tokens = vlm_image_tokens # 从配置读取 - estimated_output = vlm_estimated_output # 从配置读取 + image_tokens = 200 # 图片约 200 tokens(固定估算) + estimated_output = 500 # VLM 输出通常较短 estimated_tokens = prompt_tokens + image_tokens + estimated_output # Debug: 输出 token 计数 @@ -350,7 +336,7 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st ] } ], - "max_tokens": vlm_max_tokens, # 从配置读取 + "max_tokens": 500, "temperature": 0.1 }
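For reference, the estimate this revert restores in deepseek_ocr_client.py
reduces to a one-liner (a hypothetical standalone form of the logic in
_call_api/_call_api_sync, using the ~3-characters-per-token heuristic and the
fixed ~1000-token image cost from the restored comments):

    def estimate_ocr_tokens(prompt: str, max_tokens: int) -> int:
        # prompt heuristic + fixed image cost + configured output cap
        return len(prompt) // 3 + 1000 + max_tokens

    # e.g. a 300-character prompt with max_tokens=2000 reserves
    # 100 + 1000 + 2000 = 3100 tokens from the rate limiter
    print(estimate_ocr_tokens("x" * 300, 2000))  # -> 3100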