BukeLy · Copilot · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/env.example b/env.example
@@ -152,6 +152,11 @@ DS_OCR_TOKENS_PER_MINUTE=40000     # 每分钟最大令牌数（默认 40000）
 # docling: 强制使用 Docling
 PARSER_MODE=auto
 
+# --- Parser 大小阈值 ---
+# 文件大小阈值（KB），小于此值的 PDF/Office 文件使用 DeepSeek-OCR
+# 默认 500KB，可根据业务需求调整
+PARSER_SIZE_THRESHOLD_KB=500
+
 # --- 复杂度评分阈值 ---
 COMPLEXITY_SIMPLE_THRESHOLD=20                  # < 20：简单文档 → Free OCR
 COMPLEXITY_MEDIUM_TABLE_THRESHOLD=40            # 20-40：中等表格 → Grounding

diff --git a/src/config.py b/src/config.py
@@ -230,6 +230,23 @@ class Config:
         populate_by_name = True
 
 
+# ==================== Parser Configuration ====================
+
+class ParserConfig(BaseSettings):
+    """Parser selection settings; holds the file-size cutoff (in KB) used when choosing a parser."""
+
+    size_threshold_kb: int = Field(
+        default=500,  # in KB; matches the previously hard-coded `500 * 1024` byte check in src/rag.py
+        description="File size threshold in KB for parser selection (files smaller than this use DeepSeek-OCR)",
+        alias="PARSER_SIZE_THRESHOLD_KB"  # name of the variable documented in env.example
+    )
+
+    class Config:
+        env_file = ".env"
+        extra = "ignore"
+        populate_by_name = True
+
+
 # ==================== Multi-Tenant Configuration ====================
 
 class MultiTenantConfig(BaseSettings):
@@ -293,6 +310,7 @@ def __init__(self):
         self.ds_ocr = DeepSeekOCRConfig()
         self.storage = StorageConfig()
         self.lightrag_query = LightRAGQueryConfig()
+        self.parser = ParserConfig()
         self.multi_tenant = MultiTenantConfig()
 
     def validate(self) -> None:
@@ -333,6 +351,7 @@ def print_summary(self) -> None:
         print(f"Storage - Vector: {self.storage.vector_storage}")
         print(f"Storage - Graph: {self.storage.graph_storage}")
         print(f"Storage - DocStatus: {self.storage.doc_status_storage}")
+        print(f"Parser Size Threshold: {self.parser.size_threshold_kb}KB")
         print(f"Max Tenant Instances: {self.multi_tenant.max_tenant_instances}")
         print("=" * 60)
 

diff --git a/src/rag.py b/src/rag.py
@@ -152,6 +152,9 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
 
     ext = os.path.splitext(filename)[1].lower()
 
+    # 计算文件大小阈值（字节）
+    size_threshold_bytes = config.parser.size_threshold_kb * 1024
+
     # 纯文本文件 → 不需要解析器（直接插入 LightRAG）
     if ext in ['.txt', '.md', '.markdown', '.json', '.csv']:
         return (None, None)
@@ -182,7 +185,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
             return ("deepseek-ocr", "free_ocr")
 
         # PDF/Office 小文件 → DeepSeek-OCR（快速）
-        if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < 500 * 1024:  # < 500KB
+        if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < size_threshold_bytes:
             return ("deepseek-ocr", "free_ocr")
 
         # 大文件或其他 → MinerU（默认）
@@ -217,7 +220,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
         # 降级：使用简单规则
         if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
             return ("deepseek-ocr", "free_ocr")
-        elif file_size < 500 * 1024:
+        elif file_size < size_threshold_bytes:
             return ("deepseek-ocr", "free_ocr")
         else:
             return ("mineru", None)