From 7cd7c2edd21e0b51e481d5d3937cecd3c1c29bef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:35:14 +0000 Subject: [PATCH 1/3] Initial plan From 97ed6b644cdae7d7bd69c0e03632905655cc1d73 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:41:48 +0000 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20=E5=B0=86=20Parser=20=E5=A4=A7?= =?UTF-8?q?=E5=B0=8F=E9=98=88=E5=80=BC=E9=85=8D=E7=BD=AE=E5=8C=96=20(PARSE?= =?UTF-8?q?R=5FSIZE=5FTHRESHOLD=5FKB)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: BukeLy <19304666+BukeLy@users.noreply.github.com> --- env.example | 5 +++++ src/config.py | 19 +++++++++++++++++++ src/rag.py | 5 +++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/env.example b/env.example index db7a4f2..6307e93 100644 --- a/env.example +++ b/env.example @@ -152,6 +152,11 @@ DS_OCR_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(默认 40000) # docling: 强制使用 Docling PARSER_MODE=auto +# --- Parser 大小阈值 --- +# 文件大小阈值(KB),小于此值的 PDF/Office 文件使用 DeepSeek-OCR +# 默认 500KB,可根据业务需求调整 +PARSER_SIZE_THRESHOLD_KB=500 + # --- 复杂度评分阈值 --- COMPLEXITY_SIMPLE_THRESHOLD=20 # < 20:简单文档 → Free OCR COMPLEXITY_MEDIUM_TABLE_THRESHOLD=40 # 20-40:中等表格 → Grounding diff --git a/src/config.py b/src/config.py index 0d1f8f5..3b1a423 100644 --- a/src/config.py +++ b/src/config.py @@ -230,6 +230,23 @@ class Config: populate_by_name = True +# ==================== Parser Configuration ==================== + +class ParserConfig(BaseSettings): + """Parser Selection Configuration""" + + size_threshold_kb: int = Field( + default=500, + description="File size threshold in KB for parser selection (files smaller than this use DeepSeek-OCR)", + alias="PARSER_SIZE_THRESHOLD_KB" + ) + + class Config: + env_file = ".env" + extra = "ignore" + populate_by_name = True + + # ==================== Multi-Tenant Configuration ==================== class MultiTenantConfig(BaseSettings): @@ -293,6 +310,7 @@ def __init__(self): self.ds_ocr = DeepSeekOCRConfig() self.storage = StorageConfig() self.lightrag_query = LightRAGQueryConfig() + self.parser = ParserConfig() self.multi_tenant = MultiTenantConfig() def validate(self) -> None: @@ -333,6 +351,7 @@ def print_summary(self) -> None: print(f"Storage - Vector: {self.storage.vector_storage}") print(f"Storage - Graph: {self.storage.graph_storage}") print(f"Storage - DocStatus: {self.storage.doc_status_storage}") + print(f"Parser Size Threshold: {self.parser.size_threshold_kb}KB") print(f"Max Tenant Instances: {self.multi_tenant.max_tenant_instances}") print("=" * 60) diff --git a/src/rag.py b/src/rag.py index 2d7b2d7..3254c04 100644 --- a/src/rag.py +++ b/src/rag.py @@ -182,7 +182,8 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None) return ("deepseek-ocr", "free_ocr") # PDF/Office 小文件 → DeepSeek-OCR(快速) - if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < 500 * 1024: # < 500KB + size_threshold_bytes = config.parser.size_threshold_kb * 1024 + if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < size_threshold_bytes: return ("deepseek-ocr", "free_ocr") # 大文件或其他 → MinerU(默认) @@ -217,7 +218,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None) # 降级:使用简单规则 if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']: return ("deepseek-ocr", "free_ocr") - elif file_size < 500 * 1024: + elif file_size < config.parser.size_threshold_kb * 1024: return ("deepseek-ocr", "free_ocr") else: return ("mineru", None) From abf42e010823f5dbb436b245bd8d853177b05e1e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:44:03 +0000 Subject: [PATCH 3/3] =?UTF-8?q?refactor:=20=E5=B0=86=E9=98=88=E5=80=BC?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E6=8F=90=E5=8F=96=E5=88=B0=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E9=A1=B6=E9=83=A8=EF=BC=8C=E9=81=BF=E5=85=8D=E9=87=8D=E5=A4=8D?= =?UTF-8?q?=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: BukeLy <19304666+BukeLy@users.noreply.github.com> --- src/rag.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/rag.py b/src/rag.py index 3254c04..d8c8249 100644 --- a/src/rag.py +++ b/src/rag.py @@ -152,6 +152,9 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None) ext = os.path.splitext(filename)[1].lower() + # 计算文件大小阈值(字节) + size_threshold_bytes = config.parser.size_threshold_kb * 1024 + # 纯文本文件 → 不需要解析器(直接插入 LightRAG) if ext in ['.txt', '.md', '.markdown', '.json', '.csv']: return (None, None) @@ -182,7 +185,6 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None) return ("deepseek-ocr", "free_ocr") # PDF/Office 小文件 → DeepSeek-OCR(快速) - size_threshold_bytes = config.parser.size_threshold_kb * 1024 if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < size_threshold_bytes: return ("deepseek-ocr", "free_ocr") @@ -218,7 +220,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None) # 降级:使用简单规则 if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']: return ("deepseek-ocr", "free_ocr") - elif file_size < config.parser.size_threshold_kb * 1024: + elif file_size < size_threshold_bytes: return ("deepseek-ocr", "free_ocr") else: return ("mineru", None)