diff --git a/env.example b/env.example
index db7a4f2..6307e93 100644
--- a/env.example
+++ b/env.example
@@ -152,6 +152,11 @@ DS_OCR_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(默认 40000)
 # docling: 强制使用 Docling
 PARSER_MODE=auto
 
+# --- Parser size threshold ---
+# File size threshold in KB: PDF/Office files smaller than this use DeepSeek-OCR.
+# Defaults to 500 KB; adjust to fit your workload.
+PARSER_SIZE_THRESHOLD_KB=500
+
 # --- 复杂度评分阈值 ---
 COMPLEXITY_SIMPLE_THRESHOLD=20 # < 20:简单文档 → Free OCR
 COMPLEXITY_MEDIUM_TABLE_THRESHOLD=40 # 20-40:中等表格 → Grounding
diff --git a/src/config.py b/src/config.py
index 0d1f8f5..3b1a423 100644
--- a/src/config.py
+++ b/src/config.py
@@ -230,6 +230,30 @@ class Config:
         populate_by_name = True
 
 
+# ==================== Parser Configuration ====================
+
+class ParserConfig(BaseSettings):
+    """Parser selection settings.
+
+    Values are loaded by the settings base class, with ``.env`` as the env
+    file; unknown keys are ignored.
+    """
+
+    # Exclusive upper bound, in KB, for routing a file to DeepSeek-OCR.
+    # 0 disables size-based routing; negative values are rejected as misconfiguration.
+    size_threshold_kb: int = Field(
+        default=500,
+        ge=0,
+        description="File size threshold in KB for parser selection (files smaller than this use DeepSeek-OCR)",
+        alias="PARSER_SIZE_THRESHOLD_KB"
+    )
+
+    class Config:
+        env_file = ".env"
+        extra = "ignore"
+        populate_by_name = True
+
+
 # ==================== Multi-Tenant Configuration ====================
 
 class MultiTenantConfig(BaseSettings):
@@ -293,6 +317,7 @@ def __init__(self):
         self.ds_ocr = DeepSeekOCRConfig()
         self.storage = StorageConfig()
         self.lightrag_query = LightRAGQueryConfig()
+        self.parser = ParserConfig()
         self.multi_tenant = MultiTenantConfig()
 
     def validate(self) -> None:
@@ -333,6 +358,7 @@ def print_summary(self) -> None:
         print(f"Storage - Vector: {self.storage.vector_storage}")
         print(f"Storage - Graph: {self.storage.graph_storage}")
         print(f"Storage - DocStatus: {self.storage.doc_status_storage}")
+        print(f"Parser Size Threshold: {self.parser.size_threshold_kb}KB")
         print(f"Max Tenant Instances: {self.multi_tenant.max_tenant_instances}")
         print("=" * 60)
 
diff --git a/src/rag.py b/src/rag.py
index 2d7b2d7..d8c8249 100644
--- a/src/rag.py
+++ b/src/rag.py
@@ -152,6 +152,9 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
 
     ext = os.path.splitext(filename)[1].lower()
 
+    # File-size threshold in bytes (the configured value is in KB)
+    size_threshold_bytes = config.parser.size_threshold_kb * 1024
+
     # 纯文本文件 → 不需要解析器(直接插入 LightRAG)
     if ext in ['.txt', '.md', '.markdown', '.json', '.csv']:
         return (None, None)
@@ -182,7 +185,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
         return ("deepseek-ocr", "free_ocr")
 
     # PDF/Office 小文件 → DeepSeek-OCR(快速)
-    if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < 500 * 1024: # < 500KB
+    if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < size_threshold_bytes:
         return ("deepseek-ocr", "free_ocr")
 
     # 大文件或其他 → MinerU(默认)
@@ -217,7 +220,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
         # 降级:使用简单规则
         if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
             return ("deepseek-ocr", "free_ocr")
-        elif file_size < 500 * 1024:
+        elif file_size < size_threshold_bytes:
             return ("deepseek-ocr", "free_ocr")
         else:
             return ("mineru", None)