Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ DS_OCR_TOKENS_PER_MINUTE=40000 # 每分钟最大令牌数(默认 40000)
# docling: 强制使用 Docling
PARSER_MODE=auto

# --- Parser 大小阈值 ---
# 文件大小阈值(KB,1KB = 1024 字节),严格小于此值的 PDF/Office 文件使用 DeepSeek-OCR
# 默认 500KB,可根据业务需求调整
PARSER_SIZE_THRESHOLD_KB=500

# --- 复杂度评分阈值 ---
COMPLEXITY_SIMPLE_THRESHOLD=20 # < 20:简单文档 → Free OCR
COMPLEXITY_MEDIUM_TABLE_THRESHOLD=40 # 20-40:中等表格 → Grounding
Expand Down
19 changes: 19 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,23 @@ class Config:
populate_by_name = True


# ==================== Parser Configuration ====================

class ParserConfig(BaseSettings):
    """Settings that control which parser is selected for a file.

    The inner ``Config`` points at the ``.env`` file, ignores extra
    entries, and allows the field to be populated by name as well as by
    its alias.
    """

    # Size cut-off expressed in KB; aliased to PARSER_SIZE_THRESHOLD_KB.
    size_threshold_kb: int = Field(
        alias="PARSER_SIZE_THRESHOLD_KB",
        description=(
            "File size threshold in KB for parser selection "
            "(files smaller than this use DeepSeek-OCR)"
        ),
        default=500,
    )

    class Config:
        populate_by_name = True
        extra = "ignore"
        env_file = ".env"


# ==================== Multi-Tenant Configuration ====================

class MultiTenantConfig(BaseSettings):
Expand Down Expand Up @@ -293,6 +310,7 @@ def __init__(self):
self.ds_ocr = DeepSeekOCRConfig()
self.storage = StorageConfig()
self.lightrag_query = LightRAGQueryConfig()
self.parser = ParserConfig()
self.multi_tenant = MultiTenantConfig()

def validate(self) -> None:
Expand Down Expand Up @@ -333,6 +351,7 @@ def print_summary(self) -> None:
print(f"Storage - Vector: {self.storage.vector_storage}")
print(f"Storage - Graph: {self.storage.graph_storage}")
print(f"Storage - DocStatus: {self.storage.doc_status_storage}")
print(f"Parser Size Threshold: {self.parser.size_threshold_kb}KB")
print(f"Max Tenant Instances: {self.multi_tenant.max_tenant_instances}")
print("=" * 60)

Expand Down
7 changes: 5 additions & 2 deletions src/rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)

ext = os.path.splitext(filename)[1].lower()

# 计算文件大小阈值(字节)
size_threshold_bytes = config.parser.size_threshold_kb * 1024

# 纯文本文件 → 不需要解析器(直接插入 LightRAG)
if ext in ['.txt', '.md', '.markdown', '.json', '.csv']:
return (None, None)
Expand Down Expand Up @@ -182,7 +185,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
return ("deepseek-ocr", "free_ocr")

# PDF/Office 小文件 → DeepSeek-OCR(快速)
if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < 500 * 1024: # < 500KB
if ext in ['.pdf', '.docx', '.xlsx', '.pptx'] and file_size < size_threshold_bytes:
return ("deepseek-ocr", "free_ocr")

# 大文件或其他 → MinerU(默认)
Expand Down Expand Up @@ -217,7 +220,7 @@ def select_parser_by_file(filename: str, file_size: int, file_path: str = None)
# 降级:使用简单规则
if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
return ("deepseek-ocr", "free_ocr")
elif file_size < 500 * 1024:
elif file_size < size_threshold_bytes:
return ("deepseek-ocr", "free_ocr")
else:
return ("mineru", None)