From c3a1f11e057376b99a1b76303d942eb65f778473 Mon Sep 17 00:00:00 2001
From: linuxuser
Date: Thu, 7 May 2026 15:14:37 +0800
Subject: [PATCH] Fix XLSX ingestion memory spikes with streaming parser

---
 openkb/converter.py | 78 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/openkb/converter.py b/openkb/converter.py
index 3f5f5299..253a9cc5 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -8,6 +8,7 @@
 
 import pymupdf
 from markitdown import MarkItDown
+from openpyxl import load_workbook
 
 from openkb.config import load_config
 from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
@@ -33,6 +34,81 @@ def get_pdf_page_count(path: Path) -> int:
         return doc.page_count
 
 
+def _xlsx_cell_text(value: object) -> str:
+    """Render one cell value as single-line text that is safe to join with " | ".
+
+    Whitespace runs (including line breaks inside a cell) collapse to a single
+    space so a value can never spill onto a second output line, and literal
+    pipes are escaped so they cannot be mistaken for the column delimiter.
+    """
+    return " ".join(str(value).split()).replace("|", "\\|")
+
+
+def convert_xlsx_streaming(
+    path: Path,
+    *,
+    max_rows: int = 5000,
+    max_cols: int = 64,
+    max_empty_rows: int = 200,
+) -> str:
+    """Convert .xlsx to markdown using a memory-safe streaming reader.
+
+    This avoids large in-memory DataFrames and pathological worksheet ranges
+    that can cause extreme RAM usage in generic converters.
+
+    Args:
+        path: Location of the .xlsx workbook.
+        max_rows: Last row index scanned on each sheet; blank rows count too.
+        max_cols: Last column index scanned on each row.
+        max_empty_rows: Number of consecutive blank rows after the first data
+            row at which the rest of the sheet is treated as an empty tail.
+
+    Returns:
+        One "# Sheet: <title>" section per worksheet, each followed by one
+        line per non-empty row with its cells joined by " | ", or by a
+        placeholder line when nothing was found within the scan limits.
+    """
+    wb = load_workbook(filename=str(path), read_only=True, data_only=True)
+    lines: list[str] = []
+    try:
+        for ws in wb.worksheets:
+            lines.append(f"# Sheet: {ws.title}")
+            lines.append("")
+            rows_written = 0
+            empty_streak = 0
+
+            for row in ws.iter_rows(
+                min_row=1, max_row=max_rows, min_col=1, max_col=max_cols, values_only=True
+            ):
+                vals = ["" if cell is None else _xlsx_cell_text(cell) for cell in row]
+                # Trailing blank cells (e.g. padding up to max_cols) would otherwise
+                # leave every line ending in a run of dangling " | " delimiters.
+                while vals and not vals[-1]:
+                    vals.pop()
+
+                if not vals:
+                    empty_streak += 1
+                    # Leading blank rows never end the scan; only a sustained
+                    # blank tail after real data does, to avoid walking sparse sheets.
+                    if rows_written and empty_streak >= max_empty_rows:
+                        break
+                    continue
+
+                empty_streak = 0
+                lines.append(" | ".join(vals))
+                rows_written += 1
+
+            if rows_written == 0:
+                lines.append("(No non-empty cells found within scan limits.)")
+
+            lines.append("")
+    finally:
+        # Read-only workbooks keep the underlying file open until closed.
+        wb.close()
+
+    return "\n".join(lines)
+
+
 def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
     """Convert a document and integrate it into the knowledge base.
 
@@ -99,6 +175,8 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
     elif src.suffix.lower() == ".pdf":
         # Use pymupdf dict-mode for PDFs: text + images inline at correct positions
         markdown = convert_pdf_with_images(src, doc_name, images_dir)
+    elif src.suffix.lower() == ".xlsx":
+        markdown = convert_xlsx_streaming(src)
     else:
         # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
         mid = MarkItDown()