From 0ee687706f12b7917d62058c64ef3dc7ee534eb6 Mon Sep 17 00:00:00 2001 From: martian7777 Date: Tue, 16 Jun 2026 13:29:11 +0500 Subject: [PATCH] feat: add XlsxConverter and XlsConverter for Excel file support and include corresponding tests --- .../markitdown/converters/_xlsx_converter.py | 26 ++++++++++++- packages/markitdown/tests/test_module_misc.py | 37 +++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 4186ec773..762ac55e2 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -83,8 +83,19 @@ def convert( sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: + df = sheets[s] + df = df.dropna(how="all", axis=0).dropna(how="all", axis=1) + if df.empty: + continue + + # Rename Unnamed: columns to empty strings + df.columns = [ + "" if str(col).startswith("Unnamed:") else str(col) + for col in df.columns + ] + md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + html_content = df.to_html(index=False, na_rep="") md_content += ( self._html_converter.convert_string( html_content, **kwargs @@ -145,8 +156,19 @@ def convert( sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: + df = sheets[s] + df = df.dropna(how="all", axis=0).dropna(how="all", axis=1) + if df.empty: + continue + + # Rename Unnamed: columns to empty strings + df.columns = [ + "" if str(col).startswith("Unnamed:") else str(col) + for col in df.columns + ] + md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + html_content = df.to_html(index=False, na_rep="") md_content += ( self._html_converter.convert_string( html_content, **kwargs diff --git a/packages/markitdown/tests/test_module_misc.py 
b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..63c91975d 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -532,6 +532,41 @@ def test_markitdown_llm() -> None:
     validate_strings(result, PPTX_TEST_STRINGS)
 
 
+def test_xlsx_clean_conversion() -> None:
+    """Verify XLSX cleanup of blank rows/columns, NaN cells and Unnamed headers.
+
+    The workbook has a title in A1, nothing on row 2, headers on row 3 with
+    column B left blank, and a single data row on row 4.
+    """
+    import openpyxl
+    import tempfile
+
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws["A1"] = "PROGRESS"
+    ws["A3"], ws["C3"], ws["D3"] = "Task", "Owner", "Status"
+    ws["A4"], ws["C4"], ws["D4"] = "Design", "Ana", "Done"
+
+    expected_md = (
+        "## Sheet\n"
+        "| PROGRESS | | |\n"
+        "| --- | --- | --- |\n"
+        "| Task | Owner | Status |\n"
+        "| Design | Ana | Done |"
+    )
+
+    # Use a private temporary directory instead of a fixed file name in the
+    # shared temp dir: concurrent or repeated runs cannot collide, and the
+    # directory is removed even when saving or converting raises.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        p = os.path.join(temp_dir, "repro_test.xlsx")
+        wb.save(p)
+        markitdown = MarkItDown()
+        result = markitdown.convert(p)
+
+    assert result.markdown.strip() == expected_md
+
+
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     for test in [
@@ -547,8 +582,10 @@ def test_markitdown_llm() -> None:
         test_markitdown_exiftool,
         test_markitdown_llm_parameters,
         test_markitdown_llm,
+        test_xlsx_clean_conversion,
     ]:
         print(f"Running {test.__name__}...", end="")
         test()
         print("OK")
     print("All tests passed!")
+