From 17c018e29fa9c1ce055503211f2b04a0031fd573 Mon Sep 17 00:00:00 2001 From: Lucas Ma <7184042+pony-maggie@users.noreply.github.com> Date: Mon, 15 Jun 2026 08:24:03 +0800 Subject: [PATCH] fix: normalize data URI parameter case --- packages/markitdown/src/markitdown/_uri_utils.py | 6 +++--- packages/markitdown/tests/test_module_misc.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py index 603da63e9..d07275ec0 100644 --- a/packages/markitdown/src/markitdown/_uri_utils.py +++ b/packages/markitdown/src/markitdown/_uri_utils.py @@ -29,7 +29,7 @@ def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]: is_base64 = False # Ends with base64? - if parts[-1] == "base64": + if parts[-1].lower() == "base64": parts.pop() is_base64 = True @@ -43,9 +43,9 @@ def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]: # Handle key=value pairs in the middle if "=" in part: key, value = part.split("=", 1) - attributes[key] = value + attributes[key.lower()] = value elif len(part) > 0: - attributes[part] = "" + attributes[part.lower()] = "" content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..0f77aeb8f 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -200,6 +200,13 @@ def test_data_uris() -> None: assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" + data_uri = "data:text/plain;CHARSET=utf-8;BASE64,SGVsbG8sIFdvcmxkIQ==" + mime_type, attributes, data = parse_data_uri(data_uri) + assert mime_type == "text/plain" + assert len(attributes) == 1 + assert attributes["charset"] == "utf-8" + assert data == b"Hello, World!" + data_uri = "data:,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type is None