microsoft · LaplaceYoung · Apr 13, 2026
diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py
@@ -2,51 +2,52 @@
 import os
 from typing import Tuple, Dict
 from urllib.request import url2pathname
-from urllib.parse import urlparse, unquote_to_bytes
-
-
-def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
-    """Convert a file URI to a local file path"""
-    parsed = urlparse(file_uri)
+from urllib.parse import urlparse, unquote, unquote_to_bytes
+
+
+def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
+    """Convert a file URI to ``(netloc or None, absolute local path)``."""
+    parsed = urlparse(file_uri)
     if parsed.scheme != "file":
         raise ValueError(f"Not a file URL: {file_uri}")
 
     netloc = parsed.netloc if parsed.netloc else None
-    path = os.path.abspath(url2pathname(parsed.path))
+    # url2pathname() percent-decodes already; an extra unquote() would double-decode.
+    path = os.path.abspath(url2pathname(parsed.path))
     return netloc, path
-
-
-def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
-    if not uri.startswith("data:"):
-        raise ValueError("Not a data URI")
-
-    header, _, data = uri.partition(",")
-    if not _:
-        raise ValueError("Malformed data URI, missing ',' separator")
-
-    meta = header[5:]  # Strip 'data:'
-    parts = meta.split(";")
-
-    is_base64 = False
-    # Ends with base64?
-    if parts[-1] == "base64":
-        parts.pop()
-        is_base64 = True
-
-    mime_type = None  # Normally this would default to text/plain but we won't assume
-    if len(parts) and len(parts[0]) > 0:
-        # First part is the mime type
-        mime_type = parts.pop(0)
-
-    attributes: Dict[str, str] = {}
-    for part in parts:
-        # Handle key=value pairs in the middle
-        if "=" in part:
-            key, value = part.split("=", 1)
-            attributes[key] = value
-        elif len(part) > 0:
-            attributes[part] = ""
-
-    content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
-
-    return mime_type, attributes, content
+
+
+def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
+    """Split a data URI into (mime type, attributes, decoded payload bytes)."""
+    if not uri.startswith("data:"):
+        raise ValueError("Not a data URI")
+
+    header, separator, payload = uri.partition(",")
+    if not separator:
+        raise ValueError("Malformed data URI, missing ',' separator")
+
+    # Everything between 'data:' and the first ',' is ';'-separated metadata.
+    fields = header[len("data:") :].split(";")
+
+    # A trailing 'base64' token selects the payload encoding.
+    is_base64 = fields[-1] == "base64"
+    if is_base64:
+        fields.pop()
+
+    # A non-empty leading field is the mime type; no default is assumed.
+    mime_type = fields.pop(0) if fields and fields[0] else None
+
+    attributes: Dict[str, str] = {}
+    for field in fields:
+        if not field:
+            continue
+        # 'key=value' splits on the first '='; a bare token maps to "".
+        key, _, value = field.partition("=")
+        attributes[key] = value
+
+    if is_base64:
+        content = base64.b64decode(payload)
+    else:
+        content = unquote_to_bytes(payload)
+
+    return mime_type, attributes, content
diff --git a/packages/markitdown/tests/test_uri_utils.py b/packages/markitdown/tests/test_uri_utils.py
@@ -0,0 +1,13 @@
+from markitdown._uri_utils import file_uri_to_path
+
+
+def test_file_uri_to_path_decodes_percent_encoded_space() -> None:
+    """A %20 escape in the URI path must come back as a literal space."""
+    path = file_uri_to_path("file:///path/to/my%20file.txt")[1]
+    assert "my file.txt" in path and "%20" not in path
+
+
+def test_file_uri_to_path_decodes_percent_encoded_unicode() -> None:
+    """UTF-8 percent-escapes must decode to the original characters."""
+    path = file_uri_to_path("file:///path/to/%E6%B5%8B%E8%AF%95.txt")[1]
+    assert "测试.txt" in path and "%E6%B5%8B%E8%AF%95" not in path