Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 44 additions & 43 deletions packages/markitdown/src/markitdown/_uri_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,52 @@
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes


def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
"""Convert a file URI to a local file path"""
parsed = urlparse(file_uri)
from urllib.parse import urlparse, unquote, unquote_to_bytes
def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
"""Convert a file URI to a local file path"""
parsed = urlparse(file_uri)
if parsed.scheme != "file":
raise ValueError(f"Not a file URL: {file_uri}")

netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
decoded_path = unquote(parsed.path)
path = os.path.abspath(url2pathname(decoded_path))
return netloc, path


def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
if not uri.startswith("data:"):
raise ValueError("Not a data URI")

header, _, data = uri.partition(",")
if not _:
raise ValueError("Malformed data URI, missing ',' separator")

meta = header[5:] # Strip 'data:'
parts = meta.split(";")

is_base64 = False
# Ends with base64?
if parts[-1] == "base64":
parts.pop()
is_base64 = True

mime_type = None # Normally this would default to text/plain but we won't assume
if len(parts) and len(parts[0]) > 0:
# First part is the mime type
mime_type = parts.pop(0)

attributes: Dict[str, str] = {}
for part in parts:
# Handle key=value pairs in the middle
if "=" in part:
key, value = part.split("=", 1)
attributes[key] = value
elif len(part) > 0:
attributes[part] = ""

content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)

return mime_type, attributes, content
def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
if not uri.startswith("data:"):
raise ValueError("Not a data URI")
header, _, data = uri.partition(",")
if not _:
raise ValueError("Malformed data URI, missing ',' separator")
meta = header[5:] # Strip 'data:'
parts = meta.split(";")
is_base64 = False
# Ends with base64?
if parts[-1] == "base64":
parts.pop()
is_base64 = True
mime_type = None # Normally this would default to text/plain but we won't assume
if len(parts) and len(parts[0]) > 0:
# First part is the mime type
mime_type = parts.pop(0)
attributes: Dict[str, str] = {}
for part in parts:
# Handle key=value pairs in the middle
if "=" in part:
key, value = part.split("=", 1)
attributes[key] = value
elif len(part) > 0:
attributes[part] = ""
content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
return mime_type, attributes, content
13 changes: 13 additions & 0 deletions packages/markitdown/tests/test_uri_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from markitdown._uri_utils import file_uri_to_path


def test_file_uri_to_path_decodes_percent_encoded_space() -> None:
    """A ``%20`` escape in a file URI comes back as a literal space."""
    uri = "file:///path/to/my%20file.txt"
    _netloc, local_path = file_uri_to_path(uri)
    assert "my file.txt" in local_path
    assert "%20" not in local_path


def test_file_uri_to_path_decodes_percent_encoded_unicode() -> None:
    """Percent-encoded UTF-8 bytes in a file URI decode to their characters."""
    uri = "file:///path/to/%E6%B5%8B%E8%AF%95.txt"
    _netloc, local_path = file_uri_to_path(uri)
    assert "测试.txt" in local_path
    assert "%E6%B5%8B%E8%AF%95" not in local_path