From 4d73cd281ca3b6e695265523ea72ebbb55c9c0cc Mon Sep 17 00:00:00 2001 From: Jah-yee Date: Mon, 13 Apr 2026 23:26:21 +0800 Subject: [PATCH 1/2] fix: support YouTube short URLs (youtu.be) in YouTubeConverter - Extend accepts() to recognize youtu.be and youtube.com/shorts URLs - Extract video ID from various YouTube URL formats in convert() - Fixes #1730 --- .../converters/_youtube_converter.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index c96e8f4f6..6dd010b05 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -53,7 +53,15 @@ def accepts( url = unquote(url) url = url.replace(r"\?", "?").replace(r"\=", "=") - if not url.startswith("https://www.youtube.com/watch?"): + # Support multiple YouTube URL formats: + # - https://www.youtube.com/watch?v=... + # - https://youtu.be/... (short URL) + # - https://www.youtube.com/shorts/... + if not ( + url.startswith("https://www.youtube.com/watch?") + or url.startswith("https://youtu.be/") + or url.startswith("https://www.youtube.com/shorts/") + ): # Not a YouTube URL return False @@ -148,9 +156,25 @@ def convert( ytt_api = YouTubeTranscriptApi() transcript_text = "" parsed_url = urlparse(stream_info.url) # type: ignore - params = parse_qs(parsed_url.query) # type: ignore + # Extract video ID from various YouTube URL formats: + # - https://www.youtube.com/watch?v=VIDEO_ID + # - https://youtu.be/VIDEO_ID + # - https://www.youtube.com/shorts/VIDEO_ID + video_id = None + params = parse_qs(parsed_url.query) if "v" in params and params["v"][0]: video_id = str(params["v"][0]) + elif parsed_url.path.startswith("/watch"): + params = parse_qs(parsed_url.query) + if "v" in params and params["v"][0]: + video_id = str(params["v"][0]) + elif parsed_url.path.startswith("/shorts/"): + video_id = parsed_url.path.split("/shorts/")[1].split("/")[0] + elif parsed_url.path.startswith("/"): + # Handle youtu.be/VIDEO_ID short URLs + video_id = parsed_url.path.lstrip("/") + + if video_id: transcript_list = ytt_api.list(video_id) languages = ["en"] for transcript in transcript_list: From 80d95acadfb5e52d2dac8e240718114d0b15f2d1 Mon Sep 17 00:00:00 2001 From: Jah-yee Date: Tue, 14 Apr 2026 00:08:03 +0800 Subject: [PATCH 2/2] fix: decode percent-encoded URI path before url2pathname Fix file_uri_to_path() to properly handle file URIs with percent-encoded non-ASCII characters (e.g., Korean filenames). Before the fix, parsed.path was passed directly to url2pathname() without decoding, causing MCP to fail when opening files with Unicode names. Now we call unquote(parsed.path) first to restore the URI to its original Unicode form before converting to a local OS path. --- packages/markitdown/src/markitdown/_uri_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py index 603da63e9..9257b4f3a 100644 --- a/packages/markitdown/src/markitdown/_uri_utils.py +++ b/packages/markitdown/src/markitdown/_uri_utils.py @@ -2,7 +2,7 @@ import os from typing import Tuple, Dict from urllib.request import url2pathname -from urllib.parse import urlparse, unquote_to_bytes +from urllib.parse import urlparse, unquote, unquote_to_bytes def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: @@ -12,7 +12,8 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: raise ValueError(f"Not a file URL: {file_uri}") netloc = parsed.netloc if parsed.netloc else None - path = os.path.abspath(url2pathname(parsed.path)) + decoded_path = unquote(parsed.path) + path = os.path.abspath(url2pathname(decoded_path)) return netloc, path