diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py index 603da63e9..9257b4f3a 100644 --- a/packages/markitdown/src/markitdown/_uri_utils.py +++ b/packages/markitdown/src/markitdown/_uri_utils.py @@ -2,7 +2,7 @@ import os from typing import Tuple, Dict from urllib.request import url2pathname -from urllib.parse import urlparse, unquote_to_bytes +from urllib.parse import urlparse, unquote, unquote_to_bytes def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: @@ -12,7 +12,8 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: raise ValueError(f"Not a file URL: {file_uri}") netloc = parsed.netloc if parsed.netloc else None - path = os.path.abspath(url2pathname(parsed.path)) + decoded_path = unquote(parsed.path) + path = os.path.abspath(url2pathname(decoded_path)) return netloc, path diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index c96e8f4f6..6dd010b05 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -53,7 +53,15 @@ def accepts( url = unquote(url) url = url.replace(r"\?", "?").replace(r"\=", "=") - if not url.startswith("https://www.youtube.com/watch?"): + # Support multiple YouTube URL formats: + # - https://www.youtube.com/watch?v=... + # - https://youtu.be/... (short URL) + # - https://www.youtube.com/shorts/... + if not ( + url.startswith("https://www.youtube.com/watch?") + or url.startswith("https://youtu.be/") + or url.startswith("https://www.youtube.com/shorts/") + ): # Not a YouTube URL return False @@ -148,9 +156,25 @@ def convert( ytt_api = YouTubeTranscriptApi() transcript_text = "" parsed_url = urlparse(stream_info.url) # type: ignore - params = parse_qs(parsed_url.query) # type: ignore + # Extract video ID from various YouTube URL formats: + # - https://www.youtube.com/watch?v=VIDEO_ID + # - https://youtu.be/VIDEO_ID + # - https://www.youtube.com/shorts/VIDEO_ID + video_id = None + params = parse_qs(parsed_url.query) if "v" in params and params["v"][0]: video_id = str(params["v"][0]) + elif parsed_url.path.startswith("/watch"): + params = parse_qs(parsed_url.query) + if "v" in params and params["v"][0]: + video_id = str(params["v"][0]) + elif parsed_url.path.startswith("/shorts/"): + video_id = parsed_url.path.split("/shorts/")[1].split("/")[0] + elif parsed_url.path.startswith("/"): + # Handle youtu.be/VIDEO_ID short URLs + video_id = parsed_url.path.lstrip("/") + + if video_id: transcript_list = ytt_api.list(video_id) languages = ["en"] for transcript in transcript_list: