Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions packages/markitdown/src/markitdown/_uri_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes
from urllib.parse import urlparse, unquote, unquote_to_bytes


def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
Expand All @@ -12,7 +12,8 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
raise ValueError(f"Not a file URL: {file_uri}")

netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
decoded_path = unquote(parsed.path)
path = os.path.abspath(url2pathname(decoded_path))
return netloc, path


Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -53,7 +53,15 @@ def accepts(
url = unquote(url)
url = url.replace(r"\?", "?").replace(r"\=", "=")

if not url.startswith("https://www.youtube.com/watch?"):
# Support multiple YouTube URL formats:
# - https://www.youtube.com/watch?v=...
# - https://youtu.be/... (short URL)
# - https://www.youtube.com/shorts/...
if not (
url.startswith("https://www.youtube.com/watch?")
or url.startswith("https://youtu.be/")
or url.startswith("https://www.youtube.com/shorts/")
):
# Not a YouTube URL
return False

Expand Down Expand Up @@ -148,9 +156,25 @@ def convert(
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
# Extract video ID from various YouTube URL formats:
# - https://www.youtube.com/watch?v=VIDEO_ID
# - https://youtu.be/VIDEO_ID
# - https://www.youtube.com/shorts/VIDEO_ID
video_id = None
params = parse_qs(parsed_url.query)
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
elif parsed_url.path.startswith("/watch"):
params = parse_qs(parsed_url.query)
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
elif parsed_url.path.startswith("/shorts/"):
video_id = parsed_url.path.split("/shorts/")[1].split("/")[0]
elif parsed_url.path.startswith("/"):
# Handle youtu.be/VIDEO_ID short URLs
video_id = parsed_url.path.lstrip("/")

if video_id:
transcript_list = ytt_api.list(video_id)
languages = ["en"]
for transcript in transcript_list:
Expand Down