Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def accepts(
url = unquote(url)
url = url.replace(r"\?", "?").replace(r"\=", "=")

if not url.startswith("https://www.youtube.com/watch?"):
# Not a YouTube URL
if self._extract_video_id(url) is None:
# Not a supported YouTube URL
return False

if extension in ACCEPTED_FILE_EXTENSIONS:
Expand Down Expand Up @@ -147,10 +147,8 @@ def convert(
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
video_id = self._extract_video_id(stream_info.url or "")
if video_id:
transcript_list = ytt_api.list(video_id)
languages = ["en"]
for transcript in transcript_list:
Expand Down Expand Up @@ -196,6 +194,25 @@ def convert(
title=title,
)

def _extract_video_id(self, url: str) -> Union[str, None]:
parsed_url = urlparse(url)
host = parsed_url.netloc.lower()
path = parsed_url.path.strip("/")

if host in {"youtu.be", "www.youtu.be"}:
return path.split("/", 1)[0] if path else None

if host in {"youtube.com", "www.youtube.com", "m.youtube.com"}:
if path == "watch":
params = parse_qs(parsed_url.query)
video_id = params.get("v", [None])[0]
return str(video_id) if video_id else None
if path.startswith("shorts/") or path.startswith("embed/"):
video_id = path.split("/", 1)[1]
return video_id.split("/", 1)[0] if video_id else None

return None

def _get(
self,
metadata: Dict[str, str],
Expand Down
35 changes: 35 additions & 0 deletions packages/markitdown/tests/test_youtube_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import io

from markitdown import StreamInfo
from markitdown.converters._youtube_converter import YouTubeConverter


def _stream_info(url: str) -> StreamInfo:
    """Build the HTML ``StreamInfo`` used by these tests for *url*."""
    html_page = StreamInfo(
        url=url,
        mimetype="text/html",
        extension=".html",
    )
    return html_page


def test_accepts_youtube_short_urls() -> None:
    """The converter accepts youtu.be, shorts and watch URLs."""
    supported_urls = (
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/shorts/dQw4w9WgXcQ",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    )
    converter = YouTubeConverter()

    for candidate in supported_urls:
        # A fresh empty stream per call, as each URL is checked independently.
        empty_stream = io.BytesIO(b"")
        assert converter.accepts(empty_stream, _stream_info(candidate))


def test_extract_video_id_from_supported_youtube_urls() -> None:
    """Video IDs are extracted from YouTube URLs and refused for other hosts."""
    expected_ids = {
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ": "dQw4w9WgXcQ",
        "https://youtu.be/dQw4w9WgXcQ?t=42": "dQw4w9WgXcQ",
        "https://www.youtube.com/shorts/dQw4w9WgXcQ": "dQw4w9WgXcQ",
    }
    converter = YouTubeConverter()

    for candidate, wanted in expected_ids.items():
        assert converter._extract_video_id(candidate) == wanted

    # A watch-style URL on a non-YouTube host must not yield an ID.
    foreign_url = "https://example.com/watch?v=dQw4w9WgXcQ"
    assert converter._extract_video_id(foreign_url) is None