Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ conda activate markitdown
To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from source:

```bash
git clone git@github.com:microsoft/markitdown.git
git clone https://github.com/microsoft/markitdown.git
cd markitdown
pip install -e 'packages/markitdown[all]'
pip install -e packages/markitdown
```

## Usage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ def accepts(
url = unquote(url)
url = url.replace(r"\?", "?").replace(r"\=", "=")

if not url.startswith("https://www.youtube.com/watch?"):
# Support full URLs, short URLs (youtu.be), and shorts (/shorts/)
if not (
url.startswith("https://www.youtube.com/watch?")
or url.startswith("https://youtu.be/")
or "/shorts/" in url
):
# Not a YouTube URL
return False

Expand Down Expand Up @@ -147,10 +152,25 @@ def convert(
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
# Extract video ID from URL - support multiple formats
# 1. youtube.com/watch?v=VIDEO_ID
# 2. youtu.be/VIDEO_ID
# 3. youtube.com/shorts/VIDEO_ID
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
video_id = None

# Try standard ?v= query param first
params = parse_qs(parsed_url.query)
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
# Try youtu.be/VIDEO_ID format
elif parsed_url.path and "/youtu.be/" in stream_info.url: # type: ignore
video_id = parsed_url.path.split("/")[-1]
# Try youtube.com/shorts/VIDEO_ID format
elif parsed_url.path and "/shorts/" in stream_info.url: # type: ignore
video_id = parsed_url.path.split("/")[-1]

if video_id:
transcript_list = ytt_api.list(video_id)
languages = ["en"]
for transcript in transcript_list:
Expand Down