Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ conda activate markitdown
To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:

```bash
git clone git@github.com:microsoft/markitdown.git
git clone https://github.com/microsoft/markitdown.git
cd markitdown
pip install -e 'packages/markitdown[all]'
```
Expand Down
128 changes: 116 additions & 12 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import re
import sys
from typing import BinaryIO, Any
from io import BytesIO
from typing import Any, BinaryIO

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand All @@ -10,7 +13,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -32,6 +35,109 @@
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]

# Pattern intended to recognise currency number formats: a quoted currency
# symbol (e.g. "$"#,##0.00) or a bare symbol followed by a digit (e.g. $0.00).
# NOTE(review): the trailing `#` and `0` alternatives also match any format
# that merely contains a digit placeholder (e.g. "0.00" or "#,##0"), so a
# match does not by itself prove the format is a currency format. Confirm
# the intent before relying on this pattern as a currency test.
CURRENCY_FORMAT_PATTERN = re.compile(r'["\']([$€£¥₹])["\']|([$€£¥₹])\d|#|0')


def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
    """
    Render a cell's value as text, honouring its Excel number format.

    Only ``cell.value`` and ``cell.number_format`` are read.

    Rules, applied in order:

    * ``None`` becomes the empty string.
    * Booleans and every non-numeric value are rendered with ``str()``.
    * Currency formats (a quoted symbol such as ``"$"`` or a bare symbol
      directly followed by a digit placeholder) are rendered as
      ``<symbol><number>`` with thousands separators. The number of decimals
      comes from the format when it specifies one; otherwise floats get two
      decimals and integers none.
    * Percentage formats multiply the value by 100 and use the number of
      decimals the format specifies (none for ``0%``).
    * Formats with an explicit decimal part (``0.00``, ``#,##0.0``) use that
      many decimals for integers and floats alike.
    * Anything else (e.g. ``General``) gets thousands separators and keeps up
      to 15 significant digits, so small or precise values are not truncated.

    Returns:
        The formatted text for the cell.
    """
    value = cell.value
    if value is None:
        return ""

    # bool is a subclass of int; without this guard True/False would be
    # rendered as the numbers 1/0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return str(value)

    number_format = cell.number_format or "General"

    # Number of decimal placeholders after the decimal point, if any.
    decimal_match = re.search(r"\.(0+|#+)", number_format)
    decimal_places = len(decimal_match.group(1)) if decimal_match else None

    # Currency: quoted symbol ("$") or bare symbol followed by a placeholder.
    currency_match = re.search(
        r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=\d|#)', number_format
    )
    if currency_match:
        currency_symbol = currency_match.group(1) or currency_match.group(2)
        if decimal_places is not None:
            return f"{currency_symbol}{value:,.{decimal_places}f}"
        if isinstance(value, float):
            return f"{currency_symbol}{value:,.2f}"
        return f"{currency_symbol}{value:,}"

    # Percentage: Excel stores the fraction, so scale by 100 for display.
    if "%" in number_format:
        return f"{value * 100:.{decimal_places or 0}f}%"

    # Explicit decimal places apply to integers as well, so a column
    # formatted "0.00" renders 5 and 5.5 consistently as 5.00 and 5.50.
    if decimal_places is not None:
        return f"{value:,.{decimal_places}f}"

    if isinstance(value, int):
        return f"{value:,}"

    # No decimal spec: keep the value's precision (up to 15 significant
    # digits) instead of silently truncating it to two decimals.
    return f"{value:,.15g}"


def _escape_table_cell(text: str) -> str:
    """Make *text* safe to place inside a single Markdown table cell."""
    # A raw "|" would start a new column and a raw line break would end the
    # table row, so both are neutralised.
    text = text.replace("|", "\\|")
    return text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")


def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
    """
    Convert an openpyxl worksheet to a Markdown table, preserving number formats.

    The first row yielded by ``ws.iter_rows()`` becomes the table header and
    is rendered with ``str()``. In every following row, numeric cells are
    rendered through ``_format_cell_value`` so their number format is
    honoured, other values are rendered with ``str()``, and empty cells
    become empty strings. Pipe characters and line breaks inside cell text
    are escaped so they cannot break the table layout.

    Returns:
        The Markdown table, or an empty string when the sheet yields no rows.
    """
    # Walk the sheet once, keeping the cell objects so number formats are
    # available without a second per-row lookup.
    row_iter = iter(ws.iter_rows())
    header_cells = next(row_iter, None)
    if header_cells is None:
        return ""

    def render(values):
        return "| " + " | ".join(values) + " |"

    header = [
        _escape_table_cell(str(cell.value)) if cell.value is not None else ""
        for cell in header_cells
    ]
    lines = [render(header), render(["---"] * len(header))]

    for row_cells in row_iter:
        rendered = []
        for cell in row_cells:
            if cell.value is None:
                rendered.append("")
            elif isinstance(cell.value, (int, float)):
                rendered.append(_escape_table_cell(_format_cell_value(cell)))
            else:
                rendered.append(_escape_table_cell(str(cell.value)))
        lines.append(render(rendered))

    return "\n".join(lines)


class XlsxConverter(DocumentConverter):
"""
Expand Down Expand Up @@ -80,17 +186,15 @@ def convert(
_xlsx_dependency_exc_info[2]
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
# Read the Excel file using openpyxl to preserve number formats
file_stream.seek(0)
wb = openpyxl.load_workbook(file_stream, data_only=True)

md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
md_content += f"## {sheet_name}\n"
md_content += _convert_sheet_to_markdown(ws) + "\n\n"

return DocumentConverterResult(markdown=md_content.strip())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ def accepts(
url = unquote(url)
url = url.replace(r"\?", "?").replace(r"\=", "=")

if not url.startswith("https://www.youtube.com/watch?"):
# Accept standard watch URLs, short URLs, and youtu.be short URLs
if not (
url.startswith("https://www.youtube.com/watch?")
or url.startswith("https://www.youtube.com/shorts/")
or url.startswith("https://youtu.be/")
):
# Not a YouTube URL
return False

Expand Down Expand Up @@ -95,6 +100,9 @@ def convert(
metadata[key] = content
break

# Extract video ID from various YouTube URL formats
video_id = self._extract_video_id(stream_info.url) # type: ignore

# Try reading the description
try:
for script in soup(["script"]):
Expand Down Expand Up @@ -147,10 +155,12 @@ def convert(
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
if not video_id:
# Fallback to parsing from query string for standard URLs
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
transcript_list = ytt_api.list(video_id)
languages = ["en"]
for transcript in transcript_list:
Expand Down Expand Up @@ -223,6 +233,33 @@ def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json t
return result
return None

def _extract_video_id(self, url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised forms:

    * ``https://youtu.be/<id>`` - the ID is the first path segment.
    * ``https://www.youtube.com/shorts/<id>`` - the ID follows ``/shorts/``
      in the URL path.
    * ``https://www.youtube.com/watch?v=<id>`` - the ID is the ``v`` query
      parameter.

    Args:
        url: The URL to inspect. It may be percent-encoded or contain
            backslash-escaped ``?`` / ``=`` characters.

    Returns:
        The video ID, or ``None`` when *url* is empty or no ID is found.
    """
    if not url:
        return None

    # Normalise percent-encoding and backslash-escaped "?" / "=" first.
    url = unquote(url)
    url = url.replace(r"\?", "?").replace(r"\=", "=")
    parsed_url = urlparse(url)

    # `hostname` is lower-cased and excludes any port, unlike `netloc`.
    if parsed_url.hostname == "youtu.be":
        # Keep only the first path segment so "/<id>/extra" still yields <id>.
        first_segment = parsed_url.path.strip("/").split("/", 1)[0]
        if first_segment:
            return first_segment

    # Match against the path only, so a "/shorts/..." fragment inside the
    # query string of a watch URL cannot be mistaken for the video ID.
    shorts_match = re.search(r"/shorts/([a-zA-Z0-9_-]+)", parsed_url.path)
    if shorts_match:
        return shorts_match.group(1)

    video_ids = parse_qs(parsed_url.query).get("v")
    if video_ids:
        return str(video_ids[0])

    return None

def _retry_operation(self, operation, retries=3, delay=2):
"""Retries the operation if it fails."""
attempt = 0
Expand Down