Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class DocumentIntelligenceFileType(str, Enum):
"""Enum of file types supported by the Document Intelligence Converter."""

# No OCR
DOC = "doc"
DOCX = "docx"
PPTX = "pptx"
XLSX = "xlsx"
Expand All @@ -72,7 +73,9 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
"""Get the MIME type prefixes for the given file types."""
prefixes: List[str] = []
for type_ in types:
if type_ == DocumentIntelligenceFileType.DOCX:
if type_ == DocumentIntelligenceFileType.DOC:
prefixes.append("application/msword")
elif type_ == DocumentIntelligenceFileType.DOCX:
prefixes.append(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
Expand Down Expand Up @@ -105,7 +108,9 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
"""Get the file extensions for the given file types."""
extensions: List[str] = []
for type_ in types:
if type_ == DocumentIntelligenceFileType.DOCX:
if type_ == DocumentIntelligenceFileType.DOC:
extensions.append(".doc")
elif type_ == DocumentIntelligenceFileType.DOCX:
extensions.append(".docx")
elif type_ == DocumentIntelligenceFileType.PPTX:
extensions.append(".pptx")
Expand Down Expand Up @@ -137,6 +142,7 @@ def __init__(
api_version: str = "2024-07-31-preview",
credential: AzureKeyCredential | TokenCredential | None = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOC,
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
Expand Down Expand Up @@ -215,6 +221,7 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:

# Types that don't support ocr
no_ocr_types = [
DocumentIntelligenceFileType.DOC,
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
Expand Down
128 changes: 116 additions & 12 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import re
import sys
from typing import BinaryIO, Any
from io import BytesIO
from typing import Any, BinaryIO

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand All @@ -10,7 +13,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -32,6 +35,109 @@
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]

# Pattern intended to recognise currency-style Excel number formats
# (e.g., "$"#,##0.00, €#,##0.00, £$#,##0.00).
# The alternation matches any of:
#   * a currency symbol wrapped in single or double quotes (captured in group 1),
#   * a currency symbol immediately followed by a digit (captured in group 2),
#   * a bare "#" or "0" placeholder character.
# NOTE(review): the last two alternatives ("#" and "0") match placeholders that
# appear in almost every numeric format, not only currency ones, so a successful
# search does not by itself imply a currency format — confirm this is intended.
CURRENCY_FORMAT_PATTERN = re.compile(r'["\']([$€£¥₹])["\']|([$€£¥₹])\d|#|0')


def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
    """
    Render a cell's value as text, preserving currency and other number formats.

    Args:
        cell: An object exposing ``value`` and ``number_format`` attributes
            (an openpyxl cell in normal use).

    Returns:
        ``""`` for an empty cell; ``str(value)`` for non-numeric values
        (including booleans); otherwise the number rendered according to the
        first rule that applies:

        1. currency symbol found in the number format -> symbol + grouped number
        2. ``%`` in the number format -> value * 100 with two decimals and ``%``
        3. explicit decimal placeholders (``.00`` / ``.##``) on a float -> that
           many decimals, with thousands separators
        4. default -> thousands separators (two decimals for floats)
    """
    value = cell.value
    if value is None:
        return ""

    # bool is a subclass of int; without this guard TRUE/FALSE cells would be
    # rendered as "1"/"0" by the numeric formatting below.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return str(value)

    number_format = cell.number_format or ""

    # Currency: either a quoted symbol ("$") or a bare symbol directly followed
    # by a digit / "#" placeholder. The regex only matches when a currency
    # symbol is present, so no separate substring pre-check is needed.
    currency_match = re.search(
        r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=\d|#)', number_format
    )
    if currency_match:
        currency_symbol = currency_match.group(1) or currency_match.group(2)
        if isinstance(value, float):
            return f"{currency_symbol}{value:,.2f}"
        return f"{currency_symbol}{value:,}"

    # Percentage formats store the fraction (0.25), displayed as 25.00%.
    if "%" in number_format:
        return f"{value * 100:.2f}%"

    # Explicit decimal placeholders such as "0.000" or "#,##0.##" fix the
    # number of decimals shown for floats.
    decimal_match = re.search(r'\.(0+|#+)', number_format)
    if decimal_match and isinstance(value, float):
        decimal_places = len(decimal_match.group(1))
        return f"{value:,.{decimal_places}f}"

    # Default number formatting with thousand separators.
    # NOTE(review): floats without an explicit decimal format are rounded to
    # two decimals here — confirm that this precision loss is acceptable.
    if isinstance(value, float):
        return f"{value:,.2f}"
    return f"{value:,}"


def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
    """
    Convert an openpyxl worksheet to a Markdown table, preserving number formats.

    The first row is used as the table header and rendered with ``str()``;
    numeric cells in the remaining rows are rendered via
    ``_format_cell_value`` so currency / percentage / decimal formats survive.

    Args:
        ws: Worksheet-like object whose ``iter_rows()`` yields rows of cells
            exposing ``value`` and ``number_format``.

    Returns:
        The Markdown table as a single string (no trailing newline), or ``""``
        when the sheet yields no rows.
    """

    def _escape(text: str) -> str:
        # A literal "|" or a line break inside a cell would split the cell or
        # the row and corrupt the table, so neutralise both.
        return (
            text.replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    def _render(cell: Any, is_header: bool) -> str:
        value = cell.value
        if value is None:
            return ""
        if not is_header and isinstance(value, (int, float)):
            return _escape(_format_cell_value(cell))
        return _escape(str(value))

    # Single pass over cell objects: gives access to both the value and the
    # number format without re-reading each row by index.
    rows = list(ws.iter_rows())
    if not rows:
        return ""

    # Pad every row to the widest one so the table stays rectangular.
    width = max(len(row) for row in rows)

    def _line(row: Any, is_header: bool) -> str:
        rendered = [_render(cell, is_header) for cell in row]
        rendered += [""] * (width - len(rendered))
        return "| " + " | ".join(rendered) + " |"

    lines = [
        _line(rows[0], True),
        "| " + " | ".join(["---"] * width) + " |",
    ]
    lines.extend(_line(row, False) for row in rows[1:])

    return "\n".join(lines)


class XlsxConverter(DocumentConverter):
"""
Expand Down Expand Up @@ -80,17 +186,15 @@ def convert(
_xlsx_dependency_exc_info[2]
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
# Read the Excel file using openpyxl to preserve number formats
file_stream.seek(0)
wb = openpyxl.load_workbook(file_stream, data_only=True)

md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
md_content += f"## {sheet_name}\n"
md_content += _convert_sheet_to_markdown(ws) + "\n\n"

return DocumentConverterResult(markdown=md_content.strip())

Expand Down