microsoft · Jah-yee · Mar 11, 2026 · Apr 13, 2026 · Apr 14, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -56,6 +56,7 @@ class DocumentIntelligenceFileType(str, Enum):
     """Enum of file types supported by the Document Intelligence Converter."""
 
     # No OCR
+    DOC = "doc"
     DOCX = "docx"
     PPTX = "pptx"
     XLSX = "xlsx"
@@ -72,7 +73,9 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
     """Get the MIME type prefixes for the given file types."""
     prefixes: List[str] = []
     for type_ in types:
-        if type_ == DocumentIntelligenceFileType.DOCX:
+        if type_ == DocumentIntelligenceFileType.DOC:
+            prefixes.append("application/msword")
+        elif type_ == DocumentIntelligenceFileType.DOCX:
             prefixes.append(
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             )
@@ -105,7 +108,9 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
     """Get the file extensions for the given file types."""
     extensions: List[str] = []
     for type_ in types:
-        if type_ == DocumentIntelligenceFileType.DOCX:
+        if type_ == DocumentIntelligenceFileType.DOC:
+            extensions.append(".doc")
+        elif type_ == DocumentIntelligenceFileType.DOCX:
             extensions.append(".docx")
         elif type_ == DocumentIntelligenceFileType.PPTX:
             extensions.append(".pptx")
@@ -137,6 +142,7 @@ def __init__(
         api_version: str = "2024-07-31-preview",
         credential: AzureKeyCredential | TokenCredential | None = None,
         file_types: List[DocumentIntelligenceFileType] = [
+            DocumentIntelligenceFileType.DOC,
             DocumentIntelligenceFileType.DOCX,
             DocumentIntelligenceFileType.PPTX,
             DocumentIntelligenceFileType.XLSX,
@@ -215,6 +221,7 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
 
         # Types that don't support ocr
         no_ocr_types = [
+            DocumentIntelligenceFileType.DOC,
             DocumentIntelligenceFileType.DOCX,
             DocumentIntelligenceFileType.PPTX,
             DocumentIntelligenceFileType.XLSX,

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,5 +1,8 @@
+import re
 import sys
-from typing import BinaryIO, Any
+from io import BytesIO
+from typing import Any, BinaryIO
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -10,7 +13,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -32,6 +35,109 @@
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
+# Regex over Excel number-format strings. It matches any one of:
+#   - a currency symbol ($, €, £, ¥, ₹) wrapped in quotes, e.g. "$" (group 1)
+#   - a bare currency symbol directly followed by a digit (group 2)
+#   - a single "#" or "0" digit placeholder
+# NOTE(review): because of the "#"/"0" alternatives this also matches formats
+# that carry no currency symbol (e.g. #,##0.00), and nothing in the code shown
+# here references this constant — confirm whether it is still needed.
+CURRENCY_FORMAT_PATTERN = re.compile(r'["\']([$€£¥₹])["\']|([$€£¥₹])\d|#|0')
+
+
+def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
+    """
+    Render a cell's value as text, honouring the cell's Excel number format.
+
+    Only ``cell.value`` and ``cell.number_format`` are read.
+
+    Args:
+        cell: The worksheet cell to render.
+
+    Returns:
+        ``""`` for an empty cell. ``str(value)`` for booleans and non-numeric
+        values. For numbers, the value rendered with the decimal places,
+        thousands grouping, percent scaling and currency symbol found in the
+        first section of the number format; a format without digit
+        placeholders (for example ``General``) leaves the number as
+        ``str(value)`` so no precision is lost and no separators are invented.
+    """
+    value = cell.value
+    if value is None:
+        return ""
+
+    # bool is a subclass of int: without this guard TRUE/FALSE cells would be
+    # pushed through numeric formatting and come out as "1"/"0".
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return str(value)
+
+    number_format = cell.number_format
+    if not isinstance(number_format, str):
+        number_format = ""
+
+    # Drop quoted literals, [bracketed] codes and backslash-escaped characters,
+    # then keep the first section (the one Excel applies to positive numbers),
+    # so that only genuine layout characters are inspected below.
+    layout = re.sub(r'"[^"]*"|\[[^\]]*\]|\\.', "", number_format).split(";", 1)[0]
+
+    # A currency symbol counts when it is quoted ("$") or directly followed by
+    # a digit placeholder ($#,##0.00). Other spellings are not recognised and
+    # fall through to plain numeric rendering.
+    currency_symbol = ""
+    currency_match = re.search(
+        r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=\d|#)', number_format
+    )
+    if currency_match:
+        currency_symbol = currency_match.group(1) or currency_match.group(2)
+
+    # Currency takes precedence over percent, as in the previous logic.
+    is_percent = not currency_symbol and "%" in layout
+    if is_percent:
+        value = value * 100
+
+    if "0" in layout or "#" in layout:
+        # Take decimal places and grouping from the format instead of forcing
+        # two decimals and thousands separators onto every number.
+        decimal_match = re.search(r"\.(0+|#+)", layout)
+        decimal_places = len(decimal_match.group(1)) if decimal_match else 0
+        grouping = "," if "," in layout else ""
+        text = format(value, f"{grouping}.{decimal_places}f")
+    else:
+        text = str(value)
+
+    if is_percent:
+        text += "%"
+
+    if currency_symbol:
+        # Keep the minus sign in front of the symbol: -$5.00, not $-5.00.
+        if text.startswith("-"):
+            return f"-{currency_symbol}{text[1:]}"
+        return f"{currency_symbol}{text}"
+
+    return text
+
+
+def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
+    """
+    Convert an openpyxl worksheet to a Markdown table, preserving number formats.
+
+    The first row becomes the table header and is rendered with ``str``; every
+    later row is rendered cell by cell through ``_format_cell_value``.
+
+    Args:
+        ws: The worksheet to render.
+
+    Returns:
+        The Markdown table as a single string, or ``""`` when the worksheet
+        yields no rows.
+    """
+    # Iterate cell objects once: both the value and the number format are
+    # needed, so a separate values-only pass plus a per-row lookup is redundant.
+    rows = list(ws.iter_rows())
+    if not rows:
+        return ""
+
+    def _table_safe(text: str) -> str:
+        # A raw "|" would open a new column and a line break would end the row.
+        return re.sub(r"\r\n|\r|\n", "<br>", text.replace("|", "\\|"))
+
+    header = [
+        _table_safe("" if cell.value is None else str(cell.value))
+        for cell in rows[0]
+    ]
+    lines = [
+        "| " + " | ".join(header) + " |",
+        "| " + " | ".join(["---"] * len(header)) + " |",
+    ]
+
+    for row in rows[1:]:
+        # _format_cell_value already maps empty cells to "" and non-numeric
+        # values to str(value), so every cell can go through it.
+        rendered = [_table_safe(_format_cell_value(cell)) for cell in row]
+        lines.append("| " + " | ".join(rendered) + " |")
+
+    return "\n".join(lines)
+
 
 class XlsxConverter(DocumentConverter):
     """
@@ -80,17 +186,15 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Read the Excel file using openpyxl to preserve number formats
+        file_stream.seek(0)
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            md_content += f"## {sheet_name}\n"
+            md_content += _convert_sheet_to_markdown(ws) + "\n\n"
 
         return DocumentConverterResult(markdown=md_content.strip())