microsoft · Jah-yee · Mar 11, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ conda activate markitdown
 To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
 
 ```bash
-git clone git@github.com:microsoft/markitdown.git
+git clone https://github.com/microsoft/markitdown.git
 cd markitdown
 pip install -e 'packages/markitdown[all]'
 ```

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,5 +1,8 @@
+import re
 import sys
-from typing import BinaryIO, Any
+from io import BytesIO
+from typing import Any, BinaryIO
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -10,7 +13,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -32,6 +35,109 @@
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
+# Currency symbol in an Excel number format: quoted ("$"#,##0.00), inside a
+# locale tag ([$€-407]#,##0.00), or bare before a placeholder (€#,##0.00).
+CURRENCY_FORMAT_PATTERN = re.compile(
+    r'["\']([$€£¥₹])["\']|\[\$([$€£¥₹])|([$€£¥₹])(?=[\d#])'
+)
+
+
+def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
+    """
+    Render a cell value as text, honouring simple Excel number formats.
+
+    Only the first (positive-number) section of ``cell.number_format`` is
+    interpreted. Decimal places (``0.00``), thousands grouping (``#,##0``),
+    percentages (``0%``) and the currency symbols ``$ € £ ¥ ₹`` are applied.
+    ``General``, text, scientific and fraction formats fall back to
+    ``str(cell.value)`` so that no precision is lost.
+
+    Args:
+        cell: Object exposing ``value`` and ``number_format`` attributes.
+
+    Returns:
+        The formatted text, or ``""`` when the cell is empty.
+    """
+    value = cell.value
+    if value is None:
+        return ""
+
+    # bool is a subclass of int; TRUE/FALSE must not be rendered as 1/0.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return str(value)
+
+    # Interpret only the positive-number section of the format code.
+    section = (cell.number_format or "General").split(";")[0]
+    # Drop [locale/colour] tags and quoted literals before looking for
+    # placeholders, so digits or slashes inside them are not misread.
+    layout = re.sub(r'\[[^\]]*\]|"[^"]*"', "", section)
+
+    # No digit placeholders, or a scientific/fraction layout that is not
+    # emulated here: keep the raw value rather than rounding it.
+    if not re.search(r"[0#]", layout) or re.search(r"[Ee][+-]|[?/]", layout):
+        return str(value)
+
+    # Count the placeholders after the decimal point, e.g. two for "0.00".
+    decimal_match = re.search(r"\.([0#]+)", layout)
+    decimals = len(decimal_match.group(1)) if decimal_match else 0
+    grouping = "," if "," in layout else ""
+
+    if "%" in layout:
+        # Percent cells store the fraction (0.125 for 12.5%), so scale first.
+        return f"{value * 100:{grouping}.{decimals}f}%"
+
+    # Format the magnitude and re-attach the sign afterwards so a negative
+    # amount reads "-$1,234.50" rather than "$-1,234.50".
+    text = f"{abs(value):{grouping}.{decimals}f}"
+    currency_match = CURRENCY_FORMAT_PATTERN.search(section)
+    if currency_match:
+        text = next(filter(None, currency_match.groups())) + text
+    return f"-{text}" if value < 0 else text
+
+
+def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
+    """
+    Convert an openpyxl worksheet to a Markdown table, preserving number formats.
+
+    The first row becomes the table header and is rendered with ``str``; every
+    other row is rendered with ``_format_cell_value``. Pipes and line breaks in
+    cell text are escaped so that they cannot break the table layout.
+
+    Args:
+        ws: Worksheet whose ``iter_rows()`` yields rows of cell objects.
+
+    Returns:
+        The Markdown table, or ``""`` for a worksheet without rows.
+    """
+
+    def escape(text: str) -> str:
+        # "|" would open a new column and a line break would end the row.
+        return "<br>".join(text.replace("|", "\\|").splitlines())
+
+    def render(cells: list) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    # One pass over the cell objects yields both the values and the formats,
+    # so rows never have to be looked up again by index.
+    rows = list(ws.iter_rows())
+    if not rows:
+        return ""
+
+    # Header cells keep their plain string form.
+    header = [
+        "" if cell.value is None else escape(str(cell.value))
+        for cell in rows[0]
+    ]
+    lines = [render(header), render(["---"] * len(header))]
+
+    # Data rows go through the number-format aware formatter.
+    lines.extend(
+        render([escape(_format_cell_value(cell)) for cell in row])
+        for row in rows[1:]
+    )
+
+    return "\n".join(lines)
+
 
 class XlsxConverter(DocumentConverter):
     """
@@ -80,17 +186,15 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Read the Excel file using openpyxl to preserve number formats
+        file_stream.seek(0)
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            md_content += f"## {sheet_name}\n"
+            md_content += _convert_sheet_to_markdown(ws) + "\n\n"
 
         return DocumentConverterResult(markdown=md_content.strip())
 

diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -53,7 +53,12 @@ def accepts(
         url = unquote(url)
         url = url.replace(r"\?", "?").replace(r"\=", "=")
 
-        if not url.startswith("https://www.youtube.com/watch?"):
+        # Accept standard watch URLs, Shorts URLs, and youtu.be short links
+        if not (
+            url.startswith("https://www.youtube.com/watch?")
+            or url.startswith("https://www.youtube.com/shorts/")
+            or url.startswith("https://youtu.be/")
+        ):
             # Not a YouTube URL
             return False
 
@@ -95,6 +100,9 @@ def convert(
                         metadata[key] = content
                     break
 
+        # Extract video ID from various YouTube URL formats
+        video_id = self._extract_video_id(stream_info.url)  # type: ignore
+
         # Try reading the description
         try:
             for script in soup(["script"]):
@@ -147,10 +155,12 @@ def convert(
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
             ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
-            parsed_url = urlparse(stream_info.url)  # type: ignore
-            params = parse_qs(parsed_url.query)  # type: ignore
-            if "v" in params and params["v"][0]:
-                video_id = str(params["v"][0])
+            if not video_id:
+                # Fallback to parsing from query string for standard URLs
+                params = parse_qs(urlparse(stream_info.url).query)  # type: ignore
+                if "v" in params and params["v"][0]:
+                    video_id = str(params["v"][0])
+            if video_id:  # the indented transcript code below runs only with an ID
                 transcript_list = ytt_api.list(video_id)
                 languages = ["en"]
                 for transcript in transcript_list:
@@ -223,6 +233,33 @@ def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json t
                     return result
         return None
 
+    def _extract_video_id(self, url: str) -> Union[str, None]:
+        """Extract the video ID from a watch, Shorts, or youtu.be URL, else None."""
+        if not url:
+            return None
+        url = unquote(url)
+        url = url.replace(r"\?", "?").replace(r"\=", "=")
+        parsed_url = urlparse(url)
+
+        # youtu.be short links: https://youtu.be/dQw4w9WgXcQ
+        # Only the first path segment is the ID; ignore anything after it.
+        if parsed_url.netloc == "youtu.be":
+            short_id = parsed_url.path.strip("/").split("/")[0]
+            if short_id:
+                return short_id
+
+        # Shorts URLs: https://www.youtube.com/shorts/dQw4w9WgXcQ
+        # Match on the path alone so query-string text cannot be mistaken for it.
+        match = re.match(r"/shorts/([a-zA-Z0-9_-]+)", parsed_url.path)
+        if match:
+            return match.group(1)
+
+        # Standard watch URLs: https://www.youtube.com/watch?v=dQw4w9WgXcQ
+        params = parse_qs(parsed_url.query)
+        if "v" in params and params["v"]:
+            return str(params["v"][0])
+        return None
+
     def _retry_operation(self, operation, retries=3, delay=2):
         """Retries the operation if it fails."""
         attempt = 0