diff --git a/README.md b/README.md
index 6da3ee1d9..3a6b6a67c 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ conda activate markitdown
 To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
 
 ```bash
-git clone git@github.com:microsoft/markitdown.git
+git clone https://github.com/microsoft/markitdown.git
 cd markitdown
 pip install -e 'packages/markitdown[all]'
 ```
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..5b89185ec 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,5 +1,8 @@
+import re
 import sys
-from typing import BinaryIO, Any
+from io import BytesIO
+from typing import Any, BinaryIO
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -10,7 +13,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -32,6 +35,110 @@
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
+# Matches the currency symbol of an Excel number format, whether quoted
+# ("$"#,##0.00), inside a locale tag ([$€-407]) or bare ($#,##0.00). The
+# lookbehind keeps the "$" of a symbol-less locale tag such as [$-409] from
+# being mistaken for a dollar sign.
+CURRENCY_FORMAT_PATTERN = re.compile(
+    r'["\']([$€£¥₹])["\']|\[\$([$€£¥₹])[^\]]*\]|(?<!\[)([$€£¥₹])'
+)
+
+# Bracketed tags ([Red], [$-409], [>100]) and quoted literals carry no digit
+# placeholders, but may contain characters that look like them.
+_FORMAT_NOISE_PATTERN = re.compile(r'\[[^\]]*\]|"[^"]*"')
+
+# Digit placeholders after the decimal point, e.g. the "00" in "#,##0.00".
+_FORMAT_DECIMALS_PATTERN = re.compile(r"\.([0#]+)")
+
+# Fraction ("# ?/?") and scientific ("0.00E+00") layouts are not rendered here.
+_FORMAT_UNSUPPORTED_PATTERN = re.compile(r"/|[eE][+-]")
+
+
+def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
+    """
+    Render a cell value as text, honouring the cell's Excel number format.
+
+    Currency symbols, percent signs, thousands separators and the number of
+    decimal places are taken from the positive-number section of
+    ``cell.number_format``. Numbers in a "General", text, fraction or
+    scientific format are returned as ``str(value)``, so nothing is rounded
+    or regrouped unless the workbook asked for it.
+
+    Args:
+        cell: Any object exposing ``value`` and ``number_format`` attributes.
+
+    Returns:
+        The display text, or an empty string for an empty cell.
+    """
+    value = cell.value
+    if value is None:
+        return ""
+
+    # bool is a subclass of int: keep TRUE/FALSE cells out of the numeric path
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return str(value)
+
+    # Sections are "positive;negative;zero;text"; the sign is added below
+    section = (cell.number_format or "General").split(";")[0]
+    layout = _FORMAT_NOISE_PATTERN.sub("", section)
+    has_digits = "0" in layout or "#" in layout
+    if not has_digits or _FORMAT_UNSUPPORTED_PATTERN.search(layout):
+        return str(value)
+
+    currency_match = CURRENCY_FORMAT_PATTERN.search(section)
+    symbol = next(filter(None, currency_match.groups())) if currency_match else ""
+    is_percent = "%" in layout
+    grouping = "," if "," in layout else ""
+    decimals_match = _FORMAT_DECIMALS_PATTERN.search(layout)
+    decimals = len(decimals_match.group(1)) if decimals_match else 0
+
+    magnitude = abs(value) * 100 if is_percent else abs(value)
+    text = f"{magnitude:{grouping}.{decimals}f}"
+    if is_percent:
+        text += "%"
+    # Put the minus sign ahead of the currency symbol: -$5.00, not $-5.00
+    sign = "-" if value < 0 else ""
+    return f"{sign}{symbol}{text}"
+
+
+def _escape_table_cell(text: str) -> str:
+    """Make *text* safe to place inside a single Markdown table cell."""
+    # A raw "|" would start a new column and a line break would end the row
+    return " ".join(text.splitlines()).replace("|", "\\|")
+
+
+def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
+    """
+    Convert an openpyxl worksheet to a Markdown table, preserving number formats.
+
+    The first row becomes the table header. Every cell, header included, is
+    rendered with ``_format_cell_value`` and escaped for use in a table.
+
+    Args:
+        ws: Worksheet whose ``iter_rows()`` yields rows of cell objects.
+
+    Returns:
+        The Markdown table, or an empty string when the sheet has no rows.
+    """
+    # Walk the cell objects once; values_only=True would drop number_format
+    table = [
+        [_escape_table_cell(_format_cell_value(cell)) for cell in row]
+        for row in ws.iter_rows()
+    ]
+    if not table:
+        return ""
+
+    width = max(len(row) for row in table)
+    lines = []
+    for index, row in enumerate(table):
+        padded = row + [""] * (width - len(row))
+        lines.append("| " + " | ".join(padded) + " |")
+        if index == 0:
+            # Separator between the header row and the data rows
+            lines.append("| " + " | ".join(["---"] * width) + " |")
+
+    return "\n".join(lines)
+
 
 class XlsxConverter(DocumentConverter):
     """
@@ -80,17 +187,15 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Read the Excel file using openpyxl to preserve number formats
+        file_stream.seek(0)
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        # wb.worksheets leaves out chart sheets, which have no cells to render
+        for ws in wb.worksheets:
+            md_content += f"## {ws.title}\n"
+            md_content += _convert_sheet_to_markdown(ws) + "\n\n"
 
         return DocumentConverterResult(markdown=md_content.strip())
 
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
index c96e8f4f6..8005c64ee 100644
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -53,7 +53,12 @@ def accepts(
         url = unquote(url)
         url = url.replace(r"\?", "?").replace(r"\=", "=")
 
-        if not url.startswith("https://www.youtube.com/watch?"):
+        # Accept standard watch URLs, Shorts URLs, and youtu.be short links
+        if not (
+            url.startswith("https://www.youtube.com/watch?")
+            or url.startswith("https://www.youtube.com/shorts/")
+            or url.startswith("https://youtu.be/")
+        ):
             # Not a YouTube URL
             return False
 
@@ -95,6 +100,9 @@ def convert(
                         metadata[key] = content
                     break
 
+        # Extract video ID from various YouTube URL formats
+        video_id = self._extract_video_id(stream_info.url)  # type: ignore
+
         # Try reading the description
         try:
             for script in soup(["script"]):
@@ -147,10 +155,8 @@ def convert(
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
             ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
-            parsed_url = urlparse(stream_info.url)  # type: ignore
-            params = parse_qs(parsed_url.query)  # type: ignore
-            if "v" in params and params["v"][0]:
-                video_id = str(params["v"][0])
+            # video_id was resolved above by _extract_video_id (watch, Shorts, youtu.be)
+            if video_id:
                 transcript_list = ytt_api.list(video_id)
                 languages = ["en"]
                 for transcript in transcript_list:
@@ -223,6 +229,33 @@ def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json t
                     return result
         return None
 
+    def _extract_video_id(self, url: str) -> Union[str, None]:
+        """Return the video ID found in a YouTube URL, or None if there is none.
+
+        Handles watch URLs (``/watch?v=<id>``), Shorts URLs (``/shorts/<id>``)
+        and youtu.be short links (``youtu.be/<id>``).
+        """
+        if not url:
+            return None
+        # Undo the same escaping that accepts() undoes before matching the URL
+        url = unquote(url)
+        url = url.replace(r"\?", "?").replace(r"\=", "=")
+        parsed_url = urlparse(url)
+        segments = [part for part in parsed_url.path.split("/") if part]
+
+        # youtu.be short links: the ID is the first path segment
+        if parsed_url.netloc.lower() == "youtu.be":
+            return segments[0] if segments else None
+
+        # Shorts URLs: match on the path only, so a "/shorts/" that merely
+        # appears in the query string or fragment is not mistaken for an ID
+        if len(segments) >= 2 and segments[0] == "shorts":
+            return segments[1]
+
+        # Standard watch URLs; parse_qs omits parameters with blank values
+        video_ids = parse_qs(parsed_url.query).get("v")
+        return video_ids[0] if video_ids else None
+
     def _retry_operation(self, operation, retries=3, delay=2):
         """Retries the operation if it fails."""
         attempt = 0