NOTE(review): this patch was recovered from a newline-collapsed, markup-stripped copy.
The XML/XHTML fixture markup in the test hunk is a reconstruction (prefix, ids and
attribute values are assumptions) -- confirm against the original patch before merging.

diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..5f6810d96 100644
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -1,4 +1,6 @@
 import os
+import posixpath
 import zipfile
+from urllib.parse import unquote
 from defusedxml import minidom
 from xml.dom.minidom import Document
@@ -92,7 +94,7 @@ def convert(
                 opf_path.split("/")[:-1]
             )  # Get base directory of content.opf
             spine = [
-                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+                self._resolve_archive_path(base_path, manifest[item_id])
                 for item_id in spine_order
                 if item_id in manifest
             ]
@@ -129,6 +131,17 @@ def convert(
                 markdown="\n\n".join(markdown_content), title=metadata["title"]
             )
 
+    def _resolve_archive_path(self, base_path: str, href: str) -> str:
+        """Resolve a manifest href to a ZIP member name.
+
+        The href is interpreted relative to ``base_path`` (the directory of the
+        OPF file), so ``..`` segments are collapsed against it.
+        """
+        # Manifest hrefs are URLs: drop any fragment and percent-decode so the
+        # result matches the raw member names stored in the archive.
+        member = unquote(href.split("#", 1)[0])
+        return posixpath.normpath(posixpath.join(base_path, member))
+
     def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
         """Convenience function to extract a single occurrence of a tag (e.g., title)."""
         texts = self._get_all_texts_from_nodes(dom, tag_name)
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index dabb0d7d3..e6edd453d 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -20,6 +20,24 @@
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
 
+    def _normalize_namespaced_tags(self, soup: BeautifulSoup) -> None:
+        """Strip namespace prefixes from all tag names, in place."""
+        for tag in soup.find_all():
+            if isinstance(tag.name, str):
+                tag.name = tag.name.rsplit(":", 1)[-1]
+
+    def _find_body_element(self, soup: BeautifulSoup):
+        """Return the body tag, accepting a namespace-prefixed name, or None."""
+        body_elm = soup.find("body")
+        if body_elm is not None:
+            return body_elm
+
+        for tag in soup.find_all():
+            if isinstance(tag.name, str) and tag.name.rsplit(":", 1)[-1].lower() == "body":
+                return tag
+
+        return None
+
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -47,13 +65,14 @@ def convert(
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        self._normalize_namespaced_tags(soup)
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
             script.extract()
 
         # Print only the main content
-        body_elm = soup.find("body")
+        body_elm = self._find_body_element(soup)
         webpage_text = ""
         if body_elm:
             webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..0e3fc8209 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
@@ -288,6 +289,75 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_epub_relative_paths_and_prefixed_body(tmp_path) -> None:
+    epub_path = tmp_path / "relative-prefixed.epub"
+
+    container_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+  <rootfiles>
+    <rootfile full-path="EPUB/OPS/content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>
+"""
+    content_opf = """<?xml version="1.0" encoding="UTF-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:identifier id="bookid">relative-prefixed-test</dc:identifier>
+    <dc:title>Relative Paths EPUB</dc:title>
+    <dc:language>en</dc:language>
+    <dc:creator>Test Author</dc:creator>
+    <dc:description>Exercises relative manifest paths and namespaced body tags.</dc:description>
+  </metadata>
+  <manifest>
+    <item id="chapter1" href="../Text/chapter1.xhtml" media-type="application/xhtml+xml"/>
+    <item id="chapter2" href="../Text/chapter2.xhtml" media-type="application/xhtml+xml"/>
+  </manifest>
+  <spine>
+    <itemref idref="chapter1"/>
+    <itemref idref="chapter2"/>
+  </spine>
+</package>
+"""
+    chapter_template = """<?xml version="1.0" encoding="UTF-8"?>
+<xhtml:html xmlns:xhtml="http://www.w3.org/1999/xhtml">
+  <xhtml:head>
+    <xhtml:title>{title}</xhtml:title>
+    <xhtml:style>.hero {{ color: red; }}</xhtml:style>
+  </xhtml:head>
+  <xhtml:body>
+    <xhtml:h1>{title}</xhtml:h1>
+    <xhtml:p>{body}</xhtml:p>
+  </xhtml:body>
+</xhtml:html>
+"""
+
+    with zipfile.ZipFile(epub_path, "w") as z:
+        z.writestr("mimetype", "application/epub+zip")
+        z.writestr("META-INF/container.xml", container_xml)
+        z.writestr("EPUB/OPS/content.opf", content_opf)
+        z.writestr(
+            "EPUB/Text/chapter1.xhtml",
+            chapter_template.format(
+                title="Chapter 1",
+                body="The first chapter should be present without CSS noise.",
+            ),
+        )
+        z.writestr(
+            "EPUB/Text/chapter2.xhtml",
+            chapter_template.format(
+                title="Chapter 2",
+                body="The second chapter depends on relative path resolution.",
+            ),
+        )
+
+    result = MarkItDown().convert(str(epub_path))
+
+    assert "# Chapter 1" in result.text_content
+    assert "# Chapter 2" in result.text_content
+    assert "The second chapter depends on relative path resolution." in result.text_content
+    assert ".hero" not in result.text_content
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()