microsoft · pablomartinezmake-code · Apr 12, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -1,4 +1,5 @@
 import os
+import posixpath
 import zipfile
 from defusedxml import minidom
 from xml.dom.minidom import Document
@@ -92,7 +93,7 @@ def convert(
                 opf_path.split("/")[:-1]
             )  # Get base directory of content.opf
             spine = [
-                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+                self._resolve_archive_path(base_path, manifest[item_id])
                 for item_id in spine_order
                 if item_id in manifest
             ]
@@ -129,6 +130,11 @@ def convert(
                 markdown="\n\n".join(markdown_content), title=metadata["title"]
             )
 
+    def _resolve_archive_path(self, base_path: str, href: str) -> str:
+        """Resolve a manifest href against the OPF directory into a normalized ZIP member path."""
+        # normpath folds "." and ".." segments, so the result never keeps a leading "./".
+        return posixpath.normpath(posixpath.join(base_path, href))
+
     def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
         """Convenience function to extract a single occurrence of a tag (e.g., title)."""
         texts = self._get_all_texts_from_nodes(dom, tag_name)

diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -20,6 +20,22 @@
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
 
+    def _normalize_namespaced_tags(self, soup: BeautifulSoup) -> None:
+        """Rename namespace-prefixed tags in place to their local name (``html:body`` -> ``body``)."""
+        for element in (t for t in soup.find_all() if isinstance(t.name, str)):
+            element.name = element.name.rpartition(":")[2]
+
+    def _find_body_element(self, soup: BeautifulSoup):
+        """Return the document's body tag, or None when there is none.
+
+        A plain ``body`` tag wins; failing that, the first tag whose name minus any
+        ``prefix:`` equals ``body`` case-insensitively (e.g. ``html:body``) is used.
+        """
+        if (plain_body := soup.find("body")) is not None:
+            return plain_body
+        tags = (t for t in soup.find_all() if isinstance(t.name, str))
+        return next((t for t in tags if t.name.rpartition(":")[2].lower() == "body"), None)
+
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -47,13 +63,14 @@ def convert(
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        self._normalize_namespaced_tags(soup)
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
             script.extract()
 
         # Print only the main content
-        body_elm = soup.find("body")
+        body_elm = self._find_body_element(soup)
         webpage_text = ""
         if body_elm:
             webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
@@ -288,6 +289,75 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_epub_relative_paths_and_prefixed_body(tmp_path) -> None:
+    epub_path = tmp_path / "relative-prefixed.epub"
+    # container.xml points at an OPF nested two levels deep (EPUB/OPS/content.opf).
+    container_xml = """<?xml version="1.0" encoding="utf-8"?>
+<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
+  <rootfiles>
+    <rootfile media-type="application/oebps-package+xml" full-path="EPUB/OPS/content.opf"/>
+  </rootfiles>
+</container>
+"""
+    content_opf = """<?xml version="1.0" encoding="utf-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid" version="3.0">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:identifier id="bookid">relative-prefixed-test</dc:identifier>
+    <dc:title>Relative Paths EPUB</dc:title>
+    <dc:language>en</dc:language>
+    <dc:creator>Test Author</dc:creator>
+    <dc:description>Exercises relative manifest paths and namespaced body tags.</dc:description>
+  </metadata>
+  <manifest>
+    <item id="chapter1" href="../Text/chapter1.xhtml" media-type="application/xhtml+xml"/>
+    <item id="chapter2" href="../Text/chapter2.xhtml" media-type="application/xhtml+xml"/>
+  </manifest>
+  <spine>
+    <itemref idref="chapter1"/>
+    <itemref idref="chapter2"/>
+  </spine>
+</package>
+"""
+    chapter_template = """<?xml version="1.0" encoding="utf-8"?>
+<html:html xmlns:html="http://www.w3.org/1999/xhtml">
+  <html:head>
+    <html:title>{title}</html:title>
+    <html:style>.hero {{ color: red; }}</html:style>
+  </html:head>
+  <html:body>
+    <html:h1>{title}</html:h1>
+    <html:p>{body}</html:p>
+  </html:body>
+</html:html>
+"""
+    # Build the archive; chapters are stored under EPUB/Text/, a sibling of the OPF directory.
+    with zipfile.ZipFile(epub_path, "w") as z:
+        z.writestr("mimetype", "application/epub+zip")
+        z.writestr("META-INF/container.xml", container_xml)
+        z.writestr("EPUB/OPS/content.opf", content_opf)
+        z.writestr(
+            "EPUB/Text/chapter1.xhtml",
+            chapter_template.format(
+                title="Chapter 1",
+                body="The first chapter should be present without CSS noise.",
+            ),
+        )
+        z.writestr(
+            "EPUB/Text/chapter2.xhtml",
+            chapter_template.format(
+                title="Chapter 2",
+                body="The second chapter depends on relative path resolution.",
+            ),
+        )
+
+    result = MarkItDown().convert(str(epub_path))
+    # Both chapters must be located and rendered; the <html:style> CSS must not leak into the output.
+    assert "# Chapter 1" in result.text_content
+    assert "# Chapter 2" in result.text_content
+    assert "The second chapter depends on relative path resolution." in result.text_content
+    assert ".hero" not in result.text_content
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()