diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..5f6810d96 100644
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -1,4 +1,5 @@
import os
+import posixpath
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document
@@ -92,7 +93,7 @@ def convert(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
- f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+ self._resolve_archive_path(base_path, manifest[item_id])
for item_id in spine_order
if item_id in manifest
]
@@ -129,6 +130,11 @@ def convert(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
+    def _resolve_archive_path(self, base_path: str, href: str) -> str:
+        """Resolve a manifest href to the name of its member in the ZIP archive.
+
+        Args:
+            base_path: Directory of the OPF file inside the archive, or "" when
+                the OPF sits at the archive root.
+            href: The ``href`` of a manifest item, relative to ``base_path``.
+
+        Returns:
+            The POSIX-style member name with percent-escapes decoded, ``.`` and
+            ``..`` segments collapsed, and no leading slash.
+        """
+        from urllib.parse import unquote
+
+        # Manifest hrefs are URL references, so "chapter%201.xhtml" refers to
+        # the member "chapter 1.xhtml"; decode before building the member name.
+        joined = posixpath.join(base_path, unquote(href))
+        # normpath already drops "." segments, so its result never starts with
+        # "./". It can start with "/" when the href is root-relative, and ZIP
+        # member names are relative, so strip that instead.
+        return posixpath.normpath(joined).lstrip("/")
+
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index dabb0d7d3..e6edd453d 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -20,6 +20,22 @@
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
+    def _normalize_namespaced_tags(self, soup: BeautifulSoup) -> None:
+        """Strip namespace prefixes from the tag names in ``soup``, in place.
+
+        A tag named ``html:body`` becomes ``body``, so that name-based lookups
+        such as ``soup.find("body")`` or ``soup(["script", "style"])`` also
+        match elements written with a prefix.
+
+        Args:
+            soup: The parsed document to modify.
+        """
+        for tag in soup.find_all():
+            name = tag.name
+            # Nothing to do for unprefixed (or non-string) names.
+            if not isinstance(name, str) or ":" not in name:
+                continue
+            local_name = name.rsplit(":", 1)[-1]
+            # A name ending in ":" has no local part; keep it as-is rather
+            # than renaming the tag to the empty string.
+            if local_name:
+                tag.name = local_name
+
+    def _find_body_element(self, soup: BeautifulSoup):
+        """Return the body element of ``soup``, or None when there is none.
+
+        A tag named exactly ``body`` wins. Otherwise the first tag yielded by
+        ``soup.find_all()`` whose name, with any ``prefix:`` removed and case
+        ignored, equals ``body`` is returned.
+        """
+        exact_match = soup.find("body")
+        if exact_match is not None:
+            return exact_match
+
+        prefixed_matches = (
+            candidate
+            for candidate in soup.find_all()
+            if isinstance(candidate.name, str)
+            and candidate.name.rsplit(":", 1)[-1].lower() == "body"
+        )
+        return next(prefixed_matches, None)
+
def accepts(
self,
file_stream: BinaryIO,
@@ -47,13 +63,14 @@ def convert(
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+ self._normalize_namespaced_tags(soup)
# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()
# Print only the main content
- body_elm = soup.find("body")
+ body_elm = self._find_body_element(soup)
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..0e3fc8209 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
+import zipfile
import pytest
from unittest.mock import MagicMock
@@ -288,6 +289,75 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content
+def test_epub_relative_paths_and_prefixed_body(tmp_path) -> None:
+    """Convert an EPUB whose chapters sit outside the OPF directory.
+
+    The OPF lives in ``EPUB/OPS/`` while the chapters live in ``EPUB/Text/``,
+    so the manifest hrefs must be resolved relative to the OPF directory. The
+    chapters use namespace-prefixed XHTML tags, so the body and the style
+    block are only recognized once prefixes are handled.
+    """
+    epub_path = tmp_path / "relative-prefixed.epub"
+
+    # NOTE(review): the markup in these fixtures was rebuilt around the
+    # surviving text content; confirm the prefix and hrefs against the
+    # original change.
+    container_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+  <rootfiles>
+    <rootfile full-path="EPUB/OPS/content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>
+"""
+    content_opf = """<?xml version="1.0" encoding="UTF-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:identifier id="bookid">relative-prefixed-test</dc:identifier>
+    <dc:title>Relative Paths EPUB</dc:title>
+    <dc:language>en</dc:language>
+    <dc:creator>Test Author</dc:creator>
+    <dc:description>Exercises relative manifest paths and namespaced body tags.</dc:description>
+  </metadata>
+  <manifest>
+    <item id="chapter1" href="../Text/chapter1.xhtml" media-type="application/xhtml+xml"/>
+    <item id="chapter2" href="../Text/chapter2.xhtml" media-type="application/xhtml+xml"/>
+  </manifest>
+  <spine>
+    <itemref idref="chapter1"/>
+    <itemref idref="chapter2"/>
+  </spine>
+</package>
+"""
+    chapter_template = """<?xml version="1.0" encoding="UTF-8"?>
+<html:html xmlns:html="http://www.w3.org/1999/xhtml">
+  <html:head>
+    <html:title>{title}</html:title>
+    <html:style>.hero {{ color: red; }}</html:style>
+  </html:head>
+  <html:body>
+    <html:h1>{title}</html:h1>
+    <html:p>{body}</html:p>
+  </html:body>
+</html:html>
+"""
+
+    with zipfile.ZipFile(epub_path, "w") as z:
+        z.writestr("mimetype", "application/epub+zip")
+        z.writestr("META-INF/container.xml", container_xml)
+        z.writestr("EPUB/OPS/content.opf", content_opf)
+        z.writestr(
+            "EPUB/Text/chapter1.xhtml",
+            chapter_template.format(
+                title="Chapter 1",
+                body="The first chapter should be present without CSS noise.",
+            ),
+        )
+        z.writestr(
+            "EPUB/Text/chapter2.xhtml",
+            chapter_template.format(
+                title="Chapter 2",
+                body="The second chapter depends on relative path resolution.",
+            ),
+        )
+
+    result = MarkItDown().convert(str(epub_path))
+
+    # Both chapters are reachable only if "../Text/..." is resolved against
+    # the OPF directory.
+    assert "# Chapter 1" in result.text_content
+    assert "# Chapter 2" in result.text_content
+    assert "The second chapter depends on relative path resolution." in result.text_content
+    # The prefixed style block must be dropped like an unprefixed one.
+    assert ".hero" not in result.text_content
+
+
def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()