Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import posixpath
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document
Expand Down Expand Up @@ -92,7 +93,7 @@ def convert(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
self._resolve_archive_path(base_path, manifest[item_id])
for item_id in spine_order
if item_id in manifest
]
Expand Down Expand Up @@ -129,6 +130,11 @@ def convert(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)

def _resolve_archive_path(self, base_path: str, href: str) -> str:
    """Resolve a manifest href to the name of its entry inside the ZIP archive.

    Manifest hrefs are URL references relative to the directory holding the
    OPF file, whereas ZIP entry names are plain paths. The href is therefore
    stripped of any fragment and percent-decoded before it is joined to
    ``base_path`` and normalized.

    Args:
        base_path: Directory of the OPF file inside the archive; an empty
            string when the OPF sits at the archive root.
        href: The ``href`` attribute of a manifest item.

    Returns:
        The POSIX-style archive path with ``.`` and ``..`` segments collapsed.
    """
    from urllib.parse import unquote

    # A fragment ("chapter.xhtml#start") addresses a location inside the
    # document and is never part of the archive entry name.
    document_ref = href.split("#", 1)[0]
    # "My%20Chapter.xhtml" in the OPF refers to the entry "My Chapter.xhtml".
    entry_path = unquote(document_ref)
    # normpath already drops "." segments, so no leading "./" can remain.
    return posixpath.normpath(posixpath.join(base_path, entry_path))

def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
Expand Down
19 changes: 18 additions & 1 deletion packages/markitdown/src/markitdown/converters/_html_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""

def _normalize_namespaced_tags(self, soup: BeautifulSoup) -> None:
    """Strip namespace prefixes from every tag name in *soup*, in place.

    A name such as ``html:body`` becomes ``body``; a name without a colon is
    reassigned unchanged.
    """
    for element in soup.find_all():
        qualified_name = element.name
        if not isinstance(qualified_name, str):
            continue
        # Keep only what follows the last colon (the whole name if there is none).
        _, _, local_name = qualified_name.rpartition(":")
        element.name = local_name

def _find_body_element(self, soup: BeautifulSoup):
    """Return the document's body tag, or None when there is none.

    A plain ``body`` tag is preferred. Failing that, the first tag whose name,
    ignoring any namespace prefix and letter case, is ``body`` is returned.
    """
    direct_match = soup.find("body")
    if direct_match is not None:
        return direct_match

    # Fall back to prefixed names such as "html:body".
    prefixed_matches = (
        candidate
        for candidate in soup.find_all()
        if isinstance(candidate.name, str)
        and candidate.name.rsplit(":", 1)[-1].lower() == "body"
    )
    return next(prefixed_matches, None)

def accepts(
self,
file_stream: BinaryIO,
Expand Down Expand Up @@ -47,13 +63,14 @@ def convert(
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
self._normalize_namespaced_tags(soup)

# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()

# Print only the main content
body_elm = soup.find("body")
body_elm = self._find_body_element(soup)
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
Expand Down
70 changes: 70 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -288,6 +289,75 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_epub_relative_paths_and_prefixed_body(tmp_path) -> None:
    """Convert an EPUB with ``../`` manifest hrefs and ``html:``-prefixed chapter tags.

    The OPF lives in ``EPUB/OPS`` while the chapters live in ``EPUB/Text``, so
    the manifest hrefs must be resolved relative to the OPF directory. The
    chapter markup prefixes every element with ``html:`` and carries a style
    block whose CSS must not appear in the converted text.
    """
    container_xml = """<?xml version="1.0" encoding="utf-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile media-type="application/oebps-package+xml" full-path="EPUB/OPS/content.opf"/>
</rootfiles>
</container>
"""
    content_opf = """<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid" version="3.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">relative-prefixed-test</dc:identifier>
<dc:title>Relative Paths EPUB</dc:title>
<dc:language>en</dc:language>
<dc:creator>Test Author</dc:creator>
<dc:description>Exercises relative manifest paths and namespaced body tags.</dc:description>
</metadata>
<manifest>
<item id="chapter1" href="../Text/chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="chapter2" href="../Text/chapter2.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="chapter1"/>
<itemref idref="chapter2"/>
</spine>
</package>
"""
    chapter_template = """<?xml version="1.0" encoding="utf-8"?>
<html:html xmlns:html="http://www.w3.org/1999/xhtml">
<html:head>
<html:title>{title}</html:title>
<html:style>.hero {{ color: red; }}</html:style>
</html:head>
<html:body>
<html:h1>{title}</html:h1>
<html:p>{body}</html:p>
</html:body>
</html:html>
"""

    # Archive member name -> (chapter title, paragraph text), in write order.
    chapters = {
        "EPUB/Text/chapter1.xhtml": (
            "Chapter 1",
            "The first chapter should be present without CSS noise.",
        ),
        "EPUB/Text/chapter2.xhtml": (
            "Chapter 2",
            "The second chapter depends on relative path resolution.",
        ),
    }

    epub_path = tmp_path / "relative-prefixed.epub"
    with zipfile.ZipFile(epub_path, "w") as archive:
        archive.writestr("mimetype", "application/epub+zip")
        archive.writestr("META-INF/container.xml", container_xml)
        archive.writestr("EPUB/OPS/content.opf", content_opf)
        for member_name, (title, body) in chapters.items():
            archive.writestr(
                member_name,
                chapter_template.format(title=title, body=body),
            )

    converted_text = MarkItDown().convert(str(epub_path)).text_content

    expected_snippets = (
        "# Chapter 1",
        "# Chapter 2",
        "The second chapter depends on relative path resolution.",
    )
    for snippet in expected_snippets:
        assert snippet in converted_text
    assert ".hero" not in converted_text


def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()
Expand Down