# File: AstrBot/tests/test_epub_parser.py
# (listing metadata: 221 lines, 6.2 KiB, Python)

from __future__ import annotations
import io
import zipfile
import pytest
from astrbot.core.knowledge_base.parsers.epub_parser import EpubParser
from astrbot.core.knowledge_base.parsers.markitdown_parser import MarkitdownParser
from astrbot.core.knowledge_base.parsers.util import select_parser
def _make_epub_bytes() -> bytes:
    """Assemble a small EPUB archive in memory and return its raw bytes.

    The archive contains a navigation document plus two chapters. Both the
    navigation list and the OPF spine place chapter 2 ahead of chapter 1,
    and the spine additionally lists the navigation document first.
    """
    container_xml = """<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
"""
    nav_xhtml = """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Navigation</title>
</head>
<body>
<nav epub:type="toc" xmlns:epub="http://www.idpf.org/2007/ops">
<ol>
<li><a href="chapter2.xhtml">Chapter 2</a></li>
<li><a href="chapter1.xhtml">Chapter 1</a></li>
</ol>
</nav>
</body>
</html>
"""
    chapter_two_xhtml = """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Chapter 2</title>
</head>
<body>
<h2>Second</h2>
<p>Beta paragraph.</p>
</body>
</html>
"""
    chapter_one_xhtml = """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Chapter 1</title>
</head>
<body>
<h1>First</h1>
<p>Alpha paragraph.</p>
<ul>
<li>Point A</li>
<li>Point B</li>
</ul>
</body>
</html>
"""
    package_opf = """<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">test-book</dc:identifier>
<dc:title>Test Book</dc:title>
<dc:language>en</dc:language>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="chapter2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
<item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="nav"/>
<itemref idref="chapter2"/>
<itemref idref="chapter1"/>
</spine>
</package>
"""

    # Members written after the mimetype entry, in exactly this order.
    members = (
        ("META-INF/container.xml", container_xml),
        ("OEBPS/nav.xhtml", nav_xhtml),
        ("OEBPS/chapter2.xhtml", chapter_two_xhtml),
        ("OEBPS/chapter1.xhtml", chapter_one_xhtml),
        ("OEBPS/content.opf", package_opf),
    )

    stream = io.BytesIO()
    with zipfile.ZipFile(stream, mode="w") as epub:
        # The mimetype entry goes in first and is explicitly stored
        # without compression.
        epub.writestr(
            "mimetype",
            "application/epub+zip",
            compress_type=zipfile.ZIP_STORED,
        )
        for member_name, payload in members:
            epub.writestr(member_name, payload)
    return stream.getvalue()
def _make_epub_bytes_with_generic_content() -> bytes:
    """Assemble a single-chapter EPUB in memory and return its raw bytes.

    The one chapter mixes a heading, bare body text, a paragraph with an
    inline link, an image, a ``div``, a ``section`` and a one-row table.
    """
    container_xml = """<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
"""
    chapter_xhtml = """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Chapter 1</title>
</head>
<body>
<h1>First</h1>
Lead text
<p>Piura<a href="text00000.html#filepos0000045863">*5</a>, continued.</p>
<img src="Image00000.jpg" alt="" />
<div>Inside div</div>
<section>Inside section</section>
<table>
<tr>
<td>Cell A</td>
<td>Cell B</td>
</tr>
</table>
</body>
</html>
"""
    package_opf = """<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">test-book</dc:identifier>
<dc:title>Test Book</dc:title>
<dc:language>en</dc:language>
</metadata>
<manifest>
<item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="chapter1"/>
</spine>
</package>
"""

    # Members written after the mimetype entry, in exactly this order.
    members = (
        ("META-INF/container.xml", container_xml),
        ("OEBPS/chapter1.xhtml", chapter_xhtml),
        ("OEBPS/content.opf", package_opf),
    )

    stream = io.BytesIO()
    with zipfile.ZipFile(stream, mode="w") as epub:
        # The mimetype entry goes in first and is explicitly stored
        # without compression.
        epub.writestr(
            "mimetype",
            "application/epub+zip",
            compress_type=zipfile.ZIP_STORED,
        )
        for member_name, payload in members:
            epub.writestr(member_name, payload)
    return stream.getvalue()
@pytest.mark.asyncio
async def test_select_parser_supports_epub():
    """``select_parser`` should hand back an ``EpubParser`` for ``.epub``."""
    chosen = await select_parser(".epub")
    assert isinstance(chosen, EpubParser)
@pytest.mark.asyncio
@pytest.mark.parametrize("ext", [".rst", ".adoc"])
async def test_select_parser_supports_text_markup_formats(ext):
    """``select_parser`` should hand back a ``MarkitdownParser`` for each ``ext``."""
    chosen = await select_parser(ext)
    assert isinstance(chosen, MarkitdownParser)
@pytest.mark.asyncio
async def test_epub_parser_reads_spine_order_as_text():
    """Check the text produced for the two-chapter sample book.

    Expectations: no media, no title banner, navigation entries rendered as
    a plain numbered list rather than Markdown links, and content appearing
    in the order navigation list -> "Second" heading -> "First" heading.
    """
    parser = EpubParser()
    parsed = await parser.parse(_make_epub_bytes(), "book.epub")
    assert parsed.media == []

    text = parsed.text
    assert "**Title:**" not in text
    assert "[Chapter 2](chapter2.xhtml)" not in text
    assert text.startswith("1. Chapter 2")
    assert "2. Chapter 1" in text
    assert "Beta paragraph." in text
    assert "# First" in text
    assert "* Point A" in text

    # Relative ordering of the three landmarks in the output.
    toc_pos = text.index("1. Chapter 2")
    second_pos = text.index("## Second")
    assert toc_pos < second_pos
    first_pos = text.index("# First")
    assert second_pos < first_pos
@pytest.mark.asyncio
async def test_epub_parser_preserves_generic_container_text():
    """Check the text produced for the mixed-content sample chapter.

    Expectations: bare body text, ``div``/``section`` text and the table row
    survive; the inline link is reduced to its (escaped) label; the image
    and any triple newline are absent.
    """
    parser = EpubParser()
    parsed = await parser.parse(
        _make_epub_bytes_with_generic_content(),
        "book.epub",
    )

    text = parsed.text
    assert "**Title:**" not in text
    assert "# First" in text
    assert "Lead text" in text
    # Link target is dropped; its label stays inline with the asterisk escaped.
    assert r"Piura\*5, continued." in text
    assert "filepos" not in text
    assert r"[\*5]" not in text
    # No trace of the image, neither its file name nor an empty image tag.
    assert "Image00000.jpg" not in text
    assert "![](" not in text
    assert "\n\n\n" not in text
    assert "Inside div" in text
    assert "Inside section" in text
    assert "| Cell A | Cell B |" in text