-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(tests): add comprehensive tests for MarkItDown
Introduce new test cases for various file formats including XLSX, DOCX, PPTX, RSS, and CSV. Add a validation utility to verify that the output strings match expected results. These changes enhance test coverage and ensure the reliability of the MarkItDown conversion functionality.
- Loading branch information
1 parent
125e206
commit 2a75273
Showing
4 changed files
with
166 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import os | ||
from pathlib import Path | ||
|
||
import pytest | ||
|
||
from markitdown import MarkItDown | ||
from tests.helpers.utils import validate_strings | ||
|
||
# Directory holding the fixture documents: ``test_files`` located two
# directory levels above this module.
TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# Substrings expected in the converted output of ``test.xlsx``; the same list
# is also checked against ``test_files.zip`` in ``test_common``.
XLSX_TEST_STRINGS = [
    "## 09060124-b5e7-4717-9d07-3c046eb",
    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]

# Substrings expected in the converted output of ``test.docx``.
DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]

# Same entries as DOCX_TEST_STRINGS plus two comment texts; checked against
# ``test_with_comment.docx`` when a ``style_map`` is supplied.
DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

# Substrings expected in the converted output of ``test.pptx``.
PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]

# URL passed as the ``url`` keyword when converting ``test_blog.html``, and the
# substrings expected in that conversion's output.
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]


# Substrings expected in the converted output of ``test_rss.xml``.
RSS_TEST_STRINGS = [
    "The Official Microsoft Blog",
    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]


# URL passed as the ``url`` keyword when converting ``test_wikipedia.html``,
# followed by substrings that must appear and substrings that must NOT appear
# in the output.
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
    'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
]
WIKIPEDIA_TEST_EXCLUDES = [
    "You are encouraged to create an account and log in",
    "154 languages",
    "move to sidebar",
]

# URL passed as the ``url`` keyword when converting ``test_serp.html``,
# followed by substrings that must appear and substrings that must NOT appear
# in the output.
SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
SERP_TEST_STRINGS = [
    "](https://en.wikipedia.org/wiki/Microsoft",
    "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
    "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
]
SERP_TEST_EXCLUDES = [
    "https://www.bing.com/ck/a?!&&p=",
    "data:image/svg+xml,%3Csvg%20width%3D",
]

# Rows expected in the converted output of ``test_mskanji.csv``.
# NOTE(review): the names suggest the fixture is CP932 (MS-Kanji) encoded --
# confirm against the fixture file.
CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]
|
||
|
||
@pytest.fixture
def markitdown() -> MarkItDown:
    """Provide a ``MarkItDown`` instance constructed with no arguments."""
    converter = MarkItDown()
    return converter
|
||
|
||
# One entry per fixture file:
# (filename, expected substrings, excluded substrings or None, convert kwargs).
_COMMON_CASES = [
    ("test.xlsx", XLSX_TEST_STRINGS, None, {}),
    ("test.docx", DOCX_TEST_STRINGS, None, {}),
    (
        "test_with_comment.docx",
        DOCX_COMMENT_TEST_STRINGS,
        None,
        {"style_map": "comment-reference => "},
    ),
    ("test.pptx", PPTX_TEST_STRINGS, None, {}),
    ("test_blog.html", BLOG_TEST_STRINGS, None, {"url": BLOG_TEST_URL}),
    ("test_files.zip", XLSX_TEST_STRINGS, None, {}),
    (
        "test_wikipedia.html",
        WIKIPEDIA_TEST_STRINGS,
        WIKIPEDIA_TEST_EXCLUDES,
        {"url": WIKIPEDIA_TEST_URL},
    ),
    (
        "test_serp.html",
        SERP_TEST_STRINGS,
        SERP_TEST_EXCLUDES,
        {"url": SERP_TEST_URL},
    ),
    ("test_rss.xml", RSS_TEST_STRINGS, None, {}),
    ("test_mskanji.csv", CSV_CP932_TEST_STRINGS, None, {}),
]


@pytest.mark.parametrize(
    "filename, expected_strings, exclude_strings, kwargs",
    _COMMON_CASES,
)
def test_common(
    markitdown: MarkItDown,
    filename: str,
    expected_strings: list,
    exclude_strings: list,
    kwargs,
) -> None:
    """Convert one fixture file and check its text for required/forbidden strings.

    Args:
        markitdown: Converter supplied by the ``markitdown`` fixture.
        filename: Name of the fixture file inside ``TEST_FILES_DIR``.
        expected_strings: Substrings that must occur in the converted text.
        exclude_strings: Substrings that must not occur; ``None`` for cases
            without exclusions.
        kwargs: Extra keyword arguments forwarded to ``convert``.
    """
    conversion = markitdown.convert(TEST_FILES_DIR / filename, **kwargs)
    validate_strings(conversion, expected_strings, exclude_strings)
|
||
|
||
def test_docx() -> None:
    """Check DOCX conversion without a style map and with one.

    The ``style_map`` option is exercised twice: once passed to ``convert``
    and once passed to the ``MarkItDown`` constructor.
    """
    comment_style_map = "comment-reference => "
    plain_docx = os.path.join(TEST_FILES_DIR, "test.docx")
    commented_docx = os.path.join(TEST_FILES_DIR, "test_with_comment.docx")

    # Plain DOCX with a default converter.
    converter = MarkItDown()
    validate_strings(converter.convert(plain_docx), DOCX_TEST_STRINGS)

    # DOCX with comments, style_map given per call.
    per_call_result = converter.convert(commented_docx, style_map=comment_style_map)
    validate_strings(per_call_result, DOCX_COMMENT_TEST_STRINGS)

    # DOCX with comments, style_map given when the converter is created.
    styled_converter = MarkItDown(style_map=comment_style_map)
    init_result = styled_converter.convert(commented_docx)
    validate_strings(init_result, DOCX_COMMENT_TEST_STRINGS)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# --- Helper Functions --- | ||
|
||
|
||
def validate_strings(result, expected_strings, exclude_strings=None):
    """Validate presence or absence of specific strings.

    Args:
        result: Object exposing a ``text_content`` string attribute.
        expected_strings: Iterable of substrings that must all occur in the
            text.
        exclude_strings: Optional iterable of substrings that must not occur
            in the text. ``None`` or an empty collection skips this check.

    Raises:
        AssertionError: If any expected string is missing or any excluded
            string is present. The message lists every offending string.
    """
    # Backslashes are stripped before matching -- presumably so Markdown
    # escape characters in the output do not break substring comparison.
    text_content = result.text_content.replace("\\", "")

    # Raise explicitly instead of using bare ``assert``: this helper lives
    # outside the test modules, so a bare assert would fail without a useful
    # message and would be stripped entirely under ``python -O``.
    missing = [s for s in expected_strings if s not in text_content]
    if missing:
        raise AssertionError(
            f"Expected string(s) not found in converted text: {missing!r}"
        )

    unexpected = [s for s in (exclude_strings or ()) if s in text_content]
    if unexpected:
        raise AssertionError(
            f"Excluded string(s) found in converted text: {unexpected!r}"
        )