Don't consider better OCR as failing

Tesseract 5.3.0 does a better job at OCR, and correctly
reads "a webp" instead of "awebp", this is good, so we
don't want the test to fail.
This commit is contained in:
Dennis Brakhane 2023-07-11 16:41:31 +02:00
parent db48d4c576
commit 93009c1eed

View File

@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
self.assertIsFile(parser.archive_path)
# OCR consistent mangles this space, oh well
self.assertIn(
"this is awebp document, created 11/14/2022.",
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both
self.assertRegex(
parser.get_text().lower(),
r"this is a ?webp document, created 11/14/2022.",
)