mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Don't consider better OCR as failing
Tesseract 5.3.0 does a better job at OCR and correctly reads "a webp" instead of "awebp". This is good, so we don't want the test to fail.
This commit is contained in:
parent
db48d4c576
commit
93009c1eed
@@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
|
||||||
self.assertIsFile(parser.archive_path)
|
self.assertIsFile(parser.archive_path)
|
||||||
# OCR consistent mangles this space, oh well
|
# Older tesseracts consistently mangle the space between "a webp",
|
||||||
self.assertIn(
|
# tesseract 5.3.0 seems to do a better job, so we're accepting both
|
||||||
"this is awebp document, created 11/14/2022.",
|
self.assertRegex(
|
||||||
parser.get_text().lower(),
|
parser.get_text().lower(),
|
||||||
|
r"this is a ?webp document, created 11/14/2022.",
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user