From 93009c1eeda1cc339dbf3acb0882dc0313c8a12f Mon Sep 17 00:00:00 2001
From: Dennis Brakhane <brakhane@gmail.com>
Date: Tue, 11 Jul 2023 16:41:31 +0200
Subject: [PATCH] Don't consider better OCR as failing

Tesseract 5.3.0 does a better job at OCR and correctly
reads "a webp" instead of "awebp". This is an improvement,
so the test should not fail because of it.
---
 src/paperless_tesseract/tests/test_parser.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index 7850ad4ef..8b3de5615 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         parser = RasterisedDocumentParser(None)
         parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
         self.assertIsFile(parser.archive_path)
-        # OCR consistent mangles this space, oh well
-        self.assertIn(
-            "this is awebp document, created 11/14/2022.",
+        # Older Tesseract versions consistently drop the space in "a webp";
+        # Tesseract 5.3.0 reads it correctly, so we accept both spellings.
+        self.assertRegex(
             parser.get_text().lower(),
+            r"this is a ?webp document, created 11/14/2022.",
         )