diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index aa3ad64fa..bde2ad25e 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -66,6 +66,7 @@ class RasterisedDocumentParser(DocumentParser): "image/tiff", "image/bmp", "image/gif", + "image/webp", ] def has_alpha(self, image): diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 85f2cab9f..c4fd1e039 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -15,5 +15,6 @@ def tesseract_consumer_declaration(sender, **kwargs): "image/tiff": ".tif", "image/gif": ".gif", "image/bmp": ".bmp", + "image/webp": ".webp", }, } diff --git a/src/paperless_tesseract/tests/samples/document.webp b/src/paperless_tesseract/tests/samples/document.webp new file mode 100755 index 000000000..c19ba2980 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/document.webp differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 67c1ad859..a0550bde9 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -597,23 +597,34 @@ class TestParserFileTypes(DirectoriesMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp") self.assertTrue(os.path.isfile(parser.archive_path)) - self.assertTrue("this is a test document" in parser.get_text().lower()) + self.assertIn("this is a test document", parser.get_text().lower()) def test_jpg(self): parser = RasterisedDocumentParser(None) parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg") self.assertTrue(os.path.isfile(parser.archive_path)) - self.assertTrue("this is a test document" in parser.get_text().lower()) + self.assertIn("this is a test document", parser.get_text().lower()) @override_settings(OCR_IMAGE_DPI=200) def test_gif(self): parser 
= RasterisedDocumentParser(None) parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif") self.assertTrue(os.path.isfile(parser.archive_path)) - self.assertTrue("this is a test document" in parser.get_text().lower()) + self.assertIn("this is a test document", parser.get_text().lower()) def test_tiff(self): parser = RasterisedDocumentParser(None) parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff") self.assertTrue(os.path.isfile(parser.archive_path)) - self.assertTrue("this is a test document" in parser.get_text().lower()) + self.assertIn("this is a test document", parser.get_text().lower()) + + @override_settings(OCR_IMAGE_DPI=72) + def test_webp(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp") + self.assertTrue(os.path.isfile(parser.archive_path)) + # OCR consistently mangles the space in "a webp", oh well + self.assertIn( + "this is awebp document, created 11/14/2022.", + parser.get_text().lower(), + )