Allows parsing of WebP format images

This commit is contained in:
Trenton H 2022-11-14 15:38:35 -08:00
parent 1e11c12d96
commit e96d65f945
4 changed files with 17 additions and 4 deletions

View File

@ -66,6 +66,7 @@ class RasterisedDocumentParser(DocumentParser):
"image/tiff",
"image/bmp",
"image/gif",
"image/webp",
]
def has_alpha(self, image):

View File

@ -15,5 +15,6 @@ def tesseract_consumer_declaration(sender, **kwargs):
"image/tiff": ".tif",
"image/gif": ".gif",
"image/bmp": ".bmp",
"image/webp": ".webp",
},
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.7 KiB

View File

@ -597,23 +597,34 @@ class TestParserFileTypes(DirectoriesMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertTrue("this is a test document" in parser.get_text().lower())
self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self):
    """Parse the sample JPEG and verify the archive file and extracted text.

    The parser is given ``simple.jpg`` with the ``image/jpeg`` MIME type;
    afterwards its ``archive_path`` must point at an existing file and the
    lower-cased text must contain the expected sentence.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
    self.assertTrue(os.path.isfile(parser.archive_path))
    # A single assertIn replaces the duplicated assertTrue("..." in text):
    # same check, but the failure message shows both operands.
    self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self):
    """Parse the sample GIF and verify the archive file and extracted text.

    Runs with the ``OCR_IMAGE_DPI`` setting overridden to 200. The parser is
    given ``simple.gif`` with the ``image/gif`` MIME type; afterwards its
    ``archive_path`` must point at an existing file and the lower-cased text
    must contain the expected sentence.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
    self.assertTrue(os.path.isfile(parser.archive_path))
    # A single assertIn replaces the duplicated assertTrue("..." in text):
    # same check, but the failure message shows both operands.
    self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self):
    """Parse the sample TIFF and verify the archive file and extracted text.

    The parser is given ``simple.tif`` with the ``image/tiff`` MIME type;
    afterwards its ``archive_path`` must point at an existing file and the
    lower-cased text must contain the expected sentence.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
    self.assertTrue(os.path.isfile(parser.archive_path))
    # A single assertIn replaces the duplicated assertTrue("..." in text):
    # same check, but the failure message shows both operands.
    self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72)
def test_webp(self):
    """Parse the sample WebP image and verify the archive file and text.

    Runs with the ``OCR_IMAGE_DPI`` setting overridden to 72. The parser is
    given ``document.webp`` with the ``image/webp`` MIME type; afterwards its
    ``archive_path`` must point at an existing file and the lower-cased text
    must contain the expected sentence.
    """
    parser = RasterisedDocumentParser(None)
    sample_path = os.path.join(self.SAMPLE_FILES, "document.webp")
    parser.parse(sample_path, "image/webp")

    archive_exists = os.path.isfile(parser.archive_path)
    self.assertTrue(archive_exists)

    # OCR consistently mangles the space between "a" and "webp", so the
    # expected text deliberately reads "awebp".
    expected_text = "this is awebp document, created 11/14/2022."
    extracted_text = parser.get_text().lower()
    self.assertIn(expected_text, extracted_text)