mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Allows parsing of WebP format images
This commit is contained in:
parent
1e11c12d96
commit
e96d65f945
@ -66,6 +66,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
]
|
||||
|
||||
def has_alpha(self, image):
|
||||
|
@ -15,5 +15,6 @@ def tesseract_consumer_declaration(sender, **kwargs):
|
||||
"image/tiff": ".tif",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
"image/webp": ".webp",
|
||||
},
|
||||
}
|
||||
|
BIN
src/paperless_tesseract/tests/samples/document.webp
Executable file
BIN
src/paperless_tesseract/tests/samples/document.webp
Executable file
Binary file not shown.
After Width: | Height: | Size: 5.7 KiB |
@ -597,23 +597,34 @@ class TestParserFileTypes(DirectoriesMixin, TestCase):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
def test_jpg(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_IMAGE_DPI=200)
|
||||
def test_gif(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
def test_tiff(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
||||
self.assertIn("this is a test document", parser.get_text().lower())
|
||||
|
||||
@override_settings(OCR_IMAGE_DPI=72)
|
||||
def test_webp(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
# OCR consistently mangles this space, oh well
|
||||
self.assertIn(
|
||||
"this is awebp document, created 11/14/2022.",
|
||||
parser.get_text().lower(),
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user