mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-10-22 03:16:15 -05:00
Allows parsing of WebP format images
This commit is contained in:
@@ -66,6 +66,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"image/tiff",
|
"image/tiff",
|
||||||
"image/bmp",
|
"image/bmp",
|
||||||
"image/gif",
|
"image/gif",
|
||||||
|
"image/webp",
|
||||||
]
|
]
|
||||||
|
|
||||||
def has_alpha(self, image):
|
def has_alpha(self, image):
|
||||||
|
@@ -15,5 +15,6 @@ def tesseract_consumer_declaration(sender, **kwargs):
|
|||||||
"image/tiff": ".tif",
|
"image/tiff": ".tif",
|
||||||
"image/gif": ".gif",
|
"image/gif": ".gif",
|
||||||
"image/bmp": ".bmp",
|
"image/bmp": ".bmp",
|
||||||
|
"image/webp": ".webp",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
BIN
src/paperless_tesseract/tests/samples/document.webp
Executable file
BIN
src/paperless_tesseract/tests/samples/document.webp
Executable file
Binary file not shown.
After Width: | Height: | Size: 5.7 KiB |
@@ -597,23 +597,34 @@ class TestParserFileTypes(DirectoriesMixin, TestCase):
|
|||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
|
||||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
self.assertIn("this is a test document", parser.get_text().lower())
|
||||||
|
|
||||||
def test_jpg(self):
|
def test_jpg(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
|
||||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
self.assertIn("this is a test document", parser.get_text().lower())
|
||||||
|
|
||||||
@override_settings(OCR_IMAGE_DPI=200)
|
@override_settings(OCR_IMAGE_DPI=200)
|
||||||
def test_gif(self):
|
def test_gif(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
|
||||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
self.assertIn("this is a test document", parser.get_text().lower())
|
||||||
|
|
||||||
def test_tiff(self):
|
def test_tiff(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
|
||||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
self.assertTrue("this is a test document" in parser.get_text().lower())
|
self.assertIn("this is a test document", parser.get_text().lower())
|
||||||
|
|
||||||
|
@override_settings(OCR_IMAGE_DPI=72)
|
||||||
|
def test_webp(self):
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
|
||||||
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
|
# OCR consistently mangles this space, oh well
|
||||||
|
self.assertIn(
|
||||||
|
"this is awebp document, created 11/14/2022.",
|
||||||
|
parser.get_text().lower(),
|
||||||
|
)
|
||||||
|
Reference in New Issue
Block a user