A couple of fixes and support for more image file types

This commit is contained in:
jonaswinkler 2020-12-02 17:39:49 +01:00
parent 5e1543bad5
commit e3ce573fbb
8 changed files with 41 additions and 4 deletions

View File

@ -268,8 +268,9 @@ def update_filename_and_move_files(sender, instance, **kwargs):
logging.getLogger(__name__).debug(
f"Moved file {old_source_path} to {new_source_path}.")
logging.getLogger(__name__).debug(
f"Moved file {old_archive_path} to {new_archive_path}.")
if instance.archive_checksum:
logging.getLogger(__name__).debug(
f"Moved file {old_archive_path} to {new_archive_path}.")
except OSError as e:
instance.filename = old_filename

View File

@ -65,7 +65,10 @@ class RasterisedDocumentParser(DocumentParser):
def is_image(self, mime_type):
    """Return whether ``mime_type`` is one of the image types listed below.

    Every MIME type is its own comma-terminated entry: two adjacent string
    literals without a comma between them are joined by Python into a
    single string, which would silently stop both types from matching.
    """
    return mime_type in (
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    )
def get_dpi(self, image):

View File

@ -8,6 +8,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
"mime_types": {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png"
"image/png": ".png",
"image/tiff": ".tif",
"image/gif": ".gif",
"image/bmp": ".bmp",
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

View File

@ -247,3 +247,33 @@ class TestParser(DirectoriesMixin, TestCase):
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
self.assertTrue(os.path.join(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
class TestParserFileTypes(DirectoriesMixin, TestCase):
    """Parse one sample file per image format with RasterisedDocumentParser.

    Each test parses a file from the ``samples`` directory next to this
    module, then checks that the parser's archive path points at an existing
    file and that the extracted text contains the expected phrase.
    """

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def _assert_sample_parses(self, file_name, mime_type):
        """Parse ``file_name`` as ``mime_type`` and check archive and text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, file_name), mime_type)

        self.assertTrue(os.path.isfile(parser.archive_path))
        # assertIn reports the extracted text on failure, unlike
        # assertTrue(... in ...), which only reports "False is not true".
        self.assertIn("this is a test document", parser.get_text().lower())

    def test_bmp(self):
        self._assert_sample_parses("simple.bmp", "image/bmp")

    def test_jpg(self):
        self._assert_sample_parses("simple.jpg", "image/jpeg")

    # NOTE(review): only the GIF test overrides OCR_IMAGE_DPI; presumably the
    # sample GIF carries no usable DPI information -- confirm.
    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self):
        self._assert_sample_parses("simple.gif", "image/gif")

    def test_tiff(self):
        self._assert_sample_parses("simple.tif", "image/tiff")