diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 6df142dc4..4d9dc9ccd 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -268,8 +268,9 @@ def update_filename_and_move_files(sender, instance, **kwargs): logging.getLogger(__name__).debug( f"Moved file {old_source_path} to {new_source_path}.") - logging.getLogger(__name__).debug( - f"Moved file {old_archive_path} to {new_archive_path}.") + if instance.archive_checksum: + logging.getLogger(__name__).debug( + f"Moved file {old_archive_path} to {new_archive_path}.") except OSError as e: instance.filename = old_filename diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index e49d54ac4..4a542b345 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -65,7 +65,10 @@ class RasterisedDocumentParser(DocumentParser): def is_image(self, mime_type): return mime_type in [ "image/png", - "image/jpeg" + "image/jpeg", + "image/tiff", + "image/bmp", + "image/gif", ] def get_dpi(self, image): diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 57363b65e..1e1cd1e1a 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -8,6 +8,9 @@ def tesseract_consumer_declaration(sender, **kwargs): "mime_types": { "application/pdf": ".pdf", "image/jpeg": ".jpg", - "image/png": ".png" + "image/png": ".png", + "image/tiff": ".tif", + "image/gif": ".gif", + "image/bmp": ".bmp", } } diff --git a/src/paperless_tesseract/tests/samples/simple.bmp b/src/paperless_tesseract/tests/samples/simple.bmp new file mode 100644 index 000000000..a25dee50f Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple.bmp differ diff --git a/src/paperless_tesseract/tests/samples/simple.gif b/src/paperless_tesseract/tests/samples/simple.gif new file mode 100644 index 000000000..1c75f7442 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple.gif differ diff --git a/src/paperless_tesseract/tests/samples/simple.jpg b/src/paperless_tesseract/tests/samples/simple.jpg new file mode 100644 index 000000000..53e55129a Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple.jpg differ diff --git a/src/paperless_tesseract/tests/samples/simple.tif b/src/paperless_tesseract/tests/samples/simple.tif new file mode 100644 index 000000000..1621f4496 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple.tif differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index a5f4a7f77..8834ec755 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -247,3 +247,33 @@ class TestParser(DirectoriesMixin, TestCase): parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") self.assertTrue(os.path.join(parser.archive_path)) self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) + + +class TestParserFileTypes(DirectoriesMixin, TestCase): + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + + def test_bmp(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertTrue("this is a test document" in parser.get_text().lower()) + + def test_jpg(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertTrue("this is a test document" in parser.get_text().lower()) + + @override_settings(OCR_IMAGE_DPI=200) + def test_gif(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertTrue("this is a test document" in parser.get_text().lower()) + + def test_tiff(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertTrue("this is a test document" in parser.get_text().lower())