diff --git a/docs/configuration.rst b/docs/configuration.rst
index 3541f2e07..248d7a492 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -626,6 +626,12 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=
 
     Defaults to false.
 
+PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=
+    Whether TIFF image files should be scanned for barcodes.
+    This will automatically convert any TIFF image(s) to PDFs for later
+    processing.
+
+    Defaults to false.
 
 PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
     Defines the string to be detected as a separator barcode.
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index e9a015d67..dfa2c6bcd 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -22,6 +22,8 @@ from documents.models import Tag
 from documents.sanity_checker import SanityCheckFailedException
 from pdf2image import convert_from_path
 from pikepdf import Pdf
+from PIL import Image
+from PIL import ImageSequence
 from pyzbar import pyzbar
 from whoosh.writing import AsyncWriter
 
@@ -93,6 +95,43 @@ def barcode_reader(image) -> List[str]:
     return barcodes
 
 
+def convert_from_tiff_to_pdf(filepath: str) -> str:
+    """
+    Convert a TIFF image file (single- or multi-page) to a PDF.
+
+    The PDF is written next to the source file, using the same name with a
+    ".pdf" extension. The source file is deleted once the PDF was saved.
+
+    Returns the path of the PDF, or an empty string if the file extension
+    is neither ".tif" nor ".tiff". If saving fails, a warning is logged, the
+    source file is kept and the intended PDF path is still returned.
+    """
+    basepath, file_extension = os.path.splitext(filepath)
+    if file_extension not in (".tif", ".tiff"):
+        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
+        return ""
+    # swap only the extension; str.replace() would also rewrite any other
+    # ".tif" occurrence in the path (e.g. in a directory name)
+    newpath = basepath + ".pdf"
+    # convert() returns copies that no longer depend on the open file, so the
+    # source can be closed before it is deleted
+    with Image.open(filepath) as image:
+        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]
+    try:
+        if len(images) == 1:
+            images[0].save(newpath)
+        else:
+            images[0].save(newpath, save_all=True, append_images=images[1:])
+        os.unlink(filepath)
+    except OSError as e:
+        logger.warning(
+            f"Could not save the file as pdf. "
+            f"The original image file was not deleted. Error: "
+            f"{str(e)}",
+        )
+    return newpath
+
+
 def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
     """
     Scan the provided file for page separating barcodes
@@ -195,42 +234,56 @@ def consume_file(
     if settings.CONSUMER_ENABLE_BARCODES:
         separators = []
         document_list = []
-        separators = scan_file_for_separating_barcodes(path)
-        if separators:
-            logger.debug(f"Pages with separators found in: {str(path)}")
-            document_list = separate_pages(path, separators)
-        if document_list:
-            for n, document in enumerate(document_list):
-                # save to consumption dir
-                # rename it to the original filename with number prefix
-                if override_filename:
-                    newname = f"{str(n)}_" + override_filename
-                else:
-                    newname = None
-                save_to_dir(document, newname=newname)
-            # if we got here, the document was successfully split
-            # and can safely be deleted
-            logger.debug("Deleting file {}".format(path))
-            os.unlink(path)
-            # notify the sender, otherwise the progress bar
-            # in the UI stays stuck
-            payload = {
-                "filename": override_filename,
-                "task_id": task_id,
-                "current_progress": 100,
-                "max_progress": 100,
-                "status": "SUCCESS",
-                "message": "finished",
-            }
-            try:
-                async_to_sync(get_channel_layer().group_send)(
-                    "status_updates",
-                    {"type": "status_update", "data": payload},
-                )
-            except OSError as e:
-                logger.warning("OSError. It could be, the broker cannot be reached.")
-                logger.warning(str(e))
-            return "File successfully split"
+        if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
+            supported_extensions = [".pdf", ".tiff", ".tif"]
+        else:
+            supported_extensions = [".pdf"]
+        file_extension = os.path.splitext(os.path.basename(path))[1]
+        if file_extension not in supported_extensions:
+            logger.warning(
+                f"Unsupported file format for barcode reader: {str(file_extension)}",
+            )
+        else:
+            if file_extension == ".tif" or file_extension == ".tiff":
+                path = convert_from_tiff_to_pdf(path)
+            separators = scan_file_for_separating_barcodes(path)
+            if separators:
+                logger.debug(f"Pages with separators found in: {str(path)}")
+                document_list = separate_pages(path, separators)
+            if document_list:
+                for n, document in enumerate(document_list):
+                    # save to consumption dir
+                    # rename it to the original filename with number prefix
+                    if override_filename:
+                        newname = f"{str(n)}_" + override_filename
+                    else:
+                        newname = None
+                    save_to_dir(document, newname=newname)
+                # if we got here, the document was successfully split
+                # and can safely be deleted
+                logger.debug("Deleting file {}".format(path))
+                os.unlink(path)
+                # notify the sender, otherwise the progress bar
+                # in the UI stays stuck
+                payload = {
+                    "filename": override_filename,
+                    "task_id": task_id,
+                    "current_progress": 100,
+                    "max_progress": 100,
+                    "status": "SUCCESS",
+                    "message": "finished",
+                }
+                try:
+                    async_to_sync(get_channel_layer().group_send)(
+                        "status_updates",
+                        {"type": "status_update", "data": payload},
+                    )
+                except OSError as e:
+                    logger.warning(
+                        "OSError. It could be, the broker cannot be reached.",
+                    )
+                    logger.warning(str(e))
+                return "File successfully split"
 
     # continue with consumption if no barcode was found
     document = Consumer().try_consume_file(
diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle.tiff b/src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
new file mode 100644
index 000000000..33c420b4f
Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle.tiff differ
diff --git a/src/documents/tests/samples/simple.tiff b/src/documents/tests/samples/simple.tiff
new file mode 100644
index 000000000..ef30a8689
Binary files /dev/null and b/src/documents/tests/samples/simple.tiff differ
diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py
index c78fa16c2..df9e10077 100644
--- a/src/documents/tests/test_tasks.py
+++ b/src/documents/tests/test_tasks.py
@@ -204,6 +204,30 @@ class TestTasks(DirectoriesMixin, TestCase):
         img = Image.open(test_file)
         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
 
+    def test_convert_from_tiff_to_pdf(self):
+        test_file = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "simple.tiff",
+        )
+        dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
+        shutil.copy(test_file, dst)
+        target_file = tasks.convert_from_tiff_to_pdf(dst)
+        file_extension = os.path.splitext(os.path.basename(target_file))[1]
+        self.assertTrue(os.path.isfile(target_file))
+        self.assertEqual(file_extension, ".pdf")
+
+    def test_convert_error_from_pdf_to_pdf(self):
+        test_file = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "simple.pdf",
+        )
+        dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
+        shutil.copy(test_file, dst)
+        target_file = tasks.convert_from_tiff_to_pdf(dst)
+        self.assertFalse(os.path.isfile(target_file))
+
     def test_scan_file_for_separating_barcodes(self):
         test_file = os.path.join(
             os.path.dirname(__file__),
@@ -400,7 +424,23 @@ class TestTasks(DirectoriesMixin, TestCase):
             "barcodes",
             "patch-code-t-middle.pdf",
         )
-        dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd")
+        dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
+        shutil.copy(test_file, dst)
+
+        self.assertEqual(tasks.consume_file(dst), "File successfully split")
+
+    @override_settings(
+        CONSUMER_ENABLE_BARCODES=True,
+        CONSUMER_BARCODE_TIFF_SUPPORT=True,
+    )
+    def test_consume_barcode_tiff_file(self):
+        test_file = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "barcodes",
+            "patch-code-t-middle.tiff",
+        )
+        dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
         shutil.copy(test_file, dst)
 
         self.assertEqual(tasks.consume_file(dst), "File successfully split")
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index b267ee10f..ec18cc0ea 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -502,6 +502,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
     "PAPERLESS_CONSUMER_ENABLE_BARCODES",
 )
 
+CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
+    "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
+)
+
 CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
 
 OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")