def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Convert a TIFF image file into a pdf inside a new temporary directory.

    Every frame found in the TIFF is converted to RGB and becomes one page
    of the pdf. The pdf keeps the base name of the source file and is
    written into a freshly created "paperless-" prefixed directory below
    settings.SCRATCH_DIR. The source file itself is left untouched and the
    created pdf is not removed by this function.

    :param filepath: path of the image to convert. Only the extensions
        ".tif" and ".tiff" (compared case-insensitively) are accepted.
    :return: path of the newly written pdf. NOTE: despite the annotation,
        None is returned when the extension is not a TIFF one or when
        the pdf could not be saved; callers have to check for that.
    """
    # function-scope import: only needed to clean up on the failure path
    import shutil

    file_name, file_extension = os.path.splitext(os.path.basename(filepath))
    file_extension = file_extension.lower()

    # Reject unsupported input first, so that a refused file does not
    # leave an empty temporary directory behind.
    if file_extension not in {".tif", ".tiff"}:
        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
        return None

    with Image.open(filepath) as image:
        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]

        # Create the target directory only once the image could be read.
        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
        # use old file name with pdf extension
        newpath = os.path.join(tempdir, file_name + ".pdf")
        try:
            if len(images) == 1:
                images[0].save(newpath)
            else:
                # remaining frames are appended as additional pdf pages
                images[0].save(newpath, save_all=True, append_images=images[1:])
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
            # nothing usable was produced, drop the temporary directory again
            shutil.rmtree(tempdir, ignore_errors=True)
            return None
    return newpath
@@ -195,42 +229,70 @@ def consume_file( if settings.CONSUMER_ENABLE_BARCODES: separators = [] document_list = [] - separators = scan_file_for_separating_barcodes(path) - if separators: - logger.debug(f"Pages with separators found in: {str(path)}") - document_list = separate_pages(path, separators) - if document_list: - for n, document in enumerate(document_list): - # save to consumption dir - # rename it to the original filename with number prefix - if override_filename: - newname = f"{str(n)}_" + override_filename - else: - newname = None - save_to_dir(document, newname=newname) - # if we got here, the document was successfully split - # and can safely be deleted - logger.debug("Deleting file {}".format(path)) - os.unlink(path) - # notify the sender, otherwise the progress bar - # in the UI stays stuck - payload = { - "filename": override_filename, - "task_id": task_id, - "current_progress": 100, - "max_progress": 100, - "status": "SUCCESS", - "message": "finished", - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, + converted_tiff = None + if settings.CONSUMER_BARCODE_TIFF_SUPPORT: + supported_extensions = [".pdf", ".tiff", ".tif"] + else: + supported_extensions = [".pdf"] + file_extension = os.path.splitext(os.path.basename(path))[1].lower() + if file_extension not in supported_extensions: + # if not supported, skip this routine + logger.warning( + f"Unsupported file format for barcode reader: {str(file_extension)}", + ) + else: + if file_extension in {".tif", ".tiff"}: + file_to_process = convert_from_tiff_to_pdf(path) + else: + file_to_process = path + + separators = scan_file_for_separating_barcodes(file_to_process) + + if separators: + logger.debug( + f"Pages with separators found in: {str(path)}", ) - except OSError as e: - logger.warning("OSError. 
It could be, the broker cannot be reached.") - logger.warning(str(e)) - return "File successfully split" + document_list = separate_pages(file_to_process, separators) + + if document_list: + for n, document in enumerate(document_list): + # save to consumption dir + # rename it to the original filename with number prefix + if override_filename: + newname = f"{str(n)}_" + override_filename + else: + newname = None + save_to_dir(document, newname=newname) + # if we got here, the document was successfully split + # and can safely be deleted + if converted_tiff: + logger.debug("Deleting file {}".format(file_to_process)) + os.unlink(file_to_process) + logger.debug("Deleting file {}".format(path)) + os.unlink(path) + # notify the sender, otherwise the progress bar + # in the UI stays stuck + payload = { + "filename": override_filename, + "task_id": task_id, + "current_progress": 100, + "max_progress": 100, + "status": "SUCCESS", + "message": "finished", + } + try: + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + except OSError as e: + logger.warning( + "OSError. 
def test_convert_from_tiff_to_pdf(self):
    """
    A TIFF sample copied into the scratch directory must be converted
    into an existing file that carries the ".pdf" extension.
    """
    test_file = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "simple.tiff",
    )
    dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
    shutil.copy(test_file, dst)
    target_file = tasks.convert_from_tiff_to_pdf(dst)
    # Check for None before any path handling, so that a failed
    # conversion shows up as an assertion failure and not as a TypeError
    # raised by os.path.
    self.assertIsNotNone(target_file)
    self.assertTrue(os.path.isfile(target_file))
    file_extension = os.path.splitext(os.path.basename(target_file))[1]
    self.assertEqual(file_extension, ".pdf")
@override_settings(
    CONSUMER_ENABLE_BARCODES=True,
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
@mock.patch("documents.consumer.Consumer.try_consume_file")
def test_consume_barcode_unsupported_jpg_file(self, m):
    """
    Barcode scanning and TIFF support are both switched on and an image
    type outside the supported set (a jpg) is consumed.

    Expected: exactly one "unsupported file format" warning is logged,
    the result reports success, and the (mocked) consumer is called once
    with every override left at None.
    """
    sample = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
    shutil.copy(sample, scratch_copy)

    expected_log = [
        "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg",
    ]
    with self.assertLogs("paperless.tasks", level="WARNING") as captured:
        result = tasks.consume_file(scratch_copy)
        self.assertIn("Success", result)
    self.assertEqual(captured.output, expected_log)
    m.assert_called_once()

    # only the keyword arguments handed to the consumer are of interest
    _, call_kwargs = m.call_args
    for option in (
        "override_filename",
        "override_title",
        "override_correspondent_id",
        "override_document_type_id",
        "override_tag_ids",
    ):
        self.assertIsNone(call_kwargs[option])