mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge pull request #766 from paperless-ngx/feature-barcode-tiff-support
Feature barcode tiff support
This commit is contained in:
		@@ -629,8 +629,19 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
 | 
				
			|||||||
    If no barcodes are detected in the uploaded file, no page separation
 | 
					    If no barcodes are detected in the uploaded file, no page separation
 | 
				
			||||||
    will happen.
 | 
					    will happen.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    The original document will be removed and the separated pages will be
 | 
				
			||||||
 | 
					    saved as pdf.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Defaults to false.
 | 
					    Defaults to false.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
 | 
				
			||||||
 | 
					    Whether TIFF image files should be scanned for barcodes.
 | 
				
			||||||
 | 
					    This will automatically convert any TIFF image(s) to pdfs for later
 | 
				
			||||||
 | 
					    processing.
 | 
				
			||||||
 | 
					    This only has an effect if PAPERLESS_CONSUMER_ENABLE_BARCODES has been
 | 
				
			||||||
 | 
					    enabled.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Defaults to false.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
 | 
					PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
 | 
				
			||||||
  Defines the string to be detected as a separator barcode.
 | 
					  Defines the string to be detected as a separator barcode.
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -22,6 +22,8 @@ from documents.models import Tag
 | 
				
			|||||||
from documents.sanity_checker import SanityCheckFailedException
 | 
					from documents.sanity_checker import SanityCheckFailedException
 | 
				
			||||||
from pdf2image import convert_from_path
 | 
					from pdf2image import convert_from_path
 | 
				
			||||||
from pikepdf import Pdf
 | 
					from pikepdf import Pdf
 | 
				
			||||||
 | 
					from PIL import Image
 | 
				
			||||||
 | 
					from PIL import ImageSequence
 | 
				
			||||||
from pyzbar import pyzbar
 | 
					from pyzbar import pyzbar
 | 
				
			||||||
from whoosh.writing import AsyncWriter
 | 
					from whoosh.writing import AsyncWriter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -93,9 +95,41 @@ def barcode_reader(image) -> List[str]:
 | 
				
			|||||||
    return barcodes
 | 
					    return barcodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_from_tiff_to_pdf(filepath: str) -> str:
					    """
					    Convert a TIFF image file to a PDF inside a new temporary directory.

					    The PDF keeps the base name of the source file and is written to a
					    freshly created "paperless-" directory below settings.SCRATCH_DIR.
					    Every frame of the TIFF is converted to RGB and becomes one page.

					    Returns the path of the new PDF file. Returns None (despite the
					    ``str`` annotation) if the file extension is not .tif/.tiff or if
					    the PDF could not be written; both cases are logged as warnings.
					    Errors raised while opening the image propagate to the caller.
					    """
					    import shutil

					    file_name, file_extension = os.path.splitext(os.path.basename(filepath))
					    file_extension = file_extension.lower()

					    # Reject unsupported input before creating anything on disk, so that
					    # no empty temporary directory is left behind.
					    if file_extension not in {".tif", ".tiff"}:
					        logger.warning(f"Cannot convert from {file_extension} to pdf.")
					        return None

					    with Image.open(filepath) as image:
					        # one RGB copy per frame of the (possibly multi-page) TIFF
					        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]

					        # Create the target directory only once the image could be read.
					        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
					        # use old file name with pdf extension
					        newpath = os.path.join(tempdir, file_name + ".pdf")
					        try:
					            if len(images) == 1:
					                images[0].save(newpath)
					            else:
					                images[0].save(newpath, save_all=True, append_images=images[1:])
					        except OSError as e:
					            logger.warning(
					                f"Could not save the file as pdf. Error: {str(e)}",
					            )
					            # Do not leave a partially written file or an empty
					            # directory in the scratch dir.
					            shutil.rmtree(tempdir, ignore_errors=True)
					            return None
					    return newpath
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
 | 
					def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Scan the provided file for page separating barcodes
 | 
					    Scan the provided pdf file for page separating barcodes
 | 
				
			||||||
    Returns a list of pagenumbers, which separate the file
 | 
					    Returns a list of pagenumbers, which separate the file
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    separator_page_numbers = []
 | 
					    separator_page_numbers = []
 | 
				
			||||||
@@ -112,7 +146,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
 | 
					def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Separate the provided file on the pages_to_split_on.
 | 
					    Separate the provided pdf file on the pages_to_split_on.
 | 
				
			||||||
    The pages which are defined by page_numbers will be removed.
 | 
					    The pages which are defined by page_numbers will be removed.
 | 
				
			||||||
    Returns a list of (temporary) filepaths to consume.
 | 
					    Returns a list of (temporary) filepaths to consume.
 | 
				
			||||||
    These will need to be deleted later.
 | 
					    These will need to be deleted later.
 | 
				
			||||||
@@ -195,42 +229,70 @@ def consume_file(
 | 
				
			|||||||
    if settings.CONSUMER_ENABLE_BARCODES:
 | 
					    if settings.CONSUMER_ENABLE_BARCODES:
 | 
				
			||||||
        separators = []
 | 
					        separators = []
 | 
				
			||||||
        document_list = []
 | 
					        document_list = []
 | 
				
			||||||
        separators = scan_file_for_separating_barcodes(path)
 | 
					        converted_tiff = None
 | 
				
			||||||
        if separators:
 | 
					        if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
 | 
				
			||||||
            logger.debug(f"Pages with separators found in: {str(path)}")
 | 
					            supported_extensions = [".pdf", ".tiff", ".tif"]
 | 
				
			||||||
            document_list = separate_pages(path, separators)
 | 
					        else:
 | 
				
			||||||
        if document_list:
 | 
					            supported_extensions = [".pdf"]
 | 
				
			||||||
            for n, document in enumerate(document_list):
 | 
					        file_extension = os.path.splitext(os.path.basename(path))[1].lower()
 | 
				
			||||||
                # save to consumption dir
 | 
					        if file_extension not in supported_extensions:
 | 
				
			||||||
                # rename it to the original filename  with number prefix
 | 
					            # if not supported, skip this routine
 | 
				
			||||||
                if override_filename:
 | 
					            logger.warning(
 | 
				
			||||||
                    newname = f"{str(n)}_" + override_filename
 | 
					                f"Unsupported file format for barcode reader: {str(file_extension)}",
 | 
				
			||||||
                else:
 | 
					            )
 | 
				
			||||||
                    newname = None
 | 
					        else:
 | 
				
			||||||
                save_to_dir(document, newname=newname)
 | 
					            if file_extension in {".tif", ".tiff"}:
 | 
				
			||||||
            # if we got here, the document was successfully split
 | 
					                file_to_process = convert_from_tiff_to_pdf(path)
 | 
				
			||||||
            # and can safely be deleted
 | 
					            else:
 | 
				
			||||||
            logger.debug("Deleting file {}".format(path))
 | 
					                file_to_process = path
 | 
				
			||||||
            os.unlink(path)
 | 
					
 | 
				
			||||||
            # notify the sender, otherwise the progress bar
 | 
					            separators = scan_file_for_separating_barcodes(file_to_process)
 | 
				
			||||||
            # in the UI stays stuck
 | 
					
 | 
				
			||||||
            payload = {
 | 
					            if separators:
 | 
				
			||||||
                "filename": override_filename,
 | 
					                logger.debug(
 | 
				
			||||||
                "task_id": task_id,
 | 
					                    f"Pages with separators found in: {str(path)}",
 | 
				
			||||||
                "current_progress": 100,
 | 
					 | 
				
			||||||
                "max_progress": 100,
 | 
					 | 
				
			||||||
                "status": "SUCCESS",
 | 
					 | 
				
			||||||
                "message": "finished",
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
            try:
 | 
					 | 
				
			||||||
                async_to_sync(get_channel_layer().group_send)(
 | 
					 | 
				
			||||||
                    "status_updates",
 | 
					 | 
				
			||||||
                    {"type": "status_update", "data": payload},
 | 
					 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            except OSError as e:
 | 
					                document_list = separate_pages(file_to_process, separators)
 | 
				
			||||||
                logger.warning("OSError. It could be, the broker cannot be reached.")
 | 
					
 | 
				
			||||||
                logger.warning(str(e))
 | 
					            if document_list:
 | 
				
			||||||
            return "File successfully split"
 | 
					                for n, document in enumerate(document_list):
 | 
				
			||||||
 | 
					                    # save to consumption dir
 | 
				
			||||||
 | 
					                    # rename it to the original filename  with number prefix
 | 
				
			||||||
 | 
					                    if override_filename:
 | 
				
			||||||
 | 
					                        newname = f"{str(n)}_" + override_filename
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        newname = None
 | 
				
			||||||
 | 
					                    save_to_dir(document, newname=newname)
 | 
				
			||||||
 | 
					                # if we got here, the document was successfully split
 | 
				
			||||||
 | 
					                # and can safely be deleted
 | 
				
			||||||
 | 
					                if converted_tiff:
 | 
				
			||||||
 | 
					                    logger.debug("Deleting file {}".format(file_to_process))
 | 
				
			||||||
 | 
					                    os.unlink(file_to_process)
 | 
				
			||||||
 | 
					                logger.debug("Deleting file {}".format(path))
 | 
				
			||||||
 | 
					                os.unlink(path)
 | 
				
			||||||
 | 
					                # notify the sender, otherwise the progress bar
 | 
				
			||||||
 | 
					                # in the UI stays stuck
 | 
				
			||||||
 | 
					                payload = {
 | 
				
			||||||
 | 
					                    "filename": override_filename,
 | 
				
			||||||
 | 
					                    "task_id": task_id,
 | 
				
			||||||
 | 
					                    "current_progress": 100,
 | 
				
			||||||
 | 
					                    "max_progress": 100,
 | 
				
			||||||
 | 
					                    "status": "SUCCESS",
 | 
				
			||||||
 | 
					                    "message": "finished",
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                try:
 | 
				
			||||||
 | 
					                    async_to_sync(get_channel_layer().group_send)(
 | 
				
			||||||
 | 
					                        "status_updates",
 | 
				
			||||||
 | 
					                        {"type": "status_update", "data": payload},
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                except OSError as e:
 | 
				
			||||||
 | 
					                    logger.warning(
 | 
				
			||||||
 | 
					                        "OSError. It could be, the broker cannot be reached.",
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                    logger.warning(str(e))
 | 
				
			||||||
 | 
					                # consuming stops here, since the original document with
 | 
				
			||||||
 | 
					                # the barcodes has been split and will be consumed separately
 | 
				
			||||||
 | 
					                return "File successfully split"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # continue with consumption if no barcode was found
 | 
					    # continue with consumption if no barcode was found
 | 
				
			||||||
    document = Consumer().try_consume_file(
 | 
					    document = Consumer().try_consume_file(
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/simple.tiff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/simple.tiff
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							@@ -204,6 +204,29 @@ class TestTasks(DirectoriesMixin, TestCase):
 | 
				
			|||||||
        img = Image.open(test_file)
 | 
					        img = Image.open(test_file)
 | 
				
			||||||
        self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
 | 
					        self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_convert_from_tiff_to_pdf(self):
					        """
					        A TIFF placed in the scratch dir is converted to an existing
					        file with a .pdf extension.
					        """
					        sample_name = "simple.tiff"
					        sample = os.path.join(os.path.dirname(__file__), "samples", sample_name)
					        scratch_copy = os.path.join(settings.SCRATCH_DIR, sample_name)
					        shutil.copy(sample, scratch_copy)

					        converted = tasks.convert_from_tiff_to_pdf(scratch_copy)
					        _, suffix = os.path.splitext(os.path.basename(converted))

					        self.assertTrue(os.path.isfile(converted))
					        self.assertEqual(suffix, ".pdf")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_convert_error_from_pdf_to_pdf(self):
					        """
					        Handing a PDF to the TIFF converter yields None instead of a path.
					        """
					        sample_name = "simple.pdf"
					        sample = os.path.join(os.path.dirname(__file__), "samples", sample_name)
					        scratch_copy = os.path.join(settings.SCRATCH_DIR, sample_name)
					        shutil.copy(sample, scratch_copy)

					        result = tasks.convert_from_tiff_to_pdf(scratch_copy)
					        self.assertIsNone(result)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_scan_file_for_separating_barcodes(self):
 | 
					    def test_scan_file_for_separating_barcodes(self):
 | 
				
			||||||
        test_file = os.path.join(
 | 
					        test_file = os.path.join(
 | 
				
			||||||
            os.path.dirname(__file__),
 | 
					            os.path.dirname(__file__),
 | 
				
			||||||
@@ -400,11 +423,64 @@ class TestTasks(DirectoriesMixin, TestCase):
 | 
				
			|||||||
            "barcodes",
 | 
					            "barcodes",
 | 
				
			||||||
            "patch-code-t-middle.pdf",
 | 
					            "patch-code-t-middle.pdf",
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd")
 | 
					        dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
 | 
				
			||||||
        shutil.copy(test_file, dst)
 | 
					        shutil.copy(test_file, dst)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(tasks.consume_file(dst), "File successfully split")
 | 
					        self.assertEqual(tasks.consume_file(dst), "File successfully split")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @override_settings(
					        CONSUMER_ENABLE_BARCODES=True,
					        CONSUMER_BARCODE_TIFF_SUPPORT=True,
					    )
					    def test_consume_barcode_tiff_file(self):
					        """
					        With barcode and TIFF support enabled, consuming a TIFF that
					        contains a separator barcode reports a successful split.
					        """
					        sample_name = "patch-code-t-middle.tiff"
					        sample = os.path.join(
					            os.path.dirname(__file__),
					            "samples",
					            "barcodes",
					            sample_name,
					        )
					        scratch_copy = os.path.join(settings.SCRATCH_DIR, sample_name)
					        shutil.copy(sample, scratch_copy)

					        outcome = tasks.consume_file(scratch_copy)
					        self.assertEqual(outcome, "File successfully split")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @override_settings(
					        CONSUMER_ENABLE_BARCODES=True,
					        CONSUMER_BARCODE_TIFF_SUPPORT=True,
					    )
					    @mock.patch("documents.consumer.Consumer.try_consume_file")
					    def test_consume_barcode_unsupported_jpg_file(self, m):
					        """
					        Barcode and TIFF support are both enabled and an image type the
					        barcode reader does not handle (here: jpg) is consumed.

					        Expected: a single warning about the unsupported format is
					        logged, no barcode scan takes place and the file is passed on
					        to the regular consumer without any overrides.
					        """
					        sample_name = "simple.jpg"
					        sample = os.path.join(os.path.dirname(__file__), "samples", sample_name)
					        scratch_copy = os.path.join(settings.SCRATCH_DIR, sample_name)
					        shutil.copy(sample, scratch_copy)

					        with self.assertLogs("paperless.tasks", level="WARNING") as cm:
					            outcome = tasks.consume_file(scratch_copy)
					            self.assertIn("Success", outcome)

					        expected_log = [
					            "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg",
					        ]
					        self.assertEqual(cm.output, expected_log)
					        m.assert_called_once()

					        # the consumer must have been called without any override values
					        _, kwargs = m.call_args
					        for key in (
					            "override_filename",
					            "override_title",
					            "override_correspondent_id",
					            "override_document_type_id",
					            "override_tag_ids",
					        ):
					            self.assertIsNone(kwargs[key])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("documents.tasks.sanity_checker.check_sanity")
 | 
					    @mock.patch("documents.tasks.sanity_checker.check_sanity")
 | 
				
			||||||
    def test_sanity_check_success(self, m):
 | 
					    def test_sanity_check_success(self, m):
 | 
				
			||||||
        m.return_value = SanityCheckMessages()
 | 
					        m.return_value = SanityCheckMessages()
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -503,6 +503,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
 | 
				
			|||||||
    "PAPERLESS_CONSUMER_ENABLE_BARCODES",
 | 
					    "PAPERLESS_CONSUMER_ENABLE_BARCODES",
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Whether TIFF files are also scanned for separator barcodes (they are
					# converted to PDF first). Only consulted when CONSUMER_ENABLE_BARCODES
					# is on; documented default is false.
					CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean("PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
 | 
					CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
 | 
					OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user