diff --git a/docs/configuration.rst b/docs/configuration.rst index 5a379cb05..8a3a25252 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -701,6 +701,17 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES= Defaults to false. +PAPERLESS_CONSUMER_USE_LEGACY_DETECTION= + Enables the legacy method of detecting barcodes. By default, images are + extracted directly from the PDF structure for barcode detection. If this + configuration value is set, images of the whole PDF page will be used instead. + + This is a slower and more memory-intensive process, but may be required for + certain files, depending on how they are produced and how images are encoded. + + Defaults to false. + + PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT= Whether TIFF image files should be scanned for barcodes. This will automatically convert any TIFF image(s) to pdfs for later diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index a4be126a5..13e78e181 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -150,16 +150,20 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) - try: - _pikepdf_barcode_scan(pdf_filepath) - except Exception as e: - - logger.warning( - f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}", - ) - # Reset this incase pikepdf got part way through - separator_page_numbers = [] + if settings.CONSUMER_USE_LEGACY_DETECTION: _pdf2image_barcode_scan(pdf_filepath) + else: + try: + _pikepdf_barcode_scan(pdf_filepath) + except Exception as e: + + logger.warning( + f"Exception using pikepdf for barcodes," + f" falling back to pdf2image: {e}", + ) + # Reset this incase pikepdf got part way through + separator_page_numbers = [] + _pdf2image_barcode_scan(pdf_filepath) else: logger.warning( diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index ee8df9f34..1c4ab7cc3 100644 --- 
a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -468,6 +468,41 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file2)) + @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) + def test_barcode_splitter_legacy_fallback(self): + """ + GIVEN: + - File containing barcode + - Legacy method of detection is enabled + WHEN: + - File is scanned for barcodes + THEN: + - Barcodes are properly detected + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.pdf", + ) + tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + + pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( + test_file, + ) + + self.assertEqual(test_file, pdf_file) + self.assertTrue(len(separator_page_numbers) > 0) + + document_list = barcodes.separate_pages(test_file, separator_page_numbers) + self.assertTrue(document_list) + for document in document_list: + barcodes.save_to_dir(document, target_dir=tempdir) + + target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") + target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") + + self.assertTrue(os.path.isfile(target_file1)) + self.assertTrue(os.path.isfile(target_file2)) + @override_settings(CONSUMER_ENABLE_BARCODES=True) def test_consume_barcode_file(self): test_file = os.path.join( diff --git a/src/paperless/settings.py b/src/paperless/settings.py index a262bd501..1fb6ba913 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -558,15 +558,23 @@ CONSUMER_IGNORE_PATTERNS = list( CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") -CONSUMER_ENABLE_BARCODES = __get_boolean( +CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_ENABLE_BARCODES", ) -CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean( +CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean( 
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", ) -CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") +CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean( + "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION", + "NO", +) + +CONSUMER_BARCODE_STRING: Final[str] = os.getenv( + "PAPERLESS_CONSUMER_BARCODE_STRING", + "PATCHT", +) OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))