Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration

2025-07-30 18:27:45 -05:00 · 2022-11-09 08:50:34 -08:00
parent 1d0cf77e7e
commit 10f6195bac
4 changed files with 16 additions and 95 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
    Defaults to false.
 PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
    Enables the legacy method of detecting barcodes.  By default, images are
    extracted directly from the PDF structure for barcode detection.  If this
    configuration value is set, images of the whole PDF page will be used instead.
    This is a slower and more memory intensive process, but may be required for
    certain files, depending on how they are produced and how images are encoded.
    Defaults to false.
 PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
    Whether TIFF image files should be scanned for barcodes.
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -10,12 +10,10 @@ from typing import Tuple
 import magic
 from django.conf import settings
 from pdf2image import convert_from_path
 from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
 from pikepdf import PasswordError
 from pikepdf import Pdf
 from pikepdf import PdfImage
 from pikepdf.models.image import HifiPrintImageNotTranscodableError
 from PIL import Image
 from PIL import ImageSequence
 from pyzbar import pyzbar
@@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
                images[0].save(newpath)
            else:
                images[0].save(newpath, save_all=True, append_images=images[1:])
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
@@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                for image_key in page.images:
                    pdfimage = PdfImage(page.images[image_key])
                    # This type is known to have issues:
                    # https://github.com/pikepdf/pikepdf/issues/401
                    if "/CCITTFaxDecode" in pdfimage.filters:
                        raise BarcodeImageFormatError(
                            "Unable to decode CCITTFaxDecode images",
                        )
                    # Not all images can be transcoded to a PIL image, which
-                    # is what pyzbar expects to receive
+                    # is what pyzbar expects to receive, so this may
                    # raise an exception, triggering fallback
                    pillow_img = pdfimage.as_pil_image()
                    detected_barcodes = barcode_reader(pillow_img)
@@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
        if mime_type == "image/tiff":
            pdf_filepath = convert_from_tiff_to_pdf(filepath)
-        # Choose the scanner
+        # Always try pikepdf first, it's usually fine, faster and
-        if settings.CONSUMER_USE_LEGACY_DETECTION:
+        # uses less memory
            logger.debug("Using pdf2image for barcodes")
            scanner_function = _pdf2image_barcode_scan
        else:
            logger.debug("Using pikepdf for barcodes")
            scanner_function = _pikepdf_barcode_scan
        # Run the scanner
        try:
-            scanner_function(pdf_filepath)
+            _pikepdf_barcode_scan(pdf_filepath)
-        # Neither method can handle password protected PDFs without it being
+        # Password protected files can't be checked
-        # provided.  Log it and continue
+        except PasswordError as e:
        except (PasswordError, PDFPageCountError) as e:
            logger.warning(
-                f"File is likely password protected, not splitting: {e}",
+                f"File is likely password protected, not checking for barcodes: {e}",
            )
-        # Handle pikepdf related image decoding issues with a fallback
+        # Handle pikepdf related image decoding issues with a fallback to page
-        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+        # by page conversion to images in a temporary directory
        except Exception as e:
            logger.warning(
                f"Falling back to pdf2image because: {e}",
            )
            try:
                # Clear the list in case some processing worked
                separator_page_numbers = []
                _pdf2image_barcode_scan(pdf_filepath)
            # This file is really borked, allow the consumption to continue
@@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                logger.warning(
                    f"Exception during barcode scanning: {e}",
                )
        # We're not sure what happened, but allow the consumption to continue
        except Exception as e:  # pragma: no cover
            logger.warning(
                f"Exception during barcode scanning: {e}",
            )
    else:
        logger.warning(
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
        self.assertTrue(os.path.isfile(target_file1))
        self.assertTrue(os.path.isfile(target_file2))
    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
    def test_barcode_splitter_legacy_fallback(self):
        """
        GIVEN:
            - File containing barcode
            - Legacy method of detection is enabled
        WHEN:
            - File is scanned for barcodes
        THEN:
            - Barcodes are properly detected
            - Splitting on the detected pages produces two documents on disk
        """
        test_file = os.path.join(
            self.BARCODE_SAMPLE_DIR,
            "patch-code-t-middle.pdf",
        )
        # Scratch directory that receives the split documents
        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
            test_file,
        )
        # The returned path is the input file itself, and at least one
        # separator page must have been found
        self.assertEqual(test_file, pdf_file)
        self.assertTrue(len(separator_page_numbers) > 0)
        # Split at the detected separator pages and write each part out
        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
        self.assertTrue(document_list)
        for document in document_list:
            barcodes.save_to_dir(document, target_dir=tempdir)
        # Two output files, named after the source file, are expected
        target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
        target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
        self.assertTrue(os.path.isfile(target_file1))
        self.assertTrue(os.path.isfile(target_file2))
    @override_settings(CONSUMER_ENABLE_BARCODES=True)
    def test_consume_barcode_file(self):
        test_file = os.path.join(
@@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
        with mock.patch("documents.tasks.async_to_sync"):
            self.assertEqual(tasks.consume_file(dst), "File successfully split")
-    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+    def test_scan_file_for_separating_barcodes_password(self):
        """
        GIVEN:
            - Password protected PDF
@@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
        self.assertEqual(pdf_file, test_file)
        self.assertListEqual(separator_page_numbers, [])
    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
        """
        GIVEN:
            - Password protected PDF
            - pdf2image based scanning
        WHEN:
            - File is scanned for barcode
        THEN:
            - Scanning handles the failure without raising an exception
            - No separator pages are reported
        """
        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
            test_file,
        )
        # The input path is returned unchanged and nothing is marked for splitting
        self.assertEqual(pdf_file, test_file)
        self.assertListEqual(separator_page_numbers, [])
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
 )
 # Enables the legacy (whole-page image) barcode detection method.
 # Read from PAPERLESS_CONSUMER_USE_LEGACY_DETECTION; the second argument
 # is presumably the default value (documented as false) - confirm against
 # __get_boolean.
 CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
    "NO",
 )
 CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
    "PAPERLESS_CONSUMER_BARCODE_STRING",
    "PATCHT",