Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration

2026-02-26 01:09:34 -06:00 · 2022-11-09 08:50:34 -08:00
parent 1d0cf77e7e
commit 10f6195bac
4 changed files with 16 additions and 95 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>

    Defaults to false.

-PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
-    Enables the legacy method of detecting barcodes.  By default, images are
-    extracted directly from the PDF structure for barcode detection.  If this
-    configuration value is set, images of the whole PDF page will be used instead.
-
-    This is a slower and more memory intensive process, but may be required for
-    certain files, depending on how it is produced and how images are encoded.
-
-    Defaults to false.
-

 PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
    Whether TIFF image files should be scanned for barcodes.
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -10,12 +10,10 @@ from typing import Tuple
 import magic
 from django.conf import settings
 from pdf2image import convert_from_path
-from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
 from pikepdf import PasswordError
 from pikepdf import Pdf
 from pikepdf import PdfImage
-from pikepdf.models.image import HifiPrintImageNotTranscodableError
 from PIL import Image
 from PIL import ImageSequence
 from pyzbar import pyzbar
@@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
                images[0].save(newpath)
            else:
                images[0].save(newpath, save_all=True, append_images=images[1:])
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
@@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                for image_key in page.images:
                    pdfimage = PdfImage(page.images[image_key])

+                    # This type is known to have issues:
+                    # https://github.com/pikepdf/pikepdf/issues/401
                    if "/CCITTFaxDecode" in pdfimage.filters:
                        raise BarcodeImageFormatError(
                            "Unable to decode CCITTFaxDecode images",
                        )

                    # Not all images can be transcoded to a PIL image, which
-                    # is what pyzbar expects to receive
+                    # is what pyzbar expects to receive, so this may
+                    # raise an exception, triggering fallback
                    pillow_img = pdfimage.as_pil_image()

                    detected_barcodes = barcode_reader(pillow_img)
@@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
        if mime_type == "image/tiff":
            pdf_filepath = convert_from_tiff_to_pdf(filepath)

-        # Chose the scanner
-        if settings.CONSUMER_USE_LEGACY_DETECTION:
-            logger.debug("Using pdf2image for barcodes")
-            scanner_function = _pdf2image_barcode_scan
-        else:
-            logger.debug("Using pikepdf for barcodes")
-            scanner_function = _pikepdf_barcode_scan
-
-        # Run the scanner
+        # Always try pikepdf first, it's usually fine, faster and
+        # uses less memory
        try:
-            scanner_function(pdf_filepath)
-        # Neither method can handle password protected PDFs without it being
-        # provided.  Log it and continue
-        except (PasswordError, PDFPageCountError) as e:
+            _pikepdf_barcode_scan(pdf_filepath)
+        # Password protected files can't be checked
+        except PasswordError as e:
            logger.warning(
-                f"File is likely password protected, not splitting: {e}",
+                f"File is likely password protected, not checking for barcodes: {e}",
            )
-        # Handle pikepdf related image decoding issues with a fallback
-        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+        # Handle pikepdf related image decoding issues with a fallback to page
+        # by page conversion to images in a temporary directory
+        except Exception as e:
            logger.warning(
                f"Falling back to pdf2image because: {e}",
            )
            try:
+                # Clear the list in case some processing worked
                separator_page_numbers = []
                _pdf2image_barcode_scan(pdf_filepath)
            # This file is really borked, allow the consumption to continue
@@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                logger.warning(
                    f"Exception during barcode scanning: {e}",
                )
-        # We're not sure what happened, but allow the consumption to continue
-        except Exception as e:  # pragma: no cover
-            logger.warning(
-                f"Exception during barcode scanning: {e}",
-            )

    else:
        logger.warning(
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
        self.assertTrue(os.path.isfile(target_file1))
        self.assertTrue(os.path.isfile(target_file2))

-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_barcode_splitter_legacy_fallback(self):
-        """
-        GIVEN:
-            - File containing barcode
-            - Legacy method of detection is enabled
-        WHEN:
-            - File is scanned for barcodes
-        THEN:
-            - Barcodes are properly detected
-        """
-        test_file = os.path.join(
-            self.BARCODE_SAMPLE_DIR,
-            "patch-code-t-middle.pdf",
-        )
-        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
-
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(test_file, pdf_file)
-        self.assertTrue(len(separator_page_numbers) > 0)
-
-        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
-        self.assertTrue(document_list)
-        for document in document_list:
-            barcodes.save_to_dir(document, target_dir=tempdir)
-
-        target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
-        target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
-
-        self.assertTrue(os.path.isfile(target_file1))
-        self.assertTrue(os.path.isfile(target_file2))
-
    @override_settings(CONSUMER_ENABLE_BARCODES=True)
    def test_consume_barcode_file(self):
        test_file = os.path.join(
@@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
        with mock.patch("documents.tasks.async_to_sync"):
            self.assertEqual(tasks.consume_file(dst), "File successfully split")

-    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+    def test_scan_file_for_separating_barcodes_password(self):
        """
        GIVEN:
            - Password protected PDF
@@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):

        self.assertEqual(pdf_file, test_file)
        self.assertListEqual(separator_page_numbers, [])
-
-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
-        """
-        GIVEN:
-            - Password protected PDF
-            - pdf2image based scanning
-        WHEN:
-            - File is scanned for barcode
-        THEN:
-            - Scanning handle the exception without exception
-        """
-        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(pdf_file, test_file)
-        self.assertListEqual(separator_page_numbers, [])
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
 )

-CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
-    "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
-    "NO",
-)
-
 CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
    "PAPERLESS_CONSUMER_BARCODE_STRING",
    "PATCHT",