From 10f6195bacd2bb1596cd523da04529692a926034 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 9 Nov 2022 08:50:34 -0800
Subject: [PATCH] Always use pikepdf, then pdf2image if needed to check for
 barcodes instead of requiring/allowing configuration

---
 docs/configuration.rst               | 10 -----
 src/documents/barcodes.py            | 40 ++++++++------------
 src/documents/tests/test_barcodes.py | 56 +---------------------------
 src/paperless/settings.py            |  5 ---
 4 files changed, 16 insertions(+), 95 deletions(-)
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 8a3a25252..f9c4f89cc 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
 
     Defaults to false.
 
-PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
-    Enables the legacy method of detecting barcodes.  By default, images are
-    extracted directly from the PDF structure for barcode detection.  If this
-    configuration value is set, images of the whole PDF page will be used instead.
-
-    This is a slower and more memory intensive process, but may be required for
-    certain files, depending on how it is produced and how images are encoded.
-
-    Defaults to false.
-
 
 PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
     Whether TIFF image files should be scanned for barcodes.
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 1f5e33d37..9533742a8 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -10,12 +10,10 @@ from typing import Tuple
 import magic
 from django.conf import settings
 from pdf2image import convert_from_path
-from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
 from pikepdf import PasswordError
 from pikepdf import Pdf
 from pikepdf import PdfImage
-from pikepdf.models.image import HifiPrintImageNotTranscodableError
 from PIL import Image
 from PIL import ImageSequence
 from pyzbar import pyzbar
@@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
                 images[0].save(newpath)
             else:
                 images[0].save(newpath, save_all=True, append_images=images[1:])
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
             logger.warning(
                 f"Could not save the file as pdf. Error: {str(e)}",
             )
@@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                 for image_key in page.images:
                     pdfimage = PdfImage(page.images[image_key])
 
+                    # This type is known to have issues:
+                    # https://github.com/pikepdf/pikepdf/issues/401
                     if "/CCITTFaxDecode" in pdfimage.filters:
                         raise BarcodeImageFormatError(
                             "Unable to decode CCITTFaxDecode images",
                         )
 
                     # Not all images can be transcoded to a PIL image, which
-                    # is what pyzbar expects to receive
+                    # is what pyzbar expects to receive, so this may
+                    # raise an exception, triggering fallback
                     pillow_img = pdfimage.as_pil_image()
 
                     detected_barcodes = barcode_reader(pillow_img)
@@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
         if mime_type == "image/tiff":
             pdf_filepath = convert_from_tiff_to_pdf(filepath)
 
-        # Chose the scanner
-        if settings.CONSUMER_USE_LEGACY_DETECTION:
-            logger.debug("Using pdf2image for barcodes")
-            scanner_function = _pdf2image_barcode_scan
-        else:
-            logger.debug("Using pikepdf for barcodes")
-            scanner_function = _pikepdf_barcode_scan
-
-        # Run the scanner
+        # Always try pikepdf first, it's usually fine, faster and
+        # uses less memory
         try:
-            scanner_function(pdf_filepath)
-        # Neither method can handle password protected PDFs without it being
-        # provided.  Log it and continue
-        except (PasswordError, PDFPageCountError) as e:
+            _pikepdf_barcode_scan(pdf_filepath)
+        # Password protected files can't be checked
+        except PasswordError as e:
             logger.warning(
-                f"File is likely password protected, not splitting: {e}",
+                f"File is likely password protected, not checking for barcodes: {e}",
             )
-        # Handle pikepdf related image decoding issues with a fallback
-        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
+        # Handle pikepdf related image decoding issues with a fallback to page
+        # by page conversion to images in a temporary directory
+        except Exception as e:
             logger.warning(
                 f"Falling back to pdf2image because: {e}",
             )
             try:
+                # Clear the list in case some processing worked
                 separator_page_numbers = []
                 _pdf2image_barcode_scan(pdf_filepath)
             # This file is really borked, allow the consumption to continue
@@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
                 logger.warning(
                     f"Exception during barcode scanning: {e}",
                 )
-        # We're not sure what happened, but allow the consumption to continue
-        except Exception as e:  # pragma: no cover
-            logger.warning(
-                f"Exception during barcode scanning: {e}",
-            )
 
     else:
         logger.warning(
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index f4e5fce14..ef086626d 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
         self.assertTrue(os.path.isfile(target_file1))
         self.assertTrue(os.path.isfile(target_file2))
 
-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_barcode_splitter_legacy_fallback(self):
-        """
-        GIVEN:
-            - File containing barcode
-            - Legacy method of detection is enabled
-        WHEN:
-            - File is scanned for barcodes
-        THEN:
-            - Barcodes are properly detected
-        """
-        test_file = os.path.join(
-            self.BARCODE_SAMPLE_DIR,
-            "patch-code-t-middle.pdf",
-        )
-        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
-
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(test_file, pdf_file)
-        self.assertTrue(len(separator_page_numbers) > 0)
-
-        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
-        self.assertTrue(document_list)
-        for document in document_list:
-            barcodes.save_to_dir(document, target_dir=tempdir)
-
-        target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
-        target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
-
-        self.assertTrue(os.path.isfile(target_file1))
-        self.assertTrue(os.path.isfile(target_file2))
-
     @override_settings(CONSUMER_ENABLE_BARCODES=True)
     def test_consume_barcode_file(self):
         test_file = os.path.join(
@@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
         with mock.patch("documents.tasks.async_to_sync"):
             self.assertEqual(tasks.consume_file(dst), "File successfully split")
 
-    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
+    def test_scan_file_for_separating_barcodes_password(self):
         """
         GIVEN:
             - Password protected PDF
@@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
 
         self.assertEqual(pdf_file, test_file)
         self.assertListEqual(separator_page_numbers, [])
-
-    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
-    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
-        """
-        GIVEN:
-            - Password protected PDF
-            - pdf2image based scanning
-        WHEN:
-            - File is scanned for barcode
-        THEN:
-            - Scanning handle the exception without exception
-        """
-        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
-        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
-            test_file,
-        )
-
-        self.assertEqual(pdf_file, test_file)
-        self.assertListEqual(separator_page_numbers, [])
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 5547d6c33..456e15745 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
     "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
 )
 
-CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
-    "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
-    "NO",
-)
-
 CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
     "PAPERLESS_CONSUMER_BARCODE_STRING",
     "PATCHT",