From caf4b54bc7bf828ba170fcc329aa82a0c45da382 Mon Sep 17 00:00:00 2001
From: Trenton H <holmes.trenton@gmail.com>
Date: Tue, 4 Oct 2022 12:55:50 -0700
Subject: [PATCH] In case pikepdf fails to convert an image to a PIL image,
 fall back to converting pages to PIL images

---
 Pipfile                              |  1 +
 Pipfile.lock                         |  8 ++++
 src/documents/barcodes.py            | 43 ++++++++++++++++----
 src/documents/tests/test_barcodes.py | 61 ++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/Pipfile b/Pipfile
index d95af784e..e44ace17e 100644
--- a/Pipfile
+++ b/Pipfile
@@ -57,6 +57,7 @@ celery = {extras = ["redis"], version = "*"}
 django-celery-results = "*"
 setproctitle = "*"
 nltk = "*"
+pdf2image = "*"
 
 [dev-packages]
 coveralls = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index a78cc5ff0..7852a1ced 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -939,6 +939,14 @@
             "index": "pypi",
             "version": "==2.5.2"
         },
+        "pdf2image": {
+            "hashes": [
+                "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65",
+                "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb"
+            ],
+            "index": "pypi",
+            "version": "==1.16.0"
+        },
         "pdfminer.six": {
             "hashes": [
                 "sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2",
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index a30a55bbb..54db83c19 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -9,6 +9,7 @@ from typing import Tuple
 
 import magic
 from django.conf import settings
+from pdf2image import convert_from_path
 from pikepdf import Page
 from pikepdf import Pdf
 from pikepdf import PdfImage
@@ -108,6 +109,30 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
     which separate the file into new files
     """
 
+    def _pikepdf_barcode_scan(pdf_filepath: str):
+        with Pdf.open(pdf_filepath) as pdf:
+            for page_num, page in enumerate(pdf.pages):
+                for image_key in page.images:
+                    pdfimage = PdfImage(page.images[image_key])
+
+                    # Not all images can be transcoded to a PIL image, which
+                    # is what pyzbar expects to receive
+                    pillow_img = pdfimage.as_pil_image()
+
+                    detected_barcodes = barcode_reader(pillow_img)
+
+                    if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
+                        separator_page_numbers.append(page_num)
+
+    def _pdf2image_barcode_scan(pdf_filepath: str):
+        # use a temporary directory in case the file is too big to handle in memory
+        with tempfile.TemporaryDirectory() as path:
+            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
+            for current_page_number, page in enumerate(pages_from_path):
+                current_barcodes = barcode_reader(page)
+                if settings.CONSUMER_BARCODE_STRING in current_barcodes:
+                    separator_page_numbers.append(current_page_number)
+
     separator_page_numbers = []
     pdf_filepath = None
 
@@ -118,17 +143,17 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
         if mime_type == "image/tiff":
             pdf_filepath = convert_from_tiff_to_pdf(filepath)
 
-        pdf = Pdf.open(pdf_filepath)
+        try:
+            _pikepdf_barcode_scan(pdf_filepath)
+        except Exception as e:
 
-        for page_num, page in enumerate(pdf.pages):
-            for image_key in page.images:
-                pdfimage = PdfImage(page.images[image_key])
-                pillow_img = pdfimage.as_pil_image()
+            logger.warning(
+                f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
+            )
+            # Reset this in case pikepdf got part way through
+            separator_page_numbers = []
+            _pdf2image_barcode_scan(pdf_filepath)
 
-                detected_barcodes = barcode_reader(pillow_img)
-
-                if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
-                    separator_page_numbers.append(page_num)
     else:
         logger.warning(
             f"Unsupported file format for barcode reader: {str(mime_type)}",
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index 5de475578..0f16845d2 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -3,6 +3,7 @@ import shutil
 import tempfile
 from unittest import mock
 
+import pikepdf
 from django.conf import settings
 from django.test import override_settings
 from django.test import TestCase
@@ -218,6 +219,66 @@ class TestBarcode(DirectoriesMixin, TestCase):
         self.assertEqual(pdf_file, test_file)
         self.assertListEqual(separator_page_numbers, [1])
 
+    def test_scan_file_for_separating_barcodes_pillow_transcode_error(self):
+        """
+        GIVEN:
+            - A PDF containing an image which cannot be transcoded to a PIL image
+        WHEN:
+            - The image tries to be transcoded to a PIL image, but fails
+        THEN:
+            - The barcode reader is still called, as the scan falls back to pdf2image
+        """
+
+        def _build_device_n_pdf(self, save_path: str):
+            # Based on the pikepdf tests
+            # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
+            pdf = pikepdf.new()
+            pdf.add_blank_page(page_size=(72, 72))
+            imobj = pikepdf.Stream(
+                pdf,
+                bytes(range(0, 256)),
+                BitsPerComponent=8,
+                ColorSpace=pikepdf.Array(
+                    [
+                        pikepdf.Name.DeviceN,
+                        pikepdf.Array([pikepdf.Name.Black]),
+                        pikepdf.Name.DeviceCMYK,
+                        pikepdf.Stream(
+                            pdf,
+                            b"{0 0 0 4 -1 roll}",  # Colorspace conversion function
+                            FunctionType=4,
+                            Domain=[0.0, 1.0],
+                            Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
+                        ),
+                    ],
+                ),
+                Width=16,
+                Height=16,
+                Type=pikepdf.Name.XObject,
+                Subtype=pikepdf.Name.Image,
+            )
+            pim = pikepdf.PdfImage(imobj)
+            self.assertEqual(pim.mode, "DeviceN")
+            self.assertTrue(pim.is_device_n)
+
+            pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
+            pdf.pages[0].Resources = pikepdf.Dictionary(
+                XObject=pikepdf.Dictionary(Im0=imobj),
+            )
+            pdf.save(save_path)
+
+        with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
+            # Build an offending file
+            _build_device_n_pdf(self, str(device_n_pdf.name))
+            with mock.patch("documents.barcodes.barcode_reader") as reader:
+                reader.return_value = list()
+
+                _, _ = barcodes.scan_file_for_separating_barcodes(
+                    str(device_n_pdf.name),
+                )
+
+                reader.assert_called()
+
     def test_scan_file_for_separating_qr_barcodes(self):
         test_file = os.path.join(
             self.BARCODE_SAMPLE_DIR,