diff --git a/Pipfile b/Pipfile index d95af784e..e44ace17e 100644 --- a/Pipfile +++ b/Pipfile @@ -57,6 +57,7 @@ celery = {extras = ["redis"], version = "*"} django-celery-results = "*" setproctitle = "*" nltk = "*" +pdf2image = "*" [dev-packages] coveralls = "*" diff --git a/Pipfile.lock b/Pipfile.lock index a78cc5ff0..7852a1ced 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -939,6 +939,14 @@ "index": "pypi", "version": "==2.5.2" }, + "pdf2image": { + "hashes": [ + "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65", + "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb" + ], + "index": "pypi", + "version": "==1.16.0" + }, "pdfminer.six": { "hashes": [ "sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2", diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index a30a55bbb..a4be126a5 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -9,6 +9,7 @@ from typing import Tuple import magic from django.conf import settings +from pdf2image import convert_from_path from pikepdf import Page from pikepdf import Pdf from pikepdf import PdfImage @@ -19,6 +20,10 @@ from pyzbar import pyzbar logger = logging.getLogger("paperless.barcodes") +class BarcodeImageFormatError(Exception): + pass + + @lru_cache(maxsize=8) def supported_file_type(mime_type) -> bool: """ @@ -108,6 +113,33 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis which separate the file into new files """ + def _pikepdf_barcode_scan(pdf_filepath: str): + with Pdf.open(pdf_filepath) as pdf: + for page_num, page in enumerate(pdf.pages): + for image_key in page.images: + pdfimage = PdfImage(page.images[image_key]) + + if "/CCITTFaxDecode" in pdfimage.filters: + raise BarcodeImageFormatError() + + # Not all images can be transcoded to a PIL image, which + # is what pyzbar expects to receive + pillow_img = pdfimage.as_pil_image() + + detected_barcodes = barcode_reader(pillow_img) + + if 
settings.CONSUMER_BARCODE_STRING in detected_barcodes: + separator_page_numbers.append(page_num) + + def _pdf2image_barcode_scan(pdf_filepath: str): + # use a temporary directory in case the file is too big to handle in memory + with tempfile.TemporaryDirectory() as path: + pages_from_path = convert_from_path(pdf_filepath, output_folder=path) + for current_page_number, page in enumerate(pages_from_path): + current_barcodes = barcode_reader(page) + if settings.CONSUMER_BARCODE_STRING in current_barcodes: + separator_page_numbers.append(current_page_number) + separator_page_numbers = [] pdf_filepath = None @@ -118,17 +150,17 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) - pdf = Pdf.open(pdf_filepath) + try: + _pikepdf_barcode_scan(pdf_filepath) + except Exception as e: - for page_num, page in enumerate(pdf.pages): - for image_key in page.images: - pdfimage = PdfImage(page.images[image_key]) - pillow_img = pdfimage.as_pil_image() + logger.warning( + f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}", + ) + # Reset this in case pikepdf got part way through + separator_page_numbers = [] + _pdf2image_barcode_scan(pdf_filepath) - detected_barcodes = barcode_reader(pillow_img) - - if settings.CONSUMER_BARCODE_STRING in detected_barcodes: - separator_page_numbers.append(page_num) else: logger.warning( f"Unsupported file format for barcode reader: {str(mime_type)}", diff --git a/src/documents/tests/samples/barcodes/barcode-fax-image.pdf b/src/documents/tests/samples/barcodes/barcode-fax-image.pdf new file mode 100644 index 000000000..2e248c82b Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-fax-image.pdf differ diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 5de475578..ee8df9f34 100644 --- a/src/documents/tests/test_barcodes.py +++ 
b/src/documents/tests/test_barcodes.py @@ -3,6 +3,7 @@ import shutil import tempfile from unittest import mock +import pikepdf from django.conf import settings from django.test import override_settings from django.test import TestCase @@ -218,6 +219,86 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(pdf_file, test_file) self.assertListEqual(separator_page_numbers, [1]) + def test_scan_file_for_separating_barcodes_pillow_transcode_error(self): + """ + GIVEN: + - A PDF containing an image which cannot be transcoded to a PIL image + WHEN: + - The image tries to be transcoded to a PIL image, but fails + THEN: + - The barcode reader is still called + """ + + def _build_device_n_pdf(self, save_path: str): + # Based on the pikepdf tests + # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py + pdf = pikepdf.new() + pdf.add_blank_page(page_size=(72, 72)) + imobj = pikepdf.Stream( + pdf, + bytes(range(0, 256)), + BitsPerComponent=8, + ColorSpace=pikepdf.Array( + [ + pikepdf.Name.DeviceN, + pikepdf.Array([pikepdf.Name.Black]), + pikepdf.Name.DeviceCMYK, + pikepdf.Stream( + pdf, + b"{0 0 0 4 -1 roll}", # Colorspace conversion function + FunctionType=4, + Domain=[0.0, 1.0], + Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], + ), + ], + ), + Width=16, + Height=16, + Type=pikepdf.Name.XObject, + Subtype=pikepdf.Name.Image, + ) + pim = pikepdf.PdfImage(imobj) + self.assertEqual(pim.mode, "DeviceN") + self.assertTrue(pim.is_device_n) + + pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do") + pdf.pages[0].Resources = pikepdf.Dictionary( + XObject=pikepdf.Dictionary(Im0=imobj), + ) + pdf.save(save_path) + + with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf: + # Build an offending file + _build_device_n_pdf(self, str(device_n_pdf.name)) + with mock.patch("documents.barcodes.barcode_reader") as reader: + reader.return_value = list() + + _, _ = 
barcodes.scan_file_for_separating_barcodes( + str(device_n_pdf.name), + ) + + reader.assert_called() + + def test_scan_file_for_separating_barcodes_fax_decode(self): + """ + GIVEN: + - A PDF containing an image encoded as CCITT Group 4 encoding + WHEN: + - Barcode processing happens with the file + THEN: + - The barcode is still detected + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-fax-image.pdf", + ) + pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( + test_file, + ) + + self.assertEqual(pdf_file, test_file) + self.assertListEqual(separator_page_numbers, [1]) + def test_scan_file_for_separating_qr_barcodes(self): test_file = os.path.join( self.BARCODE_SAMPLE_DIR,