diff --git a/Pipfile.lock b/Pipfile.lock index 9bbb72bc7..2bb81496f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -226,7 +226,7 @@ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.1.1" }, "click": { @@ -242,7 +242,7 @@ "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" ], - "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'", + "markers": "python_version < '4' and python_full_version >= '3.6.2'", "version": "==0.3.0" }, "click-plugins": { @@ -2191,7 +2191,7 @@ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.1.1" }, "click": { @@ -2211,6 +2211,9 @@ "version": "==0.4.5" }, "coverage": { + "extras": [ + "toml" + ], "hashes": [ "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", @@ -2785,7 +2788,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_full_version < '3.11.0a7'", + "markers": "python_version >= '3.7'", "version": "==2.0.1" }, "tornado": { diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 13e78e181..1f5e33d37 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -10,9 +10,12 @@ from typing import Tuple import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page +from 
pikepdf import PasswordError from pikepdf import Pdf from pikepdf import PdfImage +from pikepdf.models.image import HifiPrintImageNotTranscodableError from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis pdfimage = PdfImage(page.images[image_key]) if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError() + raise BarcodeImageFormatError( + "Unable to decode CCITTFaxDecode images", + ) # Not all images can be transcoded to a PIL image, which # is what pyzbar expects to receive @@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis separator_page_numbers.append(page_num) def _pdf2image_barcode_scan(pdf_filepath: str): - # use a temporary directory in case the file os too big to handle in memory + # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: pages_from_path = convert_from_path(pdf_filepath, output_folder=path) for current_page_number, page in enumerate(pages_from_path): @@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) + # Choose the scanner if settings.CONSUMER_USE_LEGACY_DETECTION: - _pdf2image_barcode_scan(pdf_filepath) + logger.debug("Using pdf2image for barcodes") + scanner_function = _pdf2image_barcode_scan else: - try: - _pikepdf_barcode_scan(pdf_filepath) - except Exception as e: + logger.debug("Using pikepdf for barcodes") + scanner_function = _pikepdf_barcode_scan - logger.warning( - f"Exception using pikepdf for barcodes," - f" falling back to pdf2image: {e}", - ) - # Reset this incase pikepdf got part way through + # Run the scanner + try: + scanner_function(pdf_filepath) + # Neither method can handle password protected PDFs without it being + # provided.
Log it and continue + except (PasswordError, PDFPageCountError) as e: + logger.warning( + f"File is likely password protected, not splitting: {e}", + ) + # Handle pikepdf related image decoding issues with a fallback + except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: + logger.warning( + f"Falling back to pdf2image because: {e}", + ) + try: separator_page_numbers = [] _pdf2image_barcode_scan(pdf_filepath) + # This file is really borked, allow the consumption to continue + # but it may fail further on + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) + # We're not sure what happened, but allow the consumption to continue + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) else: logger.warning( diff --git a/src/documents/tests/samples/password-is-test.pdf b/src/documents/tests/samples/password-is-test.pdf new file mode 100755 index 000000000..b16b023c3 Binary files /dev/null and b/src/documents/tests/samples/password-is-test.pdf differ diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 1c4ab7cc3..f4e5fce14 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(pdf_file, test_file) self.assertListEqual(separator_page_numbers, [0]) - def test_scan_file_for_separating_barcodes2(self): + def test_scan_file_for_separating_barcodes_none_present(self): test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( test_file, @@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase): with mock.patch("documents.tasks.async_to_sync"): self.assertEqual(tasks.consume_file(dst), "File successfully split") + + def test_scan_file_for_separating_barcodes_password_pikepdf(self): + """ + GIVEN: + - 
Password protected PDF + - pikepdf based scanning + WHEN: + - File is scanned for barcode + THEN: + - Scanning handles the exception without raising + """ + test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") + pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( + test_file, + ) + + self.assertEqual(pdf_file, test_file) + self.assertListEqual(separator_page_numbers, []) + + @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) + def test_scan_file_for_separating_barcodes_password_pdf2image(self): + """ + GIVEN: + - Password protected PDF + - pdf2image based scanning + WHEN: + - File is scanned for barcode + THEN: + - Scanning handles the exception without raising + """ + test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") + pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( + test_file, + ) + + self.assertEqual(pdf_file, test_file) + self.assertListEqual(separator_page_numbers, [])