From 10f6195bacd2bb1596cd523da04529692a926034 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 9 Nov 2022 08:50:34 -0800 Subject: [PATCH] Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration --- docs/configuration.rst | 10 ----- src/documents/barcodes.py | 40 ++++++++------------ src/documents/tests/test_barcodes.py | 56 +--------------------------- src/paperless/settings.py | 5 --- 4 files changed, 16 insertions(+), 95 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a3a25252..f9c4f89cc 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES= Defaults to false. -PAPERLESS_CONSUMER_USE_LEGACY_DETECTION= - Enables the legacy method of detecting barcodes. By default, images are - extracted directly from the PDF structure for barcode detection. If this - configuration value is set, images of the whole PDF page will be used instead. - - This is a slower and more memory intensive process, but may be required for - certain files, depending on how it is produced and how images are encoded. - - Defaults to false. - PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT= Whether TIFF image files should be scanned for barcodes. diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 1f5e33d37..9533742a8 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -10,12 +10,10 @@ from typing import Tuple import magic from django.conf import settings from pdf2image import convert_from_path -from pdf2image.exceptions import PDFPageCountError from pikepdf import Page from pikepdf import PasswordError from pikepdf import Pdf from pikepdf import PdfImage -from pikepdf.models.image import HifiPrintImageNotTranscodableError from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str: images[0].save(newpath) else: images[0].save(newpath, save_all=True, append_images=images[1:]) - except OSError as e: + except OSError as e: # pragma: no cover logger.warning( f"Could not save the file as pdf. Error: {str(e)}", ) @@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis for image_key in page.images: pdfimage = PdfImage(page.images[image_key]) + # This type is known to have issues: + # https://github.com/pikepdf/pikepdf/issues/401 if "/CCITTFaxDecode" in pdfimage.filters: raise BarcodeImageFormatError( "Unable to decode CCITTFaxDecode images", ) # Not all images can be transcoded to a PIL image, which - # is what pyzbar expects to receive + # is what pyzbar expects to receive, so this may + # raise an exception, triggering fallback pillow_img = pdfimage.as_pil_image() detected_barcodes = barcode_reader(pillow_img) @@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) - # Chose the scanner - if settings.CONSUMER_USE_LEGACY_DETECTION: - logger.debug("Using pdf2image for barcodes") - scanner_function = _pdf2image_barcode_scan - else: - logger.debug("Using pikepdf for barcodes") - scanner_function = _pikepdf_barcode_scan - - # Run the scanner + # Always try pikepdf first, it's usually fine, faster and + # uses less memory try: - scanner_function(pdf_filepath) - # Neither method can handle password protected PDFs without it being - # provided. Log it and continue - except (PasswordError, PDFPageCountError) as e: + _pikepdf_barcode_scan(pdf_filepath) + # Password protected files can't be checked + except PasswordError as e: logger.warning( - f"File is likely password protected, not splitting: {e}", + f"File is likely password protected, not checking for barcodes: {e}", ) - # Handle pikepdf related image decoding issues with a fallback - except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: + # Handle pikepdf related image decoding issues with a fallback to page + # by page conversion to images in a temporary directory + except Exception as e: logger.warning( f"Falling back to pdf2image because: {e}", ) try: + # Clear the list in case some processing worked separator_page_numbers = [] _pdf2image_barcode_scan(pdf_filepath) # This file is really borked, allow the consumption to continue @@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis logger.warning( f"Exception during barcode scanning: {e}", ) - # We're not sure what happened, but allow the consumption to continue - except Exception as e: # pragma: no cover - logger.warning( - f"Exception during barcode scanning: {e}", - ) else: logger.warning( diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index f4e5fce14..ef086626d 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file2)) - @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) - def test_barcode_splitter_legacy_fallback(self): - """ - GIVEN: - - File containing barcode - - Legacy method of detection is enabled - WHEN: - - File is scanned for barcodes - THEN: - - Barcodes are properly detected - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - - pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( - test_file, - ) - - self.assertEqual(test_file, pdf_file) - self.assertTrue(len(separator_page_numbers) > 0) - - document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertTrue(document_list) - for document in document_list: - barcodes.save_to_dir(document, target_dir=tempdir) - - target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") - target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") - - self.assertTrue(os.path.isfile(target_file1)) - self.assertTrue(os.path.isfile(target_file2)) - @override_settings(CONSUMER_ENABLE_BARCODES=True) def test_consume_barcode_file(self): test_file = os.path.join( @@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase): with mock.patch("documents.tasks.async_to_sync"): self.assertEqual(tasks.consume_file(dst), "File successfully split") - def test_scan_file_for_separating_barcodes_password_pikepdf(self): + def test_scan_file_for_separating_barcodes_password(self): """ GIVEN: - Password protected PDF @@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(pdf_file, test_file) self.assertListEqual(separator_page_numbers, []) - - @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) - def test_scan_file_for_separating_barcodes_password_pdf2image(self): - """ - GIVEN: - - Password protected PDF - - pdf2image based scanning - WHEN: - - File is scanned for barcode - THEN: - - Scanning handle the exception without exception - """ - test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") - pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( - test_file, - ) - - self.assertEqual(pdf_file, test_file) - self.assertListEqual(separator_page_numbers, []) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 5547d6c33..456e15745 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", ) -CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean( - "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION", - "NO", -) - CONSUMER_BARCODE_STRING: Final[str] = os.getenv( "PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT",