Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration

This commit is contained in:
Trenton H 2022-11-09 08:50:34 -08:00
parent 1d0cf77e7e
commit 10f6195bac
4 changed files with 16 additions and 95 deletions

View File

@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Defaults to false. Defaults to false.
PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
Enables the legacy method of detecting barcodes. By default, images are
extracted directly from the PDF structure for barcode detection. If this
configuration value is set, images of the whole PDF page will be used instead.
This is a slower and more memory intensive process, but may be required for
certain files, depending on how they are produced and how images are encoded.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool> PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes. Whether TIFF image files should be scanned for barcodes.

View File

@ -10,12 +10,10 @@ from typing import Tuple
import magic import magic
from django.conf import settings from django.conf import settings
from pdf2image import convert_from_path from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page from pikepdf import Page
from pikepdf import PasswordError from pikepdf import PasswordError
from pikepdf import Pdf from pikepdf import Pdf
from pikepdf import PdfImage from pikepdf import PdfImage
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image from PIL import Image
from PIL import ImageSequence from PIL import ImageSequence
from pyzbar import pyzbar from pyzbar import pyzbar
@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
images[0].save(newpath) images[0].save(newpath)
else: else:
images[0].save(newpath, save_all=True, append_images=images[1:]) images[0].save(newpath, save_all=True, append_images=images[1:])
except OSError as e: except OSError as e: # pragma: no cover
logger.warning( logger.warning(
f"Could not save the file as pdf. Error: {str(e)}", f"Could not save the file as pdf. Error: {str(e)}",
) )
@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
for image_key in page.images: for image_key in page.images:
pdfimage = PdfImage(page.images[image_key]) pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters: if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError( raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images", "Unable to decode CCITTFaxDecode images",
) )
# Not all images can be transcoded to a PIL image, which # Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive # is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image() pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img) detected_barcodes = barcode_reader(pillow_img)
@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
if mime_type == "image/tiff": if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath) pdf_filepath = convert_from_tiff_to_pdf(filepath)
# Chose the scanner # Always try pikepdf first, it's usually fine, faster and
if settings.CONSUMER_USE_LEGACY_DETECTION: # uses less memory
logger.debug("Using pdf2image for barcodes")
scanner_function = _pdf2image_barcode_scan
else:
logger.debug("Using pikepdf for barcodes")
scanner_function = _pikepdf_barcode_scan
# Run the scanner
try: try:
scanner_function(pdf_filepath) _pikepdf_barcode_scan(pdf_filepath)
# Neither method can handle password protected PDFs without it being # Password protected files can't be checked
# provided. Log it and continue except PasswordError as e:
except (PasswordError, PDFPageCountError) as e:
logger.warning( logger.warning(
f"File is likely password protected, not splitting: {e}", f"File is likely password protected, not checking for barcodes: {e}",
) )
# Handle pikepdf related image decoding issues with a fallback # Handle pikepdf related image decoding issues with a fallback to page
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: # by page conversion to images in a temporary directory
except Exception as e:
logger.warning( logger.warning(
f"Falling back to pdf2image because: {e}", f"Falling back to pdf2image because: {e}",
) )
try: try:
# Clear the list in case some processing worked
separator_page_numbers = [] separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath) _pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue # This file is really borked, allow the consumption to continue
@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
logger.warning( logger.warning(
f"Exception during barcode scanning: {e}", f"Exception during barcode scanning: {e}",
) )
# We're not sure what happened, but allow the consumption to continue
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
else: else:
logger.warning( logger.warning(

View File

@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2)) self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_barcode_splitter_legacy_fallback(self):
"""
GIVEN:
- File containing barcode
- Legacy method of detection is enabled
WHEN:
- File is scanned for barcodes
- File is split at the detected separator pages
THEN:
- Barcodes are properly detected
- Two split documents are written to the scratch directory
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
# Temporary directory under SCRATCH_DIR that receives the split documents
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
# The input is already a PDF, so the same path is expected back
self.assertEqual(test_file, pdf_file)
# At least one separator page must have been found
self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
self.assertTrue(document_list)
for document in document_list:
barcodes.save_to_dir(document, target_dir=tempdir)
# Expected output names: "<original stem>_document_<index>.pdf"
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True) @override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self): def test_consume_barcode_file(self):
test_file = os.path.join( test_file = os.path.join(
@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
with mock.patch("documents.tasks.async_to_sync"): with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split") self.assertEqual(tasks.consume_file(dst), "File successfully split")
def test_scan_file_for_separating_barcodes_password_pikepdf(self): def test_scan_file_for_separating_barcodes_password(self):
""" """
GIVEN: GIVEN:
- Password protected PDF - Password protected PDF
@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(pdf_file, test_file) self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, []) self.assertListEqual(separator_page_numbers, [])
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
"""
GIVEN:
- Password protected PDF
- pdf2image based scanning
WHEN:
- File is scanned for barcodes
THEN:
- Scanning completes without raising an exception
- The original file path is returned and no separator pages are reported
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
# The scan must not raise: the same path comes back with an empty
# list of separator pages
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])

View File

@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
) )
# Opt-in switch for the legacy barcode detection method, read from
# PAPERLESS_CONSUMER_USE_LEGACY_DETECTION; "NO" is the fallback passed to
# __get_boolean (presumably used when the variable is unset - confirm
# against the helper).
CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
"NO",
)
CONSUMER_BARCODE_STRING: Final[str] = os.getenv( CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_STRING", "PAPERLESS_CONSUMER_BARCODE_STRING",
"PATCHT", "PATCHT",