Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration

This commit is contained in:
Trenton H 2022-11-09 08:50:34 -08:00
parent 1d0cf77e7e
commit 10f6195bac
4 changed files with 16 additions and 95 deletions

View File

@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Defaults to false.
PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
Enables the legacy method of detecting barcodes. By default, images are
extracted directly from the PDF structure for barcode detection. If this
configuration value is set, images of the whole PDF page will be used instead.
This is a slower and more memory intensive process, but may be required for
certain files, depending on how it is produced and how images are encoded.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.

View File

@ -10,12 +10,10 @@ from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
images[0].save(newpath)
else:
images[0].save(newpath, save_all=True, append_images=images[1:])
except OSError as e:
except OSError as e: # pragma: no cover
logger.warning(
f"Could not save the file as pdf. Error: {str(e)}",
)
@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive
# is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
# Chose the scanner
if settings.CONSUMER_USE_LEGACY_DETECTION:
logger.debug("Using pdf2image for barcodes")
scanner_function = _pdf2image_barcode_scan
else:
logger.debug("Using pikepdf for barcodes")
scanner_function = _pikepdf_barcode_scan
# Run the scanner
# Always try pikepdf first, it's usually fine, faster and
# uses less memory
try:
scanner_function(pdf_filepath)
# Neither method can handle password protected PDFs without it being
# provided. Log it and continue
except (PasswordError, PDFPageCountError) as e:
_pikepdf_barcode_scan(pdf_filepath)
# Password protected files can't be checked
except PasswordError as e:
logger.warning(
f"File is likely password protected, not splitting: {e}",
f"File is likely password protected, not checking for barcodes: {e}",
)
# Handle pikepdf related image decoding issues with a fallback
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
# Handle pikepdf related image decoding issues with a fallback to page
# by page conversion to images in a temporary directory
except Exception as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
# Clear the list in case some processing worked
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
logger.warning(
f"Exception during barcode scanning: {e}",
)
# We're not sure what happened, but allow the consumption to continue
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
else:
logger.warning(

View File

@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_barcode_splitter_legacy_fallback(self):
"""
GIVEN:
- File containing barcode
- Legacy method of detection is enabled
WHEN:
- File is scanned for barcodes
THEN:
- Barcodes are properly detected
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(test_file, pdf_file)
self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
self.assertTrue(document_list)
for document in document_list:
barcodes.save_to_dir(document, target_dir=tempdir)
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
def test_scan_file_for_separating_barcodes_password_pikepdf(self):
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
"""
GIVEN:
- Password protected PDF
- pdf2image based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])

View File

@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
"NO",
)
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_STRING",
"PATCHT",