Merge remote-tracking branch 'paperless/dev' into feature-consume-eml

This commit is contained in:
phail 2022-10-26 20:59:49 +02:00
commit e1fa59122d
4 changed files with 83 additions and 16 deletions

11
Pipfile.lock generated
View File

@ -226,7 +226,7 @@
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2.1.1"
},
"click": {
@ -242,7 +242,7 @@
"sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667",
"sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"
],
"markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'",
"markers": "python_version < '4' and python_full_version >= '3.6.2'",
"version": "==0.3.0"
},
"click-plugins": {
@ -2191,7 +2191,7 @@
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2.1.1"
},
"click": {
@ -2211,6 +2211,9 @@
"version": "==0.4.5"
},
"coverage": {
"extras": [
"toml"
],
"hashes": [
"sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79",
"sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a",
@ -2785,7 +2788,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_full_version < '3.11.0a7'",
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"tornado": {

View File

@ -10,9 +10,12 @@ from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
pdfimage = PdfImage(page.images[image_key])
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError()
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive
@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
separator_page_numbers.append(page_num)
def _pdf2image_barcode_scan(pdf_filepath: str):
# use a temporary directory in case the file os too big to handle in memory
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
# Chose the scanner
if settings.CONSUMER_USE_LEGACY_DETECTION:
_pdf2image_barcode_scan(pdf_filepath)
logger.debug("Using pdf2image for barcodes")
scanner_function = _pdf2image_barcode_scan
else:
try:
_pikepdf_barcode_scan(pdf_filepath)
except Exception as e:
logger.debug("Using pikepdf for barcodes")
scanner_function = _pikepdf_barcode_scan
logger.warning(
f"Exception using pikepdf for barcodes,"
f" falling back to pdf2image: {e}",
)
# Reset this incase pikepdf got part way through
# Run the scanner
try:
scanner_function(pdf_filepath)
# Neither method can handle password protected PDFs without it being
# provided. Log it and continue
except (PasswordError, PDFPageCountError) as e:
logger.warning(
f"File is likely password protected, not splitting: {e}",
)
# Handle pikepdf related image decoding issues with a fallback
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
# We're not sure what happened, but allow the consumption to continue
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
else:
logger.warning(

Binary file not shown.

View File

@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes2(self):
def test_scan_file_for_separating_barcodes_none_present(self):
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase):
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
def test_scan_file_for_separating_barcodes_password_pikepdf(self):
"""
GIVEN:
- Password protected PDF
- pikepdf based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
"""
GIVEN:
- Password protected PDF
- pdf2image based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])