mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
This commit is contained in:
commit
e1fa59122d
11
Pipfile.lock
generated
11
Pipfile.lock
generated
@ -226,7 +226,7 @@
|
||||
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
|
||||
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.0'",
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"click": {
|
||||
@ -242,7 +242,7 @@
|
||||
"sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667",
|
||||
"sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'",
|
||||
"markers": "python_version < '4' and python_full_version >= '3.6.2'",
|
||||
"version": "==0.3.0"
|
||||
},
|
||||
"click-plugins": {
|
||||
@ -2191,7 +2191,7 @@
|
||||
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
|
||||
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.0'",
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"click": {
|
||||
@ -2211,6 +2211,9 @@
|
||||
"version": "==0.4.5"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
"toml"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79",
|
||||
"sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a",
|
||||
@ -2785,7 +2788,7 @@
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_full_version < '3.11.0a7'",
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"tornado": {
|
||||
|
@ -10,9 +10,12 @@ from typing import Tuple
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from pdf2image import convert_from_path
|
||||
from pdf2image.exceptions import PDFPageCountError
|
||||
from pikepdf import Page
|
||||
from pikepdf import PasswordError
|
||||
from pikepdf import Pdf
|
||||
from pikepdf import PdfImage
|
||||
from pikepdf.models.image import HifiPrintImageNotTranscodableError
|
||||
from PIL import Image
|
||||
from PIL import ImageSequence
|
||||
from pyzbar import pyzbar
|
||||
@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
||||
pdfimage = PdfImage(page.images[image_key])
|
||||
|
||||
if "/CCITTFaxDecode" in pdfimage.filters:
|
||||
raise BarcodeImageFormatError()
|
||||
raise BarcodeImageFormatError(
|
||||
"Unable to decode CCITTFaxDecode images",
|
||||
)
|
||||
|
||||
# Not all images can be transcoded to a PIL image, which
|
||||
# is what pyzbar expects to receive
|
||||
@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
||||
separator_page_numbers.append(page_num)
|
||||
|
||||
def _pdf2image_barcode_scan(pdf_filepath: str):
|
||||
# use a temporary directory in case the file os too big to handle in memory
|
||||
# use a temporary directory in case the file is too big to handle in memory
|
||||
with tempfile.TemporaryDirectory() as path:
|
||||
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
|
||||
for current_page_number, page in enumerate(pages_from_path):
|
||||
@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
||||
if mime_type == "image/tiff":
|
||||
pdf_filepath = convert_from_tiff_to_pdf(filepath)
|
||||
|
||||
# Choose the scanner
|
||||
if settings.CONSUMER_USE_LEGACY_DETECTION:
|
||||
_pdf2image_barcode_scan(pdf_filepath)
|
||||
logger.debug("Using pdf2image for barcodes")
|
||||
scanner_function = _pdf2image_barcode_scan
|
||||
else:
|
||||
try:
|
||||
_pikepdf_barcode_scan(pdf_filepath)
|
||||
except Exception as e:
|
||||
logger.debug("Using pikepdf for barcodes")
|
||||
scanner_function = _pikepdf_barcode_scan
|
||||
|
||||
logger.warning(
|
||||
f"Exception using pikepdf for barcodes,"
|
||||
f" falling back to pdf2image: {e}",
|
||||
)
|
||||
# Reset this incase pikepdf got part way through
|
||||
# Run the scanner
|
||||
try:
|
||||
scanner_function(pdf_filepath)
|
||||
# Neither method can handle password protected PDFs without it being
|
||||
# provided. Log it and continue
|
||||
except (PasswordError, PDFPageCountError) as e:
|
||||
logger.warning(
|
||||
f"File is likely password protected, not splitting: {e}",
|
||||
)
|
||||
# Handle pikepdf related image decoding issues with a fallback
|
||||
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
|
||||
logger.warning(
|
||||
f"Falling back to pdf2image because: {e}",
|
||||
)
|
||||
try:
|
||||
separator_page_numbers = []
|
||||
_pdf2image_barcode_scan(pdf_filepath)
|
||||
# This file is really borked, allow the consumption to continue
|
||||
# but it may fail further on
|
||||
except Exception as e: # pragma: no cover
|
||||
logger.warning(
|
||||
f"Exception during barcode scanning: {e}",
|
||||
)
|
||||
# We're not sure what happened, but allow the consumption to continue
|
||||
except Exception as e: # pragma: no cover
|
||||
logger.warning(
|
||||
f"Exception during barcode scanning: {e}",
|
||||
)
|
||||
|
||||
else:
|
||||
logger.warning(
|
||||
|
BIN
src/documents/tests/samples/password-is-test.pdf
Executable file
BIN
src/documents/tests/samples/password-is-test.pdf
Executable file
Binary file not shown.
@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(pdf_file, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
|
||||
def test_scan_file_for_separating_barcodes2(self):
|
||||
def test_scan_file_for_separating_barcodes_none_present(self):
|
||||
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
|
||||
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
|
||||
test_file,
|
||||
@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
|
||||
with mock.patch("documents.tasks.async_to_sync"):
|
||||
self.assertEqual(tasks.consume_file(dst), "File successfully split")
|
||||
|
||||
def test_scan_file_for_separating_barcodes_password_pikepdf(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Password protected PDF
|
||||
- pikepdf based scanning
|
||||
WHEN:
|
||||
- File is scanned for barcode
|
||||
THEN:
|
||||
- Scanning handles the error without raising an exception
|
||||
"""
|
||||
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
|
||||
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
|
||||
test_file,
|
||||
)
|
||||
|
||||
self.assertEqual(pdf_file, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
|
||||
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
|
||||
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Password protected PDF
|
||||
- pdf2image based scanning
|
||||
WHEN:
|
||||
- File is scanned for barcode
|
||||
THEN:
|
||||
- Scanning handles the error without raising an exception
|
||||
"""
|
||||
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
|
||||
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
|
||||
test_file,
|
||||
)
|
||||
|
||||
self.assertEqual(pdf_file, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
|
Loading…
x
Reference in New Issue
Block a user