mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-19 10:19:27 -05:00
Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration
This commit is contained in:
parent
1d0cf77e7e
commit
10f6195bac
@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
|
|||||||
|
|
||||||
Defaults to false.
|
Defaults to false.
|
||||||
|
|
||||||
PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
|
|
||||||
Enables the legacy method of detecting barcodes. By default, images are
|
|
||||||
extracted directly from the PDF structure for barcode detection. If this
|
|
||||||
configuration value is set, images of the whole PDF page will be used instead.
|
|
||||||
|
|
||||||
This is a slower and more memory-intensive process, but may be required for
|
|
||||||
certain files, depending on how they are produced and how images are encoded.
|
|
||||||
|
|
||||||
Defaults to false.
|
|
||||||
|
|
||||||
|
|
||||||
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
|
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
|
||||||
Whether TIFF image files should be scanned for barcodes.
|
Whether TIFF image files should be scanned for barcodes.
|
||||||
|
@ -10,12 +10,10 @@ from typing import Tuple
|
|||||||
import magic
|
import magic
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
from pdf2image.exceptions import PDFPageCountError
|
|
||||||
from pikepdf import Page
|
from pikepdf import Page
|
||||||
from pikepdf import PasswordError
|
from pikepdf import PasswordError
|
||||||
from pikepdf import Pdf
|
from pikepdf import Pdf
|
||||||
from pikepdf import PdfImage
|
from pikepdf import PdfImage
|
||||||
from pikepdf.models.image import HifiPrintImageNotTranscodableError
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from PIL import ImageSequence
|
from PIL import ImageSequence
|
||||||
from pyzbar import pyzbar
|
from pyzbar import pyzbar
|
||||||
@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
|
|||||||
images[0].save(newpath)
|
images[0].save(newpath)
|
||||||
else:
|
else:
|
||||||
images[0].save(newpath, save_all=True, append_images=images[1:])
|
images[0].save(newpath, save_all=True, append_images=images[1:])
|
||||||
except OSError as e:
|
except OSError as e: # pragma: no cover
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Could not save the file as pdf. Error: {str(e)}",
|
f"Could not save the file as pdf. Error: {str(e)}",
|
||||||
)
|
)
|
||||||
@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
|||||||
for image_key in page.images:
|
for image_key in page.images:
|
||||||
pdfimage = PdfImage(page.images[image_key])
|
pdfimage = PdfImage(page.images[image_key])
|
||||||
|
|
||||||
|
# This type is known to have issues:
|
||||||
|
# https://github.com/pikepdf/pikepdf/issues/401
|
||||||
if "/CCITTFaxDecode" in pdfimage.filters:
|
if "/CCITTFaxDecode" in pdfimage.filters:
|
||||||
raise BarcodeImageFormatError(
|
raise BarcodeImageFormatError(
|
||||||
"Unable to decode CCITTFaxDecode images",
|
"Unable to decode CCITTFaxDecode images",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Not all images can be transcoded to a PIL image, which
|
# Not all images can be transcoded to a PIL image, which
|
||||||
# is what pyzbar expects to receive
|
# is what pyzbar expects to receive, so this may
|
||||||
|
# raise an exception, triggering fallback
|
||||||
pillow_img = pdfimage.as_pil_image()
|
pillow_img = pdfimage.as_pil_image()
|
||||||
|
|
||||||
detected_barcodes = barcode_reader(pillow_img)
|
detected_barcodes = barcode_reader(pillow_img)
|
||||||
@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
|||||||
if mime_type == "image/tiff":
|
if mime_type == "image/tiff":
|
||||||
pdf_filepath = convert_from_tiff_to_pdf(filepath)
|
pdf_filepath = convert_from_tiff_to_pdf(filepath)
|
||||||
|
|
||||||
# Chose the scanner
|
# Always try pikepdf first, it's usually fine, faster and
|
||||||
if settings.CONSUMER_USE_LEGACY_DETECTION:
|
# uses less memory
|
||||||
logger.debug("Using pdf2image for barcodes")
|
|
||||||
scanner_function = _pdf2image_barcode_scan
|
|
||||||
else:
|
|
||||||
logger.debug("Using pikepdf for barcodes")
|
|
||||||
scanner_function = _pikepdf_barcode_scan
|
|
||||||
|
|
||||||
# Run the scanner
|
|
||||||
try:
|
try:
|
||||||
scanner_function(pdf_filepath)
|
_pikepdf_barcode_scan(pdf_filepath)
|
||||||
# Neither method can handle password protected PDFs without it being
|
# Password protected files can't be checked
|
||||||
# provided. Log it and continue
|
except PasswordError as e:
|
||||||
except (PasswordError, PDFPageCountError) as e:
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"File is likely password protected, not splitting: {e}",
|
f"File is likely password protected, not checking for barcodes: {e}",
|
||||||
)
|
)
|
||||||
# Handle pikepdf related image decoding issues with a fallback
|
# Handle pikepdf related image decoding issues with a fallback to page
|
||||||
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
|
# by page conversion to images in a temporary directory
|
||||||
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Falling back to pdf2image because: {e}",
|
f"Falling back to pdf2image because: {e}",
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
# Clear the list in case some processing worked
|
||||||
separator_page_numbers = []
|
separator_page_numbers = []
|
||||||
_pdf2image_barcode_scan(pdf_filepath)
|
_pdf2image_barcode_scan(pdf_filepath)
|
||||||
# This file is really borked, allow the consumption to continue
|
# This file is really borked, allow the consumption to continue
|
||||||
@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
|||||||
logger.warning(
|
logger.warning(
|
||||||
f"Exception during barcode scanning: {e}",
|
f"Exception during barcode scanning: {e}",
|
||||||
)
|
)
|
||||||
# We're not sure what happened, but allow the consumption to continue
|
|
||||||
except Exception as e: # pragma: no cover
|
|
||||||
logger.warning(
|
|
||||||
f"Exception during barcode scanning: {e}",
|
|
||||||
)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
self.assertTrue(os.path.isfile(target_file1))
|
self.assertTrue(os.path.isfile(target_file1))
|
||||||
self.assertTrue(os.path.isfile(target_file2))
|
self.assertTrue(os.path.isfile(target_file2))
|
||||||
|
|
||||||
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
|
|
||||||
def test_barcode_splitter_legacy_fallback(self):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- File containing barcode
|
|
||||||
- Legacy method of detection is enabled
|
|
||||||
WHEN:
|
|
||||||
- File is scanned for barcodes
|
|
||||||
THEN:
|
|
||||||
- Barcodes are properly detected
|
|
||||||
"""
|
|
||||||
test_file = os.path.join(
|
|
||||||
self.BARCODE_SAMPLE_DIR,
|
|
||||||
"patch-code-t-middle.pdf",
|
|
||||||
)
|
|
||||||
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
|
||||||
|
|
||||||
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
|
|
||||||
test_file,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(test_file, pdf_file)
|
|
||||||
self.assertTrue(len(separator_page_numbers) > 0)
|
|
||||||
|
|
||||||
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
|
||||||
self.assertTrue(document_list)
|
|
||||||
for document in document_list:
|
|
||||||
barcodes.save_to_dir(document, target_dir=tempdir)
|
|
||||||
|
|
||||||
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
|
|
||||||
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
|
|
||||||
|
|
||||||
self.assertTrue(os.path.isfile(target_file1))
|
|
||||||
self.assertTrue(os.path.isfile(target_file2))
|
|
||||||
|
|
||||||
@override_settings(CONSUMER_ENABLE_BARCODES=True)
|
@override_settings(CONSUMER_ENABLE_BARCODES=True)
|
||||||
def test_consume_barcode_file(self):
|
def test_consume_barcode_file(self):
|
||||||
test_file = os.path.join(
|
test_file = os.path.join(
|
||||||
@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
with mock.patch("documents.tasks.async_to_sync"):
|
with mock.patch("documents.tasks.async_to_sync"):
|
||||||
self.assertEqual(tasks.consume_file(dst), "File successfully split")
|
self.assertEqual(tasks.consume_file(dst), "File successfully split")
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_password_pikepdf(self):
|
def test_scan_file_for_separating_barcodes_password(self):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Password protected PDF
|
- Password protected PDF
|
||||||
@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
self.assertEqual(pdf_file, test_file)
|
self.assertEqual(pdf_file, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [])
|
self.assertListEqual(separator_page_numbers, [])
|
||||||
|
|
||||||
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
|
|
||||||
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Password protected PDF
|
|
||||||
- pdf2image based scanning
|
|
||||||
WHEN:
|
|
||||||
- File is scanned for barcode
|
|
||||||
THEN:
|
|
||||||
- Scanning handle the exception without exception
|
|
||||||
"""
|
|
||||||
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
|
|
||||||
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
|
|
||||||
test_file,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(pdf_file, test_file)
|
|
||||||
self.assertListEqual(separator_page_numbers, [])
|
|
||||||
|
@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
|
|||||||
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
|
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
|
||||||
)
|
)
|
||||||
|
|
||||||
CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
|
|
||||||
"PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
|
|
||||||
"NO",
|
|
||||||
)
|
|
||||||
|
|
||||||
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
|
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
|
||||||
"PAPERLESS_CONSUMER_BARCODE_STRING",
|
"PAPERLESS_CONSUMER_BARCODE_STRING",
|
||||||
"PATCHT",
|
"PATCHT",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user