mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
In case pikepdf fails to convert an image to a PIL image, fall back to converting pages to PIL images
This commit is contained in:
parent
fb2efe5ab8
commit
caf4b54bc7
1
Pipfile
1
Pipfile
@ -57,6 +57,7 @@ celery = {extras = ["redis"], version = "*"}
|
||||
django-celery-results = "*"
|
||||
setproctitle = "*"
|
||||
nltk = "*"
|
||||
pdf2image = "*"
|
||||
|
||||
[dev-packages]
|
||||
coveralls = "*"
|
||||
|
8
Pipfile.lock
generated
8
Pipfile.lock
generated
@ -939,6 +939,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.5.2"
|
||||
},
|
||||
"pdf2image": {
|
||||
"hashes": [
|
||||
"sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65",
|
||||
"sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"pdfminer.six": {
|
||||
"hashes": [
|
||||
"sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2",
|
||||
|
@ -9,6 +9,7 @@ from typing import Tuple
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from pdf2image import convert_from_path
|
||||
from pikepdf import Page
|
||||
from pikepdf import Pdf
|
||||
from pikepdf import PdfImage
|
||||
@ -108,6 +109,30 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
||||
which separate the file into new files
|
||||
"""
|
||||
|
||||
def _pikepdf_barcode_scan(pdf_filepath: str):
    """
    Look for the separator barcode in the images embedded in a PDF.

    Opens the PDF with pikepdf, converts every image on every page to a
    PIL image and passes it to ``barcode_reader``. Whenever
    ``settings.CONSUMER_BARCODE_STRING`` is among the detected barcodes,
    the zero-based page index is appended to ``separator_page_numbers``.

    Exceptions raised while converting an image are not handled here and
    propagate to the caller.
    """
    with Pdf.open(pdf_filepath) as document:
        for page_index, pdf_page in enumerate(document.pages):
            for image_name in pdf_page.images:
                # pyzbar expects a PIL image, but not every embedded
                # image can be transcoded to one
                pil_image = PdfImage(pdf_page.images[image_name]).as_pil_image()

                found_barcodes = barcode_reader(pil_image)

                if settings.CONSUMER_BARCODE_STRING not in found_barcodes:
                    continue
                separator_page_numbers.append(page_index)
|
||||
|
||||
def _pdf2image_barcode_scan(pdf_filepath: str):
    """
    Look for the separator barcode on fully rendered PDF pages.

    Converts the PDF pages to images with pdf2image's
    ``convert_from_path`` and passes each one to ``barcode_reader``.
    Whenever ``settings.CONSUMER_BARCODE_STRING`` is among the detected
    barcodes, the zero-based page index is appended to
    ``separator_page_numbers``.
    """
    # Render into a temporary directory in case the file is too big to
    # handle in memory
    with tempfile.TemporaryDirectory() as scratch_dir:
        rendered_pages = convert_from_path(pdf_filepath, output_folder=scratch_dir)
        for page_index, page_image in enumerate(rendered_pages):
            if settings.CONSUMER_BARCODE_STRING in barcode_reader(page_image):
                separator_page_numbers.append(page_index)
|
||||
|
||||
separator_page_numbers = []
|
||||
pdf_filepath = None
|
||||
|
||||
@ -118,17 +143,17 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
|
||||
if mime_type == "image/tiff":
|
||||
pdf_filepath = convert_from_tiff_to_pdf(filepath)
|
||||
|
||||
pdf = Pdf.open(pdf_filepath)
|
||||
try:
|
||||
_pikepdf_barcode_scan(pdf_filepath)
|
||||
except Exception as e:
|
||||
|
||||
for page_num, page in enumerate(pdf.pages):
|
||||
for image_key in page.images:
|
||||
pdfimage = PdfImage(page.images[image_key])
|
||||
pillow_img = pdfimage.as_pil_image()
|
||||
logger.warning(
|
||||
f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
|
||||
)
|
||||
# Reset this in case pikepdf got part way through
|
||||
separator_page_numbers = []
|
||||
_pdf2image_barcode_scan(pdf_filepath)
|
||||
|
||||
detected_barcodes = barcode_reader(pillow_img)
|
||||
|
||||
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
|
||||
separator_page_numbers.append(page_num)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Unsupported file format for barcode reader: {str(mime_type)}",
|
||||
|
@ -3,6 +3,7 @@ import shutil
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
|
||||
import pikepdf
|
||||
from django.conf import settings
|
||||
from django.test import override_settings
|
||||
from django.test import TestCase
|
||||
@ -218,6 +219,66 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(pdf_file, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
|
||||
def test_scan_file_for_separating_barcodes_pillow_transcode_error(self):
    """
    GIVEN:
        - A PDF containing an image which cannot be transcoded to a PIL image
    WHEN:
        - The image tries to be transcoded to a PIL image, but fails
    THEN:
        - The barcode reader is still called, as the scan falls back to
          converting whole pages to images
    """

    def _build_device_n_pdf(self, save_path: str):
        # Writes a single-page PDF to save_path whose only image uses a
        # DeviceN colorspace.
        # Based on the pikepdf tests
        # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
        pdf = pikepdf.new()
        pdf.add_blank_page(page_size=(72, 72))
        imobj = pikepdf.Stream(
            pdf,
            bytes(range(0, 256)),
            BitsPerComponent=8,
            ColorSpace=pikepdf.Array(
                [
                    pikepdf.Name.DeviceN,
                    pikepdf.Array([pikepdf.Name.Black]),
                    pikepdf.Name.DeviceCMYK,
                    pikepdf.Stream(
                        pdf,
                        b"{0 0 0 4 -1 roll}",  # Colorspace conversion function
                        FunctionType=4,
                        Domain=[0.0, 1.0],
                        Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
                    ),
                ],
            ),
            Width=16,
            Height=16,
            Type=pikepdf.Name.XObject,
            Subtype=pikepdf.Name.Image,
        )
        # Sanity check: the image really is the DeviceN kind we intend
        pim = pikepdf.PdfImage(imobj)
        self.assertEqual(pim.mode, "DeviceN")
        self.assertTrue(pim.is_device_n)

        # Draw the image on the page and register it as a page resource
        pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
        pdf.pages[0].Resources = pikepdf.Dictionary(
            XObject=pikepdf.Dictionary(Im0=imobj),
        )
        pdf.save(save_path)

    # NamedTemporaryFile does not insert a dot before the suffix, so it
    # must be part of the suffix for the file to end in ".pdf"
    with tempfile.NamedTemporaryFile(suffix=".pdf") as device_n_pdf:
        # Build an offending file
        _build_device_n_pdf(self, device_n_pdf.name)
        with mock.patch("documents.barcodes.barcode_reader") as reader:
            reader.return_value = []

            barcodes.scan_file_for_separating_barcodes(device_n_pdf.name)

            reader.assert_called()
|
||||
|
||||
def test_scan_file_for_separating_qr_barcodes(self):
|
||||
test_file = os.path.join(
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
|
Loading…
x
Reference in New Issue
Block a user