mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	In case pikepdf fails to convert an image to a PIL image, fall back to converting pages to PIL images
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							@@ -57,6 +57,7 @@ celery = {extras = ["redis"], version = "*"}
 | 
			
		||||
django-celery-results = "*"
 | 
			
		||||
setproctitle = "*"
 | 
			
		||||
nltk = "*"
 | 
			
		||||
pdf2image = "*"
 | 
			
		||||
 | 
			
		||||
[dev-packages]
 | 
			
		||||
coveralls = "*"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										8
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										8
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							@@ -939,6 +939,14 @@
 | 
			
		||||
            "index": "pypi",
 | 
			
		||||
            "version": "==2.5.2"
 | 
			
		||||
        },
 | 
			
		||||
        "pdf2image": {
 | 
			
		||||
            "hashes": [
 | 
			
		||||
                "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65",
 | 
			
		||||
                "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb"
 | 
			
		||||
            ],
 | 
			
		||||
            "index": "pypi",
 | 
			
		||||
            "version": "==1.16.0"
 | 
			
		||||
        },
 | 
			
		||||
        "pdfminer.six": {
 | 
			
		||||
            "hashes": [
 | 
			
		||||
                "sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2",
 | 
			
		||||
 
 | 
			
		||||
@@ -9,6 +9,7 @@ from typing import Tuple
 | 
			
		||||
 | 
			
		||||
import magic
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from pdf2image import convert_from_path
 | 
			
		||||
from pikepdf import Page
 | 
			
		||||
from pikepdf import Pdf
 | 
			
		||||
from pikepdf import PdfImage
 | 
			
		||||
@@ -108,6 +109,30 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
			
		||||
    which separate the file into new files
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def _pikepdf_barcode_scan(pdf_filepath: str):
 | 
			
		||||
        with Pdf.open(pdf_filepath) as pdf:
 | 
			
		||||
            for page_num, page in enumerate(pdf.pages):
 | 
			
		||||
                for image_key in page.images:
 | 
			
		||||
                    pdfimage = PdfImage(page.images[image_key])
 | 
			
		||||
 | 
			
		||||
                    # Not all images can be transcoded to a PIL image, which
 | 
			
		||||
                    # is what pyzbar expects to receive
 | 
			
		||||
                    pillow_img = pdfimage.as_pil_image()
 | 
			
		||||
 | 
			
		||||
                    detected_barcodes = barcode_reader(pillow_img)
 | 
			
		||||
 | 
			
		||||
                    if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
 | 
			
		||||
                        separator_page_numbers.append(page_num)
 | 
			
		||||
 | 
			
		||||
    def _pdf2image_barcode_scan(pdf_filepath: str):
 | 
			
		||||
        # Use a temporary directory in case the file is too big to handle in memory
 | 
			
		||||
        with tempfile.TemporaryDirectory() as path:
 | 
			
		||||
            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
 | 
			
		||||
            for current_page_number, page in enumerate(pages_from_path):
 | 
			
		||||
                current_barcodes = barcode_reader(page)
 | 
			
		||||
                if settings.CONSUMER_BARCODE_STRING in current_barcodes:
 | 
			
		||||
                    separator_page_numbers.append(current_page_number)
 | 
			
		||||
 | 
			
		||||
    separator_page_numbers = []
 | 
			
		||||
    pdf_filepath = None
 | 
			
		||||
 | 
			
		||||
@@ -118,17 +143,17 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
			
		||||
        if mime_type == "image/tiff":
 | 
			
		||||
            pdf_filepath = convert_from_tiff_to_pdf(filepath)
 | 
			
		||||
 | 
			
		||||
        pdf = Pdf.open(pdf_filepath)
 | 
			
		||||
        try:
 | 
			
		||||
            _pikepdf_barcode_scan(pdf_filepath)
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
 | 
			
		||||
        for page_num, page in enumerate(pdf.pages):
 | 
			
		||||
            for image_key in page.images:
 | 
			
		||||
                pdfimage = PdfImage(page.images[image_key])
 | 
			
		||||
                pillow_img = pdfimage.as_pil_image()
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
 | 
			
		||||
            )
 | 
			
		||||
            # Reset this in case pikepdf got part way through
 | 
			
		||||
            separator_page_numbers = []
 | 
			
		||||
            _pdf2image_barcode_scan(pdf_filepath)
 | 
			
		||||
 | 
			
		||||
                detected_barcodes = barcode_reader(pillow_img)
 | 
			
		||||
 | 
			
		||||
                if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
 | 
			
		||||
                    separator_page_numbers.append(page_num)
 | 
			
		||||
    else:
 | 
			
		||||
        logger.warning(
 | 
			
		||||
            f"Unsupported file format for barcode reader: {str(mime_type)}",
 | 
			
		||||
 
 | 
			
		||||
@@ -3,6 +3,7 @@ import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
from unittest import mock
 | 
			
		||||
 | 
			
		||||
import pikepdf
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.test import override_settings
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
@@ -218,6 +219,66 @@ class TestBarcode(DirectoriesMixin, TestCase):
 | 
			
		||||
        self.assertEqual(pdf_file, test_file)
 | 
			
		||||
        self.assertListEqual(separator_page_numbers, [1])
 | 
			
		||||
 | 
			
		||||
    def test_scan_file_for_separating_barcodes_pillow_transcode_error(self):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
            - A PDF containing an image which cannot be transcoded to a PIL image
 | 
			
		||||
        WHEN:
 | 
			
		||||
            - The image tries to be transcoded to a PIL image, but fails
 | 
			
		||||
        THEN:
 | 
			
		||||
            - The barcode reader is still called, as the scan falls back to pdf2image
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        def _build_device_n_pdf(self, save_path: str):
 | 
			
		||||
            # Based on the pikepdf tests
 | 
			
		||||
            # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
 | 
			
		||||
            pdf = pikepdf.new()
 | 
			
		||||
            pdf.add_blank_page(page_size=(72, 72))
 | 
			
		||||
            imobj = pikepdf.Stream(
 | 
			
		||||
                pdf,
 | 
			
		||||
                bytes(range(0, 256)),
 | 
			
		||||
                BitsPerComponent=8,
 | 
			
		||||
                ColorSpace=pikepdf.Array(
 | 
			
		||||
                    [
 | 
			
		||||
                        pikepdf.Name.DeviceN,
 | 
			
		||||
                        pikepdf.Array([pikepdf.Name.Black]),
 | 
			
		||||
                        pikepdf.Name.DeviceCMYK,
 | 
			
		||||
                        pikepdf.Stream(
 | 
			
		||||
                            pdf,
 | 
			
		||||
                            b"{0 0 0 4 -1 roll}",  # Colorspace conversion function
 | 
			
		||||
                            FunctionType=4,
 | 
			
		||||
                            Domain=[0.0, 1.0],
 | 
			
		||||
                            Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
 | 
			
		||||
                        ),
 | 
			
		||||
                    ],
 | 
			
		||||
                ),
 | 
			
		||||
                Width=16,
 | 
			
		||||
                Height=16,
 | 
			
		||||
                Type=pikepdf.Name.XObject,
 | 
			
		||||
                Subtype=pikepdf.Name.Image,
 | 
			
		||||
            )
 | 
			
		||||
            pim = pikepdf.PdfImage(imobj)
 | 
			
		||||
            self.assertEqual(pim.mode, "DeviceN")
 | 
			
		||||
            self.assertTrue(pim.is_device_n)
 | 
			
		||||
 | 
			
		||||
            pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
 | 
			
		||||
            pdf.pages[0].Resources = pikepdf.Dictionary(
 | 
			
		||||
                XObject=pikepdf.Dictionary(Im0=imobj),
 | 
			
		||||
            )
 | 
			
		||||
            pdf.save(save_path)
 | 
			
		||||
 | 
			
		||||
        with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
 | 
			
		||||
            # Build an offending file
 | 
			
		||||
            _build_device_n_pdf(self, str(device_n_pdf.name))
 | 
			
		||||
            with mock.patch("documents.barcodes.barcode_reader") as reader:
 | 
			
		||||
                reader.return_value = list()
 | 
			
		||||
 | 
			
		||||
                _, _ = barcodes.scan_file_for_separating_barcodes(
 | 
			
		||||
                    str(device_n_pdf.name),
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
                reader.assert_called()
 | 
			
		||||
 | 
			
		||||
    def test_scan_file_for_separating_qr_barcodes(self):
 | 
			
		||||
        test_file = os.path.join(
 | 
			
		||||
            self.BARCODE_SAMPLE_DIR,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user