mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Always use pikepdf, then pdf2image if needed to check for barcodes instead of requiring/allowing configuration
This commit is contained in:
		| @@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool> | ||||
|  | ||||
|     Defaults to false. | ||||
|  | ||||
| PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool> | ||||
|     Enables the legacy method of detecting barcodes.  By default, images are | ||||
|     extracted directly from the PDF structure for barcode detection.  If this | ||||
|     configuration value is set, images of the whole PDF page will be used instead. | ||||
|  | ||||
|     This is a slower and more memory intensive process, but may be required for | ||||
|     certain files, depending on how it is produced and how images are encoded. | ||||
|  | ||||
|     Defaults to false. | ||||
|  | ||||
|  | ||||
| PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool> | ||||
|     Whether TIFF image files should be scanned for barcodes. | ||||
|   | ||||
| @@ -10,12 +10,10 @@ from typing import Tuple | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from pdf2image import convert_from_path | ||||
| from pdf2image.exceptions import PDFPageCountError | ||||
| from pikepdf import Page | ||||
| from pikepdf import PasswordError | ||||
| from pikepdf import Pdf | ||||
| from pikepdf import PdfImage | ||||
| from pikepdf.models.image import HifiPrintImageNotTranscodableError | ||||
| from PIL import Image | ||||
| from PIL import ImageSequence | ||||
| from pyzbar import pyzbar | ||||
| @@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str: | ||||
|                 images[0].save(newpath) | ||||
|             else: | ||||
|                 images[0].save(newpath, save_all=True, append_images=images[1:]) | ||||
|         except OSError as e: | ||||
|         except OSError as e:  # pragma: no cover | ||||
|             logger.warning( | ||||
|                 f"Could not save the file as pdf. Error: {str(e)}", | ||||
|             ) | ||||
| @@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|                 for image_key in page.images: | ||||
|                     pdfimage = PdfImage(page.images[image_key]) | ||||
|  | ||||
|                     # This type is known to have issues: | ||||
|                     # https://github.com/pikepdf/pikepdf/issues/401 | ||||
|                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||
|                         raise BarcodeImageFormatError( | ||||
|                             "Unable to decode CCITTFaxDecode images", | ||||
|                         ) | ||||
|  | ||||
|                     # Not all images can be transcoded to a PIL image, which | ||||
|                     # is what pyzbar expects to receive | ||||
|                     # is what pyzbar expects to receive, so this may | ||||
|                     # raise an exception, triggering fallback | ||||
|                     pillow_img = pdfimage.as_pil_image() | ||||
|  | ||||
|                     detected_barcodes = barcode_reader(pillow_img) | ||||
| @@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|         if mime_type == "image/tiff": | ||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||
|  | ||||
|         # Chose the scanner | ||||
|         if settings.CONSUMER_USE_LEGACY_DETECTION: | ||||
|             logger.debug("Using pdf2image for barcodes") | ||||
|             scanner_function = _pdf2image_barcode_scan | ||||
|         else: | ||||
|             logger.debug("Using pikepdf for barcodes") | ||||
|             scanner_function = _pikepdf_barcode_scan | ||||
|  | ||||
|         # Run the scanner | ||||
|         # Always try pikepdf first, it's usually fine, faster and | ||||
|         # uses less memory | ||||
|         try: | ||||
|             scanner_function(pdf_filepath) | ||||
|         # Neither method can handle password protected PDFs without it being | ||||
|         # provided.  Log it and continue | ||||
|         except (PasswordError, PDFPageCountError) as e: | ||||
|             _pikepdf_barcode_scan(pdf_filepath) | ||||
|         # Password protected files can't be checked | ||||
|         except PasswordError as e: | ||||
|             logger.warning( | ||||
|                 f"File is likely password protected, not splitting: {e}", | ||||
|                 f"File is likely password protected, not checking for barcodes: {e}", | ||||
|             ) | ||||
|         # Handle pikepdf related image decoding issues with a fallback | ||||
|         except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: | ||||
|         # Handle pikepdf related image decoding issues with a fallback to page | ||||
|         # by page conversion to images in a temporary directory | ||||
|         except Exception as e: | ||||
|             logger.warning( | ||||
|                 f"Falling back to pdf2image because: {e}", | ||||
|             ) | ||||
|             try: | ||||
|                 # Clear the list in case some processing worked | ||||
|                 separator_page_numbers = [] | ||||
|                 _pdf2image_barcode_scan(pdf_filepath) | ||||
|             # This file is really borked, allow the consumption to continue | ||||
| @@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|                 logger.warning( | ||||
|                     f"Exception during barcode scanning: {e}", | ||||
|                 ) | ||||
|         # We're not sure what happened, but allow the consumption to continue | ||||
|         except Exception as e:  # pragma: no cover | ||||
|             logger.warning( | ||||
|                 f"Exception during barcode scanning: {e}", | ||||
|             ) | ||||
|  | ||||
|     else: | ||||
|         logger.warning( | ||||
|   | ||||
| @@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase): | ||||
|         self.assertTrue(os.path.isfile(target_file1)) | ||||
|         self.assertTrue(os.path.isfile(target_file2)) | ||||
|  | ||||
|     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) | ||||
|     def test_barcode_splitter_legacy_fallback(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File containing barcode | ||||
|             - Legacy method of detection is enabled | ||||
|         WHEN: | ||||
|             - File is scanned for barcodes | ||||
|         THEN: | ||||
|             - Barcodes are properly detected | ||||
|         """ | ||||
|         test_file = os.path.join( | ||||
|             self.BARCODE_SAMPLE_DIR, | ||||
|             "patch-code-t-middle.pdf", | ||||
|         ) | ||||
|         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||
|             test_file, | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(test_file, pdf_file) | ||||
|         self.assertTrue(len(separator_page_numbers) > 0) | ||||
|  | ||||
|         document_list = barcodes.separate_pages(test_file, separator_page_numbers) | ||||
|         self.assertTrue(document_list) | ||||
|         for document in document_list: | ||||
|             barcodes.save_to_dir(document, target_dir=tempdir) | ||||
|  | ||||
|         target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") | ||||
|         target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(target_file1)) | ||||
|         self.assertTrue(os.path.isfile(target_file2)) | ||||
|  | ||||
|     @override_settings(CONSUMER_ENABLE_BARCODES=True) | ||||
|     def test_consume_barcode_file(self): | ||||
|         test_file = os.path.join( | ||||
| @@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase): | ||||
|         with mock.patch("documents.tasks.async_to_sync"): | ||||
|             self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||
|  | ||||
|     def test_scan_file_for_separating_barcodes_password_pikepdf(self): | ||||
|     def test_scan_file_for_separating_barcodes_password(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Password protected PDF | ||||
| @@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertEqual(pdf_file, test_file) | ||||
|         self.assertListEqual(separator_page_numbers, []) | ||||
|  | ||||
|     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) | ||||
|     def test_scan_file_for_separating_barcodes_password_pdf2image(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Password protected PDF | ||||
|             - pdf2image based scanning | ||||
|         WHEN: | ||||
|             - File is scanned for barcode | ||||
|         THEN: | ||||
|             - Scanning handle the exception without exception | ||||
|         """ | ||||
|         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") | ||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||
|             test_file, | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(pdf_file, test_file) | ||||
|         self.assertListEqual(separator_page_numbers, []) | ||||
|   | ||||
| @@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean( | ||||
|     "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", | ||||
| ) | ||||
|  | ||||
| CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean( | ||||
|     "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION", | ||||
|     "NO", | ||||
| ) | ||||
|  | ||||
| CONSUMER_BARCODE_STRING: Final[str] = os.getenv( | ||||
|     "PAPERLESS_CONSUMER_BARCODE_STRING", | ||||
|     "PATCHT", | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H