mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-28 03:46:06 -05:00 
			
		
		
		
	Always use pikepdf, then pdf2image if needed, to check for barcodes instead of requiring/allowing configuration
This commit is contained in:
		| @@ -701,16 +701,6 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool> | |||||||
|  |  | ||||||
|     Defaults to false. |     Defaults to false. | ||||||
|  |  | ||||||
| PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool> |  | ||||||
|     Enables the legacy method of detecting barcodes.  By default, images are |  | ||||||
|     extracted directly from the PDF structure for barcode detection.  If this |  | ||||||
|     configuration value is set, images of the whole PDF page will be used instead. |  | ||||||
|  |  | ||||||
|     This is a slower and more memory intensive process, but may be required for |  | ||||||
|     certain files, depending on how they are produced and how their images are encoded. |  | ||||||
|  |  | ||||||
|     Defaults to false. |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool> | PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool> | ||||||
|     Whether TIFF image files should be scanned for barcodes. |     Whether TIFF image files should be scanned for barcodes. | ||||||
|   | |||||||
| @@ -10,12 +10,10 @@ from typing import Tuple | |||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from pdf2image import convert_from_path | from pdf2image import convert_from_path | ||||||
| from pdf2image.exceptions import PDFPageCountError |  | ||||||
| from pikepdf import Page | from pikepdf import Page | ||||||
| from pikepdf import PasswordError | from pikepdf import PasswordError | ||||||
| from pikepdf import Pdf | from pikepdf import Pdf | ||||||
| from pikepdf import PdfImage | from pikepdf import PdfImage | ||||||
| from pikepdf.models.image import HifiPrintImageNotTranscodableError |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from PIL import ImageSequence | from PIL import ImageSequence | ||||||
| from pyzbar import pyzbar | from pyzbar import pyzbar | ||||||
| @@ -101,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str: | |||||||
|                 images[0].save(newpath) |                 images[0].save(newpath) | ||||||
|             else: |             else: | ||||||
|                 images[0].save(newpath, save_all=True, append_images=images[1:]) |                 images[0].save(newpath, save_all=True, append_images=images[1:]) | ||||||
|         except OSError as e: |         except OSError as e:  # pragma: no cover | ||||||
|             logger.warning( |             logger.warning( | ||||||
|                 f"Could not save the file as pdf. Error: {str(e)}", |                 f"Could not save the file as pdf. Error: {str(e)}", | ||||||
|             ) |             ) | ||||||
| @@ -122,13 +120,16 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|                 for image_key in page.images: |                 for image_key in page.images: | ||||||
|                     pdfimage = PdfImage(page.images[image_key]) |                     pdfimage = PdfImage(page.images[image_key]) | ||||||
|  |  | ||||||
|  |                     # This type is known to have issues: | ||||||
|  |                     # https://github.com/pikepdf/pikepdf/issues/401 | ||||||
|                     if "/CCITTFaxDecode" in pdfimage.filters: |                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||||
|                         raise BarcodeImageFormatError( |                         raise BarcodeImageFormatError( | ||||||
|                             "Unable to decode CCITTFaxDecode images", |                             "Unable to decode CCITTFaxDecode images", | ||||||
|                         ) |                         ) | ||||||
|  |  | ||||||
|                     # Not all images can be transcoded to a PIL image, which |                     # Not all images can be transcoded to a PIL image, which | ||||||
|                     # is what pyzbar expects to receive |                     # is what pyzbar expects to receive, so this may | ||||||
|  |                     # raise an exception, triggering fallback | ||||||
|                     pillow_img = pdfimage.as_pil_image() |                     pillow_img = pdfimage.as_pil_image() | ||||||
|  |  | ||||||
|                     detected_barcodes = barcode_reader(pillow_img) |                     detected_barcodes = barcode_reader(pillow_img) | ||||||
| @@ -155,29 +156,23 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|         if mime_type == "image/tiff": |         if mime_type == "image/tiff": | ||||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) |             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||||
|  |  | ||||||
|         # Choose the scanner |         # Always try pikepdf first, it's usually fine, faster and | ||||||
|         if settings.CONSUMER_USE_LEGACY_DETECTION: |         # uses less memory | ||||||
|             logger.debug("Using pdf2image for barcodes") |  | ||||||
|             scanner_function = _pdf2image_barcode_scan |  | ||||||
|         else: |  | ||||||
|             logger.debug("Using pikepdf for barcodes") |  | ||||||
|             scanner_function = _pikepdf_barcode_scan |  | ||||||
|  |  | ||||||
|         # Run the scanner |  | ||||||
|         try: |         try: | ||||||
|             scanner_function(pdf_filepath) |             _pikepdf_barcode_scan(pdf_filepath) | ||||||
|         # Neither method can handle password protected PDFs without it being |         # Password protected files can't be checked | ||||||
|         # provided.  Log it and continue |         except PasswordError as e: | ||||||
|         except (PasswordError, PDFPageCountError) as e: |  | ||||||
|             logger.warning( |             logger.warning( | ||||||
|                 f"File is likely password protected, not splitting: {e}", |                 f"File is likely password protected, not checking for barcodes: {e}", | ||||||
|             ) |             ) | ||||||
|         # Handle pikepdf related image decoding issues with a fallback |         # Handle pikepdf related image decoding issues with a fallback to page | ||||||
|         except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: |         # by page conversion to images in a temporary directory | ||||||
|  |         except Exception as e: | ||||||
|             logger.warning( |             logger.warning( | ||||||
|                 f"Falling back to pdf2image because: {e}", |                 f"Falling back to pdf2image because: {e}", | ||||||
|             ) |             ) | ||||||
|             try: |             try: | ||||||
|  |                 # Clear the list in case some processing worked | ||||||
|                 separator_page_numbers = [] |                 separator_page_numbers = [] | ||||||
|                 _pdf2image_barcode_scan(pdf_filepath) |                 _pdf2image_barcode_scan(pdf_filepath) | ||||||
|             # This file is really borked, allow the consumption to continue |             # This file is really borked, allow the consumption to continue | ||||||
| @@ -186,11 +181,6 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|                 logger.warning( |                 logger.warning( | ||||||
|                     f"Exception during barcode scanning: {e}", |                     f"Exception during barcode scanning: {e}", | ||||||
|                 ) |                 ) | ||||||
|         # We're not sure what happened, but allow the consumption to continue |  | ||||||
|         except Exception as e:  # pragma: no cover |  | ||||||
|             logger.warning( |  | ||||||
|                 f"Exception during barcode scanning: {e}", |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     else: |     else: | ||||||
|         logger.warning( |         logger.warning( | ||||||
|   | |||||||
| @@ -468,41 +468,6 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|         self.assertTrue(os.path.isfile(target_file1)) |         self.assertTrue(os.path.isfile(target_file1)) | ||||||
|         self.assertTrue(os.path.isfile(target_file2)) |         self.assertTrue(os.path.isfile(target_file2)) | ||||||
|  |  | ||||||
|     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) |  | ||||||
|     def test_barcode_splitter_legacy_fallback(self): |  | ||||||
|         """ |  | ||||||
|         GIVEN: |  | ||||||
|             - File containing barcode |  | ||||||
|             - Legacy method of detection is enabled |  | ||||||
|         WHEN: |  | ||||||
|             - File is scanned for barcodes |  | ||||||
|         THEN: |  | ||||||
|             - Barcodes are properly detected |  | ||||||
|         """ |  | ||||||
|         test_file = os.path.join( |  | ||||||
|             self.BARCODE_SAMPLE_DIR, |  | ||||||
|             "patch-code-t-middle.pdf", |  | ||||||
|         ) |  | ||||||
|         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) |  | ||||||
|  |  | ||||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( |  | ||||||
|             test_file, |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         self.assertEqual(test_file, pdf_file) |  | ||||||
|         self.assertTrue(len(separator_page_numbers) > 0) |  | ||||||
|  |  | ||||||
|         document_list = barcodes.separate_pages(test_file, separator_page_numbers) |  | ||||||
|         self.assertTrue(document_list) |  | ||||||
|         for document in document_list: |  | ||||||
|             barcodes.save_to_dir(document, target_dir=tempdir) |  | ||||||
|  |  | ||||||
|         target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") |  | ||||||
|         target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") |  | ||||||
|  |  | ||||||
|         self.assertTrue(os.path.isfile(target_file1)) |  | ||||||
|         self.assertTrue(os.path.isfile(target_file2)) |  | ||||||
|  |  | ||||||
|     @override_settings(CONSUMER_ENABLE_BARCODES=True) |     @override_settings(CONSUMER_ENABLE_BARCODES=True) | ||||||
|     def test_consume_barcode_file(self): |     def test_consume_barcode_file(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
| @@ -586,7 +551,7 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|         with mock.patch("documents.tasks.async_to_sync"): |         with mock.patch("documents.tasks.async_to_sync"): | ||||||
|             self.assertEqual(tasks.consume_file(dst), "File successfully split") |             self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
|  |  | ||||||
|     def test_scan_file_for_separating_barcodes_password_pikepdf(self): |     def test_scan_file_for_separating_barcodes_password(self): | ||||||
|         """ |         """ | ||||||
|         GIVEN: |         GIVEN: | ||||||
|             - Password protected PDF |             - Password protected PDF | ||||||
| @@ -603,22 +568,3 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.assertEqual(pdf_file, test_file) |         self.assertEqual(pdf_file, test_file) | ||||||
|         self.assertListEqual(separator_page_numbers, []) |         self.assertListEqual(separator_page_numbers, []) | ||||||
|  |  | ||||||
|     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) |  | ||||||
|     def test_scan_file_for_separating_barcodes_password_pdf2image(self): |  | ||||||
|         """ |  | ||||||
|         GIVEN: |  | ||||||
|             - Password protected PDF |  | ||||||
|             - pdf2image based scanning |  | ||||||
|         WHEN: |  | ||||||
|             - File is scanned for barcode |  | ||||||
|         THEN: |  | ||||||
|             - Scanning handles the error without raising an exception |  | ||||||
|         """ |  | ||||||
|         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") |  | ||||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( |  | ||||||
|             test_file, |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         self.assertEqual(pdf_file, test_file) |  | ||||||
|         self.assertListEqual(separator_page_numbers, []) |  | ||||||
|   | |||||||
| @@ -573,11 +573,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean( | |||||||
|     "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", |     "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", | ||||||
| ) | ) | ||||||
|  |  | ||||||
| CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean( |  | ||||||
|     "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION", |  | ||||||
|     "NO", |  | ||||||
| ) |  | ||||||
|  |  | ||||||
| CONSUMER_BARCODE_STRING: Final[str] = os.getenv( | CONSUMER_BARCODE_STRING: Final[str] = os.getenv( | ||||||
|     "PAPERLESS_CONSUMER_BARCODE_STRING", |     "PAPERLESS_CONSUMER_BARCODE_STRING", | ||||||
|     "PATCHT", |     "PATCHT", | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H