mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
This commit is contained in:
		
							
								
								
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @@ -226,7 +226,7 @@ | |||||||
|                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", |                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", | ||||||
|                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" |                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" | ||||||
|             ], |             ], | ||||||
|             "markers": "python_full_version >= '3.6.0'", |             "markers": "python_version >= '3.6'", | ||||||
|             "version": "==2.1.1" |             "version": "==2.1.1" | ||||||
|         }, |         }, | ||||||
|         "click": { |         "click": { | ||||||
| @@ -242,7 +242,7 @@ | |||||||
|                 "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", |                 "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", | ||||||
|                 "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" |                 "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" | ||||||
|             ], |             ], | ||||||
|             "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'", |             "markers": "python_version < '4' and python_full_version >= '3.6.2'", | ||||||
|             "version": "==0.3.0" |             "version": "==0.3.0" | ||||||
|         }, |         }, | ||||||
|         "click-plugins": { |         "click-plugins": { | ||||||
| @@ -2191,7 +2191,7 @@ | |||||||
|                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", |                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", | ||||||
|                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" |                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" | ||||||
|             ], |             ], | ||||||
|             "markers": "python_full_version >= '3.6.0'", |             "markers": "python_version >= '3.6'", | ||||||
|             "version": "==2.1.1" |             "version": "==2.1.1" | ||||||
|         }, |         }, | ||||||
|         "click": { |         "click": { | ||||||
| @@ -2211,6 +2211,9 @@ | |||||||
|             "version": "==0.4.5" |             "version": "==0.4.5" | ||||||
|         }, |         }, | ||||||
|         "coverage": { |         "coverage": { | ||||||
|  |             "extras": [ | ||||||
|  |                 "toml" | ||||||
|  |             ], | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", |                 "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", | ||||||
|                 "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", |                 "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", | ||||||
| @@ -2785,7 +2788,7 @@ | |||||||
|                 "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", |                 "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", | ||||||
|                 "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" |                 "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" | ||||||
|             ], |             ], | ||||||
|             "markers": "python_full_version < '3.11.0a7'", |             "markers": "python_version >= '3.7'", | ||||||
|             "version": "==2.0.1" |             "version": "==2.0.1" | ||||||
|         }, |         }, | ||||||
|         "tornado": { |         "tornado": { | ||||||
|   | |||||||
| @@ -10,9 +10,12 @@ from typing import Tuple | |||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from pdf2image import convert_from_path | from pdf2image import convert_from_path | ||||||
|  | from pdf2image.exceptions import PDFPageCountError | ||||||
| from pikepdf import Page | from pikepdf import Page | ||||||
|  | from pikepdf import PasswordError | ||||||
| from pikepdf import Pdf | from pikepdf import Pdf | ||||||
| from pikepdf import PdfImage | from pikepdf import PdfImage | ||||||
|  | from pikepdf.models.image import HifiPrintImageNotTranscodableError | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from PIL import ImageSequence | from PIL import ImageSequence | ||||||
| from pyzbar import pyzbar | from pyzbar import pyzbar | ||||||
| @@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|                     pdfimage = PdfImage(page.images[image_key]) |                     pdfimage = PdfImage(page.images[image_key]) | ||||||
|  |  | ||||||
|                     if "/CCITTFaxDecode" in pdfimage.filters: |                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||||
|                         raise BarcodeImageFormatError() |                         raise BarcodeImageFormatError( | ||||||
|  |                             "Unable to decode CCITTFaxDecode images", | ||||||
|  |                         ) | ||||||
|  |  | ||||||
|                     # Not all images can be transcoded to a PIL image, which |                     # Not all images can be transcoded to a PIL image, which | ||||||
|                     # is what pyzbar expects to receive |                     # is what pyzbar expects to receive | ||||||
| @@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|                         separator_page_numbers.append(page_num) |                         separator_page_numbers.append(page_num) | ||||||
|  |  | ||||||
|     def _pdf2image_barcode_scan(pdf_filepath: str): |     def _pdf2image_barcode_scan(pdf_filepath: str): | ||||||
|         # use a temporary directory in case the file os too big to handle in memory |         # use a temporary directory in case the file is too big to handle in memory | ||||||
|         with tempfile.TemporaryDirectory() as path: |         with tempfile.TemporaryDirectory() as path: | ||||||
|             pages_from_path = convert_from_path(pdf_filepath, output_folder=path) |             pages_from_path = convert_from_path(pdf_filepath, output_folder=path) | ||||||
|             for current_page_number, page in enumerate(pages_from_path): |             for current_page_number, page in enumerate(pages_from_path): | ||||||
| @@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|         if mime_type == "image/tiff": |         if mime_type == "image/tiff": | ||||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) |             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||||
|  |  | ||||||
|  |         # Choose the scanner | ||||||
|         if settings.CONSUMER_USE_LEGACY_DETECTION: |         if settings.CONSUMER_USE_LEGACY_DETECTION: | ||||||
|             _pdf2image_barcode_scan(pdf_filepath) |             logger.debug("Using pdf2image for barcodes") | ||||||
|  |             scanner_function = _pdf2image_barcode_scan | ||||||
|         else: |         else: | ||||||
|             try: |             logger.debug("Using pikepdf for barcodes") | ||||||
|                 _pikepdf_barcode_scan(pdf_filepath) |             scanner_function = _pikepdf_barcode_scan | ||||||
|             except Exception as e: |  | ||||||
|  |  | ||||||
|                 logger.warning( |         # Run the scanner | ||||||
|                     f"Exception using pikepdf for barcodes," |         try: | ||||||
|                     f" falling back to pdf2image: {e}", |             scanner_function(pdf_filepath) | ||||||
|                 ) |         # Neither method can handle password protected PDFs without it being | ||||||
|                 # Reset this incase pikepdf got part way through |         # provided.  Log it and continue | ||||||
|  |         except (PasswordError, PDFPageCountError) as e: | ||||||
|  |             logger.warning( | ||||||
|  |                 f"File is likely password protected, not splitting: {e}", | ||||||
|  |             ) | ||||||
|  |         # Handle pikepdf related image decoding issues with a fallback | ||||||
|  |         except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: | ||||||
|  |             logger.warning( | ||||||
|  |                 f"Falling back to pdf2image because: {e}", | ||||||
|  |             ) | ||||||
|  |             try: | ||||||
|                 separator_page_numbers = [] |                 separator_page_numbers = [] | ||||||
|                 _pdf2image_barcode_scan(pdf_filepath) |                 _pdf2image_barcode_scan(pdf_filepath) | ||||||
|  |             # This file is really borked, allow the consumption to continue | ||||||
|  |             # but it may fail further on | ||||||
|  |             except Exception as e:  # pragma: no cover | ||||||
|  |                 logger.warning( | ||||||
|  |                     f"Exception during barcode scanning: {e}", | ||||||
|  |                 ) | ||||||
|  |         # We're not sure what happened, but allow the consumption to continue | ||||||
|  |         except Exception as e:  # pragma: no cover | ||||||
|  |             logger.warning( | ||||||
|  |                 f"Exception during barcode scanning: {e}", | ||||||
|  |             ) | ||||||
|  |  | ||||||
|     else: |     else: | ||||||
|         logger.warning( |         logger.warning( | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|         self.assertEqual(pdf_file, test_file) |         self.assertEqual(pdf_file, test_file) | ||||||
|         self.assertListEqual(separator_page_numbers, [0]) |         self.assertListEqual(separator_page_numbers, [0]) | ||||||
|  |  | ||||||
|     def test_scan_file_for_separating_barcodes2(self): |     def test_scan_file_for_separating_barcodes_none_present(self): | ||||||
|         test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") |         test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") | ||||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( |         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||||
|             test_file, |             test_file, | ||||||
| @@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         with mock.patch("documents.tasks.async_to_sync"): |         with mock.patch("documents.tasks.async_to_sync"): | ||||||
|             self.assertEqual(tasks.consume_file(dst), "File successfully split") |             self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes_password_pikepdf(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Password protected PDF | ||||||
|  |             - pikepdf based scanning | ||||||
|  |         WHEN: | ||||||
|  |             - File is scanned for barcode | ||||||
|  |         THEN: | ||||||
|  |             - Scanning handles the password error without raising an exception | ||||||
|  |         """ | ||||||
|  |         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") | ||||||
|  |         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||||
|  |             test_file, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         self.assertEqual(pdf_file, test_file) | ||||||
|  |         self.assertListEqual(separator_page_numbers, []) | ||||||
|  |  | ||||||
|  |     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) | ||||||
|  |     def test_scan_file_for_separating_barcodes_password_pdf2image(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Password protected PDF | ||||||
|  |             - pdf2image based scanning | ||||||
|  |         WHEN: | ||||||
|  |             - File is scanned for barcode | ||||||
|  |         THEN: | ||||||
|  |             - Scanning handles the password error without raising an exception | ||||||
|  |         """ | ||||||
|  |         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") | ||||||
|  |         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||||
|  |             test_file, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         self.assertEqual(pdf_file, test_file) | ||||||
|  |         self.assertListEqual(separator_page_numbers, []) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 phail
					phail