Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-24 03:26:11 -05:00)
			
		
		
		
	Merge pull request #962 from paperless-ngx/add-MIME-pdf-check
fixes #949: change to MIME detection for files
This commit is contained in:
		| @@ -4,6 +4,7 @@ import shutil | |||||||
| import tempfile | import tempfile | ||||||
| from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used | from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used | ||||||
|  |  | ||||||
|  | import magic | ||||||
| import tqdm | import tqdm | ||||||
| from asgiref.sync import async_to_sync | from asgiref.sync import async_to_sync | ||||||
| from channels.layers import get_channel_layer | from channels.layers import get_channel_layer | ||||||
| @@ -95,19 +96,33 @@ def barcode_reader(image) -> List[str]: | |||||||
|     return barcodes |     return barcodes | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_file_type(path: str) -> str: | ||||||
|  |     """ | ||||||
|  |     Determines the file type, based on MIME type. | ||||||
|  |  | ||||||
|  |     Returns the MIME type. | ||||||
|  |     """ | ||||||
|  |     mime_type = magic.from_file(path, mime=True) | ||||||
|  |     logger.debug(f"Detected mime type: {mime_type}") | ||||||
|  |     return mime_type | ||||||
|  |  | ||||||
|  |  | ||||||
| def convert_from_tiff_to_pdf(filepath: str) -> str: | def convert_from_tiff_to_pdf(filepath: str) -> str: | ||||||
|     """ |     """ | ||||||
|     converts a given TIFF image file to pdf into a temp. directory. |     converts a given TIFF image file to pdf into a temporary directory. | ||||||
|  |  | ||||||
|     Returns the new pdf file. |     Returns the new pdf file. | ||||||
|     """ |     """ | ||||||
|     file_name = os.path.splitext(os.path.basename(filepath))[0] |     file_name = os.path.splitext(os.path.basename(filepath))[0] | ||||||
|     file_extension = os.path.splitext(os.path.basename(filepath))[1].lower() |     mime_type = get_file_type(filepath) | ||||||
|     tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) |     tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|     # use old file name with pdf extension |     # use old file name with pdf extension | ||||||
|     if file_extension == ".tif" or file_extension == ".tiff": |     if mime_type == "image/tiff": | ||||||
|         newpath = os.path.join(tempdir, file_name + ".pdf") |         newpath = os.path.join(tempdir, file_name + ".pdf") | ||||||
|     else: |     else: | ||||||
|         logger.warning(f"Cannot convert from {str(file_extension)} to pdf.") |         logger.warning( | ||||||
|  |             f"Cannot convert mime type {str(mime_type)} from {str(filepath)} to pdf.", | ||||||
|  |         ) | ||||||
|         return None |         return None | ||||||
|     with Image.open(filepath) as image: |     with Image.open(filepath) as image: | ||||||
|         images = [] |         images = [] | ||||||
| @@ -231,17 +246,17 @@ def consume_file( | |||||||
|         document_list = [] |         document_list = [] | ||||||
|         converted_tiff = None |         converted_tiff = None | ||||||
|         if settings.CONSUMER_BARCODE_TIFF_SUPPORT: |         if settings.CONSUMER_BARCODE_TIFF_SUPPORT: | ||||||
|             supported_extensions = [".pdf", ".tiff", ".tif"] |             supported_mime = ["image/tiff", "application/pdf"] | ||||||
|         else: |         else: | ||||||
|             supported_extensions = [".pdf"] |             supported_mime = ["application/pdf"] | ||||||
|         file_extension = os.path.splitext(os.path.basename(path))[1].lower() |         mime_type = get_file_type(path) | ||||||
|         if file_extension not in supported_extensions: |         if mime_type not in supported_mime: | ||||||
|             # if not supported, skip this routine |             # if not supported, skip this routine | ||||||
|             logger.warning( |             logger.warning( | ||||||
|                 f"Unsupported file format for barcode reader: {str(file_extension)}", |                 f"Unsupported file format for barcode reader: {str(mime_type)}", | ||||||
|             ) |             ) | ||||||
|         else: |         else: | ||||||
|             if file_extension in {".tif", ".tiff"}: |             if mime_type == "image/tiff": | ||||||
|                 file_to_process = convert_from_tiff_to_pdf(path) |                 file_to_process = convert_from_tiff_to_pdf(path) | ||||||
|             else: |             else: | ||||||
|                 file_to_process = path |                 file_to_process = path | ||||||
|   | |||||||
| @@ -204,6 +204,34 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         img = Image.open(test_file) |         img = Image.open(test_file) | ||||||
|         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) |         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) | ||||||
|  |  | ||||||
|  |     def test_get_mime_type(self): | ||||||
|  |         tiff_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "simple.tiff", | ||||||
|  |         ) | ||||||
|  |         pdf_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "simple.pdf", | ||||||
|  |         ) | ||||||
|  |         png_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-128-custom.png", | ||||||
|  |         ) | ||||||
|  |         tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1") | ||||||
|  |         pdf_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile2") | ||||||
|  |         shutil.copy(tiff_file, tiff_file_no_extension) | ||||||
|  |         shutil.copy(pdf_file, pdf_file_no_extension) | ||||||
|  |  | ||||||
|  |         self.assertEqual(tasks.get_file_type(tiff_file), "image/tiff") | ||||||
|  |         self.assertEqual(tasks.get_file_type(pdf_file), "application/pdf") | ||||||
|  |         self.assertEqual(tasks.get_file_type(tiff_file_no_extension), "image/tiff") | ||||||
|  |         self.assertEqual(tasks.get_file_type(pdf_file_no_extension), "application/pdf") | ||||||
|  |         self.assertEqual(tasks.get_file_type(png_file), "image/png") | ||||||
|  |  | ||||||
|     def test_convert_from_tiff_to_pdf(self): |     def test_convert_from_tiff_to_pdf(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             os.path.dirname(__file__), |             os.path.dirname(__file__), | ||||||
| @@ -469,7 +497,7 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
|             cm.output, |             cm.output, | ||||||
|             [ |             [ | ||||||
|                 "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg", |                 "WARNING:paperless.tasks:Unsupported file format for barcode reader: image/jpeg", | ||||||
|             ], |             ], | ||||||
|         ) |         ) | ||||||
|         m.assert_called_once() |         m.assert_called_once() | ||||||
| @@ -481,6 +509,26 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         self.assertIsNone(kwargs["override_document_type_id"]) |         self.assertIsNone(kwargs["override_document_type_id"]) | ||||||
|         self.assertIsNone(kwargs["override_tag_ids"]) |         self.assertIsNone(kwargs["override_tag_ids"]) | ||||||
|  |  | ||||||
|  |     @override_settings( | ||||||
|  |         CONSUMER_ENABLE_BARCODES=True, | ||||||
|  |         CONSUMER_BARCODE_TIFF_SUPPORT=True, | ||||||
|  |     ) | ||||||
|  |     def test_consume_barcode_supported_no_extension_file(self): | ||||||
|  |         """ | ||||||
|  |         This test assumes barcode and TIFF support are enabled and | ||||||
|  |         the user uploads a supported image file, but without extension | ||||||
|  |         """ | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.tiff", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
|  |  | ||||||
|     @mock.patch("documents.tasks.sanity_checker.check_sanity") |     @mock.patch("documents.tasks.sanity_checker.check_sanity") | ||||||
|     def test_sanity_check_success(self, m): |     def test_sanity_check_success(self, m): | ||||||
|         m.return_value = SanityCheckMessages() |         m.return_value = SanityCheckMessages() | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 shamoon
					shamoon