mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Fix: Allows pre-consume scripts to modify the working path again (#5260)
* Allows pre-consume scripts to modify the working path again and generally cleans up some confusion about working copy vs original
This commit is contained in:
		| @@ -135,24 +135,24 @@ class Consumer(LoggingMixin): | ||||
|         """ | ||||
|         Confirm the input file still exists where it should | ||||
|         """ | ||||
|         if not os.path.isfile(self.path): | ||||
|         if not os.path.isfile(self.original_path): | ||||
|             self._fail( | ||||
|                 ConsumerStatusShortMessage.FILE_NOT_FOUND, | ||||
|                 f"Cannot consume {self.path}: File not found.", | ||||
|                 f"Cannot consume {self.original_path}: File not found.", | ||||
|             ) | ||||
|  | ||||
|     def pre_check_duplicate(self): | ||||
|         """ | ||||
|         Using the MD5 of the file, check this exact file doesn't already exist | ||||
|         """ | ||||
|         with open(self.path, "rb") as f: | ||||
|         with open(self.original_path, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         existing_doc = Document.objects.filter( | ||||
|             Q(checksum=checksum) | Q(archive_checksum=checksum), | ||||
|         ) | ||||
|         if existing_doc.exists(): | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(self.path) | ||||
|                 os.unlink(self.original_path) | ||||
|             self._fail( | ||||
|                 ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS, | ||||
|                 f"Not consuming {self.filename}: It is a duplicate of" | ||||
| @@ -211,7 +211,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") | ||||
|  | ||||
|         working_file_path = str(self.path) | ||||
|         working_file_path = str(self.working_copy) | ||||
|         original_file_path = str(self.original_path) | ||||
|  | ||||
|         script_env = os.environ.copy() | ||||
| @@ -343,8 +343,8 @@ class Consumer(LoggingMixin): | ||||
|         Return the document object if it was successfully created. | ||||
|         """ | ||||
|  | ||||
|         self.path = Path(path).resolve() | ||||
|         self.filename = override_filename or self.path.name | ||||
|         self.original_path = Path(path).resolve() | ||||
|         self.filename = override_filename or self.original_path.name | ||||
|         self.override_title = override_title | ||||
|         self.override_correspondent_id = override_correspondent_id | ||||
|         self.override_document_type_id = override_document_type_id | ||||
| @@ -377,17 +377,16 @@ class Consumer(LoggingMixin): | ||||
|         self.log.info(f"Consuming {self.filename}") | ||||
|  | ||||
|         # For the actual work, copy the file into a tempdir | ||||
|         self.original_path = self.path | ||||
|         tempdir = tempfile.TemporaryDirectory( | ||||
|             prefix="paperless-ngx", | ||||
|             dir=settings.SCRATCH_DIR, | ||||
|         ) | ||||
|         self.path = Path(tempdir.name) / Path(self.filename) | ||||
|         copy_file_with_basic_stats(self.original_path, self.path) | ||||
|         self.working_copy = Path(tempdir.name) / Path(self.filename) | ||||
|         copy_file_with_basic_stats(self.original_path, self.working_copy) | ||||
|  | ||||
|         # Determine the parser class. | ||||
|  | ||||
|         mime_type = magic.from_file(self.path, mime=True) | ||||
|         mime_type = magic.from_file(self.working_copy, mime=True) | ||||
|  | ||||
|         self.log.debug(f"Detected mime type: {mime_type}") | ||||
|  | ||||
| @@ -406,7 +405,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         document_consumption_started.send( | ||||
|             sender=self.__class__, | ||||
|             filename=self.path, | ||||
|             filename=self.working_copy, | ||||
|             logging_group=self.logging_group, | ||||
|         ) | ||||
|  | ||||
| @@ -444,7 +443,7 @@ class Consumer(LoggingMixin): | ||||
|                 ConsumerStatusShortMessage.PARSING_DOCUMENT, | ||||
|             ) | ||||
|             self.log.debug(f"Parsing {self.filename}...") | ||||
|             document_parser.parse(self.path, mime_type, self.filename) | ||||
|             document_parser.parse(self.working_copy, mime_type, self.filename) | ||||
|  | ||||
|             self.log.debug(f"Generating thumbnail for {self.filename}...") | ||||
|             self._send_progress( | ||||
| @@ -454,7 +453,7 @@ class Consumer(LoggingMixin): | ||||
|                 ConsumerStatusShortMessage.GENERATING_THUMBNAIL, | ||||
|             ) | ||||
|             thumbnail = document_parser.get_thumbnail( | ||||
|                 self.path, | ||||
|                 self.working_copy, | ||||
|                 mime_type, | ||||
|                 self.filename, | ||||
|             ) | ||||
| @@ -527,7 +526,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|                     self._write( | ||||
|                         document.storage_type, | ||||
|                         self.original_path, | ||||
|                         self.working_copy, | ||||
|                         document.source_path, | ||||
|                     ) | ||||
|  | ||||
| @@ -560,9 +559,9 @@ class Consumer(LoggingMixin): | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
|                 self.log.debug(f"Deleting file {self.path}") | ||||
|                 os.unlink(self.path) | ||||
|                 self.log.debug(f"Deleting file {self.working_copy}") | ||||
|                 self.original_path.unlink() | ||||
|                 self.working_copy.unlink() | ||||
|  | ||||
|                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037 | ||||
|                 shadow_file = os.path.join( | ||||
| @@ -735,7 +734,7 @@ class Consumer(LoggingMixin): | ||||
|             )[:127], | ||||
|             content=text, | ||||
|             mime_type=mime_type, | ||||
|             checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(), | ||||
|             checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(), | ||||
|             created=create_date, | ||||
|             modified=create_date, | ||||
|             storage_type=storage_type, | ||||
|   | ||||
| @@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin): | ||||
|         self.logging_group = logging_group | ||||
|         self.settings = self.get_settings() | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|         self.tempdir = Path( | ||||
|             tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), | ||||
|         ) | ||||
|  | ||||
|         self.archive_path = None | ||||
|         self.text = None | ||||
|   | ||||
| @@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase): | ||||
|     @override_settings(PRE_CONSUME_SCRIPT=None) | ||||
|     def test_no_pre_consume_script(self, m): | ||||
|         c = Consumer() | ||||
|         c.path = "path-to-file" | ||||
|         c.working_copy = "path-to-file" | ||||
|         c.run_pre_consume_script() | ||||
|         m.assert_not_called() | ||||
|  | ||||
| @@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase): | ||||
|     def test_pre_consume_script_not_found(self, m, m2): | ||||
|         c = Consumer() | ||||
|         c.filename = "somefile.pdf" | ||||
|         c.path = "path-to-file" | ||||
|         c.working_copy = "path-to-file" | ||||
|         self.assertRaises(ConsumerError, c.run_pre_consume_script) | ||||
|  | ||||
|     @mock.patch("documents.consumer.run") | ||||
| @@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase): | ||||
|             with override_settings(PRE_CONSUME_SCRIPT=script.name): | ||||
|                 c = Consumer() | ||||
|                 c.original_path = "path-to-file" | ||||
|                 c.path = "/tmp/somewhere/path-to-file" | ||||
|                 c.working_copy = "/tmp/somewhere/path-to-file" | ||||
|                 c.task_id = str(uuid.uuid4()) | ||||
|                 c.run_pre_consume_script() | ||||
|  | ||||
| @@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase): | ||||
|  | ||||
|                 subset = { | ||||
|                     "DOCUMENT_SOURCE_PATH": c.original_path, | ||||
|                     "DOCUMENT_WORKING_PATH": c.path, | ||||
|                     "DOCUMENT_WORKING_PATH": c.working_copy, | ||||
|                     "TASK_ID": c.task_id, | ||||
|                 } | ||||
|                 self.assertDictEqual(environment, {**environment, **subset}) | ||||
| @@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase): | ||||
|             with override_settings(PRE_CONSUME_SCRIPT=script.name): | ||||
|                 with self.assertLogs("paperless.consumer", level="INFO") as cm: | ||||
|                     c = Consumer() | ||||
|                     c.path = "path-to-file" | ||||
|                     c.working_copy = "path-to-file" | ||||
|  | ||||
|                     c.run_pre_consume_script() | ||||
|                     self.assertIn( | ||||
| @@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase): | ||||
|  | ||||
|             with override_settings(PRE_CONSUME_SCRIPT=script.name): | ||||
|                 c = Consumer() | ||||
|                 c.path = "path-to-file" | ||||
|                 c.working_copy = "path-to-file" | ||||
|                 self.assertRaises( | ||||
|                     ConsumerError, | ||||
|                     c.run_pre_consume_script, | ||||
|   | ||||
| @@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         with Image.open(image) as im: | ||||
|             return im.mode in ("RGBA", "LA") | ||||
|  | ||||
|     def remove_alpha(self, image_path: str): | ||||
|     def remove_alpha(self, image_path: str) -> Path: | ||||
|         no_alpha_image = Path(self.tempdir) / "image-no-alpha" | ||||
|         subprocess.run( | ||||
|             [ | ||||
|                 settings.CONVERT_BINARY, | ||||
|                 "-alpha", | ||||
|                 "off", | ||||
|                 image_path, | ||||
|                 image_path, | ||||
|                 no_alpha_image, | ||||
|             ], | ||||
|         ) | ||||
|         return no_alpha_image | ||||
|  | ||||
|     def get_dpi(self, image) -> Optional[int]: | ||||
|         try: | ||||
| @@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                     f"Removing alpha layer from {input_file} " | ||||
|                     "for compatibility with img2pdf", | ||||
|                 ) | ||||
|                 self.remove_alpha(input_file) | ||||
|                 # Replace the input file with the non-alpha | ||||
|                 ocrmypdf_args["input_file"] = self.remove_alpha(input_file) | ||||
|  | ||||
|             if dpi: | ||||
|                 self.log.debug(f"Detected DPI for image {input_file}: {dpi}") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H