diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index ca5ed4321..4bbd7753d 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -136,6 +136,11 @@ script can access the following relevant environment variables set: be triggered, leading to failures as two tasks work on the same document path +!!! warning + + If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic + way, this may allow duplicate documents to be stored + A simple but common example for this would be creating a simple script like this: diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 11faeea43..b7a559575 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -135,24 +135,24 @@ class Consumer(LoggingMixin): """ Confirm the input file still exists where it should """ - if not os.path.isfile(self.path): + if not os.path.isfile(self.original_path): self._fail( ConsumerStatusShortMessage.FILE_NOT_FOUND, - f"Cannot consume {self.path}: File not found.", + f"Cannot consume {self.original_path}: File not found.", ) def pre_check_duplicate(self): """ Using the MD5 of the file, check this exact file doesn't already exist """ - with open(self.path, "rb") as f: + with open(self.original_path, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() existing_doc = Document.objects.filter( Q(checksum=checksum) | Q(archive_checksum=checksum), ) if existing_doc.exists(): if settings.CONSUMER_DELETE_DUPLICATES: - os.unlink(self.path) + os.unlink(self.original_path) self._fail( ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS, f"Not consuming {self.filename}: It is a duplicate of" @@ -211,7 +211,7 @@ class Consumer(LoggingMixin): self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") - working_file_path = str(self.path) + working_file_path = str(self.working_copy) original_file_path = str(self.original_path) script_env = os.environ.copy() @@ -343,8 +343,8 @@ class Consumer(LoggingMixin): Return the document object if it was successfully created. """ - self.path = Path(path).resolve() - self.filename = override_filename or self.path.name + self.original_path = Path(path).resolve() + self.filename = override_filename or self.original_path.name self.override_title = override_title self.override_correspondent_id = override_correspondent_id self.override_document_type_id = override_document_type_id @@ -377,17 +377,16 @@ class Consumer(LoggingMixin): self.log.info(f"Consuming {self.filename}") # For the actual work, copy the file into a tempdir - self.original_path = self.path tempdir = tempfile.TemporaryDirectory( prefix="paperless-ngx", dir=settings.SCRATCH_DIR, ) - self.path = Path(tempdir.name) / Path(self.filename) - copy_file_with_basic_stats(self.original_path, self.path) + self.working_copy = Path(tempdir.name) / Path(self.filename) + copy_file_with_basic_stats(self.original_path, self.working_copy) # Determine the parser class. - mime_type = magic.from_file(self.path, mime=True) + mime_type = magic.from_file(self.working_copy, mime=True) self.log.debug(f"Detected mime type: {mime_type}") @@ -406,7 +405,7 @@ class Consumer(LoggingMixin): document_consumption_started.send( sender=self.__class__, - filename=self.path, + filename=self.working_copy, logging_group=self.logging_group, ) @@ -444,7 +443,7 @@ class Consumer(LoggingMixin): ConsumerStatusShortMessage.PARSING_DOCUMENT, ) self.log.debug(f"Parsing {self.filename}...") - document_parser.parse(self.path, mime_type, self.filename) + document_parser.parse(self.working_copy, mime_type, self.filename) self.log.debug(f"Generating thumbnail for {self.filename}...") self._send_progress( @@ -454,7 +453,7 @@ class Consumer(LoggingMixin): ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ) thumbnail = document_parser.get_thumbnail( - self.path, + self.working_copy, mime_type, self.filename, ) @@ -527,7 +526,7 @@ class Consumer(LoggingMixin): self._write( document.storage_type, - self.original_path, + self.working_copy, document.source_path, ) @@ -560,9 +559,9 @@ class Consumer(LoggingMixin): document.save() # Delete the file only if it was successfully consumed - self.log.debug(f"Deleting file {self.path}") - os.unlink(self.path) + self.log.debug(f"Deleting file {self.working_copy}") self.original_path.unlink() + self.working_copy.unlink() # https://github.com/jonaswinkler/paperless-ng/discussions/1037 shadow_file = os.path.join( @@ -735,7 +734,7 @@ class Consumer(LoggingMixin): )[:127], content=text, mime_type=mime_type, - checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(), + checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(), created=create_date, modified=create_date, storage_type=storage_type, diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 3215d49a6..b823ae9ce 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin): self.logging_group = logging_group self.settings = self.get_settings() os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + self.tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) self.archive_path = None self.text = None diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 1db90ee54..24f7ff338 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase): @override_settings(PRE_CONSUME_SCRIPT=None) def test_no_pre_consume_script(self, m): c = Consumer() - c.path = "path-to-file" + c.working_copy = "path-to-file" c.run_pre_consume_script() m.assert_not_called() @@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase): def test_pre_consume_script_not_found(self, m, m2): c = Consumer() c.filename = "somefile.pdf" - c.path = "path-to-file" + c.working_copy = "path-to-file" self.assertRaises(ConsumerError, c.run_pre_consume_script) @mock.patch("documents.consumer.run") @@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase): with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() c.original_path = "path-to-file" - c.path = "/tmp/somewhere/path-to-file" + c.working_copy = "/tmp/somewhere/path-to-file" c.task_id = str(uuid.uuid4()) c.run_pre_consume_script() @@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase): subset = { "DOCUMENT_SOURCE_PATH": c.original_path, - "DOCUMENT_WORKING_PATH": c.path, + "DOCUMENT_WORKING_PATH": c.working_copy, "TASK_ID": c.task_id, } self.assertDictEqual(environment, {**environment, **subset}) @@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase): with override_settings(PRE_CONSUME_SCRIPT=script.name): with self.assertLogs("paperless.consumer", level="INFO") as cm: c = Consumer() - c.path = "path-to-file" + c.working_copy = "path-to-file" c.run_pre_consume_script() self.assertIn( @@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase): with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() - c.path = "path-to-file" + c.working_copy = "path-to-file" self.assertRaises( ConsumerError, c.run_pre_consume_script, diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 047a171b2..c699d8ea5 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser): with Image.open(image) as im: return im.mode in ("RGBA", "LA") - def remove_alpha(self, image_path: str): + def remove_alpha(self, image_path: str) -> Path: + no_alpha_image = Path(self.tempdir) / "image-no-alpha" subprocess.run( [ settings.CONVERT_BINARY, "-alpha", "off", image_path, - image_path, + no_alpha_image, ], ) + return no_alpha_image def get_dpi(self, image) -> Optional[int]: try: @@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser): f"Removing alpha layer from {input_file} " "for compatibility with img2pdf", ) - self.remove_alpha(input_file) + # Replace the input file with the non-alpha + ocrmypdf_args["input_file"] = self.remove_alpha(input_file) if dpi: self.log.debug(f"Detected DPI for image {input_file}: {dpi}")