Fix: Allows pre-consume scripts to modify the working path again (#5260)

* Allows pre-consume scripts to modify the working path again and generally cleans up some confusion about working copy vs original
2026-01-08 21:24:26 -06:00 · 2024-01-05 21:01:57 -08:00
parent 3115106dc1
commit a82e3771ae
5 changed files with 37 additions and 28 deletions
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -136,6 +136,11 @@ script can access the following relevant environment variables set:
    be triggered, leading to failures as two tasks work on the
    same document path
 !!! warning
    If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic
    way, this may allow duplicate documents to be stored
 A simple but common example for this would be creating a simple script
 like this:
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -135,24 +135,24 @@ class Consumer(LoggingMixin):
        """
        Confirm the input file still exists where it should
        """
-        if not os.path.isfile(self.path):
+        if not os.path.isfile(self.original_path):
            self._fail(
                ConsumerStatusShortMessage.FILE_NOT_FOUND,
-                f"Cannot consume {self.path}: File not found.",
+                f"Cannot consume {self.original_path}: File not found.",
            )
    def pre_check_duplicate(self):
        """
        Using the MD5 of the file, check this exact file doesn't already exist
        """
-        with open(self.path, "rb") as f:
+        with open(self.original_path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        existing_doc = Document.objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
        if existing_doc.exists():
            if settings.CONSUMER_DELETE_DUPLICATES:
-                os.unlink(self.path)
+                os.unlink(self.original_path)
            self._fail(
                ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
                f"Not consuming {self.filename}: It is a duplicate of"
@@ -211,7 +211,7 @@ class Consumer(LoggingMixin):
        self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
-        working_file_path = str(self.path)
+        working_file_path = str(self.working_copy)
        original_file_path = str(self.original_path)
        script_env = os.environ.copy()
@@ -343,8 +343,8 @@ class Consumer(LoggingMixin):
        Return the document object if it was successfully created.
        """
-        self.path = Path(path).resolve()
+        self.original_path = Path(path).resolve()
-        self.filename = override_filename or self.path.name
+        self.filename = override_filename or self.original_path.name
        self.override_title = override_title
        self.override_correspondent_id = override_correspondent_id
        self.override_document_type_id = override_document_type_id
@@ -377,17 +377,16 @@ class Consumer(LoggingMixin):
        self.log.info(f"Consuming {self.filename}")
        # For the actual work, copy the file into a tempdir
        self.original_path = self.path
        tempdir = tempfile.TemporaryDirectory(
            prefix="paperless-ngx",
            dir=settings.SCRATCH_DIR,
        )
-        self.path = Path(tempdir.name) / Path(self.filename)
+        self.working_copy = Path(tempdir.name) / Path(self.filename)
-        copy_file_with_basic_stats(self.original_path, self.path)
+        copy_file_with_basic_stats(self.original_path, self.working_copy)
        # Determine the parser class.
-        mime_type = magic.from_file(self.path, mime=True)
+        mime_type = magic.from_file(self.working_copy, mime=True)
        self.log.debug(f"Detected mime type: {mime_type}")
@@ -406,7 +405,7 @@ class Consumer(LoggingMixin):
        document_consumption_started.send(
            sender=self.__class__,
-            filename=self.path,
+            filename=self.working_copy,
            logging_group=self.logging_group,
        )
@@ -444,7 +443,7 @@ class Consumer(LoggingMixin):
                ConsumerStatusShortMessage.PARSING_DOCUMENT,
            )
            self.log.debug(f"Parsing {self.filename}...")
-            document_parser.parse(self.path, mime_type, self.filename)
+            document_parser.parse(self.working_copy, mime_type, self.filename)
            self.log.debug(f"Generating thumbnail for {self.filename}...")
            self._send_progress(
@@ -454,7 +453,7 @@ class Consumer(LoggingMixin):
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
            thumbnail = document_parser.get_thumbnail(
-                self.path,
+                self.working_copy,
                mime_type,
                self.filename,
            )
@@ -527,7 +526,7 @@ class Consumer(LoggingMixin):
                    self._write(
                        document.storage_type,
-                        self.original_path,
+                        self.working_copy,
                        document.source_path,
                    )
@@ -560,9 +559,9 @@ class Consumer(LoggingMixin):
                document.save()
                # Delete the file only if it was successfully consumed
-                self.log.debug(f"Deleting file {self.path}")
+                self.log.debug(f"Deleting file {self.working_copy}")
                os.unlink(self.path)
                self.original_path.unlink()
                self.working_copy.unlink()
                # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                shadow_file = os.path.join(
@@ -735,7 +734,7 @@ class Consumer(LoggingMixin):
            )[:127],
            content=text,
            mime_type=mime_type,
-            checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(),
+            checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(),
            created=create_date,
            modified=create_date,
            storage_type=storage_type,
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin):
        self.logging_group = logging_group
        self.settings = self.get_settings()
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        self.archive_path = None
        self.text = None
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase):
    @override_settings(PRE_CONSUME_SCRIPT=None)
    def test_no_pre_consume_script(self, m):
        c = Consumer()
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
        c.run_pre_consume_script()
        m.assert_not_called()
@@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase):
    def test_pre_consume_script_not_found(self, m, m2):
        c = Consumer()
        c.filename = "somefile.pdf"
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
        self.assertRaises(ConsumerError, c.run_pre_consume_script)
    @mock.patch("documents.consumer.run")
@@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase):
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                c = Consumer()
                c.original_path = "path-to-file"
-                c.path = "/tmp/somewhere/path-to-file"
+                c.working_copy = "/tmp/somewhere/path-to-file"
                c.task_id = str(uuid.uuid4())
                c.run_pre_consume_script()
@@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase):
                subset = {
                    "DOCUMENT_SOURCE_PATH": c.original_path,
-                    "DOCUMENT_WORKING_PATH": c.path,
+                    "DOCUMENT_WORKING_PATH": c.working_copy,
                    "TASK_ID": c.task_id,
                }
                self.assertDictEqual(environment, {**environment, **subset})
@@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase):
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                with self.assertLogs("paperless.consumer", level="INFO") as cm:
                    c = Consumer()
-                    c.path = "path-to-file"
+                    c.working_copy = "path-to-file"
                    c.run_pre_consume_script()
                    self.assertIn(
@@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase):
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                c = Consumer()
-                c.path = "path-to-file"
+                c.working_copy = "path-to-file"
                self.assertRaises(
                    ConsumerError,
                    c.run_pre_consume_script,
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser):
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")
-    def remove_alpha(self, image_path: str):
+    def remove_alpha(self, image_path: str) -> Path:
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        subprocess.run(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                image_path,
-                image_path,
+                no_alpha_image,
            ],
        )
        return no_alpha_image
    def get_dpi(self, image) -> Optional[int]:
        try:
@@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
-                self.remove_alpha(input_file)
+                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")