Feature: auto-clean some invalid pdfs (#7651)

2025-11-23 23:49:08 -06:00 · 2024-09-25 08:57:20 -07:00
parent c92c3e224a
commit 5e687d9a93
7 changed files with 100 additions and 4 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -532,6 +532,7 @@ class ConsumerPlugin(
            )
            self.working_copy = Path(tempdir.name) / Path(self.filename)
            copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+            self.unmodified_original = None

            # Determine the parser class.

@@ -539,6 +540,37 @@ class ConsumerPlugin(

            self.log.debug(f"Detected mime type: {mime_type}")

+            if (
+                Path(self.filename).suffix.lower() == ".pdf"
+                and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+            ):
+                try:
+                    # The name ends in .pdf but the detected mime type is not a PDF one,
+                    # so the file may be a damaged PDF. Try to repair the working copy in place with qpdf
+                    self.log.debug(
+                        "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+                    )
+                    run_subprocess(
+                        [
+                            "qpdf",
+                            "--replace-input",
+                            self.working_copy,
+                        ],
+                        logger=self.log,
+                    )
+                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
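+                    # Keep an untouched copy of the original; it is what gets written to the document source path. NOTE(review): the "uo" directory is not created here - confirm the copy helper creates it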
+                    self.unmodified_original = (
+                        Path(tempdir.name) / Path("uo") / Path(self.filename)
+                    )
+                    copy_file_with_basic_stats(
+                        self.input_doc.original_file,
+                        self.unmodified_original,
+                    )
+                except Exception as e:
+                    self.log.error(f"Error attempting to clean PDF: {e}")
+
            # Based on the mime type, get the parser for that type
            parser_class: Optional[type[DocumentParser]] = (
                get_parser_class_for_mime_type(
@@ -689,7 +721,9 @@ class ConsumerPlugin(

                    self._write(
                        document.storage_type,
-                        self.working_copy,
+                        self.unmodified_original
+                        if self.unmodified_original is not None
+                        else self.working_copy,
                        document.source_path,
                    )

@@ -725,6 +759,8 @@ class ConsumerPlugin(
                self.log.debug(f"Deleting file {self.working_copy}")
                self.input_doc.original_file.unlink()
                self.working_copy.unlink()
+                if self.unmodified_original is not None:  # pragma: no cover
+                    self.unmodified_original.unlink()

                # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                shadow_file = os.path.join(