Feature: auto-clean some invalid pdfs (#7651)

2026-01-26 22:49:01 -06:00 · 2024-09-25 08:57:20 -07:00
parent c92c3e224a
commit 5e687d9a93
7 changed files with 100 additions and 4 deletions
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -418,6 +418,11 @@ Insurances/                             # Insurances
    Defining a storage path is optional. If no storage path is defined for a
    document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.

+## Automatic recovery of invalid PDFs {#pdf-recovery}
+
+Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing them, for example when a file
+named `*.pdf` is not detected as `application/pdf`. This can happen if the PDF is not properly formatted or contains errors.
+
 ## Celery Monitoring {#celery-monitoring}

 The monitoring tool
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -532,6 +532,7 @@ class ConsumerPlugin(
            )
            self.working_copy = Path(tempdir.name) / Path(self.filename)
            copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+            self.unmodified_original = None

            # Determine the parser class.

@@ -539,6 +540,37 @@ class ConsumerPlugin(

            self.log.debug(f"Detected mime type: {mime_type}")

+            if (
+                Path(self.filename).suffix.lower() == ".pdf"
+                and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+            ):
+                try:
+                    # The file might be a pdf, but the mime type is wrong.
+                    # Try to clean with qpdf
+                    self.log.debug(
+                        "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+                    )
+                    run_subprocess(
+                        [
+                            "qpdf",
+                            "--replace-input",
+                            self.working_copy,
+                        ],
+                        logger=self.log,
+                    )
+                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+                    # Save the original file for later
+                    self.unmodified_original = (
+                        Path(tempdir.name) / Path("uo") / Path(self.filename)
+                    )
+                    copy_file_with_basic_stats(
+                        self.input_doc.original_file,
+                        self.unmodified_original,
+                    )
+                except Exception as e:
+                    self.log.error(f"Error attempting to clean PDF: {e}")
+
            # Based on the mime type, get the parser for that type
            parser_class: Optional[type[DocumentParser]] = (
                get_parser_class_for_mime_type(
@@ -689,7 +721,9 @@ class ConsumerPlugin(

                    self._write(
                        document.storage_type,
-                        self.working_copy,
+                        self.unmodified_original
+                        if self.unmodified_original is not None
+                        else self.working_copy,
                        document.source_path,
                    )

@@ -725,6 +759,8 @@ class ConsumerPlugin(
                self.log.debug(f"Deleting file {self.working_copy}")
                self.input_doc.original_file.unlink()
                self.working_copy.unlink()
+                if self.unmodified_original is not None:  # pragma: no cover
+                    self.unmodified_original.unlink()

                # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                shadow_file = os.path.join(
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1389,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer):
        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
-            raise serializers.ValidationError(
-                _("File type %(type)s not supported") % {"type": mime_type},
-            )
+            if (
+                mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+                and document.name.endswith(
+                    ".pdf",
+                )
+            ):
+                # If the file is an invalid PDF, we can try to recover it later in the consumer
+                mime_type = "application/pdf"
+            else:
+                raise serializers.ValidationError(
+                    _("File type %(type)s not supported") % {"type": mime_type},
+                )

        return document.name, document_data

--- a/src/documents/tests/samples/invalid_pdf.pdf
+++ b/src/documents/tests/samples/invalid_pdf.pdf
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
        self.assertEqual(overrides.filename, "simple.pdf")
        self.assertEqual(overrides.custom_field_ids, [custom_field.id])

+    def test_upload_invalid_pdf(self):
+        """
+        GIVEN: An invalid PDF named "*.pdf" whose mime type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+        WHEN: The file is uploaded
+        THEN: The file is not rejected
+        """
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )
+
+        with open(
+            os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+            "rb",
+        ) as f:
+            response = self.client.post(
+                "/api/documents/post_document/",
+                {"document": f},
+            )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
    def test_get_metadata(self):
        doc = Document.objects.create(
            title="test",
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):

 def fake_magic_from_file(file, mime=False):
    if mime:
+        if file.name.startswith("invalid_pdf"):
+            return "application/octet-stream"
        if os.path.splitext(file)[1] == ".pdf":
            return "application/pdf"
        elif os.path.splitext(file)[1] == ".png":
@@ -952,6 +954,27 @@ class TestConsumer(

        sanity_check()

+    @mock.patch("documents.consumer.run_subprocess")
+    def test_try_to_clean_invalid_pdf(self, m):
+        shutil.copy(
+            Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        )
+        with self.get_consumer(
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        ) as consumer:
+            # qpdf is mocked out, so the file is never repaired and run() still fails
+            self.assertRaises(ConsumerError, consumer.run)
+
+            m.assert_called_once()
+
+            args, _ = m.call_args
+
+            command = args[0]
+
+            self.assertEqual(command[0], "qpdf")
+            self.assertEqual(command[1], "--replace-input")
+

@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
 )

+CONSUMER_PDF_RECOVERABLE_MIME_TYPES: Final[tuple[str, ...]] = ("application/octet-stream",)
+
 OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")

 # The default language that tesseract will attempt to use when parsing