diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index 30687680c..fe8d2e305 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -418,6 +418,11 @@ Insurances/ # Insurances
 Defining a storage path is optional. If no storage path is defined for a
 document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
 
+## Automatic recovery of invalid PDFs {#pdf-recovery}
+
+Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing if, for example, the mime_type
+detection is incorrect. This can happen if the PDF is not properly formatted or contains errors.
+
 ## Celery Monitoring {#celery-monitoring}
 
 The monitoring tool
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 57277e4a6..97910e24b 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -532,6 +532,7 @@ class ConsumerPlugin(
         )
         self.working_copy = Path(tempdir.name) / Path(self.filename)
         copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+        self.unmodified_original = None
 
         # Determine the parser class.
 
@@ -539,6 +540,43 @@ class ConsumerPlugin(
 
         self.log.debug(f"Detected mime type: {mime_type}")
 
+        if (
+            Path(self.filename).suffix.lower() == ".pdf"
+            and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+        ):
+            try:
+                # The file might be a pdf, but the mime type is wrong.
+                # Try to clean with qpdf
+                self.log.debug(
+                    "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+                )
+                run_subprocess(
+                    [
+                        "qpdf",
+                        "--replace-input",
+                        self.working_copy,
+                    ],
+                    logger=self.log,
+                )
+                mime_type = magic.from_file(self.working_copy, mime=True)
+                self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+                # Save the original file for later. It is kept in a "uo"
+                # sub-directory of the temp dir; create that directory first
+                # so the copy cannot fail on a missing parent.
+                self.unmodified_original = (
+                    Path(tempdir.name) / Path("uo") / Path(self.filename)
+                )
+                self.unmodified_original.parent.mkdir(exist_ok=True)
+                copy_file_with_basic_stats(
+                    self.input_doc.original_file,
+                    self.unmodified_original,
+                )
+            except Exception as e:
+                # Never keep a path that may not exist on disk: the later
+                # store and cleanup steps fall back to the working copy.
+                self.unmodified_original = None
+                self.log.error(f"Error attempting to clean PDF: {e}")
+
         # Based on the mime type, get the parser for that type
         parser_class: Optional[type[DocumentParser]] = (
             get_parser_class_for_mime_type(
@@ -689,7 +727,9 @@ class ConsumerPlugin(
 
                     self._write(
                         document.storage_type,
-                        self.working_copy,
+                        self.unmodified_original
+                        if self.unmodified_original is not None
+                        else self.working_copy,
                         document.source_path,
                     )
 
@@ -725,6 +765,8 @@ class ConsumerPlugin(
                 self.log.debug(f"Deleting file {self.working_copy}")
                 self.input_doc.original_file.unlink()
                 self.working_copy.unlink()
+                if self.unmodified_original is not None:  # pragma: no cover
+                    self.unmodified_original.unlink()
 
                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                 shadow_file = os.path.join(
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 737d1256f..30f3dd26d 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1389,9 +1389,19 @@ class PostDocumentSerializer(serializers.Serializer):
         mime_type = magic.from_buffer(document_data, mime=True)
 
         if not is_mime_type_supported(mime_type):
-            raise serializers.ValidationError(
-                _("File type %(type)s not supported") % {"type": mime_type},
-            )
+            if (
+                mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+                # Case-insensitive, matching the suffix check in the consumer
+                and document.name.lower().endswith(
+                    ".pdf",
+                )
+            ):
+                # If the file is an invalid PDF, we can try to recover it later in the consumer
+                mime_type = "application/pdf"
+            else:
+                raise serializers.ValidationError(
+                    _("File type %(type)s not supported") % {"type": mime_type},
+                )
 
         return document.name, document_data
 
diff --git a/src/documents/tests/samples/invalid_pdf.pdf b/src/documents/tests/samples/invalid_pdf.pdf
new file mode 100644
index 000000000..f226c2d84
Binary files /dev/null and b/src/documents/tests/samples/invalid_pdf.pdf differ
diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py
index ee2e8ee1e..b1cd43932 100644
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         self.assertEqual(overrides.filename, "simple.pdf")
         self.assertEqual(overrides.custom_field_ids, [custom_field.id])
 
+    def test_upload_invalid_pdf(self):
+        """
+        GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+        WHEN: Upload the file
+        THEN: The file is not rejected
+        """
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )
+
+        with open(
+            os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+            "rb",
+        ) as f:
+            response = self.client.post(
+                "/api/documents/post_document/",
+                {"document": f},
+            )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
     def test_get_metadata(self):
         doc = Document.objects.create(
             title="test",
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 5b56e2cca..aa452e15b 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):
 
 def fake_magic_from_file(file, mime=False):
     if mime:
+        if file.name.startswith("invalid_pdf"):
+            return "application/octet-stream"
         if os.path.splitext(file)[1] == ".pdf":
             return "application/pdf"
         elif os.path.splitext(file)[1] == ".png":
@@ -952,6 +954,27 @@ class TestConsumer(
 
         sanity_check()
 
+    @mock.patch("documents.consumer.run_subprocess")
+    def test_try_to_clean_invalid_pdf(self, m):
+        shutil.copy(
+            Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        )
+        with self.get_consumer(
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        ) as consumer:
+            # qpdf is mocked, so nothing is repaired and consuming still fails
+            self.assertRaises(ConsumerError, consumer.run)
+
+            m.assert_called_once()
+
+            args, _ = m.call_args
+
+            command = args[0]
+
+            self.assertEqual(command[0], "qpdf")
+            self.assertEqual(command[1], "--replace-input")
+
 
 @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 851fe6217..2da0b49f1 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
     "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
 )
 
+CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",)
+
 OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
 
 # The default language that tesseract will attempt to use when parsing