mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: auto-clean some invalid pdfs (#7651)
This commit is contained in:
parent
c92c3e224a
commit
5e687d9a93
@ -418,6 +418,11 @@ Insurances/ # Insurances
|
|||||||
Defining a storage path is optional. If no storage path is defined for a
|
Defining a storage path is optional. If no storage path is defined for a
|
||||||
document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
|
document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
|
||||||
|
|
||||||
|
## Automatic recovery of invalid PDFs {#pdf-recovery}
|
||||||
|
|
||||||
|
Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing if, for example, the MIME type
|
||||||
|
detection is incorrect. This can happen if the PDF is not properly formatted or contains errors.
|
||||||
|
|
||||||
## Celery Monitoring {#celery-monitoring}
|
## Celery Monitoring {#celery-monitoring}
|
||||||
|
|
||||||
The monitoring tool
|
The monitoring tool
|
||||||
|
@ -532,6 +532,7 @@ class ConsumerPlugin(
|
|||||||
)
|
)
|
||||||
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
||||||
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
|
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
|
||||||
|
self.unmodified_original = None
|
||||||
|
|
||||||
# Determine the parser class.
|
# Determine the parser class.
|
||||||
|
|
||||||
@ -539,6 +540,37 @@ class ConsumerPlugin(
|
|||||||
|
|
||||||
self.log.debug(f"Detected mime type: {mime_type}")
|
self.log.debug(f"Detected mime type: {mime_type}")
|
||||||
|
|
||||||
|
if (
|
||||||
|
Path(self.filename).suffix.lower() == ".pdf"
|
||||||
|
and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
# The file might be a pdf, but the mime type is wrong.
|
||||||
|
# Try to clean with qpdf
|
||||||
|
self.log.debug(
|
||||||
|
"Detected possible PDF with wrong mime type, trying to clean with qpdf",
|
||||||
|
)
|
||||||
|
run_subprocess(
|
||||||
|
[
|
||||||
|
"qpdf",
|
||||||
|
"--replace-input",
|
||||||
|
self.working_copy,
|
||||||
|
],
|
||||||
|
logger=self.log,
|
||||||
|
)
|
||||||
|
mime_type = magic.from_file(self.working_copy, mime=True)
|
||||||
|
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
|
||||||
|
# Save the original file for later
|
||||||
|
self.unmodified_original = (
|
||||||
|
Path(tempdir.name) / Path("uo") / Path(self.filename)
|
||||||
|
)
|
||||||
|
copy_file_with_basic_stats(
|
||||||
|
self.input_doc.original_file,
|
||||||
|
self.unmodified_original,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||||
|
|
||||||
# Based on the mime type, get the parser for that type
|
# Based on the mime type, get the parser for that type
|
||||||
parser_class: Optional[type[DocumentParser]] = (
|
parser_class: Optional[type[DocumentParser]] = (
|
||||||
get_parser_class_for_mime_type(
|
get_parser_class_for_mime_type(
|
||||||
@ -689,7 +721,9 @@ class ConsumerPlugin(
|
|||||||
|
|
||||||
self._write(
|
self._write(
|
||||||
document.storage_type,
|
document.storage_type,
|
||||||
self.working_copy,
|
self.unmodified_original
|
||||||
|
if self.unmodified_original is not None
|
||||||
|
else self.working_copy,
|
||||||
document.source_path,
|
document.source_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -725,6 +759,8 @@ class ConsumerPlugin(
|
|||||||
self.log.debug(f"Deleting file {self.working_copy}")
|
self.log.debug(f"Deleting file {self.working_copy}")
|
||||||
self.input_doc.original_file.unlink()
|
self.input_doc.original_file.unlink()
|
||||||
self.working_copy.unlink()
|
self.working_copy.unlink()
|
||||||
|
if self.unmodified_original is not None: # pragma: no cover
|
||||||
|
self.unmodified_original.unlink()
|
||||||
|
|
||||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||||
shadow_file = os.path.join(
|
shadow_file = os.path.join(
|
||||||
|
@ -1389,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer):
|
|||||||
mime_type = magic.from_buffer(document_data, mime=True)
|
mime_type = magic.from_buffer(document_data, mime=True)
|
||||||
|
|
||||||
if not is_mime_type_supported(mime_type):
|
if not is_mime_type_supported(mime_type):
|
||||||
raise serializers.ValidationError(
|
if (
|
||||||
_("File type %(type)s not supported") % {"type": mime_type},
|
mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
|
||||||
)
|
and document.name.endswith(
|
||||||
|
".pdf",
|
||||||
|
)
|
||||||
|
):
|
||||||
|
# If the file is an invalid PDF, we can try to recover it later in the consumer
|
||||||
|
mime_type = "application/pdf"
|
||||||
|
else:
|
||||||
|
raise serializers.ValidationError(
|
||||||
|
_("File type %(type)s not supported") % {"type": mime_type},
|
||||||
|
)
|
||||||
|
|
||||||
return document.name, document_data
|
return document.name, document_data
|
||||||
|
|
||||||
|
BIN
src/documents/tests/samples/invalid_pdf.pdf
Normal file
BIN
src/documents/tests/samples/invalid_pdf.pdf
Normal file
Binary file not shown.
@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
|||||||
self.assertEqual(overrides.filename, "simple.pdf")
|
self.assertEqual(overrides.filename, "simple.pdf")
|
||||||
self.assertEqual(overrides.custom_field_ids, [custom_field.id])
|
self.assertEqual(overrides.custom_field_ids, [custom_field.id])
|
||||||
|
|
||||||
|
def test_upload_invalid_pdf(self):
    """
    GIVEN:
        - A malformed PDF named "*.pdf" whose detected mime type is listed
          in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
    WHEN:
        - The file is uploaded through the post_document endpoint
    THEN:
        - The upload is accepted instead of being rejected
    """
    # Sample shipped next to this test module.
    sample_path = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "invalid_pdf.pdf",
    )

    # The consume task is mocked; hand back a placeholder async result.
    self.consume_file_mock.return_value = celery.result.AsyncResult(
        id=str(uuid.uuid4()),
    )

    with open(sample_path, "rb") as pdf_handle:
        response = self.client.post(
            "/api/documents/post_document/",
            {"document": pdf_handle},
        )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
|
||||||
def test_get_metadata(self):
|
def test_get_metadata(self):
|
||||||
doc = Document.objects.create(
|
doc = Document.objects.create(
|
||||||
title="test",
|
title="test",
|
||||||
|
@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):
|
|||||||
|
|
||||||
def fake_magic_from_file(file, mime=False):
|
def fake_magic_from_file(file, mime=False):
|
||||||
if mime:
|
if mime:
|
||||||
|
if file.name.startswith("invalid_pdf"):
|
||||||
|
return "application/octet-stream"
|
||||||
if os.path.splitext(file)[1] == ".pdf":
|
if os.path.splitext(file)[1] == ".pdf":
|
||||||
return "application/pdf"
|
return "application/pdf"
|
||||||
elif os.path.splitext(file)[1] == ".png":
|
elif os.path.splitext(file)[1] == ".png":
|
||||||
@ -952,6 +954,27 @@ class TestConsumer(
|
|||||||
|
|
||||||
sanity_check()
|
sanity_check()
|
||||||
|
|
||||||
|
@mock.patch("documents.consumer.run_subprocess")
def test_try_to_clean_invalid_pdf(self, m):
    """
    GIVEN:
        - An invalid PDF placed in the consumption directory
        - run_subprocess patched out, so qpdf is never actually executed
    WHEN:
        - The consumer is run on that file
    THEN:
        - Consumption fails with ConsumerError
        - qpdf was requested exactly once with "--replace-input"
    """
    staged_pdf = settings.CONSUMPTION_DIR / "invalid_pdf.pdf"
    shutil.copy(
        Path(__file__).parent / "samples" / "invalid_pdf.pdf",
        staged_pdf,
    )

    with self.get_consumer(staged_pdf) as consumer:
        # The subprocess call is mocked, so nothing repairs the file and
        # the run is expected to end in a ConsumerError.
        with self.assertRaises(ConsumerError):
            consumer.run()

        m.assert_called_once()

        # First positional argument of the call is the command list.
        qpdf_command = m.call_args[0][0]

        self.assertEqual(qpdf_command[0], "qpdf")
        self.assertEqual(qpdf_command[1], "--replace-input")
|
||||||
|
|
||||||
|
|
||||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||||
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
|
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||||
|
@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
|
|||||||
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
|
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# MIME types that may be detected for a damaged file carrying a ".pdf" name.
# Files matching one of these are still accepted on upload, and the consumer
# tries a qpdf clean-up on them before selecting a parser.
CONSUMER_PDF_RECOVERABLE_MIME_TYPES: Final[tuple[str, ...]] = (
    "application/octet-stream",
)
|
||||||
|
|
||||||
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
|
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
|
||||||
|
|
||||||
# The default language that tesseract will attempt to use when parsing
|
# The default language that tesseract will attempt to use when parsing
|
||||||
|
Loading…
x
Reference in New Issue
Block a user