mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-01 01:46:16 +00:00
Feature: auto-clean some invalid pdfs (#7651)
This commit is contained in:
@@ -532,6 +532,7 @@ class ConsumerPlugin(
|
||||
)
|
||||
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
||||
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
|
||||
self.unmodified_original = None
|
||||
|
||||
# Determine the parser class.
|
||||
|
||||
@@ -539,6 +540,37 @@ class ConsumerPlugin(
|
||||
|
||||
self.log.debug(f"Detected mime type: {mime_type}")
|
||||
|
||||
if (
|
||||
Path(self.filename).suffix.lower() == ".pdf"
|
||||
and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
|
||||
):
|
||||
try:
|
||||
# The extension says PDF but the detected mime type does not; the file may still be a repairable PDF.
|
||||
# Try to clean with qpdf
|
||||
self.log.debug(
|
||||
"Detected possible PDF with wrong mime type, trying to clean with qpdf",
|
||||
)
|
||||
run_subprocess(
|
||||
[
|
||||
"qpdf",
|
||||
"--replace-input",
|
||||
self.working_copy,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
mime_type = magic.from_file(self.working_copy, mime=True)
|
||||
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
|
||||
# Keep an untouched copy of the original upload so it can be stored in place of the qpdf-modified working copy
|
||||
self.unmodified_original = (
|
||||
Path(tempdir.name) / Path("uo") / Path(self.filename)
|
||||
)
|
||||
copy_file_with_basic_stats(
|
||||
self.input_doc.original_file,
|
||||
self.unmodified_original,
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: Optional[type[DocumentParser]] = (
|
||||
get_parser_class_for_mime_type(
|
||||
@@ -689,7 +721,9 @@ class ConsumerPlugin(
|
||||
|
||||
self._write(
|
||||
document.storage_type,
|
||||
self.working_copy,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
@@ -725,6 +759,8 @@ class ConsumerPlugin(
|
||||
self.log.debug(f"Deleting file {self.working_copy}")
|
||||
self.input_doc.original_file.unlink()
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = os.path.join(
|
||||
|
Reference in New Issue
Block a user