diff --git a/src/documents/consumer.py b/src/documents/consumer.py index ca8e2d378..a916afb2d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -808,50 +808,3 @@ class ConsumerPlugin( copy_basic_file_stats(source, target) except Exception: # pragma: no cover pass - - -class CleanPDFPlugin( - NoCleanupPluginMixin, - NoSetupPluginMixin, - AlwaysRunPluginMixin, - LoggingMixin, - ConsumeTaskPlugin, -): - NAME: str = "CleanPDFPlugin" - logging_name = "paperless.consumer" - - def __init__( - self, - input_doc: ConsumableDocument, - metadata: DocumentMetadataOverrides, - status_mgr: ProgressManager, - base_tmp_dir: Path, - task_id: str, - ) -> None: - super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id) - - self.renew_logging_group() - - def run(self) -> str | None: - """ - Tries to clean a PDF file with qpdf - """ - msg = None - try: - result = run_subprocess( - [ - "qpdf", - "--replace-input", - self.input_doc.original_file, - ], - logger=self.log, - ) - msg = ( - f"Error while cleaning PDF: {result.stderr}" - if result.returncode != 0 - else "PDF cleaned successfully" - ) - except Exception as e: - msg = "Error while cleaning PDF" - self.log.error(e) - return msg diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 7799abe2a..c2b7194e2 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -24,7 +24,6 @@ from documents.barcodes import BarcodePlugin from documents.caching import clear_document_caches from documents.classifier import DocumentClassifier from documents.classifier import load_classifier -from documents.consumer import CleanPDFPlugin from documents.consumer import ConsumerPlugin from documents.consumer import WorkflowTriggerPlugin from documents.data_models import ConsumableDocument @@ -49,6 +48,7 @@ from documents.sanity_checker import SanityCheckFailedException from documents.signals import document_updated from documents.signals.handlers import cleanup_document_deletion from documents.utils import copy_file_with_basic_stats +from documents.utils import run_subprocess if settings.AUDIT_LOG_ENABLED: from auditlog.models import LogEntry @@ -111,7 +111,6 @@ def consume_file( self: Task, input_doc: ConsumableDocument, overrides: DocumentMetadataOverrides | None = None, - clean: bool = False, ): # Default no overrides if overrides is None: @@ -124,9 +123,6 @@ def consume_file( ConsumerPlugin, ] - if clean: - plugins.insert(0, CleanPDFPlugin) - with ( ProgressManager( overrides.filename or input_doc.original_file.name, @@ -189,13 +185,32 @@ def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False) working_copy = settings.SCRATCH_DIR / failed_file.name copy_file_with_basic_stats(failed_file, working_copy) + if clean: + try: + result = run_subprocess( + [ + "qpdf", + "--replace-input", + "--warning-exit-0", + working_copy, + ], + logger=logger, + ) + if result.returncode != 0: + raise Exception( + f"qpdf failed with exit code {result.returncode}, error: {result.stderr}", + ) + else: + logger.debug("PDF cleaned successfully") + except Exception as e: + logger.error(f"Error while cleaning PDF: {e}") + return + consume_file( ConsumableDocument( source=DocumentSource.ConsumeFolder, original_file=working_copy, ), - clean=clean, - # skip_ocr=skip_ocr, ) diff --git a/src/documents/tests/samples/corrupted.pdf b/src/documents/tests/samples/corrupted.pdf index fcabb6a95..61a0274b2 100644 Binary files a/src/documents/tests/samples/corrupted.pdf and b/src/documents/tests/samples/corrupted.pdf differ diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 8dc1edc39..210cbd111 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -248,6 +248,6 @@ class TestRetryConsumeTask( self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name) with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): - with self.assertLogs("documents.tasks", level="INFO") as cm: + with self.assertLogs() as cm: tasks.retry_failed_file(task_id=task.task_id, clean=True) - self.assertIn("PDF cleaned successfully", cm.output[0]) + self.assertIn("New document id 1 created", cm.output[-1])