Move it out of consumer

This commit is contained in:
shamoon 2024-11-07 13:19:06 -08:00
parent af1c64e969
commit 0fcd69b739
No known key found for this signature in database
4 changed files with 24 additions and 56 deletions

View File

@ -808,50 +808,3 @@ class ConsumerPlugin(
copy_basic_file_stats(source, target) copy_basic_file_stats(source, target)
except Exception: # pragma: no cover except Exception: # pragma: no cover
pass pass
class CleanPDFPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    LoggingMixin,
    ConsumeTaskPlugin,
):
    """
    Consume-task plugin that rewrites the input document in place with qpdf.

    The plugin invokes ``qpdf --replace-input`` on
    ``self.input_doc.original_file`` and reports the outcome as a short
    status string. Failures are logged and reported, never raised, so a
    failed clean-up does not abort the calling code by itself.
    """

    NAME: str = "CleanPDFPlugin"

    logging_name = "paperless.consumer"

    def __init__(
        self,
        input_doc: ConsumableDocument,
        metadata: DocumentMetadataOverrides,
        status_mgr: ProgressManager,
        base_tmp_dir: Path,
        task_id: str,
    ) -> None:
        """Forward all arguments to the base plugin and set up logging."""
        super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id)

        self.renew_logging_group()

    def run(self) -> str | None:
        """
        Try to clean the input PDF file with qpdf.

        Returns:
            ``"PDF cleaned successfully"`` when qpdf exits with status 0,
            otherwise a message starting with ``"Error while cleaning PDF"``.
        """
        try:
            result = run_subprocess(
                [
                    "qpdf",
                    "--replace-input",
                    # qpdf exits with status 3 when it only emitted warnings;
                    # the file has still been rewritten in that case, so do
                    # not report it as a failure.
                    "--warning-exit-0",
                    self.input_doc.original_file,
                ],
                logger=self.log,
            )
        except Exception as e:
            # Deliberate best-effort: report the failure instead of raising.
            self.log.error(f"Error while cleaning PDF: {e}")
            return "Error while cleaning PDF"

        # NOTE(review): if run_subprocess itself raises on a non-zero exit
        # status, this branch is never reached -- confirm against its
        # implementation.
        if result.returncode != 0:
            return f"Error while cleaning PDF: {result.stderr}"
        return "PDF cleaned successfully"

View File

@ -24,7 +24,6 @@ from documents.barcodes import BarcodePlugin
from documents.caching import clear_document_caches from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier from documents.classifier import load_classifier
from documents.consumer import CleanPDFPlugin
from documents.consumer import ConsumerPlugin from documents.consumer import ConsumerPlugin
from documents.consumer import WorkflowTriggerPlugin from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
@ -49,6 +48,7 @@ from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion from documents.signals.handlers import cleanup_document_deletion
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
if settings.AUDIT_LOG_ENABLED: if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry from auditlog.models import LogEntry
@ -111,7 +111,6 @@ def consume_file(
self: Task, self: Task,
input_doc: ConsumableDocument, input_doc: ConsumableDocument,
overrides: DocumentMetadataOverrides | None = None, overrides: DocumentMetadataOverrides | None = None,
clean: bool = False,
): ):
# Default no overrides # Default no overrides
if overrides is None: if overrides is None:
@ -124,9 +123,6 @@ def consume_file(
ConsumerPlugin, ConsumerPlugin,
] ]
if clean:
plugins.insert(0, CleanPDFPlugin)
with ( with (
ProgressManager( ProgressManager(
overrides.filename or input_doc.original_file.name, overrides.filename or input_doc.original_file.name,
@ -189,13 +185,32 @@ def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False)
working_copy = settings.SCRATCH_DIR / failed_file.name working_copy = settings.SCRATCH_DIR / failed_file.name
copy_file_with_basic_stats(failed_file, working_copy) copy_file_with_basic_stats(failed_file, working_copy)
if clean:
try:
result = run_subprocess(
[
"qpdf",
"--replace-input",
"--warning-exit-0",
working_copy,
],
logger=logger,
)
if result.returncode != 0:
raise Exception(
f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
)
else:
logger.debug("PDF cleaned successfully")
except Exception as e:
logger.error(f"Error while cleaning PDF: {e}")
return
consume_file( consume_file(
ConsumableDocument( ConsumableDocument(
source=DocumentSource.ConsumeFolder, source=DocumentSource.ConsumeFolder,
original_file=working_copy, original_file=working_copy,
), ),
clean=clean,
# skip_ocr=skip_ocr,
) )

View File

@ -248,6 +248,6 @@ class TestRetryConsumeTask(
self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name) self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("documents.tasks", level="INFO") as cm: with self.assertLogs() as cm:
tasks.retry_failed_file(task_id=task.task_id, clean=True) tasks.retry_failed_file(task_id=task.task_id, clean=True)
self.assertIn("PDF cleaned successfully", cm.output[0]) self.assertIn("New document id 1 created", cm.output[-1])