Messing around

shamoon 2024-10-30 01:04:14 -07:00
parent 159344f033
commit bc4d3925cc
9 changed files with 54186 additions and 0 deletions

@@ -18,6 +18,7 @@
# Paths and folders
#PAPERLESS_CONSUMPTION_DIR=../consume
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
#PAPERLESS_DATA_DIR=../data
#PAPERLESS_EMPTY_TRASH_DIR=
#PAPERLESS_MEDIA_ROOT=../media

@@ -148,6 +148,11 @@ class ConsumerPlugin(
    ):
        self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
        self.log.error(log_message or message, exc_info=exc_info)
        # Move the file to the failed directory
        if self.input_doc.original_file.exists():
            self.input_doc.original_file.rename(
                settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
            )
        raise ConsumerError(f"{self.filename}: {log_message or message}") from exception

    def pre_check_file_exists(self):
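
A note on the rename above: Path.rename is a plain filesystem rename, so it is only cheap and atomic when the failed directory sits on the same filesystem as the consumption directory (true for the default, which nests it inside the consume folder), and the target directory must already exist. A minimal standalone sketch of the same move, with an illustrative path instead of the real settings value:

    from pathlib import Path

    FAILED_DIR = Path("/usr/src/paperless/consume/failed")  # illustrative path, not the real setting
    FAILED_DIR.mkdir(parents=True, exist_ok=True)  # rename needs an existing target directory

    original = Path("/usr/src/paperless/consume/broken.pdf")
    if original.exists():
        original.rename(FAILED_DIR / original.name)  # same filesystem -> atomic rename
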
@@ -797,3 +802,32 @@ class ConsumerPlugin(
            copy_basic_file_stats(source, target)
        except Exception:  # pragma: no cover
            pass


class CleanPDFPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    NAME: str = "CleanPDFPlugin"

    def run(self) -> str | None:
        """
        Tries to clean a PDF file with qpdf
        """
        msg = None
        try:
            run_subprocess(
                [
                    "qpdf",
                    "--replace-input",
                    self.working_copy,
                ],
                logger=self.log,
            )
            msg = "PDF successfully cleaned"
        except Exception as e:
            msg = "Error while cleaning PDF"
            self.log.error(e)
        return msg
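
The plugin boils down to a single qpdf invocation: "qpdf --replace-input <file>" rewrites the PDF in place, which can repair some structural problems (for example damaged cross-reference tables) that would otherwise trip up downstream parsing. A rough standalone equivalent using the standard library's subprocess instead of the project's run_subprocess helper (a sketch only, assuming qpdf is on PATH):

    import subprocess
    from pathlib import Path

    def clean_pdf(path: Path) -> bool:
        """Ask qpdf to rewrite the PDF in place; return True on success."""
        try:
            subprocess.run(["qpdf", "--replace-input", str(path)], check=True)
            return True
        except (OSError, subprocess.CalledProcessError):
            return False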

@@ -679,24 +679,28 @@ class PaperlessTask(models.Model)
        verbose_name=_("Task State"),
        help_text=_("Current state of the task being run"),
    )
    date_created = models.DateTimeField(
        null=True,
        default=timezone.now,
        verbose_name=_("Created DateTime"),
        help_text=_("Datetime field when the task result was created in UTC"),
    )
    date_started = models.DateTimeField(
        null=True,
        default=None,
        verbose_name=_("Started DateTime"),
        help_text=_("Datetime field when the task was started in UTC"),
    )
    date_done = models.DateTimeField(
        null=True,
        default=None,
        verbose_name=_("Completed DateTime"),
        help_text=_("Datetime field when the task was completed in UTC"),
    )
    result = models.TextField(
        null=True,
        default=None,

@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
import tqdm
from celery import Task
from celery import shared_task
from celery import states
from django.conf import settings
from django.db import models
from django.db import transaction
@@ -23,16 +24,19 @@ from documents.barcodes import BarcodePlugin
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import CleanPDFPlugin
from documents.consumer import ConsumerPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
@@ -106,6 +110,7 @@ def consume_file(
    self: Task,
    input_doc: ConsumableDocument,
    overrides: DocumentMetadataOverrides | None = None,
    clean: bool = False,
):
    # Default no overrides
    if overrides is None:
@@ -118,6 +123,9 @@
        ConsumerPlugin,
    ]

    if clean:
        plugins.insert(0, CleanPDFPlugin)

    with (
        ProgressManager(
            overrides.filename or input_doc.original_file.name,
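
With the new keyword, a caller opts into the qpdf pass when queueing the task; because CleanPDFPlugin is inserted at position 0, it runs ahead of the other consume plugins. A hedged sketch of such a call (the path is illustrative; ConsumableDocument and DocumentSource are the existing data models):

    from pathlib import Path

    from documents.data_models import ConsumableDocument
    from documents.data_models import DocumentSource
    from documents.tasks import consume_file

    consume_file.delay(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=Path("/usr/src/paperless/consume/scan.pdf"),
        ),
        None,        # no metadata overrides
        clean=True,  # prepend CleanPDFPlugin to the plugin list
    )
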
@@ -169,6 +177,24 @@
    return msg


@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    task = PaperlessTask.objects.get(task_id=task_id, status=states.FAILURE)
    if task:
        failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
        if not failed_file.exists():
            logger.error(f"Failed file {failed_file} not found")
            return
        consume_file(
            ConsumableDocument(
                source=DocumentSource.ConsumeFolder,
                original_file=failed_file,
            ),
            clean=clean,
            # skip_ocr=skip_ocr,
        )


@shared_task
def sanity_check():
    messages = sanity_checker.check_sanity()
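
retry_failed_file looks the failed document up via its PaperlessTask row, so the natural way to drive it is from wherever such a row is at hand (a view, a management command, a shell). A hedged usage sketch, assuming a FAILURE-state task exists and its original file was moved into CONSUMPTION_FAILED_DIR by the consumer failure path shown earlier in this commit:

    from celery import states

    from documents.models import PaperlessTask
    from documents.tasks import retry_failed_file

    failed = (
        PaperlessTask.objects.filter(status=states.FAILURE)
        .order_by("-date_created")
        .first()
    )
    if failed is not None:
        # Queue the retry and ask for the qpdf cleaning pass this time around.
        retry_failed_file.delay(task_id=failed.task_id, clean=True)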

File diff suppressed because one or more lines are too long

@@ -1,21 +1,32 @@
import os
import shutil
import uuid
from datetime import timedelta
from pathlib import Path
from unittest import mock

from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone

from documents import tasks
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.signals.handlers import before_task_publish_handler
from documents.signals.handlers import task_failure_handler
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin


class TestIndexReindex(DirectoriesMixin, TestCase):
@@ -184,3 +195,66 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        tasks.empty_trash()
        self.assertEqual(Document.global_objects.count(), 0)


class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    @override_settings(CONSUMPTION_FAILED_DIR=Path(__file__).parent / "samples")
    @mock.patch("documents.consumer.run_subprocess")
    def test_retry_consume(self, m):
        test_file = self.SAMPLE_DIR / "corrupted.pdf"
        temp_copy = self.dirs.scratch_dir / test_file.name
        shutil.copy(test_file, temp_copy)

        headers = {
            "id": str(uuid.uuid4()),
            "task": "documents.tasks.consume_file",
        }
        body = (
            # args
            (
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=str(temp_copy),
                ),
                None,
            ),
            # kwargs
            {},
            # celery stuff
            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        )
        before_task_publish_handler(headers=headers, body=body)

        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
            with self.assertRaises(Exception):
                tasks.consume_file(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=temp_copy,
                    ),
                )
        task_failure_handler(
            task_id=headers["id"],
            exception="Example failure",
        )
        task = PaperlessTask.objects.first()

        # Ensure the file is moved to the failed dir
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)

        tasks.retry_failed_file(task_id=task.task_id)
        m.assert_called_once()
        args, _ = m.call_args
        command = args[0]

        self.assertEqual(command[0], "qpdf")
        self.assertEqual(command[1], "--replace-input")

@@ -35,6 +35,7 @@ def setup_directories():
    dirs.scratch_dir = Path(tempfile.mkdtemp())
    dirs.media_dir = Path(tempfile.mkdtemp())
    dirs.consumption_dir = Path(tempfile.mkdtemp())
    dirs.consumption_failed_dir = dirs.consumption_dir / "failed"
    dirs.static_dir = Path(tempfile.mkdtemp())
    dirs.index_dir = dirs.data_dir / "index"
    dirs.originals_dir = dirs.media_dir / "documents" / "originals"
@@ -56,6 +57,7 @@ def setup_directories():
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        ARCHIVE_DIR=dirs.archive_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
        LOGGING_DIR=dirs.logging_dir,
        INDEX_DIR=dirs.index_dir,
        STATIC_ROOT=dirs.static_dir,
@@ -72,6 +74,7 @@ def remove_dirs(dirs):
    shutil.rmtree(dirs.data_dir, ignore_errors=True)
    shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
    shutil.rmtree(dirs.static_dir, ignore_errors=True)
    dirs.settings_override.disable()

@@ -65,6 +65,10 @@ def paths_check(app_configs, **kwargs):
        + path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
        + path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
        + path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
        + path_check(
            "PAPERLESS_CONSUMPTION_FAILED_DIR",
            settings.CONSUMPTION_FAILED_DIR,
        )
    )

@@ -281,6 +281,11 @@ CONSUMPTION_DIR = __get_path(
    BASE_DIR.parent / "consume",
)

CONSUMPTION_FAILED_DIR = __get_path(
    "PAPERLESS_CONSUMPTION_FAILED_DIR",
    CONSUMPTION_DIR / "failed",
)

# This will be created if it doesn't exist
SCRATCH_DIR = __get_path(
    "PAPERLESS_SCRATCH_DIR",
@@ -890,6 +895,8 @@ CONSUMER_IGNORE_PATTERNS = list(
    ),
)

if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
    CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)

CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
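
The guard at the end only adds the ignore pattern when the failed directory is actually nested inside the consumption directory; if PAPERLESS_CONSUMPTION_FAILED_DIR points somewhere else entirely, there is nothing for the consumer to skip. A small sketch of how the pathlib check behaves (paths are illustrative):

    from pathlib import Path

    consume = Path("/usr/src/paperless/consume")

    nested = consume / "failed"
    print(consume in nested.parents)                # True  -> "failed" is added to the ignore patterns
    print(consume in Path("/data/failed").parents)  # False -> no ignore pattern added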