Messing around

This commit is contained in:
shamoon 2024-10-30 01:04:14 -07:00
parent 159344f033
commit bc4d3925cc
No known key found for this signature in database
9 changed files with 54186 additions and 0 deletions

View File

@ -18,6 +18,7 @@
# Paths and folders
#PAPERLESS_CONSUMPTION_DIR=../consume
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
#PAPERLESS_DATA_DIR=../data
#PAPERLESS_EMPTY_TRASH_DIR=
#PAPERLESS_MEDIA_ROOT=../media

View File

@ -148,6 +148,11 @@ class ConsumerPlugin(
):
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
# Move the file to the failed directory
if self.input_doc.original_file.exists():
self.input_doc.original_file.rename(
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
def pre_check_file_exists(self):
@ -797,3 +802,32 @@ class ConsumerPlugin(
copy_basic_file_stats(source, target)
except Exception: # pragma: no cover
pass
class CleanPDFPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    """
    Consume-task plugin that asks qpdf to rewrite the PDF in place before the
    remaining plugins run.

    The step is best effort: a failing qpdf call is logged and reported in the
    returned message, but never aborts the consume task.
    """

    NAME: str = "CleanPDFPlugin"

    def run(self) -> str | None:
        """
        Tries to clean a PDF file with qpdf

        Returns:
            A short status message describing whether the cleaning succeeded.
        """
        # NOTE(review): self.working_copy and self.log are expected to come
        # from the base classes, which are not visible here - confirm both
        # are set before this plugin runs.
        try:
            run_subprocess(
                [
                    "qpdf",
                    "--replace-input",
                    self.working_copy,
                ],
                logger=self.log,
            )
        except Exception:
            # Deliberately broad: cleaning is optional, so any failure is
            # only reported. Keep the traceback in the log so the cause can
            # still be diagnosed.
            msg = "Error while cleaning PDF"
            self.log.error(msg, exc_info=True)
            return msg
        return "PDF successfully cleaned"

View File

@ -679,24 +679,28 @@ class PaperlessTask(models.Model):
verbose_name=_("Task State"),
help_text=_("Current state of the task being run"),
)
date_created = models.DateTimeField(
null=True,
default=timezone.now,
verbose_name=_("Created DateTime"),
help_text=_("Datetime field when the task result was created in UTC"),
)
date_started = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Started DateTime"),
help_text=_("Datetime field when the task was started in UTC"),
)
date_done = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Completed DateTime"),
help_text=_("Datetime field when the task was completed in UTC"),
)
result = models.TextField(
null=True,
default=None,

View File

@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
import tqdm
from celery import Task
from celery import shared_task
from celery import states
from django.conf import settings
from django.db import models
from django.db import transaction
@ -23,16 +24,19 @@ from documents.barcodes import BarcodePlugin
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import CleanPDFPlugin
from documents.consumer import ConsumerPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
@ -106,6 +110,7 @@ def consume_file(
self: Task,
input_doc: ConsumableDocument,
overrides: DocumentMetadataOverrides | None = None,
clean: bool = False,
):
# Default no overrides
if overrides is None:
@ -118,6 +123,9 @@ def consume_file(
ConsumerPlugin,
]
if clean:
plugins.insert(0, CleanPDFPlugin)
with (
ProgressManager(
overrides.filename or input_doc.original_file.name,
@ -169,6 +177,24 @@ def consume_file(
return msg
@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    """
    Re-run consumption for a file whose earlier consume task failed.

    The failed task is looked up by its id, the matching file is located in
    settings.CONSUMPTION_FAILED_DIR and handed to consume_file again.

    Args:
        task_id: Id of the PaperlessTask to retry. Only tasks in the FAILURE
            state are considered.
        clean: Forwarded to consume_file; when True the file is run through
            CleanPDFPlugin first.
        skip_ocr: Accepted for forward compatibility, currently unused.

    Returns:
        None. Lookup problems are logged instead of raised.
    """
    # .get() raises DoesNotExist instead of returning a falsy value, so use
    # filter().first() to make the "no such task" case an explicit branch.
    task = PaperlessTask.objects.filter(
        task_id=task_id,
        status=states.FAILURE,
    ).first()
    if task is None:
        logger.error(f"No failed task found with id {task_id}")
        return

    # NOTE(review): assumes task_file_name may be empty/None for some tasks;
    # joining a path with None would raise a TypeError.
    if not task.task_file_name:
        logger.error(f"Failed task {task_id} has no file name, cannot retry")
        return

    failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
    if not failed_file.exists():
        logger.error(f"Failed file {failed_file} not found")
        return

    # TODO: forward skip_ocr once consume_file accepts it.
    consume_file(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=failed_file,
        ),
        clean=clean,
    )
@shared_task
def sanity_check():
messages = sanity_checker.check_sanity()

File diff suppressed because one or more lines are too long

View File

@ -1,21 +1,32 @@
import os
import shutil
import uuid
from datetime import timedelta
from pathlib import Path
from unittest import mock
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from documents import tasks
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.signals.handlers import before_task_publish_handler
from documents.signals.handlers import task_failure_handler
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
class TestIndexReindex(DirectoriesMixin, TestCase):
@ -184,3 +195,66 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)
class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    """
    Tests tasks.retry_failed_file for a document whose first consumption
    attempt failed.
    """
    # The failed directory is pointed at the "samples" folder next to this
    # test module.
    # NOTE(review): files moved on failure would then land inside the source
    # tree - confirm this is intended rather than a temporary directory.
    @override_settings(CONSUMPTION_FAILED_DIR=Path(__file__).parent / "samples")
    @mock.patch("documents.consumer.run_subprocess")
    def test_retry_consume(self, m):
        """
        GIVEN:
            - A copy of the corrupted sample PDF and a published consume task
        WHEN:
            - Consumption raises, the failure handler runs and the task is retried
        THEN:
            - The file is found in the failed directory
            - The mocked run_subprocess is called once with a qpdf command
        """
        # Work on a scratch copy of the sample file.
        test_file = self.SAMPLE_DIR / "corrupted.pdf"
        temp_copy = self.dirs.scratch_dir / test_file.name
        shutil.copy(test_file, temp_copy)
        # Build the headers/body pair passed to the publish handler below.
        headers = {
            "id": str(uuid.uuid4()),
            "task": "documents.tasks.consume_file",
        }
        body = (
            # args
            (
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=str(temp_copy),
                ),
                None,
            ),
            # kwargs
            {},
            # celery stuff
            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        )
        # presumably this records the PaperlessTask row queried further down
        # - verify against the handler
        before_task_publish_handler(headers=headers, body=body)
        # The first consumption attempt is expected to raise.
        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
            with self.assertRaises(Exception):
                tasks.consume_file(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=temp_copy,
                    ),
                )
        # NOTE(review): the statements below are assumed to run outside both
        # context managers - confirm against the original layout.
        task_failure_handler(
            task_id=headers["id"],
            exception="Example failure",
        )
        task = PaperlessTask.objects.first()
        # Ensure the file is moved to the failed dir
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
        # NOTE(review): retry_failed_file defaults to clean=False, and
        # consume_file only adds CleanPDFPlugin when clean is true, yet the
        # assertions below expect a qpdf call - confirm whether clean=True
        # should be passed here.
        tasks.retry_failed_file(task_id=task.task_id)
        m.assert_called_once()
        # The first positional argument of the mocked call is the command list.
        args, _ = m.call_args
        command = args[0]
        self.assertEqual(command[0], "qpdf")
        self.assertEqual(command[1], "--replace-input")

View File

@ -35,6 +35,7 @@ def setup_directories():
dirs.scratch_dir = Path(tempfile.mkdtemp())
dirs.media_dir = Path(tempfile.mkdtemp())
dirs.consumption_dir = Path(tempfile.mkdtemp())
dirs.consumption_failed_dir = dirs.consumption_dir / "failed"
dirs.static_dir = Path(tempfile.mkdtemp())
dirs.index_dir = dirs.data_dir / "index"
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
@ -56,6 +57,7 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
STATIC_ROOT=dirs.static_dir,
@ -72,6 +74,7 @@ def remove_dirs(dirs):
shutil.rmtree(dirs.data_dir, ignore_errors=True)
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
shutil.rmtree(dirs.static_dir, ignore_errors=True)
dirs.settings_override.disable()

View File

@ -65,6 +65,10 @@ def paths_check(app_configs, **kwargs):
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
+ path_check(
"PAPERLESS_CONSUMPTION_FAILED_DIR",
settings.CONSUMPTION_FAILED_DIR,
)
)

View File

@ -281,6 +281,11 @@ CONSUMPTION_DIR = __get_path(
BASE_DIR.parent / "consume",
)
# Directory for documents whose consumption failed. Defaults to a "failed"
# folder inside CONSUMPTION_DIR.
# NOTE(review): presumably __get_path lets PAPERLESS_CONSUMPTION_FAILED_DIR
# override the default - confirm against the helper.
CONSUMPTION_FAILED_DIR = __get_path(
    "PAPERLESS_CONSUMPTION_FAILED_DIR",
    CONSUMPTION_DIR / "failed",
)
# This will be created if it doesn't exist
SCRATCH_DIR = __get_path(
"PAPERLESS_SCRATCH_DIR",
@ -890,6 +895,8 @@ CONSUMER_IGNORE_PATTERNS = list(
),
),
)
# When the failed directory is nested somewhere below the consumption
# directory, add its folder name to the ignore patterns - presumably so the
# consumer does not pick failed files up again (verify against the consumer).
# Only the final path component is added, not the full path.
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
    CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")