mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Messing around
This commit is contained in:
parent
159344f033
commit
bc4d3925cc
@ -18,6 +18,7 @@
|
||||
# Paths and folders
|
||||
|
||||
#PAPERLESS_CONSUMPTION_DIR=../consume
|
||||
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
|
||||
#PAPERLESS_DATA_DIR=../data
|
||||
#PAPERLESS_EMPTY_TRASH_DIR=
|
||||
#PAPERLESS_MEDIA_ROOT=../media
|
||||
|
@ -148,6 +148,11 @@ class ConsumerPlugin(
|
||||
):
|
||||
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
|
||||
self.log.error(log_message or message, exc_info=exc_info)
|
||||
# Move the file to the failed directory
|
||||
if self.input_doc.original_file.exists():
|
||||
self.input_doc.original_file.rename(
|
||||
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||
)
|
||||
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
|
||||
|
||||
def pre_check_file_exists(self):
|
||||
@ -797,3 +802,32 @@ class ConsumerPlugin(
|
||||
copy_basic_file_stats(source, target)
|
||||
except Exception: # pragma: no cover
|
||||
pass
|
||||
|
||||
|
||||
class CleanPDFPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    """
    Consume-task plugin that asks qpdf to rewrite a PDF in place.

    Cleaning is best-effort: any exception raised while invoking qpdf is
    logged and reported through the returned status message instead of
    being propagated to the caller.
    """

    NAME: str = "CleanPDFPlugin"

    def run(self) -> str | None:
        """
        Tries to clean a PDF file with qpdf

        Returns a short status message: "PDF successfully cleaned" when the
        qpdf invocation did not raise, "Error while cleaning PDF" otherwise.
        """
        try:
            # NOTE(review): self.working_copy is not assigned anywhere in this
            # class; presumably a base class or an earlier step provides it.
            # If it is missing, the AttributeError ends up in the handler
            # below and qpdf is never invoked - confirm.
            run_subprocess(
                ["qpdf", "--replace-input", self.working_copy],
                logger=self.log,
            )
        except Exception as e:
            # Deliberately broad: a failed clean-up attempt is reported, not
            # raised. Keep the traceback so the failure can be diagnosed.
            self.log.error(f"Error while cleaning PDF: {e}", exc_info=True)
            return "Error while cleaning PDF"
        return "PDF successfully cleaned"
|
||||
|
@ -679,24 +679,28 @@ class PaperlessTask(models.Model):
|
||||
verbose_name=_("Task State"),
|
||||
help_text=_("Current state of the task being run"),
|
||||
)
|
||||
|
||||
date_created = models.DateTimeField(
|
||||
null=True,
|
||||
default=timezone.now,
|
||||
verbose_name=_("Created DateTime"),
|
||||
help_text=_("Datetime field when the task result was created in UTC"),
|
||||
)
|
||||
|
||||
date_started = models.DateTimeField(
|
||||
null=True,
|
||||
default=None,
|
||||
verbose_name=_("Started DateTime"),
|
||||
help_text=_("Datetime field when the task was started in UTC"),
|
||||
)
|
||||
|
||||
date_done = models.DateTimeField(
|
||||
null=True,
|
||||
default=None,
|
||||
verbose_name=_("Completed DateTime"),
|
||||
help_text=_("Datetime field when the task was completed in UTC"),
|
||||
)
|
||||
|
||||
result = models.TextField(
|
||||
null=True,
|
||||
default=None,
|
||||
|
@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
|
||||
import tqdm
|
||||
from celery import Task
|
||||
from celery import shared_task
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.db import transaction
|
||||
@ -23,16 +24,19 @@ from documents.barcodes import BarcodePlugin
|
||||
from documents.caching import clear_document_caches
|
||||
from documents.classifier import DocumentClassifier
|
||||
from documents.classifier import load_classifier
|
||||
from documents.consumer import CleanPDFPlugin
|
||||
from documents.consumer import ConsumerPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.double_sided import CollatePlugin
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.file_handling import generate_unique_filename
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import PaperlessTask
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.parsers import DocumentParser
|
||||
@ -106,6 +110,7 @@ def consume_file(
|
||||
self: Task,
|
||||
input_doc: ConsumableDocument,
|
||||
overrides: DocumentMetadataOverrides | None = None,
|
||||
clean: bool = False,
|
||||
):
|
||||
# Default no overrides
|
||||
if overrides is None:
|
||||
@ -118,6 +123,9 @@ def consume_file(
|
||||
ConsumerPlugin,
|
||||
]
|
||||
|
||||
if clean:
|
||||
plugins.insert(0, CleanPDFPlugin)
|
||||
|
||||
with (
|
||||
ProgressManager(
|
||||
overrides.filename or input_doc.original_file.name,
|
||||
@ -169,6 +177,24 @@ def consume_file(
|
||||
return msg
|
||||
|
||||
|
||||
@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    """
    Re-run consumption for the file that belongs to a failed task.

    Looks up the PaperlessTask with the given id in the FAILURE state,
    resolves its file inside settings.CONSUMPTION_FAILED_DIR and hands it to
    consume_file as a ConsumeFolder document.

    :param task_id: id of the failed task whose file should be retried
    :param clean: forwarded to consume_file as its ``clean`` argument
    :param skip_ocr: accepted but currently ignored; it is not forwarded to
        consume_file
    :return: None. A missing task or a missing file is logged as an error
        and the function returns without consuming anything.
    """
    # .get() would raise DoesNotExist for an unknown id rather than return a
    # falsy value, so use filter().first() to make the "not found" case
    # something we can actually handle.
    task = PaperlessTask.objects.filter(
        task_id=task_id,
        status=states.FAILURE,
    ).first()
    if task is None:
        logger.error(f"No failed task with id {task_id} found")
        return

    failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
    if not failed_file.exists():
        logger.error(f"Failed file {failed_file} not found")
        return

    # TODO: forward skip_ocr once consume_file supports it.
    consume_file(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=failed_file,
        ),
        clean=clean,
    )
|
||||
|
||||
|
||||
@shared_task
|
||||
def sanity_check():
|
||||
messages = sanity_checker.check_sanity()
|
||||
|
54033
src/documents/tests/samples/corrupted.pdf
Normal file
54033
src/documents/tests/samples/corrupted.pdf
Normal file
File diff suppressed because one or more lines are too long
@ -1,21 +1,32 @@
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
from django.utils import timezone
|
||||
|
||||
from documents import tasks
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import PaperlessTask
|
||||
from documents.models import Tag
|
||||
from documents.sanity_checker import SanityCheckFailedException
|
||||
from documents.sanity_checker import SanityCheckMessages
|
||||
from documents.signals.handlers import before_task_publish_handler
|
||||
from documents.signals.handlers import task_failure_handler
|
||||
from documents.tests.test_classifier import dummy_preprocess
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import SampleDirMixin
|
||||
|
||||
|
||||
class TestIndexReindex(DirectoriesMixin, TestCase):
|
||||
@ -184,3 +195,66 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
tasks.empty_trash()
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    """
    Exercises retry_failed_file for a document whose first consumption failed.
    """

    # NOTE(review): the failed directory is pointed at the samples folder that
    # presumably also holds the corrupted.pdf fixture. If so, the "moved to
    # the failed dir" assertion below passes even when nothing was moved, and
    # a real move would replace the fixture - consider a temporary directory.
    @override_settings(CONSUMPTION_FAILED_DIR=Path(__file__).parent / "samples")
    @mock.patch("documents.consumer.run_subprocess")
    def test_retry_consume(self, m):
        """
        A failed consume is retried with cleaning enabled and qpdf is invoked
        through the (mocked) run_subprocess helper.
        """
        test_file = self.SAMPLE_DIR / "corrupted.pdf"
        temp_copy = self.dirs.scratch_dir / test_file.name
        shutil.copy(test_file, temp_copy)

        # Register the task the same way a publish of consume_file would.
        headers = {
            "id": str(uuid.uuid4()),
            "task": "documents.tasks.consume_file",
        }
        body = (
            # args
            (
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=str(temp_copy),
                ),
                None,
            ),
            # kwargs
            {},
            # celery stuff
            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        )
        before_task_publish_handler(headers=headers, body=body)

        # The first attempt is expected to fail.
        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
            with self.assertRaises(Exception):
                tasks.consume_file(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=temp_copy,
                    ),
                )

        task_failure_handler(
            task_id=headers["id"],
            exception="Example failure",
        )

        task = PaperlessTask.objects.first()
        # Ensure the file is moved to the failed dir
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)

        # consume_file only adds CleanPDFPlugin when clean is truthy, and
        # retry_failed_file defaults clean to False, so it has to be requested
        # explicitly for the qpdf assertions below to be reachable.
        # NOTE(review): qpdf is mocked, so the file is still corrupted when
        # the remaining plugins run; this call may raise - confirm.
        tasks.retry_failed_file(task_id=task.task_id, clean=True)

        m.assert_called_once()

        args, _ = m.call_args

        command = args[0]

        self.assertEqual(command[0], "qpdf")
        self.assertEqual(command[1], "--replace-input")
|
||||
|
@ -35,6 +35,7 @@ def setup_directories():
|
||||
dirs.scratch_dir = Path(tempfile.mkdtemp())
|
||||
dirs.media_dir = Path(tempfile.mkdtemp())
|
||||
dirs.consumption_dir = Path(tempfile.mkdtemp())
|
||||
dirs.consumption_failed_dir = dirs.consumption_dir / "failed"
|
||||
dirs.static_dir = Path(tempfile.mkdtemp())
|
||||
dirs.index_dir = dirs.data_dir / "index"
|
||||
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
|
||||
@ -56,6 +57,7 @@ def setup_directories():
|
||||
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
||||
ARCHIVE_DIR=dirs.archive_dir,
|
||||
CONSUMPTION_DIR=dirs.consumption_dir,
|
||||
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
|
||||
LOGGING_DIR=dirs.logging_dir,
|
||||
INDEX_DIR=dirs.index_dir,
|
||||
STATIC_ROOT=dirs.static_dir,
|
||||
@ -72,6 +74,7 @@ def remove_dirs(dirs):
|
||||
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.static_dir, ignore_errors=True)
|
||||
dirs.settings_override.disable()
|
||||
|
||||
|
@ -65,6 +65,10 @@ def paths_check(app_configs, **kwargs):
|
||||
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
|
||||
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
|
||||
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
|
||||
+ path_check(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
settings.CONSUMPTION_FAILED_DIR,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
@ -281,6 +281,11 @@ CONSUMPTION_DIR = __get_path(
|
||||
BASE_DIR.parent / "consume",
|
||||
)
|
||||
|
||||
CONSUMPTION_FAILED_DIR = __get_path(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
CONSUMPTION_DIR / "failed",
|
||||
)
|
||||
|
||||
# This will be created if it doesn't exist
|
||||
SCRATCH_DIR = __get_path(
|
||||
"PAPERLESS_SCRATCH_DIR",
|
||||
@ -890,6 +895,8 @@ CONSUMER_IGNORE_PATTERNS = list(
|
||||
),
|
||||
),
|
||||
)
|
||||
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
|
||||
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
|
||||
|
||||
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user