mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Messing around
This commit is contained in:
parent
159344f033
commit
bc4d3925cc
@ -18,6 +18,7 @@
|
|||||||
# Paths and folders
|
# Paths and folders
|
||||||
|
|
||||||
#PAPERLESS_CONSUMPTION_DIR=../consume
|
#PAPERLESS_CONSUMPTION_DIR=../consume
|
||||||
|
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
|
||||||
#PAPERLESS_DATA_DIR=../data
|
#PAPERLESS_DATA_DIR=../data
|
||||||
#PAPERLESS_EMPTY_TRASH_DIR=
|
#PAPERLESS_EMPTY_TRASH_DIR=
|
||||||
#PAPERLESS_MEDIA_ROOT=../media
|
#PAPERLESS_MEDIA_ROOT=../media
|
||||||
|
@ -148,6 +148,11 @@ class ConsumerPlugin(
|
|||||||
):
|
):
|
||||||
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
|
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
|
||||||
self.log.error(log_message or message, exc_info=exc_info)
|
self.log.error(log_message or message, exc_info=exc_info)
|
||||||
|
# Move the file to the failed directory
|
||||||
|
if self.input_doc.original_file.exists():
|
||||||
|
self.input_doc.original_file.rename(
|
||||||
|
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||||
|
)
|
||||||
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
|
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
|
||||||
|
|
||||||
def pre_check_file_exists(self):
|
def pre_check_file_exists(self):
|
||||||
@ -797,3 +802,32 @@ class ConsumerPlugin(
|
|||||||
copy_basic_file_stats(source, target)
|
copy_basic_file_stats(source, target)
|
||||||
except Exception: # pragma: no cover
|
except Exception: # pragma: no cover
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class CleanPDFPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    """
    Consume-task plugin which asks qpdf to rewrite the working copy in place.

    The plugin is best-effort: a failure of the qpdf call is logged and
    reported through the returned message, it is never re-raised.
    """

    NAME: str = "CleanPDFPlugin"

    def run(self) -> str | None:
        """
        Tries to clean a PDF file with qpdf

        Returns a short status message describing whether the cleaning
        succeeded or failed.
        """
        # NOTE(review): `self.working_copy` and `self.log` are not defined in
        # this class; presumably they are provided by a base class - confirm.
        try:
            run_subprocess(
                [
                    "qpdf",
                    # Have qpdf overwrite the given file instead of writing
                    # to a separate output file
                    "--replace-input",
                    self.working_copy,
                ],
                logger=self.log,
            )
        except Exception as e:
            # Deliberately broad: cleaning must not abort the task. Keep the
            # traceback so the underlying reason is not lost in the log.
            self.log.error(f"Error while cleaning PDF: {e}", exc_info=True)
            return "Error while cleaning PDF"
        return "PDF successfully cleaned"
|
||||||
|
@ -679,24 +679,28 @@ class PaperlessTask(models.Model):
|
|||||||
verbose_name=_("Task State"),
|
verbose_name=_("Task State"),
|
||||||
help_text=_("Current state of the task being run"),
|
help_text=_("Current state of the task being run"),
|
||||||
)
|
)
|
||||||
|
|
||||||
date_created = models.DateTimeField(
|
date_created = models.DateTimeField(
|
||||||
null=True,
|
null=True,
|
||||||
default=timezone.now,
|
default=timezone.now,
|
||||||
verbose_name=_("Created DateTime"),
|
verbose_name=_("Created DateTime"),
|
||||||
help_text=_("Datetime field when the task result was created in UTC"),
|
help_text=_("Datetime field when the task result was created in UTC"),
|
||||||
)
|
)
|
||||||
|
|
||||||
date_started = models.DateTimeField(
|
date_started = models.DateTimeField(
|
||||||
null=True,
|
null=True,
|
||||||
default=None,
|
default=None,
|
||||||
verbose_name=_("Started DateTime"),
|
verbose_name=_("Started DateTime"),
|
||||||
help_text=_("Datetime field when the task was started in UTC"),
|
help_text=_("Datetime field when the task was started in UTC"),
|
||||||
)
|
)
|
||||||
|
|
||||||
date_done = models.DateTimeField(
|
date_done = models.DateTimeField(
|
||||||
null=True,
|
null=True,
|
||||||
default=None,
|
default=None,
|
||||||
verbose_name=_("Completed DateTime"),
|
verbose_name=_("Completed DateTime"),
|
||||||
help_text=_("Datetime field when the task was completed in UTC"),
|
help_text=_("Datetime field when the task was completed in UTC"),
|
||||||
)
|
)
|
||||||
|
|
||||||
result = models.TextField(
|
result = models.TextField(
|
||||||
null=True,
|
null=True,
|
||||||
default=None,
|
default=None,
|
||||||
|
@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
|
|||||||
import tqdm
|
import tqdm
|
||||||
from celery import Task
|
from celery import Task
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
@ -23,16 +24,19 @@ from documents.barcodes import BarcodePlugin
|
|||||||
from documents.caching import clear_document_caches
|
from documents.caching import clear_document_caches
|
||||||
from documents.classifier import DocumentClassifier
|
from documents.classifier import DocumentClassifier
|
||||||
from documents.classifier import load_classifier
|
from documents.classifier import load_classifier
|
||||||
|
from documents.consumer import CleanPDFPlugin
|
||||||
from documents.consumer import ConsumerPlugin
|
from documents.consumer import ConsumerPlugin
|
||||||
from documents.consumer import WorkflowTriggerPlugin
|
from documents.consumer import WorkflowTriggerPlugin
|
||||||
from documents.data_models import ConsumableDocument
|
from documents.data_models import ConsumableDocument
|
||||||
from documents.data_models import DocumentMetadataOverrides
|
from documents.data_models import DocumentMetadataOverrides
|
||||||
|
from documents.data_models import DocumentSource
|
||||||
from documents.double_sided import CollatePlugin
|
from documents.double_sided import CollatePlugin
|
||||||
from documents.file_handling import create_source_path_directory
|
from documents.file_handling import create_source_path_directory
|
||||||
from documents.file_handling import generate_unique_filename
|
from documents.file_handling import generate_unique_filename
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
|
from documents.models import PaperlessTask
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
@ -106,6 +110,7 @@ def consume_file(
|
|||||||
self: Task,
|
self: Task,
|
||||||
input_doc: ConsumableDocument,
|
input_doc: ConsumableDocument,
|
||||||
overrides: DocumentMetadataOverrides | None = None,
|
overrides: DocumentMetadataOverrides | None = None,
|
||||||
|
clean: bool = False,
|
||||||
):
|
):
|
||||||
# Default no overrides
|
# Default no overrides
|
||||||
if overrides is None:
|
if overrides is None:
|
||||||
@ -118,6 +123,9 @@ def consume_file(
|
|||||||
ConsumerPlugin,
|
ConsumerPlugin,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if clean:
|
||||||
|
plugins.insert(0, CleanPDFPlugin)
|
||||||
|
|
||||||
with (
|
with (
|
||||||
ProgressManager(
|
ProgressManager(
|
||||||
overrides.filename or input_doc.original_file.name,
|
overrides.filename or input_doc.original_file.name,
|
||||||
@ -169,6 +177,24 @@ def consume_file(
|
|||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    """
    Re-runs consumption for the file of a previously failed task.

    The failed task is looked up by its id, the file is expected inside
    settings.CONSUMPTION_FAILED_DIR under the task's file name.

    :param task_id: id of the PaperlessTask which failed
    :param clean: forwarded to consume_file, which then runs CleanPDFPlugin
        before the other plugins
    :param skip_ocr: currently unused, see the TODO below
    """
    # .get() would raise DoesNotExist for an unknown or non-failed task,
    # so query with filter().first() to handle "not found" explicitly
    task = PaperlessTask.objects.filter(
        task_id=task_id,
        status=states.FAILURE,
    ).first()
    if task is None:
        logger.error(f"No failed task found with id {task_id}")
        return

    failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
    if not failed_file.exists():
        logger.error(f"Failed file {failed_file} not found")
        return

    # TODO: forward skip_ocr once consume_file accepts such an argument
    consume_file(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=failed_file,
        ),
        clean=clean,
    )
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
def sanity_check():
|
def sanity_check():
|
||||||
messages = sanity_checker.check_sanity()
|
messages = sanity_checker.check_sanity()
|
||||||
|
54033
src/documents/tests/samples/corrupted.pdf
Normal file
54033
src/documents/tests/samples/corrupted.pdf
Normal file
File diff suppressed because one or more lines are too long
@ -1,21 +1,32 @@
|
|||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
from pathlib import Path
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
from django.test import override_settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from documents import tasks
|
from documents import tasks
|
||||||
|
from documents.data_models import ConsumableDocument
|
||||||
|
from documents.data_models import DocumentSource
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
|
from documents.models import PaperlessTask
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.sanity_checker import SanityCheckFailedException
|
from documents.sanity_checker import SanityCheckFailedException
|
||||||
from documents.sanity_checker import SanityCheckMessages
|
from documents.sanity_checker import SanityCheckMessages
|
||||||
|
from documents.signals.handlers import before_task_publish_handler
|
||||||
|
from documents.signals.handlers import task_failure_handler
|
||||||
from documents.tests.test_classifier import dummy_preprocess
|
from documents.tests.test_classifier import dummy_preprocess
|
||||||
from documents.tests.utils import DirectoriesMixin
|
from documents.tests.utils import DirectoriesMixin
|
||||||
|
from documents.tests.utils import DummyProgressManager
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
|
from documents.tests.utils import SampleDirMixin
|
||||||
|
|
||||||
|
|
||||||
class TestIndexReindex(DirectoriesMixin, TestCase):
|
class TestIndexReindex(DirectoriesMixin, TestCase):
|
||||||
@ -184,3 +195,66 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
tasks.empty_trash()
|
tasks.empty_trash()
|
||||||
self.assertEqual(Document.global_objects.count(), 0)
|
self.assertEqual(Document.global_objects.count(), 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    """
    Tests retrying the consumption of a file whose first consumption failed.
    """

    # NOTE(review): the failed dir is pointed at the checked-in samples
    # folder, so the failing consume below moves its copy into the
    # repository's sample files - confirm this is intended.
    @override_settings(CONSUMPTION_FAILED_DIR=Path(__file__).parent / "samples")
    @mock.patch("documents.consumer.run_subprocess")
    def test_retry_consume(self, m):
        """
        GIVEN:
            - A corrupted PDF whose consumption fails
        WHEN:
            - The failed task is retried with cleaning enabled
        THEN:
            - qpdf is invoked once with --replace-input
        """
        test_file = self.SAMPLE_DIR / "corrupted.pdf"
        # Work on a copy, consumption moves the file around
        temp_copy = self.dirs.scratch_dir / test_file.name
        shutil.copy(test_file, temp_copy)

        # Publish a fake consume task, as a celery message would look like
        headers = {
            "id": str(uuid.uuid4()),
            "task": "documents.tasks.consume_file",
        }
        body = (
            # args
            (
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=str(temp_copy),
                ),
                None,
            ),
            # kwargs
            {},
            # celery stuff
            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        )
        before_task_publish_handler(headers=headers, body=body)

        # The first consumption attempt is expected to fail
        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
            with self.assertRaises(Exception):
                tasks.consume_file(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=temp_copy,
                    ),
                )

        task_failure_handler(
            task_id=headers["id"],
            exception="Example failure",
        )

        task = PaperlessTask.objects.first()
        # Ensure the file is moved to the failed dir
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)

        # qpdf is only run when cleaning is requested, the default of
        # retry_failed_file is clean=False
        # NOTE(review): with qpdf mocked the file stays corrupted, so the
        # retried consumption may still raise here - confirm.
        tasks.retry_failed_file(task_id=task.task_id, clean=True)

        m.assert_called_once()

        args, _ = m.call_args

        command = args[0]

        self.assertEqual(command[0], "qpdf")
        self.assertEqual(command[1], "--replace-input")
|
||||||
|
@ -35,6 +35,7 @@ def setup_directories():
|
|||||||
dirs.scratch_dir = Path(tempfile.mkdtemp())
|
dirs.scratch_dir = Path(tempfile.mkdtemp())
|
||||||
dirs.media_dir = Path(tempfile.mkdtemp())
|
dirs.media_dir = Path(tempfile.mkdtemp())
|
||||||
dirs.consumption_dir = Path(tempfile.mkdtemp())
|
dirs.consumption_dir = Path(tempfile.mkdtemp())
|
||||||
|
dirs.consumption_failed_dir = dirs.consumption_dir / "failed"
|
||||||
dirs.static_dir = Path(tempfile.mkdtemp())
|
dirs.static_dir = Path(tempfile.mkdtemp())
|
||||||
dirs.index_dir = dirs.data_dir / "index"
|
dirs.index_dir = dirs.data_dir / "index"
|
||||||
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
|
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
|
||||||
@ -56,6 +57,7 @@ def setup_directories():
|
|||||||
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
||||||
ARCHIVE_DIR=dirs.archive_dir,
|
ARCHIVE_DIR=dirs.archive_dir,
|
||||||
CONSUMPTION_DIR=dirs.consumption_dir,
|
CONSUMPTION_DIR=dirs.consumption_dir,
|
||||||
|
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
|
||||||
LOGGING_DIR=dirs.logging_dir,
|
LOGGING_DIR=dirs.logging_dir,
|
||||||
INDEX_DIR=dirs.index_dir,
|
INDEX_DIR=dirs.index_dir,
|
||||||
STATIC_ROOT=dirs.static_dir,
|
STATIC_ROOT=dirs.static_dir,
|
||||||
@ -72,6 +74,7 @@ def remove_dirs(dirs):
|
|||||||
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
||||||
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
||||||
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
||||||
|
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
|
||||||
shutil.rmtree(dirs.static_dir, ignore_errors=True)
|
shutil.rmtree(dirs.static_dir, ignore_errors=True)
|
||||||
dirs.settings_override.disable()
|
dirs.settings_override.disable()
|
||||||
|
|
||||||
|
@ -65,6 +65,10 @@ def paths_check(app_configs, **kwargs):
|
|||||||
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
|
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
|
||||||
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
|
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
|
||||||
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
|
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
|
||||||
|
+ path_check(
|
||||||
|
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||||
|
settings.CONSUMPTION_FAILED_DIR,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -281,6 +281,11 @@ CONSUMPTION_DIR = __get_path(
|
|||||||
BASE_DIR.parent / "consume",
|
BASE_DIR.parent / "consume",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
CONSUMPTION_FAILED_DIR = __get_path(
|
||||||
|
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||||
|
CONSUMPTION_DIR / "failed",
|
||||||
|
)
|
||||||
|
|
||||||
# This will be created if it doesn't exist
|
# This will be created if it doesn't exist
|
||||||
SCRATCH_DIR = __get_path(
|
SCRATCH_DIR = __get_path(
|
||||||
"PAPERLESS_SCRATCH_DIR",
|
"PAPERLESS_SCRATCH_DIR",
|
||||||
@ -890,6 +895,8 @@ CONSUMER_IGNORE_PATTERNS = list(
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
|
||||||
|
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
|
||||||
|
|
||||||
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
|
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user