Enhancement: system status report sanity check, simpler classifier check, styling updates (#9106)

Authored by shamoon on 2025-02-26 14:12:20 -08:00, committed by GitHub
parent ec34197b59
commit 2d52226732
30 changed files with 1117 additions and 479 deletions

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import pickle
import re
import time
import warnings
from hashlib import sha256
from pathlib import Path
@@ -144,19 +143,6 @@ class DocumentClassifier:
):
raise IncompatibleClassifierVersionError("sklearn version update")
def set_last_checked(self) -> None:
# save a timestamp of the last time we checked for retraining to a file
with Path(settings.MODEL_FILE.with_suffix(".last_checked")).open("w") as f:
f.write(str(time.time()))
def get_last_checked(self) -> float | None:
# load the timestamp of the last time we checked for retraining
try:
with Path(settings.MODEL_FILE.with_suffix(".last_checked")).open("r") as f:
return float(f.read())
except FileNotFoundError: # pragma: no cover
return None
def save(self) -> None:
target_file: Path = settings.MODEL_FILE
target_file_temp: Path = target_file.with_suffix(".pickle.part")
@@ -177,7 +163,6 @@ class DocumentClassifier:
pickle.dump(self.storage_path_classifier, f)
target_file_temp.rename(target_file)
self.set_last_checked()
def train(self) -> bool:
# Get non-inbox documents
@@ -246,7 +231,6 @@ class DocumentClassifier:
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
self.set_last_checked()
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(

View File

@@ -37,6 +37,7 @@ from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Log
from documents.models import PaperlessTask
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
@@ -775,6 +776,21 @@ class ShareLinkFilterSet(FilterSet):
}
class PaperlessTaskFilterSet(FilterSet):
acknowledged = BooleanFilter(
label="Acknowledged",
field_name="acknowledged",
)
class Meta:
model = PaperlessTask
fields = {
"type": ["exact"],
"task_name": ["exact"],
"status": ["exact"],
}
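
The new PaperlessTaskFilterSet exposes acknowledged, type, task_name and status as query parameters on the tasks endpoint. A minimal sketch of querying them over the API; the base URL and token are placeholders, and the exact response shape is not shown in this diff:

import requests

# Hypothetical instance and credentials; only the query parameters come from the new filterset.
resp = requests.get(
    "http://localhost:8000/api/tasks/",
    params={"acknowledged": "false", "task_name": "train_classifier", "status": "SUCCESS"},
    headers={"Authorization": "Token <api-token>"},
)
print(resp.status_code, resp.json())
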
class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter):
"""
A filter backend that limits results to those where the requesting user

View File

@@ -10,4 +10,4 @@ class Command(BaseCommand):
)
def handle(self, *args, **options):
train_classifier()
train_classifier(scheduled=False)

View File

@@ -12,6 +12,6 @@ class Command(ProgressBarMixin, BaseCommand):
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar)
messages = check_sanity(progress=self.use_progress_bar, scheduled=False)
messages.log_messages()

View File

@@ -1,4 +1,4 @@
# Generated by Django 5.1.6 on 2025-02-20 04:55
# Generated by Django 5.1.6 on 2025-02-21 16:34
import multiselectfield.db.fields
from django.db import migrations
@@ -16,12 +16,52 @@ def update_workflow_sources(apps, schema_editor):
trigger.save()
def make_existing_tasks_consume_auto(apps, schema_editor):
PaperlessTask = apps.get_model("documents", "PaperlessTask")
PaperlessTask.objects.all().update(type="auto_task", task_name="consume_file")
class Migration(migrations.Migration):
dependencies = [
("documents", "1062_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.AddField(
model_name="paperlesstask",
name="type",
field=models.CharField(
choices=[
("auto_task", "Auto Task"),
("scheduled_task", "Scheduled Task"),
("manual_task", "Manual Task"),
],
default="auto_task",
help_text="The type of task that was run",
max_length=30,
verbose_name="Task Type",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="task_name",
field=models.CharField(
choices=[
("consume_file", "Consume File"),
("train_classifier", "Train Classifier"),
("check_sanity", "Check Sanity"),
("index_optimize", "Index Optimize"),
],
help_text="Name of the task that was run",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.RunPython(
code=make_existing_tasks_consume_auto,
reverse_code=migrations.RunPython.noop,
),
migrations.AlterField(
model_name="workflowactionwebhook",
name="url",

View File

@@ -650,6 +650,17 @@ class PaperlessTask(ModelWithOwner):
ALL_STATES = sorted(states.ALL_STATES)
TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES))
class TaskType(models.TextChoices):
AUTO = ("auto_task", _("Auto Task"))
SCHEDULED_TASK = ("scheduled_task", _("Scheduled Task"))
MANUAL_TASK = ("manual_task", _("Manual Task"))
class TaskName(models.TextChoices):
CONSUME_FILE = ("consume_file", _("Consume File"))
TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
CHECK_SANITY = ("check_sanity", _("Check Sanity"))
INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
task_id = models.CharField(
max_length=255,
unique=True,
@@ -673,8 +684,9 @@ class PaperlessTask(ModelWithOwner):
task_name = models.CharField(
null=True,
max_length=255,
choices=TaskName.choices,
verbose_name=_("Task Name"),
help_text=_("Name of the Task which was run"),
help_text=_("Name of the task that was run"),
)
status = models.CharField(
@@ -684,24 +696,28 @@ class PaperlessTask(ModelWithOwner):
verbose_name=_("Task State"),
help_text=_("Current state of the task being run"),
)
date_created = models.DateTimeField(
null=True,
default=timezone.now,
verbose_name=_("Created DateTime"),
help_text=_("Datetime field when the task result was created in UTC"),
)
date_started = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Started DateTime"),
help_text=_("Datetime field when the task was started in UTC"),
)
date_done = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Completed DateTime"),
help_text=_("Datetime field when the task was completed in UTC"),
)
result = models.TextField(
null=True,
default=None,
@@ -711,6 +727,14 @@ class PaperlessTask(ModelWithOwner):
),
)
type = models.CharField(
max_length=30,
choices=TaskType.choices,
default=TaskType.AUTO,
verbose_name=_("Task Type"),
help_text=_("The type of task that was run"),
)
def __str__(self) -> str:
return f"Task {self.task_id}"
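
The TaskType and TaskName enums give PaperlessTask a constrained vocabulary for what ran and how it was triggered. A minimal sketch of recording a manual run with the new fields, assuming a Django shell inside the project (the values mirror what check_sanity and train_classifier now write):

import uuid

from celery import states
from django.utils import timezone

from documents.models import PaperlessTask

task = PaperlessTask.objects.create(
    task_id=str(uuid.uuid4()),
    type=PaperlessTask.TaskType.MANUAL_TASK,
    task_name=PaperlessTask.TaskName.CHECK_SANITY,
    status=states.STARTED,
    date_created=timezone.now(),
    date_started=timezone.now(),
)
# Human-readable labels come from the choices declared above.
print(task.get_type_display(), task.get_task_name_display())  # Manual Task / Check Sanity
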

View File

@@ -1,13 +1,17 @@
import hashlib
import logging
import uuid
from collections import defaultdict
from pathlib import Path
from typing import Final
from celery import states
from django.conf import settings
from django.utils import timezone
from tqdm import tqdm
from documents.models import Document
from documents.models import PaperlessTask
class SanityCheckMessages:
@@ -57,7 +61,17 @@ class SanityCheckFailedException(Exception):
pass
def check_sanity(*, progress=False) -> SanityCheckMessages:
def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
paperless_task = PaperlessTask.objects.create(
task_id=uuid.uuid4(),
type=PaperlessTask.TaskType.SCHEDULED_TASK
if scheduled
else PaperlessTask.TaskType.MANUAL_TASK,
task_name=PaperlessTask.TaskName.CHECK_SANITY,
status=states.STARTED,
date_created=timezone.now(),
date_started=timezone.now(),
)
messages = SanityCheckMessages()
present_files = {
@@ -142,4 +156,11 @@ def check_sanity(*, progress=False) -> SanityCheckMessages:
for extra_file in present_files:
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
paperless_task.status = states.SUCCESS if not messages.has_error else states.FAILURE
# result is concatenated messages
paperless_task.result = f"{len(messages)} issues found."
if messages.has_error:
paperless_task.result += " Check logs for details."
paperless_task.date_done = timezone.now()
paperless_task.save(update_fields=["status", "result", "date_done"])
return messages
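
check_sanity now records its own PaperlessTask row: scheduled=True (the default, used by the periodic Celery task) marks it as a scheduled task, while the management command passes scheduled=False to mark it as manual. A minimal sketch of a manual invocation, assuming a Django context:

from documents.sanity_checker import check_sanity

messages = check_sanity(progress=False, scheduled=False)
messages.log_messages()
# The task row stores a summary like "N issues found." plus SUCCESS/FAILURE,
# which the system status endpoint reads later.
print(len(messages), "issues found")
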

View File

@@ -1710,6 +1710,7 @@ class TasksViewSerializer(OwnedObjectSerializer):
fields = (
"id",
"task_id",
"task_name",
"task_file_name",
"date_created",
"date_done",
@@ -1721,12 +1722,6 @@ class TasksViewSerializer(OwnedObjectSerializer):
"owner",
)
type = serializers.SerializerMethodField()
def get_type(self, obj) -> str:
# just file tasks, for now
return "file"
related_document = serializers.SerializerMethodField()
created_doc_re = re.compile(r"New document id (\d+) created")
duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")
@@ -1734,24 +1729,33 @@ class TasksViewSerializer(OwnedObjectSerializer):
def get_related_document(self, obj) -> str | None:
result = None
re = None
match obj.status:
case states.SUCCESS:
re = self.created_doc_re
case states.FAILURE:
re = (
self.duplicate_doc_re
if "existing document is in the trash" not in obj.result
else None
)
if re is not None:
try:
result = re.search(obj.result).group(1)
except Exception:
pass
if obj.result:
match obj.status:
case states.SUCCESS:
re = self.created_doc_re
case states.FAILURE:
re = (
self.duplicate_doc_re
if "existing document is in the trash" not in obj.result
else None
)
if re is not None:
try:
result = re.search(obj.result).group(1)
except Exception:
pass
return result
class RunTaskViewSerializer(serializers.Serializer):
task_name = serializers.ChoiceField(
choices=PaperlessTask.TaskName.choices,
label="Task Name",
write_only=True,
)
class AcknowledgeTasksViewSerializer(serializers.Serializer):
tasks = serializers.ListField(
required=True,

View File

@@ -1255,10 +1255,11 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):
user_id = overrides.owner_id if overrides else None
PaperlessTask.objects.create(
type=PaperlessTask.TaskType.AUTO,
task_id=headers["id"],
status=states.PENDING,
task_file_name=task_file_name,
task_name=headers["task"],
task_name=PaperlessTask.TaskName.CONSUME_FILE,
result=None,
date_created=timezone.now(),
date_started=None,

View File

@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
import tqdm
from celery import Task
from celery import shared_task
from celery import states
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db import models
@@ -35,6 +36,7 @@ from documents.models import Correspondent
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import StoragePath
from documents.models import Tag
from documents.models import Workflow
@@ -74,19 +76,34 @@ def index_reindex(*, progress_bar_disable=False):
@shared_task
def train_classifier():
def train_classifier(*, scheduled=True):
task = PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK
if scheduled
else PaperlessTask.TaskType.MANUAL_TASK,
task_id=uuid.uuid4(),
task_name=PaperlessTask.TaskName.TRAIN_CLASSIFIER,
status=states.STARTED,
date_created=timezone.now(),
date_started=timezone.now(),
)
if (
not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
):
logger.info("No automatic matching items, not training")
result = "No automatic matching items, not training"
logger.info(result)
# Special case, items were once auto and trained, so remove the model
# and prevent its use again
if settings.MODEL_FILE.exists():
logger.info(f"Removing {settings.MODEL_FILE} so it won't be used")
settings.MODEL_FILE.unlink()
task.status = states.SUCCESS
task.result = result
task.date_done = timezone.now()
task.save()
return
classifier = load_classifier()
@@ -100,11 +117,19 @@ def train_classifier():
f"Saving updated classifier model to {settings.MODEL_FILE}...",
)
classifier.save()
task.result = "Training completed successfully"
else:
logger.debug("Training data unchanged.")
task.result = "Training data unchanged"
task.status = states.SUCCESS
task.date_done = timezone.now()
task.save(update_fields=["status", "result", "date_done"])
except Exception as e:
logger.warning("Classifier error: " + str(e))
task.status = states.FAILURE
task.result = str(e)
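
train_classifier follows the same pattern: it creates a STARTED task record up front and updates it with the outcome. A minimal sketch of the two ways it is now invoked, assuming a configured Django/Celery environment:

from documents.tasks import train_classifier

# Synchronous call, as the management command and the new run endpoint do;
# scheduled=False records the run as a manual task.
train_classifier(scheduled=False)

# Dispatch through Celery; the default scheduled=True records it as a scheduled task.
train_classifier.delay()
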
@shared_task(bind=True)
@@ -176,13 +201,16 @@ def consume_file(
@shared_task
def sanity_check():
messages = sanity_checker.check_sanity()
def sanity_check(*, scheduled=True, raise_on_error=True):
messages = sanity_checker.check_sanity(scheduled=scheduled)
messages.log_messages()
if messages.has_error:
raise SanityCheckFailedException("Sanity check failed with errors. See log.")
message = "Sanity check exited with errors. See log."
if raise_on_error:
raise SanityCheckFailedException(message)
return message
elif messages.has_warning:
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:

View File

@@ -1,18 +1,14 @@
import os
import tempfile
from pathlib import Path
from unittest import mock
from celery import states
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework import status
from rest_framework.test import APITestCase
from documents.classifier import ClassifierModelCorruptError
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.models import Document
from documents.models import Tag
from documents.models import PaperlessTask
from paperless import version
@@ -193,7 +189,6 @@ class TestSystemStatus(APITestCase):
self.assertEqual(response.data["tasks"]["index_status"], "ERROR")
self.assertIsNotNone(response.data["tasks"]["index_error"])
@override_settings(DATA_DIR=Path("/tmp/does_not_exist/data/"))
def test_system_status_classifier_ok(self):
"""
GIVEN:
@@ -203,9 +198,11 @@ class TestSystemStatus(APITestCase):
THEN:
- The response contains an OK classifier status
"""
load_classifier()
test_classifier = DocumentClassifier()
test_classifier.save()
PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK,
status=states.SUCCESS,
task_name=PaperlessTask.TaskName.TRAIN_CLASSIFIER,
)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
@@ -215,73 +212,101 @@ class TestSystemStatus(APITestCase):
def test_system_status_classifier_warning(self):
"""
GIVEN:
- The classifier does not exist yet
- > 0 documents and tags with auto matching exist
- No classifier task is found
WHEN:
- The user requests the system status
THEN:
- The response contains an WARNING classifier status
- The response contains a WARNING classifier status
"""
with override_settings(MODEL_FILE=Path("does_not_exist")):
Document.objects.create(
title="Test Document",
)
Tag.objects.create(name="Test Tag", matching_algorithm=Tag.MATCH_AUTO)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["tasks"]["classifier_status"], "WARNING")
self.assertIsNotNone(response.data["tasks"]["classifier_error"])
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data["tasks"]["classifier_status"],
"WARNING",
)
@mock.patch(
"documents.classifier.load_classifier",
side_effect=ClassifierModelCorruptError(),
)
def test_system_status_classifier_error(self, mock_load_classifier):
def test_system_status_classifier_error(self):
"""
GIVEN:
- The classifier does exist but is corrupt
- > 0 documents and tags with auto matching exist
- An error occurred while loading the classifier
WHEN:
- The user requests the system status
THEN:
- The response contains an ERROR classifier status
"""
with (
tempfile.NamedTemporaryFile(
dir="/tmp",
delete=False,
) as does_exist,
override_settings(MODEL_FILE=Path(does_exist.name)),
):
Document.objects.create(
title="Test Document",
)
Tag.objects.create(
name="Test Tag",
matching_algorithm=Tag.MATCH_AUTO,
)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data["tasks"]["classifier_status"],
"ERROR",
)
self.assertIsNotNone(response.data["tasks"]["classifier_error"])
PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK,
status=states.FAILURE,
task_name=PaperlessTask.TaskName.TRAIN_CLASSIFIER,
result="Classifier training failed",
)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data["tasks"]["classifier_status"],
"ERROR",
)
self.assertIsNotNone(response.data["tasks"]["classifier_error"])
def test_system_status_classifier_ok_no_objects(self):
def test_system_status_sanity_check_ok(self):
"""
GIVEN:
- The classifier does not exist (and should not)
- No documents nor objects with auto matching exist
- The sanity check is successful
WHEN:
- The user requests the system status
THEN:
- The response contains an OK classifier status
- The response contains an OK sanity check status
"""
with override_settings(MODEL_FILE=Path("does_not_exist")):
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["tasks"]["classifier_status"], "OK")
PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK,
status=states.SUCCESS,
task_name=PaperlessTask.TaskName.CHECK_SANITY,
)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["tasks"]["sanity_check_status"], "OK")
self.assertIsNone(response.data["tasks"]["sanity_check_error"])
def test_system_status_sanity_check_warning(self):
"""
GIVEN:
- No sanity check task is found
WHEN:
- The user requests the system status
THEN:
- The response contains a WARNING sanity check status
"""
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data["tasks"]["sanity_check_status"],
"WARNING",
)
def test_system_status_sanity_check_error(self):
"""
GIVEN:
- The sanity check failed
WHEN:
- The user requests the system status
THEN:
- The response contains an ERROR sanity check status
"""
PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK,
status=states.FAILURE,
task_name=PaperlessTask.TaskName.CHECK_SANITY,
result="5 issues found.",
)
self.client.force_login(self.user)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data["tasks"]["sanity_check_status"],
"ERROR",
)
self.assertIsNotNone(response.data["tasks"]["sanity_check_error"])

View File

@@ -1,4 +1,5 @@
import uuid
from unittest import mock
import celery
from django.contrib.auth.models import Permission
@@ -8,6 +9,7 @@ from rest_framework.test import APITestCase
from documents.models import PaperlessTask
from documents.tests.utils import DirectoriesMixin
from documents.views import TasksViewSet
class TestTasks(DirectoriesMixin, APITestCase):
@@ -130,7 +132,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
response = self.client.get(self.ENDPOINT)
response = self.client.get(self.ENDPOINT + "?acknowledged=false")
self.assertEqual(len(response.data), 0)
def test_tasks_owner_aware(self):
@@ -246,7 +248,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="test.pdf",
task_name="documents.tasks.some_task",
task_name=PaperlessTask.TaskName.CONSUME_FILE,
status=celery.states.SUCCESS,
)
@@ -272,7 +274,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="anothertest.pdf",
task_name="documents.tasks.some_task",
task_name=PaperlessTask.TaskName.CONSUME_FILE,
status=celery.states.SUCCESS,
)
@@ -309,3 +311,62 @@ class TestTasks(DirectoriesMixin, APITestCase):
returned_data = response.data[0]
self.assertEqual(returned_data["related_document"], "1234")
def test_run_train_classifier_task(self):
"""
GIVEN:
- A superuser
WHEN:
- API call is made to run the train classifier task
THEN:
- The task is run
"""
mock_train_classifier = mock.Mock(return_value="Task started")
TasksViewSet.TASK_AND_ARGS_BY_NAME = {
PaperlessTask.TaskName.TRAIN_CLASSIFIER: (
mock_train_classifier,
{"scheduled": False},
),
}
response = self.client.post(
self.ENDPOINT + "run/",
{"task_name": PaperlessTask.TaskName.TRAIN_CLASSIFIER},
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, {"result": "Task started"})
mock_train_classifier.assert_called_once_with(scheduled=False)
# mock error
mock_train_classifier.reset_mock()
mock_train_classifier.side_effect = Exception("Error")
response = self.client.post(
self.ENDPOINT + "run/",
{"task_name": PaperlessTask.TaskName.TRAIN_CLASSIFIER},
)
self.assertEqual(response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR)
mock_train_classifier.assert_called_once_with(scheduled=False)
@mock.patch("documents.tasks.sanity_check")
def test_run_task_requires_superuser(self, mock_check_sanity):
"""
GIVEN:
- A regular user
WHEN:
- API call is made to run a task
THEN:
- The task is not run
"""
regular_user = User.objects.create_user(username="test")
regular_user.user_permissions.add(*Permission.objects.all())
self.client.logout()
self.client.force_authenticate(user=regular_user)
response = self.client.post(
self.ENDPOINT + "run/",
{"task_name": PaperlessTask.TaskName.CHECK_SANITY},
)
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
mock_check_sanity.assert_not_called()

View File

@@ -68,7 +68,7 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
self.assertIsNotNone(task)
self.assertEqual(headers["id"], task.task_id)
self.assertEqual("hello-999.pdf", task.task_file_name)
self.assertEqual("documents.tasks.consume_file", task.task_name)
self.assertEqual(PaperlessTask.TaskName.CONSUME_FILE, task.task_name)
self.assertEqual(1, task.owner_id)
self.assertEqual(celery.states.PENDING, task.status)

View File

@@ -118,6 +118,19 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error_no_raise(self, m):
messages = SanityCheckMessages()
messages.error(None, "Some error")
m.return_value = messages
# No exception should be raised
result = tasks.sanity_check(raise_on_error=False)
self.assertEqual(
result,
"Sanity check exited with errors. See log.",
)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()

View File

@@ -14,6 +14,7 @@ from urllib.parse import urlparse
import httpx
import pathvalidate
from celery import states
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import User
@@ -103,6 +104,7 @@ from documents.filters import DocumentsOrderingFilter
from documents.filters import DocumentTypeFilterSet
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.filters import ObjectOwnedPermissionsFilter
from documents.filters import PaperlessTaskFilterSet
from documents.filters import ShareLinkFilterSet
from documents.filters import StoragePathFilterSet
from documents.filters import TagFilterSet
@@ -144,6 +146,7 @@ from documents.serialisers import DocumentListSerializer
from documents.serialisers import DocumentSerializer
from documents.serialisers import DocumentTypeSerializer
from documents.serialisers import PostDocumentSerializer
from documents.serialisers import RunTaskViewSerializer
from documents.serialisers import SavedViewSerializer
from documents.serialisers import SearchResultSerializer
from documents.serialisers import ShareLinkSerializer
@@ -160,6 +163,9 @@ from documents.serialisers import WorkflowTriggerSerializer
from documents.signals import document_updated
from documents.tasks import consume_file
from documents.tasks import empty_trash
from documents.tasks import index_optimize
from documents.tasks import sanity_check
from documents.tasks import train_classifier
from documents.templating.filepath import validate_filepath_template_and_render
from paperless import version
from paperless.celery import app as celery_app
@@ -2276,16 +2282,27 @@ class RemoteVersionView(GenericAPIView):
class TasksViewSet(ReadOnlyModelViewSet):
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
serializer_class = TasksViewSerializer
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
filter_backends = (
DjangoFilterBackend,
OrderingFilter,
ObjectOwnedOrGrantedPermissionsFilter,
)
filterset_class = PaperlessTaskFilterSet
TASK_AND_ARGS_BY_NAME = {
PaperlessTask.TaskName.INDEX_OPTIMIZE: (index_optimize, {}),
PaperlessTask.TaskName.TRAIN_CLASSIFIER: (
train_classifier,
{"scheduled": False},
),
PaperlessTask.TaskName.CHECK_SANITY: (
sanity_check,
{"scheduled": False, "raise_on_error": False},
),
}
def get_queryset(self):
queryset = (
PaperlessTask.objects.filter(
acknowledged=False,
)
.order_by("date_created")
.reverse()
)
queryset = PaperlessTask.objects.all().order_by("-date_created")
task_id = self.request.query_params.get("task_id")
if task_id is not None:
queryset = PaperlessTask.objects.filter(task_id=task_id)
@@ -2308,6 +2325,25 @@ class TasksViewSet(ReadOnlyModelViewSet):
except Exception:
return HttpResponseBadRequest()
@action(methods=["post"], detail=False)
def run(self, request):
serializer = RunTaskViewSerializer(data=request.data)
serializer.is_valid(raise_exception=True)
task_name = serializer.validated_data.get("task_name")
if not request.user.is_superuser:
return HttpResponseForbidden("Insufficient permissions")
try:
task_func, task_args = self.TASK_AND_ARGS_BY_NAME[task_name]
result = task_func(**task_args)
return Response({"result": result})
except Exception as e:
logger.warning(f"An error occurred running task: {e!s}")
return HttpResponseServerError(
"Error running task, check logs for more detail.",
)
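
The new run action lets a superuser trigger index optimization, classifier training or a sanity check on demand; the task functions are called synchronously with the keyword arguments from TASK_AND_ARGS_BY_NAME. A minimal sketch of calling it over HTTP; the URL and token are placeholders:

import requests

resp = requests.post(
    "http://localhost:8000/api/tasks/run/",
    json={"task_name": "check_sanity"},
    headers={"Authorization": "Token <superuser-api-token>"},
)
# 200 with {"result": "..."} on success, 403 for non-superusers,
# 500 with a generic message if the task raises.
print(resp.status_code, resp.text)
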
class ShareLinkViewSet(ModelViewSet, PassUserMixin):
model = ShareLink
@@ -2614,6 +2650,14 @@ class CustomFieldViewSet(ModelViewSet):
"last_trained": serializers.DateTimeField(),
},
),
"sanity_check": inline_serializer(
name="SanityCheck",
fields={
"status": serializers.CharField(),
"error": serializers.CharField(),
"last_run": serializers.DateTimeField(),
},
),
},
),
},
@@ -2674,13 +2718,20 @@ class SystemStatusView(PassUserMixin):
)
redis_error = "Error connecting to redis, check logs for more detail."
celery_error = None
celery_url = None
try:
celery_ping = celery_app.control.inspect().ping()
first_worker_ping = celery_ping[next(iter(celery_ping.keys()))]
celery_url = next(iter(celery_ping.keys()))
first_worker_ping = celery_ping[celery_url]
if first_worker_ping["ok"] == "pong":
celery_active = "OK"
except Exception:
except Exception as e:
celery_active = "ERROR"
logger.exception(
f"System status detected a possible problem while connecting to celery: {e}",
)
celery_error = "Error connecting to celery, check logs for more detail."
index_error = None
try:
@@ -2697,55 +2748,43 @@ class SystemStatusView(PassUserMixin):
)
index_last_modified = None
last_trained_task = (
PaperlessTask.objects.filter(
task_name=PaperlessTask.TaskName.TRAIN_CLASSIFIER,
)
.order_by("-date_done")
.first()
)
classifier_status = "OK"
classifier_error = None
classifier_status = None
try:
classifier = load_classifier(raise_exception=True)
if classifier is None:
# Make sure classifier should exist
docs_queryset = Document.objects.exclude(
tags__is_inbox_tag=True,
)
if (
docs_queryset.count() > 0
and (
Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
or DocumentType.objects.filter(
matching_algorithm=Tag.MATCH_AUTO,
).exists()
or Correspondent.objects.filter(
matching_algorithm=Tag.MATCH_AUTO,
).exists()
or StoragePath.objects.filter(
matching_algorithm=Tag.MATCH_AUTO,
).exists()
)
and not settings.MODEL_FILE.exists()
):
# if classifier file doesn't exist just classify as a warning
classifier_error = "Classifier file does not exist (yet). Re-training may be pending."
classifier_status = "WARNING"
raise FileNotFoundError(classifier_error)
classifier_status = "OK"
classifier_last_trained = (
make_aware(
datetime.fromtimestamp(classifier.get_last_checked()),
)
if settings.MODEL_FILE.exists()
and classifier.get_last_checked() is not None
else None
)
except Exception as e:
if classifier_status is None:
classifier_status = "ERROR"
classifier_last_trained = None
if classifier_error is None:
classifier_error = (
"Unable to load classifier, check logs for more detail."
)
logger.exception(
f"System status detected a possible problem while loading the classifier: {e}",
if last_trained_task is None:
classifier_status = "WARNING"
classifier_error = "No classifier training tasks found"
elif last_trained_task and last_trained_task.status == states.FAILURE:
classifier_status = "ERROR"
classifier_error = last_trained_task.result
classifier_last_trained = (
last_trained_task.date_done if last_trained_task else None
)
last_sanity_check = (
PaperlessTask.objects.filter(
task_name=PaperlessTask.TaskName.CHECK_SANITY,
)
.order_by("-date_done")
.first()
)
sanity_check_status = "OK"
sanity_check_error = None
if last_sanity_check is None:
sanity_check_status = "WARNING"
sanity_check_error = "No sanity check tasks found"
elif last_sanity_check and last_sanity_check.status == states.FAILURE:
sanity_check_status = "ERROR"
sanity_check_error = last_sanity_check.result
sanity_check_last_run = (
last_sanity_check.date_done if last_sanity_check else None
)
return Response(
{
@@ -2773,12 +2812,17 @@ class SystemStatusView(PassUserMixin):
"redis_status": redis_status,
"redis_error": redis_error,
"celery_status": celery_active,
"celery_url": celery_url,
"celery_error": celery_error,
"index_status": index_status,
"index_last_modified": index_last_modified,
"index_error": index_error,
"classifier_status": classifier_status,
"classifier_last_trained": classifier_last_trained,
"classifier_error": classifier_error,
"sanity_check_status": sanity_check_status,
"sanity_check_last_run": sanity_check_last_run,
"sanity_check_error": sanity_check_error,
},
},
)
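
For reference, the status response now derives the classifier block from the most recent train_classifier task and adds a parallel sanity_check block from the most recent check_sanity task. A sketch of the new and changed keys under "tasks", with illustrative values only:

# Illustrative values; statuses are "OK", "WARNING" (no task recorded yet) or "ERROR" (last task failed).
status_tasks_excerpt = {
    "celery_url": "celery@worker-1",
    "classifier_status": "OK",
    "classifier_last_trained": "2025-02-26T22:00:00Z",
    "classifier_error": None,
    "sanity_check_status": "WARNING",
    "sanity_check_last_run": None,
    "sanity_check_error": "No sanity check tasks found",
}
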