Create paperlesstasks for sanity, classifier

[ci skip]
This commit is contained in:
shamoon 2025-02-13 17:46:05 -08:00
parent de5f66b3a0
commit f897447a65
12 changed files with 120 additions and 17 deletions

View File

@ -33,7 +33,7 @@ describe('TasksService', () => {
it('calls tasks api endpoint on reload', () => { it('calls tasks api endpoint on reload', () => {
tasksService.reload() tasksService.reload()
const req = httpTestingController.expectOne( const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}tasks/` `${environment.apiBaseUrl}tasks/?type=file`
) )
expect(req.request.method).toEqual('GET') expect(req.request.method).toEqual('GET')
}) })
@ -41,7 +41,9 @@ describe('TasksService', () => {
it('does not call tasks api endpoint on reload if already loading', () => { it('does not call tasks api endpoint on reload if already loading', () => {
tasksService.loading = true tasksService.loading = true
tasksService.reload() tasksService.reload()
httpTestingController.expectNone(`${environment.apiBaseUrl}tasks/`) httpTestingController.expectNone(
`${environment.apiBaseUrl}tasks/?type=file`
)
}) })
it('calls acknowledge_tasks api endpoint on dismiss and reloads', () => { it('calls acknowledge_tasks api endpoint on dismiss and reloads', () => {
@ -55,7 +57,9 @@ describe('TasksService', () => {
}) })
req.flush([]) req.flush([])
// reload is then called // reload is then called
httpTestingController.expectOne(`${environment.apiBaseUrl}tasks/`).flush([]) httpTestingController
.expectOne(`${environment.apiBaseUrl}tasks/?type=file`)
.flush([])
}) })
it('sorts tasks returned from api', () => { it('sorts tasks returned from api', () => {
@ -106,7 +110,7 @@ describe('TasksService', () => {
tasksService.reload() tasksService.reload()
const req = httpTestingController.expectOne( const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}tasks/` `${environment.apiBaseUrl}tasks/?type=file`
) )
req.flush(mockTasks) req.flush(mockTasks)

View File

@ -54,7 +54,7 @@ export class TasksService {
this.loading = true this.loading = true
this.http this.http
.get<PaperlessTask[]>(`${this.baseUrl}tasks/`) .get<PaperlessTask[]>(`${this.baseUrl}tasks/?type=file`)
.pipe(takeUntil(this.unsubscribeNotifer), first()) .pipe(takeUntil(this.unsubscribeNotifer), first())
.subscribe((r) => { .subscribe((r) => {
this.fileTasks = r.filter((t) => t.type == PaperlessTaskType.File) // they're all File tasks, for now this.fileTasks = r.filter((t) => t.type == PaperlessTaskType.File) // they're all File tasks, for now

View File

@ -35,6 +35,7 @@ from documents.models import CustomFieldInstance
from documents.models import Document from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
from documents.models import Log from documents.models import Log
from documents.models import PaperlessTask
from documents.models import ShareLink from documents.models import ShareLink
from documents.models import StoragePath from documents.models import StoragePath
from documents.models import Tag from documents.models import Tag
@ -770,6 +771,15 @@ class ShareLinkFilterSet(FilterSet):
} }
class PaperlessTaskFilterSet(FilterSet):
    """
    Filter set for PaperlessTask which allows exact-match filtering on the
    task's "type" and "status" fields, e.g. ``tasks/?type=file``.
    """

    class Meta:
        model = PaperlessTask
        # Field name -> list of lookups exposed for that field
        fields = {
            "type": ["exact"],
            "status": ["exact"],
        }
class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter): class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter):
""" """
A filter backend that limits results to those where the requesting user A filter backend that limits results to those where the requesting user

View File

@ -10,4 +10,4 @@ class Command(BaseCommand):
) )
def handle(self, *args, **options): def handle(self, *args, **options):
train_classifier() train_classifier(scheduled=False)

View File

@ -12,6 +12,6 @@ class Command(ProgressBarMixin, BaseCommand):
def handle(self, *args, **options): def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options) self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar) messages = check_sanity(progress=self.use_progress_bar, scheduled=False)
messages.log_messages() messages.log_messages()

View File

@ -0,0 +1,28 @@
# Generated by Django 5.1.6 on 2025-02-14 01:11
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """
    Adds the "type" column to the paperlesstask model.

    The column is a CharField (max_length 30) restricted to the choices
    "file", "scheduled_task" and "manual_task", with "file" as the default.
    """

    # Must run after the previous migration of the documents app
    dependencies = [
        ("documents", "1062_alter_savedviewfilterrule_rule_type"),
    ]

    operations = [
        migrations.AddField(
            model_name="paperlesstask",
            name="type",
            field=models.CharField(
                # (stored value, human readable label) pairs
                choices=[
                    ("file", "File Task"),
                    ("scheduled_task", "Scheduled Task"),
                    ("manual_task", "Manual Task"),
                ],
                default="file",
                help_text="The type of task that was run",
                max_length=30,
                verbose_name="Task Type",
            ),
        ),
    ]

View File

@ -650,6 +650,11 @@ class PaperlessTask(ModelWithOwner):
ALL_STATES = sorted(states.ALL_STATES) ALL_STATES = sorted(states.ALL_STATES)
TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES)) TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES))
class TaskType(models.TextChoices):
    """
    The kinds of task a PaperlessTask can record, stored in its "type" field.

    Each member is a (stored value, translated label) pair.
    """

    # Used as the default of the "type" field
    FILE = ("file", _("File Task"))
    # Set by check_sanity / train_classifier when called with scheduled=True
    SCHEDULED_TASK = ("scheduled_task", _("Scheduled Task"))
    # Set by check_sanity / train_classifier when called with scheduled=False
    MANUAL_TASK = ("manual_task", _("Manual Task"))
task_id = models.CharField( task_id = models.CharField(
max_length=255, max_length=255,
unique=True, unique=True,
@ -684,24 +689,28 @@ class PaperlessTask(ModelWithOwner):
verbose_name=_("Task State"), verbose_name=_("Task State"),
help_text=_("Current state of the task being run"), help_text=_("Current state of the task being run"),
) )
date_created = models.DateTimeField( date_created = models.DateTimeField(
null=True, null=True,
default=timezone.now, default=timezone.now,
verbose_name=_("Created DateTime"), verbose_name=_("Created DateTime"),
help_text=_("Datetime field when the task result was created in UTC"), help_text=_("Datetime field when the task result was created in UTC"),
) )
date_started = models.DateTimeField( date_started = models.DateTimeField(
null=True, null=True,
default=None, default=None,
verbose_name=_("Started DateTime"), verbose_name=_("Started DateTime"),
help_text=_("Datetime field when the task was started in UTC"), help_text=_("Datetime field when the task was started in UTC"),
) )
date_done = models.DateTimeField( date_done = models.DateTimeField(
null=True, null=True,
default=None, default=None,
verbose_name=_("Completed DateTime"), verbose_name=_("Completed DateTime"),
help_text=_("Datetime field when the task was completed in UTC"), help_text=_("Datetime field when the task was completed in UTC"),
) )
result = models.TextField( result = models.TextField(
null=True, null=True,
default=None, default=None,
@ -711,6 +720,14 @@ class PaperlessTask(ModelWithOwner):
), ),
) )
type = models.CharField(
max_length=30,
choices=TaskType.choices,
default=TaskType.FILE,
verbose_name=_("Task Type"),
help_text=_("The type of task that was run"),
)
def __str__(self) -> str: def __str__(self) -> str:
return f"Task {self.task_id}" return f"Task {self.task_id}"

View File

@ -1,13 +1,17 @@
import hashlib import hashlib
import logging import logging
import uuid
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
from typing import Final from typing import Final
from celery import states
from django.conf import settings from django.conf import settings
from django.utils import timezone
from tqdm import tqdm from tqdm import tqdm
from documents.models import Document from documents.models import Document
from documents.models import PaperlessTask
class SanityCheckMessages: class SanityCheckMessages:
@ -57,7 +61,17 @@ class SanityCheckFailedException(Exception):
pass pass
def check_sanity(*, progress=False) -> SanityCheckMessages: def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
# Record this sanity check run as a PaperlessTask; the "scheduled" flag
# decides whether it is stored as a scheduled or a manual task
task = PaperlessTask.objects.create(
    task_id=uuid.uuid4(),
    type=(
        PaperlessTask.TaskType.SCHEDULED_TASK
        if scheduled
        else PaperlessTask.TaskType.MANUAL_TASK
    ),
    task_name="check_sanity",
    # TASK_STATE_CHOICES is a plain sorted list of tuples and has no
    # STARTED attribute, the state constant comes from celery.states
    status=states.STARTED,
    date_created=timezone.now(),
    date_started=timezone.now(),
)
messages = SanityCheckMessages() messages = SanityCheckMessages()
present_files = { present_files = {
@ -142,4 +156,8 @@ def check_sanity(*, progress=False) -> SanityCheckMessages:
for extra_file in present_files: for extra_file in present_files:
messages.warning(None, f"Orphaned file in media dir: {extra_file}") messages.warning(None, f"Orphaned file in media dir: {extra_file}")
# celery.states has no FAILED constant, the failure state is FAILURE
task.status = states.SUCCESS if not messages.has_error else states.FAILURE
# result is concatenated messages
# NOTE(review): relies on SanityCheckMessages providing a useful __str__ - confirm
task.result = str(messages)
task.date_done = timezone.now()
# Persist the outcome, otherwise the stored task never leaves the STARTED state
task.save(update_fields=["status", "result", "date_done"])
return messages return messages

View File

@ -1700,12 +1700,6 @@ class TasksViewSerializer(OwnedObjectSerializer):
"owner", "owner",
) )
type = serializers.SerializerMethodField()
def get_type(self, obj) -> str:
# just file tasks, for now
return "file"
related_document = serializers.SerializerMethodField() related_document = serializers.SerializerMethodField()
created_doc_re = re.compile(r"New document id (\d+) created") created_doc_re = re.compile(r"New document id (\d+) created")
duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)") duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")

View File

@ -1221,6 +1221,7 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):
user_id = overrides.owner_id if overrides else None user_id = overrides.owner_id if overrides else None
PaperlessTask.objects.create( PaperlessTask.objects.create(
type=PaperlessTask.TaskType.FILE,
task_id=headers["id"], task_id=headers["id"],
status=states.PENDING, status=states.PENDING,
task_file_name=task_file_name, task_file_name=task_file_name,

View File

@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
import tqdm import tqdm
from celery import Task from celery import Task
from celery import shared_task from celery import shared_task
from celery import states
from django.conf import settings from django.conf import settings
from django.contrib.contenttypes.models import ContentType from django.contrib.contenttypes.models import ContentType
from django.db import models from django.db import models
@ -35,6 +36,7 @@ from documents.models import Correspondent
from documents.models import CustomFieldInstance from documents.models import CustomFieldInstance
from documents.models import Document from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import StoragePath from documents.models import StoragePath
from documents.models import Tag from documents.models import Tag
from documents.models import Workflow from documents.models import Workflow
@ -74,19 +76,34 @@ def index_reindex(*, progress_bar_disable=False):
@shared_task @shared_task
def train_classifier(): def train_classifier(*, scheduled=True):
task = PaperlessTask.objects.create(
type=PaperlessTask.TaskType.SCHEDULED_TASK
if scheduled
else PaperlessTask.TaskType.MANUAL_TASK,
task_id=uuid.uuid4(),
task_name="train_classifier",
status=states.STARTED,
date_created=timezone.now(),
date_started=timezone.now(),
)
if ( if (
not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
): ):
logger.info("No automatic matching items, not training") result = "No automatic matching items, not training"
logger.info(result)
# Special case, items were once auto and trained, so remove the model # Special case, items were once auto and trained, so remove the model
# and prevent its use again # and prevent its use again
if settings.MODEL_FILE.exists(): if settings.MODEL_FILE.exists():
logger.info(f"Removing {settings.MODEL_FILE} so it won't be used") logger.info(f"Removing {settings.MODEL_FILE} so it won't be used")
settings.MODEL_FILE.unlink() settings.MODEL_FILE.unlink()
task.status = states.SUCCESS
task.result = result
task.date_done = timezone.now()
task.save()
return return
classifier = load_classifier() classifier = load_classifier()
@ -100,11 +117,19 @@ def train_classifier():
f"Saving updated classifier model to {settings.MODEL_FILE}...", f"Saving updated classifier model to {settings.MODEL_FILE}...",
) )
classifier.save() classifier.save()
task.status = states.SUCCESS
task.result = "Training completed successfully"
else: else:
logger.debug("Training data unchanged.") logger.debug("Training data unchanged.")
task.status = states.SUCCESS
task.result = "Training data unchanged"
# Stamp the completion time as well, matching the early-return path which
# sets date_done before saving
task.date_done = timezone.now()
task.save(update_fields=["status", "result", "date_done"])
except Exception as e: except Exception as e:
logger.warning("Classifier error: " + str(e)) logger.warning("Classifier error: " + str(e))
# celery.states defines FAILURE (there is no FAILED constant); using the
# wrong name would raise AttributeError from inside this handler
task.status = states.FAILURE
task.result = str(e)
task.date_done = timezone.now()
# Persist the failure, nothing after this handler saves the task
task.save(update_fields=["status", "result", "date_done"])
@shared_task(bind=True) @shared_task(bind=True)

View File

@ -103,6 +103,7 @@ from documents.filters import DocumentsOrderingFilter
from documents.filters import DocumentTypeFilterSet from documents.filters import DocumentTypeFilterSet
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.filters import ObjectOwnedPermissionsFilter from documents.filters import ObjectOwnedPermissionsFilter
from documents.filters import PaperlessTaskFilterSet
from documents.filters import ShareLinkFilterSet from documents.filters import ShareLinkFilterSet
from documents.filters import StoragePathFilterSet from documents.filters import StoragePathFilterSet
from documents.filters import TagFilterSet from documents.filters import TagFilterSet
@ -2223,7 +2224,12 @@ class RemoteVersionView(GenericAPIView):
class TasksViewSet(ReadOnlyModelViewSet): class TasksViewSet(ReadOnlyModelViewSet):
permission_classes = (IsAuthenticated, PaperlessObjectPermissions) permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
serializer_class = TasksViewSerializer serializer_class = TasksViewSerializer
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,) filter_backends = (
DjangoFilterBackend,
OrderingFilter,
ObjectOwnedOrGrantedPermissionsFilter,
)
filterset_class = PaperlessTaskFilterSet
def get_queryset(self): def get_queryset(self):
queryset = ( queryset = (