added a task scheduler for recurring tasks

This commit is contained in:
Jonas Winkler
2020-11-09 20:29:02 +01:00
parent 04f5438ce3
commit d3e7c8ff4e
15 changed files with 243 additions and 192 deletions

View File

@@ -1,10 +1,6 @@
import logging
from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from paperless import settings
from ...mixins import Renderable
from ...tasks import train_classifier
class Command(Renderable, BaseCommand):
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
classifier = DocumentClassifier()
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
pass
try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
train_classifier()

View File

@@ -1,9 +1,7 @@
from django.core.management import BaseCommand
from whoosh.writing import AsyncWriter
import documents.index as index
from documents.mixins import Renderable
from documents.models import Document
from documents.tasks import index_reindex, index_optimize
class Command(Renderable, BaseCommand):
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
self.verbosity = options["verbosity"]
if options['command'] == 'reindex':
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
index_reindex()
elif options['command'] == 'optimize':
index.open_index().optimize()
index_optimize()

View File

@@ -1,60 +0,0 @@
import argparse
import threading
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from django.core.management.base import BaseCommand
from documents.consumer import Consumer
from documents.models import Log, Document
from documents.parsers import get_parser_class
def process_document(doc):
parser_class = get_parser_class(doc.file_name)
if not parser_class:
print("no parser available")
else:
print("Parser: {}".format(parser_class.__name__))
parser = parser_class(doc.source_path, None)
try:
text = parser.get_text()
doc.content = text
doc.save()
finally:
parser.cleanup()
def document_index(value):
ivalue = int(value)
if not (1 <= ivalue <= Document.objects.count()):
raise argparse.ArgumentTypeError(
"{} is not a valid document index (out of range)".format(value))
return ivalue
class Command(BaseCommand):
help = "Performs OCR on all documents again!"
def add_arguments(self, parser):
parser.add_argument(
"-s", "--start_index",
default=None,
type=document_index
)
def handle(self, *args, **options):
docs = Document.objects.all().order_by("added")
indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
for i in indices:
doc = docs[i]
print("==================================")
print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
print("==================================")
process_document(doc)

View File

@@ -0,0 +1,28 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
def remove_schedules(apps, schema_editor):
Schedule.objects.all().delete()
class Migration(migrations.Migration):
dependencies = [
('documents', '1000_update_paperless_all'),
('django_q', '0013_task_attempt_count'),
]
operations = [
RunPython(add_schedules, remove_schedules)
]

57
src/documents/tasks.py Normal file
View File

@@ -0,0 +1,57 @@
import logging
from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter
from documents import index
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.models import Document
def consume_mail():
MailFetcher().pull()
def index_optimize():
index.open_index().optimize()
def index_reindex():
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
def train_classifier():
classifier = DocumentClassifier()
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
pass
try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(
settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)

View File

@@ -71,6 +71,8 @@ INSTALLED_APPS = [
"rest_framework",
"django_filters",
"django_q",
]
REST_FRAMEWORK = {
@@ -242,6 +244,16 @@ LOGGING = {
},
}
###############################################################################
# Task queue #
###############################################################################
Q_CLUSTER = {
'name': 'paperless',
'catch_up': False,
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
}
###############################################################################
# Paperless Specific Settings #
###############################################################################