added a task scheduler for recurring tasks

This commit is contained in:
Jonas Winkler
2020-11-09 20:29:02 +01:00
parent 04f5438ce3
commit d3e7c8ff4e
15 changed files with 243 additions and 192 deletions

View File

@@ -1,10 +1,6 @@
import logging
from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from paperless import settings
from ...mixins import Renderable
from ...tasks import train_classifier
class Command(Renderable, BaseCommand):
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
classifier = DocumentClassifier()
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# No usable classifier could be loaded (missing file or incompatible version); the training step below creates a fresh one.
pass
try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
train_classifier()

View File

@@ -1,9 +1,7 @@
from django.core.management import BaseCommand
from whoosh.writing import AsyncWriter
import documents.index as index
from documents.mixins import Renderable
from documents.models import Document
from documents.tasks import index_reindex, index_optimize
class Command(Renderable, BaseCommand):
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
self.verbosity = options["verbosity"]
if options['command'] == 'reindex':
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
index_reindex()
elif options['command'] == 'optimize':
index.open_index().optimize()
index_optimize()

View File

@@ -1,60 +0,0 @@
import argparse
import threading
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from django.core.management.base import BaseCommand
from documents.consumer import Consumer
from documents.models import Log, Document
from documents.parsers import get_parser_class
def process_document(doc):
    """Re-extract the text of ``doc`` with its parser and save the result.

    A parser class is looked up for ``doc.file_name``. If none is found, a
    notice is printed and the document is left untouched. Otherwise a parser
    is created for ``doc.source_path``, its text is stored in ``doc.content``
    and the document is saved. The parser's ``cleanup()`` is called even when
    extraction or saving raises.
    """
    parser_cls = get_parser_class(doc.file_name)
    if not parser_cls:
        print("no parser available")
        return

    print("Parser: {}".format(parser_cls.__name__))
    document_parser = parser_cls(doc.source_path, None)
    try:
        doc.content = document_parser.get_text()
        doc.save()
    finally:
        # Always release whatever the parser holds, success or failure.
        document_parser.cleanup()
def document_index(value):
    """Parse ``value`` as a 1-based document index (argparse ``type`` hook).

    The value is converted with ``int()`` and must lie between 1 and the
    current number of ``Document`` rows, inclusive.

    Raises:
        argparse.ArgumentTypeError: if the number is outside that range.
        ValueError: propagated from ``int()`` for non-numeric input.
    """
    index = int(value)
    # Short-circuit keeps the count query from running for values below 1.
    if index < 1 or index > Document.objects.count():
        message = "{} is not a valid document index (out of range)".format(value)
        raise argparse.ArgumentTypeError(message)
    return index
class Command(BaseCommand):
    """Management command that re-runs text extraction on stored documents."""

    help = "Performs OCR on all documents again!"

    def add_arguments(self, parser):
        """Register the optional ``-s``/``--start_index`` argument."""
        parser.add_argument(
            "-s",
            "--start_index",
            type=document_index,
            default=None,
        )

    def handle(self, *args, **options):
        """Process documents ordered by ``added``, beginning at the start index.

        When ``start_index`` is given (1-based), processing begins at that
        document; otherwise every document is processed. A banner with the
        running position is printed before each document.
        """
        docs = Document.objects.all().order_by("added")

        start_index = options['start_index']
        total = len(docs)
        # Convert the 1-based CLI index into a 0-based offset.
        first = start_index - 1 if start_index else 0

        separator = "=================================="
        for position in range(first, total):
            doc = docs[position]
            print(separator)
            print("{} out of {}: {}".format(position + 1, total, doc.file_name))
            print(separator)
            process_document(doc)