Add a task scheduler for recurring tasks

2025-12-22 01:55:49 -06:00 · 2020-11-09 20:29:02 +01:00
parent 04f5438ce3
commit d3e7c8ff4e
15 changed files with 243 additions and 192 deletions
--- a/src/documents/management/commands/document_create_classifier.py
+++ b/src/documents/management/commands/document_create_classifier.py
@@ -1,10 +1,6 @@
-import logging
-
 from django.core.management.base import BaseCommand
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
-from paperless import settings
 from ...mixins import Renderable
+from ...tasks import train_classifier


 class Command(Renderable, BaseCommand):
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
-        classifier = DocumentClassifier()
-
-        try:
-            # load the classifier, since we might not have to train it again.
-            classifier.reload()
-        except (FileNotFoundError, IncompatibleClassifierVersionError):
-            # This is what we're going to fix here.
-            pass
-
-        try:
-            if classifier.train():
-                logging.getLogger(__name__).info(
-                    "Saving updated classifier model to {}...".format(settings.MODEL_FILE)
-                )
-                classifier.save_classifier()
-            else:
-                logging.getLogger(__name__).debug(
-                    "Training data unchanged."
-                )
-
-        except Exception as e:
-            logging.getLogger(__name__).error(
-                "Classifier error: " + str(e)
-            )
+        train_classifier()
--- a/src/documents/management/commands/document_index.py
+++ b/src/documents/management/commands/document_index.py
@@ -1,9 +1,7 @@
 from django.core.management import BaseCommand
-from whoosh.writing import AsyncWriter

-import documents.index as index
 from documents.mixins import Renderable
-from documents.models import Document
+from documents.tasks import index_reindex, index_optimize


 class Command(Renderable, BaseCommand):
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
        self.verbosity = options["verbosity"]

        if options['command'] == 'reindex':
-            documents = Document.objects.all()
-
-            ix = index.open_index(recreate=True)
-
-            with AsyncWriter(ix) as writer:
-                for document in documents:
-                    index.update_document(writer, document)
-
+            index_reindex()
        elif options['command'] == 'optimize':
-            index.open_index().optimize()
+            index_optimize()
--- a/src/documents/management/commands/document_rerun_ocr.py
+++ b/src/documents/management/commands/document_rerun_ocr.py
@@ -1,60 +0,0 @@
-import argparse
-import threading
-from multiprocessing import Pool
-from multiprocessing.pool import ThreadPool
-
-from django.core.management.base import BaseCommand
-
-from documents.consumer import Consumer
-from documents.models import Log, Document
-from documents.parsers import get_parser_class
-
-
-def process_document(doc):
-    parser_class = get_parser_class(doc.file_name)
-    if not parser_class:
-        print("no parser available")
-    else:
-        print("Parser: {}".format(parser_class.__name__))
-        parser = parser_class(doc.source_path, None)
-        try:
-            text = parser.get_text()
-            doc.content = text
-            doc.save()
-        finally:
-            parser.cleanup()
-
-
-def document_index(value):
-    ivalue = int(value)
-    if not (1 <= ivalue <= Document.objects.count()):
-        raise argparse.ArgumentTypeError(
-            "{} is not a valid document index (out of range)".format(value))
-
-    return ivalue
-
-
-class Command(BaseCommand):
-
-    help = "Performs OCR on all documents again!"
-
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "-s", "--start_index",
-            default=None,
-            type=document_index
-        )
-
-    def handle(self, *args, **options):
-
-        docs = Document.objects.all().order_by("added")
-
-        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
-
-        for i in indices:
-            doc = docs[i]
-            print("==================================")
-            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
-            print("==================================")
-            process_document(doc)