Mirror of https://github.com/paperless-ngx/paperless-ngx.git

	Merge branch 'dev' into feature-websockets-status
Author: jonaswinkler

@@ -26,6 +26,34 @@ def preprocess_content(content):
     return content
 
 
+def load_classifier():
+    if not os.path.isfile(settings.MODEL_FILE):
+        logger.debug(
+            f"Document classification model does not exist (yet), not "
+            f"performing automatic matching."
+        )
+        return None
+
+    try:
+        classifier = DocumentClassifier()
+        classifier.reload()
+    except (EOFError, IncompatibleClassifierVersionError) as e:
+        # there's something wrong with the model file.
+        logger.error(
+            f"Unrecoverable error while loading document "
+            f"classification model: {str(e)}, deleting model file."
+        )
+        os.unlink(settings.MODEL_FILE)
+        classifier = None
+    except OSError as e:
+        logger.error(
+            f"Error while loading document classification model: {str(e)}"
+        )
+        classifier = None
+
+    return classifier
+
+
 class DocumentClassifier(object):
 
     FORMAT_VERSION = 6

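load_classifier() centralizes the error handling that every call site used to
duplicate: it returns a ready-to-use DocumentClassifier, or None when no usable
model exists. A minimal calling sketch (the document variable is hypothetical;
predict_tags() is the classifier method the tests further down exercise):

    from documents.classifier import load_classifier

    classifier = load_classifier()
    if classifier is not None:
        # The model file existed and loaded cleanly.
        tag_pks = classifier.predict_tags(document.content)
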
@@ -14,7 +14,7 @@ from django.utils import timezone
 from filelock import FileLock
 from rest_framework.reverse import reverse
 
-from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from .classifier import load_classifier
 from .file_handling import create_source_path_directory, \
     generate_unique_filename
 from .loggers import LoggingMixin
@@ -262,14 +262,8 @@ class Consumer(LoggingMixin):
         #   reloading the classifier multiple times, since there are multiple
         #   post-consume hooks that all require the classifier.
 
-        try:
-            classifier = DocumentClassifier()
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            self.log(
-                "warning",
-                f"Cannot classify documents: {e}.")
-            classifier = None
+        classifier = load_classifier()
 
         self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
         # now that everything is done, we can start to store the document
         # in the system. This will be a transaction and reasonably fast.

@@ -2,8 +2,7 @@ import logging
 
 from django.core.management.base import BaseCommand
 
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import load_classifier
 from documents.models import Document
 from ...mixins import Renderable
 from ...signals.handlers import set_correspondent, set_document_type, set_tags
@@ -70,13 +69,7 @@ class Command(Renderable, BaseCommand):
             queryset = Document.objects.all()
         documents = queryset.distinct()
 
-        classifier = DocumentClassifier()
-        try:
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning(
-                f"Cannot classify documents: {e}.")
-            classifier = None
+        classifier = load_classifier()
 
         for document in documents:
             logging.getLogger(__name__).info(

@@ -6,10 +6,9 @@ from django.db.models.signals import post_save
 from whoosh.writing import AsyncWriter
 
 from documents import index, sanity_checker
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, load_classifier
 from documents.consumer import Consumer, ConsumerError
-from documents.models import Document
+from documents.models import Document, Tag, DocumentType, Correspondent
 from documents.sanity_checker import SanityFailedError
 
 
@@ -30,13 +29,18 @@ def index_reindex():
 
 
 def train_classifier():
-    classifier = DocumentClassifier()
+    if (not Tag.objects.filter(
+                matching_algorithm=Tag.MATCH_AUTO).exists() and
+        not DocumentType.objects.filter(
+            matching_algorithm=Tag.MATCH_AUTO).exists() and
+        not Correspondent.objects.filter(
+            matching_algorithm=Tag.MATCH_AUTO).exists()):
 
-    try:
-        # load the classifier, since we might not have to train it again.
-        classifier.reload()
-    except (OSError, EOFError, IncompatibleClassifierVersionError):
-        pass
+        return
+
+    classifier = load_classifier()
+
+    if not classifier:
+        classifier = DocumentClassifier()
 
     try:
@@ -52,7 +56,7 @@ def train_classifier():
             )
 
     except Exception as e:
-        logging.getLogger(__name__).error(
+        logging.getLogger(__name__).warning(
            "Classifier error: " + str(e)
         )

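The guard added at the top of train_classifier() skips training when nothing is
configured for automatic matching. The same check written out as a standalone
sketch (the helper name is hypothetical; MATCH_AUTO is shared by all three
matching models, which is why the diff filters DocumentType and Correspondent
with Tag.MATCH_AUTO as well):

    from documents.models import Correspondent, DocumentType, Tag

    def any_auto_matching_objects():
        # True if at least one tag, document type or correspondent is set
        # to automatic matching -- only then is there anything to train on.
        return (
            Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
            or DocumentType.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists()
            or Correspondent.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists()
        )
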
@@ -590,6 +590,10 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(meta['original_metadata']), 0)
         self.assertGreater(len(meta['archive_metadata']), 0)
 
+    def test_get_metadata_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34576/metadata/")
+        self.assertEqual(response.status_code, 404)
+
     def test_get_metadata_no_archive(self):
         doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
 
@@ -605,6 +609,30 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertGreater(len(meta['original_metadata']), 0)
         self.assertIsNone(meta['archive_metadata'])
 
+    def test_get_empty_suggestions(self):
+        doc = Document.objects.create(title="test", mime_type="application/pdf")
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data, {'correspondents': [], 'tags': [], 'document_types': []})
+
+    def test_get_suggestions_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34676/suggestions/")
+        self.assertEqual(response.status_code, 404)
+
+    @mock.patch("documents.views.match_correspondents")
+    @mock.patch("documents.views.match_tags")
+    @mock.patch("documents.views.match_document_types")
+    def test_get_suggestions(self, match_document_types, match_tags, match_correspondents):
+        doc = Document.objects.create(title="test", mime_type="application/pdf", content="this is an invoice!")
+        match_tags.return_value = [Tag(id=56), Tag(id=123)]
+        match_document_types.return_value = [DocumentType(id=23)]
+        match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+        self.assertEqual(response.data, {'correspondents': [88,2], 'tags': [56,123], 'document_types': [23]})
+
     def test_saved_views(self):
         u1 = User.objects.create_user("user1")
         u2 = User.objects.create_user("user2")

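The new suggestions tests also pin down the response shape: plain id lists under
"correspondents", "tags" and "document_types". A hypothetical client call
against a running instance (session and base_url are illustrative assumptions,
not part of this commit):

    resp = session.get(f"{base_url}/api/documents/{doc_pk}/suggestions/")
    resp.raise_for_status()
    suggestions = resp.json()
    # e.g. {"correspondents": [88, 2], "tags": [56, 123], "document_types": [23]}
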
@@ -1,10 +1,13 @@
 import os
 import tempfile
+from pathlib import Path
 from time import sleep
+from unittest import mock
 
 from django.conf import settings
 from django.test import TestCase, override_settings
 
-from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
 from documents.models import Correspondent, Document, Tag, DocumentType
 from documents.tests.utils import DirectoriesMixin
 
@@ -235,3 +238,30 @@ class TestClassifier(DirectoriesMixin, TestCase):
         self.classifier.train()
         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
         self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
+
+    def test_load_classifier_not_exists(self):
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+        self.assertIsNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertIsNotNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_incompatible_version(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = IncompatibleClassifierVersionError()
+        self.assertIsNone(load_classifier())
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_os_error(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = OSError()
+        self.assertIsNone(load_classifier())
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))

@@ -460,7 +460,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
 
         self._assert_first_last_send_progress()
 
-    @mock.patch("documents.consumer.DocumentClassifier")
+    @mock.patch("documents.consumer.load_classifier")
     def testClassifyDocument(self, m):
         correspondent = Correspondent.objects.create(name="test")
         dtype = DocumentType.objects.create(name="test")

@@ -20,7 +20,7 @@ class TestSettings(TestCase):
         self.assertEqual(default_threads, 1)
 
     def test_workers_threads(self):
-        for i in range(2, 64):
+        for i in range(1, 64):
             with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
                 cpu_count.return_value = i
 
@@ -31,4 +31,4 @@ class TestSettings(TestCase):
                 self.assertTrue(default_workers >= 1)
                 self.assertTrue(default_threads >= 1)
 
-                self.assertTrue(default_workers * default_threads < i, f"{i}")
+                self.assertTrue(default_workers * default_threads <= i, f"{i}")

@@ -1,11 +1,12 @@
 from datetime import datetime
+import os
 from unittest import mock
 
 from django.conf import settings
 from django.test import TestCase
 from django.utils import timezone
 
 from documents import tasks
-from documents.models import Document
+from documents.models import Document, Tag, Correspondent, DocumentType
 from documents.sanity_checker import SanityError, SanityFailedError
 from documents.tests.utils import DirectoriesMixin
 
@@ -22,8 +23,55 @@ class TestTasks(DirectoriesMixin, TestCase):
 
         tasks.index_optimize()
 
-    def test_train_classifier(self):
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_no_auto_matching(self, load_classifier):
         tasks.train_classifier()
+        load_classifier.assert_not_called()
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_tag(self, load_classifier):
+        load_classifier.return_value = None
+        Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_type(self, load_classifier):
+        load_classifier.return_value = None
+        DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_correspondent(self, load_classifier):
+        load_classifier.return_value = None
+        Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    def test_train_classifier(self):
+        c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        doc = Document.objects.create(correspondent=c, content="test", title="test")
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime = os.stat(settings.MODEL_FILE).st_mtime
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime2 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertEqual(mtime, mtime2)
+
+        doc.content = "test2"
+        doc.save()
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime3 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertNotEqual(mtime2, mtime3)
 
     @mock.patch("documents.tasks.sanity_checker.check_sanity")
     def test_sanity_check(self, m):
@@ -35,7 +83,7 @@ class TestTasks(DirectoriesMixin, TestCase):
         self.assertRaises(SanityFailedError, tasks.sanity_check)
         m.assert_called_once()
 
-    def test_culk_update_documents(self):
+    def test_bulk_update_documents(self):
         doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
                                 created=timezone.now(), modified=timezone.now())

@@ -35,6 +35,7 @@ from rest_framework.viewsets import (
 import documents.index as index
 from paperless.db import GnuPG
 from paperless.views import StandardPagination
+from .classifier import load_classifier
 from .filters import (
     CorrespondentFilterSet,
     DocumentFilterSet,
@@ -42,6 +43,7 @@ from .filters import (
     DocumentTypeFilterSet,
     LogFilterSet
 )
+from .matching import match_correspondents, match_tags, match_document_types
 from .models import Correspondent, Document, Log, Tag, DocumentType, SavedView
 from .parsers import get_parser_class_for_mime_type
 from .serialisers import (
@@ -133,10 +135,6 @@ class DocumentTypeViewSet(ModelViewSet):
     ordering_fields = ("name", "matching_algorithm", "match", "document_count")
 
 
-class BulkEditForm(object):
-    pass
-
-
 class DocumentViewSet(RetrieveModelMixin,
                       UpdateModelMixin,
                       DestroyModelMixin,
@@ -230,31 +228,50 @@ class DocumentViewSet(RetrieveModelMixin,
     def metadata(self, request, pk=None):
         try:
             doc = Document.objects.get(pk=pk)
-
-            meta = {
-                "original_checksum": doc.checksum,
-                "original_size": os.stat(doc.source_path).st_size,
-                "original_mime_type": doc.mime_type,
-                "media_filename": doc.filename,
-                "has_archive_version": os.path.isfile(doc.archive_path),
-                "original_metadata": self.get_metadata(
-                    doc.source_path, doc.mime_type)
-            }
-
-            if doc.archive_checksum and os.path.isfile(doc.archive_path):
-                meta['archive_checksum'] = doc.archive_checksum
-                meta['archive_size'] = os.stat(doc.archive_path).st_size,
-                meta['archive_metadata'] = self.get_metadata(
-                    doc.archive_path, "application/pdf")
-            else:
-                meta['archive_checksum'] = None
-                meta['archive_size'] = None
-                meta['archive_metadata'] = None
-
-            return Response(meta)
         except Document.DoesNotExist:
             raise Http404()
 
+        meta = {
+            "original_checksum": doc.checksum,
+            "original_size": os.stat(doc.source_path).st_size,
+            "original_mime_type": doc.mime_type,
+            "media_filename": doc.filename,
+            "has_archive_version": os.path.isfile(doc.archive_path),
+            "original_metadata": self.get_metadata(
+                doc.source_path, doc.mime_type)
+        }
+
+        if doc.archive_checksum and os.path.isfile(doc.archive_path):
+            meta['archive_checksum'] = doc.archive_checksum
+            meta['archive_size'] = os.stat(doc.archive_path).st_size,
+            meta['archive_metadata'] = self.get_metadata(
+                doc.archive_path, "application/pdf")
+        else:
+            meta['archive_checksum'] = None
+            meta['archive_size'] = None
+            meta['archive_metadata'] = None
+
+        return Response(meta)
+
+    @action(methods=['get'], detail=True)
+    def suggestions(self, request, pk=None):
+        try:
+            doc = Document.objects.get(pk=pk)
+        except Document.DoesNotExist:
+            raise Http404()
+
+        classifier = load_classifier()
+
+        return Response({
+            "correspondents": [
+                c.id for c in match_correspondents(doc, classifier)
+            ],
+            "tags": [t.id for t in match_tags(doc, classifier)],
+            "document_types": [
+                dt.id for dt in match_document_types(doc, classifier)
+            ]
+        })
+
     @action(methods=['get'], detail=True)
     def preview(self, request, pk=None):
         try:
@@ -382,6 +399,7 @@ class PostDocumentView(APIView):
 
         with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                          dir=settings.SCRATCH_DIR,
+                                         buffering=0,
                                          delete=False) as f:
             f.write(doc_data)
             os.utime(f.name, times=(t, t))

@@ -22,7 +22,7 @@ def path_check(var, directory):
                 exists_hint.format(directory)
             ))
         elif not os.access(directory, os.W_OK | os.X_OK):
-            messages.append(Error(
+            messages.append(Warning(
                 writeable_message.format(var),
                 writeable_hint.format(directory)
             ))

@@ -366,8 +366,10 @@ LOGGING = {
 
 def default_task_workers():
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
+        if available_cores < 4:
+            return available_cores
         return max(
             math.floor(math.sqrt(available_cores)),
             1
@@ -388,7 +390,7 @@ Q_CLUSTER = {
 
 def default_threads_per_worker(task_workers):
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
         return max(
             math.floor(available_cores / task_workers),

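These two functions are what test_workers_threads above asserts against:
workers * threads never exceeds the core count. A worked sketch of the
arithmetic, assuming the definitions in this hunk (the truncated return in
default_threads_per_worker is taken to fall back to 1):

    import math

    def workers(cores):
        if cores < 4:
            return cores
        return max(math.floor(math.sqrt(cores)), 1)

    def threads(cores, task_workers):
        return max(math.floor(cores / task_workers), 1)

    for cores in (1, 2, 4, 16, 63):
        w = workers(cores)
        t = threads(cores, w)
        assert w * t <= cores  # e.g. 16 cores -> 4 workers x 4 threads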