Mirror of https://github.com/paperless-ngx/paperless-ngx.git
Merge branch 'dev' into feature-websockets-status

--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -26,6 +26,34 @@ def preprocess_content(content):
     return content
 
 
+def load_classifier():
+    if not os.path.isfile(settings.MODEL_FILE):
+        logger.debug(
+            f"Document classification model does not exist (yet), not "
+            f"performing automatic matching."
+        )
+        return None
+
+    try:
+        classifier = DocumentClassifier()
+        classifier.reload()
+    except (EOFError, IncompatibleClassifierVersionError) as e:
+        # there's something wrong with the model file.
+        logger.error(
+            f"Unrecoverable error while loading document "
+            f"classification model: {str(e)}, deleting model file."
+        )
+        os.unlink(settings.MODEL_FILE)
+        classifier = None
+    except OSError as e:
+        logger.error(
+            f"Error while loading document classification model: {str(e)}"
+        )
+        classifier = None
+
+    return classifier
+
+
 class DocumentClassifier(object):
 
     FORMAT_VERSION = 6
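
Note: the new load_classifier() helper gives every call site the same contract, a ready-to-use classifier or None. A minimal sketch of the resulting call-site pattern (illustration only, not part of the commit; match_tags is taken from the views.py hunk further down):

from documents.classifier import load_classifier
from documents.matching import match_tags

def classify(document):
    # None means the model file is missing or broken; matching then
    # simply produces no automatic suggestions.
    classifier = load_classifier()
    return match_tags(document, classifier)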

--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -14,7 +14,7 @@ from django.utils import timezone
 from filelock import FileLock
 from rest_framework.reverse import reverse
 
-from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from .classifier import load_classifier
 from .file_handling import create_source_path_directory, \
     generate_unique_filename
 from .loggers import LoggingMixin

@@ -262,14 +262,8 @@ class Consumer(LoggingMixin):
         # reloading the classifier multiple times, since there are multiple
         # post-consume hooks that all require the classifier.
-        try:
-            classifier = DocumentClassifier()
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            self.log(
-                "warning",
-                f"Cannot classify documents: {e}.")
-            classifier = None
+
+        classifier = load_classifier()
 
         self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
         # now that everything is done, we can start to store the document
         # in the system. This will be a transaction and reasonably fast.

--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -2,8 +2,7 @@ import logging
 
 from django.core.management.base import BaseCommand
 
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import load_classifier
 from documents.models import Document
 from ...mixins import Renderable
 from ...signals.handlers import set_correspondent, set_document_type, set_tags

@@ -70,13 +69,7 @@ class Command(Renderable, BaseCommand):
         queryset = Document.objects.all()
         documents = queryset.distinct()
 
-        classifier = DocumentClassifier()
-        try:
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning(
-                f"Cannot classify documents: {e}.")
-            classifier = None
+        classifier = load_classifier()
 
         for document in documents:
             logging.getLogger(__name__).info(

--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -6,10 +6,9 @@ from django.db.models.signals import post_save
 from whoosh.writing import AsyncWriter
 
 from documents import index, sanity_checker
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, load_classifier
 from documents.consumer import Consumer, ConsumerError
-from documents.models import Document
+from documents.models import Document, Tag, DocumentType, Correspondent
 from documents.sanity_checker import SanityFailedError

@@ -30,13 +29,18 @@ def index_reindex():
 
 
 def train_classifier():
-    classifier = DocumentClassifier()
+    if (not Tag.objects.filter(
+            matching_algorithm=Tag.MATCH_AUTO).exists() and
+            not DocumentType.objects.filter(
+                matching_algorithm=Tag.MATCH_AUTO).exists() and
+            not Correspondent.objects.filter(
+                matching_algorithm=Tag.MATCH_AUTO).exists()):
 
-    try:
-        # load the classifier, since we might not have to train it again.
-        classifier.reload()
-    except (OSError, EOFError, IncompatibleClassifierVersionError):
-        # This is what we're going to fix here.
         return
 
+    classifier = load_classifier()
+
+    if not classifier:
+        classifier = DocumentClassifier()
+
     try:

@@ -52,7 +56,7 @@ def train_classifier():
         )
 
     except Exception as e:
-        logging.getLogger(__name__).error(
+        logging.getLogger(__name__).warning(
            "Classifier error: " + str(e)
         )
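
Note: train_classifier() now returns early when nothing uses automatic matching, so installs without MATCH_AUTO objects never write a model file. An equivalent, more compact form of the triple exists() guard (illustration only, not the committed code):

from documents.models import Correspondent, DocumentType, Tag

def any_auto_matching_objects():
    # True as soon as one Tag, DocumentType or Correspondent uses MATCH_AUTO.
    return any(
        model.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        for model in (Tag, DocumentType, Correspondent)
    )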

--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -590,6 +590,10 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(meta['original_metadata']), 0)
         self.assertGreater(len(meta['archive_metadata']), 0)
 
+    def test_get_metadata_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34576/metadata/")
+        self.assertEqual(response.status_code, 404)
+
     def test_get_metadata_no_archive(self):
         doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")

@@ -605,6 +609,30 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertGreater(len(meta['original_metadata']), 0)
         self.assertIsNone(meta['archive_metadata'])
 
+    def test_get_empty_suggestions(self):
+        doc = Document.objects.create(title="test", mime_type="application/pdf")
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data, {'correspondents': [], 'tags': [], 'document_types': []})
+
+    def test_get_suggestions_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34676/suggestions/")
+        self.assertEqual(response.status_code, 404)
+
+    @mock.patch("documents.views.match_correspondents")
+    @mock.patch("documents.views.match_tags")
+    @mock.patch("documents.views.match_document_types")
+    def test_get_suggestions(self, match_document_types, match_tags, match_correspondents):
+        doc = Document.objects.create(title="test", mime_type="application/pdf", content="this is an invoice!")
+        match_tags.return_value = [Tag(id=56), Tag(id=123)]
+        match_document_types.return_value = [DocumentType(id=23)]
+        match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+        self.assertEqual(response.data, {'correspondents': [88,2], 'tags': [56,123], 'document_types': [23]})
+
     def test_saved_views(self):
         u1 = User.objects.create_user("user1")
         u2 = User.objects.create_user("user2")

--- a/src/documents/tests/test_classifier.py
+++ b/src/documents/tests/test_classifier.py
@@ -1,10 +1,13 @@
 import os
 import tempfile
+from pathlib import Path
 from time import sleep
+from unittest import mock
 
+from django.conf import settings
 from django.test import TestCase, override_settings
 
-from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
 from documents.models import Correspondent, Document, Tag, DocumentType
 from documents.tests.utils import DirectoriesMixin

@@ -235,3 +238,30 @@ class TestClassifier(DirectoriesMixin, TestCase):
         self.classifier.train()
         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
         self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
+
+    def test_load_classifier_not_exists(self):
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+        self.assertIsNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertIsNotNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_incompatible_version(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = IncompatibleClassifierVersionError()
+        self.assertIsNone(load_classifier())
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_os_error(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = OSError()
+        self.assertIsNone(load_classifier())
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))

--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -460,7 +460,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
 
         self._assert_first_last_send_progress()
 
-    @mock.patch("documents.consumer.DocumentClassifier")
+    @mock.patch("documents.consumer.load_classifier")
     def testClassifyDocument(self, m):
         correspondent = Correspondent.objects.create(name="test")
         dtype = DocumentType.objects.create(name="test")
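
Note: the patch target follows name lookup, not the definition site. Since consumer.py now imports load_classifier directly, the test must patch documents.consumer.load_classifier rather than documents.classifier.load_classifier. Sketch of the idiom (assumes the paperless test environment):

from unittest import mock

with mock.patch("documents.consumer.load_classifier") as load_classifier:
    load_classifier.return_value = None  # simulate "no trained model"
    # ... run the consumer; matching sees classifier=None and suggests nothing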

--- a/src/paperless/tests/test_settings.py
+++ b/src/paperless/tests/test_settings.py
@@ -20,7 +20,7 @@ class TestSettings(TestCase):
         self.assertEqual(default_threads, 1)
 
     def test_workers_threads(self):
-        for i in range(2, 64):
+        for i in range(1, 64):
             with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
                 cpu_count.return_value = i

@@ -31,4 +31,4 @@ class TestSettings(TestCase):
                 self.assertTrue(default_workers >= 1)
                 self.assertTrue(default_threads >= 1)
 
-                self.assertTrue(default_workers * default_threads < i, f"{i}")
+                self.assertTrue(default_workers * default_threads <= i, f"{i}")

--- a/src/documents/tests/test_tasks.py
+++ b/src/documents/tests/test_tasks.py
@@ -1,11 +1,12 @@
 from datetime import datetime
+import os
 from unittest import mock
 
+from django.conf import settings
 from django.test import TestCase
 from django.utils import timezone
 
 from documents import tasks
-from documents.models import Document
+from documents.models import Document, Tag, Correspondent, DocumentType
 from documents.sanity_checker import SanityError, SanityFailedError
 from documents.tests.utils import DirectoriesMixin

@@ -22,8 +23,55 @@ class TestTasks(DirectoriesMixin, TestCase):
 
         tasks.index_optimize()
 
-    def test_train_classifier(self):
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_no_auto_matching(self, load_classifier):
         tasks.train_classifier()
+        load_classifier.assert_not_called()
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_tag(self, load_classifier):
+        load_classifier.return_value = None
+        Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_type(self, load_classifier):
+        load_classifier.return_value = None
+        DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_correspondent(self, load_classifier):
+        load_classifier.return_value = None
+        Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    def test_train_classifier(self):
+        c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        doc = Document.objects.create(correspondent=c, content="test", title="test")
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime = os.stat(settings.MODEL_FILE).st_mtime
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime2 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertEqual(mtime, mtime2)
+
+        doc.content = "test2"
+        doc.save()
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime3 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertNotEqual(mtime2, mtime3)
 
     @mock.patch("documents.tasks.sanity_checker.check_sanity")
     def test_sanity_check(self, m):

@@ -35,7 +83,7 @@ class TestTasks(DirectoriesMixin, TestCase):
         self.assertRaises(SanityFailedError, tasks.sanity_check)
         m.assert_called_once()
 
-    def test_culk_update_documents(self):
+    def test_bulk_update_documents(self):
         doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
                                        created=timezone.now(), modified=timezone.now())

--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -35,6 +35,7 @@ from rest_framework.viewsets import (
 import documents.index as index
 from paperless.db import GnuPG
 from paperless.views import StandardPagination
+from .classifier import load_classifier
 from .filters import (
     CorrespondentFilterSet,
     DocumentFilterSet,

@@ -42,6 +43,7 @@ from .filters import (
     DocumentTypeFilterSet,
     LogFilterSet
 )
+from .matching import match_correspondents, match_tags, match_document_types
 from .models import Correspondent, Document, Log, Tag, DocumentType, SavedView
 from .parsers import get_parser_class_for_mime_type
 from .serialisers import (

@@ -133,10 +135,6 @@ class DocumentTypeViewSet(ModelViewSet):
     ordering_fields = ("name", "matching_algorithm", "match", "document_count")
 
 
-class BulkEditForm(object):
-    pass
-
-
 class DocumentViewSet(RetrieveModelMixin,
                       UpdateModelMixin,
                       DestroyModelMixin,

@@ -230,31 +228,50 @@ class DocumentViewSet(RetrieveModelMixin,
     def metadata(self, request, pk=None):
         try:
             doc = Document.objects.get(pk=pk)
-
-            meta = {
-                "original_checksum": doc.checksum,
-                "original_size": os.stat(doc.source_path).st_size,
-                "original_mime_type": doc.mime_type,
-                "media_filename": doc.filename,
-                "has_archive_version": os.path.isfile(doc.archive_path),
-                "original_metadata": self.get_metadata(
-                    doc.source_path, doc.mime_type)
-            }
-
-            if doc.archive_checksum and os.path.isfile(doc.archive_path):
-                meta['archive_checksum'] = doc.archive_checksum
-                meta['archive_size'] = os.stat(doc.archive_path).st_size,
-                meta['archive_metadata'] = self.get_metadata(
-                    doc.archive_path, "application/pdf")
-            else:
-                meta['archive_checksum'] = None
-                meta['archive_size'] = None
-                meta['archive_metadata'] = None
-
-            return Response(meta)
         except Document.DoesNotExist:
             raise Http404()
 
+        meta = {
+            "original_checksum": doc.checksum,
+            "original_size": os.stat(doc.source_path).st_size,
+            "original_mime_type": doc.mime_type,
+            "media_filename": doc.filename,
+            "has_archive_version": os.path.isfile(doc.archive_path),
+            "original_metadata": self.get_metadata(
+                doc.source_path, doc.mime_type)
+        }
+
+        if doc.archive_checksum and os.path.isfile(doc.archive_path):
+            meta['archive_checksum'] = doc.archive_checksum
+            meta['archive_size'] = os.stat(doc.archive_path).st_size,
+            meta['archive_metadata'] = self.get_metadata(
+                doc.archive_path, "application/pdf")
+        else:
+            meta['archive_checksum'] = None
+            meta['archive_size'] = None
+            meta['archive_metadata'] = None
+
+        return Response(meta)
+
+    @action(methods=['get'], detail=True)
+    def suggestions(self, request, pk=None):
+        try:
+            doc = Document.objects.get(pk=pk)
+        except Document.DoesNotExist:
+            raise Http404()
+
+        classifier = load_classifier()
+
+        return Response({
+            "correspondents": [
+                c.id for c in match_correspondents(doc, classifier)
+            ],
+            "tags": [t.id for t in match_tags(doc, classifier)],
+            "document_types": [
+                dt.id for dt in match_document_types(doc, classifier)
+            ]
+        })
+
     @action(methods=['get'], detail=True)
     def preview(self, request, pk=None):
         try:
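
The new suggestions action can be exercised like any other document endpoint; a sketch using requests (host, port and credentials are assumptions, the response shape is taken from the test_api hunk above):

import requests

resp = requests.get(
    "http://localhost:8000/api/documents/123/suggestions/",
    auth=("user", "pass"),  # hypothetical credentials
)
resp.raise_for_status()
# e.g. {'correspondents': [88, 2], 'tags': [56, 123], 'document_types': [23]}
print(resp.json())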

@@ -382,6 +399,7 @@ class PostDocumentView(APIView):
 
         with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                          dir=settings.SCRATCH_DIR,
+                                         buffering=0,
                                          delete=False) as f:
             f.write(doc_data)
             os.utime(f.name, times=(t, t))

--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -22,7 +22,7 @@ def path_check(var, directory):
             exists_hint.format(directory)
         ))
     elif not os.access(directory, os.W_OK | os.X_OK):
-        messages.append(Error(
+        messages.append(Warning(
             writeable_message.format(var),
             writeable_hint.format(directory)
         ))
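
Note: in Django's system check framework, an Error aborts most management commands at startup while a Warning is only printed, so a non-writable directory no longer prevents the server from starting. Minimal illustration of the two severities (hypothetical check, not from this commit):

from django.core.checks import Error, Warning, register

@register()
def example_check(app_configs, **kwargs):
    # A Warning is reported but lets the command proceed; an Error would stop it.
    return [Warning("directory is not writeable")]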

--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -366,8 +366,10 @@ LOGGING = {
 
 def default_task_workers():
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
+        if available_cores < 4:
+            return available_cores
         return max(
             math.floor(math.sqrt(available_cores)),
             1

@@ -388,7 +390,7 @@ Q_CLUSTER = {
 
 def default_threads_per_worker(task_workers):
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
         return max(
             math.floor(available_cores / task_workers),
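
Worked example of the new defaults (self-contained sketch; the tail of default_threads_per_worker beyond this hunk is assumed to clamp at 1): on 16 cores, workers = floor(sqrt(16)) = 4 and threads = floor(16 / 4) = 4, so workers * threads = 16 <= 16, which is exactly the relaxed bound asserted in the updated test_settings hunk above.

import math

def workers_and_threads(cores):
    # mirrors default_task_workers() / default_threads_per_worker()
    workers = cores if cores < 4 else max(math.floor(math.sqrt(cores)), 1)
    threads = max(math.floor(cores / workers), 1)
    return workers, threads

for cores in range(1, 64):
    workers, threads = workers_and_threads(cores)
    assert workers * threads <= cores  # the invariant the new test checks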