Mirror of https://github.com/paperless-ngx/paperless-ngx.git

	Merge branch 'dev' into feature-websockets-status
Author: jonaswinkler

@@ -26,6 +26,34 @@ def preprocess_content(content):
     return content
 
 
+def load_classifier():
+    if not os.path.isfile(settings.MODEL_FILE):
+        logger.debug(
+            f"Document classification model does not exist (yet), not "
+            f"performing automatic matching."
+        )
+        return None
+
+    try:
+        classifier = DocumentClassifier()
+        classifier.reload()
+    except (EOFError, IncompatibleClassifierVersionError) as e:
+        # there's something wrong with the model file.
+        logger.error(
+            f"Unrecoverable error while loading document "
+            f"classification model: {str(e)}, deleting model file."
+        )
+        os.unlink(settings.MODEL_FILE)
+        classifier = None
+    except OSError as e:
+        logger.error(
+            f"Error while loading document classification model: {str(e)}"
+        )
+        classifier = None
+
+    return classifier
+
+
 class DocumentClassifier(object):
 
     FORMAT_VERSION = 6

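load_classifier() centralizes the error handling that every call site used to
duplicate: it returns a ready-to-use DocumentClassifier, or None when no usable
model exists. A minimal calling sketch (the document variable is hypothetical;
predict_tags() is the classifier method the tests further down exercise):

    from documents.classifier import load_classifier

    classifier = load_classifier()
    if classifier is not None:
        # The model file existed and loaded cleanly.
        tag_pks = classifier.predict_tags(document.content)
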
@@ -14,7 +14,7 @@ from django.utils import timezone
 from filelock import FileLock
 from rest_framework.reverse import reverse
 
-from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from .classifier import load_classifier
 from .file_handling import create_source_path_directory, \
     generate_unique_filename
 from .loggers import LoggingMixin
@@ -262,14 +262,8 @@ class Consumer(LoggingMixin):
         #   reloading the classifier multiple times, since there are multiple
         #   post-consume hooks that all require the classifier.
 
-        try:
-            classifier = DocumentClassifier()
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            self.log(
-                "warning",
-                f"Cannot classify documents: {e}.")
-            classifier = None
+        classifier = load_classifier()
 
         self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
         # now that everything is done, we can start to store the document
         # in the system. This will be a transaction and reasonably fast.

@@ -2,8 +2,7 @@ import logging
 
 from django.core.management.base import BaseCommand
 
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import load_classifier
 from documents.models import Document
 from ...mixins import Renderable
 from ...signals.handlers import set_correspondent, set_document_type, set_tags
@@ -70,13 +69,7 @@ class Command(Renderable, BaseCommand):
             queryset = Document.objects.all()
         documents = queryset.distinct()
 
-        classifier = DocumentClassifier()
-        try:
-            classifier.reload()
-        except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning(
-                f"Cannot classify documents: {e}.")
-            classifier = None
+        classifier = load_classifier()
 
         for document in documents:
             logging.getLogger(__name__).info(

@@ -6,10 +6,9 @@ from django.db.models.signals import post_save
 from whoosh.writing import AsyncWriter
 
 from documents import index, sanity_checker
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, load_classifier
 from documents.consumer import Consumer, ConsumerError
-from documents.models import Document
+from documents.models import Document, Tag, DocumentType, Correspondent
 from documents.sanity_checker import SanityFailedError
 
 
@@ -30,13 +29,18 @@ def index_reindex():
 
 
 def train_classifier():
-    classifier = DocumentClassifier()
+    if (not Tag.objects.filter(
+                matching_algorithm=Tag.MATCH_AUTO).exists() and
+        not DocumentType.objects.filter(
+            matching_algorithm=Tag.MATCH_AUTO).exists() and
+        not Correspondent.objects.filter(
+            matching_algorithm=Tag.MATCH_AUTO).exists()):
 
-    try:
-        # load the classifier, since we might not have to train it again.
-        classifier.reload()
-    except (OSError, EOFError, IncompatibleClassifierVersionError):
-        pass
+        return
+
+    classifier = load_classifier()
+
+    if not classifier:
+        classifier = DocumentClassifier()
 
     try:
@@ -52,7 +56,7 @@ def train_classifier():
             )
 
     except Exception as e:
-        logging.getLogger(__name__).error(
+        logging.getLogger(__name__).warning(
            "Classifier error: " + str(e)
         )

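The guard added at the top of train_classifier() skips training when nothing is
configured for automatic matching. The same check written out as a standalone
sketch (the helper name is hypothetical; MATCH_AUTO is shared by all three
matching models, which is why the diff filters DocumentType and Correspondent
with Tag.MATCH_AUTO as well):

    from documents.models import Correspondent, DocumentType, Tag

    def any_auto_matching_objects():
        # True if at least one tag, document type or correspondent is set
        # to automatic matching -- only then is there anything to train on.
        return (
            Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
            or DocumentType.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists()
            or Correspondent.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists()
        )
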
@@ -590,6 +590,10 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(meta['original_metadata']), 0)
         self.assertGreater(len(meta['archive_metadata']), 0)
 
+    def test_get_metadata_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34576/metadata/")
+        self.assertEqual(response.status_code, 404)
+
     def test_get_metadata_no_archive(self):
         doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
 
@@ -605,6 +609,30 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertGreater(len(meta['original_metadata']), 0)
         self.assertIsNone(meta['archive_metadata'])
 
+    def test_get_empty_suggestions(self):
+        doc = Document.objects.create(title="test", mime_type="application/pdf")
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data, {'correspondents': [], 'tags': [], 'document_types': []})
+
+    def test_get_suggestions_invalid_doc(self):
+        response = self.client.get(f"/api/documents/34676/suggestions/")
+        self.assertEqual(response.status_code, 404)
+
+    @mock.patch("documents.views.match_correspondents")
+    @mock.patch("documents.views.match_tags")
+    @mock.patch("documents.views.match_document_types")
+    def test_get_suggestions(self, match_document_types, match_tags, match_correspondents):
+        doc = Document.objects.create(title="test", mime_type="application/pdf", content="this is an invoice!")
+        match_tags.return_value = [Tag(id=56), Tag(id=123)]
+        match_document_types.return_value = [DocumentType(id=23)]
+        match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
+
+        response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+        self.assertEqual(response.data, {'correspondents': [88,2], 'tags': [56,123], 'document_types': [23]})
+
     def test_saved_views(self):
         u1 = User.objects.create_user("user1")
         u2 = User.objects.create_user("user2")

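The new suggestions tests also pin down the response shape: plain id lists under
"correspondents", "tags" and "document_types". A hypothetical client call
against a running instance (session and base_url are illustrative assumptions,
not part of this commit):

    resp = session.get(f"{base_url}/api/documents/{doc_pk}/suggestions/")
    resp.raise_for_status()
    suggestions = resp.json()
    # e.g. {"correspondents": [88, 2], "tags": [56, 123], "document_types": [23]}
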
@@ -1,10 +1,13 @@
 import os
 import tempfile
+from pathlib import Path
 from time import sleep
+from unittest import mock
 
 from django.conf import settings
 from django.test import TestCase, override_settings
 
-from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
+from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
 from documents.models import Correspondent, Document, Tag, DocumentType
 from documents.tests.utils import DirectoriesMixin
 
@@ -235,3 +238,30 @@ class TestClassifier(DirectoriesMixin, TestCase):
         self.classifier.train()
         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
         self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
+
+    def test_load_classifier_not_exists(self):
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+        self.assertIsNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertIsNotNone(load_classifier())
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_incompatible_version(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = IncompatibleClassifierVersionError()
+        self.assertIsNone(load_classifier())
+        self.assertFalse(os.path.exists(settings.MODEL_FILE))
+
+    @mock.patch("documents.classifier.DocumentClassifier.reload")
+    def test_load_classifier_os_error(self, reload):
+        Path(settings.MODEL_FILE).touch()
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))
+
+        reload.side_effect = OSError()
+        self.assertIsNone(load_classifier())
+        self.assertTrue(os.path.exists(settings.MODEL_FILE))

@@ -460,7 +460,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
 
         self._assert_first_last_send_progress()
 
-    @mock.patch("documents.consumer.DocumentClassifier")
+    @mock.patch("documents.consumer.load_classifier")
     def testClassifyDocument(self, m):
         correspondent = Correspondent.objects.create(name="test")
         dtype = DocumentType.objects.create(name="test")

@@ -20,7 +20,7 @@ class TestSettings(TestCase):
         self.assertEqual(default_threads, 1)
 
     def test_workers_threads(self):
-        for i in range(2, 64):
+        for i in range(1, 64):
             with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
                 cpu_count.return_value = i
 
@@ -31,4 +31,4 @@ class TestSettings(TestCase):
                 self.assertTrue(default_workers >= 1)
                 self.assertTrue(default_threads >= 1)
 
-                self.assertTrue(default_workers * default_threads < i, f"{i}")
+                self.assertTrue(default_workers * default_threads <= i, f"{i}")

@@ -1,11 +1,12 @@
 from datetime import datetime
+import os
 from unittest import mock
 
 from django.conf import settings
 from django.test import TestCase
 from django.utils import timezone
 
 from documents import tasks
-from documents.models import Document
+from documents.models import Document, Tag, Correspondent, DocumentType
 from documents.sanity_checker import SanityError, SanityFailedError
 from documents.tests.utils import DirectoriesMixin
 
@@ -22,8 +23,55 @@ class TestTasks(DirectoriesMixin, TestCase):
 
         tasks.index_optimize()
 
-    def test_train_classifier(self):
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_no_auto_matching(self, load_classifier):
         tasks.train_classifier()
+        load_classifier.assert_not_called()
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_tag(self, load_classifier):
+        load_classifier.return_value = None
+        Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_type(self, load_classifier):
+        load_classifier.return_value = None
+        DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    @mock.patch("documents.tasks.load_classifier")
+    def test_train_classifier_with_auto_correspondent(self, load_classifier):
+        load_classifier.return_value = None
+        Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        tasks.train_classifier()
+        load_classifier.assert_called_once()
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+    def test_train_classifier(self):
+        c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
+        doc = Document.objects.create(correspondent=c, content="test", title="test")
+        self.assertFalse(os.path.isfile(settings.MODEL_FILE))
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime = os.stat(settings.MODEL_FILE).st_mtime
+
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime2 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertEqual(mtime, mtime2)
+
+        doc.content = "test2"
+        doc.save()
+        tasks.train_classifier()
+        self.assertTrue(os.path.isfile(settings.MODEL_FILE))
+        mtime3 = os.stat(settings.MODEL_FILE).st_mtime
+        self.assertNotEqual(mtime2, mtime3)
 
     @mock.patch("documents.tasks.sanity_checker.check_sanity")
     def test_sanity_check(self, m):
@@ -35,7 +83,7 @@ class TestTasks(DirectoriesMixin, TestCase):
         self.assertRaises(SanityFailedError, tasks.sanity_check)
         m.assert_called_once()
 
-    def test_culk_update_documents(self):
+    def test_bulk_update_documents(self):
         doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
                                 created=timezone.now(), modified=timezone.now())

@@ -35,6 +35,7 @@ from rest_framework.viewsets import (
 import documents.index as index
 from paperless.db import GnuPG
 from paperless.views import StandardPagination
+from .classifier import load_classifier
 from .filters import (
     CorrespondentFilterSet,
     DocumentFilterSet,
@@ -42,6 +43,7 @@ from .filters import (
     DocumentTypeFilterSet,
     LogFilterSet
 )
+from .matching import match_correspondents, match_tags, match_document_types
 from .models import Correspondent, Document, Log, Tag, DocumentType, SavedView
 from .parsers import get_parser_class_for_mime_type
 from .serialisers import (
@@ -133,10 +135,6 @@ class DocumentTypeViewSet(ModelViewSet):
     ordering_fields = ("name", "matching_algorithm", "match", "document_count")
 
 
-class BulkEditForm(object):
-    pass
-
-
 class DocumentViewSet(RetrieveModelMixin,
                       UpdateModelMixin,
                       DestroyModelMixin,
@@ -230,31 +228,50 @@ class DocumentViewSet(RetrieveModelMixin,
     def metadata(self, request, pk=None):
         try:
             doc = Document.objects.get(pk=pk)
-
-            meta = {
-                "original_checksum": doc.checksum,
-                "original_size": os.stat(doc.source_path).st_size,
-                "original_mime_type": doc.mime_type,
-                "media_filename": doc.filename,
-                "has_archive_version": os.path.isfile(doc.archive_path),
-                "original_metadata": self.get_metadata(
-                    doc.source_path, doc.mime_type)
-            }
-
-            if doc.archive_checksum and os.path.isfile(doc.archive_path):
-                meta['archive_checksum'] = doc.archive_checksum
-                meta['archive_size'] = os.stat(doc.archive_path).st_size,
-                meta['archive_metadata'] = self.get_metadata(
-                    doc.archive_path, "application/pdf")
-            else:
-                meta['archive_checksum'] = None
-                meta['archive_size'] = None
-                meta['archive_metadata'] = None
-
-            return Response(meta)
         except Document.DoesNotExist:
             raise Http404()
 
+        meta = {
+            "original_checksum": doc.checksum,
+            "original_size": os.stat(doc.source_path).st_size,
+            "original_mime_type": doc.mime_type,
+            "media_filename": doc.filename,
+            "has_archive_version": os.path.isfile(doc.archive_path),
+            "original_metadata": self.get_metadata(
+                doc.source_path, doc.mime_type)
+        }
+
+        if doc.archive_checksum and os.path.isfile(doc.archive_path):
+            meta['archive_checksum'] = doc.archive_checksum
+            meta['archive_size'] = os.stat(doc.archive_path).st_size,
+            meta['archive_metadata'] = self.get_metadata(
+                doc.archive_path, "application/pdf")
+        else:
+            meta['archive_checksum'] = None
+            meta['archive_size'] = None
+            meta['archive_metadata'] = None
+
+        return Response(meta)
+
+    @action(methods=['get'], detail=True)
+    def suggestions(self, request, pk=None):
+        try:
+            doc = Document.objects.get(pk=pk)
+        except Document.DoesNotExist:
+            raise Http404()
+
+        classifier = load_classifier()
+
+        return Response({
+            "correspondents": [
+                c.id for c in match_correspondents(doc, classifier)
+            ],
+            "tags": [t.id for t in match_tags(doc, classifier)],
+            "document_types": [
+                dt.id for dt in match_document_types(doc, classifier)
+            ]
+        })
+
     @action(methods=['get'], detail=True)
     def preview(self, request, pk=None):
         try:
@@ -382,6 +399,7 @@ class PostDocumentView(APIView):
 
         with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                          dir=settings.SCRATCH_DIR,
+                                         buffering=0,
                                          delete=False) as f:
             f.write(doc_data)
             os.utime(f.name, times=(t, t))

@@ -22,7 +22,7 @@ def path_check(var, directory):
                 exists_hint.format(directory)
             ))
         elif not os.access(directory, os.W_OK | os.X_OK):
-            messages.append(Error(
+            messages.append(Warning(
                 writeable_message.format(var),
                 writeable_hint.format(directory)
             ))

@@ -366,8 +366,10 @@ LOGGING = {
 
 def default_task_workers():
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
+        if available_cores < 4:
+            return available_cores
         return max(
             math.floor(math.sqrt(available_cores)),
             1
@@ -388,7 +390,7 @@ Q_CLUSTER = {
 
 def default_threads_per_worker(task_workers):
     # always leave one core open
-    available_cores = max(multiprocessing.cpu_count() - 1, 1)
+    available_cores = max(multiprocessing.cpu_count(), 1)
     try:
         return max(
             math.floor(available_cores / task_workers),

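These two functions are what test_workers_threads above asserts against:
workers * threads never exceeds the core count. A worked sketch of the
arithmetic, assuming the definitions in this hunk (the truncated return in
default_threads_per_worker is taken to fall back to 1):

    import math

    def workers(cores):
        if cores < 4:
            return cores
        return max(math.floor(math.sqrt(cores)), 1)

    def threads(cores, task_workers):
        return max(math.floor(cores / task_workers), 1)

    for cores in (1, 2, 4, 16, 63):
        w = workers(cores)
        t = threads(cores, w)
        assert w * t <= cores  # e.g. 16 cores -> 4 workers x 4 threads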