From 6b3ec52ed49e8e50b9b809c8724c2c6d8a7f42c4 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 12:03:24 +0100 Subject: [PATCH 01/10] todo note --- src/documents/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/documents/models.py b/src/documents/models.py index 8e0435647..cd4517a3d 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -230,6 +230,7 @@ class Document(models.Model): @property def file_type(self): + # TODO: this is not stable across python versions return mimetypes.guess_extension(str(self.mime_type)) @property From d04b54140cc33d1b048172062c2b8edb4155eed7 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:12:13 +0100 Subject: [PATCH 02/10] moved consumption dir check into the correct spot --- src/documents/consumer.py | 13 ------------ .../management/commands/document_consumer.py | 11 +++++++++- src/documents/tests/test_consumer.py | 20 ------------------- .../tests/test_management_consumer.py | 11 ++++++++++ 4 files changed, 21 insertions(+), 34 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 8fed01c30..a7afca89d 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -8,7 +8,6 @@ from django.conf import settings from django.db import transaction from django.utils import timezone -from paperless.db import GnuPG from .classifier import DocumentClassifier, IncompatibleClassifierVersionError from .file_handling import generate_filename, create_source_path_directory from .loggers import LoggingMixin @@ -40,17 +39,6 @@ class Consumer(LoggingMixin): raise ConsumerError("Cannot consume {}: It is not a file".format( self.path)) - def pre_check_consumption_dir(self): - if not settings.CONSUMPTION_DIR: - raise ConsumerError( - "The CONSUMPTION_DIR settings variable does not appear to be " - "set.") - - if not os.path.isdir(settings.CONSUMPTION_DIR): - raise ConsumerError( - "Consumption directory {} does not exist".format( - settings.CONSUMPTION_DIR)) - def pre_check_duplicate(self): with open(self.path, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() @@ -92,7 +80,6 @@ class Consumer(LoggingMixin): # Make sure that preconditions for consuming the file are met. self.pre_check_file_exists() - self.pre_check_consumption_dir() self.pre_check_directories() self.pre_check_duplicate() diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index c25d0cfa9..b738f001b 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -3,7 +3,7 @@ import os from time import sleep from django.conf import settings -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError from django_q.tasks import async_task from watchdog.events import FileSystemEventHandler from watchdog.observers.polling import PollingObserver @@ -95,6 +95,15 @@ class Command(BaseCommand): def handle(self, *args, **options): directory = options["directory"] + if not directory: + raise CommandError( + "CONSUMPTION_DIR does not appear to be set." + ) + + if not os.path.isdir(directory): + raise CommandError( + f"Consumption directory {directory} does not exist") + for entry in os.scandir(directory): _consume(entry.path) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 323f5051f..ed835a467 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -502,26 +502,6 @@ class TestConsumer(TestCase): self.fail("Should throw exception") - @override_settings(CONSUMPTION_DIR=None) - def testConsumptionDirUnset(self): - try: - self.consumer.try_consume_file(self.get_test_file()) - except ConsumerError as e: - self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.") - return - - self.fail("Should throw exception") - - @override_settings(CONSUMPTION_DIR="asd") - def testNoConsumptionDir(self): - try: - self.consumer.try_consume_file(self.get_test_file()) - except ConsumerError as e: - self.assertEqual(str(e), "Consumption directory asd does not exist") - return - - self.fail("Should throw exception") - def testDuplicates(self): self.consumer.try_consume_file(self.get_test_file()) diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 33938d450..25b71f563 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -6,6 +6,7 @@ from time import sleep from unittest import mock from django.conf import settings +from django.core.management import call_command, CommandError from django.test import TestCase, override_settings from documents.consumer import ConsumerError @@ -193,3 +194,13 @@ class TestConsumer(TestCase): @override_settings(CONSUMER_POLLING=1) def test_slow_write_incomplete_polling(self): self.test_slow_write_incomplete() + + @override_settings(CONSUMPTION_DIR="does_not_exist") + def test_consumption_directory_invalid(self): + + self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') + + @override_settings(CONSUMPTION_DIR="") + def test_consumption_directory_unset(self): + + self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') From 20c11396324755e96f02c4b60a91dd7820b9741a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:12:34 +0100 Subject: [PATCH 03/10] inotify: cleanup descriptor when done --- src/documents/management/commands/document_consumer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index b738f001b..295da4f5c 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -137,12 +137,13 @@ class Command(BaseCommand): f"Using inotify to watch directory for changes: {directory}") inotify = INotify() - inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) + descriptor = inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) try: while not self.stop_flag: for event in inotify.read(timeout=1000, read_delay=1000): file = os.path.join(directory, event.name) - if os.path.isfile(file): - _consume(file) + _consume(file) except KeyboardInterrupt: pass + + inotify.rm_watch(descriptor) From 35b2033949c1a3facbb2a2e499b7a62cbff46f13 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:13:11 +0100 Subject: [PATCH 04/10] tests: disable db logger in all tests except logger tests --- src/documents/loggers.py | 5 +++++ src/documents/tests/test_logger.py | 4 +++- src/paperless/settings.py | 2 ++ src/setup.cfg | 3 +-- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/documents/loggers.py b/src/documents/loggers.py index fd20e1288..76dbe0163 100644 --- a/src/documents/loggers.py +++ b/src/documents/loggers.py @@ -1,9 +1,14 @@ import logging import uuid +from django.conf import settings + class PaperlessHandler(logging.Handler): def emit(self, record): + if settings.DISABLE_DBHANDLER: + return + # We have to do the import here or Django will barf when it tries to # load this because the apps aren't loaded at that point from .models import Log diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py index 6e240ffc9..bbc9c2b5d 100644 --- a/src/documents/tests/test_logger.py +++ b/src/documents/tests/test_logger.py @@ -2,7 +2,7 @@ import logging import uuid from unittest import mock -from django.test import TestCase +from django.test import TestCase, override_settings from ..models import Log @@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase): self.logger = logging.getLogger( "documents.management.commands.document_consumer") + @override_settings(DISABLE_DBHANDLER=False) def test_that_it_saves_at_all(self): kw = {"group": uuid.uuid4()} @@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase): self.logger.critical("This is a critical message", extra=kw) self.assertEqual(Log.objects.all().count(), 5) + @override_settings(DISABLE_DBHANDLER=False) def test_groups(self): kw1 = {"group": uuid.uuid4()} diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1432dc5ec..4847d7bce 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -250,6 +250,8 @@ USE_TZ = True # Logging # ############################################################################### +DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER") + LOGGING = { "version": 1, "disable_existing_loggers": False, diff --git a/src/setup.cfg b/src/setup.cfg index b540f9efe..f43c9adf6 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -5,8 +5,7 @@ exclude = migrations, paperless/settings.py, .tox, */tests/* DJANGO_SETTINGS_MODULE=paperless.settings addopts = --pythonwarnings=all --cov --cov-report=html env = - PAPERLESS_SECRET=paperless - PAPERLESS_EMAIL_SECRET=paperless + PAPERLESS_DISABLE_DBHANDLER=true [coverage:run] From a4277706f2a87c917e3fbfc02dd11b2c500a498b Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:14:02 +0100 Subject: [PATCH 05/10] tests: wait for the consumer to exit before removing directories. --- src/documents/tests/test_management_consumer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 25b71f563..2d45acd01 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -38,6 +38,7 @@ class TestConsumer(TestCase): sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") def setUp(self) -> None: + self.t = None patcher = mock.patch("documents.management.commands.document_consumer.async_task") self.task_mock = patcher.start() self.addCleanup(patcher.stop) @@ -53,7 +54,12 @@ class TestConsumer(TestCase): def tearDown(self) -> None: if self.t: + # set the stop flag self.t.stop() + # wait for the consumer to exit. + self.t.join() + + remove_dirs(self.dirs) def wait_for_task_mock_call(self): n = 0 From 60ac1ddbb93b6acda61589509293b4f8009d00b6 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:19:58 +0100 Subject: [PATCH 06/10] fix warnings about unclosed files. --- src/documents/management/commands/document_consumer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 295da4f5c..7baeccce0 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -137,7 +137,8 @@ class Command(BaseCommand): f"Using inotify to watch directory for changes: {directory}") inotify = INotify() - descriptor = inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) + descriptor = inotify.add_watch( + directory, flags.CLOSE_WRITE | flags.MOVED_TO) try: while not self.stop_flag: for event in inotify.read(timeout=1000, read_delay=1000): @@ -147,3 +148,4 @@ class Command(BaseCommand): pass inotify.rm_watch(descriptor) + inotify.close() From 42c9186e91f0a60c66b37c2a3c159137b15e13ce Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:56:07 +0100 Subject: [PATCH 07/10] refrain from creating the index as part of the migrations, messes with the test cases. --- docs/setup.rst | 18 +++++++++------- .../migrations/1000_update_paperless_all.py | 21 ------------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index 88785364b..4e2826dd6 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -265,15 +265,17 @@ Migration to paperless-ng is then performed in a few simple steps: ``docker-compose.env`` to your needs. See `docker route`_ for details on which edits are advised. -6. Start paperless-ng. +6. In order to find your existing documents with the new search feature, you need + to invoke a one-time operation that will create the search index: - .. code:: bash + .. code:: shell-session - $ docker-compose up + $ docker-compose run --rm webserver document_index reindex + + This will migrate your database and create the search index. After that, + paperless will take care of maintaining the index by itself. - If you see everything working (you should see some migrations getting - applied, for instance), you can gracefully stop paperless-ng with Ctrl-C - and then start paperless-ng as usual with +7. Start paperless-ng. .. code:: bash @@ -281,11 +283,11 @@ Migration to paperless-ng is then performed in a few simple steps: This will run paperless in the background and automatically start it on system boot. -7. Paperless installed a permanent redirect to ``admin/`` in your browser. This +8. Paperless installed a permanent redirect to ``admin/`` in your browser. This redirect is still in place and prevents access to the new UI. Clear browsing cache in order to fix this. -8. Optionally, follow the instructions below to migrate your existing data to PostgreSQL. +9. Optionally, follow the instructions below to migrate your existing data to PostgreSQL. .. _setup-sqlite_to_psql: diff --git a/src/documents/migrations/1000_update_paperless_all.py b/src/documents/migrations/1000_update_paperless_all.py index dc6313dd8..f3fbbb6c1 100644 --- a/src/documents/migrations/1000_update_paperless_all.py +++ b/src/documents/migrations/1000_update_paperless_all.py @@ -5,23 +5,6 @@ from django.db import migrations, models import django.db.models.deletion -def make_index(apps, schema_editor): - Document = apps.get_model("documents", "Document") - documents = Document.objects.all() - print() - try: - print(" --> Creating document index...") - from whoosh.writing import AsyncWriter - from documents import index - ix = index.open_index(recreate=True) - with AsyncWriter(ix) as writer: - for document in documents: - index.update_document(writer, document) - except ImportError: - # index may not be relevant anymore - print(" --> Cannot create document index.") - - def logs_set_default_group(apps, schema_editor): Log = apps.get_model('documents', 'Log') for log in Log.objects.all(): @@ -99,8 +82,4 @@ class Migration(migrations.Migration): code=django.db.migrations.operations.special.RunPython.noop, reverse_code=logs_set_default_group ), - migrations.RunPython( - code=make_index, - reverse_code=django.db.migrations.operations.special.RunPython.noop, - ), ] From 938499706ce8a1ff01234cd833dd1c044cc92e1d Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 13:59:24 +0100 Subject: [PATCH 08/10] fixed an issue with the search api opening the index on import (that's way too early.) --- src/documents/views.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 287fb114c..96b413d67 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -202,7 +202,9 @@ class SearchView(APIView): permission_classes = (IsAuthenticated,) - ix = index.open_index() + def __init__(self, *args, **kwargs): + super(SearchView, self).__init__(*args, **kwargs) + self.ix = index.open_index() def add_infos_to_hit(self, r): doc = Document.objects.get(id=r['id']) @@ -241,7 +243,9 @@ class SearchAutoCompleteView(APIView): permission_classes = (IsAuthenticated,) - ix = index.open_index() + def __init__(self, *args, **kwargs): + super(SearchAutoCompleteView, self).__init__(*args, **kwargs) + self.ix = index.open_index() def get(self, request, format=None): if 'term' in request.query_params: From 6834e563a880c003d3daccda11b13a6486650395 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 14:00:41 +0100 Subject: [PATCH 09/10] refactored the test cases to use a mixin for setting up temporary directories. --- src/documents/tests/test_api.py | 7 +++---- src/documents/tests/test_consumer.py | 7 +++---- src/documents/tests/test_management_consumer.py | 12 +++++------- src/documents/tests/utils.py | 15 +++++++++++++++ 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index d9a2aac26..37f774891 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -7,14 +7,13 @@ from pathvalidate import ValidationError from rest_framework.test import APITestCase from documents.models import Document, Correspondent, DocumentType, Tag -from documents.tests.utils import setup_directories, remove_dirs +from documents.tests.utils import DirectoriesMixin -class DocumentApiTest(APITestCase): +class DocumentApiTest(DirectoriesMixin, APITestCase): def setUp(self): - self.dirs = setup_directories() - self.addCleanup(remove_dirs, self.dirs) + super(DocumentApiTest, self).setUp() user = User.objects.create_superuser(username="temp_admin") self.client.force_login(user=user) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index ed835a467..d1cd0adf1 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -6,7 +6,7 @@ from unittest.mock import MagicMock from django.test import TestCase, override_settings -from .utils import setup_directories, remove_dirs +from .utils import DirectoriesMixin from ..consumer import Consumer, ConsumerError from ..models import FileInfo, Tag, Correspondent, DocumentType, Document from ..parsers import DocumentParser, ParseError @@ -408,7 +408,7 @@ def fake_magic_from_file(file, mime=False): @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) -class TestConsumer(TestCase): +class TestConsumer(DirectoriesMixin, TestCase): def make_dummy_parser(self, path, logging_group): return DummyParser(path, logging_group, self.dirs.scratch_dir) @@ -417,8 +417,7 @@ class TestConsumer(TestCase): return FaultyParser(path, logging_group, self.dirs.scratch_dir) def setUp(self): - self.dirs = setup_directories() - self.addCleanup(remove_dirs, self.dirs) + super(TestConsumer, self).setUp() patcher = mock.patch("documents.parsers.document_consumer_declaration.send") m = patcher.start() diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 2d45acd01..aed824926 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -7,11 +7,11 @@ from unittest import mock from django.conf import settings from django.core.management import call_command, CommandError -from django.test import TestCase, override_settings +from django.test import override_settings, TestCase from documents.consumer import ConsumerError from documents.management.commands import document_consumer -from documents.tests.utils import setup_directories, remove_dirs +from documents.tests.utils import DirectoriesMixin class ConsumerThread(Thread): @@ -33,19 +33,17 @@ def chunked(size, source): yield source[i:i+size] -class TestConsumer(TestCase): +class TestConsumer(DirectoriesMixin, TestCase): sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") def setUp(self) -> None: + super(TestConsumer, self).setUp() self.t = None patcher = mock.patch("documents.management.commands.document_consumer.async_task") self.task_mock = patcher.start() self.addCleanup(patcher.stop) - self.dirs = setup_directories() - self.addCleanup(remove_dirs, self.dirs) - def t_start(self): self.t = ConsumerThread() self.t.start() @@ -59,7 +57,7 @@ class TestConsumer(TestCase): # wait for the consumer to exit. self.t.join() - remove_dirs(self.dirs) + super(TestConsumer, self).tearDown() def wait_for_task_mock_call(self): n = 0 diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index 7b0938ee3..83148e9c7 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -39,3 +39,18 @@ def remove_dirs(dirs): shutil.rmtree(dirs.data_dir, ignore_errors=True) shutil.rmtree(dirs.scratch_dir, ignore_errors=True) shutil.rmtree(dirs.consumption_dir, ignore_errors=True) + + +class DirectoriesMixin: + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dirs = None + + def setUp(self) -> None: + self.dirs = setup_directories() + super(DirectoriesMixin, self).setUp() + + def tearDown(self) -> None: + super(DirectoriesMixin, self).tearDown() + remove_dirs(self.dirs) From 6c308116d68904374536f5e47907a3d22c1688d1 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 27 Nov 2020 14:00:52 +0100 Subject: [PATCH 10/10] parallel tests. --- src/documents/index.py | 3 --- src/setup.cfg | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index a6c3abba8..ffa3e688f 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -64,9 +64,6 @@ def get_schema(): def open_index(recreate=False): - # TODO: this is not thread safe. If 2 instances try to create the index - # at the same time, this fails. This currently prevents parallel - # tests. try: if exists_in(settings.INDEX_DIR) and not recreate: return open_dir(settings.INDEX_DIR) diff --git a/src/setup.cfg b/src/setup.cfg index f43c9adf6..2a1a348bd 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -3,7 +3,7 @@ exclude = migrations, paperless/settings.py, .tox, */tests/* [tool:pytest] DJANGO_SETTINGS_MODULE=paperless.settings -addopts = --pythonwarnings=all --cov --cov-report=html +addopts = --pythonwarnings=all --cov --cov-report=html -n auto env = PAPERLESS_DISABLE_DBHANDLER=true