From fa9a5cc247afec928af375506824ef7c404a6189 Mon Sep 17 00:00:00 2001 From: jayme-github Date: Sun, 29 Nov 2020 15:39:43 +0100 Subject: [PATCH 1/7] Create tags from sub directories The names of sub directories in the consumer directory will be added as tags for the document to be consumed. To enable this, set: PAPERLESS_CONSUMER_RECURSIVE=1 PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=1 Fixes #50 --- Pipfile | 2 +- .../management/commands/document_consumer.py | 86 ++++++++++++++----- .../tests/test_management_consumer.py | 42 ++++++++- src/paperless/settings.py | 9 ++ 4 files changed, 115 insertions(+), 24 deletions(-) diff --git a/Pipfile b/Pipfile index 105efd0ad..ee25c5682 100644 --- a/Pipfile +++ b/Pipfile @@ -38,7 +38,7 @@ scikit-learn="~=0.23.2" whitenoise = "~=5.2.0" watchdog = "*" whoosh="~=2.7.4" -inotify-simple = "*" +inotifyrecursive = ">=0.3.4" [dev-packages] coveralls = "*" diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 7baeccce0..ea63cd19d 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,31 +1,60 @@ import logging import os +from pathlib import Path from time import sleep from django.conf import settings from django.core.management.base import BaseCommand, CommandError +from django.utils.text import slugify from django_q.tasks import async_task from watchdog.events import FileSystemEventHandler from watchdog.observers.polling import PollingObserver +from documents.models import Tag + try: - from inotify_simple import INotify, flags + from inotifyrecursive import INotify, flags except ImportError: INotify = flags = None logger = logging.getLogger(__name__) -def _consume(file): - try: - if os.path.isfile(file): - async_task("documents.tasks.consume_file", - file, - task_name=os.path.basename(file)[:100]) - else: - logger.debug( - f"Not consuming file {file}: File has moved.") +def 
_tags_from_path(filepath): + """Walk up the directory tree from filepath to CONSUMPTION_DIr + and get or create Tag IDs for every directory. + """ + tag_ids = set() + path_parts = Path(filepath).relative_to( + settings.CONSUMPTION_DIR).parent.parts + for part in path_parts: + tag_ids.add(Tag.objects.get_or_create( + slug=slugify(part), + defaults={"name": part}, + )[0].pk) + return tag_ids + + +def _consume(filepath): + if not os.path.isfile(filepath): + logger.debug( + f"Not consuming file {filepath}: File has moved.") + return + + tag_ids = None + try: + if settings.CONSUMER_SUBDIRS_AS_TAGS: + tag_ids = _tags_from_path(filepath) + except Exception as e: + logger.error( + "Error creating tags from path: {}".format(e)) + + try: + async_task("documents.tasks.consume_file", + filepath, + override_tag_ids=tag_ids if tag_ids else None, + task_name=os.path.basename(filepath)[:100]) except Exception as e: # Catch all so that the consumer won't crash. # This is also what the test case is listening for to check for @@ -94,6 +123,7 @@ class Command(BaseCommand): def handle(self, *args, **options): directory = options["directory"] + recursive = settings.CONSUMER_RECURSIVE if not directory: raise CommandError( @@ -104,24 +134,30 @@ class Command(BaseCommand): raise CommandError( f"Consumption directory {directory} does not exist") - for entry in os.scandir(directory): - _consume(entry.path) + if recursive: + for dirpath, _, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + _consume(filepath) + else: + for entry in os.scandir(directory): + _consume(entry.path) if options["oneshot"]: return if settings.CONSUMER_POLLING == 0 and INotify: - self.handle_inotify(directory) + self.handle_inotify(directory, recursive) else: - self.handle_polling(directory) + self.handle_polling(directory, recursive) logger.debug("Consumer exiting.") - def handle_polling(self, directory): + def handle_polling(self, directory, recursive): 
logging.getLogger(__name__).info( f"Polling directory for changes: {directory}") self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING) - self.observer.schedule(Handler(), directory, recursive=False) + self.observer.schedule(Handler(), directory, recursive=recursive) self.observer.start() try: while self.observer.is_alive(): @@ -132,18 +168,26 @@ class Command(BaseCommand): self.observer.stop() self.observer.join() - def handle_inotify(self, directory): + def handle_inotify(self, directory, recursive): logging.getLogger(__name__).info( f"Using inotify to watch directory for changes: {directory}") inotify = INotify() - descriptor = inotify.add_watch( - directory, flags.CLOSE_WRITE | flags.MOVED_TO) + inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO + if recursive: + descriptor = inotify.add_watch_recursive(directory, inotify_flags) + else: + descriptor = inotify.add_watch(directory, inotify_flags) + try: while not self.stop_flag: for event in inotify.read(timeout=1000, read_delay=1000): - file = os.path.join(directory, event.name) - _consume(file) + if recursive: + path = inotify.get_path(event.wd) + else: + path = directory + filepath = os.path.join(path, event.name) + _consume(filepath) except KeyboardInterrupt: pass diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index aed824926..c56b49a41 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -7,8 +7,9 @@ from unittest import mock from django.conf import settings from django.core.management import call_command, CommandError -from django.test import override_settings, TestCase +from django.test import override_settings, TransactionTestCase +from documents.models import Tag from documents.consumer import ConsumerError from documents.management.commands import document_consumer from documents.tests.utils import DirectoriesMixin @@ -33,7 +34,7 @@ def chunked(size, source): yield 
source[i:i+size] -class TestConsumer(DirectoriesMixin, TestCase): +class TestConsumer(DirectoriesMixin, TransactionTestCase): sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") @@ -126,6 +127,43 @@ class TestConsumer(DirectoriesMixin, TestCase): def test_consume_existing_file_polling(self): self.test_consume_existing_file() + @override_settings(CONSUMER_RECURSIVE=1) + @override_settings(CONSUMER_SUBDIRS_AS_TAGS=1) + def test_consume_file_with_path_tags(self): + + tag_names = ("existingTag", "Space Tag") + # Create a Tag prior to consuming a file using it in path + tag_ids = [Tag.objects.create(name=tag_names[0]).pk,] + + self.t_start() + + path = os.path.join(self.dirs.consumption_dir, *tag_names) + os.makedirs(path, exist_ok=True) + f = os.path.join(path, "my_file.pdf") + # Wait at least inotify read_delay for recursive watchers + # to be created for the new directories + sleep(1) + shutil.copy(self.sample_file, f) + + self.wait_for_task_mock_call() + + self.task_mock.assert_called_once() + + # Add the pk of the Tag created by _consume() + tag_ids.append(Tag.objects.get(name=tag_names[1]).pk) + + args, kwargs = self.task_mock.call_args + self.assertEqual(args[1], f) + + # assertCountEqual has a bad name, but test that the first + # sequence contains the same elements as second, regardless of + # their order. 
+ self.assertCountEqual(kwargs["override_tag_ids"], tag_ids) + + @override_settings(CONSUMER_POLLING=1) + def test_consume_file_with_path_tags_polling(self): + self.test_consume_file_with_path_tags() + @mock.patch("documents.management.commands.document_consumer.logger.error") def test_slow_write_pdf(self, error_logger): diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 4847d7bce..648df23f5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -331,6 +331,15 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0)) CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") +# Consume from subdirectories of CONSUMPTION_DIR as well +CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE") + +# Set the names of subdirectories as tags for consumed files. +# E.g. $CONSUMPTION_DIR/foo/bar/file.pdf will add the tags "foo" and "bar" to +# the consumed file. +# PAPERLESS_CONSUMER_RECURSIVE must be enabled for this to work. +CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") + OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) From daf54a334fbae4d0dd264f20726ada229df7c35f Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 16:30:24 +0100 Subject: [PATCH 2/7] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ef088ef9e..5c1953469 100644 --- a/README.md +++ b/README.md @@ -45,16 +45,16 @@ For a complete list of changes, check out the [changelog](https://paperless-ng.r # Roadmap for 1.0 +- Make the front end nice (except mobile). - Test coverage at 90%. - Store archived documents with an embedded OCR text layer, while keeping originals available. Making good progress in the `feature-ocrmypdf` branch. -- Fix whatever bugs I and you find +- Fix whatever bugs I and you find. 
## Roadmap for versions beyond 1.0 - **More search.** The search backend is incredibly versatile and customizable. Searching is the most important feature of this project and thus, I want to implement things like: - Group and limit search results by correspondent, show “more from this” links in the results. - Ability to search for “Similar documents” in the search results - - Provide corrections for mispelled queries - **An interactive consumer** that shows its progress for documents it processes on the web page. - With live updates ans websockets. This already works on a dev branch, but requires a lot of new dependencies, which I'm not particular happy about. - Notifications when a document was added with buttons to open the new document right away. From 1b0ddd6df6d7f0942c96b301b0ac0317bcbc3d8c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 21:41:29 +0100 Subject: [PATCH 3/7] changelog --- docs/changelog.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 68198ec49..580dd7830 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,13 +10,12 @@ paperless-ng 0.9.4 * Searching: - * Paperless now supports searching by tags, types and dates. In order to have this applied to your + * Paperless now supports searching by tags, types, dates and correspondents. In order to have this applied to your existing documents, you need to perform a ``document_index reindex`` management command (see :ref:`administration-index`) - that adds the new data to the search index. You only need to do this once, so that paperless can find - your documents by tags,types and dates. Paperless keeps the index updated after that whenever - something changes. - * Paperless now has spelling corrections ("Did you mean") for misstyped queries. + that adds the data to the search index. You only need to do this once, since the schema of the search index changed. 
+ Paperless keeps the index updated after that whenever something changes. + * Paperless now has spelling corrections ("Did you mean") for mistyped queries. * The documentation contains :ref:`information about the query syntax `. * Front end: From b97fa9e3b9a0a8d1082c646c249ed558a3e18ad3 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 21:53:39 +0100 Subject: [PATCH 4/7] this change caused index optimization to fail. --- src/documents/tasks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index cd47892be..3c9baad08 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -12,9 +12,7 @@ from documents.sanity_checker import SanityFailedError def index_optimize(): - ix = index.open_index() - with AsyncWriter(ix) as writer: - writer.commit(optimize=True) + index.open_index().optimize() def index_reindex(): From e431a658cc3409ee4a347dc7020a038b9e9028d9 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 22:04:25 +0100 Subject: [PATCH 5/7] more testing. 
--- src/documents/tests/test_api.py | 4 ++-- src/documents/tests/test_consumer.py | 1 + .../tests/test_management_decrypt.py | 3 ++- src/documents/tests/test_tasks.py | 23 +++++++++++++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 src/documents/tests/test_tasks.py diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index b9f3dcfba..78a3a8ad9 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -12,10 +12,10 @@ from documents.models import Document, Correspondent, DocumentType, Tag from documents.tests.utils import DirectoriesMixin -class DocumentApiTest(DirectoriesMixin, APITestCase): +class TestDocumentApi(DirectoriesMixin, APITestCase): def setUp(self): - super(DocumentApiTest, self).setUp() + super(TestDocumentApi, self).setUp() user = User.objects.create_superuser(username="temp_admin") self.client.force_login(user=user) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 1b2e3e649..8217bf0cf 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -435,6 +435,7 @@ class TestConsumer(DirectoriesMixin, TestCase): fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir) return f + @override_settings(PAPERLESS_FILENAME_FORMAT=None) def testNormalOperation(self): filename = self.get_test_file() diff --git a/src/documents/tests/test_management_decrypt.py b/src/documents/tests/test_management_decrypt.py index 8f41e076f..f68ea7cc1 100644 --- a/src/documents/tests/test_management_decrypt.py +++ b/src/documents/tests/test_management_decrypt.py @@ -17,7 +17,8 @@ class TestDecryptDocuments(TestCase): @override_settings( ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"), THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"), - PASSPHRASE="test" + PASSPHRASE="test", + PAPERLESS_FILENAME_FORMAT=None ) 
@mock.patch("documents.management.commands.decrypt_documents.input") def test_decrypt(self, m): diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py new file mode 100644 index 000000000..d408c1e3d --- /dev/null +++ b/src/documents/tests/test_tasks.py @@ -0,0 +1,23 @@ +from datetime import datetime + +from django.test import TestCase + +from documents import tasks +from documents.models import Document +from documents.tests.utils import DirectoriesMixin + + +class TestTasks(DirectoriesMixin, TestCase): + + def test_index_reindex(self): + Document.objects.create(title="test", content="my document", checksum="wow", added=datetime.now(), created=datetime.now(), modified=datetime.now()) + + tasks.index_reindex() + + def test_index_optimize(self): + Document.objects.create(title="test", content="my document", checksum="wow", added=datetime.now(), created=datetime.now(), modified=datetime.now()) + + tasks.index_optimize() + + def test_train_classifier(self): + tasks.train_classifier() From 756c80690d613ff204035f424574c7f67761677c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 23:02:59 +0100 Subject: [PATCH 6/7] fix for the docs. --- docs/usage_overview.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/usage_overview.rst b/docs/usage_overview.rst index 4ce7f9b7a..35ca505a3 100644 --- a/docs/usage_overview.rst +++ b/docs/usage_overview.rst @@ -176,20 +176,20 @@ further. Matching documents with logical expressions: -.. code:: none +.. code:: shopname AND (product1 OR product2) Matching specific tags, correspondents or types: -.. code:: none +.. code:: type:invoice tag:unpaid correspondent:university certificate Matching dates: -.. code:: none +.. code:: created:[2005 to 2009] added:yesterday @@ -197,7 +197,7 @@ Matching dates: Matching inexact words: -.. code:: none +.. 
code:: produ*name From d58706a34b8f112a4109e0db77877a2e0cef302e Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 30 Nov 2020 23:45:21 +0100 Subject: [PATCH 7/7] pipfile update. --- Pipfile.lock | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 918609845..c6f4a2773 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d6432a18280c092c108e998f00bcd377c0c55ef18f26cb0b8eb64f9618b9f383" + "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62" }, "pipfile-spec": 6, "requires": { @@ -39,10 +39,10 @@ }, "blessed": { "hashes": [ - "sha256:7d4914079a6e8e14fbe080dcaf14dee596a088057cdc598561080e3266123b48", - "sha256:81125aa5b84cb9dfc09ff451886f64b4b923b75c5eaf51fde9d1c48a135eb797" + "sha256:0a74a8d3f0366db600d061273df77d44f0db07daade7bb7a4d49c8bc22ed9f74", + "sha256:580429e7e0c6f6a42ea81b0ae5a4993b6205c6ccbb635d034b4277af8175753e" ], - "version": "==1.17.11" + "version": "==1.17.12" }, "dateparser": { "hashes": [ @@ -70,11 +70,11 @@ }, "django-extensions": { "hashes": [ - "sha256:6809c89ca952f0e08d4e0766bc0101dfaf508d7649aced1180c091d737046ea7", - "sha256:dc663652ac9460fd06580a973576820430c6d428720e874ae46b041fa63e0efa" + "sha256:7cd002495ff0a0e5eb6cdd6be759600905b4e4079232ea27618fc46bdd853651", + "sha256:c7f88625a53f631745d4f2bef9ec4dcb999ed59476393bdbbe99db8596778846" ], "index": "pypi", - "version": "==3.0.9" + "version": "==3.1.0" }, "django-filter": { "hashes": [ @@ -136,9 +136,17 @@ "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", "sha256:854f9ac752cc1fcff6ca34e9d3d875c9a94c9b7d6eb377f63be2d481a566c6ee" ], - "index": "pypi", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.3.5" }, + "inotifyrecursive": { + "hashes": [ + "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", + 
"sha256:a2c450b317693e4538416f90eb1d7858506dafe6b8b885037bd2dd9ae2dafa1e" + ], + "index": "pypi", + "version": "==0.3.5" + }, "joblib": { "hashes": [ "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", @@ -499,11 +507,11 @@ }, "watchdog": { "hashes": [ - "sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8", - "sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04" + "sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3", + "sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.10.4" }, "wcwidth": { "hashes": [ @@ -673,11 +681,11 @@ }, "faker": { "hashes": [ - "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997", - "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e" + "sha256:2ba20a4438429cb08d729175d7bb0435ef3c2c4cedc7b1ceb703ee6da8dad906", + "sha256:6279746aed175a693108238e6d1ab8d7e26d0ec7ff8474f61025b9fdaae15d65" ], "markers": "python_version >= '3.5'", - "version": "==4.17.1" + "version": "==4.18.0" }, "filelock": { "hashes": [ @@ -780,11 +788,11 @@ }, "packaging": { "hashes": [ - "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8", - "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181" + "sha256:05af3bb85d320377db281cf254ab050e1a7ebcbf5410685a9a407e18a1f81236", + "sha256:eb41423378682dadb7166144a4926e443093863024de508ca5c9737d6bc08376" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.4" + "version": "==20.7" }, "pluggy": { "hashes": [