From d1a57b5d68f1ceb17a04560a4e65b3ba61acf045 Mon Sep 17 00:00:00 2001 From: Ovv Date: Sat, 24 Feb 2018 20:32:19 +0100 Subject: [PATCH 1/8] Configuration cli argument for document_consumer --- src/documents/consumer.py | 27 +++++------ src/documents/forms.py | 2 +- src/documents/mail.py | 5 +- .../management/commands/document_consumer.py | 47 +++++++++++-------- 4 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 74e85b9f0..1d5036318 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -32,31 +32,30 @@ class Consumer: 5. Delete the document and image(s) """ - SCRATCH = settings.SCRATCH_DIR - CONSUME = settings.CONSUMPTION_DIR - - def __init__(self): + def __init__(self, consume=settings.CONSUMPTION_DIR, scratch=settings.SCRATCH_DIR): self.logger = logging.getLogger(__name__) self.logging_group = None + self.stats = {} + self._ignore = [] + self.consume = consume + self.scratch = scratch + try: - os.makedirs(self.SCRATCH) + os.makedirs(self.scratch) except FileExistsError: pass - self.stats = {} - self._ignore = [] - - if not self.CONSUME: + if not self.consume: raise ConsumerError( "The CONSUMPTION_DIR settings variable does not appear to be " "set." ) - if not os.path.exists(self.CONSUME): + if not os.path.exists(self.consume): raise ConsumerError( - "Consumption directory {} does not exist".format(self.CONSUME)) + "Consumption directory {} does not exist".format(self.consume)) self.parsers = [] for response in document_consumer_declaration.send(self): @@ -73,11 +72,11 @@ class Consumer: "group": self.logging_group }) - def consume(self): + def run(self): - for doc in os.listdir(self.CONSUME): + for doc in os.listdir(self.consume): - doc = os.path.join(self.CONSUME, doc) + doc = os.path.join(self.consume, doc) if not os.path.isfile(doc): continue diff --git a/src/documents/forms.py b/src/documents/forms.py index 28b30df3e..5f965a1c6 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -92,7 +92,7 @@ class UploadForm(forms.Form): t = int(mktime(datetime.now().timetuple())) file_name = os.path.join( - Consumer.CONSUME, + settings.CONSUMPTION_DIR, "{} - {}.{}".format(correspondent, title, self._file_type) ) diff --git a/src/documents/mail.py b/src/documents/mail.py index d41b952ac..d63e04beb 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -151,7 +151,7 @@ class Attachment(object): class MailFetcher(Loggable): - def __init__(self): + def __init__(self, consume=settings.CONSUMPTION_DIR): Loggable.__init__(self) @@ -165,6 +165,7 @@ class MailFetcher(Loggable): self._enabled = bool(self._host) self.last_checked = datetime.datetime.now() + self.consume = consume def pull(self): """ @@ -185,7 +186,7 @@ class MailFetcher(Loggable): self.log("info", 'Storing email: "{}"'.format(message.subject)) t = int(time.mktime(message.time.timetuple())) - file_name = os.path.join(Consumer.CONSUME, message.file_name) + file_name = os.path.join(self.consume, message.file_name) with open(file_name, "wb") as f: f.write(message.attachment.data) os.utime(file_name, times=(t, t)) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index f50b489ee..eb3e38a2e 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -16,9 +16,6 @@ class Command(BaseCommand): consumption directory, and fetch any mail available. """ - LOOP_TIME = settings.CONSUMER_LOOP_TIME - MAIL_DELTA = datetime.timedelta(minutes=10) - ORIGINAL_DOCS = os.path.join(settings.MEDIA_ROOT, "documents", "originals") THUMB_DOCS = os.path.join(settings.MEDIA_ROOT, "documents", "thumbnails") @@ -32,13 +29,22 @@ class Command(BaseCommand): BaseCommand.__init__(self, *args, **kwargs) + def add_arguments(self, parser): + parser.add_argument("directory", default=settings.CONSUMPTION_DIR, nargs='?') + parser.add_argument("--loop-time", default=settings.CONSUMER_LOOP_TIME, type=int) + parser.add_argument("--mail-delta", default=10, type=int) + parser.add_argument("--oneshot", action='store_true') + def handle(self, *args, **options): self.verbosity = options["verbosity"] + directory = options['directory'] + loop_time = options['loop_time'] + mail_delta = datetime.timedelta(minutes=options['mail_delta']) try: - self.file_consumer = Consumer() - self.mail_fetcher = MailFetcher() + self.file_consumer = Consumer(consume=directory) + self.mail_fetcher = MailFetcher(consume=directory) except (ConsumerError, MailFetcherError) as e: raise CommandError(e) @@ -49,27 +55,30 @@ class Command(BaseCommand): pass logging.getLogger(__name__).info( - "Starting document consumer at {}".format(settings.CONSUMPTION_DIR) + "Starting document consumer at {}".format(directory) ) - try: - while True: - self.loop() - time.sleep(self.LOOP_TIME) - if self.verbosity > 1: - print(".") - except KeyboardInterrupt: - print("Exiting") + if options['oneshot']: + self.loop(mail_delta=mail_delta) + else: + try: + while True: + self.loop(mail_delta=mail_delta) + time.sleep(loop_time) + if self.verbosity > 1: + print(".", int(time.time())) + except KeyboardInterrupt: + print("Exiting") - def loop(self): - - # Consume whatever files we can - self.file_consumer.consume() + def loop(self, mail_delta): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to the config file. - delta = self.mail_fetcher.last_checked + self.MAIL_DELTA + delta = self.mail_fetcher.last_checked + mail_delta if self.first_iteration or delta < datetime.datetime.now(): self.first_iteration = False self.mail_fetcher.pull() + + # Consume whatever files we can + self.file_consumer.run() From 8fefafb8442fa7e676c3265b2fe61c14c0e4127a Mon Sep 17 00:00:00 2001 From: Ovv Date: Sun, 25 Feb 2018 19:20:51 +0100 Subject: [PATCH 2/8] style & test --- src/documents/consumer.py | 3 ++- .../management/commands/document_consumer.py | 22 +++++++++++++------ src/documents/tests/test_consumer.py | 10 ++++----- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 1d5036318..7d4f2facf 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -32,7 +32,8 @@ class Consumer: 5. Delete the document and image(s) """ - def __init__(self, consume=settings.CONSUMPTION_DIR, scratch=settings.SCRATCH_DIR): + def __init__(self, consume=settings.CONSUMPTION_DIR, + scratch=settings.SCRATCH_DIR): self.logger = logging.getLogger(__name__) self.logging_group = None diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index eb3e38a2e..8efdbe54e 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -30,17 +30,25 @@ class Command(BaseCommand): BaseCommand.__init__(self, *args, **kwargs) def add_arguments(self, parser): - parser.add_argument("directory", default=settings.CONSUMPTION_DIR, nargs='?') - parser.add_argument("--loop-time", default=settings.CONSUMER_LOOP_TIME, type=int) + parser.add_argument( + "directory", + default=settings.CONSUMPTION_DIR, + nargs="?" + ) + parser.add_argument( + "--loop-time", + default=settings.CONSUMER_LOOP_TIME, + type=int + ) parser.add_argument("--mail-delta", default=10, type=int) - parser.add_argument("--oneshot", action='store_true') + parser.add_argument("--oneshot", action="store_true") def handle(self, *args, **options): self.verbosity = options["verbosity"] - directory = options['directory'] - loop_time = options['loop_time'] - mail_delta = datetime.timedelta(minutes=options['mail_delta']) + directory = options["directory"] + loop_time = options["loop_time"] + mail_delta = datetime.timedelta(minutes=options["mail_delta"]) try: self.file_consumer = Consumer(consume=directory) @@ -58,7 +66,7 @@ class Command(BaseCommand): "Starting document consumer at {}".format(directory) ) - if options['oneshot']: + if options["oneshot"]: self.loop(mail_delta=mail_delta) else: try: diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index b7fca45e7..edb2e8754 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -16,7 +16,6 @@ class TestConsumer(TestCase): self.DummyParser ) - @mock.patch("documents.consumer.Consumer.CONSUME") @mock.patch("documents.consumer.os.makedirs") @mock.patch("documents.consumer.os.path.exists", return_value=True) @mock.patch("documents.consumer.document_consumer_declaration.send") @@ -33,17 +32,16 @@ class TestConsumer(TestCase): (None, lambda _: {"weight": 1, "parser": DummyParser2}), ) - self.assertEqual(Consumer()._get_parser_class("doc.pdf"), DummyParser2) + self.assertEqual(Consumer(consume=".")._get_parser_class("doc.pdf"), + DummyParser2) - @mock.patch("documents.consumer.Consumer.CONSUME") @mock.patch("documents.consumer.os.makedirs") @mock.patch("documents.consumer.os.path.exists", return_value=True) @mock.patch("documents.consumer.document_consumer_declaration.send") def test__get_parser_class_0_parsers(self, m, *args): m.return_value = ((None, lambda _: None),) - self.assertIsNone(Consumer()._get_parser_class("doc.pdf")) + self.assertIsNone(Consumer(consume=".")._get_parser_class("doc.pdf")) - @mock.patch("documents.consumer.Consumer.CONSUME") @mock.patch("documents.consumer.os.makedirs") @mock.patch("documents.consumer.os.path.exists", return_value=True) @mock.patch("documents.consumer.document_consumer_declaration.send") @@ -51,7 +49,7 @@ class TestConsumer(TestCase): m.return_value = ( (None, lambda _: {"weight": 0, "parser": self.DummyParser}), ) - return Consumer() + return Consumer(consume=".") class TestAttributes(TestCase): From f8c6c07bb75d9cb7f1b71559c3b525e999a97b25 Mon Sep 17 00:00:00 2001 From: Ovv Date: Sun, 25 Feb 2018 19:42:03 +0100 Subject: [PATCH 3/8] use tmp dir --- src/documents/tests/test_consumer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index edb2e8754..9f2445340 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,5 +1,6 @@ from django.test import TestCase from unittest import mock +from tempfile import TemporaryDirectory from ..consumer import Consumer from ..models import FileInfo @@ -31,16 +32,21 @@ class TestConsumer(TestCase): (None, lambda _: {"weight": 0, "parser": DummyParser1}), (None, lambda _: {"weight": 1, "parser": DummyParser2}), ) - - self.assertEqual(Consumer(consume=".")._get_parser_class("doc.pdf"), - DummyParser2) + with TemporaryDirectory() as tmpdir: + self.assertEqual( + Consumer(consume=tmpdir)._get_parser_class("doc.pdf"), + DummyParser2 + ) @mock.patch("documents.consumer.os.makedirs") @mock.patch("documents.consumer.os.path.exists", return_value=True) @mock.patch("documents.consumer.document_consumer_declaration.send") def test__get_parser_class_0_parsers(self, m, *args): m.return_value = ((None, lambda _: None),) - self.assertIsNone(Consumer(consume=".")._get_parser_class("doc.pdf")) + with TemporaryDirectory() as tmpdir: + self.assertIsNone( + Consumer(consume=tmpdir)._get_parser_class("doc.pdf") + ) @mock.patch("documents.consumer.os.makedirs") @mock.patch("documents.consumer.os.path.exists", return_value=True) @@ -49,7 +55,8 @@ class TestConsumer(TestCase): m.return_value = ( (None, lambda _: {"weight": 0, "parser": self.DummyParser}), ) - return Consumer(consume=".") + with TemporaryDirectory() as tmpdir: + return Consumer(consume=tmpdir) class TestAttributes(TestCase): From 7a1754fffd5974bf48b7e5518a917a43f267b571 Mon Sep 17 00:00:00 2001 From: Ovv Date: Sun, 25 Feb 2018 19:47:04 +0100 Subject: [PATCH 4/8] remove consume env var from pytest.ini --- src/setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/src/setup.cfg b/src/setup.cfg index 1112934c0..f9572519b 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -6,7 +6,6 @@ exclude = migrations, paperless/settings.py, .tox DJANGO_SETTINGS_MODULE=paperless.settings addopts = --pythonwarnings=all -n auto env = - PAPERLESS_CONSUME=/tmp PAPERLESS_PASSPHRASE=THISISNOTASECRET PAPERLESS_SECRET=paperless PAPERLESS_EMAIL_SECRET=paperless From f56dafe7d946eb8731c7ba7043310e88459b5a7a Mon Sep 17 00:00:00 2001 From: Ovv Date: Mon, 26 Feb 2018 18:52:46 +0100 Subject: [PATCH 5/8] Help & documentation --- docs/utilities.rst | 7 +++++++ .../management/commands/document_consumer.py | 19 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 2de6e2a01..265e2e78d 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -80,6 +80,13 @@ you'll need to have it start in the background -- something you'll need to figure out for your own system. To get you started though, there are Systemd service files in the ``scripts`` directory. +Some command line argument are available to customize the behavior of the +consumer. By default it will use ``/etc/paperless.conf`` values. Display the +help with: + +.. code-block:: shell-session + + $ /path/to/paperless/src/manage.py document_consumer --help .. _utilities-exporter: diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 8efdbe54e..dbd7e3264 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -33,15 +33,26 @@ class Command(BaseCommand): parser.add_argument( "directory", default=settings.CONSUMPTION_DIR, - nargs="?" + nargs="?", + help="The consumption directory." ) parser.add_argument( "--loop-time", default=settings.CONSUMER_LOOP_TIME, - type=int + type=int, + help="Wait time between each loop (in seconds)." + ) + parser.add_argument( + "--mail-delta", + default=10, + type=int, + help="Wait time between each mail fetch (in minutes)." + ) + parser.add_argument( + "--oneshot", + action="store_true", + help="Run only once." ) - parser.add_argument("--mail-delta", default=10, type=int) - parser.add_argument("--oneshot", action="store_true") def handle(self, *args, **options): From 5c430416106fb223a8e9b53f00e2946343261ae0 Mon Sep 17 00:00:00 2001 From: Ovv Date: Tue, 27 Feb 2018 12:22:18 +0100 Subject: [PATCH 6/8] fix typo --- docs/utilities.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 265e2e78d..ca5af6a3f 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -80,7 +80,7 @@ you'll need to have it start in the background -- something you'll need to figure out for your own system. To get you started though, there are Systemd service files in the ``scripts`` directory. -Some command line argument are available to customize the behavior of the +Some command line arguments are available to customize the behavior of the consumer. By default it will use ``/etc/paperless.conf`` values. Display the help with: From 73e62600c2a5e85ac2d037289551fb6081c7895c Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 3 Mar 2018 18:42:27 +0000 Subject: [PATCH 7/8] Clean up docstring to be properly rst --- src/documents/consumer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 7d4f2facf..886b0dd69 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -226,8 +226,8 @@ class Consumer: def _is_ready(self, doc): """ - Detect whether `doc` is ready to consume or if it's still being written - to by the uploader. + Detect whether ``doc`` is ready to consume or if it's still being + written to by the uploader. """ t = os.stat(doc).st_mtime From d0252e8e44a930b9c9ef9ba3b7f7d3a25b2160d3 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 3 Mar 2018 18:42:41 +0000 Subject: [PATCH 8/8] Run a --oneshot loop twice This was necessary since the first loop only ever collects file statistics so that the second run can be sure about "readiness". --- src/documents/management/commands/document_consumer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index dbd7e3264..f94265b65 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -99,5 +99,7 @@ class Command(BaseCommand): self.first_iteration = False self.mail_fetcher.pull() - # Consume whatever files we can - self.file_consumer.run() + # Consume whatever files we can. + # We have to run twice as the first run checks for file readiness + for i in range(2): + self.file_consumer.run()