From aa2fc84d7f8cc39d24ad0efafabc203408943523 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:09 +0200 Subject: [PATCH 01/18] Mail fetching: Only catch internal errors Previously, all errors raised during mail fetching were silently caught and printed without backtrace. To increase robustness and ease debugging, we now fail with a backtrace on unexpected errors. --- src/documents/mail.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index d2828a57c..7cc417bfd 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -20,7 +20,7 @@ class MailFetcherError(Exception): pass -class InvalidMessageError(Exception): +class InvalidMessageError(MailFetcherError): pass @@ -205,7 +205,7 @@ class MailFetcher(Loggable): self._connection.close() self._connection.logout() - except Exception as e: + except MailFetcherError as e: self.log("error", str(e)) return r From 4babfa1a5b286aeb144d852f198dbb6771d6dd3d Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:10 +0200 Subject: [PATCH 02/18] Set default empty PAPERLESS_EMAIL_SECRET Previously, if the user didn't set PAPERLESS_EMAIL_SECRET, Paperless failed with an error in check_body() because self.SECRET was None. --- src/documents/mail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 7cc417bfd..f1a84d8e0 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -42,7 +42,7 @@ class Message(Loggable): and n attachments, and that we don't care about the message body. 
""" - SECRET = os.getenv("PAPERLESS_EMAIL_SECRET") + SECRET = os.getenv("PAPERLESS_EMAIL_SECRET", "") def __init__(self, data, group=None): """ From ea287e0db27494db73fe266a85bed3bbdcba0627 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:11 +0200 Subject: [PATCH 03/18] Fix list out of bounds error in mail message parsing Check list length before accessing the first two elements of 'dispositions'. The list may have only a single element ('inline') or may be empty in mailformed emails. --- src/documents/mail.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index f1a84d8e0..1be62527d 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -75,8 +75,9 @@ class Message(Loggable): continue dispositions = content_disposition.strip().split(";") - if not dispositions[0].lower() == "attachment" and \ - "filename" not in dispositions[1].lower(): + if len(dispositions) < 2 or \ + (not dispositions[0].lower() == "attachment" and + "filename" not in dispositions[1].lower()): continue file_data = part.get_payload() From 873c98dddb6c64b9f80a50269ff5fa3ce8f26d60 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:12 +0200 Subject: [PATCH 04/18] Refactor: extract fn 'make_dirs' --- src/documents/consumer.py | 13 +++++++++---- .../management/commands/document_consumer.py | 8 ++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 886b0dd69..fc8635c95 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -43,10 +43,7 @@ class Consumer: self.consume = consume self.scratch = scratch - try: - os.makedirs(self.scratch) - except FileExistsError: - pass + make_dirs(self.scratch) if not self.consume: raise ConsumerError( @@ -245,3 +242,11 @@ class Consumer: with open(doc, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() return 
Document.objects.filter(checksum=checksum).exists() + + +def make_dirs(*dirs): + for dir in dirs: + try: + os.makedirs(dir) + except FileExistsError: + pass diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index f94265b65..ae8ff7e35 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -6,7 +6,7 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...consumer import Consumer, ConsumerError +from ...consumer import Consumer, ConsumerError, make_dirs from ...mail import MailFetcher, MailFetcherError @@ -67,11 +67,7 @@ class Command(BaseCommand): except (ConsumerError, MailFetcherError) as e: raise CommandError(e) - for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS): - try: - os.makedirs(path) - except FileExistsError: - pass + make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) logging.getLogger(__name__).info( "Starting document consumer at {}".format(directory) From 2fe7df8ca06180dbb6e99efedde300abe34ed995 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:13 +0200 Subject: [PATCH 05/18] Consume documents in order of increasing mtime This increases overall usability, especially for multi-page scans. 
Previously, the consumption order was undefined (see os.listdir()) --- src/documents/consumer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fc8635c95..3d7cc7bd1 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -71,8 +71,11 @@ class Consumer: }) def run(self): + docs = [os.path.join(self.consume, entry) + for entry in os.listdir(self.consume)] + docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) - for doc in os.listdir(self.consume): + for doc in docs_old_to_new: doc = os.path.join(self.consume, doc) From a56a3eb86d7c2b665be421ba5df4828b8b3e9ec3 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:14 +0200 Subject: [PATCH 06/18] Use os.scandir instead of os.listdir It's simpler and better suited for use cases introduced in later commits. --- src/documents/consumer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 3d7cc7bd1..6f9273b03 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -71,8 +71,7 @@ class Consumer: }) def run(self): - docs = [os.path.join(self.consume, entry) - for entry in os.listdir(self.consume)] + docs = [entry.path for entry in os.scandir(self.consume)] docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) for doc in docs_old_to_new: From f018e8e54f03c3955158147b4561685acf60ab74 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:15 +0200 Subject: [PATCH 07/18] Refactor: extract fn try_consume_file The main purpose of this change is to make the following commits more readable. 
--- src/documents/consumer.py | 126 +++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 6f9273b03..d1d839e4d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -75,80 +75,82 @@ class Consumer: docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) for doc in docs_old_to_new: + self.try_consume_file(doc) - doc = os.path.join(self.consume, doc) + def try_consume_file(self, doc): + doc = os.path.join(self.consume, doc) - if not os.path.isfile(doc): - continue + if not os.path.isfile(doc): + return - if not re.match(FileInfo.REGEXES["title"], doc): - continue + if not re.match(FileInfo.REGEXES["title"], doc): + return - if doc in self._ignore: - continue + if doc in self._ignore: + return - if not self._is_ready(doc): - continue + if not self._is_ready(doc): + return - if self._is_duplicate(doc): - self.log( - "info", - "Skipping {} as it appears to be a duplicate".format(doc) - ) - self._ignore.append(doc) - continue + if self._is_duplicate(doc): + self.log( + "info", + "Skipping {} as it appears to be a duplicate".format(doc) + ) + self._ignore.append(doc) + return - parser_class = self._get_parser_class(doc) - if not parser_class: - self.log( - "error", "No parsers could be found for {}".format(doc)) - self._ignore.append(doc) - continue + parser_class = self._get_parser_class(doc) + if not parser_class: + self.log( + "error", "No parsers could be found for {}".format(doc)) + self._ignore.append(doc) + return - self.logging_group = uuid.uuid4() + self.logging_group = uuid.uuid4() - self.log("info", "Consuming {}".format(doc)) + self.log("info", "Consuming {}".format(doc)) - document_consumption_started.send( - sender=self.__class__, - filename=doc, - logging_group=self.logging_group + document_consumption_started.send( + sender=self.__class__, + filename=doc, + logging_group=self.logging_group + ) + + parsed_document = 
parser_class(doc) + + try: + thumbnail = parsed_document.get_thumbnail() + date = parsed_document.get_date() + document = self._store( + parsed_document.get_text(), + doc, + thumbnail, + date + ) + except ParseError as e: + + self._ignore.append(doc) + self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) + parsed_document.cleanup() + + return + + else: + + parsed_document.cleanup() + self._cleanup_doc(doc) + + self.log( + "info", + "Document {} consumption finished".format(document) ) - parsed_document = parser_class(doc) - - try: - thumbnail = parsed_document.get_thumbnail() - date = parsed_document.get_date() - document = self._store( - parsed_document.get_text(), - doc, - thumbnail, - date - ) - except ParseError as e: - - self._ignore.append(doc) - self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) - parsed_document.cleanup() - - continue - - else: - - parsed_document.cleanup() - self._cleanup_doc(doc) - - self.log( - "info", - "Document {} consumption finished".format(document) - ) - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group - ) + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group + ) def _get_parser_class(self, doc): """ From 61cd050e241984cdffe1f4698461f9802f120d0e Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:16 +0200 Subject: [PATCH 08/18] Ensure docs have been unmodified for some time before consuming Previously, the second mtime check for new files usually happened right after the first one, which could have caused consumption of docs that were still being modified. We're now waiting for at least FILES_MIN_UNMODIFIED_DURATION (0.5s). This also cleans up the logic by eliminating the consumer.stats attribute and the weird double call to consumer.run(). 
Additionally, this also fixes a memory leak in consumer.stats where paths could be added but never removed if the corresponding files disappeared from the consumer dir before being considered ready. --- src/documents/consumer.py | 62 +++++++++---------- .../management/commands/document_consumer.py | 5 +- 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d1d839e4d..514406646 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -3,8 +3,10 @@ import hashlib import logging import os import re +import time import uuid +from operator import itemgetter from django.conf import settings from django.utils import timezone from paperless.db import GnuPG @@ -32,13 +34,16 @@ class Consumer: 5. Delete the document and image(s) """ + # Files are considered ready for consumption if they have been unmodified + # for this duration + FILES_MIN_UNMODIFIED_DURATION = 0.5 + def __init__(self, consume=settings.CONSUMPTION_DIR, scratch=settings.SCRATCH_DIR): self.logger = logging.getLogger(__name__) self.logging_group = None - self.stats = {} self._ignore = [] self.consume = consume self.scratch = scratch @@ -70,27 +75,34 @@ class Consumer: "group": self.logging_group }) - def run(self): - docs = [entry.path for entry in os.scandir(self.consume)] - docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) + def consume_new_files(self): + """ + Find non-ignored files in consumption dir and consume them if they have + been unmodified for FILES_MIN_UNMODIFIED_DURATION. 
+ """ + files = [] + for entry in os.scandir(self.consume): + if entry.is_file() and entry.path not in self._ignore: + files.append((entry.path, entry.stat().st_mtime)) - for doc in docs_old_to_new: - self.try_consume_file(doc) - - def try_consume_file(self, doc): - doc = os.path.join(self.consume, doc) - - if not os.path.isfile(doc): + if not files: return - if not re.match(FileInfo.REGEXES["title"], doc): - return + files_old_to_new = sorted(files, key=itemgetter(1)) - if doc in self._ignore: - return + time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) - if not self._is_ready(doc): - return + for file, mtime in files_old_to_new: + if mtime == os.path.getmtime(file): + # File has not been modified and can be consumed + self.try_consume_file(file) + + def try_consume_file(self, file): + + if not re.match(FileInfo.REGEXES["title"], file): + return False + + doc = file if self._is_duplicate(doc): self.log( @@ -225,22 +237,6 @@ class Consumer: self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) - def _is_ready(self, doc): - """ - Detect whether ``doc`` is ready to consume or if it's still being - written to by the uploader. - """ - - t = os.stat(doc).st_mtime - - if self.stats.get(doc) == t: - del(self.stats[doc]) - return True - - self.stats[doc] = t - - return False - @staticmethod def _is_duplicate(doc): with open(doc, "rb") as f: diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ae8ff7e35..4aec489b6 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -95,7 +95,4 @@ class Command(BaseCommand): self.first_iteration = False self.mail_fetcher.pull() - # Consume whatever files we can. 
- # We have to run twice as the first run checks for file readiness - for i in range(2): - self.file_consumer.run() + self.file_consumer.consume_new_files() From 12488c963489a9a7b5b1f8aeca377bff477c227f Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:17 +0200 Subject: [PATCH 09/18] Simplify ignoring docs --- src/documents/consumer.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 514406646..e895593a5 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -95,9 +95,11 @@ class Consumer: for file, mtime in files_old_to_new: if mtime == os.path.getmtime(file): # File has not been modified and can be consumed - self.try_consume_file(file) + if not self.try_consume_file(file): + self._ignore.append(file) def try_consume_file(self, file): + "Return True if file was consumed" if not re.match(FileInfo.REGEXES["title"], file): return False @@ -109,15 +111,13 @@ class Consumer: "info", "Skipping {} as it appears to be a duplicate".format(doc) ) - self._ignore.append(doc) - return + return False parser_class = self._get_parser_class(doc) if not parser_class: self.log( "error", "No parsers could be found for {}".format(doc)) - self._ignore.append(doc) - return + return False self.logging_group = uuid.uuid4() @@ -141,15 +141,10 @@ class Consumer: date ) except ParseError as e: - - self._ignore.append(doc) self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) parsed_document.cleanup() - - return - + return False else: - parsed_document.cleanup() self._cleanup_doc(doc) @@ -163,6 +158,7 @@ class Consumer: document=document, logging_group=self.logging_group ) + return True def _get_parser_class(self, doc): """ From e65e27d11f8bac0631eacd7d85cfdbdd58079c74 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:18 +0200 Subject: [PATCH 10/18] Consider mtime of ignored files, garbage-collect ignore list 1. 
Store the mtime of ignored files so that we can reconsider them if they have changed. 2. Regularly reset the ignore list to files that still exist in the consumption dir. Previously, the list could grow indefinitely. --- src/documents/consumer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index e895593a5..cca7c1c13 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -80,14 +80,23 @@ class Consumer: Find non-ignored files in consumption dir and consume them if they have been unmodified for FILES_MIN_UNMODIFIED_DURATION. """ + ignored_files = [] files = [] for entry in os.scandir(self.consume): - if entry.is_file() and entry.path not in self._ignore: - files.append((entry.path, entry.stat().st_mtime)) + if entry.is_file(): + file = (entry.path, entry.stat().st_mtime) + if file in self._ignore: + ignored_files.append(file) + else: + files.append(file) if not files: return + # Set _ignore to only include files that still exist. + # This keeps it from growing indefinitely. + self._ignore[:] = ignored_files + files_old_to_new = sorted(files, key=itemgetter(1)) time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) @@ -96,7 +105,7 @@ class Consumer: if mtime == os.path.getmtime(file): # File has not been modified and can be consumed if not self.try_consume_file(file): - self._ignore.append(file) + self._ignore.append((file, mtime)) def try_consume_file(self, file): "Return True if file was consumed" From bd75a65866756eb97ecb785ea91cdf1c1be5d9ee Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:19 +0200 Subject: [PATCH 11/18] Refactor: renamings, extract fn 'loop' Renamings: loop -> loop_step delta -> next_mail_time (this variable names a point in time, not a duration) Extracting the 'loop' fn is a preparation for later commits where a second type of loop is added. 
--- .../management/commands/document_consumer.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 4aec489b6..c5fe9baa2 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -74,24 +74,27 @@ class Command(BaseCommand): ) if options["oneshot"]: - self.loop(mail_delta=mail_delta) + self.loop_step(mail_delta) else: try: - while True: - self.loop(mail_delta=mail_delta) - time.sleep(loop_time) - if self.verbosity > 1: - print(".", int(time.time())) + self.loop(loop_time, mail_delta) except KeyboardInterrupt: print("Exiting") - def loop(self, mail_delta): + def loop(self, loop_time, mail_delta): + while True: + self.loop_step(mail_delta) + time.sleep(loop_time) + if self.verbosity > 1: + print(".", int(time.time())) + + def loop_step(self, mail_delta): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to the config file. - delta = self.mail_fetcher.last_checked + mail_delta - if self.first_iteration or delta < datetime.datetime.now(): + next_mail_time = self.mail_fetcher.last_checked + mail_delta + if self.first_iteration or datetime.datetime.now() > next_mail_time: self.first_iteration = False self.mail_fetcher.pull() From 7357471b9e454e2823c5c4f87c8ad5410f5560d5 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:20 +0200 Subject: [PATCH 12/18] Consumer loop: make sleep duration dynamic Make the sleep duration dynamic to account for the time spent in loop_step. This improves responsiveness when repeatedly consuming newly arriving docs. Use float epoch seconds (time.time()) as the time type for MailFetcher.last_checked to allow for natural time arithmetic. 
--- src/documents/mail.py | 4 ++-- .../management/commands/document_consumer.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 1be62527d..d974d57c5 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -161,7 +161,7 @@ class MailFetcher(Loggable): self._enabled = bool(self._host) - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() self.consume = consume def pull(self): @@ -188,7 +188,7 @@ class MailFetcher(Loggable): f.write(message.attachment.data) os.utime(file_name, times=(t, t)) - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() def _get_messages(self): diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index c5fe9baa2..9234617e1 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -59,7 +59,7 @@ class Command(BaseCommand): self.verbosity = options["verbosity"] directory = options["directory"] loop_time = options["loop_time"] - mail_delta = datetime.timedelta(minutes=options["mail_delta"]) + mail_delta = options["mail_delta"] * 60 try: self.file_consumer = Consumer(consume=directory) @@ -83,18 +83,20 @@ class Command(BaseCommand): def loop(self, loop_time, mail_delta): while True: - self.loop_step(mail_delta) - time.sleep(loop_time) + start_time = time.time() if self.verbosity > 1: - print(".", int(time.time())) + print(".", int(start_time)) + self.loop_step(mail_delta, start_time) + # Sleep until the start of the next loop step + time.sleep(max(0, start_time + loop_time - time.time())) - def loop_step(self, mail_delta): + def loop_step(self, mail_delta, time_now=None): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to 
the config file. next_mail_time = self.mail_fetcher.last_checked + mail_delta - if self.first_iteration or datetime.datetime.now() > next_mail_time: + if self.first_iteration or time_now > next_mail_time: self.first_iteration = False self.mail_fetcher.pull() From 7e1d59377aa46a1158da830119a4489fb2e97e5d Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:21 +0200 Subject: [PATCH 13/18] Add inotify support --- paperless.conf.example | 2 + requirements.txt | 1 + .../management/commands/document_consumer.py | 47 ++++++++++++++++++- src/paperless/settings.py | 2 + 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index 45c532fe1..0727ac29d 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret" #PAPERLESS_CONVERT_DENSITY=300 +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) # The number of seconds that Paperless will wait between checking # PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory # rarely, you may want to use a higher value than the default (10). 
diff --git a/requirements.txt b/requirements.txt index c90a8eaa9..d51a39039 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ flake8==3.5.0 fuzzywuzzy==0.15.0 gunicorn==19.7.1 idna==2.6 +inotify_simple==1.1.7; sys_platform == 'linux' langdetect==1.0.7 mccabe==0.6.1 more-itertools==4.1.0 diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 9234617e1..a0e2f00fe 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,6 +1,7 @@ import datetime import logging import os +import sys import time from django.conf import settings @@ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError from ...consumer import Consumer, ConsumerError, make_dirs from ...mail import MailFetcher, MailFetcherError +try: + from inotify_simple import INotify, flags +except ImportError: + pass + class Command(BaseCommand): """ @@ -53,6 +59,11 @@ class Command(BaseCommand): action="store_true", help="Run only once." ) + parser.add_argument( + "--no-inotify", + action="store_true", + help="Don't use inotify, even if it's available." 
+ ) def handle(self, *args, **options): @@ -60,6 +71,8 @@ class Command(BaseCommand): directory = options["directory"] loop_time = options["loop_time"] mail_delta = options["mail_delta"] * 60 + use_inotify = (not options["no_inotify"] + and "inotify_simple" in sys.modules) try: self.file_consumer = Consumer(consume=directory) @@ -70,14 +83,20 @@ class Command(BaseCommand): make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) logging.getLogger(__name__).info( - "Starting document consumer at {}".format(directory) + "Starting document consumer at {}{}".format( + directory, + " with inotify" if use_inotify else "" + ) ) if options["oneshot"]: self.loop_step(mail_delta) else: try: - self.loop(loop_time, mail_delta) + if use_inotify: + self.loop_inotify(mail_delta) + else: + self.loop(loop_time, mail_delta) except KeyboardInterrupt: print("Exiting") @@ -101,3 +120,27 @@ class Command(BaseCommand): self.mail_fetcher.pull() self.file_consumer.consume_new_files() + + def loop_inotify(self, mail_delta): + directory = self.file_consumer.consume + inotify = INotify() + inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) + + # Run initial mail fetch and consume all currently existing documents + self.loop_step(mail_delta) + next_mail_time = self.mail_fetcher.last_checked + mail_delta + + while True: + # Consume documents until next_mail_time + while True: + delta = next_mail_time - time.time() + if delta > 0: + for event in inotify.read(timeout=delta * 1000): + file = os.path.join(directory, event.name) + if os.path.isfile(file): + self.file_consumer.try_consume_file(file) + else: + break + + self.mail_fetcher.pull() + next_mail_time = self.mail_fetcher.last_checked + mail_delta diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 314edeb03..15e0674bc 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to 
index CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) # The number of seconds that Paperless will wait between checking # CONSUMPTION_DIR. If you tend to write documents to this directory very # slowly, you may want to use a higher value than the default. From 3e8038577d86845a62833bf6ddd53a6743d1b36d Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 00:35:33 +0200 Subject: [PATCH 14/18] fixup: break up complex if condition --- src/documents/mail.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index d974d57c5..9a1c792aa 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -75,9 +75,11 @@ class Message(Loggable): continue dispositions = content_disposition.strip().split(";") - if len(dispositions) < 2 or \ - (not dispositions[0].lower() == "attachment" and - "filename" not in dispositions[1].lower()): + if len(dispositions) < 2: + continue + + if not dispositions[0].lower() == "attachment" and \ + "filename" not in dispositions[1].lower(): continue file_data = part.get_payload() From bccac5017c291f7b821231442f6e62fc4d6abfaf Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 00:35:34 +0200 Subject: [PATCH 15/18] fixup: remove helper fn 'make_dirs' --- src/documents/consumer.py | 10 +--------- src/documents/management/commands/document_consumer.py | 5 +++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index cca7c1c13..37151d1b4 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -48,7 +48,7 @@ class Consumer: self.consume = consume self.scratch = scratch - make_dirs(self.scratch) + os.makedirs(self.scratch, exist_ok=True) if not self.consume: raise ConsumerError( @@ -247,11 +247,3 @@ class Consumer: with open(doc, "rb") as f: checksum = 
hashlib.md5(f.read()).hexdigest() return Document.objects.filter(checksum=checksum).exists() - - -def make_dirs(*dirs): - for dir in dirs: - try: - os.makedirs(dir) - except FileExistsError: - pass diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index a0e2f00fe..4bf403318 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -7,7 +7,7 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...consumer import Consumer, ConsumerError, make_dirs +from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError try: @@ -80,7 +80,8 @@ class Command(BaseCommand): except (ConsumerError, MailFetcherError) as e: raise CommandError(e) - make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) + for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS): + os.makedirs(d, exist_ok=True) logging.getLogger(__name__).info( "Starting document consumer at {}{}".format( From 0559204be450b180b7ca5997869c4f30df2cc3a3 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 12:11:56 +0200 Subject: [PATCH 16/18] fixup: require usage of PAPERLESS_EMAIL_SECRET --- src/documents/mail.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 9a1c792aa..afa1b4362 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -42,7 +42,7 @@ class Message(Loggable): and n attachments, and that we don't care about the message body. 
""" - SECRET = os.getenv("PAPERLESS_EMAIL_SECRET", "") + SECRET = os.getenv("PAPERLESS_EMAIL_SECRET") def __init__(self, data, group=None): """ @@ -162,6 +162,8 @@ class MailFetcher(Loggable): self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX") self._enabled = bool(self._host) + if self._enabled and Message.SECRET is None: + raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined") self.last_checked = time.time() self.consume = consume From 8218b1aa51fb1fe6904979aaaaef4af31983f29f Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 21:17:03 +0200 Subject: [PATCH 17/18] Documentation: Replace 'PDF' with 'document' There are more supported file formats than just PDF. --- docs/utilities.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 25dbd9e49..24565ed6b 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -49,17 +49,17 @@ The Consumer ------------ The consumer script runs in an infinite loop, constantly looking at a directory -for PDF files to parse and index. The process is pretty straightforward: +for documents to parse and index. The process is pretty straightforward: -1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not, - wait 10 seconds and try again. -2. Parse the PDF with Tesseract +1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2. + If not, wait 10 seconds and try again. +2. Parse the document with Tesseract 3. Create a new record in the database with the OCR'd text 4. Attempt to automatically assign document attributes by doing some guesswork. Read up on the :ref:`guesswork documentation` for more information about this process. -5. Encrypt the PDF and store it in the ``media`` directory under - ``documents/pdf``. +5. Encrypt the document and store it in the ``media`` directory under + ``documents/originals``. 6. Go to #1. 
@@ -74,7 +74,7 @@ The consumer is started via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_consumer -This starts the service that will run in a loop, consuming PDF files as they +This starts the service that will run in a loop, consuming documents as they appear in ``CONSUMPTION_DIR``. Note that this command runs continuously, so exiting it will mean your webserver @@ -97,8 +97,8 @@ The Exporter ------------ Tired of fiddling with Paperless, or just want to do something stupid and are -afraid of accidentally damaging your files? You can export all of your PDFs -into neatly named, dated, and unencrypted. +afraid of accidentally damaging your files? You can export all of your +documents into neatly named, dated, and unencrypted files. .. _utilities-exporter-howto: @@ -112,10 +112,10 @@ This too is done via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ -This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you -to do with as you please. The files are accompanied with a special file, -``manifest.json`` which can be used to -:ref:`import the files ` at a later date if you wish. +This will dump all of your unencrypted documents into ``/path/to/somewhere`` +for you to do with as you please. The files are accompanied with a special +file, ``manifest.json`` which can be used to :ref:`import the files +` at a later date if you wish. .. 
_utilities-exporter-howto-docker: From f96e7f789540cdfb1bcce48e59d140e74518d6dd Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 21:23:57 +0200 Subject: [PATCH 18/18] fixup: mention inotify in 'utilities.rst' --- docs/utilities.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 24565ed6b..b9ded25fc 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -52,7 +52,8 @@ The consumer script runs in an infinite loop, constantly looking at a directory for documents to parse and index. The process is pretty straightforward: 1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2. - If not, wait 10 seconds and try again. + If not, wait 10 seconds and try again. On Linux, new documents are detected + instantly via inotify, so there's no waiting involved. 2. Parse the document with Tesseract 3. Create a new record in the database with the OCR'd text 4. Attempt to automatically assign document attributes by doing some guesswork. @@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_consumer -This starts the service that will run in a loop, consuming documents as they -appear in ``CONSUMPTION_DIR``. +This starts the service that will consume documents as they appear in +``CONSUMPTION_DIR``. Note that this command runs continuously, so exiting it will mean your webserver disappears. If you want to run this full-time (which is kind of the point)