From d5e56095ac4b48eab8df3e2785d82271bc2158b0 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:09 +0200 Subject: [PATCH 01/25] Mail fetching: Only catch internal errors Previously, all errors raised during mail fetching were silently caught and printed without backtrace. To increase robustness and ease debugging, we now fail with a backtrace on unexpected errors. --- src/documents/mail.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index d2828a57c..7cc417bfd 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -20,7 +20,7 @@ class MailFetcherError(Exception): pass -class InvalidMessageError(Exception): +class InvalidMessageError(MailFetcherError): pass @@ -205,7 +205,7 @@ class MailFetcher(Loggable): self._connection.close() self._connection.logout() - except Exception as e: + except MailFetcherError as e: self.log("error", str(e)) return r From 260ce7d75c9667449f2a17ee1fb9a08852fce9f9 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:10 +0200 Subject: [PATCH 02/25] Set default empty PAPERLESS_EMAIL_SECRET Previously, if the user didn't set PAPERLESS_EMAIL_SECRET, Paperless failed with an error in check_body() because self.SECRET was None. --- src/documents/mail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 7cc417bfd..f1a84d8e0 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -42,7 +42,7 @@ class Message(Loggable): and n attachments, and that we don't care about the message body. 
""" - SECRET = os.getenv("PAPERLESS_EMAIL_SECRET") + SECRET = os.getenv("PAPERLESS_EMAIL_SECRET", "") def __init__(self, data, group=None): """ From 8ebe52a7db54dbc8e094f3d045cf306a1830b511 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:11 +0200 Subject: [PATCH 03/25] Fix list out of bounds error in mail message parsing Check list length before accessing the first two elements of 'dispositions'. The list may have only a single element ('inline') or may be empty in malformed emails. --- src/documents/mail.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index f1a84d8e0..1be62527d 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -75,8 +75,9 @@ class Message(Loggable): continue dispositions = content_disposition.strip().split(";") - if not dispositions[0].lower() == "attachment" and \ - "filename" not in dispositions[1].lower(): + if len(dispositions) < 2 or \ + (not dispositions[0].lower() == "attachment" and + "filename" not in dispositions[1].lower()): continue file_data = part.get_payload() From 9320230100b17c34efa2621706601e8652c5ed30 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:12 +0200 Subject: [PATCH 04/25] Refactor: extract fn 'make_dirs' --- src/documents/consumer.py | 13 +++++++++---- .../management/commands/document_consumer.py | 8 ++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 886b0dd69..fc8635c95 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -43,10 +43,7 @@ class Consumer: self.consume = consume self.scratch = scratch - try: - os.makedirs(self.scratch) - except FileExistsError: - pass + make_dirs(self.scratch) if not self.consume: raise ConsumerError( @@ -245,3 +242,11 @@ class Consumer: with open(doc, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() return 
Document.objects.filter(checksum=checksum).exists() + + +def make_dirs(*dirs): + for dir in dirs: + try: + os.makedirs(dir) + except FileExistsError: + pass diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index f94265b65..ae8ff7e35 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -6,7 +6,7 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...consumer import Consumer, ConsumerError +from ...consumer import Consumer, ConsumerError, make_dirs from ...mail import MailFetcher, MailFetcherError @@ -67,11 +67,7 @@ class Command(BaseCommand): except (ConsumerError, MailFetcherError) as e: raise CommandError(e) - for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS): - try: - os.makedirs(path) - except FileExistsError: - pass + make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) logging.getLogger(__name__).info( "Starting document consumer at {}".format(directory) From 2c64e707545b586d2168742bf465bef05f95f5a2 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:13 +0200 Subject: [PATCH 05/25] Consume documents in order of increasing mtime This increases overall usability, especially for multi-page scans. 
Previously, the consumption order was undefined (see os.listdir()) --- src/documents/consumer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fc8635c95..3d7cc7bd1 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -71,8 +71,11 @@ class Consumer: }) def run(self): + docs = [os.path.join(self.consume, entry) + for entry in os.listdir(self.consume)] + docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) - for doc in os.listdir(self.consume): + for doc in docs_old_to_new: doc = os.path.join(self.consume, doc) From 312a6a91b519fa4bca295c6617927a1fd2871ec1 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:14 +0200 Subject: [PATCH 06/25] Use os.scandir instead of os.listdir It's simpler and better suited for use cases introduced in later commits. --- src/documents/consumer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 3d7cc7bd1..6f9273b03 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -71,8 +71,7 @@ class Consumer: }) def run(self): - docs = [os.path.join(self.consume, entry) - for entry in os.listdir(self.consume)] + docs = [entry.path for entry in os.scandir(self.consume)] docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) for doc in docs_old_to_new: From 0db6ed225bb0fabc8963ba2f52a6aa0907d335cd Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:15 +0200 Subject: [PATCH 07/25] Refactor: extract fn try_consume_file The main purpose of this change is to make the following commits more readable. 
--- src/documents/consumer.py | 126 +++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 6f9273b03..d1d839e4d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -75,80 +75,82 @@ class Consumer: docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) for doc in docs_old_to_new: + self.try_consume_file(doc) - doc = os.path.join(self.consume, doc) + def try_consume_file(self, doc): + doc = os.path.join(self.consume, doc) - if not os.path.isfile(doc): - continue + if not os.path.isfile(doc): + return - if not re.match(FileInfo.REGEXES["title"], doc): - continue + if not re.match(FileInfo.REGEXES["title"], doc): + return - if doc in self._ignore: - continue + if doc in self._ignore: + return - if not self._is_ready(doc): - continue + if not self._is_ready(doc): + return - if self._is_duplicate(doc): - self.log( - "info", - "Skipping {} as it appears to be a duplicate".format(doc) - ) - self._ignore.append(doc) - continue + if self._is_duplicate(doc): + self.log( + "info", + "Skipping {} as it appears to be a duplicate".format(doc) + ) + self._ignore.append(doc) + return - parser_class = self._get_parser_class(doc) - if not parser_class: - self.log( - "error", "No parsers could be found for {}".format(doc)) - self._ignore.append(doc) - continue + parser_class = self._get_parser_class(doc) + if not parser_class: + self.log( + "error", "No parsers could be found for {}".format(doc)) + self._ignore.append(doc) + return - self.logging_group = uuid.uuid4() + self.logging_group = uuid.uuid4() - self.log("info", "Consuming {}".format(doc)) + self.log("info", "Consuming {}".format(doc)) - document_consumption_started.send( - sender=self.__class__, - filename=doc, - logging_group=self.logging_group + document_consumption_started.send( + sender=self.__class__, + filename=doc, + logging_group=self.logging_group + ) + + parsed_document = 
parser_class(doc) + + try: + thumbnail = parsed_document.get_thumbnail() + date = parsed_document.get_date() + document = self._store( + parsed_document.get_text(), + doc, + thumbnail, + date + ) + except ParseError as e: + + self._ignore.append(doc) + self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) + parsed_document.cleanup() + + return + + else: + + parsed_document.cleanup() + self._cleanup_doc(doc) + + self.log( + "info", + "Document {} consumption finished".format(document) ) - parsed_document = parser_class(doc) - - try: - thumbnail = parsed_document.get_thumbnail() - date = parsed_document.get_date() - document = self._store( - parsed_document.get_text(), - doc, - thumbnail, - date - ) - except ParseError as e: - - self._ignore.append(doc) - self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) - parsed_document.cleanup() - - continue - - else: - - parsed_document.cleanup() - self._cleanup_doc(doc) - - self.log( - "info", - "Document {} consumption finished".format(document) - ) - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group - ) + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group + ) def _get_parser_class(self, doc): """ From f56ec70aada710be72104f42d1ec74805e7bd963 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:16 +0200 Subject: [PATCH 08/25] Ensure docs have been unmodified for some time before consuming Previously, the second mtime check for new files usually happened right after the first one, which could have caused consumption of docs that were still being modified. We're now waiting for at least FILES_MIN_UNMODIFIED_DURATION (0.5s). This also cleans up the logic by eliminating the consumer.stats attribute and the weird double call to consumer.run(). 
Additionally, this fixes a memory leak in consumer.stats where paths could be added but never removed if the corresponding files disappeared from the consumer dir before being considered ready. --- src/documents/consumer.py | 62 +++++++++---------- .../management/commands/document_consumer.py | 5 +- 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d1d839e4d..514406646 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -3,8 +3,10 @@ import hashlib import logging import os import re +import time import uuid +from operator import itemgetter from django.conf import settings from django.utils import timezone from paperless.db import GnuPG @@ -32,13 +34,16 @@ class Consumer: 5. Delete the document and image(s) """ + # Files are considered ready for consumption if they have been unmodified + # for this duration + FILES_MIN_UNMODIFIED_DURATION = 0.5 + def __init__(self, consume=settings.CONSUMPTION_DIR, scratch=settings.SCRATCH_DIR): self.logger = logging.getLogger(__name__) self.logging_group = None - self.stats = {} self._ignore = [] self.consume = consume self.scratch = scratch @@ -70,27 +75,34 @@ class Consumer: "group": self.logging_group }) - def run(self): - docs = [entry.path for entry in os.scandir(self.consume)] - docs_old_to_new = sorted(docs, key=lambda doc: os.path.getmtime(doc)) + def consume_new_files(self): + """ + Find non-ignored files in consumption dir and consume them if they have + been unmodified for FILES_MIN_UNMODIFIED_DURATION. 
+ """ + files = [] + for entry in os.scandir(self.consume): + if entry.is_file() and entry.path not in self._ignore: + files.append((entry.path, entry.stat().st_mtime)) - for doc in docs_old_to_new: - self.try_consume_file(doc) - - def try_consume_file(self, doc): - doc = os.path.join(self.consume, doc) - - if not os.path.isfile(doc): + if not files: return - if not re.match(FileInfo.REGEXES["title"], doc): - return + files_old_to_new = sorted(files, key=itemgetter(1)) - if doc in self._ignore: - return + time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) - if not self._is_ready(doc): - return + for file, mtime in files_old_to_new: + if mtime == os.path.getmtime(file): + # File has not been modified and can be consumed + self.try_consume_file(file) + + def try_consume_file(self, file): + + if not re.match(FileInfo.REGEXES["title"], file): + return False + + doc = file if self._is_duplicate(doc): self.log( @@ -225,22 +237,6 @@ class Consumer: self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) - def _is_ready(self, doc): - """ - Detect whether ``doc`` is ready to consume or if it's still being - written to by the uploader. - """ - - t = os.stat(doc).st_mtime - - if self.stats.get(doc) == t: - del(self.stats[doc]) - return True - - self.stats[doc] = t - - return False - @staticmethod def _is_duplicate(doc): with open(doc, "rb") as f: diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ae8ff7e35..4aec489b6 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -95,7 +95,4 @@ class Command(BaseCommand): self.first_iteration = False self.mail_fetcher.pull() - # Consume whatever files we can. 
- # We have to run twice as the first run checks for file readiness - for i in range(2): - self.file_consumer.run() + self.file_consumer.consume_new_files() From cc22204e5ab12dcb413d04461190aad78b1f4012 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:17 +0200 Subject: [PATCH 09/25] Simplify ignoring docs --- src/documents/consumer.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 514406646..e895593a5 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -95,9 +95,11 @@ class Consumer: for file, mtime in files_old_to_new: if mtime == os.path.getmtime(file): # File has not been modified and can be consumed - self.try_consume_file(file) + if not self.try_consume_file(file): + self._ignore.append(file) def try_consume_file(self, file): + "Return True if file was consumed" if not re.match(FileInfo.REGEXES["title"], file): return False @@ -109,15 +111,13 @@ class Consumer: "info", "Skipping {} as it appears to be a duplicate".format(doc) ) - self._ignore.append(doc) - return + return False parser_class = self._get_parser_class(doc) if not parser_class: self.log( "error", "No parsers could be found for {}".format(doc)) - self._ignore.append(doc) - return + return False self.logging_group = uuid.uuid4() @@ -141,15 +141,10 @@ class Consumer: date ) except ParseError as e: - - self._ignore.append(doc) self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) parsed_document.cleanup() - - return - + return False else: - parsed_document.cleanup() self._cleanup_doc(doc) @@ -163,6 +158,7 @@ class Consumer: document=document, logging_group=self.logging_group ) + return True def _get_parser_class(self, doc): """ From 8b37af994a8adc942303b7d9e9274e10a5aa1914 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:18 +0200 Subject: [PATCH 10/25] Consider mtime of ignored files, garbage-collect ignore list 1. 
Store the mtime of ignored files so that we can reconsider them if they have changed. 2. Regularly reset the ignore list to files that still exist in the consumption dir. Previously, the list could grow indefinitely. --- src/documents/consumer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index e895593a5..cca7c1c13 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -80,14 +80,23 @@ class Consumer: Find non-ignored files in consumption dir and consume them if they have been unmodified for FILES_MIN_UNMODIFIED_DURATION. """ + ignored_files = [] files = [] for entry in os.scandir(self.consume): - if entry.is_file() and entry.path not in self._ignore: - files.append((entry.path, entry.stat().st_mtime)) + if entry.is_file(): + file = (entry.path, entry.stat().st_mtime) + if file in self._ignore: + ignored_files.append(file) + else: + files.append(file) if not files: return + # Set _ignore to only include files that still exist. + # This keeps it from growing indefinitely. + self._ignore[:] = ignored_files + files_old_to_new = sorted(files, key=itemgetter(1)) time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) @@ -96,7 +105,7 @@ class Consumer: if mtime == os.path.getmtime(file): # File has not been modified and can be consumed if not self.try_consume_file(file): - self._ignore.append(file) + self._ignore.append((file, mtime)) def try_consume_file(self, file): "Return True if file was consumed" From aac17670de8fab51424515a742d2c6f37ab6229d Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:19 +0200 Subject: [PATCH 11/25] Refactor: renamings, extract fn 'loop' Renamings: loop -> loop_step delta -> next_mail_time (this variable names a point in time, not a duration) Extracting the 'loop' fn is a preparation for later commits where a second type of loop is added. 
--- .../management/commands/document_consumer.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 4aec489b6..c5fe9baa2 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -74,24 +74,27 @@ class Command(BaseCommand): ) if options["oneshot"]: - self.loop(mail_delta=mail_delta) + self.loop_step(mail_delta) else: try: - while True: - self.loop(mail_delta=mail_delta) - time.sleep(loop_time) - if self.verbosity > 1: - print(".", int(time.time())) + self.loop(loop_time, mail_delta) except KeyboardInterrupt: print("Exiting") - def loop(self, mail_delta): + def loop(self, loop_time, mail_delta): + while True: + self.loop_step(mail_delta) + time.sleep(loop_time) + if self.verbosity > 1: + print(".", int(time.time())) + + def loop_step(self, mail_delta): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to the config file. - delta = self.mail_fetcher.last_checked + mail_delta - if self.first_iteration or delta < datetime.datetime.now(): + next_mail_time = self.mail_fetcher.last_checked + mail_delta + if self.first_iteration or datetime.datetime.now() > next_mail_time: self.first_iteration = False self.mail_fetcher.pull() From b74b47423d804114fb41847b456a1aee3fa80579 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:20 +0200 Subject: [PATCH 12/25] Consumer loop: make sleep duration dynamic Make the sleep duration dynamic to account for the time spent in loop_step. This improves responsiveness when repeatedly consuming newly arriving docs. Use float epoch seconds (time.time()) as the time type for MailFetcher.last_checked to allow for natural time arithmetic. 
--- src/documents/mail.py | 4 ++-- .../management/commands/document_consumer.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 1be62527d..d974d57c5 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -161,7 +161,7 @@ class MailFetcher(Loggable): self._enabled = bool(self._host) - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() self.consume = consume def pull(self): @@ -188,7 +188,7 @@ class MailFetcher(Loggable): f.write(message.attachment.data) os.utime(file_name, times=(t, t)) - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() def _get_messages(self): diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index c5fe9baa2..9234617e1 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -59,7 +59,7 @@ class Command(BaseCommand): self.verbosity = options["verbosity"] directory = options["directory"] loop_time = options["loop_time"] - mail_delta = datetime.timedelta(minutes=options["mail_delta"]) + mail_delta = options["mail_delta"] * 60 try: self.file_consumer = Consumer(consume=directory) @@ -83,18 +83,20 @@ class Command(BaseCommand): def loop(self, loop_time, mail_delta): while True: - self.loop_step(mail_delta) - time.sleep(loop_time) + start_time = time.time() if self.verbosity > 1: - print(".", int(time.time())) + print(".", int(start_time)) + self.loop_step(mail_delta, start_time) + # Sleep until the start of the next loop step + time.sleep(max(0, start_time + loop_time - time.time())) - def loop_step(self, mail_delta): + def loop_step(self, mail_delta, time_now=None): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to 
the config file. next_mail_time = self.mail_fetcher.last_checked + mail_delta - if self.first_iteration or datetime.datetime.now() > next_mail_time: + if self.first_iteration or time_now > next_mail_time: self.first_iteration = False self.mail_fetcher.pull() From 3db175dfe2d39b0ea34b8e2a5541e833d36777b7 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 11 May 2018 14:01:21 +0200 Subject: [PATCH 13/25] Add inotify support --- paperless.conf.example | 2 + requirements.txt | 1 + .../management/commands/document_consumer.py | 47 ++++++++++++++++++- src/paperless/settings.py | 2 + 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index 45c532fe1..0727ac29d 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret" #PAPERLESS_CONVERT_DENSITY=300 +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) # The number of seconds that Paperless will wait between checking # PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory # rarely, you may want to use a higher value than the default (10). 
diff --git a/requirements.txt b/requirements.txt index c90a8eaa9..d51a39039 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ flake8==3.5.0 fuzzywuzzy==0.15.0 gunicorn==19.7.1 idna==2.6 +inotify_simple==1.1.7; sys_platform == 'linux' langdetect==1.0.7 mccabe==0.6.1 more-itertools==4.1.0 diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 9234617e1..a0e2f00fe 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,6 +1,7 @@ import datetime import logging import os +import sys import time from django.conf import settings @@ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError from ...consumer import Consumer, ConsumerError, make_dirs from ...mail import MailFetcher, MailFetcherError +try: + from inotify_simple import INotify, flags +except ImportError: + pass + class Command(BaseCommand): """ @@ -53,6 +59,11 @@ class Command(BaseCommand): action="store_true", help="Run only once." ) + parser.add_argument( + "--no-inotify", + action="store_true", + help="Don't use inotify, even if it's available." 
+ ) def handle(self, *args, **options): @@ -60,6 +71,8 @@ class Command(BaseCommand): directory = options["directory"] loop_time = options["loop_time"] mail_delta = options["mail_delta"] * 60 + use_inotify = (not options["no_inotify"] + and "inotify_simple" in sys.modules) try: self.file_consumer = Consumer(consume=directory) @@ -70,14 +83,20 @@ class Command(BaseCommand): make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) logging.getLogger(__name__).info( - "Starting document consumer at {}".format(directory) + "Starting document consumer at {}{}".format( + directory, + " with inotify" if use_inotify else "" + ) ) if options["oneshot"]: self.loop_step(mail_delta) else: try: - self.loop(loop_time, mail_delta) + if use_inotify: + self.loop_inotify(mail_delta) + else: + self.loop(loop_time, mail_delta) except KeyboardInterrupt: print("Exiting") @@ -101,3 +120,27 @@ class Command(BaseCommand): self.mail_fetcher.pull() self.file_consumer.consume_new_files() + + def loop_inotify(self, mail_delta): + directory = self.file_consumer.consume + inotify = INotify() + inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) + + # Run initial mail fetch and consume all currently existing documents + self.loop_step(mail_delta) + next_mail_time = self.mail_fetcher.last_checked + mail_delta + + while True: + # Consume documents until next_mail_time + while True: + delta = next_mail_time - time.time() + if delta > 0: + for event in inotify.read(timeout=delta): + file = os.path.join(directory, event.name) + if os.path.isfile(file): + self.file_consumer.try_consume_file(file) + else: + break + + self.mail_fetcher.pull() + next_mail_time = self.mail_fetcher.last_checked + mail_delta diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 314edeb03..15e0674bc 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to 
index CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) # The number of seconds that Paperless will wait between checking # CONSUMPTION_DIR. If you tend to write documents to this directory very # slowly, you may want to use a higher value than the default. From 1eeae1df00876f491ecbc28857a0b47056ac511b Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 00:35:33 +0200 Subject: [PATCH 14/25] fixup: break up complex if condition --- src/documents/mail.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index d974d57c5..9a1c792aa 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -75,9 +75,11 @@ class Message(Loggable): continue dispositions = content_disposition.strip().split(";") - if len(dispositions) < 2 or \ - (not dispositions[0].lower() == "attachment" and - "filename" not in dispositions[1].lower()): + if len(dispositions) < 2: + continue + + if not dispositions[0].lower() == "attachment" and \ + "filename" not in dispositions[1].lower(): continue file_data = part.get_payload() From d132e2b9f58e9dd9c96d0e9d4eca70ef270f47b2 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 00:35:34 +0200 Subject: [PATCH 15/25] fixup: remove helper fn 'make_dirs' --- src/documents/consumer.py | 10 +--------- src/documents/management/commands/document_consumer.py | 5 +++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index cca7c1c13..37151d1b4 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -48,7 +48,7 @@ class Consumer: self.consume = consume self.scratch = scratch - make_dirs(self.scratch) + os.makedirs(self.scratch, exists_ok=True) if not self.consume: raise ConsumerError( @@ -247,11 +247,3 @@ class Consumer: with open(doc, "rb") as f: checksum = 
hashlib.md5(f.read()).hexdigest() return Document.objects.filter(checksum=checksum).exists() - - -def make_dirs(*dirs): - for dir in dirs: - try: - os.makedirs(dir) - except FileExistsError: - pass diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index a0e2f00fe..4bf403318 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -7,7 +7,7 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...consumer import Consumer, ConsumerError, make_dirs +from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError try: @@ -80,7 +80,8 @@ class Command(BaseCommand): except (ConsumerError, MailFetcherError) as e: raise CommandError(e) - make_dirs(self.ORIGINAL_DOCS, self.THUMB_DOCS) + for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS): + os.makedirs(d, exists_ok=True) logging.getLogger(__name__).info( "Starting document consumer at {}{}".format( From 901a810e974ac883a76366be2785d0a8c061c38b Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 12:11:56 +0200 Subject: [PATCH 16/25] fixup: require usage of PAPERLESS_EMAIL_SECRET --- src/documents/mail.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/mail.py b/src/documents/mail.py index 9a1c792aa..afa1b4362 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -42,7 +42,7 @@ class Message(Loggable): and n attachments, and that we don't care about the message body. 
""" - SECRET = os.getenv("PAPERLESS_EMAIL_SECRET", "") + SECRET = os.getenv("PAPERLESS_EMAIL_SECRET") def __init__(self, data, group=None): """ @@ -162,6 +162,8 @@ class MailFetcher(Loggable): self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX") self._enabled = bool(self._host) + if self._enabled and Message.SECRET is None: + raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined") self.last_checked = time.time() self.consume = consume From 2a8dcd14032256b798eca1254aea6de55c61a3ab Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 21:17:03 +0200 Subject: [PATCH 17/25] Documentation: Replace 'PDF' with 'document' There are more supported file formats than just PDF. --- docs/utilities.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 25dbd9e49..24565ed6b 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -49,17 +49,17 @@ The Consumer ------------ The consumer script runs in an infinite loop, constantly looking at a directory -for PDF files to parse and index. The process is pretty straightforward: +for documents to parse and index. The process is pretty straightforward: -1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not, - wait 10 seconds and try again. -2. Parse the PDF with Tesseract +1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2. + If not, wait 10 seconds and try again. +2. Parse the document with Tesseract 3. Create a new record in the database with the OCR'd text 4. Attempt to automatically assign document attributes by doing some guesswork. Read up on the :ref:`guesswork documentation` for more information about this process. -5. Encrypt the PDF and store it in the ``media`` directory under - ``documents/pdf``. +5. Encrypt the document and store it in the ``media`` directory under + ``documents/originals``. 6. Go to #1. 
@@ -74,7 +74,7 @@ The consumer is started via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_consumer -This starts the service that will run in a loop, consuming PDF files as they +This starts the service that will run in a loop, consuming documents as they appear in ``CONSUMPTION_DIR``. Note that this command runs continuously, so exiting it will mean your webserver @@ -97,8 +97,8 @@ The Exporter ------------ Tired of fiddling with Paperless, or just want to do something stupid and are -afraid of accidentally damaging your files? You can export all of your PDFs -into neatly named, dated, and unencrypted. +afraid of accidentally damaging your files? You can export all of your +documents into neatly named, dated, and unencrypted files. .. _utilities-exporter-howto: @@ -112,10 +112,10 @@ This too is done via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ -This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you -to do with as you please. The files are accompanied with a special file, -``manifest.json`` which can be used to -:ref:`import the files ` at a later date if you wish. +This will dump all of your unencrypted documents into ``/path/to/somewhere`` +for you to do with as you please. The files are accompanied with a special +file, ``manifest.json`` which can be used to :ref:`import the files +` at a later date if you wish. .. 
_utilities-exporter-howto-docker: From b57cbe2e3ba37463ad7328ebcbf25bfa09030f16 Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Mon, 21 May 2018 21:23:57 +0200 Subject: [PATCH 18/25] fixup: mention inotify in 'utilities.rst' --- docs/utilities.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/utilities.rst b/docs/utilities.rst index 24565ed6b..b9ded25fc 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -52,7 +52,8 @@ The consumer script runs in an infinite loop, constantly looking at a directory for documents to parse and index. The process is pretty straightforward: 1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2. - If not, wait 10 seconds and try again. + If not, wait 10 seconds and try again. On Linux, new documents are detected + instantly via inotify, so there's no waiting involved. 2. Parse the document with Tesseract 3. Create a new record in the database with the OCR'd text 4. Attempt to automatically assign document attributes by doing some guesswork. @@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_consumer -This starts the service that will run in a loop, consuming documents as they -appear in ``CONSUMPTION_DIR``. +This starts the service that will consume documents as they appear in +``CONSUMPTION_DIR``. Note that this command runs continuously, so exiting it will mean your webserver disappears. If you want to run this full-time (which is kind of the point) From d1b6e9329f61b9ed737f8d506ba11432e5561d1c Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Mon, 28 May 2018 13:08:00 +0100 Subject: [PATCH 19/25] It's exist_ok=, not exists_ok= -- my bad. 
--- src/documents/consumer.py | 2 +- src/documents/management/commands/document_consumer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 37151d1b4..6732c458a 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -48,7 +48,7 @@ class Consumer: self.consume = consume self.scratch = scratch - os.makedirs(self.scratch, exists_ok=True) + os.makedirs(self.scratch, exist_ok=True) if not self.consume: raise ConsumerError( diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 4bf403318..41e5382d5 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -81,7 +81,7 @@ class Command(BaseCommand): raise CommandError(e) for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS): - os.makedirs(d, exists_ok=True) + os.makedirs(d, exist_ok=True) logging.getLogger(__name__).info( "Starting document consumer at {}{}".format( From a8f27f54407dd7edec4e0cccd8a24fef54b45f26 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Mon, 28 May 2018 13:11:19 +0100 Subject: [PATCH 20/25] Add note about inotify --- docs/changelog.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index 6945b90f6..bbe221362 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -33,6 +33,8 @@ Changelog entrypoint and fixed it with some very creating Bash skills: `#352`_. * You can now use the search field to find documents by tag thanks to `thinkjk`_'s *first ever issue*: `#354`_. +* Inotify is now being used to detect additions to the consume directory thanks + to some excellent work from `erikarvstedt`_ on `#351`_ 1.3.0 ===== @@ -462,6 +464,7 @@ Changelog .. _#253: https://github.com/danielquinn/paperless/issues/253 .. _#323: https://github.com/danielquinn/paperless/issues/323 .. 
_#344: https://github.com/danielquinn/paperless/pull/344 +.. _#351: https://github.com/danielquinn/paperless/pull/351 .. _#352: https://github.com/danielquinn/paperless/pull/352 .. _#354: https://github.com/danielquinn/paperless/issues/354 From 33e3277d2a7321f452cf3a72fe63cb429890fb5e Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Thu, 26 Apr 2018 11:58:05 +0200 Subject: [PATCH 21/25] Add field 'added' to documents This field indicates when the document was added to the database --- src/documents/admin.py | 4 ++- .../migrations/0020_document_added.py | 27 +++++++++++++++++++ src/documents/models.py | 2 ++ .../document/change_list_results.html | 17 ++++++------ 4 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 src/documents/migrations/0020_document_added.py diff --git a/src/documents/admin.py b/src/documents/admin.py index 3ce2785b5..39524ae21 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -125,7 +125,9 @@ class DocumentAdmin(CommonAdmin): } search_fields = ("correspondent__name", "title", "content", "tags__name") - list_display = ("title", "created", "thumbnail", "correspondent", "tags_") + readonly_fields = ("added",) + list_display = ("title", "created", "added", "thumbnail", "correspondent", + "tags_") list_filter = ("tags", "correspondent", FinancialYearFilter, MonthListFilter) diff --git a/src/documents/migrations/0020_document_added.py b/src/documents/migrations/0020_document_added.py new file mode 100644 index 000000000..dbddf80ae --- /dev/null +++ b/src/documents/migrations/0020_document_added.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models +import django.utils.timezone + + +def set_added_time_to_created_time(apps, schema_editor): + Document = apps.get_model("documents", "Document") + for doc in Document.objects.all(): + doc.added = doc.created + doc.save() + +class Migration(migrations.Migration): + + dependencies = [ + 
('documents', '0019_add_consumer_user'), + ] + + operations = [ + migrations.AddField( + model_name='document', + name='added', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False), + ), + migrations.RunPython(set_added_time_to_created_time) + ] diff --git a/src/documents/models.py b/src/documents/models.py index 420afa426..c5239a387 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -229,6 +229,8 @@ class Document(models.Model): default=timezone.now, db_index=True) modified = models.DateTimeField( auto_now=True, editable=False, db_index=True) + added = models.DateTimeField( + default=timezone.now, editable=False, db_index=True) class Meta(object): ordering = ("correspondent", "title") diff --git a/src/documents/templates/admin/documents/document/change_list_results.html b/src/documents/templates/admin/documents/document/change_list_results.html index cd5f88f0a..5bd5b1aff 100644 --- a/src/documents/templates/admin/documents/document/change_list_results.html +++ b/src/documents/templates/admin/documents/document/change_list_results.html @@ -129,24 +129,25 @@ {# 0: Checkbox #} {# 1: Title #} {# 2: Date #} - {# 3: Image #} - {# 4: Correspondent #} - {# 5: Tags #} - {# 6: Document edit url #} + {# 3: Added #} + {# 4: Image #} + {# 5: Correspondent #} + {# 6: Tags #} + {# 7: Document edit url #}
-
+
{{ result.0 }}
- {{ result.4 }}
+ {{ result.5 }}
{{ result.1 }}
-
{{ result.5 }}
+
{{ result.6 }}
{{ result.2 }}
-
{{ result.3 }}
+
{{ result.4 }}
{% endfor %} From 67b4cfaa830f923804d1727f99374142e57c77bb Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Fri, 1 Jun 2018 00:45:59 +0200 Subject: [PATCH 22/25] Fix incompatibility with Python versions < 3.6 Direct index access to a match was only added in 3.6. Fixes #359 --- src/documents/templatetags/hacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/templatetags/hacks.py b/src/documents/templatetags/hacks.py index 4faf1783f..0c0a0e099 100644 --- a/src/documents/templatetags/hacks.py +++ b/src/documents/templatetags/hacks.py @@ -38,6 +38,6 @@ def add_doc_edit_url(result): """ title = result[1] match = re.search(EXTRACT_URL, title) - edit_doc_url = match[1] + edit_doc_url = match.group(1) result.append(edit_doc_url) return result From d0a5d750dbc4925fea9e029c38a3afca8cee8fdb Mon Sep 17 00:00:00 2001 From: Erik Arvstedt Date: Wed, 30 May 2018 16:10:11 +0200 Subject: [PATCH 23/25] Fix unclickable checkbox in documents view 1. Clicks to the document selection checkbox were captured by the onclick handler of the document item header. This is now fixed. 2. Reexpose the doc title link to mouse events by putting it on top of the header link layer. 
--- .../document/change_list_results.html | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/documents/templates/admin/documents/document/change_list_results.html b/src/documents/templates/admin/documents/document/change_list_results.html index 5bd5b1aff..b33cd2927 100644 --- a/src/documents/templates/admin/documents/document/change_list_results.html +++ b/src/documents/templates/admin/documents/document/change_list_results.html @@ -29,13 +29,32 @@ .result .header { padding: 5px; background-color: #79AEC8; + position: relative; } - .result .header .checkbox{ + .result .header .checkbox { width: 5%; float: left; + position: absolute; + z-index: 2; } .result .header .info { margin-left: 10%; + position: relative; + } + .headerLink { + cursor: pointer; + opacity: 0; + z-index: 1; + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + } + .header > a { + z-index: 2; + margin-left: 10%; + position: relative; } .result .header a, .result a.tag { @@ -136,12 +155,23 @@ {# 7: Document edit url #}
-
+
{{ result.6 }}
From ca97caf68e2a7e61a0e7b23183964fa1eecb62fe Mon Sep 17 00:00:00 2001 From: ahyear Date: Fri, 15 Jun 2018 15:31:29 +0200 Subject: [PATCH 24/25] update docker-compose.env for mail Consumption --- docker-compose.env.example | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docker-compose.env.example b/docker-compose.env.example index 13c74b6ab..fbe1e5f4d 100644 --- a/docker-compose.env.example +++ b/docker-compose.env.example @@ -13,3 +13,24 @@ PAPERLESS_PASSPHRASE=CHANGE_ME # You can change the default user and group id to a custom one # USERMAP_UID=1000 # USERMAP_GID=1000 + +############################################################################### +#### Mail Consumption #### +############################################################################### + +# These values are required if you want paperless to check a particular email +# box every 10 minutes and attempt to consume documents from there. If you +# don't define a HOST, mail checking will just be disabled. +# don't use quotes after = or it will crash your docker +# PAPERLESS_CONSUME_MAIL_HOST= +# PAPERLESS_CONSUME_MAIL_PORT= +# PAPERLESS_CONSUME_MAIL_USER= +# PAPERLESS_CONSUME_MAIL_PASS= + +# Override the default IMAP inbox here. If not set Paperless defaults to +# INBOX. +# PAPERLESS_CONSUME_MAIL_INBOX=INBOX + +# Any email sent to the target account that does not contain this text will be +# ignored.
+# PAPERLESS_EMAIL_SECRET= From 3cfdb10f8fb14678e5605d6559ba085a8726ef54 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 15 Jun 2018 14:44:19 +0100 Subject: [PATCH 25/25] Clean up the text a bit --- docker-compose.env.example | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docker-compose.env.example b/docker-compose.env.example index fbe1e5f4d..fa4097c5e 100644 --- a/docker-compose.env.example +++ b/docker-compose.env.example @@ -21,16 +21,17 @@ PAPERLESS_PASSPHRASE=CHANGE_ME # These values are required if you want paperless to check a particular email # box every 10 minutes and attempt to consume documents from there. If you # don't define a HOST, mail checking will just be disabled. -# don't use quotes after = or it will crash your docker +# Don't use quotes after = or it will crash your docker # PAPERLESS_CONSUME_MAIL_HOST= # PAPERLESS_CONSUME_MAIL_PORT= # PAPERLESS_CONSUME_MAIL_USER= # PAPERLESS_CONSUME_MAIL_PASS= -# Override the default IMAP inbox here. If not set Paperless defaults to +# Override the default IMAP inbox here. If it's not set, Paperless defaults to # INBOX. # PAPERLESS_CONSUME_MAIL_INBOX=INBOX # Any email sent to the target account that does not contain this text will be -# ignored. +# ignored. Mail checking won't work without this. # PAPERLESS_EMAIL_SECRET= +