diff --git a/docs/utilities.rst b/docs/utilities.rst index 25dbd9e49..b9ded25fc 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -49,17 +49,18 @@ The Consumer ------------ The consumer script runs in an infinite loop, constantly looking at a directory -for PDF files to parse and index. The process is pretty straightforward: +for documents to parse and index. The process is pretty straightforward: -1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not, - wait 10 seconds and try again. -2. Parse the PDF with Tesseract +1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2. + If not, wait 10 seconds and try again. On Linux, new documents are detected + instantly via inotify, so there's no waiting involved. +2. Parse the document with Tesseract 3. Create a new record in the database with the OCR'd text 4. Attempt to automatically assign document attributes by doing some guesswork. Read up on the :ref:`guesswork documentation` for more information about this process. -5. Encrypt the PDF and store it in the ``media`` directory under - ``documents/pdf``. +5. Encrypt the document and store it in the ``media`` directory under + ``documents/originals``. 6. Go to #1. @@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_consumer -This starts the service that will run in a loop, consuming PDF files as they -appear in ``CONSUMPTION_DIR``. +This starts the service that will consume documents as they appear in +``CONSUMPTION_DIR``. Note that this command runs continuously, so exiting it will mean your webserver disappears. If you want to run this full-time (which is kind of the point) @@ -97,8 +98,8 @@ The Exporter ------------ Tired of fiddling with Paperless, or just want to do something stupid and are -afraid of accidentally damaging your files? You can export all of your PDFs -into neatly named, dated, and unencrypted. +afraid of accidentally damaging your files? 
You can export all of your +documents into neatly named, dated, and unencrypted files. .. _utilities-exporter-howto: @@ -112,10 +113,10 @@ This too is done via the ``manage.py`` script: $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ -This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you -to do with as you please. The files are accompanied with a special file, -``manifest.json`` which can be used to -:ref:`import the files ` at a later date if you wish. +This will dump all of your unencrypted documents into ``/path/to/somewhere`` +for you to do with as you please. The files are accompanied with a special +file, ``manifest.json`` which can be used to :ref:`import the files +` at a later date if you wish. .. _utilities-exporter-howto-docker: diff --git a/paperless.conf.example b/paperless.conf.example index 45c532fe1..0727ac29d 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret" #PAPERLESS_CONVERT_DENSITY=300 +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) # The number of seconds that Paperless will wait between checking # PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory # rarely, you may want to use a higher value than the default (10). 
diff --git a/requirements.txt b/requirements.txt index e8fa9d81b..0c46e4f8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ flake8==3.5.0 fuzzywuzzy==0.15.0 gunicorn==19.8.1 idna==2.6 +inotify_simple==1.1.7; sys_platform == 'linux' langdetect==1.0.7 mccabe==0.6.1 more-itertools==4.1.0 diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 886b0dd69..37151d1b4 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -3,8 +3,10 @@ import hashlib import logging import os import re +import time import uuid +from operator import itemgetter from django.conf import settings from django.utils import timezone from paperless.db import GnuPG @@ -32,21 +34,21 @@ class Consumer: 5. Delete the document and image(s) """ + # Files are considered ready for consumption if they have been unmodified + # for this duration + FILES_MIN_UNMODIFIED_DURATION = 0.5 + def __init__(self, consume=settings.CONSUMPTION_DIR, scratch=settings.SCRATCH_DIR): self.logger = logging.getLogger(__name__) self.logging_group = None - self.stats = {} self._ignore = [] self.consume = consume self.scratch = scratch - try: - os.makedirs(self.scratch) - except FileExistsError: - pass + os.makedirs(self.scratch, exist_ok=True) if not self.consume: raise ConsumerError( @@ -73,83 +75,99 @@ class Consumer: "group": self.logging_group }) - def run(self): + def consume_new_files(self): + """ + Find non-ignored files in consumption dir and consume them if they have + been unmodified for FILES_MIN_UNMODIFIED_DURATION. + """ + ignored_files = [] + files = [] + for entry in os.scandir(self.consume): + if entry.is_file(): + file = (entry.path, entry.stat().st_mtime) + if file in self._ignore: + ignored_files.append(file) + else: + files.append(file) - for doc in os.listdir(self.consume): + if not files: + return - doc = os.path.join(self.consume, doc) + # Set _ignore to only include files that still exist. + # This keeps it from growing indefinitely. 
+ self._ignore[:] = ignored_files - if not os.path.isfile(doc): - continue + files_old_to_new = sorted(files, key=itemgetter(1)) - if not re.match(FileInfo.REGEXES["title"], doc): - continue + time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) - if doc in self._ignore: - continue + for file, mtime in files_old_to_new: + if mtime == os.path.getmtime(file): + # File has not been modified and can be consumed + if not self.try_consume_file(file): + self._ignore.append((file, mtime)) - if not self._is_ready(doc): - continue + def try_consume_file(self, file): + "Return True if file was consumed" - if self._is_duplicate(doc): - self.log( - "info", - "Skipping {} as it appears to be a duplicate".format(doc) - ) - self._ignore.append(doc) - continue + if not re.match(FileInfo.REGEXES["title"], file): + return False - parser_class = self._get_parser_class(doc) - if not parser_class: - self.log( - "error", "No parsers could be found for {}".format(doc)) - self._ignore.append(doc) - continue + doc = file - self.logging_group = uuid.uuid4() + if self._is_duplicate(doc): + self.log( + "info", + "Skipping {} as it appears to be a duplicate".format(doc) + ) + return False - self.log("info", "Consuming {}".format(doc)) + parser_class = self._get_parser_class(doc) + if not parser_class: + self.log( + "error", "No parsers could be found for {}".format(doc)) + return False - document_consumption_started.send( - sender=self.__class__, - filename=doc, - logging_group=self.logging_group + self.logging_group = uuid.uuid4() + + self.log("info", "Consuming {}".format(doc)) + + document_consumption_started.send( + sender=self.__class__, + filename=doc, + logging_group=self.logging_group + ) + + parsed_document = parser_class(doc) + + try: + thumbnail = parsed_document.get_thumbnail() + date = parsed_document.get_date() + document = self._store( + parsed_document.get_text(), + doc, + thumbnail, + date + ) + except ParseError as e: + self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) + 
parsed_document.cleanup() + return False + else: + parsed_document.cleanup() + self._cleanup_doc(doc) + + self.log( + "info", + "Document {} consumption finished".format(document) ) - parsed_document = parser_class(doc) - - try: - thumbnail = parsed_document.get_thumbnail() - date = parsed_document.get_date() - document = self._store( - parsed_document.get_text(), - doc, - thumbnail, - date - ) - except ParseError as e: - - self._ignore.append(doc) - self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) - parsed_document.cleanup() - - continue - - else: - - parsed_document.cleanup() - self._cleanup_doc(doc) - - self.log( - "info", - "Document {} consumption finished".format(document) - ) - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group - ) + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group + ) + return True def _get_parser_class(self, doc): """ @@ -224,22 +242,6 @@ class Consumer: self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) - def _is_ready(self, doc): - """ - Detect whether ``doc`` is ready to consume or if it's still being - written to by the uploader. 
- """ - - t = os.stat(doc).st_mtime - - if self.stats.get(doc) == t: - del(self.stats[doc]) - return True - - self.stats[doc] = t - - return False - @staticmethod def _is_duplicate(doc): with open(doc, "rb") as f: diff --git a/src/documents/mail.py b/src/documents/mail.py index d2828a57c..afa1b4362 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -20,7 +20,7 @@ class MailFetcherError(Exception): pass -class InvalidMessageError(Exception): +class InvalidMessageError(MailFetcherError): pass @@ -75,6 +75,9 @@ class Message(Loggable): continue dispositions = content_disposition.strip().split(";") + if len(dispositions) < 2: + continue + if not dispositions[0].lower() == "attachment" and \ "filename" not in dispositions[1].lower(): continue @@ -159,8 +162,10 @@ class MailFetcher(Loggable): self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX") self._enabled = bool(self._host) + if self._enabled and Message.SECRET is None: + raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined") - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() self.consume = consume def pull(self): @@ -187,7 +192,7 @@ class MailFetcher(Loggable): f.write(message.attachment.data) os.utime(file_name, times=(t, t)) - self.last_checked = datetime.datetime.now() + self.last_checked = time.time() def _get_messages(self): @@ -205,7 +210,7 @@ class MailFetcher(Loggable): self._connection.close() self._connection.logout() - except Exception as e: + except MailFetcherError as e: self.log("error", str(e)) return r diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index f94265b65..4bf403318 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,6 +1,7 @@ import datetime import logging import os +import sys import time from django.conf import settings @@ -9,6 +10,11 @@ from django.core.management.base 
import BaseCommand, CommandError from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError +try: + from inotify_simple import INotify, flags +except ImportError: + pass + class Command(BaseCommand): """ @@ -53,13 +59,20 @@ class Command(BaseCommand): action="store_true", help="Run only once." ) + parser.add_argument( + "--no-inotify", + action="store_true", + help="Don't use inotify, even if it's available." + ) def handle(self, *args, **options): self.verbosity = options["verbosity"] directory = options["directory"] loop_time = options["loop_time"] - mail_delta = datetime.timedelta(minutes=options["mail_delta"]) + mail_delta = options["mail_delta"] * 60 + use_inotify = (not options["no_inotify"] + and "inotify_simple" in sys.modules) try: self.file_consumer = Consumer(consume=directory) @@ -67,39 +80,68 @@ class Command(BaseCommand): except (ConsumerError, MailFetcherError) as e: raise CommandError(e) - for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS): - try: - os.makedirs(path) - except FileExistsError: - pass + for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS): + os.makedirs(d, exist_ok=True) logging.getLogger(__name__).info( - "Starting document consumer at {}".format(directory) + "Starting document consumer at {}{}".format( + directory, + " with inotify" if use_inotify else "" + ) ) if options["oneshot"]: - self.loop(mail_delta=mail_delta) + self.loop_step(mail_delta) else: try: - while True: - self.loop(mail_delta=mail_delta) - time.sleep(loop_time) - if self.verbosity > 1: - print(".", int(time.time())) + if use_inotify: + self.loop_inotify(mail_delta) + else: + self.loop(loop_time, mail_delta) except KeyboardInterrupt: print("Exiting") - def loop(self, mail_delta): + def loop(self, loop_time, mail_delta): + while True: + start_time = time.time() + if self.verbosity > 1: + print(".", int(start_time)) + self.loop_step(mail_delta, start_time) + # Sleep until the start of the next loop step + time.sleep(max(0, start_time + 
loop_time - time.time())) + + def loop_step(self, mail_delta, time_now=None): # Occasionally fetch mail and store it to be consumed on the next loop # We fetch email when we first start up so that it is not necessary to # wait for 10 minutes after making changes to the config file. - delta = self.mail_fetcher.last_checked + mail_delta - if self.first_iteration or delta < datetime.datetime.now(): + next_mail_time = self.mail_fetcher.last_checked + mail_delta + if self.first_iteration or time_now > next_mail_time: self.first_iteration = False self.mail_fetcher.pull() - # Consume whatever files we can. - # We have to run twice as the first run checks for file readiness - for i in range(2): - self.file_consumer.run() + self.file_consumer.consume_new_files() + + def loop_inotify(self, mail_delta): + directory = self.file_consumer.consume + inotify = INotify() + inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) + + # Run initial mail fetch and consume all currently existing documents + self.loop_step(mail_delta) + next_mail_time = self.mail_fetcher.last_checked + mail_delta + + while True: + # Consume documents until next_mail_time + while True: + delta = next_mail_time - time.time() + if delta > 0: + for event in inotify.read(timeout=delta * 1000): + file = os.path.join(directory, event.name) + if os.path.isfile(file): + self.file_consumer.try_consume_file(file) + else: + break + + self.mail_fetcher.pull() + next_mail_time = self.mail_fetcher.last_checked + mail_delta diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1c88c6bb8..ef1d5ef21 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to index CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") +# (This setting is ignored on Linux where inotify is used instead of a +# polling loop.) 
# The number of seconds that Paperless will wait between checking # CONSUMPTION_DIR. If you tend to write documents to this directory very # slowly, you may want to use a higher value than the default.