diff --git a/docker-compose.env.example b/docker-compose.env.example
index d1c4a2887..3c1664573 100644
--- a/docker-compose.env.example
+++ b/docker-compose.env.example
@@ -14,3 +14,25 @@
 # You can change the default user and group id to a custom one
 # USERMAP_UID=1000
 # USERMAP_GID=1000
+
+###############################################################################
+#### Mail Consumption                                                      ####
+###############################################################################
+
+# These values are required if you want paperless to check a particular email
+# box every 10 minutes and attempt to consume documents from there. If you
+# don't define a HOST, mail checking will just be disabled.
+# Don't quote these values: docker-compose passes quotes through verbatim.
+# PAPERLESS_CONSUME_MAIL_HOST=
+# PAPERLESS_CONSUME_MAIL_PORT=
+# PAPERLESS_CONSUME_MAIL_USER=
+# PAPERLESS_CONSUME_MAIL_PASS=
+
+# Override the default IMAP inbox here. If it's not set, Paperless defaults to
+# INBOX.
+# PAPERLESS_CONSUME_MAIL_INBOX=INBOX
+
+# Any email sent to the target account that does not contain this text will be
+# ignored. Mail checking won't work without this.
+# PAPERLESS_EMAIL_SECRET=
+
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 64e78d6c9..d5c812c9a 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -62,6 +62,8 @@ encryption too, you only need to do two things:
   entrypoint and fixed it with some very creative Bash skills: `#352`_.
 * You can now use the search field to find documents by tag thanks to
   `thinkjk`_'s *first ever issue*: `#354`_.
+* Inotify is now being used to detect additions to the consume directory
+  thanks to some excellent work from `erikarvstedt`_ on `#351`_.
 
 1.3.0
 =====
@@ -491,6 +493,7 @@ encryption too, you only need to do two things:
 .. _#253: https://github.com/danielquinn/paperless/issues/253
 .. _#323: https://github.com/danielquinn/paperless/issues/323
 .. _#344: https://github.com/danielquinn/paperless/pull/344
+.. _#351: https://github.com/danielquinn/paperless/pull/351
 .. _#352: https://github.com/danielquinn/paperless/pull/352
 .. _#354: https://github.com/danielquinn/paperless/issues/354
 
diff --git a/docs/utilities.rst b/docs/utilities.rst
index 25dbd9e49..b9ded25fc 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -49,17 +49,18 @@ The Consumer
 ------------
 
 The consumer script runs in an infinite loop, constantly looking at a directory
-for PDF files to parse and index.  The process is pretty straightforward:
+for documents to parse and index.  The process is pretty straightforward:
 
-1. Look in ``CONSUMPTION_DIR`` for a PDF.  If one is found, go to #2.  If not,
-   wait 10 seconds and try again.
-2. Parse the PDF with Tesseract
+1. Look in ``CONSUMPTION_DIR`` for a document.  If one is found, go to #2.
+   If not, wait 10 seconds and try again.  On Linux, new documents are detected
+   instantly via inotify, so there's no waiting involved.
+2. Parse the document with Tesseract
 3. Create a new record in the database with the OCR'd text
 4. Attempt to automatically assign document attributes by doing some guesswork.
    Read up on the :ref:`guesswork documentation <guesswork>` for more
    information about this process.
-5. Encrypt the PDF and store it in the ``media`` directory under
-   ``documents/pdf``.
+5. Encrypt the document and store it in the ``media`` directory under
+   ``documents/originals``.
 6. Go to #1.
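As background for the inotify behaviour mentioned in step 1 above, here is a minimal standalone sketch of detecting new files with `inotify_simple`, the package this change adds to the requirements. The watch path is hypothetical, and note that `read()`'s optional timeout is expressed in milliseconds:

```python
# Minimal sketch of inotify-based detection (not the actual consumer code).
from inotify_simple import INotify, flags

inotify = INotify()
# CLOSE_WRITE fires when a writer closes a file it had open for writing;
# MOVED_TO fires when a file is moved into the watched directory. Either
# one signals that a document is complete and ready to consume.
inotify.add_watch("/tmp/consume", flags.CLOSE_WRITE | flags.MOVED_TO)

while True:
    # read() blocks until events arrive; pass timeout=<milliseconds> to
    # return early with an empty list instead.
    for event in inotify.read():
        print("New document ready:", event.name)
```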
@@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script:
 
     $ /path/to/paperless/src/manage.py document_consumer
 
-This starts the service that will run in a loop, consuming PDF files as they
-appear in ``CONSUMPTION_DIR``.
+This starts the service that will consume documents as they appear in
+``CONSUMPTION_DIR``.
 
 Note that this command runs continuously, so exiting it will mean your
 consumer disappears.  If you want to run this full-time (which is kind of the point)
@@ -97,8 +98,8 @@ The Exporter
 ------------
 
 Tired of fiddling with Paperless, or just want to do something stupid and are
-afraid of accidentally damaging your files?  You can export all of your PDFs
-into neatly named, dated, and unencrypted.
+afraid of accidentally damaging your files?  You can export all of your
+documents into neatly named, dated, and unencrypted files.
 
 .. _utilities-exporter-howto:
@@ -112,10 +113,10 @@ This too is done via the ``manage.py`` script:
 
     $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
 
-This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
-to do with as you please.  The files are accompanied with a special file,
-``manifest.json`` which can be used to
-:ref:`import the files <utilities-importer>` at a later date if you wish.
+This will dump all of your unencrypted documents into ``/path/to/somewhere``
+for you to do with as you please.  The files are accompanied by a special
+file, ``manifest.json``, which can be used to :ref:`import the files
+<utilities-importer>` at a later date if you wish.
 
 .. _utilities-exporter-howto-docker:
diff --git a/paperless.conf.example b/paperless.conf.example
index 6acba5f25..3d90b2915 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -165,6 +165,8 @@ PAPERLESS_EMAIL_SECRET=""
 
 #PAPERLESS_CONVERT_DENSITY=300
 
+# (This setting is ignored on Linux, where inotify is used instead of a
+# polling loop.)
 # The number of seconds that Paperless will wait between checking
 # PAPERLESS_CONSUMPTION_DIR.  If you tend to write documents to this directory
 # rarely, you may want to use a higher value than the default (10).
diff --git a/requirements.txt b/requirements.txt
index e8fa9d81b..0c46e4f8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,6 +20,7 @@ flake8==3.5.0
 fuzzywuzzy==0.15.0
 gunicorn==19.8.1
 idna==2.6
+inotify_simple==1.1.7; sys_platform == 'linux'
 langdetect==1.0.7
 mccabe==0.6.1
 more-itertools==4.1.0
diff --git a/src/documents/admin.py b/src/documents/admin.py
index 3ce2785b5..39524ae21 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -125,7 +125,9 @@ class DocumentAdmin(CommonAdmin):
     }
 
     search_fields = ("correspondent__name", "title", "content", "tags__name")
-    list_display = ("title", "created", "thumbnail", "correspondent", "tags_")
+    readonly_fields = ("added",)
+    list_display = ("title", "created", "added", "thumbnail", "correspondent",
+                    "tags_")
     list_filter = ("tags", "correspondent", FinancialYearFilter,
                    MonthListFilter)
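The admin change above surfaces the new `added` timestamp: it is set by the consumer, never by the user, so it appears in the change list but stays read-only in the edit form. A minimal hypothetical sketch of that pattern (standalone, not the project's actual `DocumentAdmin`, which extends `CommonAdmin`):

```python
# Hypothetical minimal ModelAdmin showing an auto-set timestamp that is
# visible but not editable. Assumes a Document model with these fields.
from django.contrib import admin

from .models import Document


@admin.register(Document)
class DocumentAdmin(admin.ModelAdmin):
    list_display = ("title", "created", "added")
    readonly_fields = ("added",)  # set at consumption time, never by hand
```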
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 84a1ff3ca..b390f6800 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -3,8 +3,10 @@ import hashlib
 import logging
 import os
 import re
+import time
 import uuid
+from operator import itemgetter
 from django.conf import settings
 from django.utils import timezone
 
 from paperless.db import GnuPG
@@ -32,21 +34,21 @@ class Consumer:
     5. Delete the document and image(s)
     """
 
+    # Files are considered ready for consumption if they have been unmodified
+    # for this duration
+    FILES_MIN_UNMODIFIED_DURATION = 0.5
+
     def __init__(self, consume=settings.CONSUMPTION_DIR,
                  scratch=settings.SCRATCH_DIR):
 
         self.logger = logging.getLogger(__name__)
         self.logging_group = None
 
-        self.stats = {}
         self._ignore = []
 
         self.consume = consume
         self.scratch = scratch
 
-        try:
-            os.makedirs(self.scratch)
-        except FileExistsError:
-            pass
+        os.makedirs(self.scratch, exist_ok=True)
 
         self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
         if settings.PASSPHRASE:
@@ -80,83 +82,99 @@ class Consumer:
             "group": self.logging_group
         })
 
-    def run(self):
+    def consume_new_files(self):
+        """
+        Find non-ignored files in consumption dir and consume them if they have
+        been unmodified for FILES_MIN_UNMODIFIED_DURATION.
+        """
+        ignored_files = []
+        files = []
+        for entry in os.scandir(self.consume):
+            if entry.is_file():
+                file = (entry.path, entry.stat().st_mtime)
+                if file in self._ignore:
+                    ignored_files.append(file)
+                else:
+                    files.append(file)
 
-        for doc in os.listdir(self.consume):
+        if not files:
+            return
 
-            doc = os.path.join(self.consume, doc)
+        # Set _ignore to only include files that still exist.
+        # This keeps it from growing indefinitely.
+        self._ignore[:] = ignored_files
 
-            if not os.path.isfile(doc):
-                continue
+        files_old_to_new = sorted(files, key=itemgetter(1))
 
-            if not re.match(FileInfo.REGEXES["title"], doc):
-                continue
+        time.sleep(self.FILES_MIN_UNMODIFIED_DURATION)
 
-            if doc in self._ignore:
-                continue
+        for file, mtime in files_old_to_new:
+            if mtime == os.path.getmtime(file):
+                # File has not been modified and can be consumed
+                if not self.try_consume_file(file):
+                    self._ignore.append((file, mtime))
 
-            if not self._is_ready(doc):
-                continue
+    def try_consume_file(self, file):
+        "Return True if file was consumed"
 
-            if self._is_duplicate(doc):
-                self.log(
-                    "info",
-                    "Skipping {} as it appears to be a duplicate".format(doc)
-                )
-                self._ignore.append(doc)
-                continue
+        if not re.match(FileInfo.REGEXES["title"], file):
+            return False
 
-            parser_class = self._get_parser_class(doc)
-            if not parser_class:
-                self.log(
-                    "error", "No parsers could be found for {}".format(doc))
-                self._ignore.append(doc)
-                continue
+        doc = file
 
-            self.logging_group = uuid.uuid4()
+        if self._is_duplicate(doc):
+            self.log(
+                "info",
+                "Skipping {} as it appears to be a duplicate".format(doc)
+            )
+            return False
 
-            self.log("info", "Consuming {}".format(doc))
+        parser_class = self._get_parser_class(doc)
+        if not parser_class:
+            self.log(
+                "error", "No parsers could be found for {}".format(doc))
+            return False
 
-            document_consumption_started.send(
-                sender=self.__class__,
-                filename=doc,
-                logging_group=self.logging_group
+        self.logging_group = uuid.uuid4()
+
+        self.log("info", "Consuming {}".format(doc))
+
+        document_consumption_started.send(
+            sender=self.__class__,
+            filename=doc,
+            logging_group=self.logging_group
+        )
+
+        parsed_document = parser_class(doc)
+
+        try:
+            thumbnail = parsed_document.get_thumbnail()
+            date = parsed_document.get_date()
+            document = self._store(
+                parsed_document.get_text(),
+                doc,
+                thumbnail,
+                date
+            )
+        except ParseError as e:
+            self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
+            parsed_document.cleanup()
+            return False
+        else:
+            parsed_document.cleanup()
+            self._cleanup_doc(doc)
+
+            self.log(
+                "info",
+                "Document {} consumption finished".format(document)
             )
 
-            parsed_document = parser_class(doc)
-
-            try:
-                thumbnail = parsed_document.get_thumbnail()
-                date = parsed_document.get_date()
-                document = self._store(
-                    parsed_document.get_text(),
-                    doc,
-                    thumbnail,
-                    date
-                )
-            except ParseError as e:
-
-                self._ignore.append(doc)
-                self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
-                parsed_document.cleanup()
-
-                continue
-
-            else:
-
-                parsed_document.cleanup()
-                self._cleanup_doc(doc)
-
-                self.log(
-                    "info",
-                    "Document {} consumption finished".format(document)
-                )
-
-            document_consumption_finished.send(
-                sender=self.__class__,
-                document=document,
-                logging_group=self.logging_group
-            )
+        document_consumption_finished.send(
+            sender=self.__class__,
+            document=document,
+            logging_group=self.logging_group
+        )
+        return True
 
     def _get_parser_class(self, doc):
         """
@@ -232,22 +250,6 @@ class Consumer:
         self.log("debug", "Deleting document {}".format(doc))
         os.unlink(doc)
 
-    def _is_ready(self, doc):
-        """
-        Detect whether ``doc`` is ready to consume or if it's still being
-        written to by the uploader.
-        """
-
-        t = os.stat(doc).st_mtime
-
-        if self.stats.get(doc) == t:
-            del(self.stats[doc])
-            return True
-
-        self.stats[doc] = t
-
-        return False
-
     @staticmethod
     def _is_duplicate(doc):
         with open(doc, "rb") as f:
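Conceptually, the `consume_new_files()` method above is a readiness check: snapshot every file's mtime, wait briefly, then treat only files whose mtime held still as fully written. A standalone sketch of just that logic (the directory and threshold are illustrative):

```python
# Standalone sketch of the mtime-stability readiness check; not the
# consumer itself. /tmp/consume and the 0.5s window are illustrative.
import os
import time
from operator import itemgetter

MIN_UNMODIFIED = 0.5  # seconds a file must stay unmodified


def stable_files(directory):
    # Snapshot (path, mtime) for every regular file in the directory.
    snapshot = [(e.path, e.stat().st_mtime)
                for e in os.scandir(directory) if e.is_file()]
    # Give in-progress uploads a moment to finish.
    time.sleep(MIN_UNMODIFIED)
    # Oldest first; keep only files whose mtime did not change.
    for path, mtime in sorted(snapshot, key=itemgetter(1)):
        if os.path.getmtime(path) == mtime:
            yield path


for path in stable_files("/tmp/consume"):
    print("ready:", path)
```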
diff --git a/src/documents/mail.py b/src/documents/mail.py
index d2828a57c..afa1b4362 100644
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -20,7 +20,7 @@ class MailFetcherError(Exception):
     pass
 
 
-class InvalidMessageError(Exception):
+class InvalidMessageError(MailFetcherError):
     pass
 
 
@@ -75,6 +75,9 @@ class Message(Loggable):
                 continue
 
             dispositions = content_disposition.strip().split(";")
+            if len(dispositions) < 2:
+                continue
+
             if not dispositions[0].lower() == "attachment" and \
                     "filename" not in dispositions[1].lower():
                 continue
@@ -159,8 +162,10 @@ class MailFetcher(Loggable):
         self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
 
         self._enabled = bool(self._host)
+        if self._enabled and Message.SECRET is None:
+            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
 
-        self.last_checked = datetime.datetime.now()
+        self.last_checked = time.time()
         self.consume = consume
 
     def pull(self):
@@ -187,7 +192,7 @@ class MailFetcher(Loggable):
                     f.write(message.attachment.data)
                     os.utime(file_name, times=(t, t))
 
-        self.last_checked = datetime.datetime.now()
+        self.last_checked = time.time()
 
     def _get_messages(self):
@@ -205,7 +210,7 @@ class MailFetcher(Loggable):
             self._connection.close()
             self._connection.logout()
 
-        except Exception as e:
+        except MailFetcherError as e:
             self.log("error", str(e))
 
         return r
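The two `mail.py` changes above work as a pair: `_get_messages()` now catches only `MailFetcherError`, so `InvalidMessageError` has to join that hierarchy or malformed messages would start crashing the fetch loop. A toy sketch of the principle, with hypothetical names:

```python
# Toy illustration: narrowing `except Exception` to a base class only
# stays safe if every error you still want to survive inherits from it.
class FetcherError(Exception):
    """Base class for everything the fetch loop should survive."""


class BadMessageError(FetcherError):
    """One message is malformed; skip it rather than crash the loop."""


def fetch_all(raw_messages):
    fetched = []
    for raw in raw_messages:
        try:
            if not raw:
                raise BadMessageError("empty message")
            fetched.append(raw.upper())
        except FetcherError as error:  # also catches BadMessageError
            print("skipping:", error)
    return fetched


print(fetch_all(["ok", "", "also ok"]))  # skips the empty message
```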
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index f94265b65..41e5382d5 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import os
+import sys
 import time
 
 from django.conf import settings
@@ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError
 from ...consumer import Consumer, ConsumerError
 from ...mail import MailFetcher, MailFetcherError
 
+try:
+    from inotify_simple import INotify, flags
+except ImportError:
+    pass
+
 
 class Command(BaseCommand):
     """
@@ -53,13 +59,20 @@ class Command(BaseCommand):
             action="store_true",
             help="Run only once."
         )
+        parser.add_argument(
+            "--no-inotify",
+            action="store_true",
+            help="Don't use inotify, even if it's available."
+        )
 
     def handle(self, *args, **options):
         self.verbosity = options["verbosity"]
         directory = options["directory"]
         loop_time = options["loop_time"]
-        mail_delta = datetime.timedelta(minutes=options["mail_delta"])
+        mail_delta = options["mail_delta"] * 60
+        use_inotify = (not options["no_inotify"]
+                       and "inotify_simple" in sys.modules)
 
         try:
             self.file_consumer = Consumer(consume=directory)
@@ -67,39 +80,68 @@ class Command(BaseCommand):
         except (ConsumerError, MailFetcherError) as e:
             raise CommandError(e)
 
-        for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
-            try:
-                os.makedirs(path)
-            except FileExistsError:
-                pass
+        for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
+            os.makedirs(d, exist_ok=True)
 
         logging.getLogger(__name__).info(
-            "Starting document consumer at {}".format(directory)
+            "Starting document consumer at {}{}".format(
+                directory,
+                " with inotify" if use_inotify else ""
+            )
         )
 
         if options["oneshot"]:
-            self.loop(mail_delta=mail_delta)
+            self.loop_step(mail_delta)
         else:
             try:
-                while True:
-                    self.loop(mail_delta=mail_delta)
-                    time.sleep(loop_time)
-                    if self.verbosity > 1:
-                        print(".", int(time.time()))
+                if use_inotify:
+                    self.loop_inotify(mail_delta)
+                else:
+                    self.loop(loop_time, mail_delta)
             except KeyboardInterrupt:
                 print("Exiting")
 
-    def loop(self, mail_delta):
+    def loop(self, loop_time, mail_delta):
+        while True:
+            start_time = time.time()
+            if self.verbosity > 1:
+                print(".", int(start_time))
+            self.loop_step(mail_delta, start_time)
+            # Sleep until the start of the next loop step
+            time.sleep(max(0, start_time + loop_time - time.time()))
+
+    def loop_step(self, mail_delta, time_now=None):
+
         # Occasionally fetch mail and store it to be consumed on the next loop
         # We fetch email when we first start up so that it is not necessary to
         # wait for 10 minutes after making changes to the config file.
-        delta = self.mail_fetcher.last_checked + mail_delta
-        if self.first_iteration or delta < datetime.datetime.now():
+        next_mail_time = self.mail_fetcher.last_checked + mail_delta
+        if self.first_iteration or time_now > next_mail_time:
             self.first_iteration = False
             self.mail_fetcher.pull()
 
-        # Consume whatever files we can.
-        # We have to run twice as the first run checks for file readiness
-        for i in range(2):
-            self.file_consumer.run()
+        self.file_consumer.consume_new_files()
+
+    def loop_inotify(self, mail_delta):
+        directory = self.file_consumer.consume
+        inotify = INotify()
+        inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO)
+
+        # Run initial mail fetch and consume all currently existing documents
+        self.loop_step(mail_delta)
+        next_mail_time = self.mail_fetcher.last_checked + mail_delta
+
+        while True:
+            # Consume documents until next_mail_time
+            while True:
+                delta = next_mail_time - time.time()
+                if delta > 0:
+                    for event in inotify.read(timeout=int(delta * 1000)):
+                        file = os.path.join(directory, event.name)
+                        if os.path.isfile(file):
+                            self.file_consumer.try_consume_file(file)
+                else:
+                    break
+
+            self.mail_fetcher.pull()
+            next_mail_time = self.mail_fetcher.last_checked + mail_delta
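The rewritten `loop()` above sleeps until the next scheduled tick rather than for a fixed interval after the work finishes, so a slow iteration doesn't push every later check back. A standalone sketch of that cadence (the interval and fake workload are made up):

```python
# Drift-free polling cadence: schedule the next tick relative to when
# this one *started*, clamping the sleep at zero if the work overran.
import time

LOOP_TIME = 2.0  # seconds between ticks (made up for the sketch)


def do_work():
    time.sleep(0.5)  # stand-in for a slow loop step


for _ in range(3):
    start_time = time.time()
    do_work()
    time.sleep(max(0, start_time + LOOP_TIME - time.time()))
```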
diff --git a/src/documents/migrations/0020_document_added.py b/src/documents/migrations/0020_document_added.py
new file mode 100644
index 000000000..dbddf80ae
--- /dev/null
+++ b/src/documents/migrations/0020_document_added.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.utils.timezone
+
+
+def set_added_time_to_created_time(apps, schema_editor):
+    Document = apps.get_model("documents", "Document")
+    for doc in Document.objects.all():
+        doc.added = doc.created
+        doc.save()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0019_add_consumer_user'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='added',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False),
+        ),
+        migrations.RunPython(set_added_time_to_created_time)
+    ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 245655117..7390c1d3c 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -236,6 +236,7 @@ class Document(models.Model):
         default=timezone.now, db_index=True)
     modified = models.DateTimeField(
         auto_now=True, editable=False, db_index=True)
+
     storage_type = models.CharField(
         max_length=11,
         choices=STORAGE_TYPES,
@@ -243,6 +244,9 @@ class Document(models.Model):
         editable=False
     )
 
+    added = models.DateTimeField(
+        default=timezone.now, editable=False, db_index=True)
+
     class Meta:
         ordering = ("correspondent", "title")
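One note on the data migration above: it backfills `added` row by row, which issues a query per document. An equivalent single-`UPDATE` variant using an `F()` expression would look like this; illustrative only, not what the patch does:

```python
# Alternative backfill for the RunPython step: one UPDATE statement
# (added = created) instead of N individual saves. Same app and model
# labels as the migration above.
from django.db.models import F


def set_added_time_to_created_time(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    Document.objects.update(added=F("created"))
```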
diff --git a/src/documents/templates/admin/documents/document/change_list_results.html b/src/documents/templates/admin/documents/document/change_list_results.html
index cd5f88f0a..b33cd2927 100644
--- a/src/documents/templates/admin/documents/document/change_list_results.html
+++ b/src/documents/templates/admin/documents/document/change_list_results.html
@@ -29,13 +29,32 @@
       .result .header {
         padding: 5px;
         background-color: #79AEC8;
+        position: relative;
       }
-      .result .header .checkbox{
+      .result .header .checkbox {
        width: 5%;
         float: left;
+        position: absolute;
+        z-index: 2;
       }
       .result .header .info {
         margin-left: 10%;
+        position: relative;
+      }
+      .headerLink {
+        cursor: pointer;
+        opacity: 0;
+        z-index: 1;
+        position: absolute;
+        top: 0;
+        left: 0;
+        width: 100%;
+        height: 100%;
+      }
+      .header > a {
+        z-index: 2;
+        margin-left: 10%;
+        position: relative;
       }
       .result .header a, .result a.tag {
@@ -129,24 +148,36 @@
           {# 0: Checkbox #}
           {# 1: Title #}
           {# 2: Date #}
-          {# 3: Image #}
-          {# 4: Correspondent #}
-          {# 5: Tags #}
-          {# 6: Document edit url #}
+          {# 3: Added #}
+          {# 4: Image #}
+          {# 5: Correspondent #}
+          {# 6: Tags #}
+          {# 7: Document edit url #}
 
         {% endfor %}
 
       {% endfor %}
diff --git a/src/documents/templatetags/hacks.py b/src/documents/templatetags/hacks.py
index 4faf1783f..0c0a0e099 100644
--- a/src/documents/templatetags/hacks.py
+++ b/src/documents/templatetags/hacks.py
@@ -38,6 +38,6 @@ def add_doc_edit_url(result):
     """
     title = result[1]
     match = re.search(EXTRACT_URL, title)
-    edit_doc_url = match[1]
+    edit_doc_url = match.group(1)
     result.append(edit_doc_url)
     return result
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index ccafe956d..6dd0b6419 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 # This is where Paperless will look for PDFs to index
 CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
 
+# (This setting is ignored on Linux, where inotify is used instead of a
+# polling loop.)
 # The number of seconds that Paperless will wait between checking
 # CONSUMPTION_DIR.  If you tend to write documents to this directory very
 # slowly, you may want to use a higher value than the default.
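Finally, the platform-specific requirement (`inotify_simple==1.1.7; sys_platform == 'linux'`) and the `use_inotify` switch in `document_consumer.py` rely on one pattern: attempt the optional import once, then probe `sys.modules` to pick a code path. A standalone sketch:

```python
# Optional-dependency pattern: the import either succeeds once or the
# feature is disabled; sys.modules records which happened.
import sys

try:
    from inotify_simple import INotify, flags  # Linux-only extra
except ImportError:
    pass  # fall back to polling


def inotify_available():
    # True only if the import above actually succeeded
    return "inotify_simple" in sys.modules


if inotify_available():
    print("using inotify")
else:
    print("falling back to the polling loop")
```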