mirror of https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00

Merge branch 'master' into mcronce-disable_encryption
commit c9f35a7da2
@@ -14,3 +14,25 @@
 # You can change the default user and group id to a custom one
 # USERMAP_UID=1000
 # USERMAP_GID=1000
+
+###############################################################################
+#### Mail Consumption ####
+###############################################################################
+
+# These values are required if you want paperless to check a particular email
+# box every 10 minutes and attempt to consume documents from there. If you
+# don't define a HOST, mail checking will just be disabled.
+# Don't use quotes after = or it will crash your docker
+# PAPERLESS_CONSUME_MAIL_HOST=
+# PAPERLESS_CONSUME_MAIL_PORT=
+# PAPERLESS_CONSUME_MAIL_USER=
+# PAPERLESS_CONSUME_MAIL_PASS=
+
+# Override the default IMAP inbox here. If it's not set, Paperless defaults to
+# INBOX.
+# PAPERLESS_CONSUME_MAIL_INBOX=INBOX
+
+# Any email sent to the target account that does not contain this text will be
+# ignored. Mail checking won't work without this.
+# PAPERLESS_EMAIL_SECRET=
+
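The mail settings above are plain environment variables. A minimal sketch of how a fetcher might read them (the variable names come from the example file above; the lookup logic itself is illustrative, not Paperless's actual code):

```python
import os

# Clear the variables first so the example is deterministic.
# os.getenv returns the default when a variable is unset, which is how
# the inbox falls back to "INBOX" and how an unset HOST disables mail
# checking entirely.
for var in ("PAPERLESS_CONSUME_MAIL_HOST", "PAPERLESS_CONSUME_MAIL_INBOX"):
    os.environ.pop(var, None)

host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")        # None -> disabled
inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
mail_enabled = bool(host)
```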
@@ -62,6 +62,8 @@ encryption too, you only need to do two things:
 entrypoint and fixed it with some very creative Bash skills: `#352`_.
 * You can now use the search field to find documents by tag thanks to
   `thinkjk`_'s *first ever issue*: `#354`_.
+* Inotify is now being used to detect additions to the consume directory thanks
+  to some excellent work from `erikarvstedt`_ on `#351`_
 
 1.3.0
 =====
@@ -491,6 +493,7 @@ encryption too, you only need to do two things:
 .. _#253: https://github.com/danielquinn/paperless/issues/253
 .. _#323: https://github.com/danielquinn/paperless/issues/323
 .. _#344: https://github.com/danielquinn/paperless/pull/344
+.. _#351: https://github.com/danielquinn/paperless/pull/351
 .. _#352: https://github.com/danielquinn/paperless/pull/352
 .. _#354: https://github.com/danielquinn/paperless/issues/354
 
@@ -49,17 +49,18 @@ The Consumer
 ------------
 
 The consumer script runs in an infinite loop, constantly looking at a directory
-for PDF files to parse and index. The process is pretty straightforward:
+for documents to parse and index. The process is pretty straightforward:
 
-1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not,
-   wait 10 seconds and try again.
-2. Parse the PDF with Tesseract
+1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
+   If not, wait 10 seconds and try again. On Linux, new documents are detected
+   instantly via inotify, so there's no waiting involved.
+2. Parse the document with Tesseract
 3. Create a new record in the database with the OCR'd text
 4. Attempt to automatically assign document attributes by doing some guesswork.
    Read up on the :ref:`guesswork documentation<guesswork>` for more
    information about this process.
-5. Encrypt the PDF and store it in the ``media`` directory under
-   ``documents/pdf``.
+5. Encrypt the document and store it in the ``media`` directory under
+   ``documents/originals``.
 6. Go to #1.
 
 
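The numbered steps in the documentation above can be sketched as a loop. This is a simplified illustration, not the actual consumer: the `consume` callback stands in for steps 2-5, and `max_passes` exists only so the sketch can terminate.

```python
import os
import time

def consume_loop(consumption_dir, consume, poll_interval=10, max_passes=None):
    # Step 1: look for documents in the consumption directory.
    # Steps 2-5 (parse, store, encrypt) are delegated to `consume`.
    # Step 6: repeat.
    passes = 0
    while max_passes is None or passes < max_passes:
        for entry in os.scandir(consumption_dir):
            if entry.is_file():
                consume(entry.path)
        passes += 1
        if max_passes is None or passes < max_passes:
            time.sleep(poll_interval)  # wait and try again
```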
@@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script:
 
     $ /path/to/paperless/src/manage.py document_consumer
 
-This starts the service that will run in a loop, consuming PDF files as they
-appear in ``CONSUMPTION_DIR``.
+This starts the service that will consume documents as they appear in
+``CONSUMPTION_DIR``.
 
 Note that this command runs continuously, so exiting it will mean your webserver
 disappears. If you want to run this full-time (which is kind of the point)
@@ -97,8 +98,8 @@ The Exporter
 ------------
 
 Tired of fiddling with Paperless, or just want to do something stupid and are
-afraid of accidentally damaging your files? You can export all of your PDFs
-into neatly named, dated, and unencrypted.
+afraid of accidentally damaging your files? You can export all of your
+documents into neatly named, dated, and unencrypted files.
 
 
 .. _utilities-exporter-howto:
@@ -112,10 +113,10 @@ This too is done via the ``manage.py`` script:
 
     $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
 
-This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
-to do with as you please. The files are accompanied with a special file,
-``manifest.json`` which can be used to
-:ref:`import the files <utilities-importer>` at a later date if you wish.
+This will dump all of your unencrypted documents into ``/path/to/somewhere``
+for you to do with as you please. The files are accompanied with a special
+file, ``manifest.json`` which can be used to :ref:`import the files
+<utilities-importer>` at a later date if you wish.
 
 
 .. _utilities-exporter-howto-docker:
@@ -165,6 +165,8 @@ PAPERLESS_EMAIL_SECRET=""
 #PAPERLESS_CONVERT_DENSITY=300
 
 
+# (This setting is ignored on Linux where inotify is used instead of a
+# polling loop.)
 # The number of seconds that Paperless will wait between checking
 # PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory
 # rarely, you may want to use a higher value than the default (10).
@@ -20,6 +20,7 @@ flake8==3.5.0
 fuzzywuzzy==0.15.0
 gunicorn==19.8.1
 idna==2.6
+inotify_simple==1.1.7; sys_platform == 'linux'
 langdetect==1.0.7
 mccabe==0.6.1
 more-itertools==4.1.0
@@ -125,7 +125,9 @@ class DocumentAdmin(CommonAdmin):
     }
 
     search_fields = ("correspondent__name", "title", "content", "tags__name")
-    list_display = ("title", "created", "thumbnail", "correspondent", "tags_")
+    readonly_fields = ("added",)
+    list_display = ("title", "created", "added", "thumbnail", "correspondent",
+                    "tags_")
     list_filter = ("tags", "correspondent", FinancialYearFilter,
                    MonthListFilter)
 
@@ -3,8 +3,10 @@ import hashlib
 import logging
 import os
 import re
+import time
 import uuid
 
+from operator import itemgetter
 from django.conf import settings
 from django.utils import timezone
 from paperless.db import GnuPG
@@ -32,21 +34,21 @@ class Consumer:
     5. Delete the document and image(s)
     """
 
+    # Files are considered ready for consumption if they have been unmodified
+    # for this duration
+    FILES_MIN_UNMODIFIED_DURATION = 0.5
+
     def __init__(self, consume=settings.CONSUMPTION_DIR,
                  scratch=settings.SCRATCH_DIR):
 
         self.logger = logging.getLogger(__name__)
         self.logging_group = None
 
-        self.stats = {}
         self._ignore = []
         self.consume = consume
         self.scratch = scratch
 
-        try:
-            os.makedirs(self.scratch)
-        except FileExistsError:
-            pass
+        os.makedirs(self.scratch, exist_ok=True)
 
         self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
         if settings.PASSPHRASE:
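The `exist_ok=True` change above replaces a try/except dance with behaviour `os.makedirs` has offered since Python 3.2. A quick standalone demonstration:

```python
import os
import tempfile

scratch = os.path.join(tempfile.mkdtemp(), "scratch")

# The first call creates the directory; the second is a no-op instead of
# raising FileExistsError, so the old try/except wrapper is unnecessary.
os.makedirs(scratch, exist_ok=True)
os.makedirs(scratch, exist_ok=True)
```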
@@ -80,83 +82,99 @@ class Consumer:
             "group": self.logging_group
         })
 
-    def run(self):
-
-        for doc in os.listdir(self.consume):
-
-            doc = os.path.join(self.consume, doc)
-
-            if not os.path.isfile(doc):
-                continue
-
-            if not re.match(FileInfo.REGEXES["title"], doc):
-                continue
-
-            if doc in self._ignore:
-                continue
-
-            if not self._is_ready(doc):
-                continue
-
-            if self._is_duplicate(doc):
-                self.log(
-                    "info",
-                    "Skipping {} as it appears to be a duplicate".format(doc)
-                )
-                self._ignore.append(doc)
-                continue
-
-            parser_class = self._get_parser_class(doc)
-            if not parser_class:
-                self.log(
-                    "error", "No parsers could be found for {}".format(doc))
-                self._ignore.append(doc)
-                continue
-
-            self.logging_group = uuid.uuid4()
-
-            self.log("info", "Consuming {}".format(doc))
-
-            document_consumption_started.send(
-                sender=self.__class__,
-                filename=doc,
-                logging_group=self.logging_group
-            )
-
-            parsed_document = parser_class(doc)
-
-            try:
-                thumbnail = parsed_document.get_thumbnail()
-                date = parsed_document.get_date()
-                document = self._store(
-                    parsed_document.get_text(),
-                    doc,
-                    thumbnail,
-                    date
-                )
-            except ParseError as e:
-
-                self._ignore.append(doc)
-                self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
-                parsed_document.cleanup()
-
-                continue
-
-            else:
-
-                parsed_document.cleanup()
-                self._cleanup_doc(doc)
-
-                self.log(
-                    "info",
-                    "Document {} consumption finished".format(document)
-                )
-
-                document_consumption_finished.send(
-                    sender=self.__class__,
-                    document=document,
-                    logging_group=self.logging_group
-                )
+    def consume_new_files(self):
+        """
+        Find non-ignored files in consumption dir and consume them if they have
+        been unmodified for FILES_MIN_UNMODIFIED_DURATION.
+        """
+        ignored_files = []
+        files = []
+        for entry in os.scandir(self.consume):
+            if entry.is_file():
+                file = (entry.path, entry.stat().st_mtime)
+                if file in self._ignore:
+                    ignored_files.append(file)
+                else:
+                    files.append(file)
+
+        if not files:
+            return
+
+        # Set _ignore to only include files that still exist.
+        # This keeps it from growing indefinitely.
+        self._ignore[:] = ignored_files
+
+        files_old_to_new = sorted(files, key=itemgetter(1))
+
+        time.sleep(self.FILES_MIN_UNMODIFIED_DURATION)
+
+        for file, mtime in files_old_to_new:
+            if mtime == os.path.getmtime(file):
+                # File has not been modified and can be consumed
+                if not self.try_consume_file(file):
+                    self._ignore.append((file, mtime))
+
+    def try_consume_file(self, file):
+        "Return True if file was consumed"
+
+        if not re.match(FileInfo.REGEXES["title"], file):
+            return False
+
+        doc = file
+
+        if self._is_duplicate(doc):
+            self.log(
+                "info",
+                "Skipping {} as it appears to be a duplicate".format(doc)
+            )
+            return False
+
+        parser_class = self._get_parser_class(doc)
+        if not parser_class:
+            self.log(
+                "error", "No parsers could be found for {}".format(doc))
+            return False
+
+        self.logging_group = uuid.uuid4()
+
+        self.log("info", "Consuming {}".format(doc))
+
+        document_consumption_started.send(
+            sender=self.__class__,
+            filename=doc,
+            logging_group=self.logging_group
+        )
+
+        parsed_document = parser_class(doc)
+
+        try:
+            thumbnail = parsed_document.get_thumbnail()
+            date = parsed_document.get_date()
+            document = self._store(
+                parsed_document.get_text(),
+                doc,
+                thumbnail,
+                date
+            )
+        except ParseError as e:
+            self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
+            parsed_document.cleanup()
+            return False
+        else:
+            parsed_document.cleanup()
+            self._cleanup_doc(doc)
+
+            self.log(
+                "info",
+                "Document {} consumption finished".format(document)
+            )
+
+            document_consumption_finished.send(
+                sender=self.__class__,
+                document=document,
+                logging_group=self.logging_group
+            )
+            return True
 
     def _get_parser_class(self, doc):
         """
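The core idea of `consume_new_files()` above can be isolated: snapshot each file's mtime, wait briefly, then keep only files whose mtime did not change, a cheap heuristic for "the uploader has finished writing". A standalone sketch of that check (not the actual Consumer code):

```python
import os
import time
from operator import itemgetter

FILES_MIN_UNMODIFIED_DURATION = 0.5

def stable_files(directory):
    # Snapshot (path, mtime) for every regular file.
    snapshot = [(e.path, e.stat().st_mtime)
                for e in os.scandir(directory) if e.is_file()]
    snapshot.sort(key=itemgetter(1))  # oldest first, like files_old_to_new
    # Wait, then keep only files whose mtime is unchanged.
    time.sleep(FILES_MIN_UNMODIFIED_DURATION)
    return [path for path, mtime in snapshot
            if os.path.getmtime(path) == mtime]
```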
@@ -232,22 +250,6 @@ class Consumer:
         self.log("debug", "Deleting document {}".format(doc))
         os.unlink(doc)
 
-    def _is_ready(self, doc):
-        """
-        Detect whether ``doc`` is ready to consume or if it's still being
-        written to by the uploader.
-        """
-
-        t = os.stat(doc).st_mtime
-
-        if self.stats.get(doc) == t:
-            del(self.stats[doc])
-            return True
-
-        self.stats[doc] = t
-
-        return False
-
     @staticmethod
     def _is_duplicate(doc):
         with open(doc, "rb") as f:
@@ -20,7 +20,7 @@ class MailFetcherError(Exception):
     pass
 
 
-class InvalidMessageError(Exception):
+class InvalidMessageError(MailFetcherError):
     pass
 
 
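Re-parenting `InvalidMessageError` onto `MailFetcherError` means a single `except MailFetcherError` handler now covers both error types. A standalone sketch of the effect:

```python
class MailFetcherError(Exception):
    pass

class InvalidMessageError(MailFetcherError):
    pass

def handle(exc):
    # A handler written against the base class also catches the subclass.
    try:
        raise exc
    except MailFetcherError as e:
        return "handled: " + type(e).__name__
```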
@@ -75,6 +75,9 @@ class Message(Loggable):
                 continue
 
             dispositions = content_disposition.strip().split(";")
+            if len(dispositions) < 2:
+                continue
+
             if not dispositions[0].lower() == "attachment" and \
                "filename" not in dispositions[1].lower():
                 continue
@@ -159,8 +162,10 @@ class MailFetcher(Loggable):
         self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
 
         self._enabled = bool(self._host)
+        if self._enabled and Message.SECRET is None:
+            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
 
-        self.last_checked = datetime.datetime.now()
+        self.last_checked = time.time()
         self.consume = consume
 
     def pull(self):
@@ -187,7 +192,7 @@ class MailFetcher(Loggable):
                 f.write(message.attachment.data)
                 os.utime(file_name, times=(t, t))
 
-        self.last_checked = datetime.datetime.now()
+        self.last_checked = time.time()
 
     def _get_messages(self):
 
@@ -205,7 +210,7 @@ class MailFetcher(Loggable):
             self._connection.close()
             self._connection.logout()
 
-        except Exception as e:
+        except MailFetcherError as e:
             self.log("error", str(e))
 
         return r
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import os
+import sys
 import time
 
 from django.conf import settings
@@ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError
 from ...consumer import Consumer, ConsumerError
 from ...mail import MailFetcher, MailFetcherError
 
+try:
+    from inotify_simple import INotify, flags
+except ImportError:
+    pass
+
 
 class Command(BaseCommand):
     """
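The guarded import above is the standard optional-dependency pattern; checking `sys.modules` afterwards (as the command does for `use_inotify`) avoids a second import attempt:

```python
import sys

try:
    from inotify_simple import INotify, flags  # Linux-only extra
except ImportError:
    pass  # fall back to the polling loop

# A successful import registers the module in sys.modules, so this
# boolean is True exactly when the dependency is usable.
inotify_available = "inotify_simple" in sys.modules
```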
@@ -53,13 +59,20 @@ class Command(BaseCommand):
             action="store_true",
             help="Run only once."
         )
+        parser.add_argument(
+            "--no-inotify",
+            action="store_true",
+            help="Don't use inotify, even if it's available."
+        )
 
     def handle(self, *args, **options):
 
         self.verbosity = options["verbosity"]
         directory = options["directory"]
         loop_time = options["loop_time"]
-        mail_delta = datetime.timedelta(minutes=options["mail_delta"])
+        mail_delta = options["mail_delta"] * 60
+        use_inotify = (not options["no_inotify"]
+                       and "inotify_simple" in sys.modules)
 
         try:
             self.file_consumer = Consumer(consume=directory)
|
|||||||
except (ConsumerError, MailFetcherError) as e:
|
except (ConsumerError, MailFetcherError) as e:
|
||||||
raise CommandError(e)
|
raise CommandError(e)
|
||||||
|
|
||||||
for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
|
for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
|
||||||
try:
|
os.makedirs(d, exist_ok=True)
|
||||||
os.makedirs(path)
|
|
||||||
except FileExistsError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
logging.getLogger(__name__).info(
|
logging.getLogger(__name__).info(
|
||||||
"Starting document consumer at {}".format(directory)
|
"Starting document consumer at {}{}".format(
|
||||||
|
directory,
|
||||||
|
" with inotify" if use_inotify else ""
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if options["oneshot"]:
|
if options["oneshot"]:
|
||||||
self.loop(mail_delta=mail_delta)
|
self.loop_step(mail_delta)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
while True:
|
if use_inotify:
|
||||||
self.loop(mail_delta=mail_delta)
|
self.loop_inotify(mail_delta)
|
||||||
time.sleep(loop_time)
|
else:
|
||||||
if self.verbosity > 1:
|
self.loop(loop_time, mail_delta)
|
||||||
print(".", int(time.time()))
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("Exiting")
|
print("Exiting")
|
||||||
|
|
||||||
def loop(self, mail_delta):
|
def loop(self, loop_time, mail_delta):
|
||||||
|
while True:
|
||||||
|
start_time = time.time()
|
||||||
|
if self.verbosity > 1:
|
||||||
|
print(".", int(start_time))
|
||||||
|
self.loop_step(mail_delta, start_time)
|
||||||
|
# Sleep until the start of the next loop step
|
||||||
|
time.sleep(max(0, start_time + loop_time - time.time()))
|
||||||
|
|
||||||
|
def loop_step(self, mail_delta, time_now=None):
|
||||||
|
|
||||||
# Occasionally fetch mail and store it to be consumed on the next loop
|
# Occasionally fetch mail and store it to be consumed on the next loop
|
||||||
# We fetch email when we first start up so that it is not necessary to
|
# We fetch email when we first start up so that it is not necessary to
|
||||||
# wait for 10 minutes after making changes to the config file.
|
# wait for 10 minutes after making changes to the config file.
|
||||||
delta = self.mail_fetcher.last_checked + mail_delta
|
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||||
if self.first_iteration or delta < datetime.datetime.now():
|
if self.first_iteration or time_now > next_mail_time:
|
||||||
self.first_iteration = False
|
self.first_iteration = False
|
||||||
self.mail_fetcher.pull()
|
self.mail_fetcher.pull()
|
||||||
|
|
||||||
# Consume whatever files we can.
|
self.file_consumer.consume_new_files()
|
||||||
# We have to run twice as the first run checks for file readiness
|
|
||||||
for i in range(2):
|
def loop_inotify(self, mail_delta):
|
||||||
self.file_consumer.run()
|
directory = self.file_consumer.consume
|
||||||
|
inotify = INotify()
|
||||||
|
inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO)
|
||||||
|
|
||||||
|
# Run initial mail fetch and consume all currently existing documents
|
||||||
|
self.loop_step(mail_delta)
|
||||||
|
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||||
|
|
||||||
|
while True:
|
||||||
|
# Consume documents until next_mail_time
|
||||||
|
while True:
|
||||||
|
delta = next_mail_time - time.time()
|
||||||
|
if delta > 0:
|
||||||
|
for event in inotify.read(timeout=delta):
|
||||||
|
file = os.path.join(directory, event.name)
|
||||||
|
if os.path.isfile(file):
|
||||||
|
self.file_consumer.try_consume_file(file)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
self.mail_fetcher.pull()
|
||||||
|
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||||
|
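The rewritten `loop()` above sleeps only for the remainder of the interval rather than a fixed `loop_time`, so iteration start times keep a steady cadence. Isolating that arithmetic:

```python
def remaining_sleep(start_time, loop_time, now):
    # Sleep only for the time left in the current slot; max(0, ...)
    # avoids a negative sleep when the work overran the slot.
    return max(0, start_time + loop_time - now)
```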
src/documents/migrations/0020_document_added.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.utils.timezone
+
+
+def set_added_time_to_created_time(apps, schema_editor):
+    Document = apps.get_model("documents", "Document")
+    for doc in Document.objects.all():
+        doc.added = doc.created
+        doc.save()
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0019_add_consumer_user'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='added',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False),
+        ),
+        migrations.RunPython(set_added_time_to_created_time)
+    ]
@@ -236,6 +236,7 @@ class Document(models.Model):
         default=timezone.now, db_index=True)
     modified = models.DateTimeField(
         auto_now=True, editable=False, db_index=True)
+
     storage_type = models.CharField(
         max_length=11,
         choices=STORAGE_TYPES,
@@ -243,6 +244,9 @@ class Document(models.Model):
         editable=False
     )
 
+    added = models.DateTimeField(
+        default=timezone.now, editable=False, db_index=True)
+
     class Meta:
         ordering = ("correspondent", "title")
 
@@ -29,13 +29,32 @@
 .result .header {
     padding: 5px;
     background-color: #79AEC8;
+    position: relative;
 }
-.result .header .checkbox{
+.result .header .checkbox {
     width: 5%;
     float: left;
+    position: absolute;
+    z-index: 2;
 }
 .result .header .info {
     margin-left: 10%;
+    position: relative;
+}
+.headerLink {
+    cursor: pointer;
+    opacity: 0;
+    z-index: 1;
+    position: absolute;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+}
+.header > a {
+    z-index: 2;
+    margin-left: 10%;
+    position: relative;
 }
 .result .header a,
 .result a.tag {
@@ -129,24 +148,36 @@
 {# 0: Checkbox #}
 {# 1: Title #}
 {# 2: Date #}
-{# 3: Image #}
-{# 4: Correspondent #}
-{# 5: Tags #}
-{# 6: Document edit url #}
+{# 3: Added #}
+{# 4: Image #}
+{# 5: Correspondent #}
+{# 6: Tags #}
+{# 7: Document edit url #}
 <div class="box">
   <div class="result">
-    <div class="header" onclick="location.href='{{ result.6 }}';" style="cursor: pointer;">
+    <div class="header">
+      {% comment %}
+        The purpose of 'headerLink' is to make the whole header
+        background clickable.
+        We use an onclick handler here instead of a native link ('<a>')
+        to allow selecting (and copying) the overlying doc title text
+        with the mouse cursor.
+        If the title link were layered upon another link ('<a>'), title text
+        selection would not be possible with mouse click + drag. Instead,
+        the underlying link would be dragged.
+      {% endcomment %}
+      <div class="headerLink" onclick="location.href='{{ result.7 }}';"></div>
       <div class="checkbox">{{ result.0 }}</div>
       <div class="info">
-        {{ result.4 }}<br />
-        {{ result.1 }}
+        {{ result.5 }}
       </div>
+      {{ result.1 }}
       <div style="clear: both;"></div>
     </div>
-    <div class="tags">{{ result.5 }}</div>
+    <div class="tags">{{ result.6 }}</div>
     <div class="date">{{ result.2 }}</div>
     <div style="clear: both;"></div>
-    <div class="image">{{ result.3 }}</div>
+    <div class="image">{{ result.4 }}</div>
   </div>
 </div>
 {% endfor %}
@@ -38,6 +38,6 @@ def add_doc_edit_url(result):
     """
     title = result[1]
     match = re.search(EXTRACT_URL, title)
-    edit_doc_url = match[1]
+    edit_doc_url = match.group(1)
     result.append(edit_doc_url)
     return result
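The `match[1]` to `match.group(1)` change above is a compatibility fix: regex match objects only support subscripting from Python 3.6 onward, while `.group(1)` works on older versions too. The two are equivalent; the pattern below is an illustrative stand-in, not the module's real `EXTRACT_URL`:

```python
import re

EXTRACT_URL = re.compile(r'href="(.*?)"')  # stand-in pattern for the example

match = re.search(EXTRACT_URL, '<a href="/fetch/doc/1">title</a>')

# Equivalent ways of reading the first capture group; .group(1) also
# works on Python versions where match objects are not subscriptable.
edit_doc_url = match.group(1)
```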
@@ -246,6 +246,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 # This is where Paperless will look for PDFs to index
 CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
 
+# (This setting is ignored on Linux where inotify is used instead of a
+# polling loop.)
 # The number of seconds that Paperless will wait between checking
 # CONSUMPTION_DIR. If you tend to write documents to this directory very
 # slowly, you may want to use a higher value than the default.