Merge branch 'master' of github.com:danielquinn/paperless

2025-09-16 21:55:37 -05:00 · 2018-02-01 12:37:29 +00:00
parent 88736ff867 3fcd1e2d7e
commit 5c59120c57
11 changed files with 375 additions and 282 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
 language: python
 before_install:
 - sudo apt-get update -qq
 - sudo apt-get install -qq libpoppler-cpp-dev
 sudo: false
 matrix:
--- a/75
+++ b/75
@@ -1,50 +1,47 @@
-FROM python:3.5
+FROM alpine:3.7
 MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
-# Install dependencies
+LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
-RUN apt-get update \
+      contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
-    && apt-get install -y --no-install-recommends \
+        Sven Fischer <git-dev@linux4tw.de>"
        sudo \
        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
    && rm -rf /var/lib/apt/lists/*
 # Install python dependencies
 RUN mkdir -p /usr/src/paperless
 WORKDIR /usr/src/paperless
 COPY requirements.txt /usr/src/paperless/
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
-RUN mkdir -p /usr/src/paperless/src
+COPY requirements.txt /usr/src/paperless/
 RUN mkdir -p /usr/src/paperless/data
 RUN mkdir -p /usr/src/paperless/media
 COPY src/ /usr/src/paperless/src/
 COPY data/ /usr/src/paperless/data/
 COPY media/ /usr/src/paperless/media/
 # Set consumption directory
 ENV PAPERLESS_CONSUMPTION_DIR /consume
 RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
 # Migrate database
 WORKDIR /usr/src/paperless/src
 RUN ./manage.py migrate
 # Create user
 RUN groupadd -g 1000 paperless \
    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
    && chown -Rh paperless:paperless /usr/src/paperless
 # Set export directory
 ENV PAPERLESS_EXPORT_DIR /export
 RUN mkdir -p $PAPERLESS_EXPORT_DIR
 # Setup entrypoint
 COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
 RUN chmod 755 /sbin/docker-entrypoint.sh
-# Mount volumes
+# Set export and consumption directories
 ENV PAPERLESS_EXPORT_DIR=/export \
    PAPERLESS_CONSUMPTION_DIR=/consume
 # Install dependencies
 RUN apk --no-cache --update add \
        python3 gnupg libmagic bash \
        sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
    apk --no-cache add --virtual .build-dependencies \
        python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
 # Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
    cd /usr/src/paperless && \
    pip3 install --no-cache-dir -r requirements.txt && \
 # Remove build dependencies
    apk del .build-dependencies && \
 # Create the consumption directory
    mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
 # Migrate database
    ./src/manage.py migrate && \
 # Create user
    addgroup -g 1000 paperless && \
    adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
    chown -Rh paperless:paperless /usr/src/paperless && \
    mkdir -p $PAPERLESS_EXPORT_DIR && \
 # Setup entrypoint
    chmod 755 /sbin/docker-entrypoint.sh
 WORKDIR /usr/src/paperless/src
 # Mount volumes and set Entrypoint
 VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
 ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
 CMD ["--help"]
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,6 @@ Paperless
 |Documentation|
 |Chat|
 |Travis|
 |Dependencies|
 Index and archive all of your scanned paper documents
@@ -28,12 +27,11 @@ scanner produces
 1. Buy a document scanner that can write to a place on your network.  If you
   need some inspiration, have a look at the `scanner recommendations`_ page.
   recommended by another user.
 2. Set it up to "scan to FTP" or something similar. It should be able to push
-   scanned images to a server without you having to do anything.  If your
+   scanned images to a server without you having to do anything.  Of course if
-   scanner doesn't know how to automatically upload the file somewhere, you can
+   your scanner doesn't know how to automatically upload the file somewhere,
-   always do that manually.  Paperless doesn't care how the documents get into
+   you can always do that manually.  Paperless doesn't care how the documents
-   its local consumption directory.
+   get into its local consumption directory.
 3. Have the target server run the Paperless consumption script to OCR the file
   and index it into a local database.
 4. Use the web frontend to sift through the database and find what you want.
@@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
   :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
 .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
   :target: https://travis-ci.org/danielquinn/paperless
 .. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
   :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -2,7 +2,7 @@ version: '2'
 services:
    webserver:
-        image: pitkley/paperless
+        build: ./
        ports:
            # You can adapt the port you want Paperless to listen on by
            # modifying the part before the `:`.
@@ -20,7 +20,7 @@ services:
        command: ["runserver", "--insecure", "0.0.0.0:8000"]
    consumer:
-        image: pitkley/paperless
+        build: ./
        volumes:
            - data:/usr/src/paperless/data
            - media:/usr/src/paperless/media
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,7 +1,22 @@
 Changelog
 #########
-* 1.1.0
+1.2.0
 =====
 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
  and `Pit`_.
 * `BastianPoe`_ has added the long-awaited feature to automatically skip the
  OCR step when the PDF already contains text. This can be overridden by
  setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
  in the environment.  Note that this also means that Paperless now requires
  ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
  ``pip install -r requirements.txt`` after the usual ``git pull`` to
  properly update.
 1.1.0
 =====
 * Fix for `#283`_, a redirect bug which broke interactions with
  paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
 * Addition of an optional new financial year filter, courtesy of
@@ -9,7 +24,9 @@ Changelog
 * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
  `Dan Panzarella`_
-* 1.0.0
+1.0.0
 =====
 * Upgrade to Django 1.11.  **You'll need to run
  ``pip install -r requirements.txt`` after the usual ``git pull`` to
  properly update**.
@@ -26,12 +43,16 @@ Changelog
 * Date fields in the admin are now expressed as HTML5 date fields thanks to
  `Lukas Winkler`_'s issue `#278`_
-* 0.8.0
+0.8.0
 =====
 * Paperless can now run in a subdirectory on a host (``/paperless``), rather
  than always running in the root (``/``) thanks to `maphy-psd`_'s work on
  `#255`_.
-* 0.7.0
+0.7.0
 =====
 * **Potentially breaking change**: As per `#235`_, Paperless will no longer
  automatically delete documents attached to correspondents when those
  correspondents are themselves deleted.  This was Django's default
@@ -41,7 +62,9 @@ Changelog
  properly.  Thanks to `ayounggun`_ for reporting this one and to
  `Kusti Skytén`_ for posting the correct solution in the Github issue.
-* 0.6.0
+0.6.0
 =====
 * Abandon the shared-secret trick we were using for the POST API in favour
  of BasicAuth or Django session.
 * Fix the POST API so it actually works.  `#236`_
@@ -52,7 +75,10 @@ Changelog
  will still work for a while, but you should change your config if you've
  been using the email polling feature.  Thanks to `Joshua Gilman`_ for all
  the help with this feature.
-* 0.5.0
+
 0.5.0
 =====
 * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
  thanks to `Jake Gysland`_'s patch `#220`_.
 * Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks
@@ -68,11 +94,15 @@ Changelog
  * Amended the documentation for better handling of systemd service files (`#229`_)
  * Amended the Django Admin configuration to have nice headers (`#230`_)
-* 0.4.1
+0.4.1
 =====
 * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
  all-caps suffixes like ``.PDF``
-* 0.4.0
+0.4.0
 =====
 * Introducing reminders.  See `#199`_ for more information, but the short
  explanation is that you can now attach simple notes & times to documents
  which are made available via the API.  Currently, the default API
@@ -80,7 +110,9 @@ Changelog
  `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
  like to make use of this feature in his project.
-* 0.3.6
+0.3.6
 =====
 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
  correspondent or the tags for a document.
 * The ``content`` field is now optional, to allow for the edge case of a
@@ -92,7 +124,9 @@ Changelog
  it with an environment variable, and you're good to go.  Proper
  documentation is on its way.
-* 0.3.5
+0.3.5
 =====
 * A serious facelift for the documents listing page wherein we drop the
  tabular layout in favour of a tiled interface.
 * Users can now configure the number of items per page.
@@ -101,7 +135,9 @@ Changelog
 * Fix for `#112`_: Added checks for binaries required for document
  consumption.
-* 0.3.4
+0.3.4
 =====
 * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
  Note that you *can* use Django Suit with Paperless, but only in a
  non-profit situation as their free license prohibits for-profit use.  As a
@@ -112,20 +148,28 @@ Changelog
 * BasicAuth support for document and thumbnail downloads, as well as the Push
  API thanks to @thomasbrueggemann.  See `#179`_.
-* 0.3.3
+0.3.3
 =====
 * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
 * Timezone, items per page, and default language are now all configurable,
  also thanks to @ekw.
-* 0.3.2
+0.3.2
 =====
 * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
  user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
  arise.
-* 0.3.1
+0.3.1
 =====
 * Added a default value for ``CONVERT_BINARY``
-* 0.3.0
+0.3.0
 =====
 * Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
 * Consumer loop time is now configurable for systems with slow writes.  Just
@@ -136,7 +180,8 @@ Changelog
  ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
  ``PAPERLESS_SHARED_SECRET`` respectively instead.
-* 0.2.0
+0.2.0
 =====
 * `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
@@ -163,7 +208,8 @@ Changelog
 * `#94`_: Restored support for changing the created date in the UI.  Thanks
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
-* 0.1.1
+0.1.1
 =====
 * Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
@@ -186,7 +232,8 @@ Changelog
 * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
  to be imported but made unavailable.
-* 0.1.0
+0.1.0
 =====
 * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
@@ -204,13 +251,15 @@ Changelog
 * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
 * Added tox with pep8 checking
-* 0.0.6
+0.0.6
 =====
 * Added support for parallel OCR (significant work from `Pit`_)
 * Sped up the language detection (significant work from `Pit`_)
 * Added simple logging
-* 0.0.5
+0.0.5
 =====
 * Added support for image files as documents (png, jpg, gif, tiff)
 * Added a crude means of HTTP POST for document imports
@@ -218,24 +267,28 @@ Changelog
 * Added a re-tagging utility
 * Documentation for the above as well as data migration
-* 0.0.4
+0.0.4
 =====
 * Added automated tagging based on keyword matching
 * Cleaned up the document listing page
 * Removed ``User`` and ``Group`` from the admin
 * Added ``pytz`` to the list of requirements
-* 0.0.3
+0.0.3
 =====
 * Added basic tagging
-* 0.0.2
+0.0.2
 =====
 * Added language detection
 * Added datestamps to ``document_exporter``.
 * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
-* 0.0.1
+0.0.1
 =====
 * Initial release
@@ -268,6 +321,8 @@ Changelog
 .. _Lukas Winkler: https://github.com/Findus23
 .. _chris-aeviator: https://github.com/chris-aeviator
 .. _Dan Panzarella: https://github.com/pzl
 .. _addadi: https://github.com/addadi
 .. _BastianPoe: https://github.com/BastianPoe
 .. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -317,3 +372,5 @@ Changelog
 .. _#283: https://github.com/danielquinn/paperless/issues/283
 .. _#256: https://github.com/danielquinn/paperless/pull/256
 .. _#285: https://github.com/danielquinn/paperless/pull/285
 .. _pipenv: https://docs.pipenv.org/
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -11,24 +11,27 @@ should work) that has the following software installed:
 * `Tesseract`_, plus its language files matching your document base.
 * `Imagemagick`_ version 6.7.5 or higher
 * `unpaper`_
 * `libpoppler-cpp-dev`_ PDF rendering library
 .. _Python3: https://python.org/
 .. _GNU Privacy Guard: https://gnupg.org
 .. _Tesseract: https://github.com/tesseract-ocr
 .. _Imagemagick: http://imagemagick.org/
 .. _unpaper: https://www.flameeyes.eu/projects/unpaper
 .. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
 Notably, you should confirm how you access your Python3 installation.  Many
-Linux distributions will install Python3 in parallel to Python2, using the names
+Linux distributions will install Python3 in parallel to Python2, using the
-``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
+names ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
-``pip``.  Running Paperless with Python2 will likely break things, so make sure that 
+``pip``.  Running Paperless with Python2 will likely break things, so make sure
-you're using the right version.
+that you're using the right version.
 For the sake of simplicity, ``python`` and ``pip`` are used everywhere to
 refer to their Python3 versions.
 In addition to the above, there are a number of Python requirements, all of
-which are listed in a file called ``requirements.txt`` in the project root directory.
+which are listed in a file called ``requirements.txt`` in the project root
 directory.
 If you're not working on a virtual environment (like Vagrant or Docker), you
 should probably be using a virtualenv, but that's your call.  The reasons why
@@ -39,12 +42,13 @@ probably figure that out before continuing.
 .. _requirements-apple:
-Apple-tastic Complications
+Problems with Imagemagick & PDFs
--------------------------
+--------------------------------
-Some users have `run into problems`_ with installing ImageMagick on Apple
+Some users have `run into problems`_ with getting ImageMagick to do its thing
-systems using HomeBrew.  The solution appears to be to install ghostscript as
+with PDFs.  Often this is the case with Apple systems using HomeBrew, but other
-well as ImageMagick:
+Linuxes have been a problem as well.  The solution appears to be to install
 ghostscript as well as ImageMagick:
 .. _run into problems: https://github.com/danielquinn/paperless/issues/25
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -175,7 +175,8 @@ Docker Method
   modified versions of the configuration files.
 4. Modify ``docker-compose.yml`` to your preferences, following the
   instructions in comments in the file. The only change that is a hard
-   requirement is to specify where the consumption directory should mount.
+   requirement is to specify where the consumption directory should
   mount. [#dockercomposeyml]_
 5. Modify ``docker-compose.env`` and adapt the following environment variables:
   ``PAPERLESS_PASSPHRASE``
@@ -192,7 +193,7 @@ Docker Method
     default English, set this parameter to a space separated list of
     three-letter language-codes after `ISO 639-2/T`_. For a list of available
     languages -- including their three letter codes -- see the
-     `Debian packagelist`_.
+     `Alpine packagelist`_.
   ``USERMAP_UID`` and ``USERMAP_GID``
     If you want to mount the consumption volume (directory ``/consume`` within
@@ -282,12 +283,17 @@ Docker Method
 .. _Docker: https://www.docker.com/
 .. _docker-compose: https://docs.docker.com/compose/install/
 .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
-.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
 .. [#compose] You of course don't have to use docker-compose, but it
   simplifies deployment immensely. If you know your way around Docker, feel
   free to tinker around without using compose!
 .. [#dockercomposeyml] If you're upgrading your docker-compose images from
   version 1.1.0 or earlier, you might need to change the
   ``image: pitkley/paperless`` directive in both the ``webserver`` and
   ``consumer`` sections of your ``docker-compose.yml`` file to ``build: ./``,
   as per the newer ``docker-compose.yml.example`` file.
 .. _setup-permanent:
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ python-dotenv>=0.6.2
 python-gnupg>=0.3.9
 pytz>=2016.10
 gunicorn==19.7.1
 pdftotext>=2.0.1
 # For the tests
 factory-boy
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -9,7 +9,7 @@ map_uidgid() {
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
-        groupmod -g "${USERMAP_GID}" paperless
+        addgroup -g "${USERMAP_GID}" paperless
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
 }
@@ -56,25 +56,24 @@ install_languages() {
        return
    fi
    # Update apt-lists
    apt-get update
    # Loop over languages to be installed
    for lang in "${langs[@]}"; do
-        pkg="tesseract-ocr-$lang"
+        pkg="tesseract-ocr-data-$lang"
-        if dpkg -s "$pkg" > /dev/null 2>&1; then
+
        # English is installed by default
        if [ "$lang" ==  "eng" ]; then
            continue
        fi
-        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
+        if apk info -e "$pkg" > /dev/null 2>&1; then
            continue
        fi
        if ! apk info "$pkg" > /dev/null 2>&1; then
            continue
        fi
-        apt-get install "$pkg"
+        apk --no-cache --update add "$pkg"
    done
    # Remove apt lists
    rm -rf /var/lib/apt/lists/*
 }
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 # Should every document be OCRed?  Switched on by setting the
 # PAPERLESS_OCR_ALWAYS environment variable to one of yes/y/1/t/true
 # (case-insensitive); any other value, or leaving it unset, keeps it off.
 OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes", "y", "1", "t", "true",
 )
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -3,6 +3,7 @@ import os
 import re
 import subprocess
 from multiprocessing.pool import Pool
 import pdftotext
 import langdetect
 import pyocr
@@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
    def get_thumbnail(self):
        """
@@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
        return os.path.join(self.tempdir, "convert-0000.png")
    def _is_ocred(self):
        """
        Report whether the document already carries a usable text layer.

        The text is pulled out of the PDF via ``get_text_from_pdf``; more than
        50 characters of it is taken to mean that no OCR is required.
        """
        embedded_text = get_text_from_pdf(self.document_path)
        return len(embedded_text) > 50
    def get_text(self):
        if not self.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            return get_text_from_pdf(self.document_path)
        images = self._get_greyscale()
@@ -212,3 +228,13 @@ def image_to_string(args):
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
 def get_text_from_pdf(pdf_file):
    """
    Extract the text embedded in a PDF file using ``pdftotext``.

    :param pdf_file: Path of the PDF file to read.
    :return: The strings yielded by the parsed PDF joined with newlines, or
        an empty string if ``pdftotext`` cannot parse the file.
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Return an empty string rather than False: callers such as
            # _is_ocred() call len() on the result, which raises TypeError
            # for a bool.  An empty string is still falsy.
            return ""
    return "\n".join(pdf)