Merge branch 'master' of github.com:danielquinn/paperless

2026-01-30 23:08:59 -06:00 · 2018-02-01 12:37:29 +00:00
parent 88736ff867 3fcd1e2d7e
commit 5c59120c57
11 changed files with 375 additions and 282 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
 language: python
 before_install:
 - sudo apt-get update -qq
 - sudo apt-get install -qq libpoppler-cpp-dev
 sudo: false
 matrix:
--- a/75
+++ b/75
@@ -1,50 +1,47 @@
-FROM python:3.5
+FROM alpine:3.7
 MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
-# Install dependencies
+LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
-RUN apt-get update \
+      contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
-    && apt-get install -y --no-install-recommends \
+        Sven Fischer <git-dev@linux4tw.de>"
        sudo \
        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
    && rm -rf /var/lib/apt/lists/*
 # Install python dependencies
 RUN mkdir -p /usr/src/paperless
 WORKDIR /usr/src/paperless
 COPY requirements.txt /usr/src/paperless/
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
-RUN mkdir -p /usr/src/paperless/src
+COPY requirements.txt /usr/src/paperless/
 RUN mkdir -p /usr/src/paperless/data
 RUN mkdir -p /usr/src/paperless/media
 COPY src/ /usr/src/paperless/src/
 COPY data/ /usr/src/paperless/data/
 COPY media/ /usr/src/paperless/media/
 # Set consumption directory
 ENV PAPERLESS_CONSUMPTION_DIR /consume
 RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
 # Migrate database
 WORKDIR /usr/src/paperless/src
 RUN ./manage.py migrate
 # Create user
 RUN groupadd -g 1000 paperless \
    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
    && chown -Rh paperless:paperless /usr/src/paperless
 # Set export directory
 ENV PAPERLESS_EXPORT_DIR /export
 RUN mkdir -p $PAPERLESS_EXPORT_DIR
 # Setup entrypoint
 COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
 RUN chmod 755 /sbin/docker-entrypoint.sh
-# Mount volumes
+# Set export and consumption directories
 ENV PAPERLESS_EXPORT_DIR=/export \
    PAPERLESS_CONSUMPTION_DIR=/consume
 # Install dependencies
 RUN apk --no-cache --update add \
        python3 gnupg libmagic bash \
        sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
    apk --no-cache add --virtual .build-dependencies \
        python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
 # Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
    cd /usr/src/paperless && \
    pip3 install --no-cache-dir -r requirements.txt && \
 # Remove build dependencies
    apk del .build-dependencies && \
 # Create the consumption directory
    mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
 # Migrate database
    ./src/manage.py migrate && \
 # Create user
    addgroup -g 1000 paperless && \
    adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
    chown -Rh paperless:paperless /usr/src/paperless && \
    mkdir -p $PAPERLESS_EXPORT_DIR && \
 # Setup entrypoint
    chmod 755 /sbin/docker-entrypoint.sh
 WORKDIR /usr/src/paperless/src
 # Mount volumes and set Entrypoint
 VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
 ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
 CMD ["--help"]
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,6 @@ Paperless
 |Documentation|
 |Chat|
 |Travis|
 |Dependencies|
 Index and archive all of your scanned paper documents
@@ -28,12 +27,11 @@ scanner produces
 1. Buy a document scanner that can write to a place on your network.  If you
   need some inspiration, have a look at the `scanner recommendations`_ page.
   recommended by another user.
 2. Set it up to "scan to FTP" or something similar. It should be able to push
-   scanned images to a server without you having to do anything.  If your
+   scanned images to a server without you having to do anything.  Of course if
-   scanner doesn't know how to automatically upload the file somewhere, you can
+   your scanner doesn't know how to automatically upload the file somewhere,
-   always do that manually.  Paperless doesn't care how the documents get into
+   you can always do that manually.  Paperless doesn't care how the documents
-   its local consumption directory.
+   get into its local consumption directory.
 3. Have the target server run the Paperless consumption script to OCR the file
   and index it into a local database.
 4. Use the web frontend to sift through the database and find what you want.
@@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
   :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
 .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
   :target: https://travis-ci.org/danielquinn/paperless
 .. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
   :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -2,7 +2,7 @@ version: '2'
 services:
    webserver:
-        image: pitkley/paperless
+        build: ./
        ports:
            # You can adapt the port you want Paperless to listen on by
            # modifying the part before the `:`.
@@ -20,7 +20,7 @@ services:
        command: ["runserver", "--insecure", "0.0.0.0:8000"]
    consumer:
-        image: pitkley/paperless
+        build: ./
        volumes:
            - data:/usr/src/paperless/data
            - media:/usr/src/paperless/media
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,243 +1,296 @@
 Changelog
 #########
-* 1.1.0
+1.2.0
-  * Fix for `#283`_, a redirect bug which broke interactions with
+=====
 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
  and `Pit`_.
 * `BastianPoe`_ has added the long-awaited feature to automatically skip the
  OCR step when the PDF already contains text. This can be overridden by
  setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
  in the environment.  Note that this also means that Paperless now requires
  ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
  ``pip install -r requirements.txt`` after the usual ``git pull`` to
  properly update.
 1.1.0
 =====
 * Fix for `#283`_, a redirect bug which broke interactions with
  paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
-  * Addition of an optional new financial year filter, courtesy of
+* Addition of an optional new financial year filter, courtesy of
  `David Martin`_ `#256`_
-  * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
+* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
  `Dan Panzarella`_
-* 1.0.0
+1.0.0
-  * Upgrade to Django 1.11.  **You'll need to run
+=====
 * Upgrade to Django 1.11.  **You'll need to run
  ``pip install -r requirements.txt`` after the usual ``git pull`` to
  properly update**.
-  * Replace the templatetag-based hack we had for document listing in favour of
+* Replace the templatetag-based hack we had for document listing in favour of
  a slightly less ugly solution in the form of another template tag with less
  copypasta.
-  * Support for multi-word-matches for auto-tagging thanks to an excellent
+* Support for multi-word-matches for auto-tagging thanks to an excellent
  patch from `ishirav`_ `#277`_.
-  * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
+* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
  the text and checkboxes under some resolutions `#272`_.
-  * Patched the Docker config to force the serving of static files.  Credit for
+* Patched the Docker config to force the serving of static files.  Credit for
  this one goes to `dev-rke`_ via `#248`_.
-  * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
+* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
-  * Date fields in the admin are now expressed as HTML5 date fields thanks to
+* Date fields in the admin are now expressed as HTML5 date fields thanks to
  `Lukas Winkler`_'s issue `#278`_
-* 0.8.0
+0.8.0
-  * Paperless can now run in a subdirectory on a host (``/paperless``), rather
+=====
 * Paperless can now run in a subdirectory on a host (``/paperless``), rather
  than always running in the root (``/``) thanks to `maphy-psd`_'s work on
  `#255`_.
-* 0.7.0
+0.7.0
-  * **Potentially breaking change**: As per `#235`_, Paperless will no longer
+=====
 * **Potentially breaking change**: As per `#235`_, Paperless will no longer
  automatically delete documents attached to correspondents when those
  correspondents are themselves deleted.  This was Django's default
  behaviour, but didn't make much sense in Paperless' case.  Thanks to
  `Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
-  * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
+* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
  properly.  Thanks to `ayounggun`_ for reporting this one and to
  `Kusti Skytén`_ for posting the correct solution in the Github issue.
-* 0.6.0
+0.6.0
-  * Abandon the shared-secret trick we were using for the POST API in favour
+=====
 * Abandon the shared-secret trick we were using for the POST API in favour
  of BasicAuth or Django session.
-  * Fix the POST API so it actually works.  `#236`_
+* Fix the POST API so it actually works.  `#236`_
-  * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
+* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
  as it was being used both for the API (now replaced with a normal auth)
  and for email polling.  Now that we're only using it for email, this
  variable has been renamed to ``PAPERLESS_EMAIL_SECRET``.  The old value
  will still work for a while, but you should change your config if you've
  been using the email polling feature.  Thanks to `Joshua Gilman`_ for all
  the help with this feature.
-* 0.5.0
+
-  * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
+0.5.0
 =====
 * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
  thanks to `Jake Gysland`_'s patch `#220`_.
-  * Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks
+* Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks
  to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
  this one.
-  * Updated the import/export scripts to include support for thumbnails.  Big
+* Updated the import/export scripts to include support for thumbnails.  Big
  thanks to `CkuT`_ for finding this shortcoming and doing the work to get
  it fixed in `#224`_.
-  * All of the following changes are thanks to `David Martin`_:
+* All of the following changes are thanks to `David Martin`_:
  * Bumped the dependency on pyocr to 0.4.7 so new users can make use of
  Tesseract 4 if they so prefer (`#226`_).
  * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
  * Amended the documentation for better handling of systemd service files (`#229`_)
  * Amended the Django Admin configuration to have nice headers (`#230`_)
-* 0.4.1
+0.4.1
-  * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
+=====
 * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
  all-caps suffixes like ``.PDF``
-* 0.4.0
+0.4.0
-  * Introducing reminders.  See `#199`_ for more information, but the short
+=====
 * Introducing reminders.  See `#199`_ for more information, but the short
  explanation is that you can now attach simple notes & times to documents
  which are made available via the API.  Currently, the default API
  (basically just the Django admin) doesn't really make use of this, but
  `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
  like to make use of this feature in his project.
-* 0.3.6
+0.3.6
-  * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
+=====
 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
  correspondent or the tags for a document.
-  * The ``content`` field is now optional, to allow for the edge case of a
+* The ``content`` field is now optional, to allow for the edge case of a
  purely graphical document.
-  * You can no longer add documents via the admin.  This never worked in the
+* You can no longer add documents via the admin.  This never worked in the
  first place, so all I've done here is remove the link to the broken form.
-  * The consumer code has been heavily refactored to support a pluggable
+* The consumer code has been heavily refactored to support a pluggable
  interface.  Install a paperless consumer via pip and tell paperless about
  it with an environment variable, and you're good to go.  Proper
  documentation is on its way.
-* 0.3.5
+0.3.5
-  * A serious facelift for the documents listing page wherein we drop the
+=====
 * A serious facelift for the documents listing page wherein we drop the
  tabular layout in favour of a tiled interface.
-  * Users can now configure the number of items per page.
+* Users can now configure the number of items per page.
-  * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
+* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
-  * Moved the dotenv loading to the top of settings.py
+* Moved the dotenv loading to the top of settings.py
-  * Fix for `#112`_: Added checks for binaries required for document
+* Fix for `#112`_: Added checks for binaries required for document
  consumption.
-* 0.3.4
+0.3.4
-  * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
+=====
 * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
  Note that you *can* use Django Suit with Paperless, but only in a
  non-profit situation as their free license prohibits for-profit use.  As a
  result, I can't bundle Suit with Paperless without conflicting with the
  GPL.  Further development will be done against the stock Django admin.
-  * I shrunk the thumbnails a little 'cause they were too big for me, even on
+* I shrunk the thumbnails a little 'cause they were too big for me, even on
  my high-DPI monitor.
-  * BasicAuth support for document and thumbnail downloads, as well as the Push
+* BasicAuth support for document and thumbnail downloads, as well as the Push
  API thanks to @thomasbrueggemann.  See `#179`_.
-* 0.3.3
+0.3.3
-  * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
+=====
-  * Timezone, items per page, and default language are now all configurable,
+
 * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
 * Timezone, items per page, and default language are now all configurable,
  also thanks to @ekw.
-* 0.3.2
+0.3.2
-  * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
+=====
 * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
  user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
  arise.
-* 0.3.1
+0.3.1
-  * Added a default value for ``CONVERT_BINARY``
+=====
-* 0.3.0
+* Added a default value for ``CONVERT_BINARY``
-  * Updated to using django-filter 1.x
+
-  * Added some system checks so new users aren't confused by misconfigurations.
+0.3.0
-  * Consumer loop time is now configurable for systems with slow writes.  Just
+=====
 * Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
 * Consumer loop time is now configurable for systems with slow writes.  Just
  set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds.  The default
  is 10.
-  * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
+* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
  ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``.  Please use
  ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
  ``PAPERLESS_SHARED_SECRET`` respectively instead.
-* 0.2.0
+0.2.0
 =====
-  * `#150`_: The media root is now a variable you can set in
+* `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
-  * `#148`_: The database location (sqlite) is now a variable you can set in
+* `#148`_: The database location (sqlite) is now a variable you can set in
  ``paperless.conf``.
-  * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
+* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
  URL.
-  * `#131`_: Document files are now automatically removed from disk when
+* `#131`_: Document files are now automatically removed from disk when
  they're deleted in Paperless.
-  * `#121`_: Fixed a bug where Paperless wasn't setting document creation time
+* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
  based on the file naming scheme.
-  * `#81`_: Added a hook to run an arbitrary script after every document is
+* `#81`_: Added a hook to run an arbitrary script after every document is
  consumed.
-  * `#98`_: Added optional environment variables for ImageMagick so that it
+* `#98`_: Added optional environment variables for ImageMagick so that it
  doesn't explode when handling Very Large Documents or when it's just
  running on a low-memory system.  Thanks to `Florian Harr`_ for his help on
  this one.
-  * `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to
+* `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to
  `Justin Snyman`_ for the pointers in the issue queue.
-  * Added support for guessing the date from the file name along with the
+* Added support for guessing the date from the file name along with the
  correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull
  request that I took forever to merge and to `Pit`_ for his efforts on the
  regex front.
-  * `#94`_: Restored support for changing the created date in the UI.  Thanks
+* `#94`_: Restored support for changing the created date in the UI.  Thanks
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
-* 0.1.1
+0.1.1
 =====
-  * Potentially **Breaking Change**: All references to "sender" in the code
+* Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
  property (one could quite reasonably scan a document before sending it to
  someone.)
-  * `#67`_: Rewrote the document exporter and added a new importer that allows
+* `#67`_: Rewrote the document exporter and added a new importer that allows
  for full metadata retention without depending on the file name and
  modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_,
  `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
  contributing conversation that led to this change.
-  * `#20`_: Added *unpaper* support to help in cleaning up the scanned image
+* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
  before it's OCR'd.  Thanks to `Pit`_ for this one.
-  * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
+* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
-  * `#68`_: Added support for using a proper config file at
+* `#68`_: Added support for using a proper config file at
  ``/etc/paperless.conf`` and modified the systemd unit files to use it.
-  * Refactored the Vagrant installation process to use environment variables
+* Refactored the Vagrant installation process to use environment variables
  rather than asking the user to modify ``settings.py``.
-  * `#44`_: Harmonise environment variable names with constant names.
+* `#44`_: Harmonise environment variable names with constant names.
-  * `#60`_: Setup logging to actually use the Python native logging framework.
+* `#60`_: Setup logging to actually use the Python native logging framework.
-  * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
+* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
  to be imported but made unavailable.
-* 0.1.0
+0.1.0
 =====
-  * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
+* Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
  who spearheaded this effort.
-  * A simple REST API is in place, but it should be considered unstable.
+* A simple REST API is in place, but it should be considered unstable.
-  * Cleaned up the consumer to use temporary directories instead of a single
+* Cleaned up the consumer to use temporary directories instead of a single
  scratch space.  (Thanks `Pit`_)
-  * Improved the efficiency of the consumer by parsing pages more intelligently
+* Improved the efficiency of the consumer by parsing pages more intelligently
  and introducing a threaded OCR process (thanks again `Pit`_).
-  * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
+* `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
-  * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
+* `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
  `Pit`_.
-  * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
+* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
-  * `#54`_: Documented the re-tagger (`zedster`_)
+* `#54`_: Documented the re-tagger (`zedster`_)
-  * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
+* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
-  * Added tox with pep8 checking
+* Added tox with pep8 checking
-* 0.0.6
+0.0.6
 =====
-  * Added support for parallel OCR (significant work from `Pit`_)
+* Added support for parallel OCR (significant work from `Pit`_)
-  * Sped up the language detection (significant work from `Pit`_)
+* Sped up the language detection (significant work from `Pit`_)
-  * Added simple logging
+* Added simple logging
-* 0.0.5
+0.0.5
 =====
-  * Added support for image files as documents (png, jpg, gif, tiff)
+* Added support for image files as documents (png, jpg, gif, tiff)
-  * Added a crude means of HTTP POST for document imports
+* Added a crude means of HTTP POST for document imports
-  * Added IMAP mail support
+* Added IMAP mail support
-  * Added a re-tagging utility
+* Added a re-tagging utility
-  * Documentation for the above as well as data migration
+* Documentation for the above as well as data migration
-* 0.0.4
+0.0.4
 =====
-  * Added automated tagging basted on keyword matching
+* Added automated tagging based on keyword matching
-  * Cleaned up the document listing page
+* Cleaned up the document listing page
-  * Removed ``User`` and ``Group`` from the admin
+* Removed ``User`` and ``Group`` from the admin
-  * Added ``pytz`` to the list of requirements
+* Added ``pytz`` to the list of requirements
-* 0.0.3
+0.0.3
 =====
-  * Added basic tagging
+* Added basic tagging
-* 0.0.2
+0.0.2
 =====
-  * Added language detection
+* Added language detection
-  * Added datestamps to ``document_exporter``.
+* Added datestamps to ``document_exporter``.
-  * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
+* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
-* 0.0.1
+0.0.1
 =====
-  * Initial release
+* Initial release
 .. _Brian Conn: https://github.com/TheConnMan
 .. _Christopher Luu: https://github.com/nuudles
@@ -268,6 +321,8 @@ Changelog
 .. _Lukas Winkler: https://github.com/Findus23
 .. _chris-aeviator: https://github.com/chris-aeviator
 .. _Dan Panzarella: https://github.com/pzl
 .. _addadi: https://github.com/addadi
 .. _BastianPoe: https://github.com/BastianPoe
 .. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -317,3 +372,5 @@ Changelog
 .. _#283: https://github.com/danielquinn/paperless/issues/283
 .. _#256: https://github.com/danielquinn/paperless/pull/256
 .. _#285: https://github.com/danielquinn/paperless/pull/285
 .. _pipenv: https://docs.pipenv.org/
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -11,24 +11,27 @@ should work) that has the following software installed:
 * `Tesseract`_, plus its language files matching your document base.
 * `Imagemagick`_ version 6.7.5 or higher
 * `unpaper`_
 * `libpoppler-cpp-dev`_ PDF rendering library
 .. _Python3: https://python.org/
 .. _GNU Privacy Guard: https://gnupg.org
 .. _Tesseract: https://github.com/tesseract-ocr
 .. _Imagemagick: http://imagemagick.org/
 .. _unpaper: https://www.flameeyes.eu/projects/unpaper
 .. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
 Notably, you should confirm how you access your Python3 installation.  Many
-Linux distributions will install Python3 in parallel to Python2, using the names
+Linux distributions will install Python3 in parallel to Python2, using the
-``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
+names ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
-``pip``.  Running Paperless with Python2 will likely break things, so make sure that 
+``pip``.  Running Paperless with Python2 will likely break things, so make sure
-you're using the right version.
+that you're using the right version.
 For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
 refer to their Python3 versions.
 In addition to the above, there are a number of Python requirements, all of
-which are listed in a file called ``requirements.txt`` in the project root directory.
+which are listed in a file called ``requirements.txt`` in the project root
 directory.
 If you're not working on a virtual environment (like Vagrant or Docker), you
 should probably be using a virtualenv, but that's your call.  The reasons why
@@ -39,12 +42,13 @@ probably figure that out before continuing.
 .. _requirements-apple:
-Apple-tastic Complications
+Problems with Imagemagick & PDFs
--------------------------
+--------------------------------
-Some users have `run into problems`_ with installing ImageMagick on Apple
+Some users have `run into problems`_ with getting ImageMagick to do its thing
-systems using HomeBrew.  The solution appears to be to install ghostscript as
+with PDFs.  Often this is the case with Apple systems using HomeBrew, but other
-well as ImageMagick:
+Linuxes have been a problem as well.  The solution appears to be to install
 ghostscript as well as ImageMagick:
 .. _run into problems: https://github.com/danielquinn/paperless/issues/25
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -175,7 +175,8 @@ Docker Method
   modified versions of the configuration files.
 4. Modify ``docker-compose.yml`` to your preferences, following the
   instructions in comments in the file. The only change that is a hard
-   requirement is to specify where the consumption directory should mount.
+   requirement is to specify where the consumption directory should
   mount. [#dockercomposeyml]_
 5. Modify ``docker-compose.env`` and adapt the following environment variables:
   ``PAPERLESS_PASSPHRASE``
@@ -192,7 +193,7 @@ Docker Method
     default English, set this parameter to a space separated list of
     three-letter language-codes after `ISO 639-2/T`_. For a list of available
     languages -- including their three letter codes -- see the
-     `Debian packagelist`_.
+     `Alpine packagelist`_.
   ``USERMAP_UID`` and ``USERMAP_GID``
     If you want to mount the consumption volume (directory ``/consume`` within
@@ -282,12 +283,17 @@ Docker Method
 .. _Docker: https://www.docker.com/
 .. _docker-compose: https://docs.docker.com/compose/install/
 .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
-.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
 .. [#compose] You of course don't have to use docker-compose, but it
   simplifies deployment immensely. If you know your way around Docker, feel
   free to tinker around without using compose!
 .. [#dockercomposeyml] If you're upgrading your docker-compose images from
   version 1.1.0 or earlier, you might need to change the
   ``image: pitkley/paperless`` directive in both the ``webserver`` and
   ``consumer`` sections of your ``docker-compose.yml`` file to ``build: ./``,
   as per the newer ``docker-compose.yml.example`` file.
 .. _setup-permanent:
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ python-dotenv>=0.6.2
 python-gnupg>=0.3.9
 pytz>=2016.10
 gunicorn==19.7.1
 pdftotext>=2.0.1
 # For the tests
 factory-boy
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -9,7 +9,7 @@ map_uidgid() {
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
-        groupmod -g "${USERMAP_GID}" paperless
+        addgroup -g "${USERMAP_GID}" paperless
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
 }
@@ -56,25 +56,24 @@ install_languages() {
        return
    fi
    # Update apt-lists
    apt-get update
    # Loop over languages to be installed
    for lang in "${langs[@]}"; do
-        pkg="tesseract-ocr-$lang"
+        pkg="tesseract-ocr-data-$lang"
-        if dpkg -s "$pkg" > /dev/null 2>&1; then
+
        # English is installed by default
        if [ "$lang" ==  "eng" ]; then
            continue
        fi
-        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
+        if apk info -e "$pkg" > /dev/null 2>&1; then
            continue
        fi
        if ! apk info "$pkg" > /dev/null 2>&1; then
            continue
        fi
-        apt-get install "$pkg"
+        apk --no-cache --update add "$pkg"
    done
    # Remove apt lists
    rm -rf /var/lib/apt/lists/*
 }
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 # OCR all documents?
 OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true"))
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -3,6 +3,7 @@ import os
 import re
 import subprocess
 from multiprocessing.pool import Pool
 import pdftotext
 import langdetect
 import pyocr
@@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
    def get_thumbnail(self):
        """
@@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
        return os.path.join(self.tempdir, "convert-0000.png")
    def _is_ocred(self):
        """
        Return True when the PDF at ``self.document_path`` already has an
        embedded text layer of more than 50 characters, False otherwise.
        """
        # Extract text from PDF using pdftotext.  get_text_from_pdf() reports
        # an unparsable PDF with a falsy, non-string value, so fall back to ""
        # here rather than letting len() raise TypeError on it.
        text = get_text_from_pdf(self.document_path) or ""
        # We assume that a PDF with more than 50 characters contains text
        # (so no OCR required)
        return len(text) > 50
    def get_text(self):
        if not self.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            return get_text_from_pdf(self.document_path)
        images = self._get_greyscale()
@@ -212,3 +228,13 @@ def image_to_string(args):
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
 def get_text_from_pdf(pdf_file):
    """
    Return the text pdftotext extracts from ``pdf_file``, with pages joined
    by newlines, or an empty string if pdftotext cannot parse the file.
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Stay falsy for existing truthiness checks, but return a string:
            # callers pass the result to len(), which raised TypeError when
            # this path returned False.
            return ""
    return "\n".join(pdf)