Merge branch 'master' of github.com:danielquinn/paperless

2026-01-08 21:24:26 -06:00 · 2018-02-01 12:37:29 +00:00
parent 88736ff867 3fcd1e2d7e
commit 5c59120c57
11 changed files with 375 additions and 282 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
 language: python
 before_install:
 - sudo apt-get update -qq
 - sudo apt-get install -qq libpoppler-cpp-dev
 sudo: false
 matrix:
--- a/75
+++ b/75
@@ -1,50 +1,47 @@
-FROM python:3.5
+FROM alpine:3.7
 MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
-# Install dependencies
+LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
-RUN apt-get update \
+      contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
-    && apt-get install -y --no-install-recommends \
+        Sven Fischer <git-dev@linux4tw.de>"
        sudo \
        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
    && rm -rf /var/lib/apt/lists/*
 # Install python dependencies
 RUN mkdir -p /usr/src/paperless
 WORKDIR /usr/src/paperless
 COPY requirements.txt /usr/src/paperless/
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
-RUN mkdir -p /usr/src/paperless/src
+COPY requirements.txt /usr/src/paperless/
 RUN mkdir -p /usr/src/paperless/data
 RUN mkdir -p /usr/src/paperless/media
 COPY src/ /usr/src/paperless/src/
 COPY data/ /usr/src/paperless/data/
 COPY media/ /usr/src/paperless/media/
 # Set consumption directory
 ENV PAPERLESS_CONSUMPTION_DIR /consume
 RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
 # Migrate database
 WORKDIR /usr/src/paperless/src
 RUN ./manage.py migrate
 # Create user
 RUN groupadd -g 1000 paperless \
    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
    && chown -Rh paperless:paperless /usr/src/paperless
 # Set export directory
 ENV PAPERLESS_EXPORT_DIR /export
 RUN mkdir -p $PAPERLESS_EXPORT_DIR
 # Setup entrypoint
 COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
 RUN chmod 755 /sbin/docker-entrypoint.sh
-# Mount volumes
+# Set export and consumption directories
 ENV PAPERLESS_EXPORT_DIR=/export \
    PAPERLESS_CONSUMPTION_DIR=/consume
 # Install dependencies
 RUN apk --no-cache --update add \
        python3 gnupg libmagic bash \
        sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
    apk --no-cache add --virtual .build-dependencies \
        python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
 # Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
    cd /usr/src/paperless && \
    pip3 install --no-cache-dir -r requirements.txt && \
 # Remove build dependencies
    apk del .build-dependencies && \
 # Create the consumption directory
    mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
 # Migrate database
    ./src/manage.py migrate && \
 # Create user
    addgroup -g 1000 paperless && \
    adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
    chown -Rh paperless:paperless /usr/src/paperless && \
    mkdir -p $PAPERLESS_EXPORT_DIR && \
 # Setup entrypoint
    chmod 755 /sbin/docker-entrypoint.sh
 WORKDIR /usr/src/paperless/src
 # Mount volumes and set Entrypoint
 VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
 ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
 CMD ["--help"]
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,6 @@ Paperless
 |Documentation|
 |Chat|
 |Travis|
 |Dependencies|
 Index and archive all of your scanned paper documents
@@ -28,12 +27,11 @@ scanner produces
 1. Buy a document scanner that can write to a place on your network.  If you
   need some inspiration, have a look at the `scanner recommendations`_ page.
   recommended by another user.
 2. Set it up to "scan to FTP" or something similar. It should be able to push
-   scanned images to a server without you having to do anything.  If your
+   scanned images to a server without you having to do anything.  Of course, if
-   scanner doesn't know how to automatically upload the file somewhere, you can
+   your scanner doesn't know how to automatically upload the file somewhere,
-   always do that manually.  Paperless doesn't care how the documents get into
+   you can always do that manually.  Paperless doesn't care how the documents
-   its local consumption directory.
+   get into its local consumption directory.
 3. Have the target server run the Paperless consumption script to OCR the file
   and index it into a local database.
 4. Use the web frontend to sift through the database and find what you want.
@@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
   :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
 .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
   :target: https://travis-ci.org/danielquinn/paperless
 .. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
   :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -2,7 +2,7 @@ version: '2'
 services:
    webserver:
-        image: pitkley/paperless
+        build: ./
        ports:
            # You can adapt the port you want Paperless to listen on by
            # modifying the part before the `:`.
@@ -20,7 +20,7 @@ services:
        command: ["runserver", "--insecure", "0.0.0.0:8000"]
    consumer:
-        image: pitkley/paperless
+        build: ./
        volumes:
            - data:/usr/src/paperless/data
            - media:/usr/src/paperless/media
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,243 +1,296 @@
 Changelog
 #########
-* 1.1.0
+1.2.0
-  * Fix for `#283`_, a redirect bug which broke interactions with
+=====
    paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
  * Addition of an optional new financial year filter, courtesy of
    `David Martin`_ `#256`_
  * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
    `Dan Panzarella`_
-* 1.0.0
+* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
-  * Upgrade to Django 1.11.  **You'll need to run
+  and `Pit`_.
-    ``pip install -r requirements.txt`` after the usual ``git pull`` to
+* `BastianPoe`_ has added the long-awaited feature to automatically skip the
-    properly update**.
+  OCR step when the PDF already contains text. This can be overridden by
-  * Replace the templatetag-based hack we had for document listing in favour of
+  setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
-    a slightly less ugly solution in the form of another template tag with less
+  in the environment.  Note that this also means that Paperless now requires
-    copypasta.
+  ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
-  * Support for multi-word-matches for auto-tagging thanks to an excellent
+  ``pip install -r requirements.txt`` after the usual ``git pull`` to
-    patch from `ishirav`_ `#277`_.
+  properly update.
  * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
    the text and checkboxes under some resolutions `#272`_.
  * Patched the Docker config to force the serving of static files.  Credit for
    this one goes to `dev-rke`_ via `#248`_.
  * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
  * Date fields in the admin are now expressed as HTML5 date fields thanks to
    `Lukas Winkler`_'s issue `#278`_
-* 0.8.0
+1.1.0
-  * Paperless can now run in a subdirectory on a host (``/paperless``), rather
+=====
    than always running in the root (``/``) thanks to `maphy-psd`_'s work on
    `#255`_.
-* 0.7.0
+* Fix for `#283`_, a redirect bug which broke interactions with
-  * **Potentially breaking change**: As per `#235`_, Paperless will no longer
+  paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
-    automatically delete documents attached to correspondents when those
+* Addition of an optional new financial year filter, courtesy of
-    correspondents are themselves deleted.  This was Django's default
+  `David Martin`_ `#256`_
-    behaviour, but didn't make much sense in Paperless' case.  Thanks to
+* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
-    `Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
+  `Dan Panzarella`_
  * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
    properly.  Thanks to `ayounggun`_ for reporting this one and to
    `Kusti Skytén`_ for posting the correct solution in the Github issue.
-* 0.6.0
+1.0.0
-  * Abandon the shared-secret trick we were using for the POST API in favour
+=====
    of BasicAuth or Django session.
  * Fix the POST API so it actually works.  `#236`_
  * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
    as it was being used both for the API (now replaced with a normal auth)
    and form email polling.  Now that we're only using it for email, this
    variable has been renamed to ``PAPERLESS_EMAIL_SECRET``.  The old value
    will still work for a while, but you should change your config if you've
    been using the email polling feature.  Thanks to `Joshua Gilman`_ for all
    the help with this feature.
 * 0.5.0
  * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
    thanks to `Jake Gysland`_'s patch `#220`_.
  * Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks
    to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
    this one.
  * Updated the import/export scripts to include support for thumbnails.  Big
    thanks to `CkuT`_ for finding this shortcoming and doing the work to get
    it fixed in `#224`_.
  * All of the following changes are thanks to `David Martin`_:
    * Bumped the dependency on pyocr to 0.4.7 so new users can make use of
    Tesseract 4 if they so prefer (`#226`_).
    * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
    * Amended the documentation for better handling of systemd service files (`#229`_)
    * Amended the Django Admin configuration to have nice headers (`#230`_)
-* 0.4.1
+* Upgrade to Django 1.11.  **You'll need to run
-  * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
+  ``pip install -r requirements.txt`` after the usual ``git pull`` to
-    all-caps suffixes like ``.PDF``
+  properly update**.
 * Replace the templatetag-based hack we had for document listing in favour of
  a slightly less ugly solution in the form of another template tag with less
  copypasta.
 * Support for multi-word-matches for auto-tagging thanks to an excellent
  patch from `ishirav`_ `#277`_.
 * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
  the text and checkboxes under some resolutions `#272`_.
 * Patched the Docker config to force the serving of static files.  Credit for
  this one goes to `dev-rke`_ via `#248`_.
 * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
 * Date fields in the admin are now expressed as HTML5 date fields thanks to
  `Lukas Winkler`_'s issue `#278`_
-* 0.4.0
+0.8.0
-  * Introducing reminders.  See `#199`_ for more information, but the short
+=====
    explanation is that you can now attach simple notes & times to documents
    which are made available via the API.  Currently, the default API
    (basically just the Django admin) doesn't really make use of this, but
    `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
    like to make use of this feature in his project.
-* 0.3.6
+* Paperless can now run in a subdirectory on a host (``/paperless``), rather
-  * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
+  than always running in the root (``/``) thanks to `maphy-psd`_'s work on
-    correspondent or the tags for a document.
+  `#255`_.
  * The ``content`` field is now optional, to allow for the edge case of a
    purely graphical document.
  * You can no longer add documents via the admin.  This never worked in the
    first place, so all I've done here is remove the link to the broken form.
  * The consumer code has been heavily refactored to support a pluggable
    interface.  Install a paperless consumer via pip and tell paperless about
    it with an environment variable, and you're good to go.  Proper
    documentation is on its way.
-* 0.3.5
+0.7.0
-  * A serious facelift for the documents listing page wherein we drop the
+=====
    tabular layout in favour of a tiled interface.
  * Users can now configure the number of items per page.
  * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
  * Moved the dotenv loading to the top of settings.py
  * Fix for `#112`_: Added checks for binaries required for document
    consumption.
-* 0.3.4
+* **Potentially breaking change**: As per `#235`_, Paperless will no longer
-  * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
+  automatically delete documents attached to correspondents when those
-    Note that you *can* use Django Suit with Paperless, but only in a
+  correspondents are themselves deleted.  This was Django's default
-    non-profit situation as their free license prohibits for-profit use.  As a
+  behaviour, but didn't make much sense in Paperless' case.  Thanks to
-    result, I can't bundle Suit with Paperless without conflicting with the
+  `Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
-    GPL.  Further development will be done against the stock Django admin.
+* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
-  * I shrunk the thumbnails a little 'cause they were too big for me, even on
+  properly.  Thanks to `ayounggun`_ for reporting this one and to
-    my high-DPI monitor.
+  `Kusti Skytén`_ for posting the correct solution in the Github issue.
  * BasicAuth support for document and thumbnail downloads, as well as the Push
    API thanks to @thomasbrueggemann.  See `#179`_.
-* 0.3.3
+0.6.0
-  * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
+=====
  * Timezone, items per page, and default language are now all configurable,
    also thanks to @ekw.
-* 0.3.2
+* Abandon the shared-secret trick we were using for the POST API in favour
-  * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
+  of BasicAuth or Django session.
-    user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
+* Fix the POST API so it actually works.  `#236`_
-    arise.
+* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
  as it was being used both for the API (now replaced with a normal auth)
  and for email polling.  Now that we're only using it for email, this
  variable has been renamed to ``PAPERLESS_EMAIL_SECRET``.  The old value
  will still work for a while, but you should change your config if you've
  been using the email polling feature.  Thanks to `Joshua Gilman`_ for all
  the help with this feature.
-* 0.3.1
+0.5.0
-  * Added a default value for ``CONVERT_BINARY``
+=====
-* 0.3.0
+* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
-  * Updated to using django-filter 1.x
+  thanks to `Jake Gysland`_'s patch `#220`_.
-  * Added some system checks so new users aren't confused by misconfigurations.
+* Modified the Dockerfile to prepare an export directory (`#212`_).  Thanks
-  * Consumer loop time is now configurable for systems with slow writes.  Just
+  to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
-    set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds.  The default
+  this one.
-    is 10.
+* Updated the import/export scripts to include support for thumbnails.  Big
-  * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
+  thanks to `CkuT`_ for finding this shortcoming and doing the work to get
-    ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``.  Please use
+  it fixed in `#224`_.
-    ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
+* All of the following changes are thanks to `David Martin`_:
-    ``PAPERLESS_SHARED_SECRET`` respectively instead.
+  * Bumped the dependency on pyocr to 0.4.7 so new users can make use of
  Tesseract 4 if they so prefer (`#226`_).
  * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
  * Amended the documentation for better handling of systemd service files (`#229`_)
  * Amended the Django Admin configuration to have nice headers (`#230`_)
-* 0.2.0
+0.4.1
 =====
-  * `#150`_: The media root is now a variable you can set in
+* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
-    ``paperless.conf``.
+  all-caps suffixes like ``.PDF``
  * `#148`_: The database location (sqlite) is now a variable you can set in
    ``paperless.conf``.
  * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
    URL.
  * `#131`_: Document files are now automatically removed from disk when
    they're deleted in Paperless.
  * `#121`_: Fixed a bug where Paperless wasn't setting document creation time
    based on the file naming scheme.
  * `#81`_: Added a hook to run an arbitrary script after every document is
    consumed.
  * `#98`_: Added optional environment variables for ImageMagick so that it
    doesn't explode when handling Very Large Documents or when it's just
    running on a low-memory system.  Thanks to `Florian Harr`_ for his help on
    this one.
  * `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to
    `Justin Snyman`_ for the pointers in the issue queue.
  * Added support for guessing the date from the file name along with the
    correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull
    request that I took forever to merge and to `Pit`_ for his efforts on the
    regex front.
  * `#94`_: Restored support for changing the created date in the UI.  Thanks
    to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
-* 0.1.1
+0.4.0
 =====
-  * Potentially **Breaking Change**: All references to "sender" in the code
+* Introducing reminders.  See `#199`_ for more information, but the short
-    have been renamed to "correspondent" to better reflect the nature of the
+  explanation is that you can now attach simple notes & times to documents
-    property (one could quite reasonably scan a document before sending it to
+  which are made available via the API.  Currently, the default API
-    someone.)
+  (basically just the Django admin) doesn't really make use of this, but
-  * `#67`_: Rewrote the document exporter and added a new importer that allows
+  `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
-    for full metadata retention without depending on the file name and
+  like to make use of this feature in his project.
    modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_,
    `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
    contributing conversation that lead to this change.
  * `#20`_: Added *unpaper* support to help in cleaning up the scanned image
    before it's OCR'd.  Thanks to `Pit`_ for this one.
  * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
  * `#68`_: Added support for using a proper config file at
    ``/etc/paperless.conf`` and modified the systemd unit files to use it.
  * Refactored the Vagrant installation process to use environment variables
    rather than asking the user to modify ``settings.py``.
  * `#44`_: Harmonise environment variable names with constant names.
  * `#60`_: Setup logging to actually use the Python native logging framework.
  * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
    to be imported but made unavailable.
-* 0.1.0
+0.3.6
 =====
-  * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
+* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
-    `Tikitu de Jager`_ for this one, and especially to `Pit`_
+  correspondent or the tags for a document.
-    who spearheadded this effort.
+* The ``content`` field is now optional, to allow for the edge case of a
-  * A simple REST API is in place, but it should be considered unstable.
+  purely graphical document.
-  * Cleaned up the consumer to use temporary directories instead of a single
+* You can no longer add documents via the admin.  This never worked in the
-    scratch space.  (Thanks `Pit`_)
+  first place, so all I've done here is remove the link to the broken form.
-  * Improved the efficiency of the consumer by parsing pages more intelligently
+* The consumer code has been heavily refactored to support a pluggable
-    and introducing a threaded OCR process (thanks again `Pit`_).
+  interface.  Install a paperless consumer via pip and tell paperless about
-  * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
+  it with an environment variable, and you're good to go.  Proper
-  * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
+  documentation is on its way.
    `Pit`_.
  * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
  * `#54`_: Documented the re-tagger (`zedster`_)
  * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
  * Added tox with pep8 checking
-* 0.0.6
+0.3.5
 =====
-  * Added support for parallel OCR (significant work from `Pit`_)
+* A serious facelift for the documents listing page wherein we drop the
-  * Sped up the language detection (significant work from `Pit`_)
+  tabular layout in favour of a tiled interface.
-  * Added simple logging
+* Users can now configure the number of items per page.
 * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
 * Moved the dotenv loading to the top of settings.py
 * Fix for `#112`_: Added checks for binaries required for document
  consumption.
-* 0.0.5
+0.3.4
 =====
-  * Added support for image files as documents (png, jpg, gif, tiff)
+* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
-  * Added a crude means of HTTP POST for document imports
+  Note that you *can* use Django Suit with Paperless, but only in a
-  * Added IMAP mail support
+  non-profit situation as their free license prohibits for-profit use.  As a
-  * Added a re-tagging utility
+  result, I can't bundle Suit with Paperless without conflicting with the
-  * Documentation for the above as well as data migration
+  GPL.  Further development will be done against the stock Django admin.
 * I shrunk the thumbnails a little 'cause they were too big for me, even on
  my high-DPI monitor.
 * BasicAuth support for document and thumbnail downloads, as well as the Push
  API thanks to @thomasbrueggemann.  See `#179`_.
-* 0.0.4
+0.3.3
 =====
-  * Added automated tagging basted on keyword matching
+* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
-  * Cleaned up the document listing page
+* Timezone, items per page, and default language are now all configurable,
-  * Removed ``User`` and ``Group`` from the admin
+  also thanks to @ekw.
  * Added ``pytz`` to the list of requirements
-* 0.0.3
+0.3.2
 =====
-  * Added basic tagging
+* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
  user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
  arise.
-* 0.0.2
+0.3.1
 =====
-  * Added language detection
+* Added a default value for ``CONVERT_BINARY``
  * Added datestamps to ``document_exporter``.
  * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
-* 0.0.1
+0.3.0
 =====
-  * Initial release
+* Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
 * Consumer loop time is now configurable for systems with slow writes.  Just
  set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds.  The default
  is 10.
 * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
  ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``.  Please use
  ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
  ``PAPERLESS_SHARED_SECRET`` respectively instead.
 0.2.0
 =====
 * `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
 * `#148`_: The database location (sqlite) is now a variable you can set in
  ``paperless.conf``.
 * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
  URL.
 * `#131`_: Document files are now automatically removed from disk when
  they're deleted in Paperless.
 * `#121`_: Fixed a bug where Paperless wasn't setting document creation time
  based on the file naming scheme.
 * `#81`_: Added a hook to run an arbitrary script after every document is
  consumed.
 * `#98`_: Added optional environment variables for ImageMagick so that it
  doesn't explode when handling Very Large Documents or when it's just
  running on a low-memory system.  Thanks to `Florian Harr`_ for his help on
  this one.
 * `#89`_ Ported the auto-tagging code to correspondents as well.  Thanks to
  `Justin Snyman`_ for the pointers in the issue queue.
 * Added support for guessing the date from the file name along with the
  correspondent, title, and tags.  Thanks to `Tikitu de Jager`_ for his pull
  request that I took forever to merge and to `Pit`_ for his efforts on the
  regex front.
 * `#94`_: Restored support for changing the created date in the UI.  Thanks
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
 0.1.1
 =====
 * Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
  property (one could quite reasonably scan a document before sending it to
  someone.)
 * `#67`_: Rewrote the document exporter and added a new importer that allows
  for full metadata retention without depending on the file name and
  modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_,
  `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
  contributing conversation that led to this change.
 * `#20`_: Added *unpaper* support to help in cleaning up the scanned image
  before it's OCR'd.  Thanks to `Pit`_ for this one.
 * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
 * `#68`_: Added support for using a proper config file at
  ``/etc/paperless.conf`` and modified the systemd unit files to use it.
 * Refactored the Vagrant installation process to use environment variables
  rather than asking the user to modify ``settings.py``.
 * `#44`_: Harmonise environment variable names with constant names.
 * `#60`_: Setup logging to actually use the Python native logging framework.
 * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
  to be imported but made unavailable.
 0.1.0
 =====
 * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
  who spearheaded this effort.
 * A simple REST API is in place, but it should be considered unstable.
 * Cleaned up the consumer to use temporary directories instead of a single
  scratch space.  (Thanks `Pit`_)
 * Improved the efficiency of the consumer by parsing pages more intelligently
  and introducing a threaded OCR process (thanks again `Pit`_).
 * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
 * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
  `Pit`_.
 * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
 * `#54`_: Documented the re-tagger (`zedster`_)
 * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
 * Added tox with pep8 checking
 0.0.6
 =====
 * Added support for parallel OCR (significant work from `Pit`_)
 * Sped up the language detection (significant work from `Pit`_)
 * Added simple logging
 0.0.5
 =====
 * Added support for image files as documents (png, jpg, gif, tiff)
 * Added a crude means of HTTP POST for document imports
 * Added IMAP mail support
 * Added a re-tagging utility
 * Documentation for the above as well as data migration
 0.0.4
 =====
 * Added automated tagging based on keyword matching
 * Cleaned up the document listing page
 * Removed ``User`` and ``Group`` from the admin
 * Added ``pytz`` to the list of requirements
 0.0.3
 =====
 * Added basic tagging
 0.0.2
 =====
 * Added language detection
 * Added datestamps to ``document_exporter``.
 * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
 0.0.1
 =====
 * Initial release
 .. _Brian Conn: https://github.com/TheConnMan
 .. _Christopher Luu: https://github.com/nuudles
@@ -268,6 +321,8 @@ Changelog
 .. _Lukas Winkler: https://github.com/Findus23
 .. _chris-aeviator: https://github.com/chris-aeviator
 .. _Dan Panzarella: https://github.com/pzl
 .. _addadi: https://github.com/addadi
 .. _BastianPoe: https://github.com/BastianPoe
 .. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -317,3 +372,5 @@ Changelog
 .. _#283: https://github.com/danielquinn/paperless/issues/283
 .. _#256: https://github.com/danielquinn/paperless/pull/256
 .. _#285: https://github.com/danielquinn/paperless/pull/285
 .. _pipenv: https://docs.pipenv.org/
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -11,24 +11,27 @@ should work) that has the following software installed:
 * `Tesseract`_, plus its language files matching your document base.
 * `Imagemagick`_ version 6.7.5 or higher
 * `unpaper`_
 * `libpoppler-cpp-dev`_ PDF rendering library
 .. _Python3: https://python.org/
 .. _GNU Privacy Guard: https://gnupg.org
 .. _Tesseract: https://github.com/tesseract-ocr
 .. _Imagemagick: http://imagemagick.org/
 .. _unpaper: https://www.flameeyes.eu/projects/unpaper
 .. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
 Notably, you should confirm how you access your Python3 installation.  Many
-Linux distributions will install Python3 in parallel to Python2, using the names
+Linux distributions will install Python3 in parallel to Python2, using the
-``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
+names ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and
-``pip``.  Running Paperless with Python2 will likely break things, so make sure that 
+``pip``.  Running Paperless with Python2 will likely break things, so make sure
-you're using the right version.
+that you're using the right version.
 For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
 refer to their Python3 versions.
 In addition to the above, there are a number of Python requirements, all of
-which are listed in a file called ``requirements.txt`` in the project root directory.
+which are listed in a file called ``requirements.txt`` in the project root
 directory.
 If you're not working on a virtual environment (like Vagrant or Docker), you
 should probably be using a virtualenv, but that's your call.  The reasons why
@@ -39,12 +42,13 @@ probably figure that out before continuing.
 .. _requirements-apple:
-Apple-tastic Complications
+Problems with Imagemagick & PDFs
--------------------------
+--------------------------------
-Some users have `run into problems`_ with installing ImageMagick on Apple
+Some users have `run into problems`_ with getting ImageMagick to do its thing
-systems using HomeBrew.  The solution appears to be to install ghostscript as
+with PDFs.  Often this is the case with Apple systems using HomeBrew, but some
-well as ImageMagick:
+Linux systems have been affected as well.  The solution appears to be to install
 ghostscript as well as ImageMagick:
 .. _run into problems: https://github.com/danielquinn/paperless/issues/25
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -175,7 +175,8 @@ Docker Method
   modified versions of the configuration files.
 4. Modify ``docker-compose.yml`` to your preferences, following the
   instructions in comments in the file. The only change that is a hard
-   requirement is to specify where the consumption directory should mount.
+   requirement is to specify where the consumption directory should
   mount. [#dockercomposeyml]_
 5. Modify ``docker-compose.env`` and adapt the following environment variables:
   ``PAPERLESS_PASSPHRASE``
@@ -192,7 +193,7 @@ Docker Method
     default English, set this parameter to a space separated list of
     three-letter language-codes after `ISO 639-2/T`_. For a list of available
     languages -- including their three letter codes -- see the
-     `Debian packagelist`_.
+     `Alpine packagelist`_.
   ``USERMAP_UID`` and ``USERMAP_GID``
     If you want to mount the consumption volume (directory ``/consume`` within
@@ -282,12 +283,17 @@ Docker Method
 .. _Docker: https://www.docker.com/
 .. _docker-compose: https://docs.docker.com/compose/install/
 .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
-.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
 .. [#compose] You of course don't have to use docker-compose, but it
   simplifies deployment immensely. If you know your way around Docker, feel
   free to tinker around without using compose!
 .. [#dockercomposeyml] If you're upgrading your docker-compose images from
   version 1.1.0 or earlier, you might need to change the
   ``image: pitkley/paperless`` directive in both the ``webserver`` and
   ``consumer`` sections of your ``docker-compose.yml`` file to ``build: ./``,
   as per the newer ``docker-compose.yml.example`` file.
 .. _setup-permanent:
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ python-dotenv>=0.6.2
 python-gnupg>=0.3.9
 pytz>=2016.10
 gunicorn==19.7.1
 pdftotext>=2.0.1
 # For the tests
 factory-boy
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -9,7 +9,7 @@ map_uidgid() {
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
-        groupmod -g "${USERMAP_GID}" paperless
+        addgroup -g "${USERMAP_GID}" paperless
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
 }
@@ -56,25 +56,24 @@ install_languages() {
        return
    fi
    # Update apt-lists
    apt-get update
    # Loop over languages to be installed
    for lang in "${langs[@]}"; do
-        pkg="tesseract-ocr-$lang"
+        pkg="tesseract-ocr-data-$lang"
-        if dpkg -s "$pkg" > /dev/null 2>&1; then
+
        # English is installed by default
        if [ "$lang" ==  "eng" ]; then
            continue
        fi
-        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
+        if apk info -e "$pkg" > /dev/null 2>&1; then
            continue
        fi
        if ! apk info "$pkg" > /dev/null 2>&1; then
            continue
        fi
-        apt-get install "$pkg"
+        apk --no-cache --update add "$pkg"
    done
    # Remove apt lists
    rm -rf /var/lib/apt/lists/*
 }
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 # Force OCR for every document, even when a PDF already contains text?
 # Enabled by setting PAPERLESS_OCR_ALWAYS to yes/y/1/t/true (any letter case).
 OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
     "yes", "y", "1", "t", "true"
 )
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -3,6 +3,7 @@ import os
 import re
 import subprocess
 from multiprocessing.pool import Pool
 import pdftotext
 import langdetect
 import pyocr
@@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
    def get_thumbnail(self):
        """
@@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
        return os.path.join(self.tempdir, "convert-0000.png")
    def _is_ocred(self):
        """
        Report whether the PDF already carries usable embedded text.

        Returns True when more than 50 characters can be extracted from the
        document, in which case OCR is considered unnecessary.
        """
        # Extract text from PDF using pdftotext
        text = get_text_from_pdf(self.document_path)
        # get_text_from_pdf() yields a falsy value when the PDF could not be
        # parsed; treat that as "no text" instead of calling len() on it.
        if not text:
            return False
        # We assume that a PDF with more than 50 characters contains text
        # (so no OCR required)
        return len(text) > 50
    def get_text(self):
        if not self.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            return get_text_from_pdf(self.document_path)
        images = self._get_greyscale()
@@ -212,3 +228,13 @@ def image_to_string(args):
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
 def get_text_from_pdf(pdf_file):
    """
    Extract the embedded text of a PDF using pdftotext.

    :param pdf_file: path of the PDF file to read.
    :return: the text of all pages joined by newlines, or an empty string
        if pdftotext could not parse the file.
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Return an empty string rather than False so callers can
            # always treat the result as text (e.g. call len() on it).
            return ""
    return "\n".join(pdf)