Merge branch 'master' of github.com:danielquinn/paperless

This commit is contained in:
Daniel Quinn 2018-02-01 12:37:29 +00:00
commit 5c59120c57
11 changed files with 375 additions and 282 deletions

View File

@ -1,5 +1,9 @@
language: python language: python
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev
sudo: false sudo: false
matrix: matrix:

View File

@ -1,50 +1,47 @@
FROM python:3.5 FROM alpine:3.7
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
# Install dependencies LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
RUN apt-get update \ contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
&& apt-get install -y --no-install-recommends \ Sven Fischer <git-dev@linux4tw.de>"
sudo \
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
&& rm -rf /var/lib/apt/lists/*
# Install python dependencies
RUN mkdir -p /usr/src/paperless
WORKDIR /usr/src/paperless
COPY requirements.txt /usr/src/paperless/
RUN pip install --no-cache-dir -r requirements.txt
# Copy application # Copy application
RUN mkdir -p /usr/src/paperless/src COPY requirements.txt /usr/src/paperless/
RUN mkdir -p /usr/src/paperless/data
RUN mkdir -p /usr/src/paperless/media
COPY src/ /usr/src/paperless/src/ COPY src/ /usr/src/paperless/src/
COPY data/ /usr/src/paperless/data/ COPY data/ /usr/src/paperless/data/
COPY media/ /usr/src/paperless/media/ COPY media/ /usr/src/paperless/media/
# Set consumption directory
ENV PAPERLESS_CONSUMPTION_DIR /consume
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
# Migrate database
WORKDIR /usr/src/paperless/src
RUN ./manage.py migrate
# Create user
RUN groupadd -g 1000 paperless \
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
&& chown -Rh paperless:paperless /usr/src/paperless
# Set export directory
ENV PAPERLESS_EXPORT_DIR /export
RUN mkdir -p $PAPERLESS_EXPORT_DIR
# Setup entrypoint
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
RUN chmod 755 /sbin/docker-entrypoint.sh
# Mount volumes # Set export and consumption directories
ENV PAPERLESS_EXPORT_DIR=/export \
PAPERLESS_CONSUMPTION_DIR=/consume
# Install dependencies
#
# Everything below happens in a single RUN so that the compiler toolchain
# (.build-dependencies) is installed and removed again within the same layer.
# `--no-cache` already fetches a fresh package index and leaves nothing behind
# in /var/cache/apk, so no separate `--update` flag is needed.
RUN apk add --no-cache \
      python3 gnupg libmagic bash \
      sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
    apk add --no-cache --virtual .build-dependencies \
      python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
# Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
# requirements.txt and ./src/manage.py below are resolved relative to this
# directory
    cd /usr/src/paperless && \
    pip3 install --no-cache-dir -r requirements.txt && \
# Remove build dependencies
    apk del .build-dependencies && \
# Create the consumption directory
    mkdir -p "$PAPERLESS_CONSUMPTION_DIR" && \
# Migrate database
    ./src/manage.py migrate && \
# Create user
    addgroup -g 1000 paperless && \
    adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
    chown -Rh paperless:paperless /usr/src/paperless && \
# Create the export directory
    mkdir -p "$PAPERLESS_EXPORT_DIR" && \
# Setup entrypoint
    chmod 755 /sbin/docker-entrypoint.sh
WORKDIR /usr/src/paperless/src
# Mount volumes and set Entrypoint
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"] ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["--help"] CMD ["--help"]

View File

@ -4,7 +4,6 @@ Paperless
|Documentation| |Documentation|
|Chat| |Chat|
|Travis| |Travis|
|Dependencies|
Index and archive all of your scanned paper documents Index and archive all of your scanned paper documents
@ -28,12 +27,11 @@ scanner produces
1. Buy a document scanner that can write to a place on your network. If you 1. Buy a document scanner that can write to a place on your network. If you
need some inspiration, have a look at the `scanner recommendations`_ page. need some inspiration, have a look at the `scanner recommendations`_ page.
recommended by another user.
2. Set it up to "scan to FTP" or something similar. It should be able to push 2. Set it up to "scan to FTP" or something similar. It should be able to push
scanned images to a server without you having to do anything. If your scanned images to a server without you having to do anything. Of course if
scanner doesn't know how to automatically upload the file somewhere, you can your scanner doesn't know how to automatically upload the file somewhere,
always do that manually. Paperless doesn't care how the documents get into you can always do that manually. Paperless doesn't care how the documents
its local consumption directory. get into its local consumption directory.
3. Have the target server run the Paperless consumption script to OCR the file 3. Have the target server run the Paperless consumption script to OCR the file
and index it into a local database. and index it into a local database.
4. Use the web frontend to sift through the database and find what you want. 4. Use the web frontend to sift through the database and find what you want.
@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
:target: https://travis-ci.org/danielquinn/paperless :target: https://travis-ci.org/danielquinn/paperless
.. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
:target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500

View File

@ -2,7 +2,7 @@ version: '2'
services: services:
webserver: webserver:
image: pitkley/paperless build: ./
ports: ports:
# You can adapt the port you want Paperless to listen on by # You can adapt the port you want Paperless to listen on by
# modifying the part before the `:`. # modifying the part before the `:`.
@ -20,7 +20,7 @@ services:
command: ["runserver", "--insecure", "0.0.0.0:8000"] command: ["runserver", "--insecure", "0.0.0.0:8000"]
consumer: consumer:
image: pitkley/paperless build: ./
volumes: volumes:
- data:/usr/src/paperless/data - data:/usr/src/paperless/data
- media:/usr/src/paperless/media - media:/usr/src/paperless/media

View File

@ -1,7 +1,22 @@
Changelog Changelog
######### #########
* 1.1.0 1.2.0
=====
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
and `Pit`_.
* `BastianPoe`_ has added the long-awaited feature to automatically skip the
OCR step when the PDF already contains text. This can be overridden by
setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
in the environment. Note that this also means that Paperless now requires
``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
``pip install -r requirements.txt`` after the usual ``git pull`` to
properly update.
1.1.0
=====
* Fix for `#283`_, a redirect bug which broke interactions with * Fix for `#283`_, a redirect bug which broke interactions with
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
* Addition of an optional new financial year filter, courtesy of * Addition of an optional new financial year filter, courtesy of
@ -9,7 +24,9 @@ Changelog
* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
`Dan Panzarella`_ `Dan Panzarella`_
* 1.0.0 1.0.0
=====
* Upgrade to Django 1.11. **You'll need to run * Upgrade to Django 1.11. **You'll need to run
``pip install -r requirements.txt`` after the usual ``git pull`` to ``pip install -r requirements.txt`` after the usual ``git pull`` to
properly update**. properly update**.
@ -26,12 +43,16 @@ Changelog
* Date fields in the admin are now expressed as HTML5 date fields thanks to * Date fields in the admin are now expressed as HTML5 date fields thanks to
`Lukas Winkler`_'s issue `#278`_ `Lukas Winkler`_'s issue `#278`_
* 0.8.0 0.8.0
=====
* Paperless can now run in a subdirectory on a host (``/paperless``), rather * Paperless can now run in a subdirectory on a host (``/paperless``), rather
than always running in the root (``/``) thanks to `maphy-psd`_'s work on than always running in the root (``/``) thanks to `maphy-psd`_'s work on
`#255`_. `#255`_.
* 0.7.0 0.7.0
=====
* **Potentially breaking change**: As per `#235`_, Paperless will no longer * **Potentially breaking change**: As per `#235`_, Paperless will no longer
automatically delete documents attached to correspondents when those automatically delete documents attached to correspondents when those
correspondents are themselves deleted. This was Django's default correspondents are themselves deleted. This was Django's default
@ -41,7 +62,9 @@ Changelog
properly. Thanks to `ayounggun`_ for reporting this one and to properly. Thanks to `ayounggun`_ for reporting this one and to
`Kusti Skytén`_ for posting the correct solution in the Github issue. `Kusti Skytén`_ for posting the correct solution in the Github issue.
* 0.6.0 0.6.0
=====
* Abandon the shared-secret trick we were using for the POST API in favour * Abandon the shared-secret trick we were using for the POST API in favour
of BasicAuth or Django session. of BasicAuth or Django session.
* Fix the POST API so it actually works. `#236`_ * Fix the POST API so it actually works. `#236`_
@ -52,7 +75,10 @@ Changelog
will still work for a while, but you should change your config if you've will still work for a while, but you should change your config if you've
been using the email polling feature. Thanks to `Joshua Gilman`_ for all been using the email polling feature. Thanks to `Joshua Gilman`_ for all
the help with this feature. the help with this feature.
* 0.5.0
0.5.0
=====
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
thanks to `Jake Gysland`_'s patch `#220`_. thanks to `Jake Gysland`_'s patch `#220`_.
* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks * Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
@ -68,11 +94,15 @@ Changelog
* Amended the documentation for better handling of systemd service files (`#229`_) * Amended the documentation for better handling of systemd service files (`#229`_)
* Amended the Django Admin configuration to have nice headers (`#230`_) * Amended the Django Admin configuration to have nice headers (`#230`_)
* 0.4.1 0.4.1
=====
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
all-caps suffixes like ``.PDF`` all-caps suffixes like ``.PDF``
* 0.4.0 0.4.0
=====
* Introducing reminders. See `#199`_ for more information, but the short * Introducing reminders. See `#199`_ for more information, but the short
explanation is that you can now attach simple notes & times to documents explanation is that you can now attach simple notes & times to documents
which are made available via the API. Currently, the default API which are made available via the API. Currently, the default API
@ -80,7 +110,9 @@ Changelog
`Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
like to make use of this feature in his project. like to make use of this feature in his project.
* 0.3.6 0.3.6
=====
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
correspondent or the tags for a document. correspondent or the tags for a document.
* The ``content`` field is now optional, to allow for the edge case of a * The ``content`` field is now optional, to allow for the edge case of a
@ -92,7 +124,9 @@ Changelog
it with an environment variable, and you're good to go. Proper it with an environment variable, and you're good to go. Proper
documentation is on its way. documentation is on its way.
* 0.3.5 0.3.5
=====
* A serious facelift for the documents listing page wherein we drop the * A serious facelift for the documents listing page wherein we drop the
tabular layout in favour of a tiled interface. tabular layout in favour of a tiled interface.
* Users can now configure the number of items per page. * Users can now configure the number of items per page.
@ -101,7 +135,9 @@ Changelog
* Fix for `#112`_: Added checks for binaries required for document * Fix for `#112`_: Added checks for binaries required for document
consumption. consumption.
* 0.3.4 0.3.4
=====
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3. * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
Note that you *can* use Django Suit with Paperless, but only in a Note that you *can* use Django Suit with Paperless, but only in a
non-profit situation as their free license prohibits for-profit use. As a non-profit situation as their free license prohibits for-profit use. As a
@ -112,20 +148,28 @@ Changelog
* BasicAuth support for document and thumbnail downloads, as well as the Push * BasicAuth support for document and thumbnail downloads, as well as the Push
API thanks to @thomasbrueggemann. See `#179`_. API thanks to @thomasbrueggemann. See `#179`_.
* 0.3.3 0.3.3
=====
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw! * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
* Timezone, items per page, and default language are now all configurable, * Timezone, items per page, and default language are now all configurable,
also thanks to @ekw. also thanks to @ekw.
* 0.3.2 0.3.2
=====
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
arise. arise.
* 0.3.1 0.3.1
=====
* Added a default value for ``CONVERT_BINARY`` * Added a default value for ``CONVERT_BINARY``
* 0.3.0 0.3.0
=====
* Updated to using django-filter 1.x * Updated to using django-filter 1.x
* Added some system checks so new users aren't confused by misconfigurations. * Added some system checks so new users aren't confused by misconfigurations.
* Consumer loop time is now configurable for systems with slow writes. Just * Consumer loop time is now configurable for systems with slow writes. Just
@ -136,7 +180,8 @@ Changelog
``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
``PAPERLESS_SHARED_SECRET`` respectively instead. ``PAPERLESS_SHARED_SECRET`` respectively instead.
* 0.2.0 0.2.0
=====
* `#150`_: The media root is now a variable you can set in * `#150`_: The media root is now a variable you can set in
``paperless.conf``. ``paperless.conf``.
@ -163,7 +208,8 @@ Changelog
* `#94`_: Restored support for changing the created date in the UI. Thanks * `#94`_: Restored support for changing the created date in the UI. Thanks
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this. to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
* 0.1.1 0.1.1
=====
* Potentially **Breaking Change**: All references to "sender" in the code * Potentially **Breaking Change**: All references to "sender" in the code
have been renamed to "correspondent" to better reflect the nature of the have been renamed to "correspondent" to better reflect the nature of the
@ -186,7 +232,8 @@ Changelog
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
to be imported but made unavailable. to be imported but made unavailable.
* 0.1.0 0.1.0
=====
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and * Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
`Tikitu de Jager`_ for this one, and especially to `Pit`_ `Tikitu de Jager`_ for this one, and especially to `Pit`_
@ -204,13 +251,15 @@ Changelog
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_) * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
* Added tox with pep8 checking * Added tox with pep8 checking
* 0.0.6 0.0.6
=====
* Added support for parallel OCR (significant work from `Pit`_) * Added support for parallel OCR (significant work from `Pit`_)
* Sped up the language detection (significant work from `Pit`_) * Sped up the language detection (significant work from `Pit`_)
* Added simple logging * Added simple logging
* 0.0.5 0.0.5
=====
* Added support for image files as documents (png, jpg, gif, tiff) * Added support for image files as documents (png, jpg, gif, tiff)
* Added a crude means of HTTP POST for document imports * Added a crude means of HTTP POST for document imports
@ -218,24 +267,28 @@ Changelog
* Added a re-tagging utility * Added a re-tagging utility
* Documentation for the above as well as data migration * Documentation for the above as well as data migration
* 0.0.4 0.0.4
=====
* Added automated tagging basted on keyword matching * Added automated tagging basted on keyword matching
* Cleaned up the document listing page * Cleaned up the document listing page
* Removed ``User`` and ``Group`` from the admin * Removed ``User`` and ``Group`` from the admin
* Added ``pytz`` to the list of requirements * Added ``pytz`` to the list of requirements
* 0.0.3 0.0.3
=====
* Added basic tagging * Added basic tagging
* 0.0.2 0.0.2
=====
* Added language detection * Added language detection
* Added datestamps to ``document_exporter``. * Added datestamps to ``document_exporter``.
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``. * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
* 0.0.1 0.0.1
=====
* Initial release * Initial release
@ -268,6 +321,8 @@ Changelog
.. _Lukas Winkler: https://github.com/Findus23 .. _Lukas Winkler: https://github.com/Findus23
.. _chris-aeviator: https://github.com/chris-aeviator .. _chris-aeviator: https://github.com/chris-aeviator
.. _Dan Panzarella: https://github.com/pzl .. _Dan Panzarella: https://github.com/pzl
.. _addadi: https://github.com/addadi
.. _BastianPoe: https://github.com/BastianPoe
.. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44 .. _#44: https://github.com/danielquinn/paperless/issues/44
@ -317,3 +372,5 @@ Changelog
.. _#283: https://github.com/danielquinn/paperless/issues/283 .. _#283: https://github.com/danielquinn/paperless/issues/283
.. _#256: https://github.com/danielquinn/paperless/pull/256 .. _#256: https://github.com/danielquinn/paperless/pull/256
.. _#285: https://github.com/danielquinn/paperless/pull/285 .. _#285: https://github.com/danielquinn/paperless/pull/285
.. _pipenv: https://docs.pipenv.org/

View File

@ -11,24 +11,27 @@ should work) that has the following software installed:
* `Tesseract`_, plus its language files matching your document base. * `Tesseract`_, plus its language files matching your document base.
* `Imagemagick`_ version 6.7.5 or higher * `Imagemagick`_ version 6.7.5 or higher
* `unpaper`_ * `unpaper`_
* `libpoppler-cpp-dev`_ PDF rendering library
.. _Python3: https://python.org/ .. _Python3: https://python.org/
.. _GNU Privacy Guard: https://gnupg.org .. _GNU Privacy Guard: https://gnupg.org
.. _Tesseract: https://github.com/tesseract-ocr .. _Tesseract: https://github.com/tesseract-ocr
.. _Imagemagick: http://imagemagick.org/ .. _Imagemagick: http://imagemagick.org/
.. _unpaper: https://www.flameeyes.eu/projects/unpaper .. _unpaper: https://www.flameeyes.eu/projects/unpaper
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
Notably, you should confirm how you access your Python3 installation. Many Notably, you should confirm how you access your Python3 installation. Many
Linux distributions will install Python3 in parallel to Python2, using the names Linux distributions will install Python3 in parallel to Python2, using the
``python3`` and ``python`` respectively. The same goes for ``pip3`` and names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
``pip``. Running Paperless with Python2 will likely break things, so make sure that ``pip``. Running Paperless with Python2 will likely break things, so make sure
you're using the right version. that you're using the right version.
For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to
refer to their Python3 versions. refer to their Python3 versions.
In addition to the above, there are a number of Python requirements, all of In addition to the above, there are a number of Python requirements, all of
which are listed in a file called ``requirements.txt`` in the project root directory. which are listed in a file called ``requirements.txt`` in the project root
directory.
If you're not working on a virtual environment (like Vagrant or Docker), you If you're not working on a virtual environment (like Vagrant or Docker), you
should probably be using a virtualenv, but that's your call. The reasons why should probably be using a virtualenv, but that's your call. The reasons why
@ -39,12 +42,13 @@ probably figure that out before continuing.
.. _requirements-apple: .. _requirements-apple:
Apple-tastic Complications Problems with Imagemagick & PDFs
-------------------------- --------------------------------
Some users have `run into problems`_ with installing ImageMagick on Apple Some users have `run into problems`_ with getting ImageMagick to do its thing
systems using HomeBrew. The solution appears to be to install ghostscript as with PDFs. Often this is the case with Apple systems using HomeBrew, but other
well as ImageMagick: Linuxes have been a problem as well. The solution appears to be to install
ghostscript as well as ImageMagick:
.. _run into problems: https://github.com/danielquinn/paperless/issues/25 .. _run into problems: https://github.com/danielquinn/paperless/issues/25

View File

@ -175,7 +175,8 @@ Docker Method
modified versions of the configuration files. modified versions of the configuration files.
4. Modify ``docker-compose.yml`` to your preferences, following the 4. Modify ``docker-compose.yml`` to your preferences, following the
instructions in comments in the file. The only change that is a hard instructions in comments in the file. The only change that is a hard
requirement is to specify where the consumption directory should mount. requirement is to specify where the consumption directory should
mount.\ [#dockercomposeyml]_
5. Modify ``docker-compose.env`` and adapt the following environment variables: 5. Modify ``docker-compose.env`` and adapt the following environment variables:
``PAPERLESS_PASSPHRASE`` ``PAPERLESS_PASSPHRASE``
@ -192,7 +193,7 @@ Docker Method
default English, set this parameter to a space separated list of default English, set this parameter to a space separated list of
three-letter language-codes after `ISO 639-2/T`_. For a list of available three-letter language-codes after `ISO 639-2/T`_. For a list of available
languages -- including their three letter codes -- see the languages -- including their three letter codes -- see the
`Debian packagelist`_. `Alpine packagelist`_.
``USERMAP_UID`` and ``USERMAP_GID`` ``USERMAP_UID`` and ``USERMAP_GID``
If you want to mount the consumption volume (directory ``/consume`` within If you want to mount the consumption volume (directory ``/consume`` within
@ -282,12 +283,17 @@ Docker Method
.. _Docker: https://www.docker.com/ .. _Docker: https://www.docker.com/
.. _docker-compose: https://docs.docker.com/compose/install/ .. _docker-compose: https://docs.docker.com/compose/install/
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- .. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
.. [#compose] You of course don't have to use docker-compose, but it .. [#compose] You of course don't have to use docker-compose, but it
simplifies deployment immensely. If you know your way around Docker, feel simplifies deployment immensely. If you know your way around Docker, feel
free to tinker around without using compose! free to tinker around without using compose!
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
version 1.1.0 or earlier, you might need to change the
``image: pitkley/paperless`` directive in both the ``webserver`` and
``consumer`` sections of your ``docker-compose.yml`` file to ``build: ./``,
as per the newer ``docker-compose.yml.example`` file.
.. _setup-permanent: .. _setup-permanent:

View File

@ -14,6 +14,7 @@ python-dotenv>=0.6.2
python-gnupg>=0.3.9 python-gnupg>=0.3.9
pytz>=2016.10 pytz>=2016.10
gunicorn==19.7.1 gunicorn==19.7.1
pdftotext>=2.0.1
# For the tests # For the tests
factory-boy factory-boy

View File

@ -9,7 +9,7 @@ map_uidgid() {
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
groupmod -g "${USERMAP_GID}" paperless addgroup -g "${USERMAP_GID}" paperless
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
fi fi
} }
@ -56,25 +56,24 @@ install_languages() {
return return
fi fi
# Update apt-lists
apt-get update
# Loop over languages to be installed # Loop over languages to be installed
for lang in "${langs[@]}"; do for lang in "${langs[@]}"; do
pkg="tesseract-ocr-$lang" pkg="tesseract-ocr-data-$lang"
if dpkg -s "$pkg" > /dev/null 2>&1; then
# English is installed by default
if [ "$lang" == "eng" ]; then
continue continue
fi fi
if ! apt-cache show "$pkg" > /dev/null 2>&1; then if apk info -e "$pkg" > /dev/null 2>&1; then
continue
fi
if ! apk info "$pkg" > /dev/null 2>&1; then
continue continue
fi fi
apt-get install "$pkg" apk --no-cache --update add "$pkg"
done done
# Remove apt lists
rm -rf /var/lib/apt/lists/*
} }

View File

@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR # The amount of threads to use for OCR
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# OCR all documents?  Read from the PAPERLESS_OCR_ALWAYS environment variable
# (default "NO"); any of yes/y/1/t/true, case-insensitively, turns it on.
OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes", "y", "1", "t", "true")
# If this is true, any failed attempts to OCR a PDF will result in the PDF # If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file # being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR. # will simply be left in the CONSUMPTION_DIR.

View File

@ -3,6 +3,7 @@ import os
import re import re
import subprocess import subprocess
from multiprocessing.pool import Pool from multiprocessing.pool import Pool
import pdftotext
import langdetect import langdetect
import pyocr import pyocr
@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def get_thumbnail(self): def get_thumbnail(self):
""" """
@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
return os.path.join(self.tempdir, "convert-0000.png") return os.path.join(self.tempdir, "convert-0000.png")
def _is_ocred(self):
    """
    Tell whether the document at ``self.document_path`` already contains
    enough extractable text that OCR can be skipped.

    Returns True when pdftotext yields more than 50 characters, False
    otherwise (including when no text could be extracted at all).
    """

    # Extract text from PDF using pdftotext
    text = get_text_from_pdf(self.document_path)

    # get_text_from_pdf() hands back a falsy value when the file could not
    # be parsed; calling len() on that would raise, so treat it as "no
    # text" and let the caller fall through to OCR.
    if not text:
        return False

    # We assume that a PDF with more than 50 characters contains text
    # (so no OCR required)
    return len(text) > 50
def get_text(self): def get_text(self):
if not self.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
return get_text_from_pdf(self.document_path)
images = self._get_greyscale() images = self._get_greyscale()
@ -212,3 +228,13 @@ def image_to_string(args):
except (TesseractError, OtherTesseractError): except (TesseractError, OtherTesseractError):
pass pass
return ocr.image_to_string(f, lang=lang) return ocr.image_to_string(f, lang=lang)
def get_text_from_pdf(pdf_file):
    """
    Extract the embedded text of a PDF using pdftotext.

    :param pdf_file: path of the PDF file to read
    :return: the text of all pages joined by newlines, or an empty string
        if pdftotext raises ``pdftotext.Error`` for the file
    """

    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Always return a str so callers can safely call len() on the
            # result or index it as document content.
            return ""

    return "\n".join(pdf)