mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'master' of github.com:danielquinn/paperless
This commit is contained in:
commit
5c59120c57
@ -1,5 +1,9 @@
|
||||
language: python
|
||||
|
||||
before_install:
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev
|
||||
|
||||
sudo: false
|
||||
|
||||
matrix:
|
||||
|
75
Dockerfile
75
Dockerfile
@ -1,50 +1,47 @@
|
||||
FROM python:3.5
|
||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
||||
FROM alpine:3.7
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
sudo \
|
||||
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install python dependencies
|
||||
RUN mkdir -p /usr/src/paperless
|
||||
WORKDIR /usr/src/paperless
|
||||
COPY requirements.txt /usr/src/paperless/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
|
||||
contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
|
||||
Sven Fischer <git-dev@linux4tw.de>"
|
||||
|
||||
# Copy application
|
||||
RUN mkdir -p /usr/src/paperless/src
|
||||
RUN mkdir -p /usr/src/paperless/data
|
||||
RUN mkdir -p /usr/src/paperless/media
|
||||
COPY requirements.txt /usr/src/paperless/
|
||||
COPY src/ /usr/src/paperless/src/
|
||||
COPY data/ /usr/src/paperless/data/
|
||||
COPY media/ /usr/src/paperless/media/
|
||||
|
||||
# Set consumption directory
|
||||
ENV PAPERLESS_CONSUMPTION_DIR /consume
|
||||
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
|
||||
|
||||
# Migrate database
|
||||
WORKDIR /usr/src/paperless/src
|
||||
RUN ./manage.py migrate
|
||||
|
||||
# Create user
|
||||
RUN groupadd -g 1000 paperless \
|
||||
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
|
||||
&& chown -Rh paperless:paperless /usr/src/paperless
|
||||
|
||||
# Set export directory
|
||||
ENV PAPERLESS_EXPORT_DIR /export
|
||||
RUN mkdir -p $PAPERLESS_EXPORT_DIR
|
||||
|
||||
# Setup entrypoint
|
||||
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||
RUN chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
# Mount volumes
|
||||
# Set export and consumption directories
|
||||
ENV PAPERLESS_EXPORT_DIR=/export \
|
||||
PAPERLESS_CONSUMPTION_DIR=/consume
|
||||
|
||||
# Install dependencies
|
||||
RUN apk --no-cache --update add \
|
||||
python3 gnupg libmagic bash \
|
||||
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
|
||||
apk --no-cache add --virtual .build-dependencies \
|
||||
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
|
||||
# Install python dependencies
|
||||
python3 -m ensurepip && \
|
||||
rm -r /usr/lib/python*/ensurepip && \
|
||||
cd /usr/src/paperless && \
|
||||
pip3 install --no-cache-dir -r requirements.txt && \
|
||||
# Remove build dependencies
|
||||
apk del .build-dependencies && \
|
||||
# Create the consumption directory
|
||||
mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
|
||||
# Migrate database
|
||||
./src/manage.py migrate && \
|
||||
# Create user
|
||||
addgroup -g 1000 paperless && \
|
||||
adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
|
||||
chown -Rh paperless:paperless /usr/src/paperless && \
|
||||
mkdir -p $PAPERLESS_EXPORT_DIR && \
|
||||
# Setup entrypoint
|
||||
chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
WORKDIR /usr/src/paperless/src
|
||||
# Mount volumes and set Entrypoint
|
||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
|
||||
|
||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||
CMD ["--help"]
|
||||
|
12
README.rst
12
README.rst
@ -4,7 +4,6 @@ Paperless
|
||||
|Documentation|
|
||||
|Chat|
|
||||
|Travis|
|
||||
|Dependencies|
|
||||
|
||||
Index and archive all of your scanned paper documents
|
||||
|
||||
@ -28,12 +27,11 @@ scanner produces
|
||||
|
||||
1. Buy a document scanner that can write to a place on your network. If you
|
||||
need some inspiration, have a look at the `scanner recommendations`_ page.
|
||||
recommended by another user.
|
||||
2. Set it up to "scan to FTP" or something similar. It should be able to push
|
||||
scanned images to a server without you having to do anything. If your
|
||||
scanner doesn't know how to automatically upload the file somewhere, you can
|
||||
always do that manually. Paperless doesn't care how the documents get into
|
||||
its local consumption directory.
|
||||
scanned images to a server without you having to do anything. Of course if
|
||||
your scanner doesn't know how to automatically upload the file somewhere,
|
||||
you can always do that manually. Paperless doesn't care how the documents
|
||||
get into its local consumption directory.
|
||||
3. Have the target server run the Paperless consumption script to OCR the file
|
||||
and index it into a local database.
|
||||
4. Use the web frontend to sift through the database and find what you want.
|
||||
@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
|
||||
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
|
||||
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
|
||||
:target: https://travis-ci.org/danielquinn/paperless
|
||||
.. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
|
||||
:target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
|
||||
|
@ -2,7 +2,7 @@ version: '2'
|
||||
|
||||
services:
|
||||
webserver:
|
||||
image: pitkley/paperless
|
||||
build: ./
|
||||
ports:
|
||||
# You can adapt the port you want Paperless to listen on by
|
||||
# modifying the part before the `:`.
|
||||
@ -20,7 +20,7 @@ services:
|
||||
command: ["runserver", "--insecure", "0.0.0.0:8000"]
|
||||
|
||||
consumer:
|
||||
image: pitkley/paperless
|
||||
build: ./
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
|
@ -1,243 +1,296 @@
|
||||
Changelog
|
||||
#########
|
||||
|
||||
* 1.1.0
|
||||
* Fix for `#283`_, a redirect bug which broke interactions with
|
||||
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
||||
* Addition of an optional new financial year filter, courtesy of
|
||||
`David Martin`_ `#256`_
|
||||
* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
|
||||
`Dan Panzarella`_
|
||||
1.2.0
|
||||
=====
|
||||
|
||||
* 1.0.0
|
||||
* Upgrade to Django 1.11. **You'll need to run
|
||||
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||
properly update**.
|
||||
* Replace the templatetag-based hack we had for document listing in favour of
|
||||
a slightly less ugly solution in the form of another template tag with less
|
||||
copypasta.
|
||||
* Support for multi-word-matches for auto-tagging thanks to an excellent
|
||||
patch from `ishirav`_ `#277`_.
|
||||
* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
|
||||
the text and checkboxes under some resolutions `#272`_.
|
||||
* Patched the Docker config to force the serving of static files. Credit for
|
||||
this one goes to `dev-rke`_ via `#248`_.
|
||||
* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
|
||||
* Date fields in the admin are now expressed as HTML5 date fields thanks to
|
||||
`Lukas Winkler`_'s issue `#278`_
|
||||
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
|
||||
and `Pit`_.
|
||||
* `BastianPoe`_ has added the long-awaited feature to automatically skip the
|
||||
OCR step when the PDF already contains text. This can be overridden by
|
||||
setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
|
||||
in the environment. Note that this also means that Paperless now requires
|
||||
``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
|
||||
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||
properly update.
|
||||
|
||||
* 0.8.0
|
||||
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
|
||||
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
|
||||
`#255`_.
|
||||
1.1.0
|
||||
=====
|
||||
|
||||
* 0.7.0
|
||||
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
|
||||
automatically delete documents attached to correspondents when those
|
||||
correspondents are themselves deleted. This was Django's default
|
||||
behaviour, but didn't make much sense in Paperless' case. Thanks to
|
||||
`Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
|
||||
* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
|
||||
properly. Thanks to `ayounggun`_ for reporting this one and to
|
||||
`Kusti Skytén`_ for posting the correct solution in the Github issue.
|
||||
* Fix for `#283`_, a redirect bug which broke interactions with
|
||||
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
||||
* Addition of an optional new financial year filter, courtesy of
|
||||
`David Martin`_ `#256`_
|
||||
* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
|
||||
`Dan Panzarella`_
|
||||
|
||||
* 0.6.0
|
||||
* Abandon the shared-secret trick we were using for the POST API in favour
|
||||
of BasicAuth or Django session.
|
||||
* Fix the POST API so it actually works. `#236`_
|
||||
* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
|
||||
as it was being used both for the API (now replaced with a normal auth)
|
||||
and form email polling. Now that we're only using it for email, this
|
||||
variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value
|
||||
will still work for a while, but you should change your config if you've
|
||||
been using the email polling feature. Thanks to `Joshua Gilman`_ for all
|
||||
the help with this feature.
|
||||
* 0.5.0
|
||||
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
|
||||
thanks to `Jake Gysland`_'s patch `#220`_.
|
||||
* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
|
||||
to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
|
||||
this one.
|
||||
* Updated the import/export scripts to include support for thumbnails. Big
|
||||
thanks to `CkuT`_ for finding this shortcoming and doing the work to get
|
||||
it fixed in `#224`_.
|
||||
* All of the following changes are thanks to `David Martin`_:
|
||||
* Bumped the dependency on pyocr to 0.4.7 so new users can make use of
|
||||
Tesseract 4 if they so prefer (`#226`_).
|
||||
* Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
|
||||
* Amended the documentation for better handling of systemd service files (`#229`_)
|
||||
* Amended the Django Admin configuration to have nice headers (`#230`_)
|
||||
1.0.0
|
||||
=====
|
||||
|
||||
* 0.4.1
|
||||
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
|
||||
all-caps suffixes like ``.PDF``
|
||||
* Upgrade to Django 1.11. **You'll need to run
|
||||
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||
properly update**.
|
||||
* Replace the templatetag-based hack we had for document listing in favour of
|
||||
a slightly less ugly solution in the form of another template tag with less
|
||||
copypasta.
|
||||
* Support for multi-word-matches for auto-tagging thanks to an excellent
|
||||
patch from `ishirav`_ `#277`_.
|
||||
* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
|
||||
the text and checkboxes under some resolutions `#272`_.
|
||||
* Patched the Docker config to force the serving of static files. Credit for
|
||||
this one goes to `dev-rke`_ via `#248`_.
|
||||
* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
|
||||
* Date fields in the admin are now expressed as HTML5 date fields thanks to
|
||||
`Lukas Winkler`_'s issue `#278`_
|
||||
|
||||
* 0.4.0
|
||||
* Introducing reminders. See `#199`_ for more information, but the short
|
||||
explanation is that you can now attach simple notes & times to documents
|
||||
which are made available via the API. Currently, the default API
|
||||
(basically just the Django admin) doesn't really make use of this, but
|
||||
`Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
|
||||
like to make use of this feature in his project.
|
||||
0.8.0
|
||||
=====
|
||||
|
||||
* 0.3.6
|
||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||
correspondent or the tags for a document.
|
||||
* The ``content`` field is now optional, to allow for the edge case of a
|
||||
purely graphical document.
|
||||
* You can no longer add documents via the admin. This never worked in the
|
||||
first place, so all I've done here is remove the link to the broken form.
|
||||
* The consumer code has been heavily refactored to support a pluggable
|
||||
interface. Install a paperless consumer via pip and tell paperless about
|
||||
it with an environment variable, and you're good to go. Proper
|
||||
documentation is on its way.
|
||||
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
|
||||
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
|
||||
`#255`_.
|
||||
|
||||
* 0.3.5
|
||||
* A serious facelift for the documents listing page wherein we drop the
|
||||
tabular layout in favour of a tiled interface.
|
||||
* Users can now configure the number of items per page.
|
||||
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
|
||||
* Moved the dotenv loading to the top of settings.py
|
||||
* Fix for `#112`_: Added checks for binaries required for document
|
||||
consumption.
|
||||
0.7.0
|
||||
=====
|
||||
|
||||
* 0.3.4
|
||||
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
||||
Note that you *can* use Django Suit with Paperless, but only in a
|
||||
non-profit situation as their free license prohibits for-profit use. As a
|
||||
result, I can't bundle Suit with Paperless without conflicting with the
|
||||
GPL. Further development will be done against the stock Django admin.
|
||||
* I shrunk the thumbnails a little 'cause they were too big for me, even on
|
||||
my high-DPI monitor.
|
||||
* BasicAuth support for document and thumbnail downloads, as well as the Push
|
||||
API thanks to @thomasbrueggemann. See `#179`_.
|
||||
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
|
||||
automatically delete documents attached to correspondents when those
|
||||
correspondents are themselves deleted. This was Django's default
|
||||
behaviour, but didn't make much sense in Paperless' case. Thanks to
|
||||
`Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
|
||||
* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
|
||||
properly. Thanks to `ayounggun`_ for reporting this one and to
|
||||
`Kusti Skytén`_ for posting the correct solution in the Github issue.
|
||||
|
||||
* 0.3.3
|
||||
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
|
||||
* Timezone, items per page, and default language are now all configurable,
|
||||
also thanks to @ekw.
|
||||
0.6.0
|
||||
=====
|
||||
|
||||
* 0.3.2
|
||||
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
||||
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
||||
arise.
|
||||
* Abandon the shared-secret trick we were using for the POST API in favour
|
||||
of BasicAuth or Django session.
|
||||
* Fix the POST API so it actually works. `#236`_
|
||||
* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
|
||||
as it was being used both for the API (now replaced with a normal auth)
|
||||
and form email polling. Now that we're only using it for email, this
|
||||
variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value
|
||||
will still work for a while, but you should change your config if you've
|
||||
been using the email polling feature. Thanks to `Joshua Gilman`_ for all
|
||||
the help with this feature.
|
||||
|
||||
* 0.3.1
|
||||
* Added a default value for ``CONVERT_BINARY``
|
||||
0.5.0
|
||||
=====
|
||||
|
||||
* 0.3.0
|
||||
* Updated to using django-filter 1.x
|
||||
* Added some system checks so new users aren't confused by misconfigurations.
|
||||
* Consumer loop time is now configurable for systems with slow writes. Just
|
||||
set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
|
||||
is 10.
|
||||
* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
|
||||
``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
|
||||
``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
|
||||
``PAPERLESS_SHARED_SECRET`` respectively instead.
|
||||
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
|
||||
thanks to `Jake Gysland`_'s patch `#220`_.
|
||||
* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
|
||||
to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
|
||||
this one.
|
||||
* Updated the import/export scripts to include support for thumbnails. Big
|
||||
thanks to `CkuT`_ for finding this shortcoming and doing the work to get
|
||||
it fixed in `#224`_.
|
||||
* All of the following changes are thanks to `David Martin`_:
|
||||
* Bumped the dependency on pyocr to 0.4.7 so new users can make use of
|
||||
Tesseract 4 if they so prefer (`#226`_).
|
||||
* Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
|
||||
* Amended the documentation for better handling of systemd service files (`#229`_)
|
||||
* Amended the Django Admin configuration to have nice headers (`#230`_)
|
||||
|
||||
* 0.2.0
|
||||
0.4.1
|
||||
=====
|
||||
|
||||
* `#150`_: The media root is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
* `#148`_: The database location (sqlite) is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
|
||||
URL.
|
||||
* `#131`_: Document files are now automatically removed from disk when
|
||||
they're deleted in Paperless.
|
||||
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
|
||||
based on the file naming scheme.
|
||||
* `#81`_: Added a hook to run an arbitrary script after every document is
|
||||
consumed.
|
||||
* `#98`_: Added optional environment variables for ImageMagick so that it
|
||||
doesn't explode when handling Very Large Documents or when it's just
|
||||
running on a low-memory system. Thanks to `Florian Harr`_ for his help on
|
||||
this one.
|
||||
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
|
||||
`Justin Snyman`_ for the pointers in the issue queue.
|
||||
* Added support for guessing the date from the file name along with the
|
||||
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
||||
request that I took forever to merge and to `Pit`_ for his efforts on the
|
||||
regex front.
|
||||
* `#94`_: Restored support for changing the created date in the UI. Thanks
|
||||
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
|
||||
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
|
||||
all-caps suffixes like ``.PDF``
|
||||
|
||||
* 0.1.1
|
||||
0.4.0
|
||||
=====
|
||||
|
||||
* Potentially **Breaking Change**: All references to "sender" in the code
|
||||
have been renamed to "correspondent" to better reflect the nature of the
|
||||
property (one could quite reasonably scan a document before sending it to
|
||||
someone.)
|
||||
* `#67`_: Rewrote the document exporter and added a new importer that allows
|
||||
for full metadata retention without depending on the file name and
|
||||
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
|
||||
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
|
||||
contributing conversation that led to this change.
|
||||
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
|
||||
before it's OCR'd. Thanks to `Pit`_ for this one.
|
||||
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
|
||||
* `#68`_: Added support for using a proper config file at
|
||||
``/etc/paperless.conf`` and modified the systemd unit files to use it.
|
||||
* Refactored the Vagrant installation process to use environment variables
|
||||
rather than asking the user to modify ``settings.py``.
|
||||
* `#44`_: Harmonise environment variable names with constant names.
|
||||
* `#60`_: Setup logging to actually use the Python native logging framework.
|
||||
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
|
||||
to be imported but made unavailable.
|
||||
* Introducing reminders. See `#199`_ for more information, but the short
|
||||
explanation is that you can now attach simple notes & times to documents
|
||||
which are made available via the API. Currently, the default API
|
||||
(basically just the Django admin) doesn't really make use of this, but
|
||||
`Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
|
||||
like to make use of this feature in his project.
|
||||
|
||||
* 0.1.0
|
||||
0.3.6
|
||||
=====
|
||||
|
||||
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
||||
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
||||
who spearheaded this effort.
|
||||
* A simple REST API is in place, but it should be considered unstable.
|
||||
* Cleaned up the consumer to use temporary directories instead of a single
|
||||
scratch space. (Thanks `Pit`_)
|
||||
* Improved the efficiency of the consumer by parsing pages more intelligently
|
||||
and introducing a threaded OCR process (thanks again `Pit`_).
|
||||
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
|
||||
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
|
||||
`Pit`_.
|
||||
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
|
||||
* `#54`_: Documented the re-tagger (`zedster`_)
|
||||
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
|
||||
* Added tox with pep8 checking
|
||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||
correspondent or the tags for a document.
|
||||
* The ``content`` field is now optional, to allow for the edge case of a
|
||||
purely graphical document.
|
||||
* You can no longer add documents via the admin. This never worked in the
|
||||
first place, so all I've done here is remove the link to the broken form.
|
||||
* The consumer code has been heavily refactored to support a pluggable
|
||||
interface. Install a paperless consumer via pip and tell paperless about
|
||||
it with an environment variable, and you're good to go. Proper
|
||||
documentation is on its way.
|
||||
|
||||
* 0.0.6
|
||||
0.3.5
|
||||
=====
|
||||
|
||||
* Added support for parallel OCR (significant work from `Pit`_)
|
||||
* Sped up the language detection (significant work from `Pit`_)
|
||||
* Added simple logging
|
||||
* A serious facelift for the documents listing page wherein we drop the
|
||||
tabular layout in favour of a tiled interface.
|
||||
* Users can now configure the number of items per page.
|
||||
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
|
||||
* Moved the dotenv loading to the top of settings.py
|
||||
* Fix for `#112`_: Added checks for binaries required for document
|
||||
consumption.
|
||||
|
||||
* 0.0.5
|
||||
0.3.4
|
||||
=====
|
||||
|
||||
* Added support for image files as documents (png, jpg, gif, tiff)
|
||||
* Added a crude means of HTTP POST for document imports
|
||||
* Added IMAP mail support
|
||||
* Added a re-tagging utility
|
||||
* Documentation for the above as well as data migration
|
||||
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
||||
Note that you *can* use Django Suit with Paperless, but only in a
|
||||
non-profit situation as their free license prohibits for-profit use. As a
|
||||
result, I can't bundle Suit with Paperless without conflicting with the
|
||||
GPL. Further development will be done against the stock Django admin.
|
||||
* I shrunk the thumbnails a little 'cause they were too big for me, even on
|
||||
my high-DPI monitor.
|
||||
* BasicAuth support for document and thumbnail downloads, as well as the Push
|
||||
API thanks to @thomasbrueggemann. See `#179`_.
|
||||
|
||||
* 0.0.4
|
||||
0.3.3
|
||||
=====
|
||||
|
||||
* Added automated tagging based on keyword matching
|
||||
* Cleaned up the document listing page
|
||||
* Removed ``User`` and ``Group`` from the admin
|
||||
* Added ``pytz`` to the list of requirements
|
||||
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
|
||||
* Timezone, items per page, and default language are now all configurable,
|
||||
also thanks to @ekw.
|
||||
|
||||
* 0.0.3
|
||||
0.3.2
|
||||
=====
|
||||
|
||||
* Added basic tagging
|
||||
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
||||
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
||||
arise.
|
||||
|
||||
* 0.0.2
|
||||
0.3.1
|
||||
=====
|
||||
|
||||
* Added language detection
|
||||
* Added datestamps to ``document_exporter``.
|
||||
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
|
||||
* Added a default value for ``CONVERT_BINARY``
|
||||
|
||||
* 0.0.1
|
||||
0.3.0
|
||||
=====
|
||||
|
||||
* Initial release
|
||||
* Updated to using django-filter 1.x
|
||||
* Added some system checks so new users aren't confused by misconfigurations.
|
||||
* Consumer loop time is now configurable for systems with slow writes. Just
|
||||
set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
|
||||
is 10.
|
||||
* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
|
||||
``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
|
||||
``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
|
||||
``PAPERLESS_SHARED_SECRET`` respectively instead.
|
||||
|
||||
0.2.0
|
||||
=====
|
||||
|
||||
* `#150`_: The media root is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
* `#148`_: The database location (sqlite) is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
|
||||
URL.
|
||||
* `#131`_: Document files are now automatically removed from disk when
|
||||
they're deleted in Paperless.
|
||||
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
|
||||
based on the file naming scheme.
|
||||
* `#81`_: Added a hook to run an arbitrary script after every document is
|
||||
consumed.
|
||||
* `#98`_: Added optional environment variables for ImageMagick so that it
|
||||
doesn't explode when handling Very Large Documents or when it's just
|
||||
running on a low-memory system. Thanks to `Florian Harr`_ for his help on
|
||||
this one.
|
||||
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
|
||||
`Justin Snyman`_ for the pointers in the issue queue.
|
||||
* Added support for guessing the date from the file name along with the
|
||||
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
||||
request that I took forever to merge and to `Pit`_ for his efforts on the
|
||||
regex front.
|
||||
* `#94`_: Restored support for changing the created date in the UI. Thanks
|
||||
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
|
||||
|
||||
0.1.1
|
||||
=====
|
||||
|
||||
* Potentially **Breaking Change**: All references to "sender" in the code
|
||||
have been renamed to "correspondent" to better reflect the nature of the
|
||||
property (one could quite reasonably scan a document before sending it to
|
||||
someone.)
|
||||
* `#67`_: Rewrote the document exporter and added a new importer that allows
|
||||
for full metadata retention without depending on the file name and
|
||||
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
|
||||
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
|
||||
contributing conversation that led to this change.
|
||||
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
|
||||
before it's OCR'd. Thanks to `Pit`_ for this one.
|
||||
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
|
||||
* `#68`_: Added support for using a proper config file at
|
||||
``/etc/paperless.conf`` and modified the systemd unit files to use it.
|
||||
* Refactored the Vagrant installation process to use environment variables
|
||||
rather than asking the user to modify ``settings.py``.
|
||||
* `#44`_: Harmonise environment variable names with constant names.
|
||||
* `#60`_: Setup logging to actually use the Python native logging framework.
|
||||
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
|
||||
to be imported but made unavailable.
|
||||
|
||||
0.1.0
|
||||
=====
|
||||
|
||||
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
||||
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
||||
who spearheaded this effort.
|
||||
* A simple REST API is in place, but it should be considered unstable.
|
||||
* Cleaned up the consumer to use temporary directories instead of a single
|
||||
scratch space. (Thanks `Pit`_)
|
||||
* Improved the efficiency of the consumer by parsing pages more intelligently
|
||||
and introducing a threaded OCR process (thanks again `Pit`_).
|
||||
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
|
||||
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
|
||||
`Pit`_.
|
||||
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
|
||||
* `#54`_: Documented the re-tagger (`zedster`_)
|
||||
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
|
||||
* Added tox with pep8 checking
|
||||
|
||||
0.0.6
|
||||
=====
|
||||
|
||||
* Added support for parallel OCR (significant work from `Pit`_)
|
||||
* Sped up the language detection (significant work from `Pit`_)
|
||||
* Added simple logging
|
||||
|
||||
0.0.5
|
||||
=====
|
||||
|
||||
* Added support for image files as documents (png, jpg, gif, tiff)
|
||||
* Added a crude means of HTTP POST for document imports
|
||||
* Added IMAP mail support
|
||||
* Added a re-tagging utility
|
||||
* Documentation for the above as well as data migration
|
||||
|
||||
0.0.4
|
||||
=====
|
||||
|
||||
* Added automated tagging based on keyword matching
|
||||
* Cleaned up the document listing page
|
||||
* Removed ``User`` and ``Group`` from the admin
|
||||
* Added ``pytz`` to the list of requirements
|
||||
|
||||
0.0.3
|
||||
=====
|
||||
|
||||
* Added basic tagging
|
||||
|
||||
0.0.2
|
||||
=====
|
||||
|
||||
* Added language detection
|
||||
* Added datestamps to ``document_exporter``.
|
||||
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
|
||||
|
||||
0.0.1
|
||||
=====
|
||||
|
||||
* Initial release
|
||||
|
||||
.. _Brian Conn: https://github.com/TheConnMan
|
||||
.. _Christopher Luu: https://github.com/nuudles
|
||||
@ -268,6 +321,8 @@ Changelog
|
||||
.. _Lukas Winkler: https://github.com/Findus23
|
||||
.. _chris-aeviator: https://github.com/chris-aeviator
|
||||
.. _Dan Panzarella: https://github.com/pzl
|
||||
.. _addadi: https://github.com/addadi
|
||||
.. _BastianPoe: https://github.com/BastianPoe
|
||||
|
||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||
@ -317,3 +372,5 @@ Changelog
|
||||
.. _#283: https://github.com/danielquinn/paperless/issues/283
|
||||
.. _#256: https://github.com/danielquinn/paperless/pull/256
|
||||
.. _#285: https://github.com/danielquinn/paperless/pull/285
|
||||
|
||||
.. _pipenv: https://docs.pipenv.org/
|
||||
|
@ -11,24 +11,27 @@ should work) that has the following software installed:
|
||||
* `Tesseract`_, plus its language files matching your document base.
|
||||
* `Imagemagick`_ version 6.7.5 or higher
|
||||
* `unpaper`_
|
||||
* `libpoppler-cpp-dev`_ PDF rendering library
|
||||
|
||||
.. _Python3: https://python.org/
|
||||
.. _GNU Privacy Guard: https://gnupg.org
|
||||
.. _Tesseract: https://github.com/tesseract-ocr
|
||||
.. _Imagemagick: http://imagemagick.org/
|
||||
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
|
||||
|
||||
Notably, you should confirm how you access your Python3 installation. Many
|
||||
Linux distributions will install Python3 in parallel to Python2, using the names
|
||||
``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
||||
``pip``. Running Paperless with Python2 will likely break things, so make sure that
|
||||
you're using the right version.
|
||||
Linux distributions will install Python3 in parallel to Python2, using the
|
||||
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
||||
``pip``. Running Paperless with Python2 will likely break things, so make sure
|
||||
that you're using the right version.
|
||||
|
||||
For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
|
||||
refer to their Python3 versions.
|
||||
|
||||
In addition to the above, there are a number of Python requirements, all of
|
||||
which are listed in a file called ``requirements.txt`` in the project root directory.
|
||||
which are listed in a file called ``requirements.txt`` in the project root
|
||||
directory.
|
||||
|
||||
If you're not working on a virtual environment (like Vagrant or Docker), you
|
||||
should probably be using a virtualenv, but that's your call. The reasons why
|
||||
@ -39,12 +42,13 @@ probably figure that out before continuing.
|
||||
|
||||
.. _requirements-apple:
|
||||
|
||||
Apple-tastic Complications
|
||||
--------------------------
|
||||
Problems with Imagemagick & PDFs
|
||||
--------------------------------
|
||||
|
||||
Some users have `run into problems`_ with installing ImageMagick on Apple
|
||||
systems using HomeBrew. The solution appears to be to install ghostscript as
|
||||
well as ImageMagick:
|
||||
Some users have `run into problems`_ with getting ImageMagick to do its thing
|
||||
with PDFs. Often this is the case with Apple systems using HomeBrew, but some
|
||||
Linux systems have been a problem as well. The solution appears to be to install
|
||||
ghostscript as well as ImageMagick:
|
||||
|
||||
.. _run into problems: https://github.com/danielquinn/paperless/issues/25
|
||||
|
||||
|
@ -175,7 +175,8 @@ Docker Method
|
||||
modified versions of the configuration files.
|
||||
4. Modify ``docker-compose.yml`` to your preferences, following the
|
||||
instructions in comments in the file. The only change that is a hard
|
||||
requirement is to specify where the consumption directory should mount.
|
||||
requirement is to specify where the consumption directory should
|
||||
mount. [#dockercomposeyml]_
|
||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||
|
||||
``PAPERLESS_PASSPHRASE``
|
||||
@ -192,7 +193,7 @@ Docker Method
|
||||
default English, set this parameter to a space separated list of
|
||||
three-letter language-codes after `ISO 639-2/T`_. For a list of available
|
||||
languages -- including their three letter codes -- see the
|
||||
`Debian packagelist`_.
|
||||
`Alpine packagelist`_.
|
||||
|
||||
``USERMAP_UID`` and ``USERMAP_GID``
|
||||
If you want to mount the consumption volume (directory ``/consume`` within
|
||||
@ -282,12 +283,17 @@ Docker Method
|
||||
.. _Docker: https://www.docker.com/
|
||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
|
||||
.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
|
||||
|
||||
.. [#compose] You of course don't have to use docker-compose, but it
|
||||
simplifies deployment immensely. If you know your way around Docker, feel
|
||||
free to tinker around without using compose!
|
||||
|
||||
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
|
||||
version 1.1.0 or earlier, you might need to change in the
|
||||
``docker-compose.yml`` file the ``image: pitkley/paperless`` directive in
|
||||
both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the
|
||||
newer ``docker-compose.yml.example`` file.
|
||||
|
||||
.. _setup-permanent:
|
||||
|
||||
|
@ -14,6 +14,7 @@ python-dotenv>=0.6.2
|
||||
python-gnupg>=0.3.9
|
||||
pytz>=2016.10
|
||||
gunicorn==19.7.1
|
||||
pdftotext>=2.0.1
|
||||
|
||||
# For the tests
|
||||
factory-boy
|
||||
|
@ -9,7 +9,7 @@ map_uidgid() {
|
||||
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
|
||||
groupmod -g "${USERMAP_GID}" paperless
|
||||
addgroup -g "${USERMAP_GID}" paperless
|
||||
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
|
||||
fi
|
||||
}
|
||||
@ -56,25 +56,24 @@ install_languages() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Update apt-lists
|
||||
apt-get update
|
||||
|
||||
# Loop over languages to be installed
|
||||
for lang in "${langs[@]}"; do
|
||||
pkg="tesseract-ocr-$lang"
|
||||
if dpkg -s "$pkg" > /dev/null 2>&1; then
|
||||
pkg="tesseract-ocr-data-$lang"
|
||||
|
||||
# English is installed by default
|
||||
if [ "$lang" == "eng" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
if apk info -e "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
if ! apk info "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
|
||||
if ! apt-cache show "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
|
||||
apt-get install "$pkg"
|
||||
apk --no-cache --update add "$pkg"
|
||||
done
|
||||
|
||||
# Remove apt lists
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
}
|
||||
|
||||
|
||||
|
@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
# The number of threads to use for OCR
|
||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
||||
|
||||
# OCR all documents? Enabled when PAPERLESS_OCR_ALWAYS is set to one of the
# affirmative values below (compared case-insensitively); defaults to "NO".
OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes",
    "y",
    "1",
    "t",
    "true",
)
|
||||
|
||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
||||
# being indexed anyway, with whatever we could get. If it's False, the file
|
||||
# will simply be left in the CONSUMPTION_DIR.
|
||||
|
@ -3,6 +3,7 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
import pdftotext
|
||||
|
||||
import langdetect
|
||||
import pyocr
|
||||
@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
return os.path.join(self.tempdir, "convert-0000.png")
|
||||
|
||||
def _is_ocred(self):
    """
    Report whether the PDF already carries usable embedded text.

    The text is extracted with ``get_text_from_pdf``. A document that
    yields more than 50 characters is assumed to contain real text, so
    no OCR is required for it.

    :return: ``True`` if more than 50 characters could be extracted,
        ``False`` otherwise (including when nothing could be extracted).
    """
    # Extract text from PDF using pdftotext
    text = get_text_from_pdf(self.document_path)

    # The extraction helper may hand back a falsy value (e.g. ``False``
    # when the PDF cannot be parsed); calling len() on that would raise
    # TypeError, so treat it as "no text".
    if not text:
        return False

    # We assume that a PDF with more than 50 characters contains text
    # (so no OCR required)
    return len(text) > 50
|
||||
|
||||
def get_text(self):
|
||||
if not self.OCR_ALWAYS and self._is_ocred():
|
||||
self.log("info", "Skipping OCR, using Text from PDF")
|
||||
return get_text_from_pdf(self.document_path)
|
||||
|
||||
images = self._get_greyscale()
|
||||
|
||||
@ -212,3 +228,13 @@ def image_to_string(args):
|
||||
except (TesseractError, OtherTesseractError):
|
||||
pass
|
||||
return ocr.image_to_string(f, lang=lang)
|
||||
|
||||
|
||||
def get_text_from_pdf(pdf_file):
    """
    Extract the embedded text of a PDF using ``pdftotext``.

    :param pdf_file: path of the PDF file to read.
    :return: the text of all pages joined with newlines, or an empty
        string when ``pdftotext`` cannot parse the file.
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Return an empty (still falsy) string instead of False so that
            # callers can always treat the result as text, e.g. call len()
            # on it or store it as document content.
            return ""

    return "\n".join(pdf)
|
||||
|
Loading…
x
Reference in New Issue
Block a user