mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'master' of github.com:danielquinn/paperless
This commit is contained in:
commit
5c59120c57
@ -1,5 +1,9 @@
|
|||||||
language: python
|
language: python
|
||||||
|
|
||||||
|
before_install:
|
||||||
|
- sudo apt-get update -qq
|
||||||
|
- sudo apt-get install -qq libpoppler-cpp-dev
|
||||||
|
|
||||||
sudo: false
|
sudo: false
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
|
75
Dockerfile
75
Dockerfile
@ -1,50 +1,47 @@
|
|||||||
FROM python:3.5
|
FROM alpine:3.7
|
||||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
|
||||||
|
|
||||||
# Install dependencies
|
LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
|
||||||
RUN apt-get update \
|
contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
|
||||||
&& apt-get install -y --no-install-recommends \
|
Sven Fischer <git-dev@linux4tw.de>"
|
||||||
sudo \
|
|
||||||
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install python dependencies
|
|
||||||
RUN mkdir -p /usr/src/paperless
|
|
||||||
WORKDIR /usr/src/paperless
|
|
||||||
COPY requirements.txt /usr/src/paperless/
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
|
||||||
|
|
||||||
# Copy application
|
# Copy application
|
||||||
RUN mkdir -p /usr/src/paperless/src
|
COPY requirements.txt /usr/src/paperless/
|
||||||
RUN mkdir -p /usr/src/paperless/data
|
|
||||||
RUN mkdir -p /usr/src/paperless/media
|
|
||||||
COPY src/ /usr/src/paperless/src/
|
COPY src/ /usr/src/paperless/src/
|
||||||
COPY data/ /usr/src/paperless/data/
|
COPY data/ /usr/src/paperless/data/
|
||||||
COPY media/ /usr/src/paperless/media/
|
COPY media/ /usr/src/paperless/media/
|
||||||
|
|
||||||
# Set consumption directory
|
|
||||||
ENV PAPERLESS_CONSUMPTION_DIR /consume
|
|
||||||
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
|
|
||||||
|
|
||||||
# Migrate database
|
|
||||||
WORKDIR /usr/src/paperless/src
|
|
||||||
RUN ./manage.py migrate
|
|
||||||
|
|
||||||
# Create user
|
|
||||||
RUN groupadd -g 1000 paperless \
|
|
||||||
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
|
|
||||||
&& chown -Rh paperless:paperless /usr/src/paperless
|
|
||||||
|
|
||||||
# Set export directory
|
|
||||||
ENV PAPERLESS_EXPORT_DIR /export
|
|
||||||
RUN mkdir -p $PAPERLESS_EXPORT_DIR
|
|
||||||
|
|
||||||
# Setup entrypoint
|
|
||||||
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||||
RUN chmod 755 /sbin/docker-entrypoint.sh
|
|
||||||
|
|
||||||
# Mount volumes
|
# Set export and consumption directories
|
||||||
|
ENV PAPERLESS_EXPORT_DIR=/export \
|
||||||
|
PAPERLESS_CONSUMPTION_DIR=/consume
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
RUN apk --no-cache --update add \
|
||||||
|
python3 gnupg libmagic bash \
|
||||||
|
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
|
||||||
|
apk --no-cache add --virtual .build-dependencies \
|
||||||
|
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
|
||||||
|
# Install python dependencies
|
||||||
|
python3 -m ensurepip && \
|
||||||
|
rm -r /usr/lib/python*/ensurepip && \
|
||||||
|
cd /usr/src/paperless && \
|
||||||
|
pip3 install --no-cache-dir -r requirements.txt && \
|
||||||
|
# Remove build dependencies
|
||||||
|
apk del .build-dependencies && \
|
||||||
|
# Create the consumption directory
|
||||||
|
mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
|
||||||
|
# Migrate database
|
||||||
|
./src/manage.py migrate && \
|
||||||
|
# Create user
|
||||||
|
addgroup -g 1000 paperless && \
|
||||||
|
adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
|
||||||
|
chown -Rh paperless:paperless /usr/src/paperless && \
|
||||||
|
mkdir -p $PAPERLESS_EXPORT_DIR && \
|
||||||
|
# Setup entrypoint
|
||||||
|
chmod 755 /sbin/docker-entrypoint.sh
|
||||||
|
|
||||||
|
WORKDIR /usr/src/paperless/src
|
||||||
|
# Mount volumes and set Entrypoint
|
||||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
|
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
|
||||||
|
|
||||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||||
CMD ["--help"]
|
CMD ["--help"]
|
||||||
|
12
README.rst
12
README.rst
@ -4,7 +4,6 @@ Paperless
|
|||||||
|Documentation|
|
|Documentation|
|
||||||
|Chat|
|
|Chat|
|
||||||
|Travis|
|
|Travis|
|
||||||
|Dependencies|
|
|
||||||
|
|
||||||
Index and archive all of your scanned paper documents
|
Index and archive all of your scanned paper documents
|
||||||
|
|
||||||
@ -28,12 +27,11 @@ scanner produces
|
|||||||
|
|
||||||
1. Buy a document scanner that can write to a place on your network. If you
|
1. Buy a document scanner that can write to a place on your network. If you
|
||||||
need some inspiration, have a look at the `scanner recommendations`_ page.
|
need some inspiration, have a look at the `scanner recommendations`_ page.
|
||||||
recommended by another user.
|
|
||||||
2. Set it up to "scan to FTP" or something similar. It should be able to push
|
2. Set it up to "scan to FTP" or something similar. It should be able to push
|
||||||
scanned images to a server without you having to do anything. If your
|
scanned images to a server without you having to do anything. Of course if
|
||||||
scanner doesn't know how to automatically upload the file somewhere, you can
|
your scanner doesn't know how to automatically upload the file somewhere,
|
||||||
always do that manually. Paperless doesn't care how the documents get into
|
you can always do that manually. Paperless doesn't care how the documents
|
||||||
its local consumption directory.
|
get into its local consumption directory.
|
||||||
3. Have the target server run the Paperless consumption script to OCR the file
|
3. Have the target server run the Paperless consumption script to OCR the file
|
||||||
and index it into a local database.
|
and index it into a local database.
|
||||||
4. Use the web frontend to sift through the database and find what you want.
|
4. Use the web frontend to sift through the database and find what you want.
|
||||||
@ -140,5 +138,3 @@ work and they need the money a lot more than I do.
|
|||||||
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
|
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
|
||||||
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
|
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
|
||||||
:target: https://travis-ci.org/danielquinn/paperless
|
:target: https://travis-ci.org/danielquinn/paperless
|
||||||
.. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg
|
|
||||||
:target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
|
|
||||||
|
@ -2,7 +2,7 @@ version: '2'
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
webserver:
|
webserver:
|
||||||
image: pitkley/paperless
|
build: ./
|
||||||
ports:
|
ports:
|
||||||
# You can adapt the port you want Paperless to listen on by
|
# You can adapt the port you want Paperless to listen on by
|
||||||
# modifying the part before the `:`.
|
# modifying the part before the `:`.
|
||||||
@ -20,7 +20,7 @@ services:
|
|||||||
command: ["runserver", "--insecure", "0.0.0.0:8000"]
|
command: ["runserver", "--insecure", "0.0.0.0:8000"]
|
||||||
|
|
||||||
consumer:
|
consumer:
|
||||||
image: pitkley/paperless
|
build: ./
|
||||||
volumes:
|
volumes:
|
||||||
- data:/usr/src/paperless/data
|
- data:/usr/src/paperless/data
|
||||||
- media:/usr/src/paperless/media
|
- media:/usr/src/paperless/media
|
||||||
|
@ -1,243 +1,296 @@
|
|||||||
Changelog
|
Changelog
|
||||||
#########
|
#########
|
||||||
|
|
||||||
* 1.1.0
|
1.2.0
|
||||||
* Fix for `#283`_, a redirect bug which broke interactions with
|
=====
|
||||||
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
|
||||||
* Addition of an optional new financial year filter, courtesy of
|
|
||||||
`David Martin`_ `#256`_
|
|
||||||
* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
|
|
||||||
`Dan Panzarella`_
|
|
||||||
|
|
||||||
* 1.0.0
|
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
|
||||||
* Upgrade to Django 1.11. **You'll need to run
|
and `Pit`_.
|
||||||
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
* `BastianPoe`_ has added the long-awaited feature to automatically skip the
|
||||||
properly update**.
|
OCR step when the PDF already contains text. This can be overridden by
|
||||||
* Replace the templatetag-based hack we had for document listing in favour of
|
setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
|
||||||
a slightly less ugly solution in the form of another template tag with less
|
in the environment. Note that this also means that Paperless now requires
|
||||||
copypasta.
|
``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
|
||||||
* Support for multi-word-matches for auto-tagging thanks to an excellent
|
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||||
patch from `ishirav`_ `#277`_.
|
properly update.
|
||||||
* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
|
|
||||||
the text and checkboxes under some resolutions `#272`_.
|
|
||||||
* Patched the Docker config to force the serving of static files. Credit for
|
|
||||||
this one goes to `dev-rke`_ via `#248`_.
|
|
||||||
* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
|
|
||||||
* Date fields in the admin are now expressed as HTML5 date fields thanks to
|
|
||||||
`Lukas Winkler`_'s issue `#278`_
|
|
||||||
|
|
||||||
* 0.8.0
|
1.1.0
|
||||||
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
|
=====
|
||||||
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
|
|
||||||
`#255`_.
|
|
||||||
|
|
||||||
* 0.7.0
|
* Fix for `#283`_, a redirect bug which broke interactions with
|
||||||
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
|
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
||||||
automatically delete documents attached to correspondents when those
|
* Addition of an optional new financial year filter, courtesy of
|
||||||
correspondents are themselves deleted. This was Django's default
|
`David Martin`_ `#256`_
|
||||||
behaviour, but didn't make much sense in Paperless' case. Thanks to
|
* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
|
||||||
`Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
|
`Dan Panzarella`_
|
||||||
* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
|
|
||||||
properly. Thanks to `ayounggun`_ for reporting this one and to
|
|
||||||
`Kusti Skytén`_ for posting the correct solution in the Github issue.
|
|
||||||
|
|
||||||
* 0.6.0
|
1.0.0
|
||||||
* Abandon the shared-secret trick we were using for the POST API in favour
|
=====
|
||||||
of BasicAuth or Django session.
|
|
||||||
* Fix the POST API so it actually works. `#236`_
|
|
||||||
* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
|
|
||||||
as it was being used both for the API (now replaced with a normal auth)
|
|
||||||
and for email polling. Now that we're only using it for email, this
|
|
||||||
variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value
|
|
||||||
will still work for a while, but you should change your config if you've
|
|
||||||
been using the email polling feature. Thanks to `Joshua Gilman`_ for all
|
|
||||||
the help with this feature.
|
|
||||||
* 0.5.0
|
|
||||||
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
|
|
||||||
thanks to `Jake Gysland`_'s patch `#220`_.
|
|
||||||
* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
|
|
||||||
to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
|
|
||||||
this one.
|
|
||||||
* Updated the import/export scripts to include support for thumbnails. Big
|
|
||||||
thanks to `CkuT`_ for finding this shortcoming and doing the work to get
|
|
||||||
it fixed in `#224`_.
|
|
||||||
* All of the following changes are thanks to `David Martin`_:
|
|
||||||
* Bumped the dependency on pyocr to 0.4.7 so new users can make use of
|
|
||||||
Tesseract 4 if they so prefer (`#226`_).
|
|
||||||
* Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
|
|
||||||
* Amended the documentation for better handling of systemd service files (`#229`_)
|
|
||||||
* Amended the Django Admin configuration to have nice headers (`#230`_)
|
|
||||||
|
|
||||||
* 0.4.1
|
* Upgrade to Django 1.11. **You'll need to run
|
||||||
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
|
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||||
all-caps suffixes like ``.PDF``
|
properly update**.
|
||||||
|
* Replace the templatetag-based hack we had for document listing in favour of
|
||||||
|
a slightly less ugly solution in the form of another template tag with less
|
||||||
|
copypasta.
|
||||||
|
* Support for multi-word-matches for auto-tagging thanks to an excellent
|
||||||
|
patch from `ishirav`_ `#277`_.
|
||||||
|
* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
|
||||||
|
the text and checkboxes under some resolutions `#272`_.
|
||||||
|
* Patched the Docker config to force the serving of static files. Credit for
|
||||||
|
this one goes to `dev-rke`_ via `#248`_.
|
||||||
|
* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
|
||||||
|
* Date fields in the admin are now expressed as HTML5 date fields thanks to
|
||||||
|
`Lukas Winkler`_'s issue `#278`_
|
||||||
|
|
||||||
* 0.4.0
|
0.8.0
|
||||||
* Introducing reminders. See `#199`_ for more information, but the short
|
=====
|
||||||
explanation is that you can now attach simple notes & times to documents
|
|
||||||
which are made available via the API. Currently, the default API
|
|
||||||
(basically just the Django admin) doesn't really make use of this, but
|
|
||||||
`Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
|
|
||||||
like to make use of this feature in his project.
|
|
||||||
|
|
||||||
* 0.3.6
|
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
|
||||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
|
||||||
correspondent or the tags for a document.
|
`#255`_.
|
||||||
* The ``content`` field is now optional, to allow for the edge case of a
|
|
||||||
purely graphical document.
|
|
||||||
* You can no longer add documents via the admin. This never worked in the
|
|
||||||
first place, so all I've done here is remove the link to the broken form.
|
|
||||||
* The consumer code has been heavily refactored to support a pluggable
|
|
||||||
interface. Install a paperless consumer via pip and tell paperless about
|
|
||||||
it with an environment variable, and you're good to go. Proper
|
|
||||||
documentation is on its way.
|
|
||||||
|
|
||||||
* 0.3.5
|
0.7.0
|
||||||
* A serious facelift for the documents listing page wherein we drop the
|
=====
|
||||||
tabular layout in favour of a tiled interface.
|
|
||||||
* Users can now configure the number of items per page.
|
|
||||||
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
|
|
||||||
* Moved the dotenv loading to the top of settings.py
|
|
||||||
* Fix for `#112`_: Added checks for binaries required for document
|
|
||||||
consumption.
|
|
||||||
|
|
||||||
* 0.3.4
|
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
|
||||||
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
automatically delete documents attached to correspondents when those
|
||||||
Note that you *can* use Django Suit with Paperless, but only in a
|
correspondents are themselves deleted. This was Django's default
|
||||||
non-profit situation as their free license prohibits for-profit use. As a
|
behaviour, but didn't make much sense in Paperless' case. Thanks to
|
||||||
result, I can't bundle Suit with Paperless without conflicting with the
|
`Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
|
||||||
GPL. Further development will be done against the stock Django admin.
|
* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
|
||||||
* I shrunk the thumbnails a little 'cause they were too big for me, even on
|
properly. Thanks to `ayounggun`_ for reporting this one and to
|
||||||
my high-DPI monitor.
|
`Kusti Skytén`_ for posting the correct solution in the Github issue.
|
||||||
* BasicAuth support for document and thumbnail downloads, as well as the Push
|
|
||||||
API thanks to @thomasbrueggemann. See `#179`_.
|
|
||||||
|
|
||||||
* 0.3.3
|
0.6.0
|
||||||
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
|
=====
|
||||||
* Timezone, items per page, and default language are now all configurable,
|
|
||||||
also thanks to @ekw.
|
|
||||||
|
|
||||||
* 0.3.2
|
* Abandon the shared-secret trick we were using for the POST API in favour
|
||||||
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
of BasicAuth or Django session.
|
||||||
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
* Fix the POST API so it actually works. `#236`_
|
||||||
arise.
|
* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
|
||||||
|
as it was being used both for the API (now replaced with a normal auth)
|
||||||
|
and for email polling. Now that we're only using it for email, this
|
||||||
|
variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value
|
||||||
|
will still work for a while, but you should change your config if you've
|
||||||
|
been using the email polling feature. Thanks to `Joshua Gilman`_ for all
|
||||||
|
the help with this feature.
|
||||||
|
|
||||||
* 0.3.1
|
0.5.0
|
||||||
* Added a default value for ``CONVERT_BINARY``
|
=====
|
||||||
|
|
||||||
* 0.3.0
|
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
|
||||||
* Updated to using django-filter 1.x
|
thanks to `Jake Gysland`_'s patch `#220`_.
|
||||||
* Added some system checks so new users aren't confused by misconfigurations.
|
* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
|
||||||
* Consumer loop time is now configurable for systems with slow writes. Just
|
to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
|
||||||
set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
|
this one.
|
||||||
is 10.
|
* Updated the import/export scripts to include support for thumbnails. Big
|
||||||
* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
|
thanks to `CkuT`_ for finding this shortcoming and doing the work to get
|
||||||
``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
|
it fixed in `#224`_.
|
||||||
``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
|
* All of the following changes are thanks to `David Martin`_:
|
||||||
``PAPERLESS_SHARED_SECRET`` respectively instead.
|
* Bumped the dependency on pyocr to 0.4.7 so new users can make use of
|
||||||
|
Tesseract 4 if they so prefer (`#226`_).
|
||||||
|
* Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
|
||||||
|
* Amended the documentation for better handling of systemd service files (`#229`_)
|
||||||
|
* Amended the Django Admin configuration to have nice headers (`#230`_)
|
||||||
|
|
||||||
* 0.2.0
|
0.4.1
|
||||||
|
=====
|
||||||
|
|
||||||
* `#150`_: The media root is now a variable you can set in
|
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
|
||||||
``paperless.conf``.
|
all-caps suffixes like ``.PDF``
|
||||||
* `#148`_: The database location (sqlite) is now a variable you can set in
|
|
||||||
``paperless.conf``.
|
|
||||||
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
|
|
||||||
URL.
|
|
||||||
* `#131`_: Document files are now automatically removed from disk when
|
|
||||||
they're deleted in Paperless.
|
|
||||||
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
|
|
||||||
based on the file naming scheme.
|
|
||||||
* `#81`_: Added a hook to run an arbitrary script after every document is
|
|
||||||
consumed.
|
|
||||||
* `#98`_: Added optional environment variables for ImageMagick so that it
|
|
||||||
doesn't explode when handling Very Large Documents or when it's just
|
|
||||||
running on a low-memory system. Thanks to `Florian Harr`_ for his help on
|
|
||||||
this one.
|
|
||||||
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
|
|
||||||
`Justin Snyman`_ for the pointers in the issue queue.
|
|
||||||
* Added support for guessing the date from the file name along with the
|
|
||||||
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
|
||||||
request that I took forever to merge and to `Pit`_ for his efforts on the
|
|
||||||
regex front.
|
|
||||||
* `#94`_: Restored support for changing the created date in the UI. Thanks
|
|
||||||
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
|
|
||||||
|
|
||||||
* 0.1.1
|
0.4.0
|
||||||
|
=====
|
||||||
|
|
||||||
* Potentially **Breaking Change**: All references to "sender" in the code
|
* Introducing reminders. See `#199`_ for more information, but the short
|
||||||
have been renamed to "correspondent" to better reflect the nature of the
|
explanation is that you can now attach simple notes & times to documents
|
||||||
property (one could quite reasonably scan a document before sending it to
|
which are made available via the API. Currently, the default API
|
||||||
someone.)
|
(basically just the Django admin) doesn't really make use of this, but
|
||||||
* `#67`_: Rewrote the document exporter and added a new importer that allows
|
`Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
|
||||||
for full metadata retention without depending on the file name and
|
like to make use of this feature in his project.
|
||||||
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
|
|
||||||
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
|
|
||||||
contributing conversation that led to this change.
|
|
||||||
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
|
|
||||||
before it's OCR'd. Thanks to `Pit`_ for this one.
|
|
||||||
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
|
|
||||||
* `#68`_: Added support for using a proper config file at
|
|
||||||
``/etc/paperless.conf`` and modified the systemd unit files to use it.
|
|
||||||
* Refactored the Vagrant installation process to use environment variables
|
|
||||||
rather than asking the user to modify ``settings.py``.
|
|
||||||
* `#44`_: Harmonise environment variable names with constant names.
|
|
||||||
* `#60`_: Setup logging to actually use the Python native logging framework.
|
|
||||||
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
|
|
||||||
to be imported but made unavailable.
|
|
||||||
|
|
||||||
* 0.1.0
|
0.3.6
|
||||||
|
=====
|
||||||
|
|
||||||
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||||
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
correspondent or the tags for a document.
|
||||||
who spearheaded this effort.
|
* The ``content`` field is now optional, to allow for the edge case of a
|
||||||
* A simple REST API is in place, but it should be considered unstable.
|
purely graphical document.
|
||||||
* Cleaned up the consumer to use temporary directories instead of a single
|
* You can no longer add documents via the admin. This never worked in the
|
||||||
scratch space. (Thanks `Pit`_)
|
first place, so all I've done here is remove the link to the broken form.
|
||||||
* Improved the efficiency of the consumer by parsing pages more intelligently
|
* The consumer code has been heavily refactored to support a pluggable
|
||||||
and introducing a threaded OCR process (thanks again `Pit`_).
|
interface. Install a paperless consumer via pip and tell paperless about
|
||||||
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
|
it with an environment variable, and you're good to go. Proper
|
||||||
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
|
documentation is on its way.
|
||||||
`Pit`_.
|
|
||||||
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
|
|
||||||
* `#54`_: Documented the re-tagger (`zedster`_)
|
|
||||||
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
|
|
||||||
* Added tox with pep8 checking
|
|
||||||
|
|
||||||
* 0.0.6
|
0.3.5
|
||||||
|
=====
|
||||||
|
|
||||||
* Added support for parallel OCR (significant work from `Pit`_)
|
* A serious facelift for the documents listing page wherein we drop the
|
||||||
* Sped up the language detection (significant work from `Pit`_)
|
tabular layout in favour of a tiled interface.
|
||||||
* Added simple logging
|
* Users can now configure the number of items per page.
|
||||||
|
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
|
||||||
|
* Moved the dotenv loading to the top of settings.py
|
||||||
|
* Fix for `#112`_: Added checks for binaries required for document
|
||||||
|
consumption.
|
||||||
|
|
||||||
* 0.0.5
|
0.3.4
|
||||||
|
=====
|
||||||
|
|
||||||
* Added support for image files as documents (png, jpg, gif, tiff)
|
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
||||||
* Added a crude means of HTTP POST for document imports
|
Note that you *can* use Django Suit with Paperless, but only in a
|
||||||
* Added IMAP mail support
|
non-profit situation as their free license prohibits for-profit use. As a
|
||||||
* Added a re-tagging utility
|
result, I can't bundle Suit with Paperless without conflicting with the
|
||||||
* Documentation for the above as well as data migration
|
GPL. Further development will be done against the stock Django admin.
|
||||||
|
* I shrunk the thumbnails a little 'cause they were too big for me, even on
|
||||||
|
my high-DPI monitor.
|
||||||
|
* BasicAuth support for document and thumbnail downloads, as well as the Push
|
||||||
|
API thanks to @thomasbrueggemann. See `#179`_.
|
||||||
|
|
||||||
* 0.0.4
|
0.3.3
|
||||||
|
=====
|
||||||
|
|
||||||
* Added automated tagging based on keyword matching
|
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
|
||||||
* Cleaned up the document listing page
|
* Timezone, items per page, and default language are now all configurable,
|
||||||
* Removed ``User`` and ``Group`` from the admin
|
also thanks to @ekw.
|
||||||
* Added ``pytz`` to the list of requirements
|
|
||||||
|
|
||||||
* 0.0.3
|
0.3.2
|
||||||
|
=====
|
||||||
|
|
||||||
* Added basic tagging
|
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
||||||
|
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
||||||
|
arise.
|
||||||
|
|
||||||
* 0.0.2
|
0.3.1
|
||||||
|
=====
|
||||||
|
|
||||||
* Added language detection
|
* Added a default value for ``CONVERT_BINARY``
|
||||||
* Added datestamps to ``document_exporter``.
|
|
||||||
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
|
|
||||||
|
|
||||||
* 0.0.1
|
0.3.0
|
||||||
|
=====
|
||||||
|
|
||||||
* Initial release
|
* Updated to using django-filter 1.x
|
||||||
|
* Added some system checks so new users aren't confused by misconfigurations.
|
||||||
|
* Consumer loop time is now configurable for systems with slow writes. Just
|
||||||
|
set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
|
||||||
|
is 10.
|
||||||
|
* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
|
||||||
|
``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
|
||||||
|
``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
|
||||||
|
``PAPERLESS_SHARED_SECRET`` respectively instead.
|
||||||
|
|
||||||
|
0.2.0
|
||||||
|
=====
|
||||||
|
|
||||||
|
* `#150`_: The media root is now a variable you can set in
|
||||||
|
``paperless.conf``.
|
||||||
|
* `#148`_: The database location (sqlite) is now a variable you can set in
|
||||||
|
``paperless.conf``.
|
||||||
|
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
|
||||||
|
URL.
|
||||||
|
* `#131`_: Document files are now automatically removed from disk when
|
||||||
|
they're deleted in Paperless.
|
||||||
|
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
|
||||||
|
based on the file naming scheme.
|
||||||
|
* `#81`_: Added a hook to run an arbitrary script after every document is
|
||||||
|
consumed.
|
||||||
|
* `#98`_: Added optional environment variables for ImageMagick so that it
|
||||||
|
doesn't explode when handling Very Large Documents or when it's just
|
||||||
|
running on a low-memory system. Thanks to `Florian Harr`_ for his help on
|
||||||
|
this one.
|
||||||
|
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
|
||||||
|
`Justin Snyman`_ for the pointers in the issue queue.
|
||||||
|
* Added support for guessing the date from the file name along with the
|
||||||
|
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
||||||
|
request that I took forever to merge and to `Pit`_ for his efforts on the
|
||||||
|
regex front.
|
||||||
|
* `#94`_: Restored support for changing the created date in the UI. Thanks
|
||||||
|
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
|
||||||
|
|
||||||
|
0.1.1
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Potentially **Breaking Change**: All references to "sender" in the code
|
||||||
|
have been renamed to "correspondent" to better reflect the nature of the
|
||||||
|
property (one could quite reasonably scan a document before sending it to
|
||||||
|
someone.)
|
||||||
|
* `#67`_: Rewrote the document exporter and added a new importer that allows
|
||||||
|
for full metadata retention without depending on the file name and
|
||||||
|
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
|
||||||
|
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
|
||||||
|
contributing conversation that led to this change.
|
||||||
|
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
|
||||||
|
before it's OCR'd. Thanks to `Pit`_ for this one.
|
||||||
|
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
|
||||||
|
* `#68`_: Added support for using a proper config file at
|
||||||
|
``/etc/paperless.conf`` and modified the systemd unit files to use it.
|
||||||
|
* Refactored the Vagrant installation process to use environment variables
|
||||||
|
rather than asking the user to modify ``settings.py``.
|
||||||
|
* `#44`_: Harmonise environment variable names with constant names.
|
||||||
|
* `#60`_: Setup logging to actually use the Python native logging framework.
|
||||||
|
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
|
||||||
|
to be imported but made unavailable.
|
||||||
|
|
||||||
|
0.1.0
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
||||||
|
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
||||||
|
who spearheaded this effort.
|
||||||
|
* A simple REST API is in place, but it should be considered unstable.
|
||||||
|
* Cleaned up the consumer to use temporary directories instead of a single
|
||||||
|
scratch space. (Thanks `Pit`_)
|
||||||
|
* Improved the efficiency of the consumer by parsing pages more intelligently
|
||||||
|
and introducing a threaded OCR process (thanks again `Pit`_).
|
||||||
|
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
|
||||||
|
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
|
||||||
|
`Pit`_.
|
||||||
|
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
|
||||||
|
* `#54`_: Documented the re-tagger (`zedster`_)
|
||||||
|
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
|
||||||
|
* Added tox with pep8 checking
|
||||||
|
|
||||||
|
0.0.6
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Added support for parallel OCR (significant work from `Pit`_)
|
||||||
|
* Sped up the language detection (significant work from `Pit`_)
|
||||||
|
* Added simple logging
|
||||||
|
|
||||||
|
0.0.5
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Added support for image files as documents (png, jpg, gif, tiff)
|
||||||
|
* Added a crude means of HTTP POST for document imports
|
||||||
|
* Added IMAP mail support
|
||||||
|
* Added a re-tagging utility
|
||||||
|
* Documentation for the above as well as data migration
|
||||||
|
|
||||||
|
0.0.4
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Added automated tagging based on keyword matching
|
||||||
|
* Cleaned up the document listing page
|
||||||
|
* Removed ``User`` and ``Group`` from the admin
|
||||||
|
* Added ``pytz`` to the list of requirements
|
||||||
|
|
||||||
|
0.0.3
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Added basic tagging
|
||||||
|
|
||||||
|
0.0.2
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Added language detection
|
||||||
|
* Added datestamps to ``document_exporter``.
|
||||||
|
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
|
||||||
|
|
||||||
|
0.0.1
|
||||||
|
=====
|
||||||
|
|
||||||
|
* Initial release
|
||||||
|
|
||||||
.. _Brian Conn: https://github.com/TheConnMan
|
.. _Brian Conn: https://github.com/TheConnMan
|
||||||
.. _Christopher Luu: https://github.com/nuudles
|
.. _Christopher Luu: https://github.com/nuudles
|
||||||
@ -268,6 +321,8 @@ Changelog
|
|||||||
.. _Lukas Winkler: https://github.com/Findus23
|
.. _Lukas Winkler: https://github.com/Findus23
|
||||||
.. _chris-aeviator: https://github.com/chris-aeviator
|
.. _chris-aeviator: https://github.com/chris-aeviator
|
||||||
.. _Dan Panzarella: https://github.com/pzl
|
.. _Dan Panzarella: https://github.com/pzl
|
||||||
|
.. _addadi: https://github.com/addadi
|
||||||
|
.. _BastianPoe: https://github.com/BastianPoe
|
||||||
|
|
||||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||||
@ -317,3 +372,5 @@ Changelog
|
|||||||
.. _#283: https://github.com/danielquinn/paperless/issues/283
|
.. _#283: https://github.com/danielquinn/paperless/issues/283
|
||||||
.. _#256: https://github.com/danielquinn/paperless/pull/256
|
.. _#256: https://github.com/danielquinn/paperless/pull/256
|
||||||
.. _#285: https://github.com/danielquinn/paperless/pull/285
|
.. _#285: https://github.com/danielquinn/paperless/pull/285
|
||||||
|
|
||||||
|
.. _pipenv: https://docs.pipenv.org/
|
||||||
|
@ -11,24 +11,27 @@ should work) that has the following software installed:
|
|||||||
* `Tesseract`_, plus its language files matching your document base.
|
* `Tesseract`_, plus its language files matching your document base.
|
||||||
* `Imagemagick`_ version 6.7.5 or higher
|
* `Imagemagick`_ version 6.7.5 or higher
|
||||||
* `unpaper`_
|
* `unpaper`_
|
||||||
|
* `libpoppler-cpp-dev`_ PDF rendering library
|
||||||
|
|
||||||
.. _Python3: https://python.org/
|
.. _Python3: https://python.org/
|
||||||
.. _GNU Privacy Guard: https://gnupg.org
|
.. _GNU Privacy Guard: https://gnupg.org
|
||||||
.. _Tesseract: https://github.com/tesseract-ocr
|
.. _Tesseract: https://github.com/tesseract-ocr
|
||||||
.. _Imagemagick: http://imagemagick.org/
|
.. _Imagemagick: http://imagemagick.org/
|
||||||
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||||
|
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
|
||||||
|
|
||||||
Notably, you should confirm how you access your Python3 installation. Many
|
Notably, you should confirm how you access your Python3 installation. Many
|
||||||
Linux distributions will install Python3 in parallel to Python2, using the names
|
Linux distributions will install Python3 in parallel to Python2, using the
|
||||||
``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
||||||
``pip``. Running Paperless with Python2 will likely break things, so make sure that
|
``pip``. Running Paperless with Python2 will likely break things, so make sure
|
||||||
you're using the right version.
|
that you're using the right version.
|
||||||
|
|
||||||
For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
|
For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
|
||||||
refer to their Python3 versions.
|
refer to their Python3 versions.
|
||||||
|
|
||||||
In addition to the above, there are a number of Python requirements, all of
|
In addition to the above, there are a number of Python requirements, all of
|
||||||
which are listed in a file called ``requirements.txt`` in the project root directory.
|
which are listed in a file called ``requirements.txt`` in the project root
|
||||||
|
directory.
|
||||||
|
|
||||||
If you're not working on a virtual environment (like Vagrant or Docker), you
|
If you're not working on a virtual environment (like Vagrant or Docker), you
|
||||||
should probably be using a virtualenv, but that's your call. The reasons why
|
should probably be using a virtualenv, but that's your call. The reasons why
|
||||||
@ -39,12 +42,13 @@ probably figure that out before continuing.
|
|||||||
|
|
||||||
.. _requirements-apple:
|
.. _requirements-apple:
|
||||||
|
|
||||||
Apple-tastic Complications
|
Problems with Imagemagick & PDFs
|
||||||
--------------------------
|
--------------------------------
|
||||||
|
|
||||||
Some users have `run into problems`_ with installing ImageMagick on Apple
|
Some users have `run into problems`_ with getting ImageMagick to do its thing
|
||||||
systems using HomeBrew. The solution appears to be to install ghostscript as
|
with PDFs. Often this is the case with Apple systems using HomeBrew, but other
|
||||||
well as ImageMagick:
|
Linuxes have been a problem as well. The solution appears to be to install
|
||||||
|
ghostscript as well as ImageMagick:
|
||||||
|
|
||||||
.. _run into problems: https://github.com/danielquinn/paperless/issues/25
|
.. _run into problems: https://github.com/danielquinn/paperless/issues/25
|
||||||
|
|
||||||
|
@ -175,7 +175,8 @@ Docker Method
|
|||||||
modified versions of the configuration files.
|
modified versions of the configuration files.
|
||||||
4. Modify ``docker-compose.yml`` to your preferences, following the
|
4. Modify ``docker-compose.yml`` to your preferences, following the
|
||||||
instructions in comments in the file. The only change that is a hard
|
instructions in comments in the file. The only change that is a hard
|
||||||
requirement is to specify where the consumption directory should mount.
|
requirement is to specify where the consumption directory should
|
||||||
|
mount. [#dockercomposeyml]_
|
||||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||||
|
|
||||||
``PAPERLESS_PASSPHRASE``
|
``PAPERLESS_PASSPHRASE``
|
||||||
@ -192,7 +193,7 @@ Docker Method
|
|||||||
default English, set this parameter to a space separated list of
|
default English, set this parameter to a space separated list of
|
||||||
three-letter language-codes after `ISO 639-2/T`_. For a list of available
|
three-letter language-codes after `ISO 639-2/T`_. For a list of available
|
||||||
languages -- including their three letter codes -- see the
|
languages -- including their three letter codes -- see the
|
||||||
`Debian packagelist`_.
|
`Alpine packagelist`_.
|
||||||
|
|
||||||
``USERMAP_UID`` and ``USERMAP_GID``
|
``USERMAP_UID`` and ``USERMAP_GID``
|
||||||
If you want to mount the consumption volume (directory ``/consume`` within
|
If you want to mount the consumption volume (directory ``/consume`` within
|
||||||
@ -282,12 +283,17 @@ Docker Method
|
|||||||
.. _Docker: https://www.docker.com/
|
.. _Docker: https://www.docker.com/
|
||||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||||
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
|
.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
|
||||||
|
|
||||||
.. [#compose] You of course don't have to use docker-compose, but it
|
.. [#compose] You of course don't have to use docker-compose, but it
|
||||||
simplifies deployment immensely. If you know your way around Docker, feel
|
simplifies deployment immensely. If you know your way around Docker, feel
|
||||||
free to tinker around without using compose!
|
free to tinker around without using compose!
|
||||||
|
|
||||||
|
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
|
||||||
|
version 1.1.0 or earlier, you might need to change in the
|
||||||
|
``docker-compose.yml`` file the ``image: pitkley/paperless`` directive in
|
||||||
|
both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the
|
||||||
|
newer ``docker-compose.yml.example`` file.
|
||||||
|
|
||||||
.. _setup-permanent:
|
.. _setup-permanent:
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ python-dotenv>=0.6.2
|
|||||||
python-gnupg>=0.3.9
|
python-gnupg>=0.3.9
|
||||||
pytz>=2016.10
|
pytz>=2016.10
|
||||||
gunicorn==19.7.1
|
gunicorn==19.7.1
|
||||||
|
pdftotext>=2.0.1
|
||||||
|
|
||||||
# For the tests
|
# For the tests
|
||||||
factory-boy
|
factory-boy
|
||||||
|
@ -9,7 +9,7 @@ map_uidgid() {
|
|||||||
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||||
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
||||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
|
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
|
||||||
groupmod -g "${USERMAP_GID}" paperless
|
addgroup -g "${USERMAP_GID}" paperless
|
||||||
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
|
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -56,25 +56,24 @@ install_languages() {
|
|||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Update apt-lists
|
|
||||||
apt-get update
|
|
||||||
|
|
||||||
# Loop over languages to be installed
|
# Loop over languages to be installed
|
||||||
for lang in "${langs[@]}"; do
|
for lang in "${langs[@]}"; do
|
||||||
pkg="tesseract-ocr-$lang"
|
pkg="tesseract-ocr-data-$lang"
|
||||||
if dpkg -s "$pkg" > /dev/null 2>&1; then
|
|
||||||
|
# English is installed by default
|
||||||
|
if [ "$lang" == "eng" ]; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! apt-cache show "$pkg" > /dev/null 2>&1; then
|
if apk info -e "$pkg" > /dev/null 2>&1; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if ! apk info "$pkg" > /dev/null 2>&1; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
apt-get install "$pkg"
|
apk --no-cache --update add "$pkg"
|
||||||
done
|
done
|
||||||
|
|
||||||
# Remove apt lists
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
|||||||
# The number of threads to use for OCR
|
# The number of threads to use for OCR
|
||||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
||||||
|
|
||||||
|
# OCR all documents?
|
||||||
|
# True when PAPERLESS_OCR_ALWAYS is set to an affirmative value (compared
# case-insensitively); an unset variable falls back to "NO", i.e. False.
OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes", "y", "1", "t", "true"
)
|
||||||
|
|
||||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
||||||
# being indexed anyway, with whatever we could get. If it's False, the file
|
# being indexed anyway, with whatever we could get. If it's False, the file
|
||||||
# will simply be left in the CONSUMPTION_DIR.
|
# will simply be left in the CONSUMPTION_DIR.
|
||||||
|
@ -3,6 +3,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
|
import pdftotext
|
||||||
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import pyocr
|
import pyocr
|
||||||
@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
"""
|
"""
|
||||||
@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return os.path.join(self.tempdir, "convert-0000.png")
|
return os.path.join(self.tempdir, "convert-0000.png")
|
||||||
|
|
||||||
|
def _is_ocred(self):
    """
    Report whether the PDF at ``self.document_path`` already contains text.

    Returns True when more than 50 characters can be extracted from the
    document, in which case running OCR is considered unnecessary, and
    False otherwise.
    """

    # Extract text from PDF using pdftotext
    text = get_text_from_pdf(self.document_path)

    # get_text_from_pdf() yields a falsy value when the file could not be
    # parsed; calling len() on that would raise TypeError, so treat it as
    # "no embedded text" and let the caller fall back to OCR.
    if not text:
        return False

    # We assume that a PDF with more than 50 characters contains text
    # (so no OCR required)
    return len(text) > 50
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
|
if not self.OCR_ALWAYS and self._is_ocred():
|
||||||
|
self.log("info", "Skipping OCR, using Text from PDF")
|
||||||
|
return get_text_from_pdf(self.document_path)
|
||||||
|
|
||||||
images = self._get_greyscale()
|
images = self._get_greyscale()
|
||||||
|
|
||||||
@ -212,3 +228,13 @@ def image_to_string(args):
|
|||||||
except (TesseractError, OtherTesseractError):
|
except (TesseractError, OtherTesseractError):
|
||||||
pass
|
pass
|
||||||
return ocr.image_to_string(f, lang=lang)
|
return ocr.image_to_string(f, lang=lang)
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_from_pdf(pdf_file):
    """
    Extract the embedded text of a PDF using pdftotext.

    :param pdf_file: path of the PDF file to read.
    :return: the text of all pages joined with newlines, or an empty string
        when pdftotext cannot parse the file.
    """

    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Return an empty string rather than False: it is still falsy,
            # but callers that measure or index the result (e.g. len())
            # keep working instead of raising TypeError.
            return ""

        return "\n".join(pdf)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user