Merge branch 'dev' into celery-tasks

This commit is contained in:
Jonas Winkler 2020-11-19 22:10:57 +01:00
commit 17430210a1
145 changed files with 5228 additions and 11538 deletions

View File

@ -1,3 +1,4 @@
/src-ui/.vscode
/src-ui/node_modules
/src-ui/dist
.git
@ -5,3 +6,7 @@
/consume
/media
/data
/docs
.pytest_cache
/dist
/scripts

4
.gitignore vendored
View File

@ -66,6 +66,7 @@ target/
virtualenv
/venv
docker-compose.env
docker-compose.yml
# Used for development
scripts/import-for-development
@ -86,3 +87,6 @@ scripts/nuke
/consume
/export
/src-ui/.vscode
# this is where the compiled frontend is moved to.
/src/documents/static/frontend/

View File

@ -1,56 +1,22 @@
language: python
python:
- "3.6"
- "3.7"
- "3.8"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
sudo: false
matrix:
include:
- python: "3.5"
- python: "3.6"
- python: "3.7-dev"
- env:
- BUILD_DOCKER=1
# Variable to add to publish the Docker image:
# * DOCKER_USERNAME
# * DOCKER_PASSWORD, to be encrypted, use `travis encrypt DOCKER_PASSWORD=<password>`
services:
- docker
before_install:
- true
install:
- true
script:
- docker build --tag=the-paperless-project/paperless .
after_success:
- true
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
install:
- pip install --upgrade pip pipenv sphinx
- pipenv lock -r > requirements.txt
- pip install -r requirements.txt
- pip install --upgrade pipenv
- pipenv install --system --dev
script:
- cd src/
- pytest --cov
- pycodestyle
- sphinx-build -b html ../docs ../docs/_build -W
- cd src/
- pipenv run pytest --cov
- pipenv run pycodestyle
after_success:
- coveralls
deploy:
- provider: script
skip_cleanup: true
script: ci/deploy-docker
on:
tags: true
condition: '"${BUILD_DOCKER}" = 1'
- provider: script
skip_cleanup: true
script: ci/deploy-docker
on:
branch: master
condition: '"${BUILD_DOCKER}" = 1'
- pipenv run coveralls

View File

@ -1,82 +0,0 @@
###############################################################################
### Front end ###
###############################################################################
FROM node:current AS frontend
WORKDIR /usr/src/paperless/src-ui/
COPY src-ui/package* ./
RUN npm install
COPY src-ui .
RUN node_modules/.bin/ng build --prod --output-hashing none --sourceMap=false
###############################################################################
### Back end ###
###############################################################################
FROM ubuntu:20.04
WORKDIR /usr/src/paperless/
COPY Pipfile* ./
#Dependencies
RUN apt-get update \
&& DEBIAN_FRONTEND="noninteractive" apt-get -y --no-install-recommends install \
build-essential \
curl \
ghostscript \
gnupg \
imagemagick \
libmagic-dev \
libpoppler-cpp-dev \
libpq-dev \
optipng \
python3 \
python3-dev \
python3-pip \
sudo \
tesseract-ocr \
tesseract-ocr-eng \
tesseract-ocr-deu \
tesseract-ocr-fra \
tesseract-ocr-ita \
tesseract-ocr-spa \
tzdata \
unpaper \
&& pip3 install --upgrade pipenv supervisor setuptools \
&& pipenv install --system --deploy \
&& pipenv --clear \
&& apt-get -y purge build-essential python3-pip python3-dev \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir /var/log/supervisord /var/run/supervisord
# copy scripts
# this fixes issues with imagemagick and PDF
COPY scripts/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
COPY scripts/gunicorn.conf.py ./
COPY scripts/supervisord.conf /etc/supervisord.conf
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
# copy app
COPY src/ ./src/
COPY --from=frontend /usr/src/paperless/src-ui/dist/paperless-ui/ ./src/documents/static/frontend/
# add users, setup scripts
RUN addgroup --gid 1000 paperless \
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
&& chown -R paperless:paperless . \
&& chmod 755 /sbin/docker-entrypoint.sh
WORKDIR /usr/src/paperless/src/
RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["supervisord", "-c", "/etc/supervisord.conf"]
LABEL maintainer="Jonas Winkler <dev@jpwinkler.de>"

11
Pipfile
View File

@ -3,6 +3,11 @@ url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[[source]]
url = "https://www.piwheels.org/simple"
verify_ssl = true
name = "piwheels"
[packages]
django = "~=3.1"
pillow = "*"
@ -21,7 +26,7 @@ psycopg2-binary = "*"
scikit-learn="~=0.23"
whoosh="~=2.7"
gunicorn = "*"
whitenoise = "*"
whitenoise = "~=5.2"
fuzzywuzzy = "*"
python-Levenshtein = "*"
django-extensions = "*"
@ -29,6 +34,7 @@ watchdog = "*"
pathvalidate = "*"
django-q = "*"
redis = "*"
imap-tools = "*"
channels = "~=3.0"
channels-redis = "*"
daphne = "~=3.0"
@ -36,7 +42,7 @@ daphne = "~=3.0"
[dev-packages]
coveralls = "*"
factory-boy = "*"
sphinx = "*"
sphinx = "~=3.3"
tox = "*"
pycodestyle = "*"
pytest = "*"
@ -45,3 +51,4 @@ pytest-django = "*"
pytest-sugar = "*"
pytest-env = "*"
pytest-xdist = "*"
sphinx_rtd_theme = "*"

View File

@ -1,3 +1,9 @@
[![Build Status](https://travis-ci.org/jonaswinkler/paperless-ng.svg?branch=master)](https://travis-ci.org/jonaswinkler/paperless-ng)
[![Documentation Status](https://readthedocs.org/projects/paperless-ng/badge/?version=latest)](https://paperless-ng.readthedocs.io/en/latest/?badge=latest)
[![Docker Hub Pulls](https://img.shields.io/docker/pulls/jonaswinkler/paperless-ng.svg)](https://hub.docker.com/r/jonaswinkler/paperless-ng)
# Paperless-ng
[Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents.
Paperless-ng is a fork of the original project, adding a new interface and many other changes under the hood. For a detailed list of changes, see below.
@ -22,35 +28,16 @@ Here's what you get:
I wanted to make big changes to the project that would greatly impact the way its users use it. Among the users who currently use paperless in production, there are probably many that don't want these changes right away. I also wanted to have more control over what goes into the code and what does not. Therefore, paperless-ng was created. NG stands for both Angular (the framework used for the front end) and next-gen. Publishing this project under a different name also avoids confusion between paperless and paperless-ng.
This is a list of changes that have been made to the original project.
The gist of the changes is the following:
## Added
- **A new single page UI** built with Bootstrap and Angular. It's much more responsive than the Django admin pages. It features the following improvements over the old Django admin interface:
- *Dashboard.* The landing page shows some useful information, such as statistics, recently scanned documents, file uploading, and possibly more in the future.
- *Document uploading on the web page.* This is very crude right now, but gets the job done. It simply uploads the documents and stores them in the configured consumer directory. The API for that has always been in the project, there simply was no form on the UI to support it.
- *Full text search* with a proper document indexer: The search feature sorts documents by relevance to the search query, highlights query terms in the found documents and provides autocomplete while typing the query. This is still very basic but will see extensions in the future.
- *Saveable filters.* Save filter and sorting presets and optionally display a couple of documents from saved filters (i.e., your inbox sorted descending by added date, or tagged TODO, oldest to newest) on the dashboard.
- *Statistics.* Provides basic statistics about your document collection.
- **Document types.** Similar to correspondents, each document may have a type (i.e., invoice, letter, receipt, bank statement, ...). I initially intended to use this for individual processing of differently typed documents; however, no such features exist yet.
- **Inbox tags.** These tags are automatically assigned to every newly scanned document. They are intended to be removed once you have manually edited the metadata of a document.
- **Automatic matching** for document types, correspondents, and tags. A new matching algorithm has been implemented (Auto), which is based on a classification model (simple feed-forward neural nets are used). This classifier is trained on your document collection and learns to assign metadata to new documents based on their similarity to existing documents.
- If, for example, all your bank statements for a specific account are tagged with "bank_account_1234" and the matching algorithm of that tag is set to Auto, the classifier learns relevant phrases and words in the documents and assigns this tag automatically to newly scanned and matching documents.
- This works reasonably well if there is a correlation between the tag and the content of the document. Tags such as 'TODO' or 'Contact Correspondent' cannot be assigned automatically.
- **Archive serial numbers.** These are there to support the recommended workflow for storing physical copies of very important documents. The idea is that if a document has to be kept in physical form, you write a running number on the document before scanning (the archive serial number) and keep these documents sorted by number in a binder. If you need to access a specific physical document at some point in time, search for the document in paperless, identify the ASN and grab the document.
* New front end. This will eventually be mobile friendly as well.
* New full text search.
* Machine learning powered document matching.
* Code cleanup in many, MANY areas.
## Modified
- **(BREAKING) REST API changes.** In order to support the new UI, changes had to be made to the API. Some filters are not available anymore, and other filters were added. Furthermore, foreign key relationships are no longer expressed with URLs, but with their respective IDs. Also, the URLs for fetching documents and thumbnails have changed. Redirects are in place to support the old URLs.
If you want to see some screenshots of paperless-ng in action, [some are available in the documentation](https://paperless-ng.readthedocs.io/en/latest/screenshots.html).
## Internal changes
- Many improvements to the code. More concise logging of the consumer, better multithreading of the tesseract parser for large documents, fewer hacks overall.
- Updated docker image. This image runs everything in a single container. (Except the optional database, of course)
## Removed
These features were each removed for two reasons. First, I did not feel they contributed all that much to the overall project, and second, I don't want to maintain them.
- **(BREAKING) Reminders.** I have no idea what they were used for and thus removed them from the project.
- **Every customization made to the admin interface.** Since this is not the primary interface for the application anymore, there is no need to keep and maintain these. Besides, some changes were incompatible with the most recent versions of Django. The interface is completely usable, though.
For a complete list of changes, check out the [changelog](https://paperless-ng.readthedocs.io/en/latest/changelog.html)
## Planned
@ -61,6 +48,7 @@ These features will make it into the application at some point, sorted by priori
- Ability to search for “Similar documents” in the search results
- Provide corrections for misspelled queries
- **More robust consumer** that shows its progress on the web page.
- **More rigid email processing.** For example, don't delete imported mail, provide filters, etc.
- **Arbitrary tag colors**. Allow the selection of any color with a color picker.
## On the chopping block.
@ -68,39 +56,34 @@ These features will make it into the application at some point, sorted by priori
I don't know if these features are used all that much. I don't exactly know how they work and will probably remove them at some point in the future.
- **GnuPG encryption.** Since it's disabled by default and the website allows transparent access to encrypted documents anyway, this doesn't really provide any benefit over having the application stored on an encrypted file system.
- **E-Mail scanning.** I don't use it and don't know the state of the implementation. I'll have to look into that.
# Getting started
The recommended way to deploy paperless is docker-compose.
The recommended way to deploy paperless is docker-compose. Use the provided docker-compose.yml files to get started. This pulls the image from Docker Hub. Alternatively, you can build the image yourself.
git clone https://github.com/jonaswinkler/paperless
cd paperless
cp docker-compose.yml.example docker-compose.yml
cp docker-compose.env.example docker-compose.env
docker-compose up -d
Please be aware that this uses a PostgreSQL database instead of SQLite. If you want to continue using SQLite, remove the database-related options from the docker-compose.env file.
Read the [documentation](https://paperless-ng.readthedocs.io/en/latest/setup.html#installation) on how to get started.
Alternatively, you can install the dependencies and setup apache and a database server yourself. Details for that will be available in the documentation.
# Migrating to paperless-ng
Don't do it yet. The migrations are in place, but I have not verified yet that they work.
Read the section about [migration](https://paperless-ng.readthedocs.io/en/latest/setup.html#migration-to-paperless-ng) in the documentation.
# Documentation
The documentation for Paperless is available on [ReadTheDocs](https://paperless-ng.readthedocs.io/). Updated documentation for this project is not yet available.
The documentation for Paperless-ng is available on [ReadTheDocs](https://paperless-ng.readthedocs.io/).
# Affiliated Projects
Paperless has been around a while now, and people are starting to build stuff on top of it. If you're one of those people, we can add your project to this list:
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless. This app is not compatible at this point.
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless.
* [Paperless Desktop](https://github.com/thomasbrueggemann/paperless-desktop): A desktop UI for your Paperless installation. Runs on Mac, Linux, and Windows.
* [ansible-role-paperless](https://github.com/ovv/ansible-role-paperless): An easy way to get Paperless running via Ansible.
* [paperless-cli](https://github.com/stgarf/paperless-cli): A golang command line binary to interact with a Paperless instance.
Compatibility with Paperless-ng is unknown.
# Important Note
Document scanners are typically used to scan sensitive documents. Things like your social insurance number, tax records, invoices, etc. Everything is stored in the clear without encryption by default (it needs to be searchable, so if someone has ideas on how to do that on encrypted data, I'm all ears). This means that Paperless should never be run on an untrusted host. Instead, I recommend that if you do want to use it, run it locally on a server in your own home.

View File

@ -1,15 +0,0 @@
#!/bin/bash
if [ "${DOCKER_USERNAME}" == "" -o "${DOCKER_PASSWORD}" == "" ]
then
exit 0
fi
docker login --username=${DOCKER_USERNAME} --password=${DOCKER_PASSWORD}
if [ "${TRAVIS_TAG}" != "" ]
then
docker tag the-paperless-project/paperless the-paperless-project/paperless:${TRAVIS_TAG}
docker push the-paperless-project/paperless:${TRAVIS_TAG}
else
docker push the-paperless-project/paperless
fi

View File

@ -24,16 +24,16 @@
# Adjust this key if you plan to make paperless available publicly. It should
# be a very long sequence of random characters. You don't need to remember it.
#PAPERLESS_SECRET_KEY="change-me"
#PAPERLESS_SECRET_KEY=change-me
# Use this variable to set a timezone for the Paperless Docker containers. If not specified, defaults to UTC.
#PAPERLESS_TIME_ZONE=America/Los_Angeles
# The default language to use for OCR. Set this to the language most of your
# documents are written in.
#PAPERLESS_OCR_LANGUAGE="eng"
#PAPERLESS_OCR_LANGUAGE=eng
# By default Paperless does not OCR a document if the text can be retrieved from
# the document directly. Set to true to always OCR documents. (i.e., if you
# know that some of your documents have faulty/bad OCR data)
#PAPERLESS_OCR_ALWAYS="true"
#PAPERLESS_OCR_ALWAYS=true

View File

@ -86,5 +86,9 @@ if [[ ! -z "$PAPERLESS_OCR_LANGUAGES" ]]; then
install_languages "$PAPERLESS_OCR_LANGUAGES"
fi
exec "$@"
if [[ "$1" != "/"* ]]; then
exec sudo -HEu paperless python3 manage.py "$@"
else
exec "$@"
fi

View File

@ -1,12 +1,12 @@
version: "3.4"
services:
broker:
image: redis:latest
#restart: always
image: redis:6.0
restart: always
db:
image: postgres:13
#restart: always
restart: always
volumes:
- pgdata:/var/lib/postgresql/data
environment:
@ -15,10 +15,11 @@ services:
POSTGRES_PASSWORD: paperless
webserver:
image: paperless-ng:latest
#restart: always
image: jonaswinkler/paperless-ng:0.9.1
restart: always
depends_on:
- db
- broker
ports:
- 8000:8000
healthcheck:
@ -35,7 +36,6 @@ services:
environment:
PAPERLESS_REDIS: redis://broker:6379
PAPERLESS_DBHOST: db
command: ["supervisord", "-c", "/etc/supervisord.conf"]
volumes:

View File

@ -0,0 +1,31 @@
version: "3.4"
services:
broker:
image: redis:6.0
restart: always
webserver:
image: jonaswinkler/paperless-ng:0.9.1
restart: always
depends_on:
- broker
ports:
- 8000:8000
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000"]
interval: 30s
timeout: 10s
retries: 5
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
- ./export:/usr/src/paperless/export
- ./consume:/usr/src/paperless/consume
env_file: docker-compose.env
environment:
PAPERLESS_REDIS: redis://broker:6379
volumes:
data:
media:

60
docker/local/Dockerfile Normal file
View File

@ -0,0 +1,60 @@
FROM python:3.7-slim
WORKDIR /usr/src/paperless/
COPY requirements.txt ./
#Dependencies
RUN apt-get update \
&& apt-get -y --no-install-recommends install \
build-essential \
curl \
ghostscript \
gnupg \
imagemagick \
libatlas-base-dev \
libmagic-dev \
libpoppler-cpp-dev \
libpq-dev \
optipng \
sudo \
tesseract-ocr \
tesseract-ocr-eng \
tesseract-ocr-deu \
tesseract-ocr-fra \
tesseract-ocr-ita \
tesseract-ocr-spa \
tzdata \
unpaper \
&& pip3 install --upgrade supervisor setuptools \
&& pip install --no-cache-dir -r requirements.txt \
&& apt-get -y purge build-essential \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir /var/log/supervisord /var/run/supervisord
# copy scripts
# this fixes issues with imagemagick and PDF
COPY docker/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
COPY docker/gunicorn.conf.py ./
COPY docker/supervisord.conf /etc/supervisord.conf
COPY docker/docker-entrypoint.sh /sbin/docker-entrypoint.sh
# copy app
COPY src/ ./src/
# add users, setup scripts
RUN addgroup --gid 1000 paperless \
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
&& chown -R paperless:paperless . \
&& chmod 755 /sbin/docker-entrypoint.sh
WORKDIR /usr/src/paperless/src/
RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisord.conf"]
LABEL maintainer="Jonas Winkler <dev@jpwinkler.de>"

View File

@ -0,0 +1,44 @@
version: "3.4"
services:
broker:
image: redis:6.0
restart: always
db:
image: postgres:13
restart: always
volumes:
- pgdata:/var/lib/postgresql/data
environment:
POSTGRES_DB: paperless
POSTGRES_USER: paperless
POSTGRES_PASSWORD: paperless
webserver:
build: .
restart: always
depends_on:
- db
- broker
ports:
- 8000:8000
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000"]
interval: 30s
timeout: 10s
retries: 5
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
- ./export:/usr/src/paperless/export
- ./consume:/usr/src/paperless/consume
env_file: docker-compose.env
environment:
PAPERLESS_REDIS: redis://broker:6379
PAPERLESS_DBHOST: db
volumes:
data:
media:
pgdata:

View File

@ -0,0 +1,31 @@
version: "3.4"
services:
broker:
image: redis:6.0
restart: always
webserver:
build: .
restart: always
depends_on:
- broker
ports:
- 8000:8000
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000"]
interval: 30s
timeout: 10s
retries: 5
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
- ./export:/usr/src/paperless/export
- ./consume:/usr/src/paperless/consume
env_file: docker-compose.env
environment:
PAPERLESS_REDIS: redis://broker:6379
volumes:
data:
media:

View File

@ -1,10 +1,11 @@
[supervisord]
nodaemon=true ; start in foreground if true; default false
logfile=/var/log/supervisord/supervisord.log ; main log file; default $CWD/supervisord.log
pidfile=/var/log/supervisord/supervisord.pid ; supervisord pidfile; default supervisord.pid
pidfile=/var/run/supervisord/supervisord.pid ; supervisord pidfile; default supervisord.pid
logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB
logfile_backups=10 ; # of main logfile backups; 0 means none, default 10
loglevel=info ; log level; default info; others: debug,warn,trace
user=root
[program:daphne]
command=daphne -b 0.0.0.0 -p 8000 paperless.asgi:application

Binary file not shown. (Before: 60 KiB)
Binary file not shown. (Before: 26 KiB)
Binary file not shown. (Before: 113 KiB)
BIN docs/_static/paperless-0-dashboard.png vendored Executable file (After: 52 KiB)
BIN docs/_static/paperless-1-list-table.png vendored Executable file (After: 62 KiB)
BIN docs/_static/paperless-10-mobile.png vendored Executable file (After: 56 KiB)
Binary file not shown. (After: 70 KiB)
BIN docs/_static/paperless-2-list-smallcards.png vendored Executable file (After: 256 KiB)
BIN docs/_static/paperless-3-list-largecards.png vendored Executable file (After: 224 KiB)
BIN docs/_static/paperless-4-filter.png vendored Executable file (After: 101 KiB)
BIN docs/_static/paperless-5-editing.png vendored Executable file (After: 196 KiB)
BIN docs/_static/paperless-6-tags.png vendored Executable file (After: 50 KiB)
BIN docs/_static/paperless-7-autocomplete.png vendored Executable file (After: 53 KiB)
BIN docs/_static/paperless-8-search-results.png vendored Executable file (After: 214 KiB)
BIN docs/_static/paperless-9-admin.png vendored Executable file (After: 50 KiB)
BIN docs/_static/recommended_workflow.png vendored Normal file (After: 67 KiB)

391
docs/administration.rst Normal file
View File

@ -0,0 +1,391 @@
**************
Administration
**************
.. _administration-backup:
Making backups
##############
Multiple options exist for making backups of your paperless instance,
depending on how you installed paperless.
Before making backups, make sure that paperless is not running.
Options available to any installation of paperless:
* Use the :ref:`document exporter <utilities-exporter>`.
The document exporter exports all your documents, thumbnails and
metadata to a specific folder. You may import your documents into a
fresh instance of paperless again or store your documents in another
DMS with this export.
Options available to docker installations:
* Back up the Docker volumes. These usually reside within
``/var/lib/docker/volumes`` on the host and you need to be root in order
to access them.
Paperless uses 3 volumes:
* ``paperless_media``: This is where your documents are stored.
* ``paperless_data``: This is where auxiliary data is stored. This
folder also contains the SQLite database, if you use it.
* ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
the database.
Options available to bare-metal and non-docker installations:
* Back up the entire paperless folder. This ensures that if your paperless instance
crashes at some point or your disk fails, you can simply copy the folder back
into place and it works.
When using PostgreSQL, you'll also have to back up the database.
.. _migrating-restoring:
Restoring
=========
.. _administration-updating:
Updating paperless
##################
If a new release of paperless-ng is available, upgrading depends on how you
installed paperless-ng in the first place. The releases are available at the
`release page <https://github.com/jonaswinkler/paperless-ng/releases>`_.
First of all, ensure that paperless is stopped.
.. code:: shell-session
$ cd /path/to/paperless
$ docker-compose down
After that, :ref:`make a backup <administration-backup>`.
A. If you used the docker-compose file, simply download the files of the new release,
adjust the settings in the files (i.e., the path to your consumption directory),
and replace your existing docker-compose files. Then start paperless as usual,
which will pull the new image, and update your database, if necessary:
.. code:: shell-session
$ cd /path/to/paperless
$ docker-compose up
If you see everything working, you can start paperless-ng with "-d" to have it
run in the background.
.. hint::
The released docker-compose files specify exact versions to be pulled from the hub.
This is to ensure that if the docker-compose files should change at some point
(i.e., services updated or configured differently), you won't run into trouble due to
docker pulling the ``latest`` image and running it in an older environment.
B. If you built the image yourself, grab the new archive and replace your current
paperless folder with the new contents.
After that, make the necessary adjustments to the docker-compose.yml (i.e.,
adjust your consumption directory).
Build and start the new image with:
.. code:: shell-session
$ cd /path/to/paperless
$ docker-compose build
$ docker-compose up
If you see everything working, you can start paperless-ng with "-d" to have it
run in the background.
.. hint::
You can usually keep your ``docker-compose.env`` file, since this file will
never include mandatory configuration options. However, it is worth checking
out the new version of this file, since it might have new recommendations
on what to configure.
Updating paperless without docker
=================================
After grabbing the new release and unpacking the contents, do the following:
1. Update python requirements. Paperless uses
`Pipenv`_ for managing dependencies:
.. code:: shell-session
$ pip install --upgrade pipenv
$ cd /path/to/paperless
$ pipenv install
$ pipenv clean
This creates a new virtual environment (or uses your existing environment)
and installs all dependencies into it.
2. Collect static files.
.. code:: shell-session
$ cd src
$ pipenv run python3 manage.py collectstatic --clear
3. Migrate the database.
.. code:: shell-session
$ cd src
$ pipenv run python3 manage.py migrate
Management utilities
####################
Paperless comes with some management commands that perform various maintenance
tasks on your paperless instance. You can invoke these commands either by
.. code:: bash
$ cd /path/to/paperless
$ docker-compose run --rm webserver <command> <arguments>
or
.. code:: bash
$ cd /path/to/paperless/src
$ pipenv run python manage.py <command> <arguments>
depending on whether you use docker or not.
All commands have built-in help, which can be accessed by executing them with
the argument ``--help``.
.. _utilities-exporter:
Document exporter
=================
The document exporter exports all your data from paperless into a folder for
backup or migration to another DMS.
.. code::
document_exporter target
``target`` is a folder to which the data gets written. This includes documents,
thumbnails and a ``manifest.json`` file. The manifest contains all metadata from
the database (correspondents, tags, etc).
When you use the provided docker compose script, specify ``../export`` as the
target. This path inside the container is automatically mounted on your host on
the folder ``export``.
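For example, a run of the exporter with the provided docker compose setup could look like this; the paths are the defaults from this documentation and may differ for your installation:

.. code:: shell-session

    $ cd /path/to/paperless
    $ docker-compose run --rm webserver document_exporter ../export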
.. _utilities-importer:
Document importer
=================
The document importer takes the export produced by the `Document exporter`_ and
imports it into paperless.
The importer works just like the exporter. You point it at a directory, and
the script does the rest of the work:
.. code::
document_importer source
When you use the provided docker compose script, put the export inside the
``export`` folder in your paperless source directory. Specify ``../export``
as the ``source``.
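A matching sketch for the importer with docker compose, assuming the export sits in the ``export`` folder as described above:

.. code:: shell-session

    $ cd /path/to/paperless
    $ docker-compose run --rm webserver document_importer ../export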
.. _utilities-retagger:
Document retagger
=================
Say you've imported a few hundred documents and now want to introduce
a tag or set up a new correspondent, and apply its matching to all of
the currently-imported docs. This problem is common enough that
there are tools for it.
.. code::
document_retagger [-h] [-c] [-T] [-t] [-i] [--use-first] [-f]
optional arguments:
-c, --correspondent
-T, --tags
-t, --document_type
-i, --inbox-only
--use-first
-f, --overwrite
Run this after changing or adding matching rules. It'll loop over all
of the documents in your database and attempt to match documents
according to the new rules.
Specify any combination of ``-c``, ``-T`` and ``-t`` to have the
retagger perform matching of the specified metadata type. If you don't
specify any of these options, the document retagger won't do anything.
Specify ``-i`` to have the document retagger work on documents tagged
with inbox tags only. This is useful when you don't want to mess with
your already processed documents.
When multiple document types or correspondents match a single document,
the retagger won't assign these to the document. Specify ``--use-first``
to override this behaviour and just use the first correspondent or type
it finds. This option does not apply to tags, since any number of tags
can be applied to a document.
Finally, ``-f`` specifies that you wish to overwrite already assigned
correspondents, types and/or tags. The default behaviour is to not
assign correspondents and types to documents that have this data already
assigned. ``-f`` works differently for tags: By default, only additional tags get
added to documents, no tags will be removed. With ``-f``, tags that don't
match a document anymore get removed as well.
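As an illustrative sketch, the following run would re-match tags and document types on inbox-tagged documents only, using the docker compose form of the management commands shown above:

.. code:: shell-session

    $ docker-compose run --rm webserver document_retagger -T -t -i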
Managing the Automatic matching algorithm
=========================================
The *Auto* matching algorithm requires a trained neural network to work.
This network needs to be updated whenever something in your data
changes. The docker image takes care of that automatically with the task
scheduler. You can manually renew the classifier by invoking the following
management command:
.. code::
document_create_classifier
This command takes no arguments.
Managing the document search index
==================================
The document search index is responsible for delivering search results for the
website. The document index is automatically updated whenever documents get
added to, changed, or removed from paperless. However, if the search yields
non-existent documents or won't find anything, you may need to recreate the
index manually.
.. code::
document_index {reindex,optimize}
Specify ``reindex`` to have the index created from scratch. This may take some
time.
Specify ``optimize`` to optimize the index. This updates certain aspects of
the index and usually makes queries faster and also ensures that the
autocompletion works properly. This command is regularly invoked by the task
scheduler.
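For example, rebuilding the index from scratch with the docker compose setup might look like this (sketch only):

.. code:: shell-session

    $ docker-compose run --rm webserver document_index reindex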
.. _utilities-renamer:
Managing filenames
==================
If you use paperless' feature to
:ref:`assign custom filenames to your documents <advanced-file_name_handling>`,
you can use this command to move all your files after changing
the naming scheme.
.. warning::
Since this command moves your documents around a lot, it is advised to do
a backup beforehand. The renaming logic is robust and will never overwrite
or delete a file, but you can never be too careful.
.. code::
document_renamer
The command takes no arguments and processes all your documents at once.
Fetching e-mail
===============
Paperless automatically fetches your e-mail every 10 minutes by default. If
you want to invoke the email consumer manually, call the following management
command:
.. code::
mail_fetcher
The command takes no arguments and processes all your mail accounts and rules.
.. _utilities-encyption:
Managing encryption
===================
Documents can be stored in Paperless using GnuPG encryption.
.. danger::
Encryption is deprecated since paperless-ng 0.9 and doesn't really provide any
additional security, since you have to store the passphrase in a configuration
file on the same system as the encrypted documents for paperless to work.
Furthermore, the entire text content of the documents is stored in plain text in the
database, even if your documents are encrypted. Filenames are not encrypted
either.
Also, the web server provides transparent access to your encrypted documents.
Consider running paperless on an encrypted filesystem instead, which will then
at least provide security against physical hardware theft.
.. code::
change_storage_type [--passphrase PASSPHRASE] {gpg,unencrypted} {gpg,unencrypted}
positional arguments:
{gpg,unencrypted} The state you want to change your documents from
{gpg,unencrypted} The state you want to change your documents to
optional arguments:
--passphrase PASSPHRASE
Enabling encryption
-------------------
Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
.. code::
change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
Disabling encryption
--------------------
Basic usage to disable encryption of your document store:
(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
.. code::
change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
.. _Pipenv: https://pipenv.pypa.io/en/latest/

327
docs/advanced_usage.rst Normal file
View File

@ -0,0 +1,327 @@
***************
Advanced topics
***************
Paperless offers a couple features that automate certain tasks and make your life
easier.
Guesswork
#########
Any document you put into the consumption directory will be consumed, but if
you name the file right, it'll automatically set some values in the database
for you. This is the logic the consumer follows:
1. Try to find the correspondent, title, and tags in the file name following
the pattern: ``Date - Correspondent - Title - tag,tag,tag.pdf``. Note that
the format of the date is **rigidly defined** as ``YYYYMMDDHHMMSSZ`` or
``YYYYMMDDZ``. The ``Z`` refers to "Zulu time", i.e. "UTC".
The tags are optional, so the format ``Date - Correspondent - Title.pdf``
works as well.
2. If that doesn't work, we skip the date and try this pattern:
``Correspondent - Title - tag,tag,tag.pdf``.
3. If that doesn't work, we try to find the correspondent and title in the file
name following the pattern: ``Correspondent - Title.pdf``.
4. If that doesn't work, just assume that the name of the file is the title.
So given the above, the following examples would work as you'd expect:
* ``20150314000700Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``20150314Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``Another Company - Letter of Reference.jpg``
* ``Dad's Recipe for Pancakes.png``
These however wouldn't work:
* ``2015-03-14 00:07:00 UTC - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``2015-03-14 - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``Another Company- Letter of Reference.jpg``
Do I have to be so strict about naming?
=======================================
Rather than using the strict document naming rules, one can also set the option
``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
that is accepted by dateparser_. Doing so will cause ``paperless`` to default
to any date format that is found in the title, instead of a date pulled from
the document's text, without requiring the strict formatting of the document
filename as described above.
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
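As a hedged example, if most of your filenames carry dates in day-month-year order, the relevant line in ``paperless.conf`` might look like this; the exact value must be a date order accepted by dateparser:

.. code:: bash

    # dateparser date order: day, then month, then year
    PAPERLESS_FILENAME_DATE_ORDER=DMY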
.. _advanced-transforming_filenames:
Transforming filenames for parsing
==================================
Some devices can't produce filenames that can be parsed by the default
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
``paperless.conf`` one can add transformations that are applied to the filename
before it's parsed.
The option contains a list of dictionaries of regular expressions (key:
``pattern``) and replacements (key: ``repl``) in JSON format, which are
applied in order by passing them to ``re.subn``. Transformation stops
after the first match, so at most one transformation is applied. The general
syntax is
.. code:: python
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
The example below is for a Brother ADS-2400N, a scanner that allows assigning
different names to different hardware buttons (useful for handling
multiple entities in one instance), but insists on adding ``_<count>``
to the filename.
.. code:: python
# Brother profile configuration, support "Name_Date_Count" (the default
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
Matching tags, correspondents and document types
################################################
After the consumer has tried to figure out what it could from the file name,
it starts looking at the content of the document itself. It will compare the
matching algorithms defined by every tag and correspondent already set in your
database to see if they apply to the text in that document. In other words,
if you defined a tag called ``Home Utility`` that had a ``match`` property of
``bc hydro`` and a ``matching_algorithm`` of ``literal``, Paperless will
automatically tag your newly-consumed document with your ``Home Utility`` tag
so long as the text ``bc hydro`` appears in the body of the document somewhere.
The matching logic is quite powerful, and supports searching the text of your
document with different algorithms, and as such, some experimentation may be
necessary to get things right.
In order to have a tag, correspondent or type assigned automatically to newly
consumed documents, assign a match and matching algorithm using the web
interface. These settings define when to assign correspondents, tags and types
to documents.
The following algorithms are available:
* **Any:** Looks for any occurrence of any word provided in match in the PDF.
If you define the match as ``Bank1 Bank2``, it will match documents containing
either of these terms.
* **All:** Requires that every word provided appears in the PDF, albeit not in the
order provided.
* **Literal:** Matches only if the match appears exactly as provided in the PDF.
* **Regular expression:** Parses the match as a regular expression and tries to
find a match within the document.
* **Fuzzy match:** I don't know. Look at the source.
* **Auto:** Tries to automatically match new documents. This does not require you
to set a match. See the notes below.
When using the "any" or "all" matching algorithms, you can search for terms
that consist of multiple words by enclosing them in double quotes. For example,
defining a match text of ``"Bank of America" BofA`` using the "any" algorithm
will match documents that contain either "Bank of America" or "BofA", but will
not match documents containing "Bank of South America".
Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.
.. _advanced-automatic_matching:
Automatic matching
==================
Paperless-ng comes with a new matching algorithm called *Auto*. This matching
algorithm tries to assign tags, correspondents and document types to your
documents based on how you have assigned these on existing documents. It
uses a neural network under the hood.
If, for example, all your bank statements of your account 123 at the Bank of
America are tagged with the tag "bofa_123" and the matching algorithm of this
tag is set to *Auto*, this neural network will examine your documents and
automatically learn when to assign this tag.
There are a couple caveats you need to keep in mind when using this feature:
* Changes to your documents are not immediately reflected by the matching
algorithm. The neural network needs to be *trained* on your documents after
changes. Paperless periodically (default: once each hour) checks for changes
and does this automatically for you.
* The Auto matching algorithm only takes documents into account which are NOT
placed in your inbox (i.e., that do not have inbox tags assigned to them). This ensures
that the neural network only learns from documents which you have correctly
tagged before.
* The matching algorithm can only work if there is a correlation between the
tag, correspondent or document type and the document itself. Your bank
statements usually contain your bank account number and the name of the bank,
so this works reasonably well. However, tags such as "TODO" cannot be
automatically assigned.
* The matching algorithm needs a reasonable number of documents to identify when
to assign tags, correspondents, and types. If one out of a thousand documents
has the correspondent "Very obscure web shop I bought something five years
ago", it will probably not assign this correspondent automatically if you buy
something from them again. The more documents, the better.
Hooking into the consumption process
####################################
Sometimes you may want to do something arbitrary whenever a document is
consumed. Rather than try to predict what you may want to do, Paperless lets
you execute scripts of your own choosing just before or after a document is
consumed using a couple simple hooks.
Just write a script, put it somewhere that Paperless can read & execute, and
then put the path to that script in ``paperless.conf`` with the variable name
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
``PAPERLESS_POST_CONSUME_SCRIPT``.
.. important::
These scripts are executed in a **blocking** process, which means that if
a script takes a long time to run, it can significantly slow down your
document consumption flow. If you want things to run asynchronously,
you'll have to fork the process in your script and exit.
Pre-consumption script
======================
Executed after the consumer sees a new document in the consumption folder, but
before any processing of the document is performed. This script receives exactly
one argument:
* Document file name
A simple but common example for this would be creating a simple script like
this:
``/usr/local/bin/ocr-pdf``
.. code:: bash
#!/usr/bin/env bash
pdf2pdfocr.py -i ${1}
``/etc/paperless.conf``
.. code:: bash
...
PAPERLESS_PRE_CONSUME_SCRIPT="/usr/local/bin/ocr-pdf"
...
This will pass the path to the document about to be consumed to ``/usr/local/bin/ocr-pdf``,
which will in turn call `pdf2pdfocr.py`_ on your document, which will then
overwrite the file with an OCR'd version of the file and exit. At which point,
the consumption process will begin with the newly modified file.
.. _pdf2pdfocr.py: https://github.com/LeoFCardoso/pdf2pdfocr
.. _advanced-post_consume_script:
Post-consumption script
=======================
Executed after the consumer has successfully processed a document and has moved it
into paperless. It receives the following arguments:
* Document id
* Generated file name
* Source path
* Thumbnail path
* Download URL
* Thumbnail URL
* Correspondent
* Tags
The script can be in any language you like, but for a simple shell script
example, you can take a look at ``post-consumption-example.sh`` in the
``scripts`` directory in this project.
The post consumption script cannot cancel the consumption process.
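For illustration only (this is not the bundled ``post-consumption-example.sh``), a minimal script that just logs the arguments in the order listed above could look like this; the variable names are placeholders, not part of paperless:

.. code:: bash

    #!/usr/bin/env bash
    # Positional arguments as documented above (names are illustrative).
    DOCUMENT_ID=${1}
    DOCUMENT_FILE_NAME=${2}
    DOCUMENT_SOURCE_PATH=${3}
    DOCUMENT_THUMBNAIL_PATH=${4}
    DOCUMENT_DOWNLOAD_URL=${5}
    DOCUMENT_THUMBNAIL_URL=${6}
    DOCUMENT_CORRESPONDENT=${7}
    DOCUMENT_TAGS=${8}

    # Append a one-line record for each consumed document.
    echo "consumed ${DOCUMENT_ID}: ${DOCUMENT_FILE_NAME} (${DOCUMENT_CORRESPONDENT})" >> /tmp/paperless-consumed.log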
.. _advanced-file_name_handling:
File name handling
##################
By default, paperless stores your documents in the media directory and renames them
using the identifier which it has assigned to each document. You will end up getting
files like ``0000123.pdf`` in your media directory. This isn't necessarily a bad
thing, because you normally don't have to access these files manually. However, if
you wish to name your files differently, you can do that by adjusting the
``PAPERLESS_FILENAME_FORMAT`` settings variable.
This variable allows you to configure the filename (folders are allowed!) using
placeholders. For example, setting
.. code:: bash
PAPERLESS_FILENAME_FORMAT={created_year}/{correspondent}/{title}
will create a directory structure as follows:
.. code::
2019/
my_bank/
statement-january-0000001.pdf
statement-february-0000002.pdf
2020/
my_bank/
statement-january-0000003.pdf
shoe_store/
my_new_shoes-0000004.pdf
Paperless appends the unique identifier of each document to the filename. This
avoids filename clashes.
.. danger::
Do not manually move your files in the media folder. Paperless remembers the
last filename a document was stored as. If you do rename a file, paperless will
report your files as missing and won't be able to find them.
Paperless provides the following placeholders within filenames:
* ``{correspondent}``: The name of the correspondent, or "none".
* ``{title}``: The title of the document.
* ``{created}``: The full date and time the document was created.
* ``{created_year}``: Year created only.
* ``{created_month}``: Month created only (number 1-12).
* ``{created_day}``: Day created only (number 1-31).
* ``{added}``: The full date and time the document was added to paperless.
* ``{added_year}``: Year added only.
* ``{added_month}``: Month added only (number 1-12).
* ``{added_day}``: Day added only (number 1-31).
* ``{tags}``: I don't know how this works. Look at the source.
Paperless will convert all values for the placeholders into values which are safe
for use in filenames.
.. hint::
Paperless checks the filename of a document whenever it is saved. Therefore,
you need to update the filenames of your documents and move them after altering
this setting by invoking the :ref:`document renamer <utilities-renamer>`.
.. warning::
Make absolutely sure you get the spelling of the placeholders right, or else
paperless will use the default naming scheme instead.
.. caution::
As of now, you could totally tell paperless to store your files anywhere outside
the media directory by setting
.. code::
PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title}
However, keep in mind that inside docker, if files get stored outside of the
predefined volumes, they will be lost after a restart of paperless.

View File

@ -1,23 +1,173 @@
.. _api:
************
The REST API
############
************
Paperless makes use of the `Django REST Framework`_ standard API interface
because of its inherent awesomeness. Conveniently, the system is also
self-documenting, so to learn more about the access points, schema, what's
accepted and what isn't, you need only visit ``/api`` on your local Paperless
installation.
Paperless makes use of the `Django REST Framework`_ standard API interface.
It provides a browsable API for most of its endpoints, which you can inspect
at ``http://<paperless-host>:<port>/api/``. This also documents most of the
available filters and ordering fields.
.. _Django REST Framework: http://django-rest-framework.org/
The API provides 5 main endpoints:
.. _api-uploading:
* ``/api/correspondents/``: Full CRUD support.
* ``/api/document_types/``: Full CRUD support.
* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
* ``/api/logs/``: Read-Only.
* ``/api/tags/``: Full CRUD support.
Uploading
---------
All of these endpoints except for the logging endpoint
allow you to fetch, edit and delete individual objects
by appending their primary key to the path, for example ``/api/documents/454/``.
File uploads in an API are hard and so far as I've been able to tell, there's
no standard way of accepting them, so rather than crowbar file uploads into the
REST API and endure that headache, I've left that process to a simple HTTP
POST, documented on the :ref:`consumption page <consumption-http>`.
In addition to that, the document endpoint offers these additional actions on
individual documents:
* ``/api/documents/<pk>/download/``: Download the original document.
* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
* ``/api/documents/<pk>/preview/``: Display the original document inline,
without downloading it.
.. hint::
Paperless used to provide this functionality at ``/fetch/<pk>/preview``,
``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
are in place. However, if you use these old URLs to access documents, you
should update your app or script to use the new URLs.
Searching for documents
#######################
Paperless-ng offers API endpoints for full text search. These are as follows:
``/api/search/``
================
Get search results based on a query.
Query parameters:
* ``query``: The query string. See
`here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
for details on the syntax.
* ``page``: Specify the page you want to retrieve. Each page
contains 10 search results and the first page is ``page=1``, which
is the default if this is omitted.
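A sketch of such a request with curl, assuming a local instance on port 8000; host, port and any authentication you have configured are not part of this example:

.. code:: shell-session

    $ curl "http://localhost:8000/api/search/?query=invoice&page=1"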
Result list object returned by the endpoint:
.. code:: json
{
"count": 1,
"page": 1,
"page_count": 1,
"results": [
]
}
* ``count``: The approximate total number of results.
* ``page``: The page returned to you. This might be different from
the page you requested, if you requested a page that is behind
the last page. In that case, the last page is returned.
* ``page_count``: The total number of pages.
* ``results``: A list of result objects on the current page.
Result object:
.. code:: json
{
"id": 1,
"highlights": [
],
"score": 6.34234,
"rank": 23,
"document": {
}
}
* ``id``: the primary key of the found document
* ``highlights``: an object containing parseable highlights for the result.
See below.
* ``score``: The score assigned to the document. A higher score indicates a
better match with the query. Search results are sorted descending by score.
* ``rank``: the position of the document within the entire search results list.
* ``document``: The full json of the document, as returned by
``/api/documents/<id>/``.
Highlights object:
Highlights are provided as a list of fragments. A fragment is a longer section of
text from the original document.
Each fragment contains a list of strings, and some of them are marked as a highlight.
.. code:: json
[
[
{"text": "This is a sample text with a "},
{"text": "highlighted", "term": 0},
{"text": " word."}
],
[
{"text": "Another", "term": 1},
{"text": " fragment with a highlight."}
]
]
When ``term`` is present within a string, the word within ``text`` should be highlighted.
The term index groups multiple matches together and words with the same index
should get identical highlighting.
A client may use this example to produce the following output:
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
``/api/search/autocomplete/``
=============================
Get auto completions for a partial search term.
Query parameters:
* ``term``: The incomplete term.
* ``limit``: Number of results. Defaults to 10.
Results returned by the endpoint are ordered by importance of the term in the
document index. The first result is the term that has the highest Tf/Idf score
in the index.
.. code:: json
[
"term1",
"term3",
"term6",
"term4"
]
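A hedged example request, again assuming a local instance on port 8000:

.. code:: shell-session

    $ curl "http://localhost:8000/api/search/autocomplete/?term=inv&limit=5"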
.. _api-file_uploads:
POSTing documents
#################
The API provides a special endpoint for file uploads:
``/api/documents/post_document/``
POST a multipart form to this endpoint, where the form field ``document`` contains
the document that you want to upload to paperless. The filename is sanitized and
then used to store the document in the consumption folder, where the consumer will
detect the document and process it as any other document.
The endpoint will immediately return "OK." if the document was stored in the
consumption directory.
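A minimal sketch of such an upload with curl; the file path and host are placeholders, and any authentication you have configured must be added:

.. code:: shell-session

    $ curl -F "document=@/path/to/scan.pdf" http://localhost:8000/api/documents/post_document/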

View File

@ -1,12 +1,119 @@
.. _paperless_changelog:
*********
Changelog
#########
*********
paperless-ng 0.9.1
##################
* Moved documentation of the settings to the actual documentation.
* Updated release script to force the user to choose between SQLite
and PostgreSQL. This avoids confusion when upgrading from paperless.
paperless-ng 0.9.0
##################
* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
This feature will most likely be removed in future versions.
* **Added:** New frontend. Features:
* Single page application: It's much more responsive than the django admin pages.
* Dashboard. Shows recently scanned documents, or todos, or other documents
as you wish. Allows uploading of documents. Shows basic statistics.
* Better document list with multiple display options.
* Full text search with result highlighting, auto completion and scoring based
on the query. It uses a document search index in the background.
* Saveable filters.
* Better log viewer.
* **Added:** Document types. Assign these to documents just as correspondents.
They may be used in the future to perform automatic operations on documents
depending on the type.
* **Added:** Inbox tags. Define an inbox tag and it will automatically be
assigned to any new document scanned into the system.
* **Added:** Automatic matching. A new matching algorithm that automatically
assigns tags, document types and correspondents to your documents. It uses
a neural network trained on your data.
* **Added:** Archive serial numbers. Assign these to quickly find documents stored in
physical binders.
* **Added:** Enabled the internal user management of Django. This isn't really a
multi-user solution; however, it allows more than one user to access the website
and set some basic permissions / renew passwords.
* **Modified [breaking]:** An all-new mail consumer with customizable filters, actions and
multiple account support. Replaces the old mail consumer. The new mail consumer
needs different configuration but can be configured to act exactly like the old
consumer.
* **Modified:** Changes to the consumer:
* Now uses the excellent watchdog library that should make sure files are
discovered no matter what the platform is.
* The consumer now uses a task scheduler to run consumption processes in parallel.
This means that consuming many documents should be much faster on systems with
many cores.
* Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
* The consumer no longer blocks the database for extended periods of time.
* An issue with tesseract running multiple threads per page and slowing down
the consumer was fixed.
* **Modified [breaking]:** REST Api changes:
* New filters added, other filters removed (case sensitive filters, slug filters)
* Endpoints for thumbnails, previews and downloads replace the old ``/fetch/`` urls. Redirects are in place.
* Endpoint for document uploads replaces the old ``/push`` url. Redirects are in place.
* Foreign key relationships are now served as IDs, not as urls.
* **Modified [breaking]:** PostgreSQL:
* If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses postgresql instead of sqlite.
Username, database and password all default to ``paperless`` if not specified.
* **Modified [breaking]:** document_retagger management command rework. See
:ref:`utilities-retagger` for details. Replaces ``document_correspondents``
management command.
* **Removed [breaking]:** Reminders.
* **Removed:** All customizations made to the django admin pages.
* **Removed [breaking]:** The docker image no longer supports SSL. If you want to expose
paperless to the internet, hide paperless behind a proxy server that handles SSL
requests.
* **Internal changes:** Mostly code cleanup, including:
* Rework of the code of the tesseract parser. This is now a lot cleaner.
* Rework of the filename handling code. It was a mess.
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
and consuming documents.
* Updated dependencies. Now uses Pipenv all around.
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
* **Settings:**
* ``PAPERLESS_FORGIVING_OCR`` is now the default behavior and the setting is gone. Reason: Even if ``langdetect`` fails to detect
a language, tesseract still does a very good job at ocr'ing a document with the default language.
Certain language specifics such as umlauts may not get picked up properly.
* ``PAPERLESS_DEBUG`` defaults to ``false``.
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
sqlite.
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
* ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
optimization. This is useful on less powerful devices.
* Many more small changes here and there. The usual stuff.
2.7.0
=====
#####
* `syntonym`_ submitted a pull request to catch IMAP connection errors `#475`_.
* `Stéphane Brunner`_ added ``psycopg2`` to the Pipfile `#489`_. He also fixed
a syntax error in ``docker-compose.yml.example`` `#488`_ and added [DjangoQL](https://github.com/ivelum/djangoql),
a syntax error in ``docker-compose.yml.example`` `#488`_ and added `DjangoQL`_,
which allows a litany of handy search functionality `#492`_.
* `CkuT`_ and `JOKer`_ hacked out a simple, but super-helpful optimisation to
how the thumbnails are served up, improving performance considerably `#481`_.
@ -19,7 +126,7 @@ Changelog
2.6.1
=====
#####
* We now have a logo, complete with a favicon :-)
* Removed some problematic tests.
@ -31,7 +138,7 @@ Changelog
2.6.0
=====
#####
* Allow an infinite number of logs to be deleted. Thanks to `Ulli`_ for noting
the problem in `#433`_.
@ -52,7 +159,7 @@ Changelog
2.5.0
=====
#####
* **New dependency**: Paperless now optimises thumbnail generation with
`optipng`_, so you'll need to install that somewhere in your PATH or declare
@ -96,7 +203,7 @@ Changelog
2.4.0
=====
#####
* A new set of actions are now available thanks to `jonaswinkler`_'s very first
pull request! You can now do nifty things like tag documents in bulk, or set
@ -117,7 +224,7 @@ Changelog
2.3.0
=====
#####
* Support for consuming plain text & markdown documents was added by
`Joshua Taillon`_! This was a long-requested feature, and it's addition is
@ -135,14 +242,14 @@ Changelog
2.2.1
=====
#####
* `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
the ``DISABLE_LOGIN`` feature: `#392`_.
2.2.0
=====
#####
* Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
version of Paperless that supports Django 2.0! As a result of their hard
@ -159,7 +266,7 @@ Changelog
2.1.0
=====
#####
* `Enno Lohmeier`_ added three simple features that make Paperless a lot more
user (and developer) friendly:
@ -178,7 +285,7 @@ Changelog
2.0.0
=====
#####
This is a big release as we've changed a core-functionality of Paperless: we no
longer encrypt files with GPG by default.
@ -194,7 +301,7 @@ that it was more an annoyance than anything else, so this feature is now turned
off unless you explicitly set a passphrase in your config file.
Migrating from 1.x
------------------
==================
Encryption isn't gone, it's just off for new users. So long as you have
``PAPERLESS_PASSPHRASE`` set in your config or your environment, Paperless
@ -210,7 +317,7 @@ Special thanks to `erikarvstedt`_, `matthewmoto`_, and `mcronce`_ who did the
bulk of the work on this big change.
1.4.0
=====
#####
* `Quentin Dawans`_ has refactored the document consumer to allow for some
command-line options. Notably, you can now direct it to consume from a
@ -245,7 +352,7 @@ bulk of the work on this big change.
to some excellent work from `erikarvstedt`_ on `#351`_
1.3.0
=====
#####
* You can now run Paperless without a login, though you'll still have to create
at least one user. This is thanks to a pull-request from `matthewmoto`_:
@ -268,7 +375,7 @@ bulk of the work on this big change.
problem and helping me find where to fix it.
1.2.0
=====
#####
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
and `Pit`_. This new image is dramatically smaller than the Debian-based
@ -287,7 +394,7 @@ bulk of the work on this big change.
in the document text.
1.1.0
=====
#####
* Fix for `#283`_, a redirect bug which broke interactions with
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
@ -297,7 +404,7 @@ bulk of the work on this big change.
`Dan Panzarella`_
1.0.0
=====
#####
* Upgrade to Django 1.11. **You'll need to run
``pip install -r requirements.txt`` after the usual ``git pull`` to
@ -316,14 +423,14 @@ bulk of the work on this big change.
`Lukas Winkler`_'s issue `#278`_
0.8.0
=====
#####
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
`#255`_.
0.7.0
=====
#####
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
automatically delete documents attached to correspondents when those
@ -335,7 +442,7 @@ bulk of the work on this big change.
`Kusti Skytén`_ for posting the correct solution in the Github issue.
0.6.0
=====
#####
* Abandon the shared-secret trick we were using for the POST API in favour
of BasicAuth or Django session.
@ -349,7 +456,7 @@ bulk of the work on this big change.
the help with this feature.
0.5.0
=====
#####
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
thanks to `Jake Gysland`_'s patch `#220`_.
@ -367,13 +474,13 @@ bulk of the work on this big change.
* Amended the Django Admin configuration to have nice headers (`#230`_)
0.4.1
=====
#####
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
all-caps suffixes like ``.PDF``
0.4.0
=====
#####
* Introducing reminders. See `#199`_ for more information, but the short
explanation is that you can now attach simple notes & times to documents
@ -383,7 +490,7 @@ bulk of the work on this big change.
like to make use of this feature in his project.
0.3.6
=====
#####
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
correspondent or the tags for a document.
@ -397,7 +504,7 @@ bulk of the work on this big change.
documentation is on its way.
0.3.5
=====
#####
* A serious facelift for the documents listing page wherein we drop the
tabular layout in favour of a tiled interface.
@ -408,7 +515,7 @@ bulk of the work on this big change.
consumption.
0.3.4
=====
#####
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
Note that you *can* use Django Suit with Paperless, but only in a
@ -421,26 +528,26 @@ bulk of the work on this big change.
API thanks to @thomasbrueggemann. See `#179`_.
0.3.3
=====
#####
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
* Timezone, items per page, and default language are now all configurable,
also thanks to @ekw.
0.3.2
=====
#####
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
arise.
0.3.1
=====
#####
* Added a default value for ``CONVERT_BINARY``
0.3.0
=====
#####
* Updated to using django-filter 1.x
* Added some system checks so new users aren't confused by misconfigurations.
@ -453,7 +560,7 @@ bulk of the work on this big change.
``PAPERLESS_SHARED_SECRET`` respectively instead.
0.2.0
=====
#####
* `#150`_: The media root is now a variable you can set in
``paperless.conf``.
@ -481,7 +588,7 @@ bulk of the work on this big change.
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
0.1.1
=====
#####
* Potentially **Breaking Change**: All references to "sender" in the code
have been renamed to "correspondent" to better reflect the nature of the
@ -505,7 +612,7 @@ bulk of the work on this big change.
to be imported but made unavailable.
0.1.0
=====
#####
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
`Tikitu de Jager`_ for this one, and especially to `Pit`_
@ -524,14 +631,14 @@ bulk of the work on this big change.
* Added tox with pep8 checking
0.0.6
=====
#####
* Added support for parallel OCR (significant work from `Pit`_)
* Sped up the language detection (significant work from `Pit`_)
* Added simple logging
0.0.5
=====
#####
* Added support for image files as documents (png, jpg, gif, tiff)
* Added a crude means of HTTP POST for document imports
@ -540,7 +647,7 @@ bulk of the work on this big change.
* Documentation for the above as well as data migration
0.0.4
=====
#####
* Added automated tagging basted on keyword matching
* Cleaned up the document listing page
@ -548,19 +655,19 @@ bulk of the work on this big change.
* Added ``pytz`` to the list of requirements
0.0.3
=====
#####
* Added basic tagging
0.0.2
=====
#####
* Added language detection
* Added datestamps to ``document_exporter``.
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
0.0.1
=====
#####
* Initial release
@ -739,6 +846,6 @@ bulk of the work on this big change.
.. _#489: https://github.com/the-paperless-project/paperless/pull/489
.. _#492: https://github.com/the-paperless-project/paperless/pull/492
.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
.. _optipng: http://optipng.sourceforge.net/
.. _DjangoQL: https://github.com/ivelum/djangoql

View File

@ -1,15 +0,0 @@
Changelog (jonaswinkler)
########################
1.0.0
=====
* First release based on paperless 2.6.0
* Added: Automatic document classification using neural networks (replaces
regex-based tagging)
* Added: Document types
* Added: Archive serial number allows easy referencing of physical document
copies
* Added: Inbox tags (added automatically to newly consumed documents)
* Added: Document viewer on document edit page
* Database backend is now configurable

View File

@ -54,7 +54,7 @@ source_suffix = '.rst'
master_doc = 'index'
# General information about the project.
project = u'Paperless'
project = u'Paperless-ng'
copyright = u'2015, Daniel Quinn'
# The version info for the project you're documenting, acts as replacement for
@ -205,7 +205,8 @@ try:
import sphinx_rtd_theme
html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
except ImportError:
except ImportError as e:
print("error " + str(e))
pass
# -- Options for LaTeX output ---------------------------------------------

306
docs/configuration.rst Normal file
View File

@ -0,0 +1,306 @@
.. _configuration:
*************
Configuration
*************
Paperless provides a wide range of customizations.
Depending on how you run paperless, these settings have to be defined in different
places.
* If you run paperless on docker, ``paperless.conf`` is not used. Rather, configure
paperless by copying necessary options to ``docker-compose.env``.
* If you are running paperless on anything else, paperless will search for the
configuration file in these locations and use the first one it finds:
.. code::
/path/to/paperless/paperless.conf
/etc/paperless.conf
/usr/local/etc/paperless.conf
Required services
#################
PAPERLESS_REDIS=<url>
This is required for processing scheduled tasks such as email fetching, index
optimization and for training the automatic document matcher.
Defaults to redis://localhost:6379.
PAPERLESS_DBHOST=<hostname>
By default, sqlite is used as the database backend. This can be changed here.
Set PAPERLESS_DBHOST and PostgreSQL will be used instead of sqlite.
PAPERLESS_DBPORT=<port>
Adjust port if necessary.
Default is 5432.
PAPERLESS_DBNAME=<name>
Database name in PostgreSQL.
Defaults to "paperless".
PAPERLESS_DBUSER=<name>
Database user in PostgreSQL.
Defaults to "paperless".
PAPERLESS_DBPASS=<password>
Database password for PostgreSQL.
Defaults to "paperless".
Paths and folders
#################
PAPERLESS_CONSUMPTION_DIR=<path>
This is where your documents should go to be consumed. Make sure that it exists
and that the user running the paperless service can read/write its contents
before you start Paperless.
Don't change this when using docker, as it only changes the path within the
container. Change the local consumption directory in the docker-compose.yml
file instead.
Defaults to "../consume", relative to the "src" directory.
PAPERLESS_DATA_DIR=<path>
This is where paperless stores all its data (search index, sqlite database,
classification model, etc).
Defaults to "../data", relative to the "src" directory.
PAPERLESS_MEDIA_ROOT=<path>
This is where your documents and thumbnails are stored.
You can set this and PAPERLESS_DATA_DIR to the same folder to have paperless
store all its data within the same volume.
Defaults to "../media", relative to the "src" directory.
PAPERLESS_STATICDIR=<path>
Override the default STATIC_ROOT here. This is where all static files
created using "collectstatic" manager command are stored.
Unless you're doing something fancy, there is no need to override this.
Defaults to "../static", relative to the "src" directory.
PAPERLESS_FILENAME_FORMAT=<format>
Changes the filenames paperless uses to store documents in the media directory.
See :ref:`advanced-file_name_handling` for details.
Default is none, which disables this feature.
Hosting & Security
##################
PAPERLESS_SECRET_KEY=<key>
Paperless uses this to make session tokens. If you expose paperless on the
internet, you need to change this, since the default secret is well known.
Use any sequence of characters. The more, the better. You don't need to
remember this. Just face-roll your keyboard.
Default is listed in the file ``src/paperless/settings.py``.
PAPERLESS_ALLOWED_HOSTS=<comma-separated-list>
If you're planning on putting Paperless on the open internet, then you
really should set this value to the domain name you're using. Failing to do
so leaves you open to HTTP host header attacks:
https://docs.djangoproject.com/en/3.1/topics/security/#host-header-validation
Just remember that this is a comma-separated list, so "example.com" is fine,
as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
Defaults to "*", which is all hosts.
PAPERLESS_CORS_ALLOWED_HOSTS=<comma-separated-list>
You need to add your servers to the list of allowed hosts that can do CORS
calls. Set this to your public domain name.
Defaults to "http://localhost:8000".
PAPERLESS_FORCE_SCRIPT_NAME=<path>
To host paperless under a subpath url like example.com/paperless you set
this value to /paperless. No trailing slash!
.. note::
I don't know if this works in paperless-ng. Probably not.
Defaults to none, which hosts paperless at "/".
PAPERLESS_STATIC_URL=<path>
Override the STATIC_URL here. Unless you're hosting Paperless off a
subdomain like /paperless/, you probably don't need to change this.
Defaults to "/static/".
Software tweaks
###############
PAPERLESS_TASK_WORKERS=<num>
Paperless does multiple things in the background: Maintain the search index,
maintain the automatic matching algorithm, check emails, consume documents,
etc. This variable specifies how many things it will do in parallel.
PAPERLESS_THREADS_PER_WORKER=<num>
Furthermore, paperless uses multiple threads when consuming documents to
speed up OCR. This variable specifies how many pages paperless will process
in parallel on a single document.
.. caution::
Ensure that the product
PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
does not exceed your CPU core count or else paperless will be extremely slow.
If you want paperless to process many documents in parallel, choose a high
worker count. If you want paperless to process very large documents faster,
use a higher thread per worker count.
The default is a balance between the two, according to your CPU core count,
with a slight favor towards threads per worker, and using as many cores as
possible.
If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
PAPERLESS_THREADS_PER_WORKER automatically.
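For example, on a machine with 4 cores, either of the following combinations keeps
the product within the core count. The first favors consuming many documents at
once, the second favors getting individual large documents done faster:
.. code:: bash

    # Consume up to 4 documents in parallel, 1 OCR thread each.
    PAPERLESS_TASK_WORKERS=4
    PAPERLESS_THREADS_PER_WORKER=1

    # Alternatively: 2 documents in parallel, 2 OCR threads each.
    #PAPERLESS_TASK_WORKERS=2
    #PAPERLESS_THREADS_PER_WORKER=2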
PAPERLESS_TIME_ZONE=<timezone>
Set the time zone here.
See https://docs.djangoproject.com/en/3.1/ref/settings/#std:setting-TIME_ZONE
for details on how to set it.
Defaults to UTC.
PAPERLESS_OCR_LANGUAGE=<lang>
Customize the default language that tesseract will attempt to use when
parsing documents. The default language is used whenever
* No language could be detected on a document
* No tesseract data files are available for the detected language
It should be a 3-letter language code consistent with ISO
639: https://www.loc.gov/standards/iso639-2/php/code_list.php
Set this to the language most of your documents are written in.
Defaults to "eng".
PAPERLESS_OCR_ALWAYS=<bool>
By default Paperless does not OCR a document if the text can be retrieved from
the document directly. Set to true to always OCR documents.
Defaults to false.
PAPERLESS_CONSUMER_POLLING=<num>
If paperless does not pick up documents added to your consume folder, it might
not be able to automatically detect filesystem changes. In that case,
specify a polling interval in seconds here, which will then cause paperless
to periodically check your consumption directory for changes.
Defaults to 0, which disables polling and uses filesystem notifications.
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
When the consumer detects a duplicate document, it will not touch the
original document. This default behavior can be changed here.
Defaults to false.
PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
On smaller systems, or even in the case of Very Large Documents, the consumer
may explode, complaining about how it's "unable to extend pixel cache". In
such cases, try setting this to a reasonably low value, like 32. The
default is to use whatever is necessary to do everything without writing to
disk, and units are in megabytes.
For more information on how to use this value, you should search
the web for "MAGICK_MEMORY_LIMIT".
Defaults to 0, which disables the limit.
PAPERLESS_CONVERT_TMPDIR=<path>
Similar to the memory limit, if you've got a small system and your OS mounts
/tmp as tmpfs, you should set this to a path that's on a physical disk, like
/home/your_user/tmp or something. ImageMagick will use this as scratch space
when crunching through very large documents.
For more information on how to use this value, you should search
the web for "MAGICK_TMPDIR".
Default is none, which disables the temporary directory.
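As an illustration, a small system with /tmp on tmpfs might combine both settings
like this (the path is just an example):
.. code:: bash

    PAPERLESS_CONVERT_MEMORY_LIMIT=32
    PAPERLESS_CONVERT_TMPDIR=/home/your_user/tmp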
PAPERLESS_CONVERT_DENSITY=<num>
This setting has a high impact on the physical size of tmp page files,
the speed of document conversion, and can affect the accuracy of OCR
results. Individual results can vary and this setting should be tested
thoroughly against the documents you are importing to see if it has any
impacts either negative or positive.
Testing on limited document sets has shown a setting of 200 can cut the
size of tmp files by 1/3, and speed up conversion by up to 4x
with little impact to OCR accuracy.
Default is 300.
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
Use optipng to optimize thumbnails. This usually reduces the size of
thumbnails by about 20%, but uses considerable compute time during
consumption.
Defaults to true.
PAPERLESS_POST_CONSUME_SCRIPT=<filename>
After a document is consumed, Paperless can trigger an arbitrary script if
you like. This script will be passed a number of arguments for you to work
with. For more information, take a look at :ref:`advanced-post_consume_script`.
The default is blank, which means nothing will be executed.
PAPERLESS_FILENAME_DATE_ORDER=<format>
Paperless will check the document text for document date information.
Use this setting to enable checking the document filename for date
information. The date order can be set to any option as specified in
https://dateparser.readthedocs.io/en/latest/settings.html#date-order.
The filename will be checked first, and if nothing is found, the document
text will be checked as normal.
Defaults to none, which disables this feature.
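For example, with the setting below a filename such as ``05.02.2019 Electricity
bill.pdf`` should be read as the 5th of February rather than the 2nd of May;
``DMY`` is one of the date orders accepted by dateparser:
.. code:: bash

    PAPERLESS_FILENAME_DATE_ORDER=DMY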
PAPERLESS_FILENAME_PARSE_TRANSFORMS
Transforms filenames before they are processed by paperless. See
:ref:`advanced-transforming_filenames` for details.
Defaults to none, which disables this feature.
Binaries
########
There are a few external software packages that Paperless expects to find on
your system when it starts up. Unless you've done something creative with
their installation, you probably won't need to edit any of these. However,
if you've installed these programs somewhere where simply typing the name of
the program doesn't automatically execute it (ie. the program isn't in your
$PATH), then you'll need to specify the literal path for that program.
PAPERLESS_CONVERT_BINARY=<path>
Defaults to "/usr/bin/convert".
PAPERLESS_GS_BINARY=<path>
Defaults to "/usr/bin/gs".
PAPERLESS_UNPAPER_BINARY=<path>
Defaults to "/usr/bin/unpaper".
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".

View File

@ -1,255 +0,0 @@
.. _consumption:
Consumption
###########
Once you've got Paperless setup, you need to start feeding documents into it.
Currently, there are three options: the consumption directory, IMAP (email), and
HTTP POST.
.. _consumption-directory:
The Consumption Directory
=========================
The primary method of getting documents into your database is by putting them in
the consumption directory. The ``document_consumer`` script runs in an infinite
loop looking for new additions to this directory and when it finds them, it goes
about the process of parsing them with the OCR, indexing what it finds, and
encrypting the PDF (if ``PAPERLESS_PASSPHRASE`` is set), storing it in the
media directory.
Getting stuff into this directory is up to you. If you're running Paperless
on your local computer, you might just want to drag and drop files there, but if
you're running this on a server and want your scanner to automatically push
files to this directory, you'll need to setup some sort of service to accept the
files from the scanner. Typically, you're looking at an FTP server like
`Proftpd`_ or `Samba`_.
.. _Proftpd: http://www.proftpd.org/
.. _Samba: http://www.samba.org/
So where is this consumption directory? It's wherever you define it. Look for
the ``CONSUMPTION_DIR`` value in ``settings.py``. Set that to somewhere
appropriate for your use and put some documents in there. When you're ready,
follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
.. _consumption-directory-hook:
Hooking into the Consumption Process
------------------------------------
Sometimes you may want to do something arbitrary whenever a document is
consumed. Rather than try to predict what you may want to do, Paperless lets
you execute scripts of your own choosing just before or after a document is
consumed using a couple simple hooks.
Just write a script, put it somewhere that Paperless can read & execute, and
then put the path to that script in ``paperless.conf`` with the variable name
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
``PAPERLESS_POST_CONSUME_SCRIPT``. The script will be executed before or
after the document is consumed, respectively.
.. important::
These scripts are executed in a **blocking** process, which means that if
a script takes a long time to run, it can significantly slow down your
document consumption flow. If you want things to run asynchronously,
you'll have to fork the process in your script and exit.
.. _consumption-directory-hook-variables:
What Can These Scripts Do?
..........................
It's your script, so you're only limited by your imagination and the laws of
physics. However, the following values are passed to the scripts in order:
.. _consumption-director-hook-variables-pre:
Pre-consumption script
::::::::::::::::::::::
* Document file name
A simple but common example for this would be creating a simple script like
this:
``/usr/local/bin/ocr-pdf``
.. code:: bash
#!/usr/bin/env bash
pdf2pdfocr.py -i ${1}
``/etc/paperless.conf``
.. code:: bash
...
PAPERLESS_PRE_CONSUME_SCRIPT="/usr/local/bin/ocr-pdf"
...
This will pass the path to the document about to be consumed to ``/usr/local/bin/ocr-pdf``,
which will in turn call `pdf2pdfocr.py`_ on your document, which will then
overwrite the file with an OCR'd version of the file and exit. At which point,
the consumption process will begin with the newly modified file.
.. _pdf2pdfocr.py: https://github.com/LeoFCardoso/pdf2pdfocr
.. _consumption-director-hook-variables-post:
Post-consumption script
:::::::::::::::::::::::
* Document id
* Generated file name
* Source path
* Thumbnail path
* Download URL
* Thumbnail URL
* Correspondent
* Tags
The script can be in any language you like, but for a simple shell script
example, you can take a look at ``post-consumption-example.sh`` in the
``scripts`` directory in this project.
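For illustration, a minimal sketch along the same lines might do nothing more than
append a line to a log file. The arguments are read in the order listed above, and
the log path is an arbitrary choice:
.. code:: bash

    #!/usr/bin/env bash
    DOCUMENT_ID=${1}
    DOCUMENT_FILE_NAME=${2}
    DOCUMENT_SOURCE_PATH=${3}
    DOCUMENT_THUMBNAIL_PATH=${4}
    DOCUMENT_DOWNLOAD_URL=${5}
    DOCUMENT_THUMBNAIL_URL=${6}
    DOCUMENT_CORRESPONDENT=${7}
    DOCUMENT_TAGS=${8}

    # Append a one-line summary of the consumed document to a log file.
    echo "Consumed ${DOCUMENT_FILE_NAME} (id ${DOCUMENT_ID}, tags: ${DOCUMENT_TAGS})" \
        >> /var/log/paperless-post-consume.log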
.. _consumption-imap:
IMAP (Email)
============
Another handy way to get documents into your database is to email them to
yourself. The typical use-case would be to be out for lunch and want to send a
copy of the receipt back to your system at home. Paperless can be taught to
pull emails down from an arbitrary account and dump them into the consumption
directory where the process :ref:`above <consumption-directory>` will follow the
usual pattern on consuming the document.
Some things you need to know about this feature:
* It's disabled by default. By setting the values below it will be enabled.
* It's been tested in a limited environment, so it may not work for you (please
submit a pull request if you can!)
* It's designed to **delete mail from the server once consumed**. So don't go
pointing this to your personal email account and wonder where all your stuff
went.
* Currently, only one photo (attachment) per email will work.
So, with all that in mind, here's what you do to get it running:
1. Setup a new email account somewhere, or if you're feeling daring, create a
folder in an existing email box and note the path to that folder.
2. In ``/etc/paperless.conf`` set all of the appropriate values in
``PATHS AND FOLDERS`` and ``SECURITY``.
If you decided to use a subfolder of an existing account, then make sure you
set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
have to include that in every email you send.
3. Restart the :ref:`consumer <utilities-consumer>`. The consumer will check
the configured email account at startup and from then on every 10 minutes
for something new and pull down whatever it finds.
4. Send yourself an email! Note that the subject is treated as the file name,
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
get what you expect. Also, you must include the aforementioned secret
string in every email so the fetcher knows that it's safe to import.
Note that Paperless will only import emails whose titles consist of safe
characters: alphanumeric characters and ``-_ ,.'``.
5. After a few minutes, the consumer will poll your mailbox, pull down the
message, and place the attachment in the consumption directory with the
appropriate name. A few minutes later, the consumer will import it like any
other file.
.. _consumption-http:
HTTP POST
=========
You can also submit a document via HTTP POST, so long as you do so after
authenticating. To push your document to Paperless, send an HTTP POST to the
server with the following name/value pairs:
* ``correspondent``: The name of the document's correspondent. Note that there
are restrictions on what characters you can use here. Specifically,
alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
out. You also can't use the sequence ` - ` (space, dash, space).
* ``title``: The title of the document. The rules for characters are the same
here as for the correspondent.
* ``document``: The file you're uploading
Specify ``enctype="multipart/form-data"``, and then POST your file with::
Content-Disposition: form-data; name="document"; filename="whatever.pdf"
An example of this in HTML is a typical form:
.. code:: html
<form method="post" enctype="multipart/form-data">
<input type="text" name="correspondent" value="My Correspondent" />
<input type="text" name="title" value="My Title" />
<input type="file" name="document" />
<input type="submit" name="go" value="Do the thing" />
</form>
But a potentially more useful way to do this would be in Python. Here we use
the requests library to handle basic authentication and to send the POST data
to the URL.
.. code:: python
import os
from hashlib import sha256
import requests
from requests.auth import HTTPBasicAuth
# You authenticate via BasicAuth or with a session id.
# We use BasicAuth here
username = "my-username"
password = "my-super-secret-password"
# Where you have Paperless installed and listening
url = "http://localhost:8000/push"
# Document metadata
correspondent = "Test Correspondent"
title = "Test Title"
# The local file you want to push
path = "/path/to/some/directory/my-document.pdf"
with open(path, "rb") as f:
response = requests.post(
url=url,
data={"title": title, "correspondent": correspondent},
files={"document": (os.path.basename(path), f, "application/pdf")},
auth=HTTPBasicAuth(username, password),
allow_redirects=False
)
if response.status_code == 202:
# Everything worked out ok
print("Upload successful")
else:
# If you don't get a 202, it's probably because your credentials
# are wrong or something. This will give you a rough idea of what
# happened.
print("We got HTTP status code: {}".format(response.status_code))
for k, v in response.headers.items():
print("{}: {}".format(k, v))

View File

@ -3,6 +3,10 @@
Contributing to Paperless
#########################
.. warning::
This section is not updated to paperless-ng yet.
Maybe you've been using Paperless for a while and want to add a feature or two,
or maybe you've come across a bug that you have some ideas how to solve. The
beauty of Free software is that you can see what's wrong and help to get it

View File

@ -1,42 +0,0 @@
.. _customising:
Customising Paperless
#####################
Currently, Paperless' interface is just the default Django admin, which
while powerful, is rather boring. If you'd like to give the site a bit of a
face-lift, or if you simply want to adjust the colours, contrast, or font size
to make things easier to read, you can do that by adding your own CSS or
Javascript quite easily.
.. _customising-overrides:
Overrides
=========
On every page load, Paperless looks in your media root directory (the directory
defined by your ``PAPERLESS_MEDIADIR`` configuration variable or the default,
``<project root>/media/``) for two files:
* ``overrides.css``
* ``overrides.js``
If it finds either or both of those files, they'll be loaded into the page: the
CSS in the ``<head>``, and the Javascript stuffed into the last line of the
``<body>``.
.. _customising-overrides-note:
An important note about customisation
-------------------------------------
Any changes you make to the site with your CSS or Javascript are likely to
depend on the structure of the current HTML and/or the existing CSS rules. For
the most part it's safe to assume that these bits won't change, but *sometimes
they do* as features are added or bugs are fixed.
If you make a change that you think others would appreciate though, submit it
as a pull request and maybe we can find a way to work it into the project by
default!

View File

@ -3,6 +3,10 @@
Extending Paperless
===================
.. warning::
This section is not updated to paperless-ng yet.
For the most part, Paperless is monolithic, so extending it is often best
managed by way of modifying the code directly and issuing a pull request on
`GitHub`_. However, over time the project has been evolving to be a little

57
docs/faq.rst Normal file
View File

@ -0,0 +1,57 @@
**************************
Frequently asked questions
**************************
**Q:** *I'm using docker. Where are my documents?*
**A:** Your documents are stored inside the docker volume ``paperless_media``.
Docker manages this volume automatically for you. It is a persistent storage
and will persist as long as you don't explicitly delete it. The actual location
depends on your host operating system. On Linux, chances are high that this location
is
.. code::
/var/lib/docker/volumes/paperless_media/_data
.. caution::
Don't mess with this folder. Don't change permissions and don't move
files around manually. This folder is meant to be entirely managed by docker
and paperless.
**Q:** *Will paperless-ng run on Raspberry Pi?*
**A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
The long answer is that certain parts of
Paperless will run very slowly, such as the tesseract OCR. On Raspberry Pi,
try to OCR documents before feeding them into paperless so that paperless can
reuse the text. The web interface should be a lot snappier, since it runs
in your browser and paperless has to do much less work to serve the data.
.. note::
Consider setting ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to false to speed up
the consumption process. This takes quite a bit of time on Raspberry Pi.
.. note::
Updating the :ref:`automatic matching algorithm <advanced-automatic_matching>`
takes quite a bit of time. However, the update mechanism checks if your
data has changed before doing the heavy lifting. If you experience the
algorithm taking too much cpu time, consider changing the schedule in the
admin interface to daily or weekly. You can also manually invoke the task
by changing the date and time of the next run to today/now.
The actual matching of the algorithm is fast and works on Raspberry Pi as
well as on any other device.
**Q:** *How do I install paperless-ng on Raspberry Pi?*
**A:** There is no docker image for ARM available. If you know how to build
that automatically, I'm all ears. For now, you have to grab the latest release
archive from the project page and build the image yourself. The release comes
with the front end already compiled, so you don't have to do this on the Pi.

View File

@ -1,131 +0,0 @@
.. _guesswork:
Guesswork
#########
During the consumption process, Paperless tries to guess some of the attributes
of the document it's looking at. To do this it uses two approaches:
.. _guesswork-naming:
File Naming
===========
Any document you put into the consumption directory will be consumed, but if
you name the file right, it'll automatically set some values in the database
for you. This is the logic the consumer follows:
1. Try to find the correspondent, title, and tags in the file name following
the pattern: ``Date - Correspondent - Title - tag,tag,tag.pdf``. Note that
the format of the date is **rigidly defined** as ``YYYYMMDDHHMMSSZ`` or
``YYYYMMDDZ``. The ``Z`` refers to "Zulu time", AKA "UTC".
The tags are optional, so the format ``Date - Correspondent - Title.pdf``
works as well.
2. If that doesn't work, we skip the date and try this pattern:
``Correspondent - Title - tag,tag,tag.pdf``.
3. If that doesn't work, we try to find the correspondent and title in the file
name following the pattern: ``Correspondent - Title.pdf``.
4. If that doesn't work, just assume that the name of the file is the title.
So given the above, the following examples would work as you'd expect:
* ``20150314000700Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``20150314Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
* ``Another Company - Letter of Reference.jpg``
* ``Dad's Recipe for Pancakes.png``
These however wouldn't work:
* ``2015-03-14 00:07:00 UTC - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``2015-03-14 - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``Another Company- Letter of Reference.jpg``
Do I have to be so strict about naming?
---------------------------------------
Rather than using the strict document naming rules, one can also set the option
``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
that is accepted by dateparser_. Doing so will cause ``paperless`` to default
to any date format that is found in the title, instead of a date pulled from
the document's text, without requiring the strict formatting of the document
filename as described above.
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
Transforming filenames for parsing
----------------------------------
Some devices can't produce filenames that can be parsed by the default
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
``paperless.conf`` one can add transformations that are applied to the filename
before it's parsed.
The option contains a list of dictionaries of regular expressions (key:
``pattern``) and replacements (key: ``repl``) in JSON format, which are
applied in order by passing them to ``re.subn``. Transformation stops
after the first match, so at most one transformation is applied. The general
syntax is
.. code:: python
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
The example below is for a Brother ADS-2400N, a scanner that allows
different names to different hardware buttons (useful for handling
multiple entities in one instance), but insists on adding ``_<count>``
to the filename.
.. code:: python
# Brother profile configuration, support "Name_Date_Count" (the default
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
.. _guesswork-content:
Reading the Document Contents
=============================
After the consumer has tried to figure out what it could from the file name,
it starts looking at the content of the document itself. It will compare the
matching algorithms defined by every tag and correspondent already set in your
database to see if they apply to the text in that document. In other words,
if you defined a tag called ``Home Utility`` that had a ``match`` property of
``bc hydro`` and a ``matching_algorithm`` of ``literal``, Paperless will
automatically tag your newly-consumed document with your ``Home Utility`` tag
so long as the text ``bc hydro`` appears in the body of the document somewhere.
The matching logic is quite powerful, and supports searching the text of your
document with different algorithms, and as such, some experimentation may be
necessary to get things Just Right.
.. _guesswork-content-howto:
How Do I Set Up These Matching Algorithms?
------------------------------------------
Setting up the algorithms is easily done through the admin interface. When
you create a new correspondent or tag, there are optional fields for matching
text and matching algorithm. From the help info there:
.. note::
Which algorithm you want to use when matching text to the OCR'd PDF. Here,
"any" looks for any occurrence of any word provided in the PDF, while "all"
requires that every word provided appear in the PDF, albeit not in the
order provided. A "literal" match means that the text you enter must
appear in the PDF exactly as you've entered it, and "regular expression"
uses a regex to match the PDF. If you don't know what a regex is, you
probably don't want this option.
When using the "any" or "all" matching algorithms, you can search for terms
that consist of multiple words by enclosing them in double quotes. For example,
defining a match text of ``"Bank of America" BofA`` using the "any" algorithm,
will match documents that contain either "Bank of America" or "BofA", but will
not match documents containing "Bank of South America".
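To make these semantics concrete, here is a small Python sketch of the behaviour
described above. It is illustrative only, covers just "any", "all" and "literal",
and is not Paperless' actual matching code:
.. code:: python

    import re

    def matches(match_text, algorithm, document_text):
        """Check a match string against a document's text (case-insensitive)."""
        text = document_text.lower()
        if algorithm == "literal":
            return match_text.lower() in text
        # Split on whitespace, but keep double-quoted phrases together.
        terms = [t.strip('"').lower() for t in re.findall(r'"[^"]+"|\S+', match_text)]
        found = [term in text for term in terms]
        return any(found) if algorithm == "any" else all(found)

    matches('"Bank of America" BofA', "any", "Statement from Bank of America")  # True
    matches('"Bank of America" BofA', "any", "Bank of South America")           # False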
Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.

View File

@ -1,17 +1,14 @@
.. _index:
*********
Paperless
=========
*********
Paperless is a simple Django application running in two parts:
a :ref:`consumer <utilities-consumer>` (the thing that does the indexing) and
the :ref:`webserver <utilities-webserver>` (the part that lets you search &
a *Consumer* (the thing that does the indexing) and
the *Web server* (the part that lets you search &
download already-indexed documents). If you want to learn more about its
functions keep on reading after the installation section.
.. _index-why-this-exists:
Why This Exists
===============
@ -25,26 +22,51 @@ finding stuff again. I feed documents right from the post box into the scanner
and then shred them. Perhaps you might find it useful too.
Paperless-ng
============
Paperless-ng is a fork of the original paperless project. It changes many
things both on the surface and under the hood. Paperless-ng was created
because I feel that these changes are too big to be pushed into the main
repository right away.
NG stands for both Angular (the framework used for the
Frontend) and next-gen. Publishing this project under a different name also
avoids confusion between paperless and paperless-ng.
If you want to learn about what's different in paperless-ng, check out these
resources in the documentation:
* :ref:`Some screenshots <screenshots>` of the new UI are available.
* Read :ref:`this section <advanced-automatic_matching>` if you want to
learn about how paperless automates all tagging using machine learning.
* Paperless now comes with a :ref:`proper email consumer <usage-email>`
that's fully tested and production ready.
* See :ref:`this note <utilities-encyption>` about GnuPG encryption in
paperless-ng.
* The :ref:`changelog <paperless_changelog>` contains a detailed list of all changes
in paperless-ng.
It would be great if this project could eventually merge back into the main
repository, but it needs a lot more work before that can happen.
Contents
========
.. toctree::
:maxdepth: 2
:maxdepth: 1
requirements
setup
consumption
usage_overview
advanced_usage
administration
configuration
api
utilities
guesswork
migrating
customising
faq
extending
troubleshooting
contributing
scanners
screenshots
changelog
changelog_jonaswinkler

View File

@ -1,109 +0,0 @@
.. _migrating:
Migrating, Updates, and Backups
===============================
As Paperless is still under active development, there's a lot that can change
as software updates roll out. You should backup often, so if anything goes
wrong during an update, you at least have a means of restoring to something
usable. Thankfully, there are automated ways of backing up, restoring, and
updating the software.
.. _migrating-backup:
Backing Up
----------
So you're bored of this whole project, or you want to make a remote backup of
your files for whatever reason. This is easy to do, simply use the
:ref:`exporter <utilities-exporter>` to dump your documents and database out
into an arbitrary directory.
.. _migrating-restoring:
Restoring
---------
Restoring your data is just as easy, since nearly all of your data exists either
in the file names, or in the contents of the files themselves. You just need to
create an empty database (just follow the
:ref:`installation instructions <setup-installation>` again) and then import the
``tags.json`` file you created as part of your backup. Lastly, copy your
exported documents into the consumption directory and start up the consumer.
.. code-block:: shell-session
$ cd /path/to/project
$ rm data/db.sqlite3 # Delete the database
$ cd src
$ ./manage.py migrate # Create the database
$ ./manage.py createsuperuser
$ ./manage.py loaddata /path/to/arbitrary/place/tags.json
$ cp /path/to/exported/docs/* /path/to/consumption/dir/
$ ./manage.py document_consumer
Importing your data if you are :ref:`using Docker <setup-installation-docker>`
is almost as simple:
.. code-block:: shell-session
# Stop and remove your current containers
$ docker-compose stop
$ docker-compose rm -f
# Recreate them, add the superuser
$ docker-compose up -d
$ docker-compose run --rm webserver createsuperuser
# Load the tags
$ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
# Load your exported documents into the consumption directory
# (How you do this highly depends on how you have set this up)
$ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
After loading the documents into the consumption directory the consumer will
immediately start consuming the documents.
.. _migrating-updates:
Updates
-------
For the most part, all you have to do to update Paperless is run ``git pull``
on the directory containing the project files, and then use Django's
``migrate`` command to execute any database schema updates that might have been
rolled in as part of the update:
.. code-block:: shell-session
$ cd /path/to/project
$ git pull
$ pip install -r requirements.txt
$ cd src
$ ./manage.py migrate
Note that it's possible (even likely) that while ``git pull`` may update some
files, the ``migrate`` step may not update anything. This is totally normal.
Additionally, as new features are added, the ability to control those features
is typically added by way of an environment variable set in ``paperless.conf``.
You may want to take a look at the ``paperless.conf.example`` file to see if
there's anything new in there compared to what you've got in ``/etc``.
If you are :ref:`using Docker <setup-installation-docker>` the update process
is similar:
.. code-block:: shell-session
$ cd /path/to/project
$ git pull
$ docker build -t paperless .
$ docker-compose run --rm consumer migrate
$ docker-compose up -d
If ``git pull`` doesn't report any changes, there is no need to continue with
the remaining steps.

View File

@ -1,125 +0,0 @@
.. _requirements:
Requirements
============
You need a Linux machine or Unix-like setup (theoretically an Apple machine
should work) that has the following software installed:
* `Python3`_ (with development libraries, pip and virtualenv)
* `GNU Privacy Guard`_
* `Tesseract`_, plus its language files matching your document base.
* `Imagemagick`_ version 6.7.5 or higher
* `unpaper`_
* `libpoppler-cpp-dev`_ PDF rendering library
* `optipng`_
.. _Python3: https://python.org/
.. _GNU Privacy Guard: https://gnupg.org
.. _Tesseract: https://github.com/tesseract-ocr
.. _Imagemagick: http://imagemagick.org/
.. _unpaper: https://github.com/unpaper/unpaper
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
.. _optipng: http://optipng.sourceforge.net/
Notably, you should confirm how you access your Python3 installation. Many
Linux distributions will install Python3 in parallel to Python2, using the
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
``pip``. Running Paperless with Python2 will likely break things, so make sure
that you're using the right version.
For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to
refer to their Python3 versions.
In addition to the above, there are a number of Python requirements, all of
which are listed in a file called ``requirements.txt`` in the project root
directory.
If you're not working on a virtual environment (like Docker), you
should probably be using a virtualenv, but that's your call. The reasons why
you might choose a virtualenv or not aren't really within the scope of this
document. Needless to say if you don't know what a virtualenv is, you should
probably figure that out before continuing.
.. _requirements-apple:
Problems with Imagemagick & PDFs
--------------------------------
Some users have `run into problems`_ with getting ImageMagick to do its thing
with PDFs. Often this is the case with Apple systems using HomeBrew, but other
Linuxes have been a problem as well. The solution appears to be to install
ghostscript as well as ImageMagick:
.. _run into problems: https://github.com/the-paperless-project/paperless/issues/25
.. code:: bash
$ brew install ghostscript
$ brew install imagemagick
$ brew install libmagic
.. _requirements-baremetal:
Python-specific Requirements: No Virtualenv
-------------------------------------------
If you don't care to use a virtual env, then installation of the Python
dependencies is easy:
.. code:: bash
$ pip install --user --requirement /path/to/paperless/requirements.txt
This will download and install all of the requirements into
``${HOME}/.local``. Remember that your distribution may be using ``pip3`` as
mentioned above.
.. _requirements-virtualenv:
Python-specific Requirements: Virtualenv
----------------------------------------
Using a virtualenv for this is pretty straightforward: create a virtualenv,
enter it, and install the requirements using the ``requirements.txt`` file:
.. code:: bash
$ virtualenv --python=/path/to/python3 /path/to/arbitrary/directory
$ . /path/to/arbitrary/directory/bin/activate
$ pip install --requirement /path/to/paperless/requirements.txt
Now you're ready to go. Just remember to enter (activate) your virtualenv
whenever you want to use Paperless.
.. _requirements-documentation:
Documentation
-------------
As generation of the documentation is not required for the use of Paperless,
dependencies for this process are not included in ``requirements.txt``. If
you'd like to generate your own docs locally, you'll need to:
.. code:: bash
$ pip install sphinx
and then cd into the ``docs`` directory and type ``make html``.
If you are using Docker, you can use the following commands to build the
documentation and run a webserver serving it on `port 8001`_:
.. code:: bash
$ pwd
/path/to/paperless
$ docker build -t paperless:docs -f docs/Dockerfile .
$ docker run --rm -it -p "8001:8000" paperless:docs
.. _port 8001: http://127.0.0.1:8001

View File

@ -1,7 +1,9 @@
.. _scanners:
Scanner Recommendations
=======================
***********************
Scanner recommendations
***********************
As Paperless operates by watching a folder for new files, it doesn't care what
scanner you use, but sometimes finding a scanner that will write to an FTP,
@ -23,16 +25,19 @@ that works right for you based on recommendations from other Paperless users.
+---------+----------------+-----+-----+-----+----------------+
| Fujitsu | `ix500`_ | yes | | yes | `eonist`_ |
+---------+----------------+-----+-----+-----+----------------+
| Fujitsu | `S1300i`_ | yes | | yes | `jonaswinkler`_|
+---------+----------------+-----+-----+-----+----------------+
.. _ADS-1500W: https://www.brother.ca/en/p/ads1500w
.. _MFC-J6930DW: https://www.brother.ca/en/p/MFCJ6930DW
.. _MFC-J5910DW: https://www.brother.co.uk/printers/inkjet-printers/mfcj5910dw
.. _MFC-9142CDN: https://www.brother.co.uk/printers/laser-printers/mfc9140cdn
.. _ix500: http://www.fujitsu.com/us/products/computing/peripheral/scanners/scansnap/ix500/
.. _ix500: https://www.fujitsu.com/global/products/computing/peripheral/scanners/scansnap/ix500/
.. _S1300i: https://www.fujitsu.com/global/products/computing/peripheral/scanners/soho/s1300i/
.. _danielquinn: https://github.com/danielquinn
.. _ayounggun: https://github.com/ayounggun
.. _bmsleight: https://github.com/bmsleight
.. _eonist: https://github.com/eonist
.. _REOLDEV: https://github.com/REOLDEV
.. _jonaswinkler: https://github.com/jonaswinkler

View File

@ -1,16 +1,48 @@
.. _screenshots:
***********
Screenshots
===========
***********
Once everything is set-up login to paperless using the web front-end
This is what paperless-ng looks like. You shouldn't use paperless to index
research papers though, it's a horrible tool for that job.
.. image:: ./_static/Screenshot_first_run_login.png
The dashboard shows customizable views on your document and allows document uploads:
Nice clean interface
.. image:: _static/paperless-0-dashboard.png
.. image:: ./_static/Screenshot_first_logged.png
The document list provides three different styles to scroll through your documents:
Some documents loaded in via ftp or using the scanners ftp.
.. image:: _static/paperless-1-list-table.png
.. image:: _static/paperless-2-list-smallcards.png
.. image:: _static/paperless-3-list-largecards.png
Extensive filtering mechanisms:
.. image:: _static/paperless-4-filter.png
Side-by-side editing of documents. Optimized for 1080p.
.. image:: _static/paperless-5-editing.png
Tag editing. This looks about the same for correspondents and document types.
.. image:: _static/paperless-6-tags.png
Searching provides auto complete and highlights the results.
.. image:: _static/paperless-7-autocomplete.png
.. image:: _static/paperless-8-search-results.png
The old admin is still there and accessible!
.. image:: _static/paperless-9-admin.png
Fancy mail filters!
.. image:: _static/paperless-11-mail-filters.png
Mobile support in the future? This doesn't really work yet.
.. image:: _static/paperless-10-mobile.png
.. image:: ./_static/Screenshot_upload_and_scanned.png

View File

@ -1,500 +1,282 @@
.. _setup:
*****
Setup
=====
Paperless isn't a very complicated app, but there are a few components, so some
basic documentation is in order. If you follow along in this document and
still have trouble, please open an `issue on GitHub`_ so I can fill in the
gaps.
.. _issue on GitHub: https://github.com/the-paperless-project/paperless/issues
.. _setup-download:
*****
Download
--------
########
The source is currently only available via GitHub, so grab it from there,
either by using ``git``:
Go to the project page on GitHub and download the
`latest release <https://github.com/jonaswinkler/paperless-ng/releases>`_.
There are multiple options available.
.. code:: bash
* Download the docker-compose files if you want to pull paperless from
Docker Hub.
$ git clone https://github.com/the-paperless-project/paperless.git
$ cd paperless
* Download the archive and extract it if you want to build the docker image
yourself or want to install paperless without docker.
or just download the tarball and go that route:
.. hint::
.. code:: bash
$ cd to the directory where you want to run Paperless
$ wget https://github.com/the-paperless-project/paperless/archive/master.zip
$ unzip master.zip
$ cd paperless-master
In contrast to paperless, the recommended way to get and update paperless-ng
is not to pull the entire git repository. Paperless-ng includes artifacts
that need to be compiled, and that's already done for you in the release.
.. _setup-installation:
Overview of Paperless-ng
########################
Installation & Configuration
----------------------------
Compared to paperless, paperless-ng works a little differently under the hood and has
more moving parts that work together. While this increases the complexity of
the system, it also brings many benefits.
Paperless consists of the following components:
* **The webserver:** This is pretty much the same as in paperless. It serves
the administration pages, the API, and the new frontend. This is the main
tool you'll be using to interact with paperless. You may start the webserver
with
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
or by any other means such as Apache ``mod_wsgi``.
* **The consumer:** This is what watches your consumption folder for documents.
However, the consumer itself does not really consume your documents anymore.
It rather notifies a task processor that a new file is ready for consumption.
I suppose it should be named differently.
This also used to check your emails, but that's now gone elsewhere as well.
Start the consumer with the management command ``document_consumer``:
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run python3 manage.py document_consumer
* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
for doing much of the heavy lifting. This is a task queue that accepts tasks from
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
certain commands periodically.
This task processor is responsible for:
* Consuming documents. When the consumer finds new documents, it notifies the task processor to
start a consumption task.
* Consuming emails. It periodically checks your configured accounts for new mails and
produces consumption tasks for any documents it finds.
* The task processor also performs the consumption of any documents you upload through
the web interface.
* Maintaining the search index and the automatic matching algorithm. These are things that paperless
needs to do from time to time in order to operate properly.
This allows paperless to process multiple documents from your consumption folder in parallel! On
a modern multicore system, consumption with full OCR is blazing fast.
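How much is processed in parallel can be tuned. The example configuration file exposes
``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` for this; a sketch for a
quad-core machine might look like the following (the values are illustrative, see
:ref:`configuration` for their exact meaning):

.. code:: bash

PAPERLESS_TASK_WORKERS=2
PAPERLESS_THREADS_PER_WORKER=2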
The task processor comes with a built-in admin interface that you can use to check whether any of the
tasks failed and to inspect the errors.
You may start the task processor by executing:
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run python3 manage.py qcluster
* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
for getting the tasks from the webserver and consumer to the task scheduler. These run in different
processes (maybe even on different machines!), and therefore, this is necessary.
* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
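On a bare metal setup, paperless is pointed at both services through ``paperless.conf``. A minimal
sketch, using the commented defaults from the example configuration file (adjust host names and
credentials to your environment):

.. code:: bash

PAPERLESS_REDIS=redis://localhost:6379
PAPERLESS_DBHOST=localhost
PAPERLESS_DBPORT=5432
PAPERLESS_DBNAME=paperless
PAPERLESS_DBUSER=paperless
PAPERLESS_DBPASS=paperless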
Installation
############
You can go multiple routes with setting up and running Paperless:
* The `bare metal route`_
* The `docker route`_
* A suggested `linux containers route`_
* The `docker route`_
* The `bare metal route`_
The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
from above automatically so that it just works and uses sensible defaults for all configuration options.
The `bare metal route`_ is more complicated to set up but makes it easier
should you want to contribute some code back. You need to configure and
run the above mentioned components yourself.
Docker Route
============
1. Install `Docker`_ and `docker-compose`_. [#compose]_
.. caution::
If you want to use the included ``docker-compose.*.yml`` file, you
need to have at least Docker version **17.09.0** and docker-compose
version **1.17.0**.
See the `Docker installation guide`_ on how to install the current
version of Docker for your operating system or Linux distribution of
choice. To get an up-to-date version of docker-compose, follow the
`docker-compose installation guide`_ if your package repository doesn't
include it.
.. _Docker installation guide: https://docs.docker.com/engine/installation/
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
2. Copy either ``docker-compose.sqlite.yml`` or ``docker-compose.postgres.yml`` to
``docker-compose.yml``, depending on which database backend you want to use.
.. hint::
For new installations, it is recommended to use postgresql as the database
backend. This is due to the increased amount of concurrency in paperless-ng.
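For example, to go with the recommended postgresql backend, the copy might look like this
(assuming the release files were extracted into the current directory):

.. code:: shell-session

$ cp docker-compose.postgres.yml docker-compose.yml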
3. Modify ``docker-compose.yml`` to your preferences. You should change the path
to the consumption directory in this file. Find the line that specifies where
to mount the consumption directory:
.. code::
- ./consume:/usr/src/paperless/consume
Replace the part BEFORE the colon with a local directory of your choice:
.. code::
- /home/jonaswinkler/paperless-inbox:/usr/src/paperless/consume
Don't change the part after the colon or paperless won't find your documents.
The `docker route`_ is quick & easy.
4. Modify ``docker-compose.env``, following the comments in the file. The
most important change is to set ``USERMAP_UID`` and ``USERMAP_GID``
to the uid and gid of your user on the host system. This ensures that
both the docker container and you on the host machine have write access
to the consumption directory. If your UID and GID on the host system are
1000 (the default for the first normal user on most systems), it will
work out of the box without any modifications.
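If you are unsure about your IDs, you can look them up on the host first (the output shown
here is just an example):

.. code:: shell-session

$ id -u
1000
$ id -g
1000

and then set ``USERMAP_UID`` and ``USERMAP_GID`` in ``docker-compose.env`` accordingly.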
The `bare metal route`_ is a bit more complicated to set up but makes it easier
should you want to contribute some code back.
.. note::
The `linux containers route`_ is quick, but makes a lot of assumptions about the
set-up; on the other hand, the script could be used to install on a base
Debian or Ubuntu server.
You can use any settings from the file ``paperless.conf`` in this file.
Have a look at :ref:`configuration` to see what's available.
.. _docker route: setup-installation-docker_
.. _bare metal route: setup-installation-bare-metal_
.. _Docker Machine: https://docs.docker.com/machine/
5. Run ``docker-compose up -d``. This will create and start the necessary
containers. This will also build the image of paperless if you grabbed the
source archive.
.. _setup-installation-bare-metal:
6. To be able to login, you will need a super user. To create it, execute the
following command:
Standard (Bare Metal)
+++++++++++++++++++++
.. code-block:: shell-session
1. Install the requirements as per the :ref:`requirements <requirements>` page.
2. Within the extract of master.zip go to the ``src`` directory.
3. Copy ``../paperless.conf.example`` to ``/etc/paperless.conf`` and open it in
your favourite editor. As this file contains passwords, it should only be
readable by user root and paperless! Set the values for:
$ docker-compose run --rm webserver createsuperuser
Set the values for:
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
dumped to be consumed by Paperless.
* ``PAPERLESS_OCR_THREADS``: this is the number of threads the OCR process
will spawn to process document pages in parallel.
* ``PAPERLESS_PASSPHRASE``: this is only required if you want to use GPG to
encrypt your document files. This is the passphrase Paperless uses to
encrypt/decrypt the original documents. Don't worry about defining this
if you don't want to use encryption (the default).
Note also that if you're using the ``runserver`` as mentioned below, you
should make sure that ``PAPERLESS_DEBUG="true"`` is set, or just leave it
commented out, as this is the default.
4. Initialise the SQLite database with ``./manage.py migrate``.
5. Collect the static files for the webserver with ``./manage.py collectstatic``.
6. Create a user for your Paperless instance with
``./manage.py createsuperuser``. Follow the prompts to create your user.
7. Start the webserver with ``./manage.py runserver <IP>:<PORT>``.
If no specific IP or port is given, the default is ``127.0.0.1:8000`` also
known as http://localhost:8000/.
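Taken together, steps 4 through 7 amount to the following commands, run from the ``src``
directory (a sketch; adjust the IP and port to your needs):

.. code-block:: shell-session

$ ./manage.py migrate
$ ./manage.py collectstatic
$ ./manage.py createsuperuser
$ ./manage.py runserver 127.0.0.1:8000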
You should now be able to visit your (empty) installation at
`Paperless webserver`_ or whatever you chose before. You can login with the
user/pass you created in #5.
8. In a separate window, change to the ``src`` directory in this repo again,
but this time, you should start the consumer script with
``./manage.py document_consumer``.
9. Scan something or put a file into the ``CONSUMPTION_DIR``.
10. Wait a few minutes
11. Visit the document list on your webserver, and it should be there, indexed
and downloadable.
.. caution::
This installation is not secure. Once everything is working head over to
`Making things more permanent`_
.. _Paperless webserver: http://127.0.0.1:8000
.. _Making things more permanent: setup-permanent_
.. _setup-installation-docker:
Docker Method
+++++++++++++
1. Install `Docker`_.
.. caution::
As mentioned earlier, this guide assumes that you use Docker natively
under Linux. If you are using `Docker Machine`_ under Mac OS X or
Windows, you will have to adapt IP addresses, volume-mounting, command
execution and maybe more.
2. Install `docker-compose`_. [#compose]_
.. caution::
If you want to use the included ``docker-compose.yml.example`` file, you
need to have at least Docker version **1.12.0** and docker-compose
version **1.9.0**.
See the `Docker installation guide`_ on how to install the current
version of Docker for your operating system or Linux distribution of
choice. To get an up-to-date version of docker-compose, follow the
`docker-compose installation guide`_ if your package repository doesn't
include it.
.. _Docker installation guide: https://docs.docker.com/engine/installation/
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``
and a copy of ``docker-compose.env.example`` as ``docker-compose.env``.
You'll be editing both these files: taking a copy ensures that you can
``git pull`` to receive updates without risking merge conflicts with your
modified versions of the configuration files.
4. Modify ``docker-compose.yml`` to your preferences, following the
instructions in comments in the file. The only change that is a hard
requirement is to specify where the consumption directory should
mount. [#dockercomposeyml]_
.. caution::
If you are using NFS mounts for the consume directory you also need to
change the command to turn off inotify as it doesn't work with NFS
``command: ["document_consumer", "--no-inotify"]``
5. Modify ``docker-compose.env`` and adapt the following environment variables:
``PAPERLESS_PASSPHRASE``
This is the passphrase Paperless uses to encrypt/decrypt the original
document. If you aren't planning on using GPG encryption, you can just
leave this undefined.
``PAPERLESS_OCR_THREADS``
This is the number of threads the OCR process will spawn to process
document pages in parallel. If the variable is not set, Python determines
the core-count of your CPU and uses that value.
``PAPERLESS_OCR_LANGUAGES``
If you want the OCR to recognize other languages in addition to the
default English, set this parameter to a space separated list of
three-letter language-codes after `ISO 639-2/T`_. For a list of available
languages -- including their three letter codes -- see the
`Alpine packagelist`_.
``USERMAP_UID`` and ``USERMAP_GID``
If you want to mount the consumption volume (directory ``/consume`` within
the containers) to a host-directory -- which you probably want to do --
access rights might be an issue. The default user and group ``paperless``
in the containers have an id of 1000. The containers will enforce that the
owning group of the consumption directory will be ``paperless`` to be able
to delete consumed documents. If your host-system has a group with an ID
of 1000 and you don't want this group to have access rights to the
consumption directory, you can use ``USERMAP_GID`` to change the id in the
container and thus the one of the consumption directory. Furthermore, you
can change the id of the default user as well using ``USERMAP_UID``.
``PAPERLESS_USE_SSL``
If you want Paperless to use SSL for the user interface, set this variable
to ``true``. You also need to copy your certificate and key to the ``data``
directory, named ``ssl.cert`` and ``ssl.key``.
This is not an ideal solution and, if possible, a reverse proxy with nginx
is preferred.
6. Run ``docker-compose up -d``. This will create and start the necessary
containers.
7. To be able to login, you will need a super user. To create it, execute the
following command:
.. code-block:: shell-session
$ docker-compose run --rm webserver createsuperuser
This will prompt you to set a username (default ``paperless``), an optional
e-mail address and finally a password.
8. The default ``docker-compose.yml`` exports the webserver on your local port
8000. If you haven't adapted this, you should now be able to visit your
`Paperless webserver`_ at ``http://127.0.0.1:8000`` (or
``https://127.0.0.1:8000`` if you enabled SSL). You can login with the
user and password you just created.
9. Add files to consumption directory the way you prefer to. Following are two
possible options:
1. Mount the consumption directory to a local host path by modifying your
``docker-compose.yml``:
.. code-block:: diff
diff --git a/docker-compose.yml b/docker-compose.yml
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,9 +18,8 @@ services:
volumes:
- paperless-data:/usr/src/paperless/data
- paperless-media:/usr/src/paperless/media
- - /consume
+ - /local/path/you/choose:/consume
.. danger::
While the consumption container will ensure at startup that it can
**delete** a consumed file from a host-mounted directory, it might
not be able to **read** the document in the first place if the access
rights to the file are incorrect.
Make sure that the documents you put into the consumption directory
will either be readable by everyone (``chmod o+r file.pdf``) or
readable by the default user or group id 1000 (or the one you have
set with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
2. Use ``docker cp`` to copy your files directly into the container:
.. code-block:: shell-session
$ # Identify your containers
$ docker-compose ps
Name Command State Ports
-------------------------------------------------------------------------
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
$ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
``docker cp`` is a one-shot-command, just like ``cp``. This means that
every time you want to consume a new document, you will have to execute
``docker cp`` again. You can of course automate this process, but option
1 is generally the preferred one.
.. danger::
``docker cp`` will change the owning user and group of a copied file
to the acting user at the destination, which will be ``root``.
You therefore need to ensure that the documents you want to copy into
the container are readable by everyone (``chmod o+r file.pdf``)
before copying them.
This will prompt you to set a username, an optional e-mail address and
finally a password.
7. The default ``docker-compose.yml`` exports the webserver on your local port
8000. If you haven't adapted this, you should now be able to visit your
Paperless instance at ``http://127.0.0.1:8000``. You can login with the
user and password you just created.
.. _Docker: https://www.docker.com/
.. _docker-compose: https://docs.docker.com/compose/install/
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
.. [#compose] You of course don't have to use docker-compose, but it
simplifies deployment immensely. If you know your way around Docker, feel
free to tinker around without using compose!
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
version 1.1.0 or earlier, you might need to change in the
``docker-compose.yml`` file the ``image: pitkley/paperless`` directive in
both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the
newer ``docker-compose.yml.example`` file
Bare Metal Route
================
.. _setup-permanent:
.. warning::
Making Things a Little more Permanent
-------------------------------------
TBD. Use docker for now.
Once you've tested things and are happy with the work flow, you should secure
the installation and automate the process of starting the webserver and
consumer.
Migration to paperless-ng
#########################
At its core, paperless-ng is still paperless and fully compatible. However, some
things have changed under the hood, so you need to adapt your setup depending on
how you installed paperless. The important things to keep in mind are as follows.
.. _setup-permanent-webserver:
* Read the :ref:`changelog <paperless_changelog>` and take note of breaking changes.
* It is recommended to use postgresql as the database now. If you want to continue
using SQLite, which is the default of paperless, use ``docker-compose.sqlite.yml``.
See :ref:`setup-sqlite_to_psql` for details on how to move your data from
sqlite to postgres.
* The task scheduler of paperless, which is used to execute periodic tasks
such as email checking and maintenance, requires a `redis`_ message broker
instance. The docker-compose route takes care of that.
* The layout of the folder structure for your documents and data remains the
same, so you can just plug your old docker volumes into paperless-ng and
expect it to find everything where it should be.
Using a Real Webserver
++++++++++++++++++++++
Migration to paperless-ng is then performed in a few simple steps:
The default is to use Django's development server, as that's easy and does the
job well enough on a home network. However, it is heavily discouraged to use
it for more than that.
1. Stop paperless.
If you want to do things right you should use a real webserver capable of
handling more than one thread. You will also have to let the webserver serve
the static files (CSS, JavaScript) from the directory configured in
``PAPERLESS_STATICDIR``. The default static files directory is ``../static``.
.. code:: bash
For that you need to activate your virtual environment and collect the static
files with the command:
$ cd /path/to/current/paperless
$ docker-compose down
.. code:: bash
2. Do a backup for two purposes: If something goes wrong, you still have your
data. Second, if you don't like paperless-ng, you can switch back to
paperless.
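One way to take such a backup (a sketch, assuming your old setup has the ``/export`` volume
mounted as described in the exporter documentation) is the document exporter:

.. code:: shell-session

$ docker-compose run --rm consumer document_exporter /export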
$ cd <paperless directory>/src
$ ./manage.py collectstatic
3. Download the latest release of paperless-ng. You can either go with the
docker-compose files or use the archive to build the image yourself.
You can either replace your current paperless folder or put paperless-ng
in a different location.
.. caution::
Apache
~~~~~~
Make sure you also download the ``.env`` file. This will set the
project name for docker compose to ``paperless`` and then it will
automatically reuse your existing paperless volumes.
This is a configuration supplied by `steckerhalter`_ on GitHub. It uses Apache
and mod_wsgi, with a Paperless installation in ``/home/paperless/``:
4. Adjust ``docker-compose.yml`` and
``docker-compose.env`` to your needs.
See `docker route`_ for details on which edits are required.
.. code:: apache
5. Start paperless-ng.
<VirtualHost *:80>
ServerName example.com
.. code:: bash
Alias /static/ /home/paperless/paperless/static/
<Directory /home/paperless/paperless/static>
Require all granted
</Directory>
$ docker-compose up
WSGIScriptAlias / /home/paperless/paperless/src/paperless/wsgi.py
WSGIDaemonProcess example.com user=paperless group=paperless threads=5 python-path=/home/paperless/paperless/src:/home/paperless/.env/lib/python3.6/site-packages
WSGIProcessGroup example.com
If you see everything working (you should see some migrations getting
applied, for instance), you can gracefully stop paperless-ng with Ctrl-C
and then start paperless-ng as usual with
<Directory /home/paperless/paperless/src/paperless>
<Files wsgi.py>
Require all granted
</Files>
</Directory>
</VirtualHost>
.. code:: bash
.. _steckerhalter: https://github.com/steckerhalter
$ docker-compose up -d
This will run paperless in the background and automatically start it on system boot.
Nginx + Gunicorn
~~~~~~~~~~~~~~~~
6. Paperless installed a permanent redirect to ``admin/`` in your browser. This
redirect is still in place and prevents access to the new UI. Clear
everything related to paperless in your browser's data in order to fix
this issue.
If you're using Nginx, the most common setup is to combine it with a
Python-based server like Gunicorn so that Nginx is acting as a proxy. Below is
a copy of a simple Nginx configuration fragment making use of a gunicorn
instance listening on localhost port 8000.
.. code:: nginx
.. _setup-sqlite_to_psql:
server {
listen 80;
Moving data from sqlite to postgresql
=====================================
index index.html index.htm index.php;
access_log /var/log/nginx/paperless_access.log;
error_log /var/log/nginx/paperless_error.log;
.. warning::
location /static {
autoindex on;
alias <path-to-paperless-static-directory>;
}
location / {
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_pass http://127.0.0.1:8000;
}
}
The gunicorn server can be started with the command:
.. code-block:: shell
$ <path-to-paperless-virtual-environment>/bin/gunicorn --pythonpath=<path-to-paperless>/src paperless.wsgi -w 2
.. _setup-permanent-standard-systemd:
Standard (Bare Metal + Systemd)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you're running on a bare metal system that's using Systemd, you can use the
service unit files in the ``scripts`` directory to set this up.
1. You'll need to create a group and user called ``paperless`` (without login)
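Creating such an account might look like this (a sketch; the exact flags and the path to
``nologin`` vary between distributions):

.. code-block:: bash

$ sudo useradd --system --no-create-home --user-group --shell /usr/sbin/nologin paperless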
2. Setup Paperless to be in a place that this new user can read and write to.
3. Ensure ``/etc/paperless`` is readable by the ``paperless`` user.
4. Copy the service file from the ``scripts`` directory to
``/etc/systemd/system``.
.. code-block:: bash
$ cp /path/to/paperless/scripts/paperless-consumer.service /etc/systemd/system/
$ cp /path/to/paperless/scripts/paperless-webserver.service /etc/systemd/system/
5. Edit the service file to point the ``ExecStart`` line to the proper location
of your paperless install, referencing the appropriate Python binary. For
example:
``ExecStart=/path/to/python3 /path/to/paperless/src/manage.py document_consumer``.
6. Start and enable (so they start on boot) the services.
.. code-block:: bash
$ systemctl enable paperless-consumer
$ systemctl enable paperless-webserver
$ systemctl start paperless-consumer
$ systemctl start paperless-webserver
.. _setup-permanent-standard-upstart:
Standard (Bare Metal + Upstart)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Ubuntu 14.04 and earlier use the `Upstart`_ init system to start services
during the boot process. To configure Upstart to run Paperless automatically
after restarting your system:
1. Change to the directory where Upstart's configuration files are kept:
``cd /etc/init``
2. Create a new file: ``sudo nano paperless-server.conf``
3. In the newly-created file enter::
start on (local-filesystems and net-device-up IFACE=eth0)
stop on shutdown
respawn
respawn limit 10 5
script
exec <path to paperless virtual environment>/bin/gunicorn --pythonpath=<path to paperless>/src paperless.wsgi -w 2
end script
Note that you'll need to replace ``/srv/paperless/src/manage.py`` with the
path to the ``manage.py`` script in your installation directory.
If you are using a network interface other than ``eth0``, you will have to
change ``IFACE=eth0``. For example, if you are connected via WiFi, you will
likely need to replace ``eth0`` above with ``wlan0``. To see all interfaces,
run ``ifconfig -a``.
Save the file.
4. Create a new file: ``sudo nano paperless-consumer.conf``
5. In the newly-created file enter::
start on (local-filesystems and net-device-up IFACE=eth0)
stop on shutdown
respawn
respawn limit 10 5
script
exec <path to paperless virtual environment>/bin/python <path to paperless>/manage.py document_consumer
end script
Replace the path placeholder and ``eth0`` with the appropriate value and save the file.
These two configuration files together will start both the Paperless webserver
and document consumer processes when the file system and network interface
specified is available after boot. Furthermore, if either process ever exits
unexpectedly, Upstart will try to restart it a maximum of 10 times within a 5
second period.
.. _Upstart: http://upstart.ubuntu.com/
.. _setup-permanent-docker:
Docker
~~~~~~
If you're using Docker, you can set a restart-policy_ in the
``docker-compose.yml`` to have the containers automatically start with the
Docker daemon.
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
TBD.
.. _redis: https://redis.io/

@ -1,12 +1,42 @@
.. _troubleshooting:
***************
Troubleshooting
===============
***************
No files are added by the consumer
##################################
Check for the following issues:
* Ensure that the directory you're putting your documents in is the folder
paperless is watching. With docker, this setting is performed in the
``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
setting. Don't adjust this setting if you're using docker.
* Ensure that redis is up and running. Paperless does its task processing
asynchronously, and for documents to arrive at the task processor, it needs
redis to run.
* Ensure that the task processor is running. Docker does this automatically.
Manually invoke the task processor by executing
.. code:: shell-session
$ python3 manage.py qcluster
* Look at the output of paperless and inspect it for any errors.
* Go to the admin interface, and check if there are failed tasks. If so, the
tasks will contain an error message.
Consumer fails to pick up any new files
#######################################
If you notice that the consumer only picks up files in the consumption
directory at startup, but won't find any other files added later, check out
the configuration file and enable filesystem polling with the setting
``PAPERLESS_CONSUMER_POLLING``.
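Enabling polling then amounts to a single line in your configuration; the value below mirrors
the commented default from the example configuration file (see the configuration documentation
for its exact meaning):

.. code:: bash

PAPERLESS_CONSUMER_POLLING=10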
.. _troubleshooting-languagemissing:
Consumer warns ``OCR for XX failed``
------------------------------------
####################################
If you find the OCR accuracy to be too low, and/or the document consumer warns
that ``OCR for XX failed, but we're going to stick with what we've got since
@ -20,10 +50,9 @@ box, and your documents are written in Spanish you may need to run::
apt-get install -y tesseract-ocr-spa
.. _troubleshooting-convertpixelcache:
Consumer dies with ``convert: unable to extent pixel cache``
------------------------------------------------------------
############################################################
During the consumption process, Paperless invokes ImageMagick's ``convert``
program to translate the source document into something that the OCR engine can
@ -48,10 +77,9 @@ that's actually on a physical disk (and writable by the user running
Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.
.. _troubleshooting-decompressionbombwarning:
DecompressionBombWarning and/or no text in the OCR output
---------------------------------------------------------
#########################################################
Some users have had issues using Paperless to consume PDFs that were created
by merging Very Large Scanned Images into one PDF. If this happens to you,
it's likely because the PDF you've created contains some very large pages
@ -72,4 +100,4 @@ with a DPI of 300, then merging the images into the single PDF with
For more information on this and situations like it, you should take a look
at `Issue #118`_ as that's where this tip originated.
.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118
.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118

docs/usage_overview.rst Normal file
@ -0,0 +1,246 @@
**************
Usage Overview
**************
Paperless is an application that manages your personal documents. With
the help of a document scanner (see :ref:`scanners`), paperless transforms
your unwieldy physical document binders into a searchable archive and
provides many utilities for finding and managing your documents.
Terms and definitions
#####################
Paperless essentially consists of two different parts for managing your
documents:
* The *consumer* watches a specified folder and adds all documents in that
folder to paperless.
* The *web server* provides a UI that you use to manage and search for your
scanned documents.
Each document has a couple of fields that you can assign to them:
* A *Document* is a piece of paper that sometimes contains valuable
information.
* The *correspondent* of a document is the person, institution or company that
a document either originates from, or is sent to.
* A *tag* is a label that you can assign to documents. Think of labels as more
powerful folders: Multiple documents can be grouped together with a single
tag, however, a single document can also have multiple tags. This is not
possible with folders. The reason folders are not implemented in paperless
is simply that tags are much more versatile than folders.
* A *document type* is used to demarcate the type of a document such as letter,
bank statement, invoice, contract, etc. It is used to identify what a document
is about.
* The *date added* of a document is the date the document was scanned into
paperless. You cannot and should not change this date.
* The *date created* of a document is the date the document was initially issued.
This can be the date you bought a product, the date you signed a contract, or
the date a letter was sent to you.
* The *archive serial number* (short: ASN) of a document is the identifier of
the document in your physical document binders. See
:ref:`usage-recommended_workflow` below.
* The *content* of a document is the text that was OCR'ed from the document.
This text is fed into the search engine and is used for matching tags,
correspondents and document types.
Frontend overview
#################
.. warning::
TBD. Add some fancy screenshots!
Adding documents to paperless
#############################
Once you've got Paperless set up, you need to start feeding documents into it.
Currently, there are three options: the consumption directory, IMAP (email), and
HTTP POST.
The consumption directory
=========================
The primary method of getting documents into your database is by putting them in
the consumption directory. The consumer runs in an infinite
loop looking for new additions to this directory and when it finds them, it goes
about the process of parsing them with the OCR, indexing what it finds, and storing
it in the media directory.
Getting stuff into this directory is up to you. If you're running Paperless
on your local computer, you might just want to drag and drop files there, but if
you're running this on a server and want your scanner to automatically push
files to this directory, you'll need to set up some sort of service to accept the
files from the scanner. Typically, you're looking at an FTP server like
`Proftpd`_ or a Windows folder share with `Samba`_.
.. _Proftpd: http://www.proftpd.org/
.. _Samba: http://www.samba.org/
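If paperless runs on the same machine as you, getting a scan into the archive can be as simple
as moving it into the watched folder (a sketch; file name and consumption path are placeholders):

.. code:: shell-session

$ mv ~/scans/invoice.pdf /path/to/paperless/consume/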
.. TODO: hyperref to configuration of the location of this magic folder.
.. _usage-email:
IMAP (Email)
============
You can tell paperless-ng to consume documents from your email accounts.
This is a very flexible and powerful feature if you regularly receive documents
via mail that you need to archive. The mail consumer can be configured by using the
admin interface in the following manner:
1. Define e-mail accounts.
2. Define mail rules for your account.
These rules perform the following:
1. Connect to the mail server.
2. Fetch all matching mails (as defined by folder, maximum age and the filters)
3. Check if there are any consumable attachments.
4. If so, instruct paperless to consume the attachments and optionally
use the metadata provided in the rule for the new document.
5. If documents were consumed from a mail, the rule action is performed
on that mail.
Paperless will completely ignore mails that do not match your filters. It will also
only perform the action on mails that it has consumed documents from.
The actions all ensure that the same mail is not consumed twice by different means.
These are as follows:
* **Delete:** Immediately deletes mail that paperless has consumed documents from.
Use with caution.
* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
from already read mails. If you read a mail before paperless sees it, it will be
ignored.
* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
will not consume flagged mails.
* **Move to folder:** Moves consumed mails out of the way so that paperless won't
consume them again.
.. caution::
The mail consumer will perform these actions on all mails it has consumed
documents from. Keep in mind that the actual consumption process may fail
for some reason, leaving you with missing documents in paperless.
.. note::
With the correct set of rules, you can completely automate your email documents.
Create rules for every correspondent you receive digital documents from and
paperless will read them automatically. The default action "mark as read" is
pretty tame and will not cause any damage or data loss whatsoever.
You can also set up a special folder in your mail account for paperless and use
your favorite mail client to move mails to be consumed into that folder,
automatically or manually, and tell paperless to move them to yet another folder
after consumption. It's up to you.
.. note::
Paperless will process the rules in the order defined in the admin page.
You can define catch-all rules and have them executed last to consume
any documents not matched by previous rules. Such a rule may assign an "Unknown
mail document" tag to consumed documents so you can inspect them further.
Paperless is set up to check your mails every 10 minutes. This can be configured on the
'Scheduled tasks' page in the admin.
REST API
========
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
.. _usage-recommended_workflow:
The recommended workflow
########################
Once you have familiarized yourself with paperless and are ready to use it
for all your documents, the recommended workflow for managing your documents
is as follows. This workflow also takes into account that some documents
have to be kept in physical form, but still ensures that you get all the
advantages for these documents as well.
The following diagram shows how easy it is to manage your documents.
.. image:: _static/recommended_workflow.png
Preparations in paperless
=========================
* Create an inbox tag that gets assigned to all new documents.
* Create a TODO tag.
Processing of the physical documents
====================================
Keep a physical inbox. Whenever you receive a document that you need to
archive, put it into your inbox. Regularly, do the following for all documents
in your inbox:
1. For each document, decide if you need to keep the document in physical
form. This applies to certain important documents, such as contracts and
certificates.
2. If you need to keep the document, write a running number on the document
before scanning, starting at one and counting upwards. This is the archive
serial number, or ASN in short.
3. Scan the document.
4. If the document has an ASN assigned, store it in a *single* binder, sorted
by ASN. Don't order this binder in any other way.
5. If the document has no ASN, throw it away. Yay!
Over time, you will notice that your physical binder will fill up. If it is
full, label the binder with the range of ASNs in this binder (i.e., "Documents
1 to 343"), store the binder in your cellar or elsewhere, and start a new
binder.
The idea behind this process is that you will never have to use the physical
binders to find a document. If you need a specific physical document, you
may find this document by:
1. Searching in paperless for the document.
2. Identify the ASN of the document, since it appears on the scan.
3. Grab the relevant document binder and get the document. This is easy since
they are sorted by ASN.
Processing of documents in paperless
====================================
Once you have scanned in a document, proceed in paperless as follows.
1. If the document has an ASN, assign the ASN to the document.
2. Assign a correspondent to the document (i.e., your employer, bank, etc.).
This isn't strictly necessary but helps in finding a document when you need
it.
3. Assign a document type (i.e., invoice, bank statement, etc.) to the document.
This isn't strictly necessary but helps in finding a document when you need
it.
4. Assign a proper title to the document (the name of an item you bought, the
subject of the letter, etc.).
5. Check that the date of the document is correct. Paperless tries to read
the date from the content of the document, but this fails sometimes if the
OCR is bad or multiple dates appear on the document.
6. Remove inbox tags from the documents.
Task management
===============
Some documents require attention and require you to act on the document. You
may take two different approaches to handle these documents based on how
regularly you intend to use paperless and scan documents.
* If you scan and process your documents in paperless regularly, assign a
TODO tag to all scanned documents that you need to process. Create a saved
view on the dashboard that shows all documents with this tag.
* If you do not scan documents regularly and use paperless solely for archiving,
create a physical todo box next to your physical inbox and put documents you
need to process in the TODO box. When you performed the task associated with
the document, move it to the inbox.

@ -1,284 +0,0 @@
.. _utilities:
Utilities
=========
There are basically three utilities to Paperless: the webserver, consumer, and
if needed, the exporter. They're all detailed here.
.. _utilities-webserver:
The Webserver
-------------
At the heart of it, Paperless is a simple Django webservice, and the entire
interface is based on Django's standard admin interface. Once running, visiting
the URL for your service delivers the admin, through which you can get a
detailed listing of all available documents, search for specific files, and
download whatever it is you're looking for.
.. _utilities-webserver-howto:
How to Use It
.............
The webserver is started via the ``manage.py`` script:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py runserver
By default, the server runs on localhost, port 8000, but you can change this
with a few arguments, run ``manage.py --help`` for more information.
Add the option ``--noreload`` to reduce resource usage. Otherwise, the server
continuously polls all source files for changes to auto-reload them.
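For instance, binding to all interfaces without the auto-reloader might look like this
(address and port are illustrative):

.. code-block:: shell-session

$ /path/to/paperless/src/manage.py runserver --noreload 0.0.0.0:8000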
Note that when exiting this command your webserver will disappear.
If you want to run this full-time (which is kind of the point)
you'll need to have it start in the background -- something you'll need to
figure out for your own system. To get you started though, there are Systemd
service files in the ``scripts`` directory.
.. _utilities-consumer:
The Consumer
------------
The consumer script runs in an infinite loop, constantly looking at a directory
for documents to parse and index. The process is pretty straightforward:
1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
If not, wait 10 seconds and try again. On Linux, new documents are detected
instantly via inotify, so there's no waiting involved.
2. Parse the document with Tesseract
3. Create a new record in the database with the OCR'd text
4. Attempt to automatically assign document attributes by doing some guesswork.
Read up on the :ref:`guesswork documentation<guesswork>` for more
information about this process.
5. Encrypt the document (if you have a passphrase set) and store it in the
``media`` directory under ``documents/originals``.
6. Go to #1.
.. _utilities-consumer-howto:
How to Use It
.............
The consumer is started via the ``manage.py`` script:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_consumer
This starts the service that will consume documents as they appear in
``CONSUMPTION_DIR``.
Note that this command runs continuously, so exiting it will mean the consumer
stops. If you want to run this full-time (which is kind of the point)
you'll need to have it start in the background -- something you'll need to
figure out for your own system. To get you started though, there are Systemd
service files in the ``scripts`` directory.
Some command line arguments are available to customize the behavior of the
consumer. By default it will use ``/etc/paperless.conf`` values. Display the
help with:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_consumer --help
.. _utilities-exporter:
The Exporter
------------
Tired of fiddling with Paperless, or just want to do something stupid and are
afraid of accidentally damaging your files? You can export all of your
documents into neatly named, dated, and unencrypted files.
.. _utilities-exporter-howto:
How to Use It
.............
This too is done via the ``manage.py`` script:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
This will dump all of your unencrypted documents into ``/path/to/somewhere``
for you to do with as you please. The files are accompanied with a special
file, ``manifest.json`` which can be used to :ref:`import the files
<utilities-importer>` at a later date if you wish.
.. _utilities-exporter-howto-docker:
Docker
______
If you are :ref:`using Docker <setup-installation-docker>`, running the
exporter is almost as easy. To mount a volume for exports, follow the
instructions in the ``docker-compose.yml.example`` file for the ``/export``
volume (making the changes in your own ``docker-compose.yml`` file, of course).
Once you have the volume mounted, the command to run an export is:
.. code-block:: shell-session
$ docker-compose run --rm consumer document_exporter /export
If you prefer to use ``docker run`` directly, supply the necessary command-line
options:
.. code-block:: shell-session
$ # Identify your containers
$ docker-compose ps
Name Command State Ports
-------------------------------------------------------------------------
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
$ # Make sure to replace your passphrase and remove or adapt the id mapping
$ docker run --rm \
--volumes-from paperless_data_1 \
--volume /path/to/arbitrary/place:/export \
-e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
paperless document_exporter /export
.. _utilities-importer:
The Importer
------------
Looking to transfer Paperless data from one instance to another, or just want
to restore from a backup? This is your go-to toy.
.. _utilities-importer-howto:
How to Use It
.............
The importer works just like the exporter. You point it at a directory, and
the script does the rest of the work:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
Docker
______
Assuming that you've already gone through the steps above in the
:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
to do is just re-use the ``/export`` path you already set up:
.. code-block:: shell-session
$ docker-compose run --rm consumer document_importer /export
Similarly, if you're not using docker-compose, you can adjust the export
instructions above to do the import.
.. _utilities-retagger:
Re-running your tagging and correspondent matchers
--------------------------------------------------
Say you've imported a few hundred documents and now want to introduce
a tag or set up a new correspondent, and apply its matching to all of
the currently-imported docs. This problem is common enough that
there are tools for it.
.. _utilities-retagger-howto:
How to Do It
............
This too is done via the ``manage.py`` script:
.. code:: bash
$ /path/to/paperless/src/manage.py document_retagger
Run this after changing or adding tagging rules. It'll loop over all
of the documents in your database and attempt to match all of your
tags to them. If one matches, it'll be applied. And don't worry, you
can run this as often as you like, it won't double-tag a document.
.. code:: bash
$ /path/to/paperless/src/manage.py document_correspondents
This is the analogous command to run after adding or changing a correspondent.
.. _utilities-encyption:
Enabling Encryption
-------------------
Let's say you've imported a few documents to play around with paperless and now
you are using it more seriously and want to enable encryption of your files.
.. utilities-encryption-howto:
Basic Syntax
.............
Again we'll use the ``manage.py`` script, passing ``change_storage_type``:
.. code:: console
$ /path/to/paperless/src/manage.py change_storage_type --help
usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
[--settings SETTINGS]
[--pythonpath PYTHONPATH] [--traceback]
[--no-color] [--passphrase PASSPHRASE]
{gpg,unencrypted} {gpg,unencrypted}
This is how you migrate your stored documents from an encrypted state to an
unencrypted one (or vice-versa)
positional arguments:
{gpg,unencrypted} The state you want to change your documents from
{gpg,unencrypted} The state you want to change your documents to
optional arguments:
--passphrase PASSPHRASE
If PAPERLESS_PASSPHRASE isn't set already, you need to
specify it here
Enabling Encryption
...................
Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
.. code:: bash
$ /path/to/paperless/src/manage.py change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
Disabling Encryption
....................
Basic usage to disable encryption of your document store:
(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
.. code:: bash
$ /path/to/paperless/src/manage.py change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted

@ -1,269 +1,55 @@
# Sample paperless.conf
# Copy this file to /etc/paperless.conf and modify it to suit your needs.
# As this file contains passwords it should only be readable by the user
# running paperless.
# Have a look at the docs for documentation.
# https://paperless-ng.readthedocs.io/en/latest/configuration.html
###############################################################################
#### Message Broker ####
###############################################################################
# Debug. Only enable this for development.
# This is required for processing scheduled tasks such as email fetching, index
# optimization and for training the automatic document matcher.
# Defaults to localhost:6379.
#PAPERLESS_REDIS="redis://localhost:6379"
#PAPERLESS_DEBUG=false
# Required services
###############################################################################
#### Database Settings ####
###############################################################################
#PAPERLESS_REDIS=redis://localhost:6379
#PAPERLESS_DBHOST=localhost
#PAPERLESS_DBPORT=5432
#PAPERLESS_DBNAME=paperless
#PAPERLESS_DBUSER=paperless
#PAPERLESS_DBPASS=paperless
# By default, sqlite is used as the database backend. This can be changed here.
# The docker-compose service definition uses a postgresql server. The
# configuration for this is already done inside the docker-compose.env file.
# Paths and folders
#Set PAPERLESS_DBHOST and postgresql will be used instead of sqlite.
#PAPERLESS_DBHOST="localhost"
#PAPERLESS_CONSUMPTION_DIR=../consume
#PAPERLESS_DATA_DIR=../data
#PAPERLESS_MEDIA_ROOT=../media
#PAPERLESS_STATICDIR=../static
#PAPERLESS_FILENAME_FORMAT=
#Adjust port if necessary
#PAPERLESS_DBPORT=
# Security and hosting
#name, user and pass all default to "paperless"
#PAPERLESS_DBNAME="paperless"
#PAPERLESS_DBUSER="paperless"
#PAPERLESS_DBPASS="paperless"
#PAPERLESS_SECRET_KEY=change-me
#PAPERLESS_ALLOWED_HOSTS=example.com,www.example.com
#PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
#PAPERLESS_FORCE_SCRIPT_NAME=
#PAPERLESS_STATIC_URL=/static/
# Software tweaks
###############################################################################
#### Paths & Folders ####
###############################################################################
# This is where your documents should go to be consumed. Make sure that it exists
# and that the user running the paperless service can read/write its contents
# before you start Paperless.
PAPERLESS_CONSUMPTION_DIR="../consume"
# This is where paperless stores all its data (search index, sqlite database,
# classification model, etc).
#PAPERLESS_DATA_DIR="../data"
# This is where your documents and thumbnails are stored.
#PAPERLESS_MEDIA_ROOT="../media"
# Override the default STATIC_ROOT here. This is where all static files
# created using "collectstatic" manager command are stored.
#PAPERLESS_STATICDIR="../static"
# Override the STATIC_URL here. Unless you're hosting Paperless off a
# subdomain like /paperless/, you probably don't need to change this.
#PAPERLESS_STATIC_URL="/static/"
# These values are required if you want paperless to check a particular email
# box every 10 minutes and attempt to consume documents from there. If you
# don't define a HOST, mail checking will just be disabled.
#PAPERLESS_CONSUME_MAIL_HOST=""
#PAPERLESS_CONSUME_MAIL_PORT=""
#PAPERLESS_CONSUME_MAIL_USER=""
#PAPERLESS_CONSUME_MAIL_PASS=""
# Override the default IMAP inbox here. If not set Paperless defaults to
# "INBOX".
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
# Any email sent to the target account that does not contain this text will be
# ignored.
PAPERLESS_EMAIL_SECRET=""
# Specify a filename format for the document (directories are supported)
# Use the following placeholders:
# * {correspondent}
# * {title}
# * {created}
# * {added}
# * {tags[KEY]} If your tags conform to key_value or key-value
# * {tags[INDEX]} If your tags are strings, select the tag by index
# Uniqueness of filenames is ensured, as an incrementing counter is attached
# to each filename.
#PAPERLESS_FILENAME_FORMAT=""
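# For illustration only (this line is not part of the shipped defaults): a format
# like the following would sort documents into per-correspondent folders, named by
# creation date and title.
#PAPERLESS_FILENAME_FORMAT="{correspondent}/{created} - {title}"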
###############################################################################
#### Security ####
###############################################################################
# Controls whether django's debug mode is enabled. Disable this on production
# systems. Debug mode is disabled by default.
#PAPERLESS_DEBUG="false"
# GnuPG encryption is deprecated and will be removed in future versions.
#
# Paperless can be instructed to attempt to encrypt your PDF files with GPG
# using the PAPERLESS_PASSPHRASE specified below. If however you're not
# concerned about encrypting these files (for example if you have disk
# encryption locally) then you don't need this and can safely leave this value
# un-set.
#
# One final note about the passphrase. Once you've consumed a document with
# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and
# can't properly export documents that were encrypted with an old passphrase if
# you've since changed it to a new one.
#
# The default is to not use encryption at all.
#PAPERLESS_PASSPHRASE="secret"
# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
# public, you should change the key to something unique and verbose.
#PAPERLESS_SECRET_KEY="change-me"
# If you're planning on putting Paperless on the open internet, then you
# really should set this value to the domain name you're using. Failing to do
# so leaves you open to HTTP host header attacks:
# https://docs.djangoproject.com/en/1.10/topics/security/#host-headers-virtual-hosting
#
# Just remember that this is a comma-separated list, so "example.com" is fine,
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
# If you decide to use the Paperless API in an ajax call, you need to add your
# servers to the list of allowed hosts that can do CORS calls. By default
# Paperless allows calls from localhost:8080, but if you'd like to change that,
# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
# To host paperless under a subpath url like example.com/paperless you set
# this value to /paperless. No trailing slash!
#
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
#PAPERLESS_FORCE_SCRIPT_NAME=""
###############################################################################
#### Software Tweaks ####
###############################################################################
# After a document is consumed, Paperless can trigger an arbitrary script if
# you like. This script will be passed a number of arguments for you to work
# with. The default is blank, which means nothing will be executed. For more
# information, take a look at the docs:
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
# By default, paperless will check the document text for document date information.
# Uncomment the line below to enable checking the document filename for date
# information. The date order can be set to any option as specified in
# https://dateparser.readthedocs.io/en/latest/#settings. The filename will be
# checked first, and if nothing is found, the document text will be checked
# as normal.
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
# Sometimes devices won't create filenames which can be parsed properly
# by the filename parser (see
# https://paperless.readthedocs.io/en/latest/guesswork.html).
#
# This setting allows you to specify a list of transformations
# in regular expression syntax, which are passed in order to re.sub.
# Transformation stops after the first match, so at most one transformation
# is applied.
#
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
# as keys.
#
# The example below transforms filenames created by a Brother ADS-2400N
# document scanner in its standard configuration `Name_Date_Count', so that
# count is used as title, name as tag and date can be parsed by paperless.
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
#
# The following values use sensible defaults for modern systems, but if you're
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
# some of these values may be necessary.
#
# By default, Paperless will attempt to use all available CPU cores to process
# a document, but if you would like to limit that, you can set this value to
# an integer:
#PAPERLESS_OCR_THREADS=1
# Customize the default language that tesseract will attempt to use when
# parsing documents. The default language is used whenever
# - No language could be detected on a document
# - No tesseract data files are available for the detected language
# It should be a 3-letter language code consistent with ISO
# 639: https://www.loc.gov/standards/iso639-2/php/code_list.php
#PAPERLESS_OCR_LANGUAGE=eng
# On smaller systems, or even in the case of Very Large Documents, the consumer
# may explode, complaining about how it's "unable to extend pixel cache". In
# such cases, try setting this to a reasonably low value, like 32000000. The
# default is to use whatever is necessary to do everything without writing to
# disk, and units are in megabytes.
#
# For more information on how to use this value, you should probably search
# the web for "MAGICK_MEMORY_LIMIT".
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
# Similar to the memory limit, if you've got a small system and your OS mounts
# /tmp as tmpfs, you should set this to a path that's on a physical disk, like
# /home/your_user/tmp or something. ImageMagick will use this as scratch space
# when crunching through very large documents.
#
# For more information on how to use this value, you should probably search
# the web for "MAGICK_TMPDIR".
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
# By default the conversion density setting for documents is 300DPI, in some
# cases it has proven useful to configure a lesser value.
# This setting has a high impact on the physical size of tmp page files,
# the speed of document conversion, and can affect the accuracy of OCR
# results. Individual results can vary and this setting should be tested
# thoroughly against the documents you are importing to see if it has any
# impacts either negative or positive.
# Testing on limited document sets has shown a setting of 200 can cut the
# size of tmp files by 1/3, and speed up conversion by up to 4x
# with little impact to OCR accuracy.
#PAPERLESS_CONVERT_DENSITY=300
# By default Paperless does not OCR a document if the text can be retrieved from
# the document directly. Set to true to always OCR documents.
#PAPERLESS_OCR_ALWAYS="false"
###############################################################################
#### Interface ####
###############################################################################
# Override the default UTC time zone here.
# See https://docs.djangoproject.com/en/1.10/ref/settings/#std:setting-TIME_ZONE
# for details on how to set it.
#PAPERLESS_TASK_WORKERS=1
#PAPERLESS_THREADS_PER_WORKER=1
#PAPERLESS_TIME_ZONE=UTC
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_ALWAYS=false
#PAPERLESS_CONSUMER_POLLING=10
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
#PAPERLESS_CONVERT_DENSITY=300
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
###############################################################################
#### Third-Party Binaries ####
###############################################################################
# There are a few external software packages that Paperless expects to find on
# your system when it starts up. Unless you've done something creative with
# their installation, you probably won't need to edit any of these. However,
# if you've installed these programs somewhere where simply typing the name of
# the program doesn't automatically execute it (i.e. the program isn't in your
# $PATH), then you'll need to specify the literal path for that program here.
# Convert (part of the ImageMagick suite)
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
# Ghostscript
#PAPERLESS_GS_BINARY=/usr/bin/gs
# Unpaper
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
# Optipng (for optimising thumbnail sizes)
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng

101
scripts/make-release.sh Executable file
View File

@ -0,0 +1,101 @@
#!/bin/bash
set -e
VERSION=$1
if [ -z "$VERSION" ]
then
echo "Need a version string."
exit 1
fi
# source root directory of paperless
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
# output directory
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"
if [ -d "$PAPERLESS_DIST" ]
then
echo "Removing $PAPERLESS_DIST"
rm "$PAPERLESS_DIST" -r
fi
mkdir "$PAPERLESS_DIST"
mkdir "$PAPERLESS_DIST_APP"
mkdir "$PAPERLESS_DIST_APP/docker"
# setup dependencies.
cd "$PAPERLESS_ROOT"
pipenv clean
pipenv install --dev
pipenv lock --keep-outdated -r > "$PAPERLESS_DIST_APP/requirements.txt"
# test if the application works.
cd "$PAPERLESS_ROOT/src"
pipenv run pytest --cov
pipenv run pycodestyle
# make the documentation.
cd "$PAPERLESS_ROOT/docs"
make clean html
# copy stuff into place
# the application itself
cp "$PAPERLESS_ROOT/.env" \
"$PAPERLESS_ROOT/.dockerignore" \
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
"$PAPERLESS_ROOT/LICENSE" \
"$PAPERLESS_ROOT/Pipfile" \
"$PAPERLESS_ROOT/Pipfile.lock" \
"$PAPERLESS_ROOT/README.md" "$PAPERLESS_DIST_APP"
cp "$PAPERLESS_ROOT/paperless.conf.example" "$PAPERLESS_DIST_APP/paperless.conf"
# copy python source, templates and static files.
cd "$PAPERLESS_ROOT"
find src -wholename '*/templates/*' -o -wholename '*/static/*' -o -name '*.py' | cpio -pdm "$PAPERLESS_DIST_APP"
# build the front end.
cd "$PAPERLESS_ROOT/src-ui"
ng build --prod --output-hashing none --sourceMap=false --output-path "$PAPERLESS_DIST_APP/src/documents/static/frontend"
# documentation
cp "$PAPERLESS_ROOT/docs/_build/html/" "$PAPERLESS_DIST_APP/docs" -r
# docker files for building the image yourself
cp "$PAPERLESS_ROOT/docker/local/"* "$PAPERLESS_DIST_APP"
cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST_APP"
# docker files for pulling from docker hub
cp "$PAPERLESS_ROOT/docker/hub/"* "$PAPERLESS_DIST"
cp "$PAPERLESS_ROOT/.env" "$PAPERLESS_DIST"
cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST"
# auxiliary files required for the docker image
cp "$PAPERLESS_ROOT/docker/docker-entrypoint.sh" "$PAPERLESS_DIST_APP/docker/"
cp "$PAPERLESS_ROOT/docker/gunicorn.conf.py" "$PAPERLESS_DIST_APP/docker/"
cp "$PAPERLESS_ROOT/docker/imagemagick-policy.xml" "$PAPERLESS_DIST_APP/docker/"
cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
# try to make the docker build.
cd "$PAPERLESS_DIST_APP"
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
# works. package the app!
cd "$PAPERLESS_DIST"
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
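A usage sketch for the script above (the version string is made up): running
scripts/make-release.sh 1.0.0 from anywhere inside the repository cleans and
reinstalls the pipenv environment, runs the test suite and style checks,
builds the documentation and the Angular frontend, assembles everything under
dist/paperless-ng/, builds the jonaswinkler/paperless-ng:1.0.0 Docker image
from that directory, and finally packages dist/paperless-ng-1.0.0.tar.xz.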

23
scripts/push-release.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
set -e
VERSION=$1
if [ -z "$VERSION" ]
then
echo "Need a version string."
exit 1
fi
# source root directory of paperless
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
# output directory
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"
cd "$PAPERLESS_DIST_APP"
docker push "jonaswinkler/paperless-ng:$VERSION"

2
scripts/start_services.sh Executable file
View File

@ -0,0 +1,2 @@
docker run -p 5432:5432 -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13
docker run -d -p 6379:6379 redis:latest
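These two commands are development helpers: they start a PostgreSQL 13
container (with a named volume for its data) and a Redis container,
presumably the development database and the message broker for the new
background task queue, respectively.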

View File

@ -8260,6 +8260,14 @@
"moment": "2.18.1"
}
},
"ngx-cookie-service": {
"version": "10.1.1",
"resolved": "https://registry.npmjs.org/ngx-cookie-service/-/ngx-cookie-service-10.1.1.tgz",
"integrity": "sha512-HvBrYHdxMN1NvFJGEIF/8EuAg2fjxj8QwqTv9h6qZGqNLU+lUba8Pb2zRPw1YA+gqKkJawOy5dYNeH0kyPyipw==",
"requires": {
"tslib": "^2.0.0"
}
},
"ngx-file-drop": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/ngx-file-drop/-/ngx-file-drop-10.0.0.tgz",

View File

@ -23,6 +23,7 @@
"@ng-bootstrap/ng-bootstrap": "^8.0.0",
"bootstrap": "^4.5.0",
"ng-bootstrap": "^1.6.3",
"ngx-cookie-service": "^10.1.1",
"ngx-file-drop": "^10.0.0",
"ngx-infinite-scroll": "^9.1.0",
"rxjs": "~6.6.0",

View File

@ -39,6 +39,8 @@ import { InfiniteScrollModule } from 'ngx-infinite-scroll';
import { DateTimeComponent } from './components/common/input/date-time/date-time.component';
import { TagsComponent } from './components/common/input/tags/tags.component';
import { SortableDirective } from './directives/sortable.directive';
import { CookieService } from 'ngx-cookie-service';
import { CsrfInterceptor } from './interceptors/csrf.interceptor';
import { SavedViewWidgetComponent } from './components/dashboard/widgets/saved-view-widget/saved-view-widget.component';
import { ConsumerStatusWidgetComponent } from './components/dashboard/widgets/consumer-status-widget/consumer-status-widget.component';
import { StatisticsWidgetComponent } from './components/dashboard/widgets/statistics-widget/statistics-widget.component';
@ -93,7 +95,12 @@ import { FileUploadWidgetComponent } from './components/dashboard/widgets/file-u
InfiniteScrollModule
],
providers: [
DatePipe
DatePipe,
CookieService, {
provide: HTTP_INTERCEPTORS,
useClass: CsrfInterceptor,
multi: true
}
],
bootstrap: [AppComponent]
})

View File

@ -1,5 +1,5 @@
<nav class="navbar navbar-dark sticky-top bg-dark flex-md-nowrap p-0 shadow">
<span class="navbar-brand col-md-3 col-lg-2 mr-0 px-3" href="#">Paperless</span>
<span class="navbar-brand col-md-3 col-lg-2 mr-0 px-3" href="#">Paperless-ng</span>
<button class="navbar-toggler position-absolute d-md-none collapsed" type="button" data-toggle="collapse"
data-target="#sidebarMenu" aria-controls="sidebarMenu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
@ -69,6 +69,14 @@
{{d.title}}
</a>
</li>
<li class="nav-item w-100" *ngIf="openDocuments.length > 1">
<a class="nav-link text-truncate" [routerLink]="" (click)="closeAll()">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#x"/>
</svg>
Close all
</a>
</li>
</ul>
<h6 class="sidebar-heading d-flex justify-content-between align-items-center px-3 mt-4 mb-1 text-muted">
@ -124,6 +132,28 @@
</a>
</li>
</ul>
<h6 class="sidebar-heading d-flex justify-content-between align-items-center px-3 mt-4 mb-1 text-muted">
<span>Misc</span>
</h6>
<ul class="nav flex-column mb-2">
<li class="nav-item">
<a class="nav-link" href="https://paperless-ng.readthedocs.io/en/latest/">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#question-circle"/>
</svg>
Documentation
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/jonaswinkler/paperless-ng">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#link"/>
</svg>
Github
</a>
</li>
</ul>
</div>
</nav>

View File

@ -1,12 +1,13 @@
import { Component, OnDestroy, OnInit } from '@angular/core';
import { FormControl } from '@angular/forms';
import { Router } from '@angular/router';
import { ActivatedRoute, Router } from '@angular/router';
import { from, Observable, Subscription } from 'rxjs';
import { debounceTime, distinctUntilChanged, map, switchMap } from 'rxjs/operators';
import { PaperlessDocument } from 'src/app/data/paperless-document';
import { OpenDocumentsService } from 'src/app/services/open-documents.service';
import { SearchService } from 'src/app/services/rest/search.service';
import { SavedViewConfigService } from 'src/app/services/saved-view-config.service';
import { DocumentDetailComponent } from '../document-detail/document-detail.component';
@Component({
selector: 'app-app-frame',
@ -17,6 +18,7 @@ export class AppFrameComponent implements OnInit, OnDestroy {
constructor (
public router: Router,
private activatedRoute: ActivatedRoute,
private openDocumentsService: OpenDocumentsService,
private searchService: SearchService,
public viewConfigService: SavedViewConfigService
@ -62,6 +64,19 @@ export class AppFrameComponent implements OnInit, OnDestroy {
this.router.navigate(['search'], {queryParams: {query: this.searchField.value}})
}
closeAll() {
this.openDocumentsService.closeAll()
// TODO: is there a better way to do this?
let route = this.activatedRoute
while (route.firstChild) {
route = route.firstChild
}
if (route.component == DocumentDetailComponent) {
this.router.navigate([""])
}
}
ngOnInit() {
this.openDocuments = this.openDocumentsService.getOpenDocuments()
}

View File

@ -86,7 +86,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor {
var modal = this.modalService.open(TagEditDialogComponent, {backdrop: 'static'})
modal.componentInstance.dialogMode = 'create'
modal.componentInstance.success.subscribe(newTag => {
this.tagService.list().subscribe(tags => {
this.tagService.listAll().subscribe(tags => {
this.tags = tags.results
this.addTag(newTag.id)
})

View File

@ -1,6 +1,6 @@
<div class="row pt-3 pb-2 mb-3 border-bottom align-items-center">
<div class="row pt-3 pb-1 mb-3 border-bottom align-items-center" >
<div class="col text-truncate">
<h1 class="h2 text-truncate">{{title}}</h1>
<h1 class="h2 text-truncate" style="line-height: 1.4">{{title}}</h1>
</div>
<div class="btn-toolbar col-auto">
<ng-content></ng-content>

View File

@ -2,7 +2,7 @@
<app-page-header title="Dashboard">
</app-page-header>
<p>Welcome to paperless!</p>
<p>Welcome to paperless-ng!</p>
<div class='row'>
<div class="col-lg">

View File

@ -49,7 +49,6 @@ export class DocumentDetailComponent implements OnInit {
private route: ActivatedRoute,
private correspondentService: CorrespondentService,
private documentTypeService: DocumentTypeService,
private datePipe: DatePipe,
private router: Router,
private modalService: NgbModal,
private openDocumentService: OpenDocumentsService,
@ -89,7 +88,7 @@ export class DocumentDetailComponent implements OnInit {
var modal = this.modalService.open(DocumentTypeEditDialogComponent, {backdrop: 'static'})
modal.componentInstance.dialogMode = 'create'
modal.componentInstance.success.subscribe(newDocumentType => {
this.documentTypeService.list().subscribe(documentTypes => {
this.documentTypeService.listAll().subscribe(documentTypes => {
this.documentTypes = documentTypes.results
this.documentForm.get('document_type_id').setValue(newDocumentType.id)
})
@ -100,7 +99,7 @@ export class DocumentDetailComponent implements OnInit {
var modal = this.modalService.open(CorrespondentEditDialogComponent, {backdrop: 'static'})
modal.componentInstance.dialogMode = 'create'
modal.componentInstance.success.subscribe(newCorrespondent => {
this.correspondentService.list().subscribe(correspondents => {
this.correspondentService.listAll().subscribe(correspondents => {
this.correspondents = correspondents.results
this.documentForm.get('correspondent_id').setValue(newCorrespondent.id)
})

View File

@ -85,8 +85,8 @@
<th>Correspondent</th>
<th>Title</th>
<th>Document type</th>
<th>Date created</th>
<th>Date added</th>
<th>Created</th>
<th>Added</th>
</thead>
<tbody>
<tr *ngFor="let d of docs.documents" routerLink="/documents/{{d.id}}">

View File

@ -1,3 +1,7 @@
.log-entry-10 {
color: lightslategray !important;
}
.log-entry-30 {
color: yellow !important;
}

View File

@ -30,7 +30,7 @@ export class LogsComponent implements OnInit {
onScroll() {
let lastCreated = null
if (this.logs.length > 0) {
lastCreated = this.logs[this.logs.length-1].created
lastCreated = new Date(this.logs[this.logs.length-1].created).toISOString()
}
this.logService.list(1, 25, 'created', 'des', {'created__lt': lastCreated, 'level__gte': this.level}).subscribe(result => {
this.logs.push(...result.results)

View File

@ -0,0 +1,16 @@
import { TestBed } from '@angular/core/testing';
import { CsrfInterceptor } from './csrf.interceptor';
describe('CsrfInterceptor', () => {
beforeEach(() => TestBed.configureTestingModule({
providers: [
CsrfInterceptor
]
}));
it('should be created', () => {
const interceptor: CsrfInterceptor = TestBed.inject(CsrfInterceptor);
expect(interceptor).toBeTruthy();
});
});

View File

@ -0,0 +1,30 @@
import { Injectable } from '@angular/core';
import {
HttpRequest,
HttpHandler,
HttpEvent,
HttpInterceptor
} from '@angular/common/http';
import { Observable } from 'rxjs';
import { CookieService } from 'ngx-cookie-service';
@Injectable()
export class CsrfInterceptor implements HttpInterceptor {
constructor(private cookieService: CookieService) {
}
intercept(request: HttpRequest<unknown>, next: HttpHandler): Observable<HttpEvent<unknown>> {
let csrfToken = this.cookieService.get('csrftoken')
if (csrfToken) {
request = request.clone({
setHeaders: {
'X-CSRFToken': csrfToken
}
})
}
return next.handle(request);
}
}
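A note on what this interceptor relies on (standard Django behaviour, not
something introduced in this commit): Django's CSRF middleware accepts the
token from a request header (X-CSRFToken by default), so mirroring the
csrftoken cookie into that header lets the Angular frontend pass CSRF
validation on mutating requests without embedding the token in
server-rendered templates.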

View File

@ -1,5 +1,4 @@
import { Injectable } from '@angular/core';
import { Observable, Subject } from 'rxjs';
import { PaperlessDocument } from '../data/paperless-document';
import { OPEN_DOCUMENT_SERVICE } from '../data/storage-keys';
@ -8,6 +7,8 @@ import { OPEN_DOCUMENT_SERVICE } from '../data/storage-keys';
})
export class OpenDocumentsService {
private MAX_OPEN_DOCUMENTS = 5
constructor() {
if (sessionStorage.getItem(OPEN_DOCUMENT_SERVICE.DOCUMENTS)) {
try {
@ -31,7 +32,10 @@ export class OpenDocumentsService {
openDocument(doc: PaperlessDocument) {
if (this.openDocuments.find(d => d.id == doc.id) == null) {
this.openDocuments.push(doc)
this.openDocuments.unshift(doc)
if (this.openDocuments.length > this.MAX_OPEN_DOCUMENTS) {
this.openDocuments.pop()
}
this.save()
}
}
@ -44,6 +48,11 @@ export class OpenDocumentsService {
}
}
closeAll() {
this.openDocuments.splice(0, this.openDocuments.length)
this.save()
}
save() {
sessionStorage.setItem(OPEN_DOCUMENT_SERVICE.DOCUMENTS, JSON.stringify(this.openDocuments))
}

View File

@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="69.999977mm"
height="84.283669mm"
viewBox="0 0 69.999977 84.283669"
version="1.1"
id="svg4812"
inkscape:version="1.0.1 (3bc2e813f5, 2020-09-07)"
sodipodi:docname="logo-dark-notext.svg">
<defs
id="defs4806" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="0.98994949"
inkscape:cx="328.04904"
inkscape:cy="330.33332"
inkscape:document-units="mm"
inkscape:current-layer="SvgjsG1020"
inkscape:document-rotation="0"
showgrid="false"
inkscape:window-width="1920"
inkscape:window-height="1016"
inkscape:window-x="1280"
inkscape:window-y="27"
inkscape:window-maximized="1" />
<metadata
id="metadata4809">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-9.9999792,-10.000082)">
<g
id="SvgjsG1020"
featureKey="symbol1"
fill="#ffffff"
transform="matrix(0.10341565,0,0,0.10341565,1.2287665,8.3453496)">
<path
id="path57"
style="fill:#ffffff;stroke-width:1.10017"
d="M 752.4375,82.365234 C 638.02019,348.60552 87.938206,381.6089 263.96484,810.67383 c 2.20034,5.50083 -40.70621,63.80947 -69.31054,112.21679 -6.601,-24.20366 -14.30329,-50.6063 -13.20313,-52.80664 C 324.47281,700.65835 79.135592,604.94324 65.933594,466.32227 4.3242706,576.33891 -17.678136,768.86756 168.25,879.98438 c 1.10017,-10e-6 9.90207,41.80777 14.30273,62.71093 -4.40066,8.80133 -8.80162,17.60213 -11.00195,24.20313 -4.40066,11.00166 28.60352,9.90123 28.60352,12.10156 3.3005,-1.10017 81.41295,-138.62054 83.61328,-139.7207 C 726.0345,738.06398 804.14532,339.80419 752.4375,82.365234 Z M 526.9043,362.90625 C 320.073,547.73422 284.86775,685.25508 291.46875,752.36523 222.15826,588.44043 425.68898,408.01308 526.9043,362.90625 Z M 127.54297,626.94727 c 39.60599,36.30549 105.6163,147.4222 49.50781,212.33203 13.202,-29.7045 17.60234,-96.81455 -49.50781,-212.33203 z"
transform="matrix(0.90895334,0,0,0.90895334,65.06894,-58.865357)" />
<defs
id="defs14302" />
</g>
</g>
</svg>

View File

@ -1,5 +1,4 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
@ -32,7 +31,7 @@ class TagAdmin(admin.ModelAdmin):
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
readonly_fields = ("slug",)
readonly_fields = ("slug", )
class DocumentTypeAdmin(admin.ModelAdmin):
@ -51,9 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type",)
list_display = ("title", "created", "added", "correspondent",
"tags_", "archive_serial_number", "document_type")
readonly_fields = ("added", "file_type", "storage_type", "filename")
list_display = (
"title",
"created",
"added",
"correspondent",
"tags_",
"archive_serial_number",
"document_type",
"filename"
)
list_filter = (
"document_type",
"tags",
@ -120,8 +127,3 @@ admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
# Unless we implement multi-user, these default registrations don't make sense.
admin.site.unregister(Group)
admin.site.unregister(User)

View File

@ -1,5 +1,4 @@
from django.apps import AppConfig
from django.db.models.signals import post_delete
class DocumentsConfig(AppConfig):
@ -14,7 +13,6 @@ class DocumentsConfig(AppConfig):
add_inbox_tags,
run_pre_consume_script,
run_post_consume_script,
cleanup_document_deletion,
set_log_entry,
set_correspondent,
set_document_type,
@ -33,6 +31,4 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_post_consume_script)
post_delete.connect(cleanup_document_deletion)
AppConfig.ready(self)

View File

@ -4,6 +4,8 @@ from django.conf import settings
from django.core.checks import Error, register
from django.db.utils import OperationalError, ProgrammingError
from documents.signals import document_consumer_declaration
@register()
def changed_password_check(app_configs, **kwargs):
@ -37,3 +39,17 @@ def changed_password_check(app_configs, **kwargs):
"""))]
return []
@register()
def parser_check(app_configs, **kwargs):
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
if len(parsers) == 0:
return [Error("No parsers found. This is a bug. The consumer won't be "
"able to onsume any documents without parsers.")]
else:
return []

View File

@ -3,7 +3,6 @@ import logging
import os
import pickle
import re
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
@ -64,7 +63,7 @@ class DocumentClassifier(object):
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f) # Version
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.data_hash, f)
pickle.dump(self.data_vectorizer, f)
@ -89,16 +88,14 @@ class DocumentClassifier(object):
data.append(preprocessed_content)
y = -1
if doc.document_type:
if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_document_type.append(y)
y = -1
if doc.correspondent:
if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
@ -120,8 +117,8 @@ class DocumentClassifier(object):
num_tags = len(labels_tags_unique)
# subtract 1 since -1 (null) is also part of the classes.
num_correspondents = len(labels_correspondent) - 1
num_document_types = len(labels_document_type) - 1
num_correspondents = len(set(labels_correspondent)) - 1
num_document_types = len(set(labels_document_type)) - 1
logging.getLogger(__name__).debug(
"{} documents, {} tag(s), {} correspondent(s), "
@ -137,7 +134,7 @@ class DocumentClassifier(object):
logging.getLogger(__name__).debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1,2),
ngram_range=(1, 2),
min_df=0.01
)
data_vectorized = self.data_vectorizer.fit_transform(data)

View File

@ -3,7 +3,6 @@ import hashlib
import logging
import os
import re
import uuid
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
@ -13,7 +12,9 @@ from django.utils import timezone
from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .models import Document, FileInfo
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .signals import (
document_consumption_finished,
@ -25,17 +26,10 @@ class ConsumerError(Exception):
pass
class Consumer:
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale pnm
2. Use tesseract on the pnm
3. Store the document in the MEDIA_ROOT with optional encryption
4. Store the OCR'd text in the database
5. Delete the document and image(s)
"""
class Consumer(LoggingMixin):
def _send_progress(self, filename, current_progress, max_progress, status, message, document_id=None):
def _send_progress(self, filename, current_progress, max_progress, status,
message, document_id=None):
payload = {
'filename': os.path.basename(filename),
'current_progress': current_progress,
@ -44,156 +38,226 @@ class Consumer:
'message': message,
'document_id': document_id
}
async_to_sync(self.channel_layer.group_send)("status_updates", {'type': 'status_update', 'data': payload})
async_to_sync(self.channel_layer.group_send)("status_updates",
{'type': 'status_update',
'data': payload})
def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):
self.logger = logging.getLogger(__name__)
self.logging_group = None
self.consume = consume
self.scratch = scratch
self.classifier = DocumentClassifier()
def __init__(self):
super().__init__()
self.path = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
self.override_tag_ids = None
self.override_document_type_id = None
self.channel_layer = get_channel_layer()
os.makedirs(self.scratch, exist_ok=True)
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
self.storage_type = Document.STORAGE_TYPE_GPG
if not self.consume:
def pre_check_consumption_dir(self):
if not settings.CONSUMPTION_DIR:
raise ConsumerError(
"The CONSUMPTION_DIR settings variable does not appear to be "
"set."
)
"set.")
if not os.path.exists(self.consume):
if not os.path.isdir(settings.CONSUMPTION_DIR):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.consume))
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
@transaction.atomic
def try_consume_file(self, file):
"""
Return True if file was consumed
"""
self.logging_group = uuid.uuid4()
if not re.match(FileInfo.REGEXES["title"], file):
return False
doc = file
if self._is_duplicate(doc):
self.log(
"warning",
"Skipping {} as it appears to be a duplicate".format(doc)
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(checksum=checksum).exists():
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
raise ConsumerError(
"Not consuming {}: It is a duplicate.".format(self.filename)
)
return False
self.log("info", "Consuming {}".format(doc))
def pre_check_directories(self):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
def try_consume_file(self,
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
"""
Return the document object if it was successfully created.
"""
parser_class = get_parser_class(doc)
self.path = path
self.filename = override_filename or os.path.basename(path)
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
self.override_tag_ids = override_tag_ids
# this is for grouping logging entries for this particular file
# together.
self.renew_logging_group()
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class.
parser_class = get_parser_class(self.filename)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
return False
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else:
self.log("info", "Parser: {}".format(parser_class.__name__))
self.log("debug", "Parser: {}".format(parser_class.__name__))
self._send_progress(file, 0, 100, 'WORKING', 'Consumption started')
# Notify all listeners that we're going to do some work.
self._send_progress(self.filename, 0, 100, 'WORKING', 'Consumption started')
document_consumption_started.send(
sender=self.__class__,
filename=doc,
filename=self.path,
logging_group=self.logging_group
)
def progress_callback(current_progress, max_progress, message):
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 60 + 20)
self._send_progress(file, p, 100, "WORKING", message)
self._send_progress(self.filename, p, 100, "WORKING", message)
document_parser = parser_class(doc, self.logging_group, progress_callback)
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.path, self.logging_group, progress_callback)
# However, this already created working directories which we have to
# clean up.
# Parse the document. This may take some time.
try:
self.log("info", "Generating thumbnail for {}...".format(doc))
self._send_progress(file, 10, 100, 'WORKING',
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
self._send_progress(self.filename, 10, 100, 'WORKING',
'Generating thumbnail...')
thumbnail = document_parser.get_optimised_thumbnail()
self._send_progress(file, 20, 100, 'WORKING',
self.log("debug", "Parsing {}...".format(self.filename))
self._send_progress(self.filename, 20, 100, 'WORKING',
'Getting text from document...')
text = document_parser.get_text()
self._send_progress(file, 80, 100, 'WORKING',
self._send_progress(self.filename, 80, 100, 'WORKING',
'Getting date from document...')
date = document_parser.get_date()
self._send_progress(file, 85, 100, 'WORKING',
'Storing the document...')
document = self._store(
text,
doc,
thumbnail,
date
)
except ParseError as e:
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
document_parser.cleanup()
self._send_progress(self.filename, 100, 100, 'FAILED',
"Failed: {}".format(e))
raise ConsumerError(e)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
try:
classifier = DocumentClassifier()
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning(
"Cannot classify documents: {}.".format(e))
classifier = None
self._send_progress(self.filename, 85, 100, 'WORKING',
'Storing the document...')
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
document = self._store(
text=text,
date=date
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
self._send_progress(self.filename, 90, 100, 'WORKING',
'Performing post-consumption tasks...')
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
create_source_path_directory(document.source_path)
self._write(document, self.path, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
except Exception as e:
raise ConsumerError(e)
self._send_progress(file, 100, 100, 'FAILED',
"Failed: {}".format(e))
finally:
document_parser.cleanup()
return False
else:
document_parser.cleanup()
self._cleanup_doc(doc)
self.log(
"info",
"Document {} consumption finished".format(document)
)
self.log(
"info",
"Document {} consumption finished".format(document)
)
classifier = None
self._send_progress(file, 100, 100, 'SUCCESS',
'Finished.', document.id)
try:
self.classifier.reload()
classifier = self.classifier
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
return document
self._send_progress(file, 90, 100, 'WORKING',
'Performing post-consumption tasks...')
def _store(self, text, date):
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
self._send_progress(file, 100, 100, 'SUCCESS',
'Finished.', document.id)
return True
# If someone gave us the original filename, use it instead of doc.
def _store(self, text, doc, thumbnail, date):
file_info = FileInfo.from_path(self.filename)
file_info = FileInfo.from_path(doc)
stats = os.stat(doc)
stats = os.stat(self.path)
self.log("debug", "Saving record to database")
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
else:
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with open(self.path, "rb") as f:
document = Document.objects.create(
correspondent=file_info.correspondent,
title=file_info.title,
@ -202,7 +266,7 @@ class Consumer:
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
storage_type=self.storage_type
storage_type=storage_type
)
relevant_tags = set(file_info.tags)
@ -211,14 +275,30 @@ class Consumer:
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
self.apply_overrides(document)
#TODO: why do we need to save the document again?
document.filename = generate_filename(document)
# We need to save the document twice, since we need the PK of the
# document in order to create its filename above.
document.save()
return document
def apply_overrides(self, document):
if self.override_title:
document.title = self.override_title
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:
document.tags.add(Tag.objects.get(pk=tag_id))
def _write(self, document, source, target):
with open(source, "rb") as read_file:
with open(target, "wb") as write_file:
@ -227,13 +307,3 @@ class Consumer:
return
self.log("debug", "Encrypting")
write_file.write(GnuPG.encrypted(read_file))
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
@staticmethod
def _is_duplicate(doc):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()
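A minimal usage sketch of the reworked consumer API (the path, filename and
tag IDs below are invented for illustration, and error handling is reduced to
a log call); this is presumably close to how the new
documents.tasks.consume_file task drives it:

    import logging

    from documents.consumer import Consumer, ConsumerError

    try:
        document = Consumer().try_consume_file(
            "/usr/src/paperless/consume/scan_0001.pdf",
            override_filename="scan_0001.pdf",
            override_tag_ids=[1, 2],
        )
    except ConsumerError as e:
        # pre-check failures (missing file, duplicate, unsafe filename, no
        # parser) and parse failures are all surfaced as ConsumerError
        logging.getLogger(__name__).error("Consumption failed: {}".format(e))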

View File

@ -0,0 +1,102 @@
import logging
import os
from collections import defaultdict
from django.conf import settings
from django.template.defaultfilters import slugify
def create_source_path_directory(source_path):
os.makedirs(os.path.dirname(source_path), exist_ok=True)
def delete_empty_directories(directory):
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(settings.ORIGINALS_DIR)
if not directory.startswith(root + os.path.sep):
# don't do anything outside our originals folder.
# append os.path.sep so that we avoid these cases:
# directory = /home/originals2/test
# root = /home/originals ("/" gets appended and startswith fails)
return
while directory != root:
if not os.listdir(directory):
# it's empty
try:
os.rmdir(directory)
except OSError:
# whatever. empty directories aren't that bad anyway.
return
else:
# it's not empty.
return
# go one level up
directory = os.path.normpath(os.path.dirname(directory))
def many_to_dictionary(field):
# Converts a ManyToManyField to a dictionary by assuming that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
if delimiter == -1:
delimiter = t.name.find('-')
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter + 1:]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_filename(document):
# Create filename based on configured format
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
many_to_dictionary(document.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
correspondent=slugify(document.correspondent),
title=slugify(document.title),
created=slugify(document.created),
created_year=document.created.year if document.created else "none",
created_month=document.created.month if document.created else "none",
created_day=document.created.day if document.created else "none",
added=slugify(document.added),
added_year=document.added.year if document.added else "none",
added_month=document.added.month if document.added else "none",
added_day=document.added.day if document.added else "none",
tags=tags,
)
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
else:
filename = "%07i.%s" % (document.pk, document.file_type)
# Append .gpg for encrypted files
if document.storage_type == document.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
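A worked example of the format logic above (the format string and document
values are assumptions for illustration, not defaults): with
PAPERLESS_FILENAME_FORMAT={created_year}/{correspondent}/{title}, a PDF with
primary key 123, correspondent "Some Bank", title "Annual Statement" and a
creation date in 2020 would be stored as

    2020/some-bank/annual-statement-0000123.pdf

with ".gpg" appended when the document is stored encrypted, and a plain
0000123.pdf-style name used whenever the format is unset or invalid.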

View File

@ -1,10 +1,11 @@
import os
import tempfile
from datetime import datetime
from time import mktime
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError
@ -19,12 +20,6 @@ class UploadForm(forms.Form):
raise forms.ValidationError("That filename is suspicious.")
return self.cleaned_data.get("document")
def get_filename(self, i=None):
return os.path.join(
settings.CONSUMPTION_DIR,
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
)
def save(self):
"""
Since the consumer already does a lot of work, it's easier just to save
@ -33,15 +28,16 @@ class UploadForm(forms.Form):
"""
document = self.cleaned_data.get("document").read()
original_filename = self.cleaned_data.get("document").name
t = int(mktime(datetime.now().timetuple()))
file_name = self.get_filename()
i = 0
while os.path.exists(file_name):
i += 1
file_name = self.get_filename(i)
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
# TODO: don't just append pdf. This is here for that weird regex check at the start of the consumer.
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
with open(file_name, "wb") as f:
f.write(document)
os.utime(file_name, times=(t, t))
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))

View File

@ -1,7 +1,6 @@
import logging
from contextlib import contextmanager
from django.db import models
from django.dispatch import receiver
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@ -9,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter
from documents.models import Document
from paperless import settings
logger = logging.getLogger(__name__)
@ -69,6 +66,9 @@ def open_index(recreate=False):
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR)
else:
# TODO: this is not thread safe. If 2 instances try to create the index
# at the same time, this fails. This currently prevents parallel
# tests.
return create_in(settings.INDEX_DIR, get_schema())
@ -99,15 +99,19 @@ def remove_document_from_index(document):
remove_document(writer, document)
@contextmanager
def query_page(ix, query, page):
with ix.searcher() as searcher:
searcher = ix.searcher()
try:
query_parser = MultifieldParser(["content", "title", "correspondent"],
ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
return result_page
yield result_page
finally:
searcher.close()
def autocomplete(ix, term, limit=10):

View File

@ -1,4 +1,5 @@
import logging
import uuid
class PaperlessHandler(logging.Handler):
@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
kwargs["group"] = record.group
Log.objects.create(**kwargs)
class LoggingMixin:
logging_group = None
def renew_logging_group(self):
self.logging_group = uuid.uuid4()
def log(self, level, message):
target = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(target)
getattr(logger, level)(message, extra={
"group": self.logging_group
})
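A minimal sketch of how the new mixin is meant to be used (the class, path
and message below are made up for illustration):

    from documents.loggers import LoggingMixin

    class ExampleParser(LoggingMixin):

        def run(self, path):
            # start a fresh group so all log entries for this file are linked
            self.renew_logging_group()
            self.log("info", "Processing {}".format(path))

    ExampleParser().run("/tmp/scan_0001.pdf")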

View File

@ -1,250 +0,0 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from dateutil import parser
from django.conf import settings
from .models import Correspondent
class MailFetcherError(Exception):
pass
class InvalidMessageError(MailFetcherError):
pass
class Loggable(object):
def __init__(self, group=None):
self.logger = logging.getLogger(__name__)
self.logging_group = group or uuid.uuid4()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
class Message(Loggable):
"""
A crude, but simple email message class. We assume that there's a subject
and n attachments, and that we don't care about the message body.
"""
SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
def __init__(self, data, group=None):
"""
Cribbed heavily from
https://www.ianlewis.org/en/parsing-email-attachments-python
"""
Loggable.__init__(self, group=group)
self.subject = None
self.time = None
self.attachment = None
message = BytesParser(policy=policy.default).parsebytes(data)
self.subject = str(message["Subject"]).replace("\r\n", "")
self.body = str(message.get_body())
self.check_subject()
self.check_body()
self._set_time(message)
self.log("info", 'Importing email: "{}"'.format(self.subject))
attachments = []
for part in message.walk():
content_disposition = part.get("Content-Disposition")
if not content_disposition:
continue
dispositions = content_disposition.strip().split(";")
if len(dispositions) < 2:
continue
if not dispositions[0].lower() == "attachment" and \
"filename" not in dispositions[1].lower():
continue
file_data = part.get_payload()
attachments.append(Attachment(
b64decode(file_data), content_type=part.get_content_type()))
if len(attachments) == 0:
raise InvalidMessageError(
"There don't appear to be any attachments to this message")
if len(attachments) > 1:
raise InvalidMessageError(
"There's more than one attachment to this message. It cannot "
"be indexed automatically."
)
self.attachment = attachments[0]
def __bool__(self):
return bool(self.attachment)
def check_subject(self):
if self.subject is None:
raise InvalidMessageError("Message does not have a subject")
if not Correspondent.SAFE_REGEX.match(self.subject):
raise InvalidMessageError("Message subject is unsafe: {}".format(
self.subject))
def check_body(self):
if self.SECRET not in self.body:
raise InvalidMessageError("The secret wasn't in the body")
def _set_time(self, message):
self.time = datetime.datetime.now()
message_time = message.get("Date")
if message_time:
try:
self.time = parser.parse(message_time)
except (ValueError, AttributeError):
pass # We assume that "now" is ok
@property
def file_name(self):
return "{}.{}".format(self.subject, self.attachment.suffix)
class Attachment(object):
SAFE_SUFFIX_REGEX = re.compile(
r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")
def __init__(self, data, content_type):
self.content_type = content_type
self.data = data
self.suffix = None
m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
if not m:
raise MailFetcherError(
"Not-awesome file type: {}".format(self.content_type))
self.suffix = m.group(2) or m.group(4)
def read(self):
return self.data
class MailFetcher(Loggable):
def __init__(self, consume=settings.CONSUMPTION_DIR):
Loggable.__init__(self)
self._connection = None
self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
self._enabled = bool(self._host)
if self._enabled and Message.SECRET is None:
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
self.last_checked = time.time()
self.consume = consume
def pull(self):
"""
Fetch all available mail at the target address and store it locally in
the consumption directory so that the file consumer can pick it up and
do its thing.
"""
if self._enabled:
# Reset the grouping id for each fetch
self.logging_group = uuid.uuid4()
self.log("debug", "Checking mail")
for message in self._get_messages():
self.log("info", 'Storing email: "{}"'.format(message.subject))
t = int(time.mktime(message.time.timetuple()))
file_name = os.path.join(self.consume, message.file_name)
with open(file_name, "wb") as f:
f.write(message.attachment.data)
os.utime(file_name, times=(t, t))
self.last_checked = time.time()
def _get_messages(self):
r = []
try:
self._connect()
self._login()
for message in self._fetch():
if message:
r.append(message)
self._connection.expunge()
self._connection.close()
self._connection.logout()
except MailFetcherError as e:
self.log("error", str(e))
return r
def _connect(self):
try:
self._connection = imaplib.IMAP4_SSL(self._host, self._port)
except OSError as e:
msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
raise MailFetcherError(msg)
def _login(self):
login = self._connection.login(self._username, self._password)
if not login[0] == "OK":
raise MailFetcherError("Can't log into mail: {}".format(login[1]))
inbox = self._connection.select(self._inbox)
if not inbox[0] == "OK":
raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
def _fetch(self):
for num in self._connection.search(None, "ALL")[1][0].split():
__, data = self._connection.fetch(num, "(RFC822)")
message = None
try:
message = Message(data[0][1], self.logging_group)
except InvalidMessageError as e:
self.log("error", str(e))
else:
self._connection.store(num, "+FLAGS", "\\Deleted")
if message:
yield message

View File

@ -3,11 +3,10 @@ import os
from django.conf import settings
from django.core.management.base import BaseCommand
from watchdog.observers import Observer
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from documents.consumer import Consumer
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver
try:
from inotify_simple import INotify, flags
@ -17,17 +16,25 @@ except ImportError:
class Handler(FileSystemEventHandler):
def __init__(self, consumer):
self.consumer = consumer
def _consume(self, file):
if os.path.isfile(file):
try:
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
except Exception as e:
# Catch all so that the consumer won't crash.
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
def on_created(self, event):
self.consumer.try_consume_file(event.src_path)
self._consume(event.src_path)
def on_moved(self, event):
self._consume(event.src_path)
class Command(BaseCommand):
"""
On every iteration of an infinite loop, consume what we can from the
consumption directory, and fetch any mail available.
consumption directory.
"""
def __init__(self, *args, **kwargs):
@ -35,12 +42,6 @@ class Command(BaseCommand):
self.verbosity = 0
self.logger = logging.getLogger(__name__)
self.file_consumer = None
self.mail_fetcher = None
self.first_iteration = True
self.consumer = Consumer()
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
@ -56,9 +57,6 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
directory = options["directory"]
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
os.makedirs(d, exist_ok=True)
logging.getLogger(__name__).info(
"Starting document consumer at {}".format(
directory
@ -68,11 +66,16 @@ class Command(BaseCommand):
# Consume all files as this is not done initially by the watchdog
for entry in os.scandir(directory):
if entry.is_file():
self.consumer.try_consume_file(entry.path)
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
# Start the watchdog. Woof!
observer = Observer()
event_handler = Handler(self.consumer)
if settings.CONSUMER_POLLING > 0:
logging.getLogger(__name__).info('Using polling instead of file'
'system notifications.')
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
else:
observer = Observer()
event_handler = Handler()
observer.schedule(event_handler, directory, recursive=True)
observer.start()
try:

View File

@ -1,4 +1,5 @@
from django.core.management.base import BaseCommand
from ...mixins import Renderable
from ...tasks import train_classifier

View File

@ -1,16 +1,15 @@
import json
import os
import time
import shutil
import time
from django.core.management.base import BaseCommand, CommandError
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document, Correspondent, Tag, DocumentType
from paperless.db import GnuPG
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...mixins import Renderable
class Command(Renderable, BaseCommand):

View File

@ -3,15 +3,14 @@ import os
import shutil
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from paperless.db import GnuPG
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
@ -82,6 +81,10 @@ class Command(Renderable, BaseCommand):
def _import_files_from_manifest(self):
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
for record in self.manifest:
if not record["model"] == "documents.document":
@ -94,6 +97,14 @@ class Command(Renderable, BaseCommand):
document_path = os.path.join(self.source, doc_file)
thumbnail_path = os.path.join(self.source, thumb_file)
document.storage_type = storage_type
document.filename = generate_filename(document)
if os.path.isfile(document.source_path):
raise FileExistsError(document.source_path)
create_source_path_directory(document.source_path)
if settings.PASSPHRASE:
with open(document_path, "rb") as unencrypted:
@ -109,18 +120,8 @@ class Command(Renderable, BaseCommand):
encrypted.write(GnuPG.encrypted(unencrypted))
else:
print("Moving {} to {}".format(document_path, document.source_path))
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)
# Reset the storage type to whatever we've used while importing
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
Document.objects.filter(
pk__in=[r["pk"] for r in self.manifest]
).update(
storage_type=storage_type
)
document.save()

View File

@ -8,5 +8,5 @@ class Command(BaseCommand):
help = "A quick & dirty way to see what's in the logs"
def handle(self, *args, **options):
for l in Log.objects.order_by("pk"):
print(l)
for log in Log.objects.order_by("pk"):
print(log)

View File

@ -1,7 +1,6 @@
from django.core.management.base import BaseCommand
from documents.models import Document, Tag
from documents.models import Document
from ...mixins import Renderable

View File

@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier):
correspondents = Correspondent.objects.all()
predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
return matched_correspondents
return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
def match_document_types(document_content, classifier):
document_types = DocumentType.objects.all()
predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
matched_document_types = [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
return matched_document_types
return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
def match_tags(document_content, classifier):

View File

@ -1,7 +1,4 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import os
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion

View File

@ -9,11 +9,11 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
def remove_schedules(apps, schema_editor):
Schedule.objects.all().delete()
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
class Migration(migrations.Migration):

View File

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2020-11-11 11:05
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1001_auto_20201109_1636'),
]
operations = [
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True),
),
]

View File

@ -3,18 +3,15 @@
import logging
import os
import re
from collections import OrderedDict, defaultdict
from collections import OrderedDict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.dispatch import receiver
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
class MatchingModel(models.Model):
MATCH_ANY = 1
@ -116,6 +113,7 @@ class DocumentType(MatchingModel):
class Document(models.Model):
# TODO: why do we need an explicit list
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
@ -192,7 +190,7 @@ class Document(models.Model):
default=timezone.now, editable=False, db_index=True)
filename = models.FilePathField(
max_length=256,
max_length=1024,
editable=False,
default=None,
null=True,
@ -220,123 +218,18 @@ class Document(models.Model):
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
def find_renamed_document(self, subdirectory=""):
suffix = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
suffix += ".gpg"
# Go up in the directory hierarchy and try to delete all directories
root = os.path.normpath(Document.filename_to_path(subdirectory))
for filename in os.listdir(root):
if filename.endswith(suffix):
return os.path.join(subdirectory, filename)
fullname = os.path.join(subdirectory, filename)
if os.path.isdir(Document.filename_to_path(fullname)):
return self.find_renamed_document(fullname)
return None
@property
def source_filename(self):
# Initial filename generation (for new documents)
if self.filename is None:
self.filename = self.generate_source_filename()
# Check if document is still available under filename
elif not os.path.isfile(Document.filename_to_path(self.filename)):
recovered_filename = self.find_renamed_document()
# If we have found the file so update the filename
if recovered_filename is not None:
logger = logging.getLogger(__name__)
logger.warning("Filename of document " + str(self.id) +
" has changed and was successfully updated")
self.filename = recovered_filename
# Remove all empty subdirectories from MEDIA_ROOT
Document.delete_all_empty_subdirectories(
Document.filename_to_path(""))
else:
logger = logging.getLogger(__name__)
logger.error("File of document " + str(self.id) + " has " +
"gone and could not be recovered")
return self.filename
@staticmethod
def many_to_dictionary(field):
# Converts ManyToManyField to dictionary by assuming, that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
if delimiter == -1:
delimiter = t.name.find('-')
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter+1:]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_source_filename(self):
# Create filename based on configured format
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
self.many_to_dictionary(self.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
correspondent=slugify(self.correspondent),
title=slugify(self.title),
created=slugify(self.created),
added=slugify(self.added),
tags=tags)
else:
path = ""
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
else:
filename = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
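To make the format-string mechanics above concrete, a small hedged example with invented values (the setting shown is illustrative, not a default):

# Illustrative only; the format string and document values are made up.
from django.utils.text import slugify

PAPERLESS_FILENAME_FORMAT = "{correspondent}/{title}"

path = PAPERLESS_FILENAME_FORMAT.format(
    correspondent=slugify("ACME Corp"),
    title=slugify("Invoice March 2020"),
)
pk, file_type = 42, "pdf"
filename = "%s-%07i.%s" % (path, pk, file_type)
# filename == "acme-corp/invoice-march-2020-0000042.pdf"
# With GPG storage, ".gpg" would additionally be appended.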
def create_source_directory(self):
new_filename = self.generate_source_filename()
# Determine the full "target" path
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
# Create new path
os.makedirs(dir_new, exist_ok=True)
@property
def source_path(self):
return Document.filename_to_path(self.source_filename)
if self.filename:
fname = str(self.filename)
else:
fname = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg"
@staticmethod
def filename_to_path(filename):
return os.path.join(
settings.ORIGINALS_DIR,
filename
fname
)
@property
@ -362,125 +255,6 @@ class Document(models.Model):
def thumbnail_file(self):
return open(self.thumbnail_path, "rb")
def set_filename(self, filename):
if os.path.isfile(Document.filename_to_path(filename)):
self.filename = filename
@staticmethod
def try_delete_empty_directories(directory):
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(Document.filename_to_path(""))
while directory != root:
# Try to delete the current directory
try:
os.rmdir(directory)
except os.error:
# Directory not empty, no need to go further up
return
# Cut off actual directory and go one level up
directory, _ = os.path.split(directory)
directory = os.path.normpath(directory)
@staticmethod
def delete_all_empty_subdirectories(directory):
# Go through all folders and try to delete all directories
root = os.path.normpath(Document.filename_to_path(directory))
for filename in os.listdir(root):
fullname = os.path.join(directory, filename)
if not os.path.isdir(Document.filename_to_path(fullname)):
continue
# Go into the subdirectory to see if there is more to delete
Document.delete_all_empty_subdirectories(
os.path.join(directory, filename))
# Try to delete the directory
try:
os.rmdir(Document.filename_to_path(fullname))
continue
except os.error:
# Directory not empty, nothing to do here
continue
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename(sender, instance, **kwargs):
# Skip if document has not been saved yet
if instance.filename is None:
return
# Check if the file exists and update the filename otherwise
if not os.path.isfile(Document.filename_to_path(instance.filename)):
instance.filename = instance.source_filename
# Build the new filename
new_filename = instance.generate_source_filename()
# If the filename is the same, then nothing needs to be done
if instance.filename == new_filename:
return
# Determine the full "target" path
path_new = instance.filename_to_path(new_filename)
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
# Create new path
instance.create_source_directory()
# Determine the full "current" path
path_current = instance.filename_to_path(instance.source_filename)
# Move file
try:
os.rename(path_current, path_new)
except PermissionError:
# Do not update filename in object
return
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.error("Renaming of document " + str(instance.id) + " failed " +
"as file " + instance.filename + " was no longer present")
return
# Delete empty directory
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
instance.filename = new_filename
# Save instance
# This will not cause a cascade of post_save signals, as next time
# nothing needs to be renamed
instance.save()
@receiver(models.signals.post_delete, sender=Document)
def delete_files(sender, instance, **kwargs):
if instance.filename is None:
return
# Remove the document
old_file = instance.filename_to_path(instance.filename)
try:
os.remove(old_file)
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.warning("Deleted document " + str(instance.id) + " but file " +
old_file + " was no longer present")
# And remove the directory (if applicable)
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
class Log(models.Model):
@ -518,7 +292,7 @@ class FileInfo:
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
# TODO: what is this used for?
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(

View File

@ -20,13 +20,16 @@ from django.utils import timezone
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
# TODO: isn't there a date parsing library for this?
DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
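For illustration, a hedged sketch of the kind of strings this regex is meant to pick up (the sample inputs are invented):

# Illustrative only; assumes DATE_REGEX is importable from documents.parsers.
from documents.parsers import DATE_REGEX

samples = [
    "invoice_03.02.2018_scan.pdf",   # DD.MM.YYYY between delimiters
    "2018-02-03 report",             # YYYY-MM-DD
    "3. February 2018",              # D. MONTH YYYY
    "February 3, 2018",              # MONTH D, YYYY
]
for text in samples:
    match = DATE_REGEX.search(text)
    print(text, "->", match.group(0) if match else None)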
@ -39,17 +42,16 @@ def get_parser_class(doc):
Determine the appropriate parser class based on the file
"""
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
# TODO: add a check for parser availability.
options = []
for parser in parsers:
result = parser(doc)
if result:
options.append(result)
# His last command was: COME! And they came. All of them. Even the parsers.
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
parser_test = parser_declaration["test"]
if parser_test(doc):
options.append(parser_declaration)
if not options:
return None
@ -59,7 +61,7 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
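For context, a hedged sketch of what a parser app's declaration handler could return under this scheme; the class name and file check are illustrative, the real declarations live in the parser apps (e.g. paperless_tesseract):

# Illustrative sketch only; not the actual declaration shipped with any parser app.
def example_consumer_declaration(sender, **kwargs):
    return {
        "parser": ExampleDocumentParser,                    # a DocumentParser subclass (hypothetical)
        "weight": 0,                                        # highest weight wins if several tests pass
        "test": lambda doc: doc.lower().endswith(".pdf"),   # cheap check against the file path
    }

# Typically connected in the app's ready() hook:
# document_consumer_declaration.connect(example_consumer_declaration)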
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False
args += ['-trim'] if trim else []
args += ['-type', str(type)] if type else []
args += ['-depth', str(depth)] if depth else []
args += [input, output]
args += [input_file, output_file]
logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
@ -100,17 +102,17 @@ class ParseError(Exception):
pass
class DocumentParser:
class DocumentParser(LoggingMixin):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group, progress_callback):
super().__init__()
self.logging_group = logging_group
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__)
self.logging_group = logging_group
self.progress_callback = progress_callback
def get_thumbnail(self):
@ -121,16 +123,19 @@ class DocumentParser:
def optimise_thumbnail(self, in_path):
out_path = os.path.join(self.tempdir, "optipng.png")
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
self.log('debug', 'Execute: ' + " ".join(args))
self.log('debug', 'Execute: ' + " ".join(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
return out_path
return out_path
else:
return in_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
@ -222,11 +227,6 @@ class DocumentParser:
return date
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir)
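The removed log() helper now comes from LoggingMixin (imported above from documents.loggers). A minimal sketch of such a mixin, assuming it only attaches the logging_group as a log-record extra:

# Minimal sketch, not the actual documents/loggers.py implementation.
import logging
import uuid


class LoggingMixin:

    logging_group = None

    def renew_logging_group(self):
        # A fresh group id ties together all log records of one consumption run.
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        name = ".".join([self.__class__.__module__, self.__class__.__name__])
        logger = logging.getLogger(name)
        getattr(logger, level)(message, extra={"group": self.logging_group})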

View File

@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer):
class LogSerializer(serializers.ModelSerializer):
class Meta:
model = Log
fields = (

View File

@ -1,5 +1,5 @@
from django.dispatch import Signal
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
document_consumer_declaration = Signal(providing_args=[])
document_consumption_started = Signal()
document_consumption_finished = Signal()
document_consumer_declaration = Signal()
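providing_args was purely documentational and is deprecated in Django 3.x, so the bare Signal() declarations behave the same. Receivers and senders keep passing the keyword arguments explicitly, roughly like this (the receiver below is made up):

# Illustrative receiver; the keyword arguments mirror what the removed providing_args documented.
from django.dispatch import receiver
from documents.signals import document_consumption_finished


@receiver(document_consumption_finished)
def on_document_consumed(sender, document=None, **kwargs):
    print("consumed:", document)

# Senders still name their arguments explicitly, e.g.:
# document_consumption_finished.send(sender=SomeConsumer, document=doc, logging_group=group)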

View File

@ -6,9 +6,13 @@ from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.dispatch import receiver
from django.utils import timezone
from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
create_source_path_directory
from ..models import Document, Tag
@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs):
)).wait()
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
if not isinstance(instance, Document):
return
for f in (instance.source_path, instance.thumbnail_path):
try:
os.unlink(f)
except FileNotFoundError:
pass # The file's already gone, so we're cool with it.
delete_empty_directories(os.path.dirname(instance.source_path))
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename_and_move_files(sender, instance, **kwargs):
if not instance.filename:
# Can't update the filename if there is no filename to begin with.
# This happens after the consumer creates a new document.
# The PK needs to be set first by saving the document once. When this
# happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
# renamed anyway. In all other cases, instance.filename will be set.
return
old_filename = instance.filename
old_path = instance.source_path
new_filename = generate_filename(instance)
if new_filename == instance.filename:
# Don't do anything if it's the same.
return
new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
return
if os.path.isfile(new_path):
# Can't do anything if the new file already exists; skip updating the file.
logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
return
create_source_path_directory(new_path)
try:
os.rename(old_path, new_path)
instance.filename = new_filename
instance.save()
except OSError as e:
instance.filename = old_filename
except DatabaseError as e:
os.rename(new_path, old_path)
instance.filename = old_filename
if not os.path.isfile(old_path):
delete_empty_directories(os.path.dirname(old_path))
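The file_handling helpers imported above are not part of this hunk; a rough, non-authoritative sketch of what they plausibly do, for orientation only:

# Rough sketch of the imported helpers; the real code lives in documents/file_handling.py.
import os

from django.conf import settings


def create_source_path_directory(source_path):
    # Make sure the target directory exists before renaming into it.
    os.makedirs(os.path.dirname(source_path), exist_ok=True)


def delete_empty_directories(directory):
    # Climb upwards from `directory`, removing empty folders, but never leave ORIGINALS_DIR.
    root = os.path.abspath(settings.ORIGINALS_DIR)
    directory = os.path.abspath(directory)
    while directory.startswith(root) and directory != root:
        try:
            os.rmdir(directory)
        except OSError:
            break  # not empty (or not removable); stop climbing
        directory = os.path.dirname(directory)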
def set_log_entry(sender, document=None, logging_group=None, **kwargs):

Some files were not shown because too many files have changed in this diff