Merge branch 'dev' into celery-tasks
@ -1,3 +1,4 @@
|
||||
/src-ui/.vscode
|
||||
/src-ui/node_modules
|
||||
/src-ui/dist
|
||||
.git
|
||||
@ -5,3 +6,7 @@
|
||||
/consume
|
||||
/media
|
||||
/data
|
||||
/docs
|
||||
.pytest_cache
|
||||
/dist
|
||||
/scripts
|
||||
|
4
.gitignore
vendored
@ -66,6 +66,7 @@ target/
|
||||
virtualenv
|
||||
/venv
|
||||
docker-compose.env
|
||||
docker-compose.yml
|
||||
|
||||
# Used for development
|
||||
scripts/import-for-development
|
||||
@ -86,3 +87,6 @@ scripts/nuke
|
||||
/consume
|
||||
/export
|
||||
/src-ui/.vscode
|
||||
|
||||
# this is where the compiled frontend is moved to.
|
||||
/src/documents/static/frontend/
|
||||
|
60
.travis.yml
@ -1,56 +1,22 @@
|
||||
language: python
|
||||
|
||||
python:
|
||||
- "3.6"
|
||||
- "3.7"
|
||||
- "3.8"
|
||||
|
||||
before_install:
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
|
||||
|
||||
sudo: false
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- python: "3.5"
|
||||
- python: "3.6"
|
||||
- python: "3.7-dev"
|
||||
- env:
|
||||
- BUILD_DOCKER=1
|
||||
# Variable to add to publish the Docker image:
|
||||
# * DOCKER_USERNAME
|
||||
# * DOCKER_PASSWORD, to be encrypted, use `travis encrypt DOCKER_PASSWORD=<password>`
|
||||
services:
|
||||
- docker
|
||||
before_install:
|
||||
- true
|
||||
install:
|
||||
- true
|
||||
script:
|
||||
- docker build --tag=the-paperless-project/paperless .
|
||||
after_success:
|
||||
- true
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
|
||||
|
||||
install:
|
||||
- pip install --upgrade pip pipenv sphinx
|
||||
- pipenv lock -r > requirements.txt
|
||||
- pip install -r requirements.txt
|
||||
- pip install --upgrade pipenv
|
||||
- pipenv install --system --dev
|
||||
|
||||
script:
|
||||
- cd src/
|
||||
- pytest --cov
|
||||
- pycodestyle
|
||||
- sphinx-build -b html ../docs ../docs/_build -W
|
||||
- cd src/
|
||||
- pipenv run pytest --cov
|
||||
- pipenv run pycodestyle
|
||||
|
||||
after_success:
|
||||
- coveralls
|
||||
|
||||
deploy:
|
||||
- provider: script
|
||||
skip_cleanup: true
|
||||
script: ci/deploy-docker
|
||||
on:
|
||||
tags: true
|
||||
condition: '"${BUILD_DOCKER}" = 1'
|
||||
- provider: script
|
||||
skip_cleanup: true
|
||||
script: ci/deploy-docker
|
||||
on:
|
||||
branch: master
|
||||
condition: '"${BUILD_DOCKER}" = 1'
|
||||
- pipenv run coveralls
|
||||
|
82
Dockerfile
@ -1,82 +0,0 @@
|
||||
###############################################################################
|
||||
### Front end ###
|
||||
###############################################################################
|
||||
|
||||
FROM node:current AS frontend
|
||||
|
||||
WORKDIR /usr/src/paperless/src-ui/
|
||||
|
||||
COPY src-ui/package* ./
|
||||
RUN npm install
|
||||
|
||||
COPY src-ui .
|
||||
RUN node_modules/.bin/ng build --prod --output-hashing none --sourceMap=false
|
||||
|
||||
###############################################################################
|
||||
### Back end ###
|
||||
###############################################################################
|
||||
|
||||
FROM ubuntu:20.04
|
||||
|
||||
WORKDIR /usr/src/paperless/
|
||||
|
||||
COPY Pipfile* ./
|
||||
|
||||
#Dependencies
|
||||
RUN apt-get update \
|
||||
&& DEBIAN_FRONTEND="noninteractive" apt-get -y --no-install-recommends install \
|
||||
build-essential \
|
||||
curl \
|
||||
ghostscript \
|
||||
gnupg \
|
||||
imagemagick \
|
||||
libmagic-dev \
|
||||
libpoppler-cpp-dev \
|
||||
libpq-dev \
|
||||
optipng \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
sudo \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-fra \
|
||||
tesseract-ocr-ita \
|
||||
tesseract-ocr-spa \
|
||||
tzdata \
|
||||
unpaper \
|
||||
&& pip3 install --upgrade pipenv supervisor setuptools \
|
||||
&& pipenv install --system --deploy \
|
||||
&& pipenv --clear \
|
||||
&& apt-get -y purge build-essential python3-pip python3-dev \
|
||||
&& apt-get -y autoremove --purge \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir /var/log/supervisord /var/run/supervisord
|
||||
|
||||
# copy scripts
|
||||
# this fixes issues with imagemagick and PDF
|
||||
COPY scripts/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
|
||||
COPY scripts/gunicorn.conf.py ./
|
||||
COPY scripts/supervisord.conf /etc/supervisord.conf
|
||||
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||
|
||||
# copy app
|
||||
COPY src/ ./src/
|
||||
COPY --from=frontend /usr/src/paperless/src-ui/dist/paperless-ui/ ./src/documents/static/frontend/
|
||||
|
||||
# add users, setup scripts
|
||||
RUN addgroup --gid 1000 paperless \
|
||||
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
|
||||
&& chown -R paperless:paperless . \
|
||||
&& chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
WORKDIR /usr/src/paperless/src/
|
||||
|
||||
RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input
|
||||
|
||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
|
||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||
CMD ["supervisord", "-c", "/etc/supervisord.conf"]
|
||||
|
||||
LABEL maintainer="Jonas Winkler <dev@jpwinkler.de>"
|
11
Pipfile
@ -3,6 +3,11 @@ url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[[source]]
|
||||
url = "https://www.piwheels.org/simple"
|
||||
verify_ssl = true
|
||||
name = "piwheels"
|
||||
|
||||
[packages]
|
||||
django = "~=3.1"
|
||||
pillow = "*"
|
||||
@ -21,7 +26,7 @@ psycopg2-binary = "*"
|
||||
scikit-learn="~=0.23"
|
||||
whoosh="~=2.7"
|
||||
gunicorn = "*"
|
||||
whitenoise = "*"
|
||||
whitenoise = "~=5.2"
|
||||
fuzzywuzzy = "*"
|
||||
python-Levenshtein = "*"
|
||||
django-extensions = "*"
|
||||
@ -29,6 +34,7 @@ watchdog = "*"
|
||||
pathvalidate = "*"
|
||||
django-q = "*"
|
||||
redis = "*"
|
||||
imap-tools = "*"
|
||||
channels = "~=3.0"
|
||||
channels-redis = "*"
|
||||
daphne = "~=3.0"
|
||||
@ -36,7 +42,7 @@ daphne = "~=3.0"
|
||||
[dev-packages]
|
||||
coveralls = "*"
|
||||
factory-boy = "*"
|
||||
sphinx = "*"
|
||||
sphinx = "~=3.3"
|
||||
tox = "*"
|
||||
pycodestyle = "*"
|
||||
pytest = "*"
|
||||
@ -45,3 +51,4 @@ pytest-django = "*"
|
||||
pytest-sugar = "*"
|
||||
pytest-env = "*"
|
||||
pytest-xdist = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
59
README.md
@ -1,3 +1,9 @@
|
||||
[](https://travis-ci.org/jonaswinkler/paperless-ng)
|
||||
[](https://paperless-ng.readthedocs.io/en/latest/?badge=latest)
|
||||
[](https://hub.docker.com/r/jonaswinkler/paperless-ng)
|
||||
|
||||
# Paperless-ng
|
||||
|
||||
[Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents.
|
||||
|
||||
Paperless-ng is a fork of the original project, adding a new interface and many other changes under the hood. For a detailed list of changes, see below.
|
||||
@ -22,35 +28,16 @@ Here's what you get:
|
||||
|
||||
I wanted to make big changes to the project that will impact the way it is used by its users greatly. Among the users who currently use paperless in production there are probably many that don't want these changes right away. I also wanted to have more control over what goes into the code and what does not. Therefore, paperless-ng was created. NG stands for both Angular (the framework used for the Frontend) and next-gen. Publishing this project under a different name also avoids confusion between paperless and paperless-ng.
|
||||
|
||||
This is a list of changes that have been made to the original project.
|
||||
The gist of the changes is the following:
|
||||
|
||||
## Added
|
||||
- **A new single page UI** built with bootstrap and Angular. Its much more responsive than the django admin pages. It features the follwing improvements over the old django admin interface:
|
||||
- *Dashboard.* The landing page shows some useful information, such as statistics, recently scanned documents, file uploading, and possibly more in the future.
|
||||
- *Document uploading on the web page.* This is very crude right now, but gets the job done. It simply uploads the documents and stores them in the configured consumer directory. The API for that has always been in the project, there simply was no form on the UI to support it.
|
||||
- *Full text search* with a proper document indexer: The search feature sorts documents by relevance to the search query, highlights query terms in the found documents and provides autocomplete while typing the query. This is still very basic but will see extensions in the future.
|
||||
- *Saveable filters.* Save filter and sorting presets and optionally display a couple documents of saved filters (i.e., your inbox sorted descending by added date, or tagged TODO, oldest to newest) on the dash board.
|
||||
- *Statistics.* Provides basic statistics about your document collection.
|
||||
- **Document types.** Similar to correspondents, each document may have a type (i.e., invoice, letter, receipt, bank statement, ...). I've initially intented to use this for some individual processing of differently typed documents, however, no such features exists yet.
|
||||
- **Inbox tags.** These tags are automatically assigned to every newly scanned document. They are intented to be removed once you have manually edited the meta data of a document.
|
||||
- **Automatic matching** for document types, correspondents, and tags. A new matching algorithm has been implemented (Auto), which is based on a classification model (simple feed forward neural nets are used). This classifier is trained on your document collection and learns to assign metadata to new documents based on their similiarity to existing documents.
|
||||
- If, for example, all your bank statements for a specific account are tagged with "bank_account_1234" and the matching algorithm of that tag is set to Auto, the classifier learns relevant phrases and words in the documents and assigns this tag automatically to newly scanned and matching documents.
|
||||
- This works reasonably well, if there is a correlation between the tag and the content of the document. Tags such as 'TODO' or 'Contact Correspondent' cannot be assigned automatically.
|
||||
- **Archive serial numbers.** These are there to support the recommended workflow for storing physical copies of very important documents. The idea is that if a document has to be kept in physical form, you write a running number on the document before scanning (the archive serial number) and keep these documents sorted by number in a binder. If you need to access a specific physical document at some point in time, search for the document in paperless, identify the ASN and grab the document.
|
||||
* New front end. This will eventually be mobile friendly as well.
|
||||
* New full text search.
|
||||
* Machine learning powered document matching.
|
||||
* Code cleanup in many, MANY areas.
|
||||
|
||||
## Modified
|
||||
- **(BREAKING) REST API changes.** In order to support the new UI, changes had to be made to the API. Some filters are not available anymore, other filters were added. Furthermore, foreign key relationships are not expressed with URLs anymore, but with their respective ids. Also, the urls for fetching documents and thumbnails have changed. Redirects are in place to support the old urls.
|
||||
If you want to see some screenshots of paperless-ng in action, [some are available in the documentation](https://paperless-ng.readthedocs.io/en/latest/screenshots.html).
|
||||
|
||||
## Internal changes
|
||||
- Many improvements to the code. More concise logging of the consumer, better multithreading of the tesseract parser for large documents, less hacks overall.
|
||||
- Updated docker image. This image runs everything in a single container. (Except the optional database, of course)
|
||||
|
||||
## Removed
|
||||
|
||||
These features were removed each due to two reasons. First, I did not feel these features contributed all that much to the over project, and second, I don't want to maintain these features.
|
||||
|
||||
- **(BREAKING) Reminders.** I have no idea what they were used for and thus removed them from the project.
|
||||
- **Every customization made to the admin interface.** Since this is not the primary interface for the application anymore, there is no need to keep and maintain these. Besides, some changes were incompatible with the most recent versions of django. The interface is completely usable, though.
|
||||
For a complete list of changes, check out the [changelog](https://paperless-ng.readthedocs.io/en/latest/changelog.html)
|
||||
|
||||
## Planned
|
||||
|
||||
@ -61,6 +48,7 @@ These features will make it into the application at some point, sorted by priori
|
||||
- Ability to search for “Similar documents” in the search results
|
||||
- Provide corrections for mispelled queries
|
||||
- **More robust consumer** that shows its progress on the web page.
|
||||
- **More rigid email processing**. Like, dont delete imported mail, provide filters, etc...
|
||||
- **Arbitrary tag colors**. Allow the selection of any color with a color picker.
|
||||
|
||||
## On the chopping block.
|
||||
@ -68,39 +56,34 @@ These features will make it into the application at some point, sorted by priori
|
||||
I don't know if these features are used all that much. I don't exactly know how they work and will probably remove them at some point in the future.
|
||||
|
||||
- **GnuPG encrypion.** Since its disabled by default and the website allows transparent access to encrypted documents anyway, this doesn’t really provide any benefit over having the application stored on an encrypted file system.
|
||||
- **E-Mail scanning.** I don’t use it and don’t know the state of the implementation. I’ll have to look into that.
|
||||
|
||||
# Getting started
|
||||
|
||||
The recommended way to deploy paperless is docker-compose.
|
||||
The recommended way to deploy paperless is docker-compose. Use the provided docker-compose.yml files to get started. This pulls the image from Docker hub. Alternatively, you can build the image yourself.
|
||||
|
||||
git clone https://github.com/jonaswinkler/paperless
|
||||
cd paperless
|
||||
cp docker-compose.yml.example docker-compose.yml
|
||||
cp docker-compose.env.example docker-compose.env
|
||||
docker-compose up -d
|
||||
|
||||
Please be aware that this uses a postgres database instead of sqlite. If you want to continue using sqlite, remove the database-related options from the docker-compose.env file.
|
||||
Read the [documentation](https://paperless-ng.readthedocs.io/en/latest/setup.html#installation) on how to get started.
|
||||
|
||||
Alternatively, you can install the dependencies and setup apache and a database server yourself. Details for that will be available in the documentation.
|
||||
|
||||
# Migrating to paperless-ng
|
||||
|
||||
Don't do it yet. The migrations are in place, but I have not verified yet that they work.
|
||||
Read the section about [migration](https://paperless-ng.readthedocs.io/en/latest/setup.html#migration-to-paperless-ng) in the documentation.
|
||||
|
||||
# Documentation
|
||||
|
||||
The documentation for Paperless is available on [ReadTheDocs](https://paperless-ng.readthedocs.io/). Updated documentation for this project is not yet available.
|
||||
The documentation for Paperless-ng is available on [ReadTheDocs](https://paperless-ng.readthedocs.io/).
|
||||
|
||||
# Affiliated Projects
|
||||
|
||||
Paperless has been around a while now, and people are starting to build stuff on top of it. If you're one of those people, we can add your project to this list:
|
||||
|
||||
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless. This app is not compatible at this point.
|
||||
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless.
|
||||
* [Paperless Desktop](https://github.com/thomasbrueggemann/paperless-desktop): A desktop UI for your Paperless installation. Runs on Mac, Linux, and Windows.
|
||||
* [ansible-role-paperless](https://github.com/ovv/ansible-role-paperless): An easy way to get Paperless running via Ansible.
|
||||
* [paperless-cli](https://github.com/stgarf/paperless-cli): A golang command line binary to interact with a Paperless instance.
|
||||
|
||||
Compatibility with Paperless-ng is unknown.
|
||||
|
||||
# Important Note
|
||||
|
||||
Document scanners are typically used to scan sensitive documents. Things like your social insurance number, tax records, invoices, etc. Everything is stored in the clear without encryption by default (it needs to be searchable, so if someone has ideas on how to do that on encrypted data, I'm all ears). This means that Paperless should never be run on an untrusted host. Instead, I recommend that if you do want to use it, run it locally on a server in your own home.
|
||||
|
@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ "${DOCKER_USERNAME}" == "" -o "${DOCKER_PASSWORD}" == "" ]
|
||||
then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
docker login --username=${DOCKER_USERNAME} --password=${DOCKER_PASSWORD}
|
||||
if [ "${TRAVIS_TAG}" != "" ]
|
||||
then
|
||||
docker tag the-paperless-project/paperless the-paperless-project/paperless:${TRAVIS_TAG}
|
||||
docker push the-paperless-project/paperless:${TRAVIS_TAG}
|
||||
else
|
||||
docker push the-paperless-project/paperless
|
||||
fi
|
@ -24,16 +24,16 @@
|
||||
|
||||
# Adjust this key if you plan to make paperless available publicly. It should
|
||||
# be a very long sequence of random characters. You don't need to remember it.
|
||||
#PAPERLESS_SECRET_KEY="change-me"
|
||||
#PAPERLESS_SECRET_KEY=change-me
|
||||
|
||||
# Use this variable to set a timezone for the Paperless Docker containers. If not specified, defaults to UTC.
|
||||
#PAPERLESS_TIME_ZONE=America/Los_Angeles
|
||||
|
||||
# The default language to use for OCR. Set this to the language most of your
|
||||
# documents are written in.
|
||||
#PAPERLESS_OCR_LANGUAGE="eng"
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
|
||||
# By default Paperless does not OCR a document if the text can be retrieved from
|
||||
# the document directly. Set to true to always OCR documents. (i.e., if you
|
||||
# know that some of your documents have faulty/bad OCR data)
|
||||
#PAPERLESS_OCR_ALWAYS="true"
|
||||
#PAPERLESS_OCR_ALWAYS=true
|
@ -86,5 +86,9 @@ if [[ ! -z "$PAPERLESS_OCR_LANGUAGES" ]]; then
|
||||
install_languages "$PAPERLESS_OCR_LANGUAGES"
|
||||
fi
|
||||
|
||||
exec "$@"
|
||||
if [[ "$1" != "/"* ]]; then
|
||||
exec sudo -HEu paperless python3 manage.py "$@"
|
||||
else
|
||||
exec "$@"
|
||||
fi
|
||||
|
@ -1,12 +1,12 @@
|
||||
version: "3.4"
|
||||
services:
|
||||
broker:
|
||||
image: redis:latest
|
||||
#restart: always
|
||||
image: redis:6.0
|
||||
restart: always
|
||||
|
||||
db:
|
||||
image: postgres:13
|
||||
#restart: always
|
||||
restart: always
|
||||
volumes:
|
||||
- pgdata:/var/lib/postgresql/data
|
||||
environment:
|
||||
@ -15,10 +15,11 @@ services:
|
||||
POSTGRES_PASSWORD: paperless
|
||||
|
||||
webserver:
|
||||
image: paperless-ng:latest
|
||||
#restart: always
|
||||
image: jonaswinkler/paperless-ng:0.9.1
|
||||
restart: always
|
||||
depends_on:
|
||||
- db
|
||||
- broker
|
||||
ports:
|
||||
- 8000:8000
|
||||
healthcheck:
|
||||
@ -35,7 +36,6 @@ services:
|
||||
environment:
|
||||
PAPERLESS_REDIS: redis://broker:6379
|
||||
PAPERLESS_DBHOST: db
|
||||
command: ["supervisord", "-c", "/etc/supervisord.conf"]
|
||||
|
||||
|
||||
volumes:
|
31
docker/hub/docker-compose.sqlite.yml
Normal file
@ -0,0 +1,31 @@
|
||||
version: "3.4"
|
||||
services:
|
||||
broker:
|
||||
image: redis:6.0
|
||||
restart: always
|
||||
|
||||
webserver:
|
||||
image: jonaswinkler/paperless-ng:0.9.1
|
||||
restart: always
|
||||
depends_on:
|
||||
- broker
|
||||
ports:
|
||||
- 8000:8000
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
- ./export:/usr/src/paperless/export
|
||||
- ./consume:/usr/src/paperless/consume
|
||||
env_file: docker-compose.env
|
||||
environment:
|
||||
PAPERLESS_REDIS: redis://broker:6379
|
||||
|
||||
|
||||
volumes:
|
||||
data:
|
||||
media:
|
60
docker/local/Dockerfile
Normal file
@ -0,0 +1,60 @@
|
||||
FROM python:3.7-slim
|
||||
|
||||
WORKDIR /usr/src/paperless/
|
||||
|
||||
COPY requirements.txt ./
|
||||
|
||||
#Dependencies
|
||||
RUN apt-get update \
|
||||
&& apt-get -y --no-install-recommends install \
|
||||
build-essential \
|
||||
curl \
|
||||
ghostscript \
|
||||
gnupg \
|
||||
imagemagick \
|
||||
libatlas-base-dev \
|
||||
libmagic-dev \
|
||||
libpoppler-cpp-dev \
|
||||
libpq-dev \
|
||||
optipng \
|
||||
sudo \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-fra \
|
||||
tesseract-ocr-ita \
|
||||
tesseract-ocr-spa \
|
||||
tzdata \
|
||||
unpaper \
|
||||
&& pip3 install --upgrade supervisor setuptools \
|
||||
&& pip install --no-cache-dir -r requirements.txt \
|
||||
&& apt-get -y purge build-essential \
|
||||
&& apt-get -y autoremove --purge \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir /var/log/supervisord /var/run/supervisord
|
||||
|
||||
# copy scripts
|
||||
# this fixes issues with imagemagick and PDF
|
||||
COPY docker/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
|
||||
COPY docker/gunicorn.conf.py ./
|
||||
COPY docker/supervisord.conf /etc/supervisord.conf
|
||||
COPY docker/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||
|
||||
# copy app
|
||||
COPY src/ ./src/
|
||||
|
||||
# add users, setup scripts
|
||||
RUN addgroup --gid 1000 paperless \
|
||||
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
|
||||
&& chown -R paperless:paperless . \
|
||||
&& chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
WORKDIR /usr/src/paperless/src/
|
||||
|
||||
RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input
|
||||
|
||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
|
||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||
CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisord.conf"]
|
||||
|
||||
LABEL maintainer="Jonas Winkler <dev@jpwinkler.de>"
|
44
docker/local/docker-compose.postgres.yml
Normal file
@ -0,0 +1,44 @@
|
||||
version: "3.4"
|
||||
services:
|
||||
broker:
|
||||
image: redis:6.0
|
||||
restart: always
|
||||
|
||||
db:
|
||||
image: postgres:13
|
||||
restart: always
|
||||
volumes:
|
||||
- pgdata:/var/lib/postgresql/data
|
||||
environment:
|
||||
POSTGRES_DB: paperless
|
||||
POSTGRES_USER: paperless
|
||||
POSTGRES_PASSWORD: paperless
|
||||
|
||||
webserver:
|
||||
build: .
|
||||
restart: always
|
||||
depends_on:
|
||||
- db
|
||||
- broker
|
||||
ports:
|
||||
- 8000:8000
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
- ./export:/usr/src/paperless/export
|
||||
- ./consume:/usr/src/paperless/consume
|
||||
env_file: docker-compose.env
|
||||
environment:
|
||||
PAPERLESS_REDIS: redis://broker:6379
|
||||
PAPERLESS_DBHOST: db
|
||||
|
||||
|
||||
volumes:
|
||||
data:
|
||||
media:
|
||||
pgdata:
|
31
docker/local/docker-compose.sqlite.yml
Normal file
@ -0,0 +1,31 @@
|
||||
version: "3.4"
|
||||
services:
|
||||
broker:
|
||||
image: redis:6.0
|
||||
restart: always
|
||||
|
||||
webserver:
|
||||
build: .
|
||||
restart: always
|
||||
depends_on:
|
||||
- broker
|
||||
ports:
|
||||
- 8000:8000
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
- ./export:/usr/src/paperless/export
|
||||
- ./consume:/usr/src/paperless/consume
|
||||
env_file: docker-compose.env
|
||||
environment:
|
||||
PAPERLESS_REDIS: redis://broker:6379
|
||||
|
||||
|
||||
volumes:
|
||||
data:
|
||||
media:
|
@ -1,10 +1,11 @@
|
||||
[supervisord]
|
||||
nodaemon=true ; start in foreground if true; default false
|
||||
logfile=/var/log/supervisord/supervisord.log ; main log file; default $CWD/supervisord.log
|
||||
pidfile=/var/log/supervisord/supervisord.pid ; supervisord pidfile; default supervisord.pid
|
||||
pidfile=/var/run/supervisord/supervisord.pid ; supervisord pidfile; default supervisord.pid
|
||||
logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB
|
||||
logfile_backups=10 ; # of main logfile backups; 0 means none, default 10
|
||||
loglevel=info ; log level; default info; others: debug,warn,trace
|
||||
user=root
|
||||
|
||||
[program:daphne]
|
||||
command=daphne -b 0.0.0.0 -p 8000 paperless.asgi:application
|
BIN
docs/_static/Screenshot_first_logged.png
vendored
Before Width: | Height: | Size: 60 KiB |
BIN
docs/_static/Screenshot_first_run_login.png
vendored
Before Width: | Height: | Size: 26 KiB |
BIN
docs/_static/Screenshot_upload_and_scanned.png
vendored
Before Width: | Height: | Size: 113 KiB |
BIN
docs/_static/paperless-0-dashboard.png
vendored
Executable file
After Width: | Height: | Size: 52 KiB |
BIN
docs/_static/paperless-1-list-table.png
vendored
Executable file
After Width: | Height: | Size: 62 KiB |
BIN
docs/_static/paperless-10-mobile.png
vendored
Executable file
After Width: | Height: | Size: 56 KiB |
BIN
docs/_static/paperless-11-mail-filters.png
vendored
Normal file
After Width: | Height: | Size: 70 KiB |
BIN
docs/_static/paperless-2-list-smallcards.png
vendored
Executable file
After Width: | Height: | Size: 256 KiB |
BIN
docs/_static/paperless-3-list-largecards.png
vendored
Executable file
After Width: | Height: | Size: 224 KiB |
BIN
docs/_static/paperless-4-filter.png
vendored
Executable file
After Width: | Height: | Size: 101 KiB |
BIN
docs/_static/paperless-5-editing.png
vendored
Executable file
After Width: | Height: | Size: 196 KiB |
BIN
docs/_static/paperless-6-tags.png
vendored
Executable file
After Width: | Height: | Size: 50 KiB |
BIN
docs/_static/paperless-7-autocomplete.png
vendored
Executable file
After Width: | Height: | Size: 53 KiB |
BIN
docs/_static/paperless-8-search-results.png
vendored
Executable file
After Width: | Height: | Size: 214 KiB |
BIN
docs/_static/paperless-9-admin.png
vendored
Executable file
After Width: | Height: | Size: 50 KiB |
BIN
docs/_static/recommended_workflow.png
vendored
Normal file
After Width: | Height: | Size: 67 KiB |
391
docs/administration.rst
Normal file
@ -0,0 +1,391 @@
|
||||
|
||||
**************
|
||||
Administration
|
||||
**************
|
||||
|
||||
.. _administration-backup:
|
||||
|
||||
Making backups
|
||||
##############
|
||||
|
||||
Multiple options exist for making backups of your paperless instance,
|
||||
depending on how you installed paperless.
|
||||
|
||||
Before making backups, make sure that paperless is not running.
|
||||
|
||||
Options available to any installation of paperless:
|
||||
|
||||
* Use the :ref:`document exporter <utilities-exporter>`.
|
||||
The document exporter exports all your documents, thumbnails and
|
||||
metadata to a specific folder. You may import your documents into a
|
||||
fresh instance of paperless again or store your documents in another
|
||||
DMS with this export.
|
||||
|
||||
Options available to docker installations:
|
||||
|
||||
* Backup the docker volumes. These usually reside within
|
||||
``/var/lib/docker/volumes`` on the host and you need to be root in order
|
||||
to access them.
|
||||
|
||||
Paperless uses 3 volumes:
|
||||
|
||||
* ``paperless_media``: This is where your documents are stored.
|
||||
* ``paperless_data``: This is where auxilliary data is stored. This
|
||||
folder also contains the SQLite database, if you use it.
|
||||
* ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
|
||||
the database.
|
||||
|
||||
Options available to bare-metal and non-docker installations:
|
||||
|
||||
* Backup the entire paperless folder. This ensures that if your paperless instance
|
||||
crashes at some point or your disk fails, you can simply copy the folder back
|
||||
into place and it works.
|
||||
|
||||
When using PostgreSQL, you'll also have to backup the database.
|
||||
|
||||
.. _migrating-restoring:
|
||||
|
||||
Restoring
|
||||
=========
|
||||
|
||||
|
||||
|
||||
|
||||
.. _administration-updating:
|
||||
|
||||
Updating paperless
|
||||
##################
|
||||
|
||||
If a new release of paperless-ng is available, upgrading depends on how you
|
||||
installed paperless-ng in the first place. The releases are available at
|
||||
`release page <https://github.com/jonaswinkler/paperless-ng/releases>`_.
|
||||
|
||||
First of all, ensure that paperless is stopped.
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless
|
||||
$ docker-compose down
|
||||
|
||||
After that, :ref:`make a backup <administration-backup>`.
|
||||
|
||||
A. If you used the docker-compose file, simply download the files of the new release,
|
||||
adjust the settings in the files (i.e., the path to your consumption directory),
|
||||
and replace your existing docker-compose files. Then start paperless as usual,
|
||||
which will pull the new image, and update your database, if necessary:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless
|
||||
$ docker-compose up
|
||||
|
||||
If you see everything working, you can start paperless-ng with "-d" to have it
|
||||
run in the background.
|
||||
|
||||
.. hint::
|
||||
|
||||
The released docker-compose files specify exact versions to be pulled from the hub.
|
||||
This is to ensure that if the docker-compose files should change at some point
|
||||
(i.e., services updates/configured differently), you wont run into trouble due to
|
||||
docker pulling the ``latest`` image and running it in an older environment.
|
||||
|
||||
B. If you built the image yourself, grab the new archive and replace your current
|
||||
paperless folder with the new contents.
|
||||
|
||||
After that, make the necessary adjustments to the docker-compose.yml (i.e.,
|
||||
adjust your consumption directory).
|
||||
|
||||
Build and start the new image with:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless
|
||||
$ docker-compose build
|
||||
$ docker-compose up
|
||||
|
||||
If you see everything working, you can start paperless-ng with "-d" to have it
|
||||
run in the background.
|
||||
|
||||
.. hint::
|
||||
|
||||
You can usually keep your ``docker-compose.env`` file, since this file will
|
||||
never include mandantory configuration options. However, it is worth checking
|
||||
out the new version of this file, since it might have new recommendations
|
||||
on what to configure.
|
||||
|
||||
|
||||
Updating paperless without docker
|
||||
=================================
|
||||
|
||||
After grabbing the new release and unpacking the contents, do the following:
|
||||
|
||||
1. Update python requirements. Paperless uses
|
||||
`Pipenv`_ for managing dependencies:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ pip install --upgrade pipenv
|
||||
$ cd /path/to/paperless
|
||||
$ pipenv install
|
||||
$ pipenv clean
|
||||
|
||||
This creates a new virtual environment (or uses your existing environment)
|
||||
and installs all dependencies into it.
|
||||
|
||||
2. Collect static files.
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd src
|
||||
$ pipenv run python3 manage.py collectstatic --clear
|
||||
|
||||
3. Migrate the database.
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd src
|
||||
$ pipenv run python3 manage.py migrate
|
||||
|
||||
|
||||
Management utilities
|
||||
####################
|
||||
|
||||
Paperless comes with some management commands that perform various maintenance
|
||||
tasks on your paperless instance. You can invoke these commands either by
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd /path/to/paperless
|
||||
$ docker-compose run --rm webserver <command> <arguments>
|
||||
|
||||
or
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd /path/to/paperless/src
|
||||
$ pipenv run python manage.py <command> <arguments>
|
||||
|
||||
depending on whether you use docker or not.
|
||||
|
||||
All commands have built-in help, which can be accessed by executing them with
|
||||
the argument ``--help``.
|
||||
|
||||
.. _utilities-exporter:
|
||||
|
||||
Document exporter
|
||||
=================
|
||||
|
||||
The document exporter exports all your data from paperless into a folder for
|
||||
backup or migration to another DMS.
|
||||
|
||||
.. code::
|
||||
|
||||
document_exporter target
|
||||
|
||||
``target`` is a folder to which the data gets written. This includes documents,
|
||||
thumbnails and a ``manifest.json`` file. The manifest contains all metadata from
|
||||
the database (correspondents, tags, etc).
|
||||
|
||||
When you use the provided docker compose script, specify ``../export`` as the
|
||||
target. This path inside the container is automatically mounted on your host on
|
||||
the folder ``export``.
|
||||
|
||||
|
||||
.. _utilities-importer:
|
||||
|
||||
Document importer
|
||||
=================
|
||||
|
||||
The document importer takes the export produced by the `Document exporter`_ and
|
||||
imports it into paperless.
|
||||
|
||||
The importer works just like the exporter. You point it at a directory, and
|
||||
the script does the rest of the work:
|
||||
|
||||
.. code::
|
||||
|
||||
document_importer source
|
||||
|
||||
When you use the provided docker compose script, put the export inside the
|
||||
``export`` folder in your paperless source directory. Specify ``../export``
|
||||
as the ``source``.
|
||||
|
||||
|
||||
.. _utilities-retagger:
|
||||
|
||||
Document retagger
|
||||
=================
|
||||
|
||||
Say you've imported a few hundred documents and now want to introduce
|
||||
a tag or set up a new correspondent, and apply its matching to all of
|
||||
the currently-imported docs. This problem is common enough that
|
||||
there are tools for it.
|
||||
|
||||
.. code::
|
||||
|
||||
document_retagger [-h] [-c] [-T] [-t] [-i] [--use-first] [-f]
|
||||
|
||||
optional arguments:
|
||||
-c, --correspondent
|
||||
-T, --tags
|
||||
-t, --document_type
|
||||
-i, --inbox-only
|
||||
--use-first
|
||||
-f, --overwrite
|
||||
|
||||
Run this after changing or adding matching rules. It'll loop over all
|
||||
of the documents in your database and attempt to match documents
|
||||
according to the new rules.
|
||||
|
||||
Specify any combination of ``-c``, ``-T`` and ``-t`` to have the
|
||||
retagger perform matching of the specified metadata type. If you don't
|
||||
specify any of these options, the document retagger won't do anything.
|
||||
|
||||
Specify ``-i`` to have the document retagger work on documents tagged
|
||||
with inbox tags only. This is useful when you don't want to mess with
|
||||
your already processed documents.
|
||||
|
||||
When multiple document types or correspondents match a single document,
|
||||
the retagger won't assign these to the document. Specify ``--use-first``
|
||||
to override this behaviour and just use the first correspondent or type
|
||||
it finds. This option does not apply to tags, since any amount of tags
|
||||
can be applied to a document.
|
||||
|
||||
Finally, ``-f`` specifies that you wish to overwrite already assigned
|
||||
correspondents, types and/or tags. The default behaviour is to not
|
||||
assign correspondents and types to documents that have this data already
|
||||
assigned. ``-f`` works differently for tags: By default, only additional tags get
|
||||
added to documents, no tags will be removed. With ``-f``, tags that don't
|
||||
match a document anymore get removed as well.
|
||||
|
||||
|
||||
Managing the Automatic matching algorithm
|
||||
=========================================
|
||||
|
||||
The *Auto* matching algorithm requires a trained neural network to work.
|
||||
This network needs to be updated whenever somethings in your data
|
||||
changes. The docker image takes care of that automatically with the task
|
||||
scheduler. You can manually renew the classifier by invoking the following
|
||||
management command:
|
||||
|
||||
.. code::
|
||||
|
||||
document_create_classifier
|
||||
|
||||
This command takes no arguments.
|
||||
|
||||
|
||||
Managing the document search index
|
||||
==================================
|
||||
|
||||
The document search index is responsible for delivering search results for the
|
||||
website. The document index is automatically updated whenever documents get
|
||||
added to, changed, or removed from paperless. However, if the search yields
|
||||
non-existing documents or won't find anything, you may need to recreate the
|
||||
index manually.
|
||||
|
||||
.. code::
|
||||
|
||||
document_index {reindex,optimize}
|
||||
|
||||
Specify ``reindex`` to have the index created from scratch. This may take some
|
||||
time.
|
||||
|
||||
Specify ``optimize`` to optimize the index. This updates certain aspects of
|
||||
the index and usually makes queries faster and also ensures that the
|
||||
autocompletion works properly. This command is regularly invoked by the task
|
||||
scheduler.
|
||||
|
||||
.. _utilities-renamer:
|
||||
|
||||
Managing filenames
|
||||
==================
|
||||
|
||||
If you use paperless' feature to
|
||||
:ref:`assign custom filenames to your documents <advanced-file_name_handling>`,
|
||||
you can use this command to move all your files after changing
|
||||
the naming scheme.
|
||||
|
||||
.. warning::
|
||||
|
||||
Since this command moves you documents around alot, it is advised to to
|
||||
a backup before. The renaming logic is robust and will never overwrite
|
||||
or delete a file, but you can't ever be careful enough.
|
||||
|
||||
.. code::
|
||||
|
||||
document_renamer
|
||||
|
||||
The command takes no arguments and processes all your documents at once.
|
||||
|
||||
|
||||
Fetching e-mail
|
||||
===============
|
||||
|
||||
Paperless automatically fetches your e-mail every 10 minutes by default. If
|
||||
you want to invoke the email consumer manually, call the following management
|
||||
command:
|
||||
|
||||
.. code::
|
||||
|
||||
mail_fetcher
|
||||
|
||||
The command takes no arguments and processes all your mail accounts and rules.
|
||||
|
||||
.. _utilities-encyption:
|
||||
|
||||
Managing encryption
|
||||
===================
|
||||
|
||||
Documents can be stored in Paperless using GnuPG encryption.
|
||||
|
||||
.. danger::
|
||||
|
||||
Encryption is depreceated since paperless-ng 0.9 and doesn't really provide any
|
||||
additional security, since you have to store the passphrase in a configuration
|
||||
file on the same system as the encrypted documents for paperless to work.
|
||||
Furthermore, the entire text content of the documents is stored plain in the
|
||||
database, even if your documents are encrypted. Filenames are not encrypted as
|
||||
well.
|
||||
|
||||
Also, the web server provides transparent access to your encrypted documents.
|
||||
|
||||
Consider running paperless on an encrypted filesystem instead, which will then
|
||||
at least provide security against physical hardware theft.
|
||||
|
||||
.. code::
|
||||
|
||||
change_storage_type [--passphrase PASSPHRASE] {gpg,unencrypted} {gpg,unencrypted}
|
||||
|
||||
positional arguments:
|
||||
{gpg,unencrypted} The state you want to change your documents from
|
||||
{gpg,unencrypted} The state you want to change your documents to
|
||||
|
||||
optional arguments:
|
||||
--passphrase PASSPHRASE
|
||||
|
||||
Enabling encryption
|
||||
-------------------
|
||||
|
||||
Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
|
||||
|
||||
(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
|
||||
|
||||
.. code::
|
||||
|
||||
change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
|
||||
|
||||
|
||||
Disabling encryption
|
||||
--------------------
|
||||
|
||||
Basic usage to enable encryption of your document store:
|
||||
|
||||
(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
|
||||
|
||||
.. code::
|
||||
|
||||
change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
|
||||
|
||||
|
||||
.. _Pipenv: https://pipenv.pypa.io/en/latest/
|
327
docs/advanced_usage.rst
Normal file
@ -0,0 +1,327 @@
|
||||
***************
|
||||
Advanced topics
|
||||
***************
|
||||
|
||||
Paperless offers a couple features that automate certain tasks and make your life
|
||||
easier.
|
||||
|
||||
Guesswork
|
||||
#########
|
||||
|
||||
|
||||
Any document you put into the consumption directory will be consumed, but if
|
||||
you name the file right, it'll automatically set some values in the database
|
||||
for you. This is is the logic the consumer follows:
|
||||
|
||||
1. Try to find the correspondent, title, and tags in the file name following
|
||||
the pattern: ``Date - Correspondent - Title - tag,tag,tag.pdf``. Note that
|
||||
the format of the date is **rigidly defined** as ``YYYYMMDDHHMMSSZ`` or
|
||||
``YYYYMMDDZ``. The ``Z`` refers "Zulu time" AKA "UTC".
|
||||
The tags are optional, so the format ``Date - Correspondent - Title.pdf``
|
||||
works as well.
|
||||
2. If that doesn't work, we skip the date and try this pattern:
|
||||
``Correspondent - Title - tag,tag,tag.pdf``.
|
||||
3. If that doesn't work, we try to find the correspondent and title in the file
|
||||
name following the pattern: ``Correspondent - Title.pdf``.
|
||||
4. If that doesn't work, just assume that the name of the file is the title.
|
||||
|
||||
So given the above, the following examples would work as you'd expect:
|
||||
|
||||
* ``20150314000700Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``20150314Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``Another Company - Letter of Reference.jpg``
|
||||
* ``Dad's Recipe for Pancakes.png``
|
||||
|
||||
These however wouldn't work:
|
||||
|
||||
* ``2015-03-14 00:07:00 UTC - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``2015-03-14 - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``Another Company- Letter of Reference.jpg``
|
||||
|
||||
Do I have to be so strict about naming?
|
||||
=======================================
|
||||
|
||||
Rather than using the strict document naming rules, one can also set the option
|
||||
``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
|
||||
that is accepted by dateparser_. Doing so will cause ``paperless`` to default
|
||||
to any date format that is found in the title, instead of a date pulled from
|
||||
the document's text, without requiring the strict formatting of the document
|
||||
filename as described above.
|
||||
|
||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
||||
|
||||
.. _advanced-transforming_filenames:
|
||||
|
||||
Transforming filenames for parsing
|
||||
==================================
|
||||
|
||||
Some devices can't produce filenames that can be parsed by the default
|
||||
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
|
||||
``paperless.conf`` one can add transformations that are applied to the filename
|
||||
before it's parsed.
|
||||
|
||||
The option contains a list of dictionaries of regular expressions (key:
|
||||
``pattern``) and replacements (key: ``repl``) in JSON format, which are
|
||||
applied in order by passing them to ``re.subn``. Transformation stops
|
||||
after the first match, so at most one transformation is applied. The general
|
||||
syntax is
|
||||
|
||||
.. code:: python
|
||||
|
||||
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
|
||||
|
||||
The example below is for a Brother ADS-2400N, a scanner that allows
|
||||
different names to different hardware buttons (useful for handling
|
||||
multiple entities in one instance), but insists on adding ``_<count>``
|
||||
to the filename.
|
||||
|
||||
.. code:: python
|
||||
|
||||
# Brother profile configuration, support "Name_Date_Count" (the default
|
||||
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
|
||||
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
|
||||
|
||||
|
||||
Matching tags, correspondents and document types
|
||||
################################################
|
||||
|
||||
After the consumer has tried to figure out what it could from the file name,
|
||||
it starts looking at the content of the document itself. It will compare the
|
||||
matching algorithms defined by every tag and correspondent already set in your
|
||||
database to see if they apply to the text in that document. In other words,
|
||||
if you defined a tag called ``Home Utility`` that had a ``match`` property of
|
||||
``bc hydro`` and a ``matching_algorithm`` of ``literal``, Paperless will
|
||||
automatically tag your newly-consumed document with your ``Home Utility`` tag
|
||||
so long as the text ``bc hydro`` appears in the body of the document somewhere.
|
||||
|
||||
The matching logic is quite powerful, and supports searching the text of your
|
||||
document with different algorithms, and as such, some experimentation may be
|
||||
necessary to get things right.
|
||||
|
||||
In order to have a tag, correspondent or type assigned automatically to newly
|
||||
consumed documents, assign a match and matching algorithm using the web
|
||||
interface. These settings define when to assign correspondents, tags and types
|
||||
to documents.
|
||||
|
||||
The following algorithms are available:
|
||||
|
||||
* **Any:** Looks for any occurrence of any word provided in match in the PDF.
|
||||
If you define the match as ``Bank1 Bank2``, it will match documents containing
|
||||
either of these terms.
|
||||
* **All:** Requires that every word provided appears in the PDF, albeit not in the
|
||||
order provided.
|
||||
* **Literal:** Matches only if the match appears exactly as provided in the PDF.
|
||||
* **Regular expression:** Parses the match as a regular expression and tries to
|
||||
find a match within the document.
|
||||
* **Fuzzy match:** I dont know. Look at the source.
|
||||
* **Auto:** Tries to automatically match new documents. This does not require you
|
||||
to set a match. See the notes below.
|
||||
|
||||
When using the "any" or "all" matching algorithms, you can search for terms
|
||||
that consist of multiple words by enclosing them in double quotes. For example,
|
||||
defining a match text of ``"Bank of America" BofA`` using the "any" algorithm,
|
||||
will match documents that contain either "Bank of America" or "BofA", but will
|
||||
not match documents containing "Bank of South America".
|
||||
|
||||
Then just save your tag/correspondent and run another document through the
|
||||
consumer. Once complete, you should see the newly-created document,
|
||||
automatically tagged with the appropriate data.
|
||||
|
||||
|
||||
.. _advanced-automatic_matching:
|
||||
|
||||
Automatic matching
|
||||
==================
|
||||
|
||||
Paperless-ng comes with a new matching algorithm called *Auto*. This matching
|
||||
algorithm tries to assign tags, correspondents and document types to your
|
||||
documents based on how you have assigned these on existing documents. It
|
||||
uses a neural network under the hood.
|
||||
|
||||
If, for example, all your bank statements of your account 123 at the Bank of
|
||||
America are tagged with the tag "bofa_123" and the matching algorithm of this
|
||||
tag is set to *Auto*, this neural network will examine your documents and
|
||||
automatically learn when to assign this tag.
|
||||
|
||||
There are a couple caveats you need to keep in mind when using this feature:
|
||||
|
||||
* Changes to your documents are not immediately reflected by the matching
|
||||
algorithm. The neural network needs to be *trained* on your documents after
|
||||
changes. Paperless periodically (default: once each hour) checks for changes
|
||||
and does this automatically for you.
|
||||
* The Auto matching algorithm only takes documents into account which are NOT
|
||||
placed in your inbox (i.e., have inbox tags assigned to them). This ensures
|
||||
that the neural network only learns from documents which you have correctly
|
||||
tagged before.
|
||||
* The matching algorithm can only work if there is a correlation between the
|
||||
tag, correspondent or document type and the document itself. Your bank
|
||||
statements usually contain your bank account number and the name of the bank,
|
||||
so this works reasonably well, However, tags such as "TODO" cannot be
|
||||
automatically assigned.
|
||||
* The matching algorithm needs a reasonable number of documents to identify when
|
||||
to assign tags, correspondents, and types. If one out of a thousand documents
|
||||
has the correspondent "Very obscure web shop I bought something five years
|
||||
ago", it will probably not assign this correspondent automatically if you buy
|
||||
something from them again. The more documents, the better.
|
||||
|
||||
Hooking into the consumption process
|
||||
####################################
|
||||
|
||||
Sometimes you may want to do something arbitrary whenever a document is
|
||||
consumed. Rather than try to predict what you may want to do, Paperless lets
|
||||
you execute scripts of your own choosing just before or after a document is
|
||||
consumed using a couple simple hooks.
|
||||
|
||||
Just write a script, put it somewhere that Paperless can read & execute, and
|
||||
then put the path to that script in ``paperless.conf`` with the variable name
|
||||
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
|
||||
``PAPERLESS_POST_CONSUME_SCRIPT``.
|
||||
|
||||
.. important::
|
||||
|
||||
These scripts are executed in a **blocking** process, which means that if
|
||||
a script takes a long time to run, it can significantly slow down your
|
||||
document consumption flow. If you want things to run asynchronously,
|
||||
you'll have to fork the process in your script and exit.
|
||||
|
||||
|
||||
Pre-consumption script
|
||||
======================
|
||||
|
||||
Executed after the consumer sees a new document in the consumption folder, but
|
||||
before any processing of the document is performed. This script receives exactly
|
||||
one argument:
|
||||
|
||||
* Document file name
|
||||
|
||||
A simple but common example for this would be creating a simple script like
|
||||
this:
|
||||
|
||||
``/usr/local/bin/ocr-pdf``
|
||||
|
||||
.. code:: bash
|
||||
|
||||
#!/usr/bin/env bash
|
||||
pdf2pdfocr.py -i ${1}
|
||||
|
||||
``/etc/paperless.conf``
|
||||
|
||||
.. code:: bash
|
||||
|
||||
...
|
||||
PAPERLESS_PRE_CONSUME_SCRIPT="/usr/local/bin/ocr-pdf"
|
||||
...
|
||||
|
||||
This will pass the path to the document about to be consumed to ``/usr/local/bin/ocr-pdf``,
|
||||
which will in turn call `pdf2pdfocr.py`_ on your document, which will then
|
||||
overwrite the file with an OCR'd version of the file and exit. At which point,
|
||||
the consumption process will begin with the newly modified file.
|
||||
|
||||
.. _pdf2pdfocr.py: https://github.com/LeoFCardoso/pdf2pdfocr
|
||||
|
||||
.. _advanced-post_consume_script:
|
||||
|
||||
Post-consumption script
|
||||
=======================
|
||||
|
||||
Executed after the consumer has successfully processed a document and has moved it
|
||||
into paperless. It receives the following arguments:
|
||||
|
||||
* Document id
|
||||
* Generated file name
|
||||
* Source path
|
||||
* Thumbnail path
|
||||
* Download URL
|
||||
* Thumbnail URL
|
||||
* Correspondent
|
||||
* Tags
|
||||
|
||||
The script can be in any language you like, but for a simple shell script
|
||||
example, you can take a look at ``post-consumption-example.sh`` in the
|
||||
``scripts`` directory in this project.
|
||||
|
||||
The post consumption script cannot cancel the consumption process.
|
||||
|
||||
.. _advanced-file_name_handling:
|
||||
|
||||
File name handling
|
||||
##################
|
||||
|
||||
By default, paperless stores your documents in the media directory and renames them
|
||||
using the identifier which it has assigned to each document. You will end up getting
|
||||
files like ``0000123.pdf`` in your media directory. This isn't necessarily a bad
|
||||
thing, because you normally don't have to access these files manually. However, if
|
||||
you wish to name your files differently, you can do that by adjustng the
|
||||
``PAPERLESS_FILENAME_FORMAT`` settings variable.
|
||||
|
||||
This variable allows you to configure the filename (folders are allowed!) using
|
||||
placeholders. For example, setting
|
||||
|
||||
.. code:: bash
|
||||
|
||||
PAPERLESS_FILENAME_FORMAT={created_year}/{correspondent}/{title}
|
||||
|
||||
will create a directory structure as follows:
|
||||
|
||||
.. code::
|
||||
|
||||
2019/
|
||||
my_bank/
|
||||
statement-january-0000001.pdf
|
||||
statement-february-0000002.pdf
|
||||
2020/
|
||||
my_bank/
|
||||
statement-january-0000003.pdf
|
||||
shoe_store/
|
||||
my_new_shoes-0000004.pdf
|
||||
|
||||
Paperless appends the unique identifier of each document to the filename. This
|
||||
avoides filename clashes.
|
||||
|
||||
.. danger::
|
||||
|
||||
Do not manually move your files in the media folder. Paperless remembers the
|
||||
last filename a document was stored as. If you do rename a file, paperless will
|
||||
report your files as missing and won't be able to find them.
|
||||
|
||||
Paperless provides the following placeholders withing filenames:
|
||||
|
||||
* ``{correspondent}``: The name of the correspondent, or "none".
|
||||
* ``{title}``: The title of the document.
|
||||
* ``{created}``: The full date and time the document was created.
|
||||
* ``{created_year}``: Year created only.
|
||||
* ``{created_month}``: Month created only (number 1-12).
|
||||
* ``{created_day}``: Day created only (number 1-31).
|
||||
* ``{added}``: The full date and time the document was added to paperless.
|
||||
* ``{added_year}``: Year added only.
|
||||
* ``{added_month}``: Month added only (number 1-12).
|
||||
* ``{added_day}``: Day added only (number 1-31).
|
||||
* ``{tags}``: I don't know how this works. Look at the source.
|
||||
|
||||
Paperless will convert all values for the placeholders into values which are safe
|
||||
for use in filenames.
|
||||
|
||||
.. hint::
|
||||
|
||||
Paperless checks the filename of a document whenever it is saved. Therefore,
|
||||
you need to update the filenames of your documents and move them after altering
|
||||
this setting by invoking the :ref:`document renamer <utilities-renamer>`.
|
||||
|
||||
.. warning::
|
||||
|
||||
Make absolutely sure you get the spelling of the placeholders right, or else
|
||||
paperless will use the default naming scheme instead.
|
||||
|
||||
.. caution::
|
||||
|
||||
As of now, you could totally tell paperless to store your files anywhere outside
|
||||
the media directory by setting
|
||||
|
||||
.. code::
|
||||
|
||||
PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title}
|
||||
|
||||
However, keep in mind that inside docker, if files get stored outside of the
|
||||
predefined volumes, they will be lost after a restart of paperless.
|
178
docs/api.rst
@ -1,23 +1,173 @@
|
||||
.. _api:
|
||||
|
||||
************
|
||||
The REST API
|
||||
############
|
||||
************
|
||||
|
||||
Paperless makes use of the `Django REST Framework`_ standard API interface
|
||||
because of its inherent awesomeness. Conveniently, the system is also
|
||||
self-documenting, so to learn more about the access points, schema, what's
|
||||
accepted and what isn't, you need only visit ``/api`` on your local Paperless
|
||||
installation.
|
||||
|
||||
Paperless makes use of the `Django REST Framework`_ standard API interface.
|
||||
It provides a browsable API for most of its endpoints, which you can inspect
|
||||
at ``http://<paperless-host>:<port>/api/``. This also documents most of the
|
||||
available filters and ordering fields.
|
||||
|
||||
.. _Django REST Framework: http://django-rest-framework.org/
|
||||
|
||||
The API provides 5 main endpoints:
|
||||
|
||||
.. _api-uploading:
|
||||
* ``/api/correspondents/``: Full CRUD support.
|
||||
* ``/api/document_types/``: Full CRUD support.
|
||||
* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
|
||||
* ``/api/logs/``: Read-Only.
|
||||
* ``/api/tags/``: Full CRUD support.
|
||||
|
||||
Uploading
|
||||
---------
|
||||
All of these endpoints except for the logging endpoint
|
||||
allow you to fetch, edit and delete individual objects
|
||||
by appending their primary key to the path, for example ``/api/documents/454/``.
|
||||
|
||||
File uploads in an API are hard and so far as I've been able to tell, there's
|
||||
no standard way of accepting them, so rather than crowbar file uploads into the
|
||||
REST API and endure that headache, I've left that process to a simple HTTP
|
||||
POST, documented on the :ref:`consumption page <consumption-http>`.
|
||||
In addition to that, the document endpoint offers these additional actions on
|
||||
individual documents:
|
||||
|
||||
* ``/api/documents/<pk>/download/``: Download the original document.
|
||||
* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
|
||||
* ``/api/documents/<pk>/preview/``: Display the original document inline,
|
||||
without downloading it.
|
||||
|
||||
.. hint::
|
||||
|
||||
Paperless used to provide these functionality at ``/fetch/<pk>/preview``,
|
||||
``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
|
||||
are in place. However, if you use these old URLs to access documents, you
|
||||
should update your app or script to use the new URLs.
|
||||
|
||||
Searching for documents
|
||||
#######################
|
||||
|
||||
Paperless-ng offers API endpoints for full text search. These are as follows:
|
||||
|
||||
``/api/search/``
|
||||
================
|
||||
|
||||
Get search results based on a query.
|
||||
|
||||
Query parameters:
|
||||
|
||||
* ``query``: The query string. See
|
||||
`here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
|
||||
for details on the syntax.
|
||||
* ``page``: Specify the page you want to retrieve. Each page
|
||||
contains 10 search results and the first page is ``page=1``, which
|
||||
is the default if this is omitted.
|
||||
|
||||
Result list object returned by the endpoint:
|
||||
|
||||
.. code:: json
|
||||
|
||||
{
|
||||
"count": 1,
|
||||
"page": 1,
|
||||
"page_count": 1,
|
||||
"results": [
|
||||
|
||||
]
|
||||
}
|
||||
|
||||
* ``count``: The approximate total number of results.
|
||||
* ``page``: The page returned to you. This might be different from
|
||||
the page you requested, if you requested a page that is behind
|
||||
the last page. In that case, the last page is returned.
|
||||
* ``page_count``: The total number of pages.
|
||||
* ``results``: A list of result objects on the current page.
|
||||
|
||||
Result object:
|
||||
|
||||
.. code:: json
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"highlights": [
|
||||
|
||||
],
|
||||
"score": 6.34234,
|
||||
"rank": 23,
|
||||
"document": {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
* ``id``: the primary key of the found document
|
||||
* ``highlights``: an object containing parseable highlights for the result.
|
||||
See below.
|
||||
* ``score``: The score assigned to the document. A higher score indicates a
|
||||
better match with the query. Search results are sorted descending by score.
|
||||
* ``rank``: the position of the document within the entire search results list.
|
||||
* ``document``: The full json of the document, as returned by
|
||||
``/api/documents/<id>/``.
|
||||
|
||||
Highlights object:
|
||||
|
||||
Highlights are provided as a list of fragments. A fragment is a longer section of
|
||||
text from the original document.
|
||||
Each fragment contains a list of strings, and some of them are marked as a highlight.
|
||||
|
||||
.. code:: json
|
||||
|
||||
[
|
||||
[
|
||||
{"text": "This is a sample text with a "},
|
||||
{"text": "highlighted", "term": 0},
|
||||
{"text": " word."}
|
||||
],
|
||||
[
|
||||
{"text": "Another", "term": 1},
|
||||
{"text": " fragment with a highlight."}
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
When ``term`` is present within a string, the word within ``text`` should be highlighted.
|
||||
The term index groups multiple matches together and words with the same index
|
||||
should get identical highlighting.
|
||||
A client may use this example to produce the following output:
|
||||
|
||||
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
|
||||
|
||||
``/api/search/autocomplete/``
|
||||
=============================
|
||||
|
||||
Get auto completions for a partial search term.
|
||||
|
||||
Query parameters:
|
||||
|
||||
* ``term``: The incomplete term.
|
||||
* ``limit``: Amount of results. Defaults to 10.
|
||||
|
||||
Results returned by the endpoint are ordered by importance of the term in the
|
||||
document index. The first result is the term that has the highest Tf/Idf score
|
||||
in the index.
|
||||
|
||||
.. code:: json
|
||||
|
||||
[
|
||||
"term1",
|
||||
"term3",
|
||||
"term6",
|
||||
"term4"
|
||||
]
|
||||
|
||||
|
||||
.. _api-file_uploads:
|
||||
|
||||
POSTing documents
|
||||
#################
|
||||
|
||||
The API provides a special endpoint for file uploads:
|
||||
|
||||
``/api/documents/post_document/``
|
||||
|
||||
POST a multipart form to this endpoint, where the form field ``document`` contains
|
||||
the document that you want to upload to paperless. The filename is sanitized and
|
||||
then used to store the document in the consumption folder, where the consumer will
|
||||
detect the document and process it as any other document.
|
||||
|
||||
The endpoint will immediately return "OK." if the document was stored in the
|
||||
consumption directory.
|
||||
|
@ -1,12 +1,119 @@
|
||||
|
||||
.. _paperless_changelog:
|
||||
|
||||
*********
|
||||
Changelog
|
||||
#########
|
||||
*********
|
||||
|
||||
paperless-ng 0.9.1
|
||||
##################
|
||||
|
||||
* Moved documentation of the settings to the actual documentation.
|
||||
* Updated release script to force the user to choose between SQLite
|
||||
and PostgreSQL. This avoids confusion when upgrading from paperless.
|
||||
|
||||
|
||||
paperless-ng 0.9.0
|
||||
##################
|
||||
|
||||
* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
|
||||
This features will most likely be removed in future versions.
|
||||
|
||||
* **Added:** New frontend. Features:
|
||||
|
||||
* Single page application: It's much more responsive than the django admin pages.
|
||||
* Dashboard. Shows recently scanned documents, or todos, or other documents
|
||||
at wish. Allows uploading of documents. Shows basic statistics.
|
||||
* Better document list with multiple display options.
|
||||
* Full text search with result highlighting, auto completion and scoring based
|
||||
on the query. It uses a document search index in the background.
|
||||
* Saveable filters.
|
||||
* Better log viewer.
|
||||
|
||||
* **Added:** Document types. Assign these to documents just as correspondents.
|
||||
They may be used in the future to perform automatic operations on documents
|
||||
depending on the type.
|
||||
* **Added:** Inbox tags. Define an inbox tag and it will automatically be
|
||||
assigned to any new document scanned into the system.
|
||||
* **Added:** Automatic matching. A new matching algorithm that automatically
|
||||
assigns tags, document types and correspondents to your documents. It uses
|
||||
a neural network trained on your data.
|
||||
* **Added:** Archive serial numbers. Assign these to quickly find documents stored in
|
||||
physical binders.
|
||||
* **Added:** Enabled the internal user management of django. This isn't really a
|
||||
multi user solution, however, it allows more than one user to access the website
|
||||
and set some basic permissions / renew passwords.
|
||||
|
||||
* **Modified [breaking]:** All new mail consumer with customizable filters, actions and
|
||||
multiple account support. Replaces the old mail consumer. The new mail consumer
|
||||
needs different configuration but can be configured to act exactly like the old
|
||||
consumer.
|
||||
|
||||
|
||||
* **Modified:** Changes to the consumer:
|
||||
|
||||
* Now uses the excellent watchdog library that should make sure files are
|
||||
discovered no matter what the platform is.
|
||||
* The consumer now uses a task scheduler to run consumption processes in parallel.
|
||||
This means that consuming many documents should be much faster on systems with
|
||||
many cores.
|
||||
* Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
|
||||
and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
|
||||
* The consumer no longer blocks the database for extended periods of time.
|
||||
* An issue with tesseract running multiple threads per page and slowing down
|
||||
the consumer was fixed.
|
||||
|
||||
* **Modified [breaking]:** REST Api changes:
|
||||
|
||||
* New filters added, other filters removed (case sensitive filters, slug filters)
|
||||
* Endpoints for thumbnails, previews and downloads replace the old ``/fetch/`` urls. Redirects are in place.
|
||||
* Endpoint for document uploads replaces the old ``/push`` url. Redirects are in place.
|
||||
* Foreign key relationships are now served as IDs, not as urls.
|
||||
|
||||
* **Modified [breaking]:** PostgreSQL:
|
||||
|
||||
* If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses postgresql instead of sqlite.
|
||||
Username, database and password all default to ``paperless`` if not specified.
|
||||
|
||||
* **Modified [breaking]:** document_retagger management command rework. See
|
||||
:ref:`utilities-retagger` for details. Replaces ``document_correspondents``
|
||||
management command.
|
||||
* **Removed [breaking]:** Reminders.
|
||||
* **Removed:** All customizations made to the django admin pages.
|
||||
* **Removed [breaking]:** The docker image no longer supports SSL. If you want to expose
|
||||
paperless to the internet, hide paperless behind a proxy server that handles SSL
|
||||
requests.
|
||||
* **Internal changes:** Mostly code cleanup, including:
|
||||
|
||||
* Rework of the code of the tesseract parser. This is now a lot cleaner.
|
||||
* Rework of the filename handling code. It was a mess.
|
||||
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
|
||||
* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
|
||||
and consuming documents.
|
||||
* Updated dependencies. Now uses Pipenv all around.
|
||||
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
|
||||
|
||||
* **Settings:**
|
||||
|
||||
* ``PAPERLESS_FORGIVING_OCR`` is now default and gone. Reason: Even if ``langdetect`` fails to detect
|
||||
a language, tesseract still does a very good job at ocr'ing a document with the default language.
|
||||
Certain language specifics such as umlauts may not get picked up properly.
|
||||
* ``PAPERLESS_DEBUG`` defaults to ``false``.
|
||||
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
|
||||
sqlite.
|
||||
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
|
||||
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
|
||||
* ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
|
||||
optimization. This is useful on less powerful devices.
|
||||
|
||||
* Many more small changes here and there. The usual stuff.
|
||||
|
||||
2.7.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* `syntonym`_ submitted a pull request to catch IMAP connection errors `#475`_.
|
||||
* `Stéphane Brunner`_ added ``psycopg2`` to the Pipfile `#489`_. He also fixed
|
||||
a syntax error in ``docker-compose.yml.example`` `#488`_ and added [DjangoQL](https://github.com/ivelum/djangoql),
|
||||
a syntax error in ``docker-compose.yml.example`` `#488`_ and added `DjangoQL`_,
|
||||
which allows a litany of handy search functionality `#492`_.
|
||||
* `CkuT`_ and `JOKer`_ hacked out a simple, but super-helpful optimisation to
|
||||
how the thumbnails are served up, improving performance considerably `#481`_.
|
||||
@ -19,7 +126,7 @@ Changelog
|
||||
|
||||
|
||||
2.6.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* We now have a logo, complete with a favicon :-)
|
||||
* Removed some problematic tests.
|
||||
@ -31,7 +138,7 @@ Changelog
|
||||
|
||||
|
||||
2.6.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Allow an infinite number of logs to be deleted. Thanks to `Ulli`_ for noting
|
||||
the problem in `#433`_.
|
||||
@ -52,7 +159,7 @@ Changelog
|
||||
|
||||
|
||||
2.5.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* **New dependency**: Paperless now optimises thumbnail generation with
|
||||
`optipng`_, so you'll need to install that somewhere in your PATH or declare
|
||||
@ -96,7 +203,7 @@ Changelog
|
||||
|
||||
|
||||
2.4.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* A new set of actions are now available thanks to `jonaswinkler`_'s very first
|
||||
pull request! You can now do nifty things like tag documents in bulk, or set
|
||||
@ -117,7 +224,7 @@ Changelog
|
||||
|
||||
|
||||
2.3.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Support for consuming plain text & markdown documents was added by
|
||||
`Joshua Taillon`_! This was a long-requested feature, and it's addition is
|
||||
@ -135,14 +242,14 @@ Changelog
|
||||
|
||||
|
||||
2.2.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
|
||||
the ``DISABLE_LOGIN`` feature: `#392`_.
|
||||
|
||||
|
||||
2.2.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
|
||||
version of Paperless that supports Django 2.0! As a result of their hard
|
||||
@ -159,7 +266,7 @@ Changelog
|
||||
|
||||
|
||||
2.1.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* `Enno Lohmeier`_ added three simple features that make Paperless a lot more
|
||||
user (and developer) friendly:
|
||||
@ -178,7 +285,7 @@ Changelog
|
||||
|
||||
|
||||
2.0.0
|
||||
=====
|
||||
#####
|
||||
|
||||
This is a big release as we've changed a core-functionality of Paperless: we no
|
||||
longer encrypt files with GPG by default.
|
||||
@ -194,7 +301,7 @@ that it was more an annoyance than anything else, so this feature is now turned
|
||||
off unless you explicitly set a passphrase in your config file.
|
||||
|
||||
Migrating from 1.x
|
||||
------------------
|
||||
==================
|
||||
|
||||
Encryption isn't gone, it's just off for new users. So long as you have
|
||||
``PAPERLESS_PASSPHRASE`` set in your config or your environment, Paperless
|
||||
@ -210,7 +317,7 @@ Special thanks to `erikarvstedt`_, `matthewmoto`_, and `mcronce`_ who did the
|
||||
bulk of the work on this big change.
|
||||
|
||||
1.4.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* `Quentin Dawans`_ has refactored the document consumer to allow for some
|
||||
command-line options. Notably, you can now direct it to consume from a
|
||||
@ -245,7 +352,7 @@ bulk of the work on this big change.
|
||||
to some excellent work from `erikarvstedt`_ on `#351`_
|
||||
|
||||
1.3.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* You can now run Paperless without a login, though you'll still have to create
|
||||
at least one user. This is thanks to a pull-request from `matthewmoto`_:
|
||||
@ -268,7 +375,7 @@ bulk of the work on this big change.
|
||||
problem and helping me find where to fix it.
|
||||
|
||||
1.2.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
|
||||
and `Pit`_. This new image is dramatically smaller than the Debian-based
|
||||
@ -287,7 +394,7 @@ bulk of the work on this big change.
|
||||
in the document text.
|
||||
|
||||
1.1.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Fix for `#283`_, a redirect bug which broke interactions with
|
||||
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
||||
@ -297,7 +404,7 @@ bulk of the work on this big change.
|
||||
`Dan Panzarella`_
|
||||
|
||||
1.0.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Upgrade to Django 1.11. **You'll need to run
|
||||
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||
@ -316,14 +423,14 @@ bulk of the work on this big change.
|
||||
`Lukas Winkler`_'s issue `#278`_
|
||||
|
||||
0.8.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Paperless can now run in a subdirectory on a host (``/paperless``), rather
|
||||
than always running in the root (``/``) thanks to `maphy-psd`_'s work on
|
||||
`#255`_.
|
||||
|
||||
0.7.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* **Potentially breaking change**: As per `#235`_, Paperless will no longer
|
||||
automatically delete documents attached to correspondents when those
|
||||
@ -335,7 +442,7 @@ bulk of the work on this big change.
|
||||
`Kusti Skytén`_ for posting the correct solution in the Github issue.
|
||||
|
||||
0.6.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Abandon the shared-secret trick we were using for the POST API in favour
|
||||
of BasicAuth or Django session.
|
||||
@ -349,7 +456,7 @@ bulk of the work on this big change.
|
||||
the help with this feature.
|
||||
|
||||
0.5.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Support for fuzzy matching in the auto-tagger & auto-correspondent systems
|
||||
thanks to `Jake Gysland`_'s patch `#220`_.
|
||||
@ -367,13 +474,13 @@ bulk of the work on this big change.
|
||||
* Amended the Django Admin configuration to have nice headers (`#230`_)
|
||||
|
||||
0.4.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* Fix for `#206`_ wherein the pluggable parser didn't recognise files with
|
||||
all-caps suffixes like ``.PDF``
|
||||
|
||||
0.4.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Introducing reminders. See `#199`_ for more information, but the short
|
||||
explanation is that you can now attach simple notes & times to documents
|
||||
@ -383,7 +490,7 @@ bulk of the work on this big change.
|
||||
like to make use of this feature in his project.
|
||||
|
||||
0.3.6
|
||||
=====
|
||||
#####
|
||||
|
||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||
correspondent or the tags for a document.
|
||||
@ -397,7 +504,7 @@ bulk of the work on this big change.
|
||||
documentation is on its way.
|
||||
|
||||
0.3.5
|
||||
=====
|
||||
#####
|
||||
|
||||
* A serious facelift for the documents listing page wherein we drop the
|
||||
tabular layout in favour of a tiled interface.
|
||||
@ -408,7 +515,7 @@ bulk of the work on this big change.
|
||||
consumption.
|
||||
|
||||
0.3.4
|
||||
=====
|
||||
#####
|
||||
|
||||
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
||||
Note that you *can* use Django Suit with Paperless, but only in a
|
||||
@ -421,26 +528,26 @@ bulk of the work on this big change.
|
||||
API thanks to @thomasbrueggemann. See `#179`_.
|
||||
|
||||
0.3.3
|
||||
=====
|
||||
#####
|
||||
|
||||
* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
|
||||
* Timezone, items per page, and default language are now all configurable,
|
||||
also thanks to @ekw.
|
||||
|
||||
0.3.2
|
||||
=====
|
||||
#####
|
||||
|
||||
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
||||
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
||||
arise.
|
||||
|
||||
0.3.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added a default value for ``CONVERT_BINARY``
|
||||
|
||||
0.3.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Updated to using django-filter 1.x
|
||||
* Added some system checks so new users aren't confused by misconfigurations.
|
||||
@ -453,7 +560,7 @@ bulk of the work on this big change.
|
||||
``PAPERLESS_SHARED_SECRET`` respectively instead.
|
||||
|
||||
0.2.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* `#150`_: The media root is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
@ -481,7 +588,7 @@ bulk of the work on this big change.
|
||||
to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
|
||||
|
||||
0.1.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* Potentially **Breaking Change**: All references to "sender" in the code
|
||||
have been renamed to "correspondent" to better reflect the nature of the
|
||||
@ -505,7 +612,7 @@ bulk of the work on this big change.
|
||||
to be imported but made unavailable.
|
||||
|
||||
0.1.0
|
||||
=====
|
||||
#####
|
||||
|
||||
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
||||
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
||||
@ -524,14 +631,14 @@ bulk of the work on this big change.
|
||||
* Added tox with pep8 checking
|
||||
|
||||
0.0.6
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added support for parallel OCR (significant work from `Pit`_)
|
||||
* Sped up the language detection (significant work from `Pit`_)
|
||||
* Added simple logging
|
||||
|
||||
0.0.5
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added support for image files as documents (png, jpg, gif, tiff)
|
||||
* Added a crude means of HTTP POST for document imports
|
||||
@ -540,7 +647,7 @@ bulk of the work on this big change.
|
||||
* Documentation for the above as well as data migration
|
||||
|
||||
0.0.4
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added automated tagging basted on keyword matching
|
||||
* Cleaned up the document listing page
|
||||
@ -548,19 +655,19 @@ bulk of the work on this big change.
|
||||
* Added ``pytz`` to the list of requirements
|
||||
|
||||
0.0.3
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added basic tagging
|
||||
|
||||
0.0.2
|
||||
=====
|
||||
#####
|
||||
|
||||
* Added language detection
|
||||
* Added datestamps to ``document_exporter``.
|
||||
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
|
||||
|
||||
0.0.1
|
||||
=====
|
||||
#####
|
||||
|
||||
* Initial release
|
||||
|
||||
@ -739,6 +846,6 @@ bulk of the work on this big change.
|
||||
.. _#489: https://github.com/the-paperless-project/paperless/pull/489
|
||||
.. _#492: https://github.com/the-paperless-project/paperless/pull/492
|
||||
|
||||
.. _pipenv: https://docs.pipenv.org/
|
||||
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
|
||||
.. _optipng: http://optipng.sourceforge.net/
|
||||
.. _DjangoQL: https://github.com/ivelum/djangoql
|
||||
|
@ -1,15 +0,0 @@
|
||||
Changelog (jonaswinkler)
|
||||
########################
|
||||
|
||||
1.0.0
|
||||
=====
|
||||
|
||||
* First release based on paperless 2.6.0
|
||||
* Added: Automatic document classification using neural networks (replaces
|
||||
regex-based tagging)
|
||||
* Added: Document types
|
||||
* Added: Archive serial number allows easy referencing of physical document
|
||||
copies
|
||||
* Added: Inbox tags (added automatically to newly consumed documents)
|
||||
* Added: Document viewer on document edit page
|
||||
* Database backend is now configurable
|
@ -54,7 +54,7 @@ source_suffix = '.rst'
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'Paperless'
|
||||
project = u'Paperless-ng'
|
||||
copyright = u'2015, Daniel Quinn'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
@ -205,7 +205,8 @@ try:
|
||||
import sphinx_rtd_theme
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||
except ImportError:
|
||||
except ImportError as e:
|
||||
print("error " + str(e))
|
||||
pass
|
||||
|
||||
# -- Options for LaTeX output ---------------------------------------------
|
||||
|
306
docs/configuration.rst
Normal file
@ -0,0 +1,306 @@
|
||||
.. _configuration:
|
||||
|
||||
*************
|
||||
Configuration
|
||||
*************
|
||||
|
||||
Paperless provides a wide range of customizations.
|
||||
Depending on how you run paperless, these settings have to be defined in different
|
||||
places.
|
||||
|
||||
* If you run paperless on docker, ``paperless.conf`` is not used. Rather, configure
|
||||
paperless by copying necessary options to ``docker-compose.env``.
|
||||
* If you are running paperless on anything else, paperless will search for the
|
||||
configuration file in these locations and use the first one it finds:
|
||||
|
||||
.. code::
|
||||
|
||||
/path/to/paperless/paperless.conf
|
||||
/etc/paperless.conf
|
||||
/usr/local/etc/paperless.conf
|
||||
|
||||
|
||||
Required services
|
||||
#################
|
||||
|
||||
PAPERLESS_REDIS=<url>
|
||||
This is required for processing scheduled tasks such as email fetching, index
|
||||
optimization and for training the automatic document matcher.
|
||||
|
||||
Defaults to redis://localhost:6379.
|
||||
|
||||
PAPERLESS_DBHOST=<hostname>
|
||||
By default, sqlite is used as the database backend. This can be changed here.
|
||||
Set PAPERLESS_DBHOST and PostgreSQL will be used instead of mysql.
|
||||
|
||||
PAPERLESS_DBPORT=<port>
|
||||
Adjust port if necessary.
|
||||
|
||||
Default is 5432.
|
||||
|
||||
PAPERLESS_DBNAME=<name>
|
||||
Database name in PostgreSQL.
|
||||
|
||||
Defaults to "paperless".
|
||||
|
||||
PAPERLESS_DBUSER=<name>
|
||||
Database user in PostgreSQL.
|
||||
|
||||
Defaults to "paperless".
|
||||
|
||||
PAPERLESS_DBPASS=<password>
|
||||
Database password for PostgreSQL.
|
||||
|
||||
Defaults to "paperless".
|
||||
|
||||
|
||||
Paths and folders
|
||||
#################
|
||||
|
||||
PAPERLESS_CONSUMPTION_DIR=<path>
|
||||
This where your documents should go to be consumed. Make sure that it exists
|
||||
and that the user running the paperless service can read/write its contents
|
||||
before you start Paperless.
|
||||
|
||||
Don't change this when using docker, as it only changes the path within the
|
||||
container. Change the local consumption directory in the docker-compose.yml
|
||||
file instead.
|
||||
|
||||
Defaults to "../consume", relative to the "src" directory.
|
||||
|
||||
PAPERLESS_DATA_DIR=<path>
|
||||
This is where paperless stores all its data (search index, sqlite database,
|
||||
classification model, etc).
|
||||
|
||||
Defaults to "../data", relative to the "src" directory.
|
||||
|
||||
PAPERLESS_MEDIA_ROOT=<path>
|
||||
This is where your documents and thumbnails are stored.
|
||||
|
||||
You can set this and PAPERLESS_DATA_DIR to the same folder to have paperless
|
||||
store all its data within the same volume.
|
||||
|
||||
Defaults to "../media", relative to the "src" directory.
|
||||
|
||||
PAPERLESS_STATICDIR=<path>
|
||||
Override the default STATIC_ROOT here. This is where all static files
|
||||
created using "collectstatic" manager command are stored.
|
||||
|
||||
Unless you're doing something fancy, there is no need to override this.
|
||||
|
||||
Defaults to "../static", relative to the "src" directory.
|
||||
|
||||
PAPERLESS_FILENAME_FORMAT=<format>
|
||||
Changes the filenames paperless uses to store documents in the media directory.
|
||||
See :ref:`advanced-file_name_handling` for details.
|
||||
|
||||
Default is none, which disables this feature.
|
||||
|
||||
Hosting & Security
|
||||
##################
|
||||
|
||||
PAPERLESS_SECRET_KEY=<key>
|
||||
Paperless uses this to make session tokens. If you exose paperless on the
|
||||
internet, you need to change this, since the default secret is well known.
|
||||
|
||||
Use any sequence of characters. The more, the better. You don't need to
|
||||
remember this. Just face-roll your keyboard.
|
||||
|
||||
Default is listed in the file ``src/paperless/settings.py``.
|
||||
|
||||
PAPERLESS_ALLOWED_HOSTS<comma-separated-list>
|
||||
If you're planning on putting Paperless on the open internet, then you
|
||||
really should set this value to the domain name you're using. Failing to do
|
||||
so leaves you open to HTTP host header attacks:
|
||||
https://docs.djangoproject.com/en/3.1/topics/security/#host-header-validation
|
||||
|
||||
Just remember that this is a comma-separated list, so "example.com" is fine,
|
||||
as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
|
||||
|
||||
Defaults to "*", which is all hosts.
|
||||
|
||||
PAPERLESS_CORS_ALLOWED_HOSTS<comma-separated-list>
|
||||
You need to add your servers to the list of allowed hosts that can do CORS
|
||||
calls. Set this to your public domain name.
|
||||
|
||||
Defaults to "http://localhost:8000".
|
||||
|
||||
PAPERLESS_FORCE_SCRIPT_NAME=<path>
|
||||
To host paperless under a subpath url like example.com/paperless you set
|
||||
this value to /paperless. No trailing slash!
|
||||
|
||||
.. note::
|
||||
|
||||
I don't know if this works in paperless-ng. Probably not.
|
||||
|
||||
Defaults to none, which hosts paperless at "/".
|
||||
|
||||
PAPERLESS_STATIC_URL=<path>
|
||||
Override the STATIC_URL here. Unless you're hosting Paperless off a
|
||||
subdomain like /paperless/, you probably don't need to change this.
|
||||
|
||||
Defaults to "/static/".
|
||||
|
||||
|
||||
Software tweaks
|
||||
###############
|
||||
|
||||
PAPERLESS_TASK_WORKERS=<num>
|
||||
Paperless does multiple things in the background: Maintain the search index,
|
||||
maintain the automatic matching algorithm, check emails, consume documents,
|
||||
etc. This variable specifies how many things it will do in parallel.
|
||||
|
||||
PAPERLESS_THREADS_PER_WORKER=<num>
|
||||
Furthermore, paperless uses multiple threads when consuming documents to
|
||||
speed up OCR. This variable specifies how many pages paperless will process
|
||||
in parallel on a single document.
|
||||
|
||||
.. caution::
|
||||
|
||||
Ensure that the product
|
||||
|
||||
PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
|
||||
|
||||
does not exceed your CPU core count or else paperless will be extremely slow.
|
||||
If you want paperless to process many documents in parallel, choose a high
|
||||
worker count. If you want paperless to process very large documents faster,
|
||||
use a higher thread per worker count.
|
||||
|
||||
The default is a balance between the two, according to your CPU core count,
|
||||
with a slight favor towards threads per worker, and using as much cores as
|
||||
possible.
|
||||
|
||||
If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
|
||||
PAPERLESS_THREADS_PER_WORKER automatically.
|
||||
|
||||
|
||||
|
||||
PAPERLESS_TIME_ZONE=<timezone>
|
||||
Set the time zone here.
|
||||
See https://docs.djangoproject.com/en/3.1/ref/settings/#std:setting-TIME_ZONE
|
||||
for details on how to set it.
|
||||
|
||||
Defaults to UTC.
|
||||
|
||||
|
||||
|
||||
PAPERLESS_OCR_LANGUAGE=<lang>
|
||||
Customize the default language that tesseract will attempt to use when
|
||||
parsing documents. The default language is used whenever
|
||||
|
||||
* No language could be detected on a document
|
||||
* No tesseract data files are available for the detected language
|
||||
|
||||
It should be a 3-letter language code consistent with ISO
|
||||
639: https://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
|
||||
Set this to the language most of your documents are written in.
|
||||
|
||||
Defaults to "eng".
|
||||
|
||||
PAPERLESS_OCR_ALWAYS=<bool>
|
||||
By default Paperless does not OCR a document if the text can be retrieved from
|
||||
the document directly. Set to true to always OCR documents.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
PAPERLESS_CONSUMER_POLLING=<num>
|
||||
If paperless won't find documents added to your consume folder, it might
|
||||
not be able to automatically detect filesystem changes. In that case,
|
||||
specify a polling interval in seconds here, which will then cause paperless
|
||||
to periodically check your consumption directory for changes.
|
||||
|
||||
Defaults to 0, which disables polling and uses filesystem notifiactions.
|
||||
|
||||
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
|
||||
When the consumer detects a duplicate document, it will not touch the
|
||||
original document. This default behavior can be changed here.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
|
||||
On smaller systems, or even in the case of Very Large Documents, the consumer
|
||||
may explode, complaining about how it's "unable to extend pixel cache". In
|
||||
such cases, try setting this to a reasonably low value, like 32. The
|
||||
default is to use whatever is necessary to do everything without writing to
|
||||
disk, and units are in megabytes.
|
||||
|
||||
For more information on how to use this value, you should search
|
||||
the web for "MAGICK_MEMORY_LIMIT".
|
||||
|
||||
Defaults to 0, which disables the limit.
|
||||
|
||||
PAPERLESS_CONVERT_TMPDIR=<path>
|
||||
Similar to the memory limit, if you've got a small system and your OS mounts
|
||||
/tmp as tmpfs, you should set this to a path that's on a physical disk, like
|
||||
/home/your_user/tmp or something. ImageMagick will use this as scratch space
|
||||
when crunching through very large documents.
|
||||
|
||||
For more information on how to use this value, you should search
|
||||
the web for "MAGICK_TMPDIR".
|
||||
|
||||
Default is none, which disables the temporary directory.
|
||||
|
||||
PAPERLESS_CONVERT_DENSITY=<num>
|
||||
This setting has a high impact on the physical size of tmp page files,
|
||||
the speed of document conversion, and can affect the accuracy of OCR
|
||||
results. Individual results can vary and this setting should be tested
|
||||
thoroughly against the documents you are importing to see if it has any
|
||||
impacts either negative or positive.
|
||||
Testing on limited document sets has shown a setting of 200 can cut the
|
||||
size of tmp files by 1/3, and speed up conversion by up to 4x
|
||||
with little impact to OCR accuracy.
|
||||
|
||||
Default is 300.
|
||||
|
||||
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
|
||||
Use optipng to optimize thumbnails. This usually reduces the sice of
|
||||
thumbnails by about 20%, but uses considerable compute time during
|
||||
consumption.
|
||||
|
||||
Defaults to true.
|
||||
|
||||
PAPERLESS_POST_CONSUME_SCRIPT=<filename>
|
||||
After a document is consumed, Paperless can trigger an arbitrary script if
|
||||
you like. This script will be passed a number of arguments for you to work
|
||||
with. For more information, take a look at :ref:`advanced-post_consume_script`.
|
||||
|
||||
The default is blank, which means nothing will be executed.
|
||||
|
||||
PAPERLESS_FILENAME_DATE_ORDER=<format>
|
||||
Paperless will check the document text for document date information.
|
||||
Use this setting to enable checking the document filename for date
|
||||
information. The date order can be set to any option as specified in
|
||||
https://dateparser.readthedocs.io/en/latest/settings.html#date-order.
|
||||
The filename will be checked first, and if nothing is found, the document
|
||||
text will be checked as normal.
|
||||
|
||||
Defaults to none, which disables this feature.
|
||||
|
||||
PAPERLESS_FILENAME_PARSE_TRANSFORMS
|
||||
Transforms filenames before they are processed by paperless. See
|
||||
:ref:`advanced-transforming_filenames` for details.
|
||||
|
||||
Defaults to none, which disables this feature.
|
||||
|
||||
Binaries
|
||||
########
|
||||
|
||||
There are a few external software packages that Paperless expects to find on
|
||||
your system when it starts up. Unless you've done something creative with
|
||||
their installation, you probably won't need to edit any of these. However,
|
||||
if you've installed these programs somewhere where simply typing the name of
|
||||
the program doesn't automatically execute it (ie. the program isn't in your
|
||||
$PATH), then you'll need to specify the literal path for that program.
|
||||
|
||||
PAPERLESS_CONVERT_BINARY=<path>
|
||||
Defaults to "/usr/bin/convert".
|
||||
|
||||
PAPERLESS_GS_BINARY=<path>
|
||||
Defaults to "/usr/bin/gs".
|
||||
|
||||
PAPERLESS_UNPAPER_BINARY=<path>
|
||||
Defaults to "/usr/bin/unpaper".
|
||||
|
||||
PAPERLESS_OPTIPNG_BINARY=<path>
|
||||
Defaults to "/usr/bin/optipng".
|
@ -1,255 +0,0 @@
|
||||
.. _consumption:
|
||||
|
||||
Consumption
|
||||
###########
|
||||
|
||||
Once you've got Paperless setup, you need to start feeding documents into it.
|
||||
Currently, there are three options: the consumption directory, IMAP (email), and
|
||||
HTTP POST.
|
||||
|
||||
|
||||
.. _consumption-directory:
|
||||
|
||||
The Consumption Directory
|
||||
=========================
|
||||
|
||||
The primary method of getting documents into your database is by putting them in
|
||||
the consumption directory. The ``document_consumer`` script runs in an infinite
|
||||
loop looking for new additions to this directory and when it finds them, it goes
|
||||
about the process of parsing them with the OCR, indexing what it finds, and
|
||||
encrypting the PDF (if ``PAPERLESS_PASSPHRASE`` is set), storing it in the
|
||||
media directory.
|
||||
|
||||
Getting stuff into this directory is up to you. If you're running Paperless
|
||||
on your local computer, you might just want to drag and drop files there, but if
|
||||
you're running this on a server and want your scanner to automatically push
|
||||
files to this directory, you'll need to setup some sort of service to accept the
|
||||
files from the scanner. Typically, you're looking at an FTP server like
|
||||
`Proftpd`_ or `Samba`_.
|
||||
|
||||
.. _Proftpd: http://www.proftpd.org/
|
||||
.. _Samba: http://www.samba.org/
|
||||
|
||||
So where is this consumption directory? It's wherever you define it. Look for
|
||||
the ``CONSUMPTION_DIR`` value in ``settings.py``. Set that to somewhere
|
||||
appropriate for your use and put some documents in there. When you're ready,
|
||||
follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
|
||||
|
||||
|
||||
.. _consumption-directory-hook:
|
||||
|
||||
Hooking into the Consumption Process
|
||||
------------------------------------
|
||||
|
||||
Sometimes you may want to do something arbitrary whenever a document is
|
||||
consumed. Rather than try to predict what you may want to do, Paperless lets
|
||||
you execute scripts of your own choosing just before or after a document is
|
||||
consumed using a couple simple hooks.
|
||||
|
||||
Just write a script, put it somewhere that Paperless can read & execute, and
|
||||
then put the path to that script in ``paperless.conf`` with the variable name
|
||||
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
|
||||
``PAPERLESS_POST_CONSUME_SCRIPT``. The script will be executed before or
|
||||
or after the document is consumed respectively.
|
||||
|
||||
.. important::
|
||||
|
||||
These scripts are executed in a **blocking** process, which means that if
|
||||
a script takes a long time to run, it can significantly slow down your
|
||||
document consumption flow. If you want things to run asynchronously,
|
||||
you'll have to fork the process in your script and exit.
|
||||
|
||||
|
||||
.. _consumption-directory-hook-variables:
|
||||
|
||||
What Can These Scripts Do?
|
||||
..........................
|
||||
|
||||
It's your script, so you're only limited by your imagination and the laws of
|
||||
physics. However, the following values are passed to the scripts in order:
|
||||
|
||||
|
||||
.. _consumption-director-hook-variables-pre:
|
||||
|
||||
Pre-consumption script
|
||||
::::::::::::::::::::::
|
||||
|
||||
* Document file name
|
||||
|
||||
A simple but common example for this would be creating a simple script like
|
||||
this:
|
||||
|
||||
``/usr/local/bin/ocr-pdf``
|
||||
|
||||
.. code:: bash
|
||||
|
||||
#!/usr/bin/env bash
|
||||
pdf2pdfocr.py -i ${1}
|
||||
|
||||
``/etc/paperless.conf``
|
||||
|
||||
.. code:: bash
|
||||
|
||||
...
|
||||
PAPERLESS_PRE_CONSUME_SCRIPT="/usr/local/bin/ocr-pdf"
|
||||
...
|
||||
|
||||
This will pass the path to the document about to be consumed to ``/usr/local/bin/ocr-pdf``,
|
||||
which will in turn call `pdf2pdfocr.py`_ on your document, which will then
|
||||
overwrite the file with an OCR'd version of the file and exit. At which point,
|
||||
the consumption process will begin with the newly modified file.
|
||||
|
||||
.. _pdf2pdfocr.py: https://github.com/LeoFCardoso/pdf2pdfocr
|
||||
|
||||
|
||||
.. _consumption-director-hook-variables-post:
|
||||
|
||||
Post-consumption script
|
||||
:::::::::::::::::::::::
|
||||
|
||||
* Document id
|
||||
* Generated file name
|
||||
* Source path
|
||||
* Thumbnail path
|
||||
* Download URL
|
||||
* Thumbnail URL
|
||||
* Correspondent
|
||||
* Tags
|
||||
|
||||
The script can be in any language you like, but for a simple shell script
|
||||
example, you can take a look at ``post-consumption-example.sh`` in the
|
||||
``scripts`` directory in this project.
|
||||
|
||||
|
||||
.. _consumption-imap:
|
||||
|
||||
IMAP (Email)
|
||||
============
|
||||
|
||||
Another handy way to get documents into your database is to email them to
|
||||
yourself. The typical use-case would be to be out for lunch and want to send a
|
||||
copy of the receipt back to your system at home. Paperless can be taught to
|
||||
pull emails down from an arbitrary account and dump them into the consumption
|
||||
directory where the process :ref:`above <consumption-directory>` will follow the
|
||||
usual pattern on consuming the document.
|
||||
|
||||
Some things you need to know about this feature:
|
||||
|
||||
* It's disabled by default. By setting the values below it will be enabled.
|
||||
* It's been tested in a limited environment, so it may not work for you (please
|
||||
submit a pull request if you can!)
|
||||
* It's designed to **delete mail from the server once consumed**. So don't go
|
||||
pointing this to your personal email account and wonder where all your stuff
|
||||
went.
|
||||
* Currently, only one photo (attachment) per email will work.
|
||||
|
||||
So, with all that in mind, here's what you do to get it running:
|
||||
|
||||
1. Setup a new email account somewhere, or if you're feeling daring, create a
|
||||
folder in an existing email box and note the path to that folder.
|
||||
2. In ``/etc/paperless.conf`` set all of the appropriate values in
|
||||
``PATHS AND FOLDERS`` and ``SECURITY``.
|
||||
If you decided to use a subfolder of an existing account, then make sure you
|
||||
set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
|
||||
the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
|
||||
have to include that in every email you send.
|
||||
3. Restart the :ref:`consumer <utilities-consumer>`. The consumer will check
|
||||
the configured email account at startup and from then on every 10 minutes
|
||||
for something new and pulls down whatever it finds.
|
||||
4. Send yourself an email! Note that the subject is treated as the file name,
|
||||
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
|
||||
get what you expect. Also, you must include the aforementioned secret
|
||||
string in every email so the fetcher knows that it's safe to import.
|
||||
Note that Paperless only allows the email title to consist of safe characters
|
||||
to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
|
||||
5. After a few minutes, the consumer will poll your mailbox, pull down the
|
||||
message, and place the attachment in the consumption directory with the
|
||||
appropriate name. A few minutes later, the consumer will import it like any
|
||||
other file.
|
||||
|
||||
|
||||
.. _consumption-http:
|
||||
|
||||
HTTP POST
|
||||
=========
|
||||
|
||||
You can also submit a document via HTTP POST, so long as you do so after
|
||||
authenticating. To push your document to Paperless, send an HTTP POST to the
|
||||
server with the following name/value pairs:
|
||||
|
||||
* ``correspondent``: The name of the document's correspondent. Note that there
|
||||
are restrictions on what characters you can use here. Specifically,
|
||||
alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
|
||||
out. You also can't use the sequence ` - ` (space, dash, space).
|
||||
* ``title``: The title of the document. The rules for characters is the same
|
||||
here as the correspondent.
|
||||
* ``document``: The file you're uploading
|
||||
|
||||
Specify ``enctype="multipart/form-data"``, and then POST your file with::
|
||||
|
||||
Content-Disposition: form-data; name="document"; filename="whatever.pdf"
|
||||
|
||||
An example of this in HTML is a typical form:
|
||||
|
||||
.. code:: html
|
||||
|
||||
<form method="post" enctype="multipart/form-data">
|
||||
<input type="text" name="correspondent" value="My Correspondent" />
|
||||
<input type="text" name="title" value="My Title" />
|
||||
<input type="file" name="document" />
|
||||
<input type="submit" name="go" value="Do the thing" />
|
||||
</form>
|
||||
|
||||
But a potentially more useful way to do this would be in Python. Here we use
|
||||
the requests library to handle basic authentication and to send the POST data
|
||||
to the URL.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import os
|
||||
|
||||
from hashlib import sha256
|
||||
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
# You authenticate via BasicAuth or with a session id.
|
||||
# We use BasicAuth here
|
||||
username = "my-username"
|
||||
password = "my-super-secret-password"
|
||||
|
||||
# Where you have Paperless installed and listening
|
||||
url = "http://localhost:8000/push"
|
||||
|
||||
# Document metadata
|
||||
correspondent = "Test Correspondent"
|
||||
title = "Test Title"
|
||||
|
||||
# The local file you want to push
|
||||
path = "/path/to/some/directory/my-document.pdf"
|
||||
|
||||
|
||||
with open(path, "rb") as f:
|
||||
|
||||
response = requests.post(
|
||||
url=url,
|
||||
data={"title": title, "correspondent": correspondent},
|
||||
files={"document": (os.path.basename(path), f, "application/pdf")},
|
||||
auth=HTTPBasicAuth(username, password),
|
||||
allow_redirects=False
|
||||
)
|
||||
|
||||
if response.status_code == 202:
|
||||
|
||||
# Everything worked out ok
|
||||
print("Upload successful")
|
||||
|
||||
else:
|
||||
|
||||
# If you don't get a 202, it's probably because your credentials
|
||||
# are wrong or something. This will give you a rough idea of what
|
||||
# happened.
|
||||
|
||||
print("We got HTTP status code: {}".format(response.status_code))
|
||||
for k, v in response.headers.items():
|
||||
print("{}: {}".format(k, v))
|
@ -3,6 +3,10 @@
|
||||
Contributing to Paperless
|
||||
#########################
|
||||
|
||||
.. warning::
|
||||
|
||||
This section is not updated to paperless-ng yet.
|
||||
|
||||
Maybe you've been using Paperless for a while and want to add a feature or two,
|
||||
or maybe you've come across a bug that you have some ideas how to solve. The
|
||||
beauty of Free software is that you can see what's wrong and help to get it
|
||||
|
@ -1,42 +0,0 @@
|
||||
.. _customising:
|
||||
|
||||
Customising Paperless
|
||||
#####################
|
||||
|
||||
Currently, the Paperless' interface is just the default Django admin, which
|
||||
while powerful, is rather boring. If you'd like to give the site a bit of a
|
||||
face-lift, or if you simply want to adjust the colours, contrast, or font size
|
||||
to make things easier to read, you can do that by adding your own CSS or
|
||||
Javascript quite easily.
|
||||
|
||||
|
||||
.. _customising-overrides:
|
||||
|
||||
Overrides
|
||||
=========
|
||||
|
||||
On every page load, Paperless looks for two files in your media root directory
|
||||
(the directory defined by your ``PAPERLESS_MEDIADIR`` configuration variable or
|
||||
the default, ``<project root>/media/``) for two files:
|
||||
|
||||
* ``overrides.css``
|
||||
* ``overrides.js``
|
||||
|
||||
If it finds either or both of those files, they'll be loaded into the page: the
|
||||
CSS in the ``<head>``, and the Javascript stuffed into the last line of the
|
||||
``<body>``.
|
||||
|
||||
|
||||
.. _customising-overrides-note:
|
||||
|
||||
An important note about customisation
|
||||
-------------------------------------
|
||||
|
||||
Any changes you make to the site with your CSS or Javascript are likely to
|
||||
depend on the structure of the current HTML and/or the existing CSS rules. For
|
||||
the most part it's safe to assume that these bits won't change, but *sometimes
|
||||
they do* as features are added or bugs are fixed.
|
||||
|
||||
If you make a change that you think others would appreciate though, submit it
|
||||
as a pull request and maybe we can find a way to work it into the project by
|
||||
default!
|
@ -3,6 +3,10 @@
|
||||
Extending Paperless
|
||||
===================
|
||||
|
||||
.. warning::
|
||||
|
||||
This section is not updated to paperless-ng yet.
|
||||
|
||||
For the most part, Paperless is monolithic, so extending it is often best
|
||||
managed by way of modifying the code directly and issuing a pull request on
|
||||
`GitHub`_. However, over time the project has been evolving to be a little
|
||||
|
57
docs/faq.rst
Normal file
@ -0,0 +1,57 @@
|
||||
|
||||
**************************
|
||||
Frequently asked questions
|
||||
**************************
|
||||
|
||||
**Q:** *I'm using docker. Where are my documents?*
|
||||
|
||||
**A:** Your documents are stored inside the docker volume ``paperless_media``.
|
||||
Docker manages this volume automatically for you. It is a persistent storage
|
||||
and will persist as long as you don't explicitly delete it. The actual location
|
||||
depends on your host operating system. On Linux, chances are high that this location
|
||||
is
|
||||
|
||||
.. code::
|
||||
|
||||
/var/lib/docker/volumes/paperless_media/_data
|
||||
|
||||
.. caution::
|
||||
|
||||
Dont mess with this folder. Don't change permissions and don't move
|
||||
files around manually. This folder is meant to be entirely managed by docker
|
||||
and paperless.
|
||||
|
||||
**Q:** *Will paperless-ng run on Raspberry Pi?*
|
||||
|
||||
**A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
|
||||
The long answer is that certain parts of
|
||||
Paperless will run very slow, such as the tesseract OCR. On Rasperry Pi,
|
||||
try to OCR documents before feeding them into paperless so that paperless can
|
||||
reuse the text. The web interface should be alot snappier, since it runs
|
||||
in your browser and paperless has to do much less work to serve the data.
|
||||
|
||||
.. note::
|
||||
|
||||
Consider setting ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to false to speed up
|
||||
the consumption process. This takes quite a bit of time on Raspberry Pi.
|
||||
|
||||
.. note::
|
||||
|
||||
Updating the :ref:`automatic matching algorithm <advanced-automatic_matching>`
|
||||
takes quite a bit of time. However, the update mechanism checks if your
|
||||
data has changed before doing the heavy lifting. If you experience the
|
||||
algorithm taking too much cpu time, consider changing the schedule in the
|
||||
admin interface to daily or weekly. You can also manually invoke the task
|
||||
by changing the date and time of the next run to today/now.
|
||||
|
||||
The actual matching of the algorithm is fast and works on Raspberry Pi as
|
||||
well as on any other device.
|
||||
|
||||
|
||||
|
||||
**Q:** *How do I install paperless-ng on Raspberry Pi?*
|
||||
|
||||
**A:** There is not docker image for ARM available. If you know how to build
|
||||
that automatically, I'm all ears. For now, you have to grab the latest release
|
||||
archive from the project page and build the image yourself. The release comes
|
||||
with the front end already compiled, so you don't have to do this on the Pi.
|
@ -1,131 +0,0 @@
|
||||
.. _guesswork:
|
||||
|
||||
Guesswork
|
||||
#########
|
||||
|
||||
During the consumption process, Paperless tries to guess some of the attributes
|
||||
of the document it's looking at. To do this it uses two approaches:
|
||||
|
||||
|
||||
.. _guesswork-naming:
|
||||
|
||||
File Naming
|
||||
===========
|
||||
|
||||
Any document you put into the consumption directory will be consumed, but if
|
||||
you name the file right, it'll automatically set some values in the database
|
||||
for you. This is is the logic the consumer follows:
|
||||
|
||||
1. Try to find the correspondent, title, and tags in the file name following
|
||||
the pattern: ``Date - Correspondent - Title - tag,tag,tag.pdf``. Note that
|
||||
the format of the date is **rigidly defined** as ``YYYYMMDDHHMMSSZ`` or
|
||||
``YYYYMMDDZ``. The ``Z`` refers "Zulu time" AKA "UTC".
|
||||
The tags are optional, so the format ``Date - Correspondent - Title.pdf``
|
||||
works as well.
|
||||
2. If that doesn't work, we skip the date and try this pattern:
|
||||
``Correspondent - Title - tag,tag,tag.pdf``.
|
||||
3. If that doesn't work, we try to find the correspondent and title in the file
|
||||
name following the pattern: ``Correspondent - Title.pdf``.
|
||||
4. If that doesn't work, just assume that the name of the file is the title.
|
||||
|
||||
So given the above, the following examples would work as you'd expect:
|
||||
|
||||
* ``20150314000700Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``20150314Z - Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``Some Company Name - Invoice 2016-01-01 - money,invoices.pdf``
|
||||
* ``Another Company - Letter of Reference.jpg``
|
||||
* ``Dad's Recipe for Pancakes.png``
|
||||
|
||||
These however wouldn't work:
|
||||
|
||||
* ``2015-03-14 00:07:00 UTC - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``2015-03-14 - Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
|
||||
* ``Another Company- Letter of Reference.jpg``
|
||||
|
||||
Do I have to be so strict about naming?
|
||||
---------------------------------------
|
||||
Rather than using the strict document naming rules, one can also set the option
|
||||
``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
|
||||
that is accepted by dateparser_. Doing so will cause ``paperless`` to default
|
||||
to any date format that is found in the title, instead of a date pulled from
|
||||
the document's text, without requiring the strict formatting of the document
|
||||
filename as described above.
|
||||
|
||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
||||
|
||||
Transforming filenames for parsing
|
||||
----------------------------------
|
||||
Some devices can't produce filenames that can be parsed by the default
|
||||
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
|
||||
``paperless.conf`` one can add transformations that are applied to the filename
|
||||
before it's parsed.
|
||||
|
||||
The option contains a list of dictionaries of regular expressions (key:
|
||||
``pattern``) and replacements (key: ``repl``) in JSON format, which are
|
||||
applied in order by passing them to ``re.subn``. Transformation stops
|
||||
after the first match, so at most one transformation is applied. The general
|
||||
syntax is
|
||||
|
||||
.. code:: python
|
||||
|
||||
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
|
||||
|
||||
The example below is for a Brother ADS-2400N, a scanner that allows
|
||||
different names to different hardware buttons (useful for handling
|
||||
multiple entities in one instance), but insists on adding ``_<count>``
|
||||
to the filename.
|
||||
|
||||
.. code:: python
|
||||
|
||||
# Brother profile configuration, support "Name_Date_Count" (the default
|
||||
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
|
||||
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
|
||||
|
||||
.. _guesswork-content:
|
||||
|
||||
Reading the Document Contents
|
||||
=============================
|
||||
|
||||
After the consumer has tried to figure out what it could from the file name,
|
||||
it starts looking at the content of the document itself. It will compare the
|
||||
matching algorithms defined by every tag and correspondent already set in your
|
||||
database to see if they apply to the text in that document. In other words,
|
||||
if you defined a tag called ``Home Utility`` that had a ``match`` property of
|
||||
``bc hydro`` and a ``matching_algorithm`` of ``literal``, Paperless will
|
||||
automatically tag your newly-consumed document with your ``Home Utility`` tag
|
||||
so long as the text ``bc hydro`` appears in the body of the document somewhere.
|
||||
|
||||
The matching logic is quite powerful, and supports searching the text of your
|
||||
document with different algorithms, and as such, some experimentation may be
|
||||
necessary to get things Just Right.
|
||||
|
||||
|
||||
.. _guesswork-content-howto:
|
||||
|
||||
How Do I Set Up These Matching Algorithms?
|
||||
------------------------------------------
|
||||
|
||||
Setting up of the algorithms is easily done through the admin interface. When
|
||||
you create a new correspondent or tag, there are optional fields for matching
|
||||
text and matching algorithm. From the help info there:
|
||||
|
||||
.. note::
|
||||
|
||||
Which algorithm you want to use when matching text to the OCR'd PDF. Here,
|
||||
"any" looks for any occurrence of any word provided in the PDF, while "all"
|
||||
requires that every word provided appear in the PDF, albeit not in the
|
||||
order provided. A "literal" match means that the text you enter must
|
||||
appear in the PDF exactly as you've entered it, and "regular expression"
|
||||
uses a regex to match the PDF. If you don't know what a regex is, you
|
||||
probably don't want this option.
|
||||
|
||||
When using the "any" or "all" matching algorithms, you can search for terms
|
||||
that consist of multiple words by enclosing them in double quotes. For example,
|
||||
defining a match text of ``"Bank of America" BofA`` using the "any" algorithm,
|
||||
will match documents that contain either "Bank of America" or "BofA", but will
|
||||
not match documents containing "Bank of South America".
|
||||
|
||||
Then just save your tag/correspondent and run another document through the
|
||||
consumer. Once complete, you should see the newly-created document,
|
||||
automatically tagged with the appropriate data.
|
@ -1,17 +1,14 @@
|
||||
.. _index:
|
||||
|
||||
*********
|
||||
Paperless
|
||||
=========
|
||||
*********
|
||||
|
||||
Paperless is a simple Django application running in two parts:
|
||||
a :ref:`consumer <utilities-consumer>` (the thing that does the indexing) and
|
||||
the :ref:`webserver <utilities-webserver>` (the part that lets you search &
|
||||
a *Consumer* (the thing that does the indexing) and
|
||||
the *Web server* (the part that lets you search &
|
||||
download already-indexed documents). If you want to learn more about its
|
||||
functions keep on reading after the installation section.
|
||||
|
||||
|
||||
.. _index-why-this-exists:
|
||||
|
||||
Why This Exists
|
||||
===============
|
||||
|
||||
@ -25,26 +22,51 @@ finding stuff again. I feed documents right from the post box into the scanner
|
||||
and then shred them. Perhaps you might find it useful too.
|
||||
|
||||
|
||||
Paperless-ng
|
||||
============
|
||||
|
||||
Paperless-ng is a fork of the original paperless project. It changes many
|
||||
things both on the surface and under the hood. Paperless-ng was created
|
||||
because I feel that these changes are too big to be pushed into the main
|
||||
repository right away.
|
||||
|
||||
NG stands for both Angular (the framework used for the
|
||||
Frontend) and next-gen. Publishing this project under a different name also
|
||||
avoids confusion between paperless and paperless-ng.
|
||||
|
||||
If you want to learn about what's different in paperless-ng, check out these
|
||||
resources in the documentation:
|
||||
|
||||
* :ref:`Some screenshots <screenshots>` of the new UI are available.
|
||||
* Read :ref:`this section <advanced-automatic_matching>` if you want to
|
||||
learn about how paperless automates all tagging using machine learning.
|
||||
* Paperless now comes with a :ref:`proper email consumer <usage-email>`
|
||||
that's fully tested and production ready.
|
||||
* See :ref:`this note <utilities-encyption>` about GnuPG encryption in
|
||||
paperless-ng.
|
||||
* The :ref:`changelog <paperless_changelog>` contains a detailed list of all changes
|
||||
in paperless-ng.
|
||||
|
||||
It would be great if this project could eventually merge back into the main
|
||||
repository, but it needs a lot more work before that can happen.
|
||||
|
||||
|
||||
Contents
|
||||
========
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:maxdepth: 1
|
||||
|
||||
requirements
|
||||
setup
|
||||
consumption
|
||||
usage_overview
|
||||
advanced_usage
|
||||
administration
|
||||
configuration
|
||||
api
|
||||
utilities
|
||||
guesswork
|
||||
migrating
|
||||
customising
|
||||
faq
|
||||
extending
|
||||
troubleshooting
|
||||
contributing
|
||||
scanners
|
||||
screenshots
|
||||
changelog
|
||||
changelog_jonaswinkler
|
||||
|
@ -1,109 +0,0 @@
|
||||
.. _migrating:
|
||||
|
||||
Migrating, Updates, and Backups
|
||||
===============================
|
||||
|
||||
As Paperless is still under active development, there's a lot that can change
|
||||
as software updates roll out. You should backup often, so if anything goes
|
||||
wrong during an update, you at least have a means of restoring to something
|
||||
usable. Thankfully, there are automated ways of backing up, restoring, and
|
||||
updating the software.
|
||||
|
||||
|
||||
.. _migrating-backup:
|
||||
|
||||
Backing Up
|
||||
----------
|
||||
|
||||
So you're bored of this whole project, or you want to make a remote backup of
|
||||
your files for whatever reason. This is easy to do, simply use the
|
||||
:ref:`exporter <utilities-exporter>` to dump your documents and database out
|
||||
into an arbitrary directory.
|
||||
|
||||
|
||||
.. _migrating-restoring:
|
||||
|
||||
Restoring
|
||||
---------
|
||||
|
||||
Restoring your data is just as easy, since nearly all of your data exists either
|
||||
in the file names, or in the contents of the files themselves. You just need to
|
||||
create an empty database (just follow the
|
||||
:ref:`installation instructions <setup-installation>` again) and then import the
|
||||
``tags.json`` file you created as part of your backup. Lastly, copy your
|
||||
exported documents into the consumption directory and start up the consumer.
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ rm data/db.sqlite3 # Delete the database
|
||||
$ cd src
|
||||
$ ./manage.py migrate # Create the database
|
||||
$ ./manage.py createsuperuser
|
||||
$ ./manage.py loaddata /path/to/arbitrary/place/tags.json
|
||||
$ cp /path/to/exported/docs/* /path/to/consumption/dir/
|
||||
$ ./manage.py document_consumer
|
||||
|
||||
Importing your data if you are :ref:`using Docker <setup-installation-docker>`
|
||||
is almost as simple:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
# Stop and remove your current containers
|
||||
$ docker-compose stop
|
||||
$ docker-compose rm -f
|
||||
|
||||
# Recreate them, add the superuser
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
# Load the tags
|
||||
$ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
|
||||
|
||||
# Load your exported documents into the consumption directory
|
||||
# (How you do this highly depends on how you have set this up)
|
||||
$ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
|
||||
|
||||
After loading the documents into the consumption directory the consumer will
|
||||
immediately start consuming the documents.
|
||||
|
||||
|
||||
.. _migrating-updates:
|
||||
|
||||
Updates
|
||||
-------
|
||||
|
||||
For the most part, all you have to do to update Paperless is run ``git pull``
|
||||
on the directory containing the project files, and then use Django's
|
||||
``migrate`` command to execute any database schema updates that might have been
|
||||
rolled in as part of the update:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ pip install -r requirements.txt
|
||||
$ cd src
|
||||
$ ./manage.py migrate
|
||||
|
||||
Note that it's possible (even likely) that while ``git pull`` may update some
|
||||
files, the ``migrate`` step may not update anything. This is totally normal.
|
||||
|
||||
Additionally, as new features are added, the ability to control those features
|
||||
is typically added by way of an environment variable set in ``paperless.conf``.
|
||||
You may want to take a look at the ``paperless.conf.example`` file to see if
|
||||
there's anything new in there compared to what you've got in ``/etc``.
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>` the update process
|
||||
is similar:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ docker build -t paperless .
|
||||
$ docker-compose run --rm consumer migrate
|
||||
$ docker-compose up -d
|
||||
|
||||
If ``git pull`` doesn't report any changes, there is no need to continue with
|
||||
the remaining steps.
|
@ -1,125 +0,0 @@
|
||||
.. _requirements:
|
||||
|
||||
Requirements
|
||||
============
|
||||
|
||||
You need a Linux machine or Unix-like setup (theoretically an Apple machine
|
||||
should work) that has the following software installed:
|
||||
|
||||
* `Python3`_ (with development libraries, pip and virtualenv)
|
||||
* `GNU Privacy Guard`_
|
||||
* `Tesseract`_, plus its language files matching your document base.
|
||||
* `Imagemagick`_ version 6.7.5 or higher
|
||||
* `unpaper`_
|
||||
* `libpoppler-cpp-dev`_ PDF rendering library
|
||||
* `optipng`_
|
||||
|
||||
.. _Python3: https://python.org/
|
||||
.. _GNU Privacy Guard: https://gnupg.org
|
||||
.. _Tesseract: https://github.com/tesseract-ocr
|
||||
.. _Imagemagick: http://imagemagick.org/
|
||||
.. _unpaper: https://github.com/unpaper/unpaper
|
||||
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
|
||||
.. _optipng: http://optipng.sourceforge.net/
|
||||
|
||||
Notably, you should confirm how you access your Python3 installation. Many
|
||||
Linux distributions will install Python3 in parallel to Python2, using the
|
||||
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
||||
``pip``. Running Paperless with Python2 will likely break things, so make sure
|
||||
that you're using the right version.
|
||||
|
||||
For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to
|
||||
refer to their Python3 versions.
|
||||
|
||||
In addition to the above, there are a number of Python requirements, all of
|
||||
which are listed in a file called ``requirements.txt`` in the project root
|
||||
directory.
|
||||
|
||||
If you're not working on a virtual environment (like Docker), you
|
||||
should probably be using a virtualenv, but that's your call. The reasons why
|
||||
you might choose a virtualenv or not aren't really within the scope of this
|
||||
document. Needless to say if you don't know what a virtualenv is, you should
|
||||
probably figure that out before continuing.
|
||||
|
||||
|
||||
.. _requirements-apple:
|
||||
|
||||
Problems with Imagemagick & PDFs
|
||||
--------------------------------
|
||||
|
||||
Some users have `run into problems`_ with getting ImageMagick to do its thing
|
||||
with PDFs. Often this is the case with Apple systems using HomeBrew, but other
|
||||
Linuxes have been a problem as well. The solution appears to be to install
|
||||
ghostscript as well as ImageMagick:
|
||||
|
||||
.. _run into problems: https://github.com/the-paperless-project/paperless/issues/25
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ brew install ghostscript
|
||||
$ brew install imagemagick
|
||||
$ brew install libmagic
|
||||
|
||||
|
||||
.. _requirements-baremetal:
|
||||
|
||||
Python-specific Requirements: No Virtualenv
|
||||
-------------------------------------------
|
||||
|
||||
If you don't care to use a virtual env, then installation of the Python
|
||||
dependencies is easy:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pip install --user --requirement /path/to/paperless/requirements.txt
|
||||
|
||||
This will download and install all of the requirements into
|
||||
``${HOME}/.local``. Remember that your distribution may be using ``pip3`` as
|
||||
mentioned above.
|
||||
|
||||
|
||||
.. _requirements-virtualenv:
|
||||
|
||||
Python-specific Requirements: Virtualenv
|
||||
----------------------------------------
|
||||
|
||||
Using a virtualenv for this is pretty straightforward: create a virtualenv,
|
||||
enter it, and install the requirements using the ``requirements.txt`` file:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ virtualenv --python=/path/to/python3 /path/to/arbitrary/directory
|
||||
$ . /path/to/arbitrary/directory/bin/activate
|
||||
$ pip install --requirement /path/to/paperless/requirements.txt
|
||||
|
||||
Now you're ready to go. Just remember to enter (activate) your virtualenv
|
||||
whenever you want to use Paperless.
|
||||
|
||||
|
||||
.. _requirements-documentation:
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
As generation of the documentation is not required for the use of Paperless,
|
||||
dependencies for this process are not included in ``requirements.txt``. If
|
||||
you'd like to generate your own docs locally, you'll need to:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pip install sphinx
|
||||
|
||||
and then cd into the ``docs`` directory and type ``make html``.
|
||||
|
||||
If you are using Docker, you can use the following commands to build the
|
||||
documentation and run a webserver serving it on `port 8001`_:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pwd
|
||||
/path/to/paperless
|
||||
|
||||
$ docker build -t paperless:docs -f docs/Dockerfile .
|
||||
$ docker run --rm -it -p "8001:8000" paperless:docs
|
||||
|
||||
.. _port 8001: http://127.0.0.1:8001
|
@ -1,7 +1,9 @@
|
||||
|
||||
.. _scanners:
|
||||
|
||||
Scanner Recommendations
|
||||
=======================
|
||||
***********************
|
||||
Scanner recommendations
|
||||
***********************
|
||||
|
||||
As Paperless operates by watching a folder for new files, doesn't care what
|
||||
scanner you use, but sometimes finding a scanner that will write to an FTP,
|
||||
@ -23,16 +25,19 @@ that works right for you based on recommentations from other Paperless users.
|
||||
+---------+----------------+-----+-----+-----+----------------+
|
||||
| Fujitsu | `ix500`_ | yes | | yes | `eonist`_ |
|
||||
+---------+----------------+-----+-----+-----+----------------+
|
||||
| Fujitsu | `S1300i`_ | yes | | yes | `jonaswinkler`_|
|
||||
+---------+----------------+-----+-----+-----+----------------+
|
||||
|
||||
.. _ADS-1500W: https://www.brother.ca/en/p/ads1500w
|
||||
.. _MFC-J6930DW: https://www.brother.ca/en/p/MFCJ6930DW
|
||||
.. _MFC-J5910DW: https://www.brother.co.uk/printers/inkjet-printers/mfcj5910dw
|
||||
.. _MFC-9142CDN: https://www.brother.co.uk/printers/laser-printers/mfc9140cdn
|
||||
.. _ix500: http://www.fujitsu.com/us/products/computing/peripheral/scanners/scansnap/ix500/
|
||||
.. _ix500: https://www.fujitsu.com/global/products/computing/peripheral/scanners/scansnap/ix500/
|
||||
.. _S1300i: https://www.fujitsu.com/global/products/computing/peripheral/scanners/soho/s1300i/
|
||||
|
||||
.. _danielquinn: https://github.com/danielquinn
|
||||
.. _ayounggun: https://github.com/ayounggun
|
||||
.. _bmsleight: https://github.com/bmsleight
|
||||
.. _eonist: https://github.com/eonist
|
||||
.. _REOLDEV: https://github.com/REOLDEV
|
||||
|
||||
.. _jonaswinkler: https://github.com/jonaswinkler
|
||||
|
@ -1,16 +1,48 @@
|
||||
.. _screenshots:
|
||||
|
||||
***********
|
||||
Screenshots
|
||||
===========
|
||||
***********
|
||||
|
||||
Once everything is set-up login to paperless using the web front-end
|
||||
This is what paperless-ng looks like. You shouldn't use paperless to index
|
||||
research papers though, its a horrible tool for that job.
|
||||
|
||||
.. image:: ./_static/Screenshot_first_run_login.png
|
||||
The dashboard shows customizable views on your document and allows document uploads:
|
||||
|
||||
Nice clean interface
|
||||
.. image:: _static/paperless-0-dashboard.png
|
||||
|
||||
.. image:: ./_static/Screenshot_first_logged.png
|
||||
The document list provides three different styles to scroll through your documents:
|
||||
|
||||
Some documents loaded in via ftp or using the scanners ftp.
|
||||
.. image:: _static/paperless-1-list-table.png
|
||||
.. image:: _static/paperless-2-list-smallcards.png
|
||||
.. image:: _static/paperless-3-list-largecards.png
|
||||
|
||||
Extensive filtering mechanisms:
|
||||
|
||||
.. image:: _static/paperless-4-filter.png
|
||||
|
||||
Side-by-side editing of documents. Optmized for 1080p.
|
||||
|
||||
.. image:: _static/paperless-5-editing.png
|
||||
|
||||
Tag editing. This looks about the same for correspondents and document types.
|
||||
|
||||
.. image:: _static/paperless-6-tags.png
|
||||
|
||||
Searching provides auto complete and highlights the results.
|
||||
|
||||
.. image:: _static/paperless-7-autocomplete.png
|
||||
.. image:: _static/paperless-8-search-results.png
|
||||
|
||||
The old admin is still there and accessible!
|
||||
|
||||
.. image:: _static/paperless-9-admin.png
|
||||
|
||||
Fancy mail filters!
|
||||
|
||||
.. image:: _static/paperless-11-mail-filters.png
|
||||
|
||||
Mobile support in the future? This doesn't really work yet.
|
||||
|
||||
.. image:: _static/paperless-10-mobile.png
|
||||
|
||||
.. image:: ./_static/Screenshot_upload_and_scanned.png
|
||||
|
658
docs/setup.rst
@ -1,500 +1,282 @@
|
||||
.. _setup:
|
||||
|
||||
*****
|
||||
Setup
|
||||
=====
|
||||
|
||||
Paperless isn't a very complicated app, but there are a few components, so some
|
||||
basic documentation is in order. If you follow along in this document and
|
||||
still have trouble, please open an `issue on GitHub`_ so I can fill in the
|
||||
gaps.
|
||||
|
||||
.. _issue on GitHub: https://github.com/the-paperless-project/paperless/issues
|
||||
|
||||
|
||||
.. _setup-download:
|
||||
*****
|
||||
|
||||
Download
|
||||
--------
|
||||
########
|
||||
|
||||
The source is currently only available via GitHub, so grab it from there,
|
||||
either by using ``git``:
|
||||
Go to the project page on GitHub and download the
|
||||
`latest release <https://github.com/jonaswinkler/paperless-ng/releases>`_.
|
||||
There are multiple options available.
|
||||
|
||||
.. code:: bash
|
||||
* Download the docker-compose files if you want to pull paperless from
|
||||
Docker Hub.
|
||||
|
||||
$ git clone https://github.com/the-paperless-project/paperless.git
|
||||
$ cd paperless
|
||||
* Download the archive and extract it if you want to build the docker image
|
||||
yourself or want to install paperless without docker.
|
||||
|
||||
or just download the tarball and go that route:
|
||||
.. hint::
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd to the directory where you want to run Paperless
|
||||
$ wget https://github.com/the-paperless-project/paperless/archive/master.zip
|
||||
$ unzip master.zip
|
||||
$ cd paperless-master
|
||||
In contrast to paperless, the recommended way to get and update paperless-ng
|
||||
is not to pull the entire git repository. Paperless-ng includes artifacts
|
||||
that need to be compiled, and that's already done for you in the release.
|
||||
|
||||
|
||||
.. _setup-installation:
|
||||
Overview of Paperless-ng
|
||||
########################
|
||||
|
||||
Installation & Configuration
|
||||
----------------------------
|
||||
Compared to paperless, paperless-ng works a little different under the hood and has
|
||||
more moving parts that work together. While this increases the complexity of
|
||||
the system, it also brings many benefits.
|
||||
|
||||
Paperless consists of the following components:
|
||||
|
||||
* **The webserver:** This is pretty much the same as in paperless. It serves
|
||||
the administration pages, the API, and the new frontend. This is the main
|
||||
tool you'll be using to interact with paperless. You may start the webserver
|
||||
with
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
|
||||
|
||||
or by any other means such as Apache ``mod_wsgi``.
|
||||
|
||||
* **The consumer:** This is what watches your consumption folder for documents.
|
||||
However, the consumer itself does not consume really consume your documents anymore.
|
||||
It rather notifies a task processor that a new file is ready for consumption.
|
||||
I suppose it should be named differently.
|
||||
This also used to check your emails, but that's now gone elsewhere as well.
|
||||
|
||||
Start the consumer with the management command ``document_consumer``:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run python3 manage.py document_consumer
|
||||
|
||||
* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
|
||||
for doing much of the heavy lifting. This is a task queue that accepts tasks from
|
||||
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
|
||||
certain commands periodically.
|
||||
|
||||
This task processor is responsible for:
|
||||
|
||||
* Consuming documents. When the consumer finds new documents, it notifies the task processor to
|
||||
start a consumption task.
|
||||
* Consuming emails. It periodically checks your configured accounts for new mails and
|
||||
produces consumption tasks for any documents it finds.
|
||||
* The task processor also performs the consumption of any documents you upload through
|
||||
the web interface.
|
||||
* Maintain the search index and the automatic matching algorithm. These are things that paperless
|
||||
needs to do from time to time in order to operate properly.
|
||||
|
||||
This allows paperless to process multiple documents from your consumption folder in parallel! On
|
||||
a modern multicore system, consumption with full ocr is blazing fast.
|
||||
|
||||
The task processor comes with a built-in admin interface that you can use to see whenever any of the
|
||||
tasks fail and inspect the errors.
|
||||
|
||||
You may start the task processor by executing:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run python3 manage.py qcluster
|
||||
|
||||
* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
|
||||
for getting the tasks from the webserver and consumer to the task scheduler. These run in different
|
||||
processes (maybe even on different machines!), and therefore, this is necessary.
|
||||
|
||||
* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
|
||||
added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
|
||||
|
||||
|
||||
Installation
|
||||
############
|
||||
|
||||
You can go multiple routes with setting up and running Paperless:
|
||||
|
||||
* The `bare metal route`_
|
||||
* The `docker route`_
|
||||
* A suggested `linux containers route`_
|
||||
* The `docker route`_
|
||||
* The `bare metal route`_
|
||||
|
||||
The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
|
||||
from above automatically so that it just works and uses sensible defaults for all configuration options.
|
||||
|
||||
The `bare metal route`_ is more complicated to setup but makes it easier
|
||||
should you want to contribute some code back. You need to configure and
|
||||
run the above mentioned components yourself.
|
||||
|
||||
Docker Route
|
||||
============
|
||||
|
||||
1. Install `Docker`_ and `docker-compose`_. [#compose]_
|
||||
|
||||
.. caution::
|
||||
|
||||
If you want to use the included ``docker-compose.*.yml`` file, you
|
||||
need to have at least Docker version **17.09.0** and docker-compose
|
||||
version **1.17.0**.
|
||||
|
||||
See the `Docker installation guide`_ on how to install the current
|
||||
version of Docker for your operating system or Linux distribution of
|
||||
choice. To get an up-to-date version of docker-compose, follow the
|
||||
`docker-compose installation guide`_ if your package repository doesn't
|
||||
include it.
|
||||
|
||||
.. _Docker installation guide: https://docs.docker.com/engine/installation/
|
||||
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
|
||||
|
||||
2. Copy either ``docker-compose.sqlite.yml`` or ``docker-compose.postgres.yml`` to
|
||||
``docker-compose.yml``, depending on which database backend you want to use.
|
||||
|
||||
.. hint::
|
||||
|
||||
For new installations, it is recommended to use postgresql as the database
|
||||
backend. This is due to the increased amount of concurrency in paperless-ng.
|
||||
|
||||
2. Modify ``docker-compose.yml`` to your preferences. You should change the path
|
||||
to the consumption directory in this file. Find the line that specifies where
|
||||
to mount the consumption directory:
|
||||
|
||||
.. code::
|
||||
|
||||
- ./consume:/usr/src/paperless/consume
|
||||
|
||||
Replace the part BEFORE the colon with a local directory of your choice:
|
||||
|
||||
.. code::
|
||||
|
||||
- /home/jonaswinkler/paperless-inbox:/usr/src/paperless/consume
|
||||
|
||||
Don't change the part after the colon or paperless wont find your documents.
|
||||
|
||||
|
||||
The `docker route`_ is quick & easy.
|
||||
3. Modify ``docker-compose.env``, following the comments in the file. The
|
||||
most important change is to set ``USERMAP_UID`` and ``USERMAP_GID``
|
||||
to the uid and gid of your user on the host system. This ensures that
|
||||
both the docker container and you on the host machine have write access
|
||||
to the consumption directory. If your UID and GID on the host system is
|
||||
1000 (the default for the first normal user on most systems), it will
|
||||
work out of the box without any modifications.
|
||||
|
||||
The `bare metal route`_ is a bit more complicated to setup but makes it easier
|
||||
should you want to contribute some code back.
|
||||
.. note::
|
||||
|
||||
The `linux containers route`_ is quick, but makes alot of assumptions on the
|
||||
set-up, on the other hand the script could be used to install on a base
|
||||
debian or ubuntu server.
|
||||
You can use any settings from the file ``paperless.conf`` in this file.
|
||||
Have a look at :ref:`configuration` to see whats available.
|
||||
|
||||
.. _docker route: setup-installation-docker_
|
||||
.. _bare metal route: setup-installation-bare-metal_
|
||||
.. _Docker Machine: https://docs.docker.com/machine/
|
||||
4. Run ``docker-compose up -d``. This will create and start the necessary
|
||||
containers. This will also build the image of paperless if you grabbed the
|
||||
source archive.
|
||||
|
||||
.. _setup-installation-bare-metal:
|
||||
5. To be able to login, you will need a super user. To create it, execute the
|
||||
following command:
|
||||
|
||||
Standard (Bare Metal)
|
||||
+++++++++++++++++++++
|
||||
.. code-block:: shell-session
|
||||
|
||||
1. Install the requirements as per the :ref:`requirements <requirements>` page.
|
||||
2. Within the extract of master.zip go to the ``src`` directory.
|
||||
3. Copy ``../paperless.conf.example`` to ``/etc/paperless.conf`` and open it in
|
||||
your favourite editor. As this file contains passwords. It should only be
|
||||
readable by user root and paperless! Set the values for:
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
Set the values for:
|
||||
|
||||
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
|
||||
dumped to be consumed by Paperless.
|
||||
* ``PAPERLESS_OCR_THREADS``: this is the number of threads the OCR process
|
||||
will spawn to process document pages in parallel.
|
||||
* ``PAPERLESS_PASSPHRASE``: this is only required if you want to use GPG to
|
||||
encrypt your document files. This is the passphrase Paperless uses to
|
||||
encrypt/decrypt the original documents. Don't worry about defining this
|
||||
if you don't want to use encryption (the default).
|
||||
|
||||
Note also that if you're using the ``runserver`` as mentioned below, you
|
||||
should make sure that PAPERLESS_DEBUG="true" or is just commented out as
|
||||
this is the default.
|
||||
|
||||
4. Initialise the SQLite database with ``./manage.py migrate``.
|
||||
5. Collect the static files for the webserver with ``./manage.py collectstatic``.
|
||||
6. Create a user for your Paperless instance with
|
||||
``./manage.py createsuperuser``. Follow the prompts to create your user.
|
||||
7. Start the webserver with ``./manage.py runserver <IP>:<PORT>``.
|
||||
If no specific IP or port is given, the default is ``127.0.0.1:8000`` also
|
||||
known as http://localhost:8000/.
|
||||
You should now be able to visit your (empty) installation at
|
||||
`Paperless webserver`_ or whatever you chose before. You can login with the
|
||||
user/pass you created in #5.
|
||||
|
||||
8. In a separate window, change to the ``src`` directory in this repo again,
|
||||
but this time, you should start the consumer script with
|
||||
``./manage.py document_consumer``.
|
||||
9. Scan something or put a file into the ``CONSUMPTION_DIR``.
|
||||
10. Wait a few minutes
|
||||
11. Visit the document list on your webserver, and it should be there, indexed
|
||||
and downloadable.
|
||||
|
||||
.. caution::
|
||||
|
||||
This installation is not secure. Once everything is working head over to
|
||||
`Making things more permanent`_
|
||||
|
||||
.. _Paperless webserver: http://127.0.0.1:8000
|
||||
.. _Making things more permanent: setup-permanent_
|
||||
|
||||
.. _setup-installation-docker:
|
||||
|
||||
Docker Method
|
||||
+++++++++++++
|
||||
|
||||
1. Install `Docker`_.
|
||||
|
||||
.. caution::
|
||||
|
||||
As mentioned earlier, this guide assumes that you use Docker natively
|
||||
under Linux. If you are using `Docker Machine`_ under Mac OS X or
|
||||
Windows, you will have to adapt IP addresses, volume-mounting, command
|
||||
execution and maybe more.
|
||||
|
||||
2. Install `docker-compose`_. [#compose]_
|
||||
|
||||
.. caution::
|
||||
|
||||
If you want to use the included ``docker-compose.yml.example`` file, you
|
||||
need to have at least Docker version **1.12.0** and docker-compose
|
||||
version **1.9.0**.
|
||||
|
||||
See the `Docker installation guide`_ on how to install the current
|
||||
version of Docker for your operating system or Linux distribution of
|
||||
choice. To get an up-to-date version of docker-compose, follow the
|
||||
`docker-compose installation guide`_ if your package repository doesn't
|
||||
include it.
|
||||
|
||||
.. _Docker installation guide: https://docs.docker.com/engine/installation/
|
||||
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
|
||||
|
||||
3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``
|
||||
and a copy of ``docker-compose.env.example`` as ``docker-compose.env``.
|
||||
You'll be editing both these files: taking a copy ensures that you can
|
||||
``git pull`` to receive updates without risking merge conflicts with your
|
||||
modified versions of the configuration files.
|
||||
4. Modify ``docker-compose.yml`` to your preferences, following the
|
||||
instructions in comments in the file. The only change that is a hard
|
||||
requirement is to specify where the consumption directory should
|
||||
mount.[#dockercomposeyml]_
|
||||
|
||||
.. caution::
|
||||
|
||||
If you are using NFS mounts for the consume directory you also need to
|
||||
change the command to turn off inotify as it doesn't work with NFS
|
||||
|
||||
``command: ["document_consumer", "--no-inotify"]``
|
||||
|
||||
|
||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||
|
||||
``PAPERLESS_PASSPHRASE``
|
||||
This is the passphrase Paperless uses to encrypt/decrypt the original
|
||||
document. If you aren't planning on using GPG encryption, you can just
|
||||
leave this undefined.
|
||||
|
||||
``PAPERLESS_OCR_THREADS``
|
||||
This is the number of threads the OCR process will spawn to process
|
||||
document pages in parallel. If the variable is not set, Python determines
|
||||
the core-count of your CPU and uses that value.
|
||||
|
||||
``PAPERLESS_OCR_LANGUAGES``
|
||||
If you want the OCR to recognize other languages in addition to the
|
||||
default English, set this parameter to a space separated list of
|
||||
three-letter language-codes after `ISO 639-2/T`_. For a list of available
|
||||
languages -- including their three letter codes -- see the
|
||||
`Alpine packagelist`_.
|
||||
|
||||
``USERMAP_UID`` and ``USERMAP_GID``
|
||||
If you want to mount the consumption volume (directory ``/consume`` within
|
||||
the containers) to a host-directory -- which you probably want to do --
|
||||
access rights might be an issue. The default user and group ``paperless``
|
||||
in the containers have an id of 1000. The containers will enforce that the
|
||||
owning group of the consumption directory will be ``paperless`` to be able
|
||||
to delete consumed documents. If your host-system has a group with an ID
|
||||
of 1000 and you don't want this group to have access rights to the
|
||||
consumption directory, you can use ``USERMAP_GID`` to change the id in the
|
||||
container and thus the one of the consumption directory. Furthermore, you
|
||||
can change the id of the default user as well using ``USERMAP_UID``.
|
||||
|
||||
``PAPERLESS_USE_SSL``
|
||||
If you want Paperless to use SSL for the user interface, set this variable
|
||||
to ``true``. You also need to copy your certificate and key to the ``data``
|
||||
directory, named ``ssl.cert`` and ``ssl.key``.
|
||||
This is not an ideal solution and, if possible, a reverse proxy with nginx
|
||||
is preferred.
|
||||
|
||||
6. Run ``docker-compose up -d``. This will create and start the necessary
|
||||
containers.
|
||||
7. To be able to login, you will need a super user. To create it, execute the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
This will prompt you to set a username (default ``paperless``), an optional
|
||||
e-mail address and finally a password.
|
||||
8. The default ``docker-compose.yml`` exports the webserver on your local port
|
||||
8000. If you haven't adapted this, you should now be able to visit your
|
||||
`Paperless webserver`_ at ``http://127.0.0.1:8000`` (or
|
||||
``https://127.0.0.1:8000`` if you enabled SSL). You can login with the
|
||||
user and password you just created.
|
||||
9. Add files to consumption directory the way you prefer to. Following are two
|
||||
possible options:
|
||||
|
||||
1. Mount the consumption directory to a local host path by modifying your
|
||||
``docker-compose.yml``:
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
diff --git a/docker-compose.yml b/docker-compose.yml
|
||||
--- a/docker-compose.yml
|
||||
+++ b/docker-compose.yml
|
||||
@@ -17,9 +18,8 @@ services:
|
||||
volumes:
|
||||
- paperless-data:/usr/src/paperless/data
|
||||
- paperless-media:/usr/src/paperless/media
|
||||
- - /consume
|
||||
+ - /local/path/you/choose:/consume
|
||||
|
||||
.. danger::
|
||||
|
||||
While the consumption container will ensure at startup that it can
|
||||
**delete** a consumed file from a host-mounted directory, it might
|
||||
not be able to **read** the document in the first place if the access
|
||||
rights to the file are incorrect.
|
||||
|
||||
Make sure that the documents you put into the consumption directory
|
||||
will either be readable by everyone (``chmod o+r file.pdf``) or
|
||||
readable by the default user or group id 1000 (or the one you have
|
||||
set with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
|
||||
|
||||
2. Use ``docker cp`` to copy your files directly into the container:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
|
||||
|
||||
``docker cp`` is a one-shot-command, just like ``cp``. This means that
|
||||
every time you want to consume a new document, you will have to execute
|
||||
``docker cp`` again. You can of course automate this process, but option
|
||||
1 is generally the preferred one.
|
||||
|
||||
.. danger::
|
||||
|
||||
``docker cp`` will change the owning user and group of a copied file
|
||||
to the acting user at the destination, which will be ``root``.
|
||||
|
||||
You therefore need to ensure that the documents you want to copy into
|
||||
the container are readable by everyone (``chmod o+r file.pdf``)
|
||||
before copying them.
|
||||
This will prompt you to set a username, an optional e-mail address and
|
||||
finally a password.
|
||||
|
||||
6. The default ``docker-compose.yml`` exports the webserver on your local port
|
||||
8000. If you haven't adapted this, you should now be able to visit your
|
||||
Paperless instance at ``http://127.0.0.1:8000``. You can login with the
|
||||
user and password you just created.
|
||||
|
||||
.. _Docker: https://www.docker.com/
|
||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||
.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
|
||||
|
||||
.. [#compose] You of course don't have to use docker-compose, but it
|
||||
simplifies deployment immensely. If you know your way around Docker, feel
|
||||
free to tinker around without using compose!
|
||||
|
||||
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
|
||||
version 1.1.0 or earlier, you might need to change in the
|
||||
``docker-compose.yml`` file the ``image: pitkley/paperless`` directive in
|
||||
both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the
|
||||
newer ``docker-compose.yml.example`` file
|
||||
|
||||
Bare Metal Route
|
||||
================
|
||||
|
||||
.. _setup-permanent:
|
||||
.. warning::
|
||||
|
||||
Making Things a Little more Permanent
|
||||
-------------------------------------
|
||||
TBD. User docker for now.
|
||||
|
||||
Once you've tested things and are happy with the work flow, you should secure
|
||||
the installation and automate the process of starting the webserver and
|
||||
consumer.
|
||||
Migration to paperless-ng
|
||||
#########################
|
||||
|
||||
At its core, paperless-ng is still paperless and fully compatible. However, some
|
||||
things have changed under the hood, so you need to adapt your setup depending on
|
||||
how you installed paperless. The important things to keep in mind are as follows.
|
||||
|
||||
.. _setup-permanent-webserver:
|
||||
* Read the :ref:`changelog <paperless_changelog>` and take note of breaking changes.
|
||||
* It is recommended to use postgresql as the database now. If you want to continue
|
||||
using SQLite, which is the default of paperless, use ``docker-compose.sqlite.yml``.
|
||||
See :ref:`setup-sqlite_to_psql` for details on how to move your data from
|
||||
sqlite to postgres.
|
||||
* The task scheduler of paperless, which is used to execute periodic tasks
|
||||
such as email checking and maintenance, requires a `redis`_ message broker
|
||||
instance. The docker-compose route takes care of that.
|
||||
* The layout of the folder structure for your documents and data remains the
|
||||
same, so you can just plug your old docker volumes into paperless-ng and
|
||||
expect it to find everything where it should be.
|
||||
|
||||
Using a Real Webserver
|
||||
++++++++++++++++++++++
|
||||
Migration to paperless-ng is then performed in a few simple steps:
|
||||
|
||||
The default is to use Django's development server, as that's easy and does the
|
||||
job well enough on a home network. However it is heavily discouraged to use
|
||||
it for more than that.
|
||||
1. Stop paperless.
|
||||
|
||||
If you want to do things right you should use a real webserver capable of
|
||||
handling more than one thread. You will also have to let the webserver serve
|
||||
the static files (CSS, JavaScript) from the directory configured in
|
||||
``PAPERLESS_STATICDIR``. The default static files directory is ``../static``.
|
||||
.. code:: bash
|
||||
|
||||
For that you need to activate your virtual environment and collect the static
|
||||
files with the command:
|
||||
$ cd /path/to/current/paperless
|
||||
$ docker-compose down
|
||||
|
||||
.. code:: bash
|
||||
2. Do a backup for two purposes: If something goes wrong, you still have your
|
||||
data. Second, if you don't like paperless-ng, you can switch back to
|
||||
paperless.
|
||||
|
||||
$ cd <paperless directory>/src
|
||||
$ ./manage.py collectstatic
|
||||
3. Download the latest release of paperless-ng. You can either go with the
|
||||
docker-compose files or use the archive to build the image yourself.
|
||||
You can either replace your current paperless folder or put paperless-ng
|
||||
in a different location.
|
||||
|
||||
.. caution::
|
||||
|
||||
Apache
|
||||
~~~~~~
|
||||
Make sure you also download the ``.env`` file. This will set the
|
||||
project name for docker compose to ``paperless`` and then it will
|
||||
automatically reuse your existing paperless volumes.
|
||||
|
||||
This is a configuration supplied by `steckerhalter`_ on GitHub. It uses Apache
|
||||
and mod_wsgi, with a Paperless installation in ``/home/paperless/``:
|
||||
4. Adjust ``docker-compose.yml`` and
|
||||
``docker-compose.env`` to your needs.
|
||||
See `docker route`_ for details on which edits are required.
|
||||
|
||||
.. code:: apache
|
||||
5. Start paperless-ng.
|
||||
|
||||
<VirtualHost *:80>
|
||||
ServerName example.com
|
||||
.. code:: bash
|
||||
|
||||
Alias /static/ /home/paperless/paperless/static/
|
||||
<Directory /home/paperless/paperless/static>
|
||||
Require all granted
|
||||
</Directory>
|
||||
$ docker-compose up
|
||||
|
||||
WSGIScriptAlias / /home/paperless/paperless/src/paperless/wsgi.py
|
||||
WSGIDaemonProcess example.com user=paperless group=paperless threads=5 python-path=/home/paperless/paperless/src:/home/paperless/.env/lib/python3.6/site-packages
|
||||
WSGIProcessGroup example.com
|
||||
If you see everything working (you should see some migrations getting
|
||||
applied, for instance), you can gracefully stop paperless-ng with Ctrl-C
|
||||
and then start paperless-ng as usual with
|
||||
|
||||
<Directory /home/paperless/paperless/src/paperless>
|
||||
<Files wsgi.py>
|
||||
Require all granted
|
||||
</Files>
|
||||
</Directory>
|
||||
</VirtualHost>
|
||||
.. code:: bash
|
||||
|
||||
.. _steckerhalter: https://github.com/steckerhalter
|
||||
$ docker-compose up -d
|
||||
|
||||
This will run paperless in the background and automatically start it on system boot.
|
||||
|
||||
Nginx + Gunicorn
|
||||
~~~~~~~~~~~~~~~~
|
||||
6. Paperless installed a permanent redirect to ``admin/`` in your browser. This
|
||||
redirect is still in place and prevents access to the new UI. Clear
|
||||
everything related to paperless in your browsers data in order to fix
|
||||
this issue.
|
||||
|
||||
If you're using Nginx, the most common setup is to combine it with a
|
||||
Python-based server like Gunicorn so that Nginx is acting as a proxy. Below is
|
||||
a copy of a simple Nginx configuration fragment making use of a gunicorn
|
||||
instance listening on localhost port 8000.
|
||||
|
||||
.. code:: nginx
|
||||
.. _setup-sqlite_to_psql:
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
Moving data from sqlite to postgresql
|
||||
=====================================
|
||||
|
||||
index index.html index.htm index.php;
|
||||
access_log /var/log/nginx/paperless_access.log;
|
||||
error_log /var/log/nginx/paperless_error.log;
|
||||
.. warning::
|
||||
|
||||
location /static {
|
||||
|
||||
autoindex on;
|
||||
alias <path-to-paperless-static-directory>;
|
||||
|
||||
}
|
||||
|
||||
location / {
|
||||
|
||||
proxy_set_header Host $http_host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
proxy_pass http://127.0.0.1:8000;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
The gunicorn server can be started with the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ <path-to-paperless-virtual-environment>/bin/gunicorn --pythonpath=<path-to-paperless>/src paperless.wsgi -w 2
|
||||
|
||||
|
||||
.. _setup-permanent-standard-systemd:
|
||||
|
||||
Standard (Bare Metal + Systemd)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you're running on a bare metal system that's using Systemd, you can use the
|
||||
service unit files in the ``scripts`` directory to set this up.
|
||||
|
||||
1. You'll need to create a group and user called ``paperless`` (without login)
|
||||
2. Setup Paperless to be in a place that this new user can read and write to.
|
||||
3. Ensure ``/etc/paperless`` is readable by the ``paperless`` user.
|
||||
4. Copy the service file from the ``scripts`` directory to
|
||||
``/etc/systemd/system``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ cp /path/to/paperless/scripts/paperless-consumer.service /etc/systemd/system/
|
||||
$ cp /path/to/paperless/scripts/paperless-webserver.service /etc/systemd/system/
|
||||
|
||||
5. Edit the service file to point the ``ExecStart`` line to the proper location
|
||||
of your paperless install, referencing the appropriate Python binary. For
|
||||
example:
|
||||
``ExecStart=/path/to/python3 /path/to/paperless/src/manage.py document_consumer``.
|
||||
6. Start and enable (so they start on boot) the services.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ systemctl enable paperless-consumer
|
||||
$ systemctl enable paperless-webserver
|
||||
$ systemctl start paperless-consumer
|
||||
$ systemctl start paperless-webserver
|
||||
|
||||
|
||||
.. _setup-permanent-standard-upstart:
|
||||
|
||||
Standard (Bare Metal + Upstart)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Ubuntu 14.04 and earlier use the `Upstart`_ init system to start services
|
||||
during the boot process. To configure Upstart to run Paperless automatically
|
||||
after restarting your system:
|
||||
|
||||
1. Change to the directory where Upstart's configuration files are kept:
|
||||
``cd /etc/init``
|
||||
2. Create a new file: ``sudo nano paperless-server.conf``
|
||||
3. In the newly-created file enter::
|
||||
|
||||
start on (local-filesystems and net-device-up IFACE=eth0)
|
||||
stop on shutdown
|
||||
|
||||
respawn
|
||||
respawn limit 10 5
|
||||
|
||||
script
|
||||
exec <path to paperless virtual environment>/bin/gunicorn --pythonpath=<path to parperless>/src paperless.wsgi -w 2
|
||||
end script
|
||||
|
||||
Note that you'll need to replace ``/srv/paperless/src/manage.py`` with the
|
||||
path to the ``manage.py`` script in your installation directory.
|
||||
|
||||
If you are using a network interface other than ``eth0``, you will have to
|
||||
change ``IFACE=eth0``. For example, if you are connected via WiFi, you will
|
||||
likely need to replace ``eth0`` above with ``wlan0``. To see all interfaces,
|
||||
run ``ifconfig -a``.
|
||||
|
||||
Save the file.
|
||||
|
||||
4. Create a new file: ``sudo nano paperless-consumer.conf``
|
||||
|
||||
5. In the newly-created file enter::
|
||||
|
||||
start on (local-filesystems and net-device-up IFACE=eth0)
|
||||
stop on shutdown
|
||||
|
||||
respawn
|
||||
respawn limit 10 5
|
||||
|
||||
script
|
||||
exec <path to paperless virtual environment>/bin/python <path to parperless>/manage.py document_consumer
|
||||
end script
|
||||
|
||||
Replace the path placeholder and ``eth0`` with the appropriate value and save the file.
|
||||
|
||||
These two configuration files together will start both the Paperless webserver
|
||||
and document consumer processes when the file system and network interface
|
||||
specified is available after boot. Furthermore, if either process ever exits
|
||||
unexpectedly, Upstart will try to restart it a maximum of 10 times within a 5
|
||||
second period.
|
||||
|
||||
.. _Upstart: http://upstart.ubuntu.com/
|
||||
|
||||
|
||||
.. _setup-permanent-docker:
|
||||
|
||||
Docker
|
||||
~~~~~~
|
||||
|
||||
If you're using Docker, you can set a restart-policy_ in the
|
||||
``docker-compose.yml`` to have the containers automatically start with the
|
||||
Docker daemon.
|
||||
|
||||
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
|
||||
TBD.
|
||||
|
||||
.. _redis: https://redis.io/
|
||||
|
@ -1,12 +1,42 @@
|
||||
.. _troubleshooting:
|
||||
|
||||
***************
|
||||
Troubleshooting
|
||||
===============
|
||||
***************
|
||||
|
||||
No files are added by the consumer
|
||||
##################################
|
||||
|
||||
Check for the following issues:
|
||||
|
||||
* Ensure that the directory you're putting your documents in is the folder
|
||||
paperless is watching. With docker, this setting is performed in the
|
||||
``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
|
||||
setting. Don't adjust this setting if you're using docker.
|
||||
* Ensure that redis is up and running. Paperless does its task processing
|
||||
asynchronously, and for documents to arrive at the task processor, it needs
|
||||
redis to run.
|
||||
* Ensure that the task processor is running. Docker does this automatically.
|
||||
Manually invoke the task processor by executing
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ python3 manage.py qcluster
|
||||
|
||||
* Look at the output of paperless and inspect it for any errors.
|
||||
* Go to the admin interface, and check if there are failed tasks. If so, the
|
||||
tasks will contain an error message.
|
||||
|
||||
|
||||
Consumer fails to pickup any new files
|
||||
######################################
|
||||
|
||||
If you notice, that the consumer will only pickup files in the consumption
|
||||
directory at startup, but won't find any other files added later, check out
|
||||
the configuration file and enable filesystem polling with the setting
|
||||
``PAPERLESS_CONSUMER_POLLING``.
|
||||
|
||||
.. _troubleshooting-languagemissing:
|
||||
|
||||
Consumer warns ``OCR for XX failed``
|
||||
------------------------------------
|
||||
####################################
|
||||
|
||||
If you find the OCR accuracy to be too low, and/or the document consumer warns
|
||||
that ``OCR for XX failed, but we're going to stick with what we've got since
|
||||
@ -20,10 +50,9 @@ box, and your documents are written in Spanish you may need to run::
|
||||
apt-get install -y tesseract-ocr-spa
|
||||
|
||||
|
||||
.. _troubleshooting-convertpixelcache:
|
||||
|
||||
Consumer dies with ``convert: unable to extent pixel cache``
|
||||
------------------------------------------------------------
|
||||
############################################################
|
||||
|
||||
During the consumption process, Paperless invokes ImageMagick's ``convert``
|
||||
program to translate the source document into something that the OCR engine can
|
||||
@ -48,10 +77,9 @@ that's actually on a physical disk (and writable by the user running
|
||||
Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.
|
||||
|
||||
|
||||
.. _troubleshooting-decompressionbombwarning:
|
||||
|
||||
DecompressionBombWarning and/or no text in the OCR output
|
||||
---------------------------------------------------------
|
||||
#########################################################
|
||||
|
||||
Some users have had issues using Paperless to consume PDFs that were created
|
||||
by merging Very Large Scanned Images into one PDF. If this happens to you,
|
||||
it's likely because the PDF you've created contains some very large pages
|
||||
@ -72,4 +100,4 @@ with a DPI of 300, then merging the images into the single PDF with
|
||||
For more information on this and situations like it, you should take a look
|
||||
at `Issue #118`_ as that's where this tip originated.
|
||||
|
||||
.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118
|
||||
.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118
|
||||
|
246
docs/usage_overview.rst
Normal file
@ -0,0 +1,246 @@
|
||||
**************
|
||||
Usage Overview
|
||||
**************
|
||||
|
||||
Paperless is an application that manages your personal documents. With
|
||||
the help of a document scanner (see :ref:`scanners`), paperless transforms
|
||||
your wieldy physical document binders into a searchable archive and
|
||||
provices many utilities for finding and managing your documents.
|
||||
|
||||
|
||||
Terms and definitions
|
||||
#####################
|
||||
|
||||
Paperless esentially consists of two different parts for managing your
|
||||
documents:
|
||||
|
||||
* The *consumer* watches a specified folder and adds all documents in that
|
||||
folder to paperless.
|
||||
* The *web server* provides a UI that you use to manage and search for your
|
||||
scanned documents.
|
||||
|
||||
Each document has a couple of fields that you can assign to them:
|
||||
|
||||
* A *Document* is a piece of paper that sometimes contains valuable
|
||||
information.
|
||||
* The *correspondent* of a document is the person, institution or company that
|
||||
a document either originates form, or is sent to.
|
||||
* A *tag* is a label that you can assign to documents. Think of labels as more
|
||||
powerful folders: Multiple documents can be grouped together with a single
|
||||
tag, however, a single document can also have multiple tags. This is not
|
||||
possible with folders. The reason folders are not implemented in paperless
|
||||
is simply that tags are much more versatile than folders.
|
||||
* A *document type* is used to demarkate the type of a document such as letter,
|
||||
bank statement, invoice, contract, etc. It is used to identify what a document
|
||||
is about.
|
||||
* The *date added* of a document is the date the document was scanned into
|
||||
paperless. You cannot and should not change this date.
|
||||
* The *date created* of a document is the date the document was intially issued.
|
||||
This can be the date you bought a product, the date you signed a contract, or
|
||||
the date a letter was sent to you.
|
||||
* The *archive serial number* (short: ASN) of a document is the identifier of
|
||||
the document in your physical document binders. See
|
||||
:ref:`usage-recommended_workflow` below.
|
||||
* The *content* of a document is the text that was OCR'ed from the document.
|
||||
This text is fed into the search engine and is used for matching tags,
|
||||
correspondents and document types.
|
||||
|
||||
|
||||
Frontend overview
|
||||
#################
|
||||
|
||||
.. warning::
|
||||
|
||||
TBD. Add some fancy screenshots!
|
||||
|
||||
Adding documents to paperless
|
||||
#############################
|
||||
|
||||
Once you've got Paperless setup, you need to start feeding documents into it.
|
||||
Currently, there are three options: the consumption directory, IMAP (email), and
|
||||
HTTP POST.
|
||||
|
||||
|
||||
The consumption directory
|
||||
=========================
|
||||
|
||||
The primary method of getting documents into your database is by putting them in
|
||||
the consumption directory. The consumer runs in an infinite
|
||||
loop looking for new additions to this directory and when it finds them, it goes
|
||||
about the process of parsing them with the OCR, indexing what it finds, and storing
|
||||
it in the media directory.
|
||||
|
||||
Getting stuff into this directory is up to you. If you're running Paperless
|
||||
on your local computer, you might just want to drag and drop files there, but if
|
||||
you're running this on a server and want your scanner to automatically push
|
||||
files to this directory, you'll need to setup some sort of service to accept the
|
||||
files from the scanner. Typically, you're looking at an FTP server like
|
||||
`Proftpd`_ or a Windows folder share with `Samba`_.
|
||||
|
||||
.. _Proftpd: http://www.proftpd.org/
|
||||
.. _Samba: http://www.samba.org/
|
||||
|
||||
.. TODO: hyperref to configuration of the location of this magic folder.
|
||||
|
||||
.. _usage-email:
|
||||
|
||||
IMAP (Email)
|
||||
============
|
||||
|
||||
You can tell paperless-ng to consume documents from your email accounts.
|
||||
This is a very flexible and powerful feature, if you regularly received documents
|
||||
via mail that you need to archive. The mail consumer can be configured by using the
|
||||
admin interface in the following manner:
|
||||
|
||||
1. Define e-mail accounts.
|
||||
2. Define mail rules for your account.
|
||||
|
||||
These rules perform the following:
|
||||
|
||||
1. Connect to the mail server.
|
||||
2. Fetch all matching mails (as defined by folder, maximum age and the filters)
|
||||
3. Check if there are any consumable attachments.
|
||||
4. If so, instruct paperless to consume the attachments and optionally
|
||||
use the metadata provided in the rule for the new document.
|
||||
5. If documents were consumed from a mail, the rule action is performed
|
||||
on that mail.
|
||||
|
||||
Paperless will completely ignore mails that do not match your filters. It will also
|
||||
only perform the action on mails that it has consumed documents from.
|
||||
|
||||
The actions all ensure that the same mail is not consumed twice by different means.
|
||||
These are as follows:
|
||||
|
||||
* **Delete:** Immediately deletes mail that paperless has consumed documents from.
|
||||
Use with caution.
|
||||
* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
|
||||
from already read mails. If you read a mail before paperless sees it, it will be
|
||||
ignored.
|
||||
* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
|
||||
will not consume flagged mails.
|
||||
* **Move to folder:** Moves consumed mails out of the way so that paperless wont
|
||||
consume them again.
|
||||
|
||||
.. caution::
|
||||
|
||||
The mail consumer will perform these actions on all mails it has consumed
|
||||
documents from. Keep in mind that the actual consumption process may fail
|
||||
for some reason, leaving you with missing documents in paperless.
|
||||
|
||||
.. note::
|
||||
|
||||
With the correct set of rules, you can completely automate your email documents.
|
||||
Create rules for every correspondent you receive digital documents from and
|
||||
paperless will read them automatically. The default acion "mark as read" is
|
||||
pretty tame and will not cause any damage or data loss whatsoever.
|
||||
|
||||
You can also setup a special folder in your mail account for paperless and use
|
||||
your favorite mail client to move to be consumed mails into that folder
|
||||
automatically or manually and tell paperless to move them to yet another folder
|
||||
after consumption. It's up to you.
|
||||
|
||||
.. note::
|
||||
|
||||
Paperless will process the rules in the order defined in the admin page.
|
||||
|
||||
You can define catch-all rules and have them executed last to consume
|
||||
any documents not matched by previous rules. Such a rule may assign an "Unknown
|
||||
mail document" tag to consumed documents so you can inspect them further.
|
||||
|
||||
Paperless is set up to check your mails every 10 minutes. This can be configured on the
|
||||
'Scheduled tasks' page in the admin.
|
||||
|
||||
|
||||
REST API
|
||||
========
|
||||
|
||||
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
|
||||
|
||||
|
||||
.. _usage-recommended_workflow:
|
||||
|
||||
The recommended workflow
|
||||
########################
|
||||
|
||||
Once you have familiarized yourself with paperless and are ready to use it
|
||||
for all your documents, the recommended workflow for managing your documents
|
||||
is as follows. This workflow also takes into account that some documents
|
||||
have to be kept in physical form, but still ensures that you get all the
|
||||
advantages for these documents as well.
|
||||
|
||||
The following diagram shows how easy it is to manage your documents.
|
||||
|
||||
.. image:: _static/recommended_workflow.png
|
||||
|
||||
Preparations in paperless
|
||||
=========================
|
||||
|
||||
* Create an inbox tag that gets assigned to all new documents.
|
||||
* Create a TODO tag.
|
||||
|
||||
Processing of the physical documents
|
||||
====================================
|
||||
|
||||
Keep a physical inbox. Whenever you receive a document that you need to
|
||||
archive, put it into your inbox. Regulary, do the following for all documents
|
||||
in your inbox:
|
||||
|
||||
1. For each document, decide if you need to keep the document in physical
|
||||
form. This applies to certain important documents, such as contracts and
|
||||
certificates.
|
||||
2. If you need to keep the document, write a running number on the document
|
||||
before scanning, starting at one and counting upwards. This is the archive
|
||||
serial number, or ASN in short.
|
||||
3. Scan the document.
|
||||
4. If the document has an ASN assigned, store it in a *single* binder, sorted
|
||||
by ASN. Don't order this binder in any other way.
|
||||
5. If the document has no ASN, throw it away. Yay!
|
||||
|
||||
Over time, you will notice that your physical binder will fill up. If it is
|
||||
full, label the binder with the range of ASNs in this binder (i.e., "Documents
|
||||
1 to 343"), store the binder in your cellar or elsewhere, and start a new
|
||||
binder.
|
||||
|
||||
The idea behind this process is that you will never have to use the physical
|
||||
binders to find a document. If you need a specific physical document, you
|
||||
may find this document by:
|
||||
|
||||
1. Searching in paperless for the document.
|
||||
2. Identify the ASN of the document, since it appears on the scan.
|
||||
3. Grab the relevant document binder and get the document. This is easy since
|
||||
they are sorted by ASN.
|
||||
|
||||
Processing of documents in paperless
|
||||
====================================
|
||||
|
||||
Once you have scanned in a document, proceed in paperless as follows.
|
||||
|
||||
1. If the document has an ASN, assign the ASN to the document.
|
||||
2. Assign a correspondent to the document (i.e., your employer, bank, etc)
|
||||
This isnt strictly necessary but helps in finding a document when you need
|
||||
it.
|
||||
3. Assign a document type (i.e., invoice, bank statement, etc) to the document
|
||||
This isnt strictly necessary but helps in finding a document when you need
|
||||
it.
|
||||
4. Assign a proper title to the document (the name of an item you bought, the
|
||||
subject of the letter, etc)
|
||||
5. Check that the date of the document is corrent. Paperless tries to read
|
||||
the date from the content of the document, but this fails sometimes if the
|
||||
OCR is bad or multiple dates appear on the document.
|
||||
6. Remove inbox tags from the documents.
|
||||
|
||||
|
||||
Task management
|
||||
===============
|
||||
|
||||
Some documents require attention and require you to act on the document. You
|
||||
may take two different approaches to handle these documents based on how
|
||||
regularly you intent to use paperless and scan documents.
|
||||
|
||||
* If you scan and process your documents in paperless regularly, assign a
|
||||
TODO tag to all scanned documents that you need to process. Create a saved
|
||||
view on the dashboard that shows all documents with this tag.
|
||||
* If you do not scan documents regularly and use paperless solely for archiving,
|
||||
create a physical todo box next to your physical inbox and put documents you
|
||||
need to process in the TODO box. When you performed the task associated with
|
||||
the document, move it to the inbox.
|
@ -1,284 +0,0 @@
|
||||
.. _utilities:
|
||||
|
||||
Utilities
|
||||
=========
|
||||
|
||||
There's basically three utilities to Paperless: the webserver, consumer, and
|
||||
if needed, the exporter. They're all detailed here.
|
||||
|
||||
|
||||
.. _utilities-webserver:
|
||||
|
||||
The Webserver
|
||||
-------------
|
||||
|
||||
At the heart of it, Paperless is a simple Django webservice, and the entire
|
||||
interface is based on Django's standard admin interface. Once running, visiting
|
||||
the URL for your service delivers the admin, through which you can get a
|
||||
detailed listing of all available documents, search for specific files, and
|
||||
download whatever it is you're looking for.
|
||||
|
||||
|
||||
.. _utilities-webserver-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
The webserver is started via the ``manage.py`` script:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py runserver
|
||||
|
||||
By default, the server runs on localhost, port 8000, but you can change this
|
||||
with a few arguments, run ``manage.py --help`` for more information.
|
||||
|
||||
Add the option ``--noreload`` to reduce resource usage. Otherwise, the server
|
||||
continuously polls all source files for changes to auto-reload them.
|
||||
|
||||
Note that when exiting this command your webserver will disappear.
|
||||
If you want to run this full-time (which is kind of the point)
|
||||
you'll need to have it start in the background -- something you'll need to
|
||||
figure out for your own system. To get you started though, there are Systemd
|
||||
service files in the ``scripts`` directory.
|
||||
|
||||
|
||||
.. _utilities-consumer:
|
||||
|
||||
The Consumer
|
||||
------------
|
||||
|
||||
The consumer script runs in an infinite loop, constantly looking at a directory
|
||||
for documents to parse and index. The process is pretty straightforward:
|
||||
|
||||
1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
|
||||
If not, wait 10 seconds and try again. On Linux, new documents are detected
|
||||
instantly via inotify, so there's no waiting involved.
|
||||
2. Parse the document with Tesseract
|
||||
3. Create a new record in the database with the OCR'd text
|
||||
4. Attempt to automatically assign document attributes by doing some guesswork.
|
||||
Read up on the :ref:`guesswork documentation<guesswork>` for more
|
||||
information about this process.
|
||||
5. Encrypt the document (if you have a passphrase set) and store it in the
|
||||
``media`` directory under ``documents/originals``.
|
||||
6. Go to #1.
|
||||
|
||||
|
||||
.. _utilities-consumer-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
The consumer is started via the ``manage.py`` script:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_consumer
|
||||
|
||||
This starts the service that will consume documents as they appear in
|
||||
``CONSUMPTION_DIR``.
|
||||
|
||||
Note that this command runs continuously, so exiting it will mean your webserver
|
||||
disappears. If you want to run this full-time (which is kind of the point)
|
||||
you'll need to have it start in the background -- something you'll need to
|
||||
figure out for your own system. To get you started though, there are Systemd
|
||||
service files in the ``scripts`` directory.
|
||||
|
||||
Some command line arguments are available to customize the behavior of the
|
||||
consumer. By default it will use ``/etc/paperless.conf`` values. Display the
|
||||
help with:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_consumer --help
|
||||
|
||||
.. _utilities-exporter:
|
||||
|
||||
The Exporter
|
||||
------------
|
||||
|
||||
Tired of fiddling with Paperless, or just want to do something stupid and are
|
||||
afraid of accidentally damaging your files? You can export all of your
|
||||
documents into neatly named, dated, and unencrypted files.
|
||||
|
||||
|
||||
.. _utilities-exporter-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
This too is done via the ``manage.py`` script:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
|
||||
|
||||
This will dump all of your unencrypted documents into ``/path/to/somewhere``
|
||||
for you to do with as you please. The files are accompanied with a special
|
||||
file, ``manifest.json`` which can be used to :ref:`import the files
|
||||
<utilities-importer>` at a later date if you wish.
|
||||
|
||||
|
||||
.. _utilities-exporter-howto-docker:
|
||||
|
||||
Docker
|
||||
______
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>`, running the
|
||||
expoorter is almost as easy. To mount a volume for exports, follow the
|
||||
instructions in the ``docker-compose.yml.example`` file for the ``/export``
|
||||
volume (making the changes in your own ``docker-compose.yml`` file, of course).
|
||||
Once you have the volume mounted, the command to run an export is:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm consumer document_exporter /export
|
||||
|
||||
If you prefer to use ``docker run`` directly, supplying the necessary commandline
|
||||
options:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ # Make sure to replace your passphrase and remove or adapt the id mapping
|
||||
$ docker run --rm \
|
||||
--volumes-from paperless_data_1 \
|
||||
--volume /path/to/arbitrary/place:/export \
|
||||
-e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
paperless document_exporter /export
|
||||
|
||||
|
||||
.. _utilities-importer:
|
||||
|
||||
The Importer
|
||||
------------
|
||||
|
||||
Looking to transfer Paperless data from one instance to another, or just want
|
||||
to restore from a backup? This is your go-to toy.
|
||||
|
||||
|
||||
.. _utilities-importer-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
The importer works just like the exporter. You point it at a directory, and
|
||||
the script does the rest of the work:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
|
||||
|
||||
Docker
|
||||
______
|
||||
|
||||
Assuming that you've already gone through the steps above in the
|
||||
:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
|
||||
to do is just re-use the ``/export`` path you already setup:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm consumer document_importer /export
|
||||
|
||||
Similarly, if you're not using docker-compose, you can adjust the export
|
||||
instructions above to do the import.
|
||||
|
||||
|
||||
.. _utilities-retagger:
|
||||
|
||||
Re-running your tagging and correspondent matchers
|
||||
--------------------------------------------------
|
||||
|
||||
Say you've imported a few hundred documents and now want to introduce
|
||||
a tag or set up a new correspondent, and apply its matching to all of
|
||||
the currently-imported docs. This problem is common enough that
|
||||
there are tools for it.
|
||||
|
||||
|
||||
.. _utilities-retagger-howto:
|
||||
|
||||
How to Do It
|
||||
............
|
||||
|
||||
This too is done via the ``manage.py`` script:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_retagger
|
||||
|
||||
Run this after changing or adding tagging rules. It'll loop over all
|
||||
of the documents in your database and attempt to match all of your
|
||||
tags to them. If one matches, it'll be applied. And don't worry, you
|
||||
can run this as often as you like, it won't double-tag a document.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_correspondents
|
||||
|
||||
This is the similar command to run after adding or changing a correspondent.
|
||||
|
||||
.. _utilities-encyption:
|
||||
|
||||
Enabling Encrpytion
|
||||
-------------------
|
||||
|
||||
Let's say you've imported a few documents to play around with paperless and now
|
||||
you are using it more seriously and want to enable encryption of your files.
|
||||
|
||||
.. utilities-encryption-howto:
|
||||
|
||||
Basic Syntax
|
||||
.............
|
||||
|
||||
Again we'll use the ``manage.py`` script, passing ``change_storage_type``:
|
||||
|
||||
.. code:: console
|
||||
|
||||
$ /path/to/paperless/src/manage.py change_storage_type --help
|
||||
usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
|
||||
[--settings SETTINGS]
|
||||
[--pythonpath PYTHONPATH] [--traceback]
|
||||
[--no-color] [--passphrase PASSPHRASE]
|
||||
{gpg,unencrypted} {gpg,unencrypted}
|
||||
|
||||
This is how you migrate your stored documents from an encrypted state to an
|
||||
unencrypted one (or vice-versa)
|
||||
|
||||
positional arguments:
|
||||
{gpg,unencrypted} The state you want to change your documents from
|
||||
{gpg,unencrypted} The state you want to change your documents to
|
||||
|
||||
optional arguments:
|
||||
--passphrase PASSPHRASE
|
||||
If PAPERLESS_PASSPHRASE isn't set already, you need to
|
||||
specify it here
|
||||
|
||||
Enabling Encryption
|
||||
...................
|
||||
|
||||
Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
|
||||
|
||||
(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
|
||||
|
||||
|
||||
Disabling Encryption
|
||||
....................
|
||||
|
||||
Basic usage to enable encryption of your document store:
|
||||
|
||||
(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
|
@ -1,269 +1,55 @@
|
||||
# Sample paperless.conf
|
||||
# Copy this file to /etc/paperless.conf and modify it to suit your needs.
|
||||
# As this file contains passwords it should only be readable by the user
|
||||
# running paperless.
|
||||
# Have a look at the docs for documentation.
|
||||
# https://paperless-ng.readthedocs.io/en/latest/configuration.html
|
||||
|
||||
###############################################################################
|
||||
#### Message Broker ####
|
||||
###############################################################################
|
||||
# Debug. Only enable this for development.
|
||||
|
||||
# This is required for processing scheduled tasks such as email fetching, index
|
||||
# optimization and for training the automatic document matcher.
|
||||
# Defaults to localhost:6379.
|
||||
#PAPERLESS_REDIS="redis://localhost:6379"
|
||||
#PAPERLESS_DEBUG=false
|
||||
|
||||
# Required services
|
||||
|
||||
###############################################################################
|
||||
#### Database Settings ####
|
||||
###############################################################################
|
||||
#PAPERLESS_REDIS=redis://localhost:6379
|
||||
#PAPERLESS_DBHOST=localhost
|
||||
#PAPERLESS_DBPORT=5432
|
||||
#PAPERLESS_DBNAME=paperless
|
||||
#PAPERLESS_DBUSER=paperless
|
||||
#PAPERLESS_DBPASS=paperless
|
||||
|
||||
# By default, sqlite is used as the database backend. This can be changed here.
|
||||
# The docker-compose service definition uses a postgresql server. The
|
||||
# configuration for this is already done inside the docker-compose.env file.
|
||||
# Paths and folders
|
||||
|
||||
#Set PAPERLESS_DBHOST and postgresql will be used instead of mysql.
|
||||
#PAPERLESS_DBHOST="localhost"
|
||||
#PAPERLESS_CONSUMPTION_DIR=../consume
|
||||
#PAPERLESS_DATA_DIR=../data
|
||||
#PAPERLESS_MEDIA_ROOT=../media
|
||||
#PAPERLESS_STATICDIR=../static
|
||||
#PAPERLESS_FILENAME_FORMAT=
|
||||
|
||||
#Adjust port if necessary
|
||||
#PAPERLESS_DBPORT=
|
||||
# Security and hosting
|
||||
|
||||
#name, user and pass all default to "paperless"
|
||||
#PAPERLESS_DBNAME="paperless"
|
||||
#PAPERLESS_DBUSER="paperless"
|
||||
#PAPERLESS_DBPASS="paperless"
|
||||
#PAPERLESS_SECRET_KEY=change-me
|
||||
#PAPERLESS_ALLOWED_HOSTS=example.com,www.example.com
|
||||
#PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
|
||||
#PAPERLESS_FORCE_SCRIPT_NAME=
|
||||
#PAPERLESS_STATIC_URL=/static/
|
||||
|
||||
# Software tweaks
|
||||
|
||||
###############################################################################
|
||||
#### Paths & Folders ####
|
||||
###############################################################################
|
||||
|
||||
# This where your documents should go to be consumed. Make sure that it exists
|
||||
# and that the user running the paperless service can read/write its contents
|
||||
# before you start Paperless.
|
||||
PAPERLESS_CONSUMPTION_DIR="../consume"
|
||||
|
||||
# This is where paperless stores all its data (search index, sqlite database,
|
||||
# classification model, etc).
|
||||
#PAPERLESS_DATA_DIR="../data"
|
||||
|
||||
# This is where your documents and thumbnails are stored.
|
||||
#PAPERLESS_MEDIA_ROOT="../media"
|
||||
|
||||
# Override the default STATIC_ROOT here. This is where all static files
|
||||
# created using "collectstatic" manager command are stored.
|
||||
#PAPERLESS_STATICDIR="../static"
|
||||
|
||||
|
||||
# Override the STATIC_URL here. Unless you're hosting Paperless off a
|
||||
# subdomain like /paperless/, you probably don't need to change this.
|
||||
#PAPERLESS_STATIC_URL="/static/"
|
||||
|
||||
|
||||
# These values are required if you want paperless to check a particular email
|
||||
# box every 10 minutes and attempt to consume documents from there. If you
|
||||
# don't define a HOST, mail checking will just be disabled.
|
||||
#PAPERLESS_CONSUME_MAIL_HOST=""
|
||||
#PAPERLESS_CONSUME_MAIL_PORT=""
|
||||
#PAPERLESS_CONSUME_MAIL_USER=""
|
||||
#PAPERLESS_CONSUME_MAIL_PASS=""
|
||||
|
||||
# Override the default IMAP inbox here. If not set Paperless defaults to
|
||||
# "INBOX".
|
||||
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
|
||||
|
||||
# Any email sent to the target account that does not contain this text will be
|
||||
# ignored.
|
||||
PAPERLESS_EMAIL_SECRET=""
|
||||
|
||||
# Specify a filename format for the document (directories are supported)
|
||||
# Use the following placeholders:
|
||||
# * {correspondent}
|
||||
# * {title}
|
||||
# * {created}
|
||||
# * {added}
|
||||
# * {tags[KEY]} If your tags conform to key_value or key-value
|
||||
# * {tags[INDEX]} If your tags are strings, select the tag by index
|
||||
# Uniqueness of filenames is ensured, as an incrementing counter is attached
|
||||
# to each filename.
|
||||
#PAPERLESS_FILENAME_FORMAT=""
|
||||
|
||||
###############################################################################
|
||||
#### Security ####
|
||||
###############################################################################
|
||||
|
||||
# Controls whether django's debug mode is enabled. Disable this on production
|
||||
# systems. Debug mode is disabled by default.
|
||||
#PAPERLESS_DEBUG="false"
|
||||
|
||||
# GnuPG encryption is deprecated and will be removed in future versions.
|
||||
#
|
||||
# Paperless can be instructed to attempt to encrypt your PDF files with GPG
|
||||
# using the PAPERLESS_PASSPHRASE specified below. If however you're not
|
||||
# concerned about encrypting these files (for example if you have disk
|
||||
# encryption locally) then you don't need this and can safely leave this value
|
||||
# un-set.
|
||||
#
|
||||
# One final note about the passphrase. Once you've consumed a document with
|
||||
# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and
|
||||
# can't properly export documents that were encrypted with an old passphrase if
|
||||
# you've since changed it to a new one.
|
||||
#
|
||||
# The default is to not use encryption at all.
|
||||
#PAPERLESS_PASSPHRASE="secret"
|
||||
|
||||
|
||||
# The secret key has a default that should be fine so long as you're hosting
|
||||
# Paperless on a closed network. However, if you're putting this anywhere
|
||||
# public, you should change the key to something unique and verbose.
|
||||
#PAPERLESS_SECRET_KEY="change-me"
|
||||
|
||||
|
||||
# If you're planning on putting Paperless on the open internet, then you
|
||||
# really should set this value to the domain name you're using. Failing to do
|
||||
# so leaves you open to HTTP host header attacks:
|
||||
# https://docs.djangoproject.com/en/1.10/topics/security/#host-headers-virtual-hosting
|
||||
#
|
||||
# Just remember that this is a comma-separated list, so "example.com" is fine,
|
||||
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
|
||||
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
|
||||
|
||||
# If you decide to use the Paperless API in an ajax call, you need to add your
|
||||
# servers to the list of allowed hosts that can do CORS calls. By default
|
||||
# Paperless allows calls from localhost:8080, but you'd like to change that,
|
||||
# you can set this value to a comma-separated list.
|
||||
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
|
||||
|
||||
# To host paperless under a subpath url like example.com/paperless you set
|
||||
# this value to /paperless. No trailing slash!
|
||||
#
|
||||
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
|
||||
#PAPERLESS_FORCE_SCRIPT_NAME=""
|
||||
|
||||
###############################################################################
|
||||
#### Software Tweaks ####
|
||||
###############################################################################
|
||||
|
||||
# After a document is consumed, Paperless can trigger an arbitrary script if
|
||||
# you like. This script will be passed a number of arguments for you to work
|
||||
# with. The default is blank, which means nothing will be executed. For more
|
||||
# information, take a look at the docs:
|
||||
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
|
||||
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
|
||||
|
||||
# By default, paperless will check the document text for document date information.
|
||||
# Uncomment the line below to enable checking the document filename for date
|
||||
# information. The date order can be set to any option as specified in
|
||||
# https://dateparser.readthedocs.io/en/latest/#settings. The filename will be
|
||||
# checked first, and if nothing is found, the document text will be checked
|
||||
# as normal.
|
||||
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
|
||||
|
||||
# Sometimes devices won't create filenames which can be parsed properly
|
||||
# by the filename parser (see
|
||||
# https://paperless.readthedocs.io/en/latest/guesswork.html).
|
||||
#
|
||||
# This setting allows to specify a list of transformations
|
||||
# in regular expression syntax, which are passed in order to re.sub.
|
||||
# Transformation stops after the first match, so at most one transformation
|
||||
# is applied.
|
||||
#
|
||||
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
|
||||
# as keys.
|
||||
#
|
||||
# The example below transforms filenames created by a Brother ADS-2400N
|
||||
# document scanner in its standard configuration `Name_Date_Count', so that
|
||||
# count is used as title, name as tag and date can be parsed by paperless.
|
||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
|
||||
|
||||
#
|
||||
# The following values use sensible defaults for modern systems, but if you're
|
||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
||||
# some of these values may be necessary.
|
||||
#
|
||||
|
||||
|
||||
# By default, Paperless will attempt to use all available CPU cores to process
|
||||
# a document, but if you would like to limit that, you can set this value to
|
||||
# an integer:
|
||||
#PAPERLESS_OCR_THREADS=1
|
||||
|
||||
|
||||
# Customize the default language that tesseract will attempt to use when
|
||||
# parsing documents. The default language is used whenever
|
||||
# - No language could be detected on a document
|
||||
# - No tesseract data files are available for the detected language
|
||||
# It should be a 3-letter language code consistent with ISO
|
||||
# 639: https://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
|
||||
|
||||
# On smaller systems, or even in the case of Very Large Documents, the consumer
|
||||
# may explode, complaining about how it's "unable to extend pixel cache". In
|
||||
# such cases, try setting this to a reasonably low value, like 32000000. The
|
||||
# default is to use whatever is necessary to do everything without writing to
|
||||
# disk, and units are in megabytes.
|
||||
#
|
||||
# For more information on how to use this value, you should probably search
|
||||
# the web for "MAGICK_MEMORY_LIMIT".
|
||||
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||
|
||||
|
||||
# Similar to the memory limit, if you've got a small system and your OS mounts
|
||||
# /tmp as tmpfs, you should set this to a path that's on a physical disk, like
|
||||
# /home/your_user/tmp or something. ImageMagick will use this as scratch space
|
||||
# when crunching through very large documents.
|
||||
#
|
||||
# For more information on how to use this value, you should probably search
|
||||
# the web for "MAGICK_TMPDIR".
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
|
||||
|
||||
# By default the conversion density setting for documents is 300DPI, in some
|
||||
# cases it has proven useful to configure a lesser value.
|
||||
# This setting has a high impact on the physical size of tmp page files,
|
||||
# the speed of document conversion, and can affect the accuracy of OCR
|
||||
# results. Individual results can vary and this setting should be tested
|
||||
# thoroughly against the documents you are importing to see if it has any
|
||||
# impacts either negative or positive.
|
||||
# Testing on limited document sets has shown a setting of 200 can cut the
|
||||
# size of tmp files by 1/3, and speed up conversion by up to 4x
|
||||
# with little impact to OCR accuracy.
|
||||
#PAPERLESS_CONVERT_DENSITY=300
|
||||
|
||||
# By default Paperless does not OCR a document if the text can be retrieved from
|
||||
# the document directly. Set to true to always OCR documents.
|
||||
#PAPERLESS_OCR_ALWAYS="false"
|
||||
|
||||
|
||||
###############################################################################
|
||||
#### Interface ####
|
||||
###############################################################################
|
||||
|
||||
# Override the default UTC time zone here.
|
||||
# See https://docs.djangoproject.com/en/1.10/ref/settings/#std:setting-TIME_ZONE
|
||||
# for details on how to set it.
|
||||
#PAPERLESS_TASK_WORKERS=1
|
||||
#PAPERLESS_THREADS_PER_WORKER=1
|
||||
#PAPERLESS_TIME_ZONE=UTC
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
#PAPERLESS_OCR_ALWAYS=false
|
||||
#PAPERLESS_CONSUMER_POLLING=10
|
||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
|
||||
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
#PAPERLESS_CONVERT_DENSITY=300
|
||||
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
|
||||
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
|
||||
#PAPERLESS_FILENAME_DATE_ORDER=YMD
|
||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
|
||||
|
||||
# Binaries
|
||||
|
||||
###############################################################################
|
||||
#### Third-Party Binaries ####
|
||||
###############################################################################
|
||||
|
||||
# There are a few external software packages that Paperless expects to find on
|
||||
# your system when it starts up. Unless you've done something creative with
|
||||
# their installation, you probably won't need to edit any of these. However,
|
||||
# if you've installed these programs somewhere where simply typing the name of
|
||||
# the program doesn't automatically execute it (ie. the program isn't in your
|
||||
# $PATH), then you'll need to specify the literal path for that program here.
|
||||
|
||||
# Convert (part of the ImageMagick suite)
|
||||
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
|
||||
|
||||
# Ghostscript
|
||||
#PAPERLESS_GS_BINARY = /usr/bin/gs
|
||||
|
||||
# Unpaper
|
||||
#PAPERLESS_GS_BINARY=/usr/bin/gs
|
||||
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
|
||||
|
||||
# Optipng (for optimising thumbnail sizes)
|
||||
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
|
||||
|
101
scripts/make-release.sh
Executable file
@ -0,0 +1,101 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
|
||||
VERSION=$1
|
||||
|
||||
if [ -z "$VERSION" ]
|
||||
then
|
||||
echo "Need a version string."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# source root directory of paperless
|
||||
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
# output directory
|
||||
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
|
||||
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"
|
||||
|
||||
if [ -d "$PAPERLESS_DIST" ]
|
||||
then
|
||||
echo "Removing $PAPERLESS_DIST"
|
||||
rm "$PAPERLESS_DIST" -r
|
||||
fi
|
||||
|
||||
mkdir "$PAPERLESS_DIST"
|
||||
mkdir "$PAPERLESS_DIST_APP"
|
||||
mkdir "$PAPERLESS_DIST_APP/docker"
|
||||
|
||||
# setup dependencies.
|
||||
|
||||
cd "$PAPERLESS_ROOT"
|
||||
|
||||
pipenv clean
|
||||
pipenv install --dev
|
||||
pipenv lock --keep-outdated -r > "$PAPERLESS_DIST_APP/requirements.txt"
|
||||
|
||||
# test if the application works.
|
||||
|
||||
cd "$PAPERLESS_ROOT/src"
|
||||
pipenv run pytest --cov
|
||||
pipenv run pycodestyle
|
||||
|
||||
# make the documentation.
|
||||
|
||||
cd "$PAPERLESS_ROOT/docs"
|
||||
make clean html
|
||||
|
||||
# copy stuff into place
|
||||
|
||||
# the application itself
|
||||
|
||||
cp "$PAPERLESS_ROOT/.env" \
|
||||
"$PAPERLESS_ROOT/.dockerignore" \
|
||||
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
|
||||
"$PAPERLESS_ROOT/LICENSE" \
|
||||
"$PAPERLESS_ROOT/Pipfile" \
|
||||
"$PAPERLESS_ROOT/Pipfile.lock" \
|
||||
"$PAPERLESS_ROOT/README.md" "$PAPERLESS_DIST_APP"
|
||||
|
||||
cp "$PAPERLESS_ROOT/paperless.conf.example" "$PAPERLESS_DIST_APP/paperless.conf"
|
||||
|
||||
# copy python source, templates and static files.
|
||||
cd "$PAPERLESS_ROOT"
|
||||
find src -wholename '*/templates/*' -o -wholename '*/static/*' -o -name '*.py' | cpio -pdm "$PAPERLESS_DIST_APP"
|
||||
|
||||
# build the front end.
|
||||
|
||||
cd "$PAPERLESS_ROOT/src-ui"
|
||||
ng build --prod --output-hashing none --sourceMap=false --output-path "$PAPERLESS_DIST_APP/src/documents/static/frontend"
|
||||
|
||||
# documentation
|
||||
cp "$PAPERLESS_ROOT/docs/_build/html/" "$PAPERLESS_DIST_APP/docs" -r
|
||||
|
||||
# docker files for building the image yourself
|
||||
cp "$PAPERLESS_ROOT/docker/local/"* "$PAPERLESS_DIST_APP"
|
||||
cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST_APP"
|
||||
|
||||
# docker files for pulling from docker hub
|
||||
cp "$PAPERLESS_ROOT/docker/hub/"* "$PAPERLESS_DIST"
|
||||
cp "$PAPERLESS_ROOT/.env" "$PAPERLESS_DIST"
|
||||
cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST"
|
||||
|
||||
# auxiliary files required for the docker image
|
||||
cp "$PAPERLESS_ROOT/docker/docker-entrypoint.sh" "$PAPERLESS_DIST_APP/docker/"
|
||||
cp "$PAPERLESS_ROOT/docker/gunicorn.conf.py" "$PAPERLESS_DIST_APP/docker/"
|
||||
cp "$PAPERLESS_ROOT/docker/imagemagick-policy.xml" "$PAPERLESS_DIST_APP/docker/"
|
||||
cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
|
||||
|
||||
# try to make the docker build.
|
||||
|
||||
cd "$PAPERLESS_DIST_APP"
|
||||
|
||||
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
|
||||
|
||||
# works. package the app!
|
||||
|
||||
cd "$PAPERLESS_DIST"
|
||||
|
||||
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
|
23
scripts/push-release.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
|
||||
VERSION=$1
|
||||
|
||||
if [ -z "$VERSION" ]
|
||||
then
|
||||
echo "Need a version string."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# source root directory of paperless
|
||||
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
# output directory
|
||||
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
|
||||
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"
|
||||
|
||||
cd "$PAPERLESS_DIST_APP"
|
||||
|
||||
docker push "jonaswinkler/paperless-ng:$VERSION"
|
2
scripts/start_services.sh
Executable file
@ -0,0 +1,2 @@
|
||||
docker run -p 5432:5432 -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13
|
||||
docker run -d -p 6379:6379 redis:latest
|
8
src-ui/package-lock.json
generated
@ -8260,6 +8260,14 @@
|
||||
"moment": "2.18.1"
|
||||
}
|
||||
},
|
||||
"ngx-cookie-service": {
|
||||
"version": "10.1.1",
|
||||
"resolved": "https://registry.npmjs.org/ngx-cookie-service/-/ngx-cookie-service-10.1.1.tgz",
|
||||
"integrity": "sha512-HvBrYHdxMN1NvFJGEIF/8EuAg2fjxj8QwqTv9h6qZGqNLU+lUba8Pb2zRPw1YA+gqKkJawOy5dYNeH0kyPyipw==",
|
||||
"requires": {
|
||||
"tslib": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"ngx-file-drop": {
|
||||
"version": "10.0.0",
|
||||
"resolved": "https://registry.npmjs.org/ngx-file-drop/-/ngx-file-drop-10.0.0.tgz",
|
||||
|
@ -23,6 +23,7 @@
|
||||
"@ng-bootstrap/ng-bootstrap": "^8.0.0",
|
||||
"bootstrap": "^4.5.0",
|
||||
"ng-bootstrap": "^1.6.3",
|
||||
"ngx-cookie-service": "^10.1.1",
|
||||
"ngx-file-drop": "^10.0.0",
|
||||
"ngx-infinite-scroll": "^9.1.0",
|
||||
"rxjs": "~6.6.0",
|
||||
|
@ -39,6 +39,8 @@ import { InfiniteScrollModule } from 'ngx-infinite-scroll';
|
||||
import { DateTimeComponent } from './components/common/input/date-time/date-time.component';
|
||||
import { TagsComponent } from './components/common/input/tags/tags.component';
|
||||
import { SortableDirective } from './directives/sortable.directive';
|
||||
import { CookieService } from 'ngx-cookie-service';
|
||||
import { CsrfInterceptor } from './interceptors/csrf.interceptor';
|
||||
import { SavedViewWidgetComponent } from './components/dashboard/widgets/saved-view-widget/saved-view-widget.component';
|
||||
import { ConsumerStatusWidgetComponent } from './components/dashboard/widgets/consumer-status-widget/consumer-status-widget.component';
|
||||
import { StatisticsWidgetComponent } from './components/dashboard/widgets/statistics-widget/statistics-widget.component';
|
||||
@ -93,7 +95,12 @@ import { FileUploadWidgetComponent } from './components/dashboard/widgets/file-u
|
||||
InfiniteScrollModule
|
||||
],
|
||||
providers: [
|
||||
DatePipe
|
||||
DatePipe,
|
||||
CookieService, {
|
||||
provide: HTTP_INTERCEPTORS,
|
||||
useClass: CsrfInterceptor,
|
||||
multi: true
|
||||
}
|
||||
],
|
||||
bootstrap: [AppComponent]
|
||||
})
|
||||
|
@ -1,5 +1,5 @@
|
||||
<nav class="navbar navbar-dark sticky-top bg-dark flex-md-nowrap p-0 shadow">
|
||||
<span class="navbar-brand col-md-3 col-lg-2 mr-0 px-3" href="#">Paperless</span>
|
||||
<span class="navbar-brand col-md-3 col-lg-2 mr-0 px-3" href="#">Paperless-ng</span>
|
||||
<button class="navbar-toggler position-absolute d-md-none collapsed" type="button" data-toggle="collapse"
|
||||
data-target="#sidebarMenu" aria-controls="sidebarMenu" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
@ -69,6 +69,14 @@
|
||||
{{d.title}}
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item w-100" *ngIf="openDocuments.length > 1">
|
||||
<a class="nav-link text-truncate" [routerLink]="" (click)="closeAll()">
|
||||
<svg class="sidebaricon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#x"/>
|
||||
</svg>
|
||||
Close all
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h6 class="sidebar-heading d-flex justify-content-between align-items-center px-3 mt-4 mb-1 text-muted">
|
||||
@ -124,6 +132,28 @@
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h6 class="sidebar-heading d-flex justify-content-between align-items-center px-3 mt-4 mb-1 text-muted">
|
||||
<span>Misc</span>
|
||||
</h6>
|
||||
<ul class="nav flex-column mb-2">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="https://paperless-ng.readthedocs.io/en/latest/">
|
||||
<svg class="sidebaricon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#question-circle"/>
|
||||
</svg>
|
||||
Documentation
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="https://github.com/jonaswinkler/paperless-ng">
|
||||
<svg class="sidebaricon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#link"/>
|
||||
</svg>
|
||||
Github
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
|
@ -1,12 +1,13 @@
|
||||
import { Component, OnDestroy, OnInit } from '@angular/core';
|
||||
import { FormControl } from '@angular/forms';
|
||||
import { Router } from '@angular/router';
|
||||
import { ActivatedRoute, Router } from '@angular/router';
|
||||
import { from, Observable, Subscription } from 'rxjs';
|
||||
import { debounceTime, distinctUntilChanged, map, switchMap } from 'rxjs/operators';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { OpenDocumentsService } from 'src/app/services/open-documents.service';
|
||||
import { SearchService } from 'src/app/services/rest/search.service';
|
||||
import { SavedViewConfigService } from 'src/app/services/saved-view-config.service';
|
||||
import { DocumentDetailComponent } from '../document-detail/document-detail.component';
|
||||
|
||||
@Component({
|
||||
selector: 'app-app-frame',
|
||||
@ -17,6 +18,7 @@ export class AppFrameComponent implements OnInit, OnDestroy {
|
||||
|
||||
constructor (
|
||||
public router: Router,
|
||||
private activatedRoute: ActivatedRoute,
|
||||
private openDocumentsService: OpenDocumentsService,
|
||||
private searchService: SearchService,
|
||||
public viewConfigService: SavedViewConfigService
|
||||
@ -62,6 +64,19 @@ export class AppFrameComponent implements OnInit, OnDestroy {
|
||||
this.router.navigate(['search'], {queryParams: {query: this.searchField.value}})
|
||||
}
|
||||
|
||||
closeAll() {
|
||||
this.openDocumentsService.closeAll()
|
||||
|
||||
// TODO: is there a better way to do this?
|
||||
let route = this.activatedRoute
|
||||
while (route.firstChild) {
|
||||
route = route.firstChild
|
||||
}
|
||||
if (route.component == DocumentDetailComponent) {
|
||||
this.router.navigate([""])
|
||||
}
|
||||
}
|
||||
|
||||
ngOnInit() {
|
||||
this.openDocuments = this.openDocumentsService.getOpenDocuments()
|
||||
}
|
||||
|
@ -86,7 +86,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor {
|
||||
var modal = this.modalService.open(TagEditDialogComponent, {backdrop: 'static'})
|
||||
modal.componentInstance.dialogMode = 'create'
|
||||
modal.componentInstance.success.subscribe(newTag => {
|
||||
this.tagService.list().subscribe(tags => {
|
||||
this.tagService.listAll().subscribe(tags => {
|
||||
this.tags = tags.results
|
||||
this.addTag(newTag.id)
|
||||
})
|
||||
|
@ -1,6 +1,6 @@
|
||||
<div class="row pt-3 pb-2 mb-3 border-bottom align-items-center">
|
||||
<div class="row pt-3 pb-1 mb-3 border-bottom align-items-center" >
|
||||
<div class="col text-truncate">
|
||||
<h1 class="h2 text-truncate">{{title}}</h1>
|
||||
<h1 class="h2 text-truncate" style="line-height: 1.4">{{title}}</h1>
|
||||
</div>
|
||||
<div class="btn-toolbar col-auto">
|
||||
<ng-content></ng-content>
|
||||
|
@ -2,7 +2,7 @@
|
||||
<app-page-header title="Dashboard">
|
||||
</app-page-header>
|
||||
|
||||
<p>Welcome to paperless!</p>
|
||||
<p>Welcome to paperless-ng!</p>
|
||||
|
||||
<div class='row'>
|
||||
<div class="col-lg">
|
||||
|
@ -49,7 +49,6 @@ export class DocumentDetailComponent implements OnInit {
|
||||
private route: ActivatedRoute,
|
||||
private correspondentService: CorrespondentService,
|
||||
private documentTypeService: DocumentTypeService,
|
||||
private datePipe: DatePipe,
|
||||
private router: Router,
|
||||
private modalService: NgbModal,
|
||||
private openDocumentService: OpenDocumentsService,
|
||||
@ -89,7 +88,7 @@ export class DocumentDetailComponent implements OnInit {
|
||||
var modal = this.modalService.open(DocumentTypeEditDialogComponent, {backdrop: 'static'})
|
||||
modal.componentInstance.dialogMode = 'create'
|
||||
modal.componentInstance.success.subscribe(newDocumentType => {
|
||||
this.documentTypeService.list().subscribe(documentTypes => {
|
||||
this.documentTypeService.listAll().subscribe(documentTypes => {
|
||||
this.documentTypes = documentTypes.results
|
||||
this.documentForm.get('document_type_id').setValue(newDocumentType.id)
|
||||
})
|
||||
@ -100,7 +99,7 @@ export class DocumentDetailComponent implements OnInit {
|
||||
var modal = this.modalService.open(CorrespondentEditDialogComponent, {backdrop: 'static'})
|
||||
modal.componentInstance.dialogMode = 'create'
|
||||
modal.componentInstance.success.subscribe(newCorrespondent => {
|
||||
this.correspondentService.list().subscribe(correspondents => {
|
||||
this.correspondentService.listAll().subscribe(correspondents => {
|
||||
this.correspondents = correspondents.results
|
||||
this.documentForm.get('correspondent_id').setValue(newCorrespondent.id)
|
||||
})
|
||||
|
@ -85,8 +85,8 @@
|
||||
<th>Correspondent</th>
|
||||
<th>Title</th>
|
||||
<th>Document type</th>
|
||||
<th>Date created</th>
|
||||
<th>Date added</th>
|
||||
<th>Created</th>
|
||||
<th>Added</th>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr *ngFor="let d of docs.documents" routerLink="/documents/{{d.id}}">
|
||||
|
@ -1,3 +1,7 @@
|
||||
.log-entry-10 {
|
||||
color: lightslategray !important;
|
||||
}
|
||||
|
||||
.log-entry-30 {
|
||||
color: yellow !important;
|
||||
}
|
||||
|
@ -30,7 +30,7 @@ export class LogsComponent implements OnInit {
|
||||
onScroll() {
|
||||
let lastCreated = null
|
||||
if (this.logs.length > 0) {
|
||||
lastCreated = this.logs[this.logs.length-1].created
|
||||
lastCreated = new Date(this.logs[this.logs.length-1].created).toISOString()
|
||||
}
|
||||
this.logService.list(1, 25, 'created', 'des', {'created__lt': lastCreated, 'level__gte': this.level}).subscribe(result => {
|
||||
this.logs.push(...result.results)
|
||||
|
16
src-ui/src/app/interceptors/csrf.interceptor.spec.ts
Normal file
@ -0,0 +1,16 @@
|
||||
import { TestBed } from '@angular/core/testing';
|
||||
|
||||
import { CsrfInterceptor } from './csrf.interceptor';
|
||||
|
||||
describe('CsrfInterceptor', () => {
|
||||
beforeEach(() => TestBed.configureTestingModule({
|
||||
providers: [
|
||||
CsrfInterceptor
|
||||
]
|
||||
}));
|
||||
|
||||
it('should be created', () => {
|
||||
const interceptor: CsrfInterceptor = TestBed.inject(CsrfInterceptor);
|
||||
expect(interceptor).toBeTruthy();
|
||||
});
|
||||
});
|
30
src-ui/src/app/interceptors/csrf.interceptor.ts
Normal file
@ -0,0 +1,30 @@
|
||||
import { Injectable } from '@angular/core';
|
||||
import {
|
||||
HttpRequest,
|
||||
HttpHandler,
|
||||
HttpEvent,
|
||||
HttpInterceptor
|
||||
} from '@angular/common/http';
|
||||
import { Observable } from 'rxjs';
|
||||
import { CookieService } from 'ngx-cookie-service';
|
||||
|
||||
@Injectable()
|
||||
export class CsrfInterceptor implements HttpInterceptor {
|
||||
|
||||
constructor(private cookieService: CookieService) {
|
||||
|
||||
}
|
||||
|
||||
intercept(request: HttpRequest<unknown>, next: HttpHandler): Observable<HttpEvent<unknown>> {
|
||||
let csrfToken = this.cookieService.get('csrftoken')
|
||||
if (csrfToken) {
|
||||
request = request.clone({
|
||||
setHeaders: {
|
||||
'X-CSRFToken': csrfToken
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return next.handle(request);
|
||||
}
|
||||
}
|
@ -1,5 +1,4 @@
|
||||
import { Injectable } from '@angular/core';
|
||||
import { Observable, Subject } from 'rxjs';
|
||||
import { PaperlessDocument } from '../data/paperless-document';
|
||||
import { OPEN_DOCUMENT_SERVICE } from '../data/storage-keys';
|
||||
|
||||
@ -8,6 +7,8 @@ import { OPEN_DOCUMENT_SERVICE } from '../data/storage-keys';
|
||||
})
|
||||
export class OpenDocumentsService {
|
||||
|
||||
private MAX_OPEN_DOCUMENTS = 5
|
||||
|
||||
constructor() {
|
||||
if (sessionStorage.getItem(OPEN_DOCUMENT_SERVICE.DOCUMENTS)) {
|
||||
try {
|
||||
@ -31,7 +32,10 @@ export class OpenDocumentsService {
|
||||
|
||||
openDocument(doc: PaperlessDocument) {
|
||||
if (this.openDocuments.find(d => d.id == doc.id) == null) {
|
||||
this.openDocuments.push(doc)
|
||||
this.openDocuments.unshift(doc)
|
||||
if (this.openDocuments.length > this.MAX_OPEN_DOCUMENTS) {
|
||||
this.openDocuments.pop()
|
||||
}
|
||||
this.save()
|
||||
}
|
||||
}
|
||||
@ -44,6 +48,11 @@ export class OpenDocumentsService {
|
||||
}
|
||||
}
|
||||
|
||||
closeAll() {
|
||||
this.openDocuments.splice(0, this.openDocuments.length)
|
||||
this.save()
|
||||
}
|
||||
|
||||
save() {
|
||||
sessionStorage.setItem(OPEN_DOCUMENT_SERVICE.DOCUMENTS, JSON.stringify(this.openDocuments))
|
||||
}
|
||||
|
69
src-ui/src/assets/logo-dark-notext.svg
Normal file
@ -0,0 +1,69 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="69.999977mm"
|
||||
height="84.283669mm"
|
||||
viewBox="0 0 69.999977 84.283669"
|
||||
version="1.1"
|
||||
id="svg4812"
|
||||
inkscape:version="1.0.1 (3bc2e813f5, 2020-09-07)"
|
||||
sodipodi:docname="logo-dark-notext.svg">
|
||||
<defs
|
||||
id="defs4806" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="0.98994949"
|
||||
inkscape:cx="328.04904"
|
||||
inkscape:cy="330.33332"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="SvgjsG1020"
|
||||
inkscape:document-rotation="0"
|
||||
showgrid="false"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1016"
|
||||
inkscape:window-x="1280"
|
||||
inkscape:window-y="27"
|
||||
inkscape:window-maximized="1" />
|
||||
<metadata
|
||||
id="metadata4809">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(-9.9999792,-10.000082)">
|
||||
<g
|
||||
id="SvgjsG1020"
|
||||
featureKey="symbol1"
|
||||
fill="#ffffff"
|
||||
transform="matrix(0.10341565,0,0,0.10341565,1.2287665,8.3453496)">
|
||||
<path
|
||||
id="path57"
|
||||
style="fill:#ffffff;stroke-width:1.10017"
|
||||
d="M 752.4375,82.365234 C 638.02019,348.60552 87.938206,381.6089 263.96484,810.67383 c 2.20034,5.50083 -40.70621,63.80947 -69.31054,112.21679 -6.601,-24.20366 -14.30329,-50.6063 -13.20313,-52.80664 C 324.47281,700.65835 79.135592,604.94324 65.933594,466.32227 4.3242706,576.33891 -17.678136,768.86756 168.25,879.98438 c 1.10017,-10e-6 9.90207,41.80777 14.30273,62.71093 -4.40066,8.80133 -8.80162,17.60213 -11.00195,24.20313 -4.40066,11.00166 28.60352,9.90123 28.60352,12.10156 3.3005,-1.10017 81.41295,-138.62054 83.61328,-139.7207 C 726.0345,738.06398 804.14532,339.80419 752.4375,82.365234 Z M 526.9043,362.90625 C 320.073,547.73422 284.86775,685.25508 291.46875,752.36523 222.15826,588.44043 425.68898,408.01308 526.9043,362.90625 Z M 127.54297,626.94727 c 39.60599,36.30549 105.6163,147.4222 49.50781,212.33203 13.202,-29.7045 17.60234,-96.81455 -49.50781,-212.33203 z"
|
||||
transform="matrix(0.90895334,0,0,0.90895334,65.06894,-58.865357)" />
|
||||
<defs
|
||||
id="defs14302" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 2.9 KiB |
@ -1,5 +1,4 @@
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.models import Group, User
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.utils.safestring import mark_safe
|
||||
from whoosh.writing import AsyncWriter
|
||||
@ -32,7 +31,7 @@ class TagAdmin(admin.ModelAdmin):
|
||||
list_filter = ("colour", "matching_algorithm")
|
||||
list_editable = ("colour", "match", "matching_algorithm")
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
readonly_fields = ("slug", )
|
||||
|
||||
|
||||
class DocumentTypeAdmin(admin.ModelAdmin):
|
||||
@ -51,9 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
|
||||
class DocumentAdmin(admin.ModelAdmin):
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content", "tags__name")
|
||||
readonly_fields = ("added", "file_type", "storage_type",)
|
||||
list_display = ("title", "created", "added", "correspondent",
|
||||
"tags_", "archive_serial_number", "document_type")
|
||||
readonly_fields = ("added", "file_type", "storage_type", "filename")
|
||||
list_display = (
|
||||
"title",
|
||||
"created",
|
||||
"added",
|
||||
"correspondent",
|
||||
"tags_",
|
||||
"archive_serial_number",
|
||||
"document_type",
|
||||
"filename"
|
||||
)
|
||||
list_filter = (
|
||||
"document_type",
|
||||
"tags",
|
||||
@ -120,8 +127,3 @@ admin.site.register(Tag, TagAdmin)
|
||||
admin.site.register(DocumentType, DocumentTypeAdmin)
|
||||
admin.site.register(Document, DocumentAdmin)
|
||||
admin.site.register(Log, LogAdmin)
|
||||
|
||||
|
||||
# Unless we implement multi-user, these default registrations don't make sense.
|
||||
admin.site.unregister(Group)
|
||||
admin.site.unregister(User)
|
||||
|
@ -1,5 +1,4 @@
|
||||
from django.apps import AppConfig
|
||||
from django.db.models.signals import post_delete
|
||||
|
||||
|
||||
class DocumentsConfig(AppConfig):
|
||||
@ -14,7 +13,6 @@ class DocumentsConfig(AppConfig):
|
||||
add_inbox_tags,
|
||||
run_pre_consume_script,
|
||||
run_post_consume_script,
|
||||
cleanup_document_deletion,
|
||||
set_log_entry,
|
||||
set_correspondent,
|
||||
set_document_type,
|
||||
@ -33,6 +31,4 @@ class DocumentsConfig(AppConfig):
|
||||
document_consumption_finished.connect(add_to_index)
|
||||
document_consumption_finished.connect(run_post_consume_script)
|
||||
|
||||
post_delete.connect(cleanup_document_deletion)
|
||||
|
||||
AppConfig.ready(self)
|
||||
|
@ -4,6 +4,8 @@ from django.conf import settings
|
||||
from django.core.checks import Error, register
|
||||
from django.db.utils import OperationalError, ProgrammingError
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
|
||||
@register()
|
||||
def changed_password_check(app_configs, **kwargs):
|
||||
@ -37,3 +39,17 @@ def changed_password_check(app_configs, **kwargs):
|
||||
"""))]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@register()
|
||||
def parser_check(app_configs, **kwargs):
|
||||
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
if len(parsers) == 0:
|
||||
return [Error("No parsers found. This is a bug. The consumer won't be "
|
||||
"able to onsume any documents without parsers.")]
|
||||
else:
|
||||
return []
|
||||
|
@ -3,7 +3,6 @@ import logging
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import time
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
@ -64,7 +63,7 @@ class DocumentClassifier(object):
|
||||
|
||||
def save_classifier(self):
|
||||
with open(settings.MODEL_FILE, "wb") as f:
|
||||
pickle.dump(self.FORMAT_VERSION, f) # Version
|
||||
pickle.dump(self.FORMAT_VERSION, f)
|
||||
pickle.dump(self.data_hash, f)
|
||||
pickle.dump(self.data_vectorizer, f)
|
||||
|
||||
@ -89,16 +88,14 @@ class DocumentClassifier(object):
|
||||
data.append(preprocessed_content)
|
||||
|
||||
y = -1
|
||||
if doc.document_type:
|
||||
if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||
y = doc.document_type.pk
|
||||
if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||
y = doc.document_type.pk
|
||||
m.update(y.to_bytes(4, 'little', signed=True))
|
||||
labels_document_type.append(y)
|
||||
|
||||
y = -1
|
||||
if doc.correspondent:
|
||||
if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||
y = doc.correspondent.pk
|
||||
if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||
y = doc.correspondent.pk
|
||||
m.update(y.to_bytes(4, 'little', signed=True))
|
||||
labels_correspondent.append(y)
|
||||
|
||||
@ -120,8 +117,8 @@ class DocumentClassifier(object):
|
||||
|
||||
num_tags = len(labels_tags_unique)
|
||||
# substract 1 since -1 (null) is also part of the classes.
|
||||
num_correspondents = len(labels_correspondent) - 1
|
||||
num_document_types = len(labels_document_type) - 1
|
||||
num_correspondents = len(set(labels_correspondent)) - 1
|
||||
num_document_types = len(set(labels_document_type)) - 1
|
||||
|
||||
logging.getLogger(__name__).debug(
|
||||
"{} documents, {} tag(s), {} correspondent(s), "
|
||||
@ -137,7 +134,7 @@ class DocumentClassifier(object):
|
||||
logging.getLogger(__name__).debug("Vectorizing data...")
|
||||
self.data_vectorizer = CountVectorizer(
|
||||
analyzer="word",
|
||||
ngram_range=(1,2),
|
||||
ngram_range=(1, 2),
|
||||
min_df=0.01
|
||||
)
|
||||
data_vectorized = self.data_vectorizer.fit_transform(data)
|
||||
|
@ -3,7 +3,6 @@ import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from asgiref.sync import async_to_sync
|
||||
from channels.layers import get_channel_layer
|
||||
@ -13,7 +12,9 @@ from django.utils import timezone
|
||||
|
||||
from paperless.db import GnuPG
|
||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||
from .models import Document, FileInfo
|
||||
from .file_handling import generate_filename, create_source_path_directory
|
||||
from .loggers import LoggingMixin
|
||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||
from .parsers import ParseError, get_parser_class
|
||||
from .signals import (
|
||||
document_consumption_finished,
|
||||
@ -25,17 +26,10 @@ class ConsumerError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Consumer:
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale pnm
|
||||
2. Use tesseract on the pnm
|
||||
3. Store the document in the MEDIA_ROOT with optional encryption
|
||||
4. Store the OCR'd text in the database
|
||||
5. Delete the document and image(s)
|
||||
"""
|
||||
class Consumer(LoggingMixin):
|
||||
|
||||
def _send_progress(self, filename, current_progress, max_progress, status, message, document_id=None):
|
||||
def _send_progress(self, filename, current_progress, max_progress, status,
|
||||
message, document_id=None):
|
||||
payload = {
|
||||
'filename': os.path.basename(filename),
|
||||
'current_progress': current_progress,
|
||||
@ -44,156 +38,226 @@ class Consumer:
|
||||
'message': message,
|
||||
'document_id': document_id
|
||||
}
|
||||
async_to_sync(self.channel_layer.group_send)("status_updates", {'type': 'status_update', 'data': payload})
|
||||
async_to_sync(self.channel_layer.group_send)("status_updates",
|
||||
{'type': 'status_update',
|
||||
'data': payload})
|
||||
|
||||
def __init__(self, consume=settings.CONSUMPTION_DIR,
|
||||
scratch=settings.SCRATCH_DIR):
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
|
||||
self.consume = consume
|
||||
self.scratch = scratch
|
||||
|
||||
self.classifier = DocumentClassifier()
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.path = None
|
||||
self.filename = None
|
||||
self.override_title = None
|
||||
self.override_correspondent_id = None
|
||||
self.override_tag_ids = None
|
||||
self.override_document_type_id = None
|
||||
|
||||
self.channel_layer = get_channel_layer()
|
||||
|
||||
os.makedirs(self.scratch, exist_ok=True)
|
||||
def pre_check_file_exists(self):
|
||||
if not os.path.isfile(self.path):
|
||||
raise ConsumerError("Cannot consume {}: It is not a file".format(
|
||||
self.path))
|
||||
|
||||
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
if settings.PASSPHRASE:
|
||||
self.storage_type = Document.STORAGE_TYPE_GPG
|
||||
|
||||
if not self.consume:
|
||||
def pre_check_consumption_dir(self):
|
||||
if not settings.CONSUMPTION_DIR:
|
||||
raise ConsumerError(
|
||||
"The CONSUMPTION_DIR settings variable does not appear to be "
|
||||
"set."
|
||||
)
|
||||
"set.")
|
||||
|
||||
if not os.path.exists(self.consume):
|
||||
if not os.path.isdir(settings.CONSUMPTION_DIR):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(self.consume))
|
||||
"Consumption directory {} does not exist".format(
|
||||
settings.CONSUMPTION_DIR))
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
def pre_check_regex(self):
|
||||
if not re.match(FileInfo.REGEXES["title"], self.filename):
|
||||
raise ConsumerError(
|
||||
"Filename {} does not seem to be safe to "
|
||||
"consume".format(self.filename))
|
||||
|
||||
@transaction.atomic
|
||||
def try_consume_file(self, file):
|
||||
"""
|
||||
Return True if file was consumed
|
||||
"""
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
if not re.match(FileInfo.REGEXES["title"], file):
|
||||
return False
|
||||
|
||||
doc = file
|
||||
|
||||
if self._is_duplicate(doc):
|
||||
self.log(
|
||||
"warning",
|
||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
||||
def pre_check_duplicate(self):
|
||||
with open(self.path, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
if Document.objects.filter(checksum=checksum).exists():
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
os.unlink(self.path)
|
||||
raise ConsumerError(
|
||||
"Not consuming {}: It is a duplicate.".format(self.filename)
|
||||
)
|
||||
return False
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
def pre_check_directories(self):
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
||||
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
||||
|
||||
def try_consume_file(self,
|
||||
path,
|
||||
override_filename=None,
|
||||
override_title=None,
|
||||
override_correspondent_id=None,
|
||||
override_document_type_id=None,
|
||||
override_tag_ids=None):
|
||||
"""
|
||||
Return the document object if it was successfully created.
|
||||
"""
|
||||
|
||||
parser_class = get_parser_class(doc)
|
||||
self.path = path
|
||||
self.filename = override_filename or os.path.basename(path)
|
||||
self.override_title = override_title
|
||||
self.override_correspondent_id = override_correspondent_id
|
||||
self.override_document_type_id = override_document_type_id
|
||||
self.override_tag_ids = override_tag_ids
|
||||
|
||||
# this is for grouping logging entries for this particular file
|
||||
# together.
|
||||
|
||||
self.renew_logging_group()
|
||||
|
||||
# Make sure that preconditions for consuming the file are met.
|
||||
|
||||
self.pre_check_file_exists()
|
||||
self.pre_check_consumption_dir()
|
||||
self.pre_check_directories()
|
||||
self.pre_check_regex()
|
||||
self.pre_check_duplicate()
|
||||
|
||||
self.log("info", "Consuming {}".format(self.filename))
|
||||
|
||||
# Determine the parser class.
|
||||
|
||||
parser_class = get_parser_class(self.filename)
|
||||
if not parser_class:
|
||||
self.log(
|
||||
"error", "No parsers could be found for {}".format(doc))
|
||||
return False
|
||||
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
|
||||
else:
|
||||
self.log("info", "Parser: {}".format(parser_class.__name__))
|
||||
self.log("debug", "Parser: {}".format(parser_class.__name__))
|
||||
|
||||
self._send_progress(file, 0, 100, 'WORKING', 'Consumption started')
|
||||
# Notify all listeners that we're going to do some work.
|
||||
|
||||
self._send_progress(self.filename, 0, 100, 'WORKING', 'Consumption started')
|
||||
|
||||
document_consumption_started.send(
|
||||
sender=self.__class__,
|
||||
filename=doc,
|
||||
filename=self.path,
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
def progress_callback(current_progress, max_progress, message):
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 60 + 20)
|
||||
self._send_progress(file, p, 100, "WORKING", message)
|
||||
self._send_progress(self.filename, p, 100, "WORKING", message)
|
||||
|
||||
document_parser = parser_class(doc, self.logging_group, progress_callback)
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser = parser_class(self.path, self.logging_group, progress_callback)
|
||||
|
||||
# However, this already created working directories which we have to
|
||||
# clean up.
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
try:
|
||||
self.log("info", "Generating thumbnail for {}...".format(doc))
|
||||
self._send_progress(file, 10, 100, 'WORKING',
|
||||
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
|
||||
self._send_progress(self.filename, 10, 100, 'WORKING',
|
||||
'Generating thumbnail...')
|
||||
thumbnail = document_parser.get_optimised_thumbnail()
|
||||
self._send_progress(file, 20, 100, 'WORKING',
|
||||
self.log("debug", "Parsing {}...".format(self.filename))
|
||||
self._send_progress(self.filename, 20, 100, 'WORKING',
|
||||
'Getting text from document...')
|
||||
text = document_parser.get_text()
|
||||
self._send_progress(file, 80, 100, 'WORKING',
|
||||
self._send_progress(self.filename, 80, 100, 'WORKING',
|
||||
'Getting date from document...')
|
||||
date = document_parser.get_date()
|
||||
self._send_progress(file, 85, 100, 'WORKING',
|
||||
'Storing the document...')
|
||||
document = self._store(
|
||||
text,
|
||||
doc,
|
||||
thumbnail,
|
||||
date
|
||||
)
|
||||
except ParseError as e:
|
||||
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
document_parser.cleanup()
|
||||
self._send_progress(self.filename, 100, 100, 'FAILED',
|
||||
"Failed: {}".format(e))
|
||||
raise ConsumerError(e)
|
||||
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
try:
|
||||
classifier = DocumentClassifier()
|
||||
classifier.reload()
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
||||
logging.getLogger(__name__).warning(
|
||||
"Cannot classify documents: {}.".format(e))
|
||||
classifier = None
|
||||
self._send_progress(self.filename, 85, 100, 'WORKING',
|
||||
'Storing the document...')
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
|
||||
# store the document.
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date
|
||||
)
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
self._send_progress(self.filename, 90, 100, 'WORKING',
|
||||
'Performing post-consumption tasks...')
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
self._write(document, self.path, document.source_path)
|
||||
self._write(document, thumbnail, document.thumbnail_path)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log("debug", "Deleting file {}".format(self.path))
|
||||
os.unlink(self.path)
|
||||
except Exception as e:
|
||||
raise ConsumerError(e)
|
||||
self._send_progress(file, 100, 100, 'FAILED',
|
||||
"Failed: {}".format(e))
|
||||
|
||||
finally:
|
||||
document_parser.cleanup()
|
||||
return False
|
||||
else:
|
||||
document_parser.cleanup()
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
|
||||
classifier = None
|
||||
self._send_progress(file, 100, 100, 'SUCCESS',
|
||||
'Finished.', document.id)
|
||||
|
||||
try:
|
||||
self.classifier.reload()
|
||||
classifier = self.classifier
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
||||
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
|
||||
return document
|
||||
|
||||
self._send_progress(file, 90, 100, 'WORKING',
|
||||
'Performing post-consumption tasks...')
|
||||
def _store(self, text, date):
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier
|
||||
)
|
||||
self._send_progress(file, 100, 100, 'SUCCESS',
|
||||
'Finished.', document.id)
|
||||
return True
|
||||
# If someone gave us the original filename, use it instead of doc.
|
||||
|
||||
def _store(self, text, doc, thumbnail, date):
|
||||
file_info = FileInfo.from_path(self.filename)
|
||||
|
||||
file_info = FileInfo.from_path(doc)
|
||||
|
||||
stats = os.stat(doc)
|
||||
stats = os.stat(self.path)
|
||||
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
created = file_info.created or date or timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
|
||||
with open(doc, "rb") as f:
|
||||
if settings.PASSPHRASE:
|
||||
storage_type = Document.STORAGE_TYPE_GPG
|
||||
else:
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
with open(self.path, "rb") as f:
|
||||
document = Document.objects.create(
|
||||
correspondent=file_info.correspondent,
|
||||
title=file_info.title,
|
||||
@ -202,7 +266,7 @@ class Consumer:
|
||||
checksum=hashlib.md5(f.read()).hexdigest(),
|
||||
created=created,
|
||||
modified=created,
|
||||
storage_type=self.storage_type
|
||||
storage_type=storage_type
|
||||
)
|
||||
|
||||
relevant_tags = set(file_info.tags)
|
||||
@ -211,14 +275,30 @@ class Consumer:
|
||||
self.log("debug", "Tagging with {}".format(tag_names))
|
||||
document.tags.add(*relevant_tags)
|
||||
|
||||
self._write(document, doc, document.source_path)
|
||||
self._write(document, thumbnail, document.thumbnail_path)
|
||||
self.apply_overrides(document)
|
||||
|
||||
#TODO: why do we need to save the document again?
|
||||
document.filename = generate_filename(document)
|
||||
|
||||
# We need to save the document twice, since we need the PK of the
|
||||
# document in order to create its filename above.
|
||||
document.save()
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document):
|
||||
if self.override_title:
|
||||
document.title = self.override_title
|
||||
|
||||
if self.override_correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
|
||||
|
||||
if self.override_document_type_id:
|
||||
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
|
||||
|
||||
if self.override_tag_ids:
|
||||
for tag_id in self.override_tag_ids:
|
||||
document.tags.add(Tag.objects.get(pk=tag_id))
|
||||
|
||||
def _write(self, document, source, target):
|
||||
with open(source, "rb") as read_file:
|
||||
with open(target, "wb") as write_file:
|
||||
@ -227,13 +307,3 @@ class Consumer:
|
||||
return
|
||||
self.log("debug", "Encrypting")
|
||||
write_file.write(GnuPG.encrypted(read_file))
|
||||
|
||||
def _cleanup_doc(self, doc):
|
||||
self.log("debug", "Deleting document {}".format(doc))
|
||||
os.unlink(doc)
|
||||
|
||||
@staticmethod
|
||||
def _is_duplicate(doc):
|
||||
with open(doc, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
return Document.objects.filter(checksum=checksum).exists()
|
||||
|
102
src/documents/file_handling.py
Normal file
@ -0,0 +1,102 @@
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
from django.conf import settings
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
|
||||
def create_source_path_directory(source_path):
|
||||
os.makedirs(os.path.dirname(source_path), exist_ok=True)
|
||||
|
||||
|
||||
def delete_empty_directories(directory):
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
directory = os.path.normpath(directory)
|
||||
root = os.path.normpath(settings.ORIGINALS_DIR)
|
||||
|
||||
if not directory.startswith(root + os.path.sep):
|
||||
# don't do anything outside our originals folder.
|
||||
|
||||
# append os.path.set so that we avoid these cases:
|
||||
# directory = /home/originals2/test
|
||||
# root = /home/originals ("/" gets appended and startswith fails)
|
||||
return
|
||||
|
||||
while directory != root:
|
||||
if not os.listdir(directory):
|
||||
# it's empty
|
||||
try:
|
||||
os.rmdir(directory)
|
||||
except OSError:
|
||||
# whatever. empty directories aren't that bad anyway.
|
||||
return
|
||||
else:
|
||||
# it's not empty.
|
||||
return
|
||||
|
||||
# go one level up
|
||||
directory = os.path.normpath(os.path.dirname(directory))
|
||||
|
||||
|
||||
def many_to_dictionary(field):
|
||||
# Converts ManyToManyField to dictionary by assuming, that field
|
||||
# entries contain an _ or - which will be used as a delimiter
|
||||
mydictionary = dict()
|
||||
|
||||
for index, t in enumerate(field.all()):
|
||||
# Populate tag names by index
|
||||
mydictionary[index] = slugify(t.name)
|
||||
|
||||
# Find delimiter
|
||||
delimiter = t.name.find('_')
|
||||
|
||||
if delimiter == -1:
|
||||
delimiter = t.name.find('-')
|
||||
|
||||
if delimiter == -1:
|
||||
continue
|
||||
|
||||
key = t.name[:delimiter]
|
||||
value = t.name[delimiter + 1:]
|
||||
|
||||
mydictionary[slugify(key)] = slugify(value)
|
||||
|
||||
return mydictionary
|
||||
|
||||
|
||||
def generate_filename(document):
|
||||
# Create filename based on configured format
|
||||
path = ""
|
||||
|
||||
try:
|
||||
if settings.PAPERLESS_FILENAME_FORMAT is not None:
|
||||
tags = defaultdict(lambda: slugify(None),
|
||||
many_to_dictionary(document.tags))
|
||||
path = settings.PAPERLESS_FILENAME_FORMAT.format(
|
||||
correspondent=slugify(document.correspondent),
|
||||
title=slugify(document.title),
|
||||
created=slugify(document.created),
|
||||
created_year=document.created.year if document.created else "none",
|
||||
created_month=document.created.month if document.created else "none",
|
||||
created_day=document.created.day if document.created else "none",
|
||||
added=slugify(document.added),
|
||||
added_year=document.added.year if document.added else "none",
|
||||
added_month=document.added.month if document.added else "none",
|
||||
added_day=document.added.day if document.added else "none",
|
||||
tags=tags,
|
||||
)
|
||||
except (ValueError, KeyError, IndexError):
|
||||
logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
|
||||
|
||||
# Always append the primary key to guarantee uniqueness of filename
|
||||
if len(path) > 0:
|
||||
filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
|
||||
else:
|
||||
filename = "%07i.%s" % (document.pk, document.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if document.storage_type == document.STORAGE_TYPE_GPG:
|
||||
filename += ".gpg"
|
||||
|
||||
return filename
|
@ -1,10 +1,11 @@
|
||||
import os
|
||||
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
from django import forms
|
||||
from django.conf import settings
|
||||
from django_q.tasks import async_task
|
||||
from pathvalidate import validate_filename, ValidationError
|
||||
|
||||
|
||||
@ -19,12 +20,6 @@ class UploadForm(forms.Form):
|
||||
raise forms.ValidationError("That filename is suspicious.")
|
||||
return self.cleaned_data.get("document")
|
||||
|
||||
def get_filename(self, i=None):
|
||||
return os.path.join(
|
||||
settings.CONSUMPTION_DIR,
|
||||
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
|
||||
)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Since the consumer already does a lot of work, it's easier just to save
|
||||
@ -33,15 +28,16 @@ class UploadForm(forms.Form):
|
||||
"""
|
||||
|
||||
document = self.cleaned_data.get("document").read()
|
||||
original_filename = self.cleaned_data.get("document").name
|
||||
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
|
||||
file_name = self.get_filename()
|
||||
i = 0
|
||||
while os.path.exists(file_name):
|
||||
i += 1
|
||||
file_name = self.get_filename(i)
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
|
||||
# TODO: dont just append pdf. This is here for taht weird regex check at the start of the consumer.
|
||||
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
|
||||
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(document)
|
||||
os.utime(file_name, times=(t, t))
|
||||
os.utime(f.name, times=(t, t))
|
||||
|
||||
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
|
||||
|
@ -1,7 +1,6 @@
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
|
||||
from django.db import models
|
||||
from django.dispatch import receiver
|
||||
from whoosh import highlight
|
||||
from whoosh.fields import Schema, TEXT, NUMERIC
|
||||
from whoosh.highlight import Formatter, get_text
|
||||
@ -9,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir
|
||||
from whoosh.qparser import MultifieldParser
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents.models import Document
|
||||
from paperless import settings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -69,6 +66,9 @@ def open_index(recreate=False):
|
||||
if exists_in(settings.INDEX_DIR) and not recreate:
|
||||
return open_dir(settings.INDEX_DIR)
|
||||
else:
|
||||
# TODO: this is not thread safe. If 2 instances try to create the index
|
||||
# at the same time, this fails. This currently prevents parallel
|
||||
# tests.
|
||||
return create_in(settings.INDEX_DIR, get_schema())
|
||||
|
||||
|
||||
@ -99,15 +99,19 @@ def remove_document_from_index(document):
|
||||
remove_document(writer, document)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def query_page(ix, query, page):
|
||||
with ix.searcher() as searcher:
|
||||
searcher = ix.searcher()
|
||||
try:
|
||||
query_parser = MultifieldParser(["content", "title", "correspondent"],
|
||||
ix.schema).parse(query)
|
||||
result_page = searcher.search_page(query_parser, page)
|
||||
result_page.results.fragmenter = highlight.ContextFragmenter(
|
||||
surround=50)
|
||||
result_page.results.formatter = JsonFormatter()
|
||||
return result_page
|
||||
yield result_page
|
||||
finally:
|
||||
searcher.close()
|
||||
|
||||
|
||||
def autocomplete(ix, term, limit=10):
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
|
||||
class PaperlessHandler(logging.Handler):
|
||||
@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
|
||||
kwargs["group"] = record.group
|
||||
|
||||
Log.objects.create(**kwargs)
|
||||
|
||||
|
||||
class LoggingMixin:
|
||||
|
||||
logging_group = None
|
||||
|
||||
def renew_logging_group(self):
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
def log(self, level, message):
|
||||
target = ".".join([self.__class__.__module__, self.__class__.__name__])
|
||||
logger = logging.getLogger(target)
|
||||
|
||||
getattr(logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
|
@ -1,250 +0,0 @@
|
||||
import datetime
|
||||
import imaplib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from base64 import b64decode
|
||||
from email import policy
|
||||
from email.parser import BytesParser
|
||||
from dateutil import parser
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .models import Correspondent
|
||||
|
||||
|
||||
class MailFetcherError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class InvalidMessageError(MailFetcherError):
|
||||
pass
|
||||
|
||||
|
||||
class Loggable(object):
|
||||
|
||||
def __init__(self, group=None):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = group or uuid.uuid4()
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
|
||||
|
||||
class Message(Loggable):
|
||||
"""
|
||||
A crude, but simple email message class. We assume that there's a subject
|
||||
and n attachments, and that we don't care about the message body.
|
||||
"""
|
||||
|
||||
SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
|
||||
|
||||
def __init__(self, data, group=None):
|
||||
"""
|
||||
Cribbed heavily from
|
||||
https://www.ianlewis.org/en/parsing-email-attachments-python
|
||||
"""
|
||||
|
||||
Loggable.__init__(self, group=group)
|
||||
|
||||
self.subject = None
|
||||
self.time = None
|
||||
self.attachment = None
|
||||
|
||||
message = BytesParser(policy=policy.default).parsebytes(data)
|
||||
self.subject = str(message["Subject"]).replace("\r\n", "")
|
||||
self.body = str(message.get_body())
|
||||
|
||||
self.check_subject()
|
||||
self.check_body()
|
||||
|
||||
self._set_time(message)
|
||||
|
||||
self.log("info", 'Importing email: "{}"'.format(self.subject))
|
||||
|
||||
attachments = []
|
||||
for part in message.walk():
|
||||
|
||||
content_disposition = part.get("Content-Disposition")
|
||||
if not content_disposition:
|
||||
continue
|
||||
|
||||
dispositions = content_disposition.strip().split(";")
|
||||
if len(dispositions) < 2:
|
||||
continue
|
||||
|
||||
if not dispositions[0].lower() == "attachment" and \
|
||||
"filename" not in dispositions[1].lower():
|
||||
continue
|
||||
|
||||
file_data = part.get_payload()
|
||||
|
||||
attachments.append(Attachment(
|
||||
b64decode(file_data), content_type=part.get_content_type()))
|
||||
|
||||
if len(attachments) == 0:
|
||||
raise InvalidMessageError(
|
||||
"There don't appear to be any attachments to this message")
|
||||
|
||||
if len(attachments) > 1:
|
||||
raise InvalidMessageError(
|
||||
"There's more than one attachment to this message. It cannot "
|
||||
"be indexed automatically."
|
||||
)
|
||||
|
||||
self.attachment = attachments[0]
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.attachment)
|
||||
|
||||
def check_subject(self):
|
||||
if self.subject is None:
|
||||
raise InvalidMessageError("Message does not have a subject")
|
||||
if not Correspondent.SAFE_REGEX.match(self.subject):
|
||||
raise InvalidMessageError("Message subject is unsafe: {}".format(
|
||||
self.subject))
|
||||
|
||||
def check_body(self):
|
||||
if self.SECRET not in self.body:
|
||||
raise InvalidMessageError("The secret wasn't in the body")
|
||||
|
||||
def _set_time(self, message):
|
||||
self.time = datetime.datetime.now()
|
||||
message_time = message.get("Date")
|
||||
if message_time:
|
||||
try:
|
||||
self.time = parser.parse(message_time)
|
||||
except (ValueError, AttributeError):
|
||||
pass # We assume that "now" is ok
|
||||
|
||||
@property
|
||||
def file_name(self):
|
||||
return "{}.{}".format(self.subject, self.attachment.suffix)
|
||||
|
||||
|
||||
class Attachment(object):
|
||||
|
||||
SAFE_SUFFIX_REGEX = re.compile(
|
||||
r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")
|
||||
|
||||
def __init__(self, data, content_type):
|
||||
|
||||
self.content_type = content_type
|
||||
self.data = data
|
||||
self.suffix = None
|
||||
|
||||
m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
|
||||
if not m:
|
||||
raise MailFetcherError(
|
||||
"Not-awesome file type: {}".format(self.content_type))
|
||||
self.suffix = m.group(2) or m.group(4)
|
||||
|
||||
def read(self):
|
||||
return self.data
|
||||
|
||||
|
||||
class MailFetcher(Loggable):
|
||||
|
||||
def __init__(self, consume=settings.CONSUMPTION_DIR):
|
||||
|
||||
Loggable.__init__(self)
|
||||
|
||||
self._connection = None
|
||||
self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
|
||||
self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
|
||||
self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
|
||||
self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
|
||||
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
|
||||
|
||||
self._enabled = bool(self._host)
|
||||
if self._enabled and Message.SECRET is None:
|
||||
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
|
||||
|
||||
self.last_checked = time.time()
|
||||
self.consume = consume
|
||||
|
||||
def pull(self):
|
||||
"""
|
||||
Fetch all available mail at the target address and store it locally in
|
||||
the consumption directory so that the file consumer can pick it up and
|
||||
do its thing.
|
||||
"""
|
||||
|
||||
if self._enabled:
|
||||
|
||||
# Reset the grouping id for each fetch
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("debug", "Checking mail")
|
||||
|
||||
for message in self._get_messages():
|
||||
|
||||
self.log("info", 'Storing email: "{}"'.format(message.subject))
|
||||
|
||||
t = int(time.mktime(message.time.timetuple()))
|
||||
file_name = os.path.join(self.consume, message.file_name)
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(message.attachment.data)
|
||||
os.utime(file_name, times=(t, t))
|
||||
|
||||
self.last_checked = time.time()
|
||||
|
||||
def _get_messages(self):
|
||||
|
||||
r = []
|
||||
try:
|
||||
|
||||
self._connect()
|
||||
self._login()
|
||||
|
||||
for message in self._fetch():
|
||||
if message:
|
||||
r.append(message)
|
||||
|
||||
self._connection.expunge()
|
||||
self._connection.close()
|
||||
self._connection.logout()
|
||||
|
||||
except MailFetcherError as e:
|
||||
self.log("error", str(e))
|
||||
|
||||
return r
|
||||
|
||||
def _connect(self):
|
||||
try:
|
||||
self._connection = imaplib.IMAP4_SSL(self._host, self._port)
|
||||
except OSError as e:
|
||||
msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
|
||||
raise MailFetcherError(msg)
|
||||
|
||||
def _login(self):
|
||||
|
||||
login = self._connection.login(self._username, self._password)
|
||||
if not login[0] == "OK":
|
||||
raise MailFetcherError("Can't log into mail: {}".format(login[1]))
|
||||
|
||||
inbox = self._connection.select(self._inbox)
|
||||
if not inbox[0] == "OK":
|
||||
raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
|
||||
|
||||
def _fetch(self):
|
||||
|
||||
for num in self._connection.search(None, "ALL")[1][0].split():
|
||||
|
||||
__, data = self._connection.fetch(num, "(RFC822)")
|
||||
|
||||
message = None
|
||||
try:
|
||||
message = Message(data[0][1], self.logging_group)
|
||||
except InvalidMessageError as e:
|
||||
self.log("error", str(e))
|
||||
else:
|
||||
self._connection.store(num, "+FLAGS", "\\Deleted")
|
||||
|
||||
if message:
|
||||
yield message
|
@ -3,11 +3,10 @@ import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from watchdog.observers import Observer
|
||||
from django_q.tasks import async_task
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
|
||||
from documents.consumer import Consumer
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.observers.polling import PollingObserver
|
||||
|
||||
try:
|
||||
from inotify_simple import INotify, flags
|
||||
@ -17,17 +16,25 @@ except ImportError:
|
||||
|
||||
class Handler(FileSystemEventHandler):
|
||||
|
||||
def __init__(self, consumer):
|
||||
self.consumer = consumer
|
||||
def _consume(self, file):
|
||||
if os.path.isfile(file):
|
||||
try:
|
||||
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
|
||||
except Exception as e:
|
||||
# Catch all so that the consumer won't crash.
|
||||
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
|
||||
|
||||
def on_created(self, event):
|
||||
self.consumer.try_consume_file(event.src_path)
|
||||
self._consume(event.src_path)
|
||||
|
||||
def on_moved(self, event):
|
||||
self._consume(event.src_path)
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
"""
|
||||
On every iteration of an infinite loop, consume what we can from the
|
||||
consumption directory, and fetch any mail available.
|
||||
consumption directory.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -35,12 +42,6 @@ class Command(BaseCommand):
|
||||
self.verbosity = 0
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.file_consumer = None
|
||||
self.mail_fetcher = None
|
||||
self.first_iteration = True
|
||||
|
||||
self.consumer = Consumer()
|
||||
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
@ -56,9 +57,6 @@ class Command(BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
directory = options["directory"]
|
||||
|
||||
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
|
||||
os.makedirs(d, exist_ok=True)
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
"Starting document consumer at {}".format(
|
||||
directory
|
||||
@ -68,11 +66,16 @@ class Command(BaseCommand):
|
||||
# Consume all files as this is not done initially by the watchdog
|
||||
for entry in os.scandir(directory):
|
||||
if entry.is_file():
|
||||
self.consumer.try_consume_file(entry.path)
|
||||
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
|
||||
|
||||
# Start the watchdog. Woof!
|
||||
observer = Observer()
|
||||
event_handler = Handler(self.consumer)
|
||||
if settings.CONSUMER_POLLING > 0:
|
||||
logging.getLogger(__name__).info('Using polling instead of file'
|
||||
'system notifications.')
|
||||
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
|
||||
else:
|
||||
observer = Observer()
|
||||
event_handler = Handler()
|
||||
observer.schedule(event_handler, directory, recursive=True)
|
||||
observer.start()
|
||||
try:
|
||||
|
@ -1,4 +1,5 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from ...mixins import Renderable
|
||||
from ...tasks import train_classifier
|
||||
|
||||
|
@ -1,16 +1,15 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import shutil
|
||||
import time
|
||||
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.core import serializers
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from documents.models import Document, Correspondent, Tag, DocumentType
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
from paperless.db import GnuPG
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
@ -3,15 +3,14 @@ import os
|
||||
import shutil
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
from paperless.db import GnuPG
|
||||
from ...file_handling import generate_filename, create_source_path_directory
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
@ -82,6 +81,10 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
def _import_files_from_manifest(self):
|
||||
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
if settings.PASSPHRASE:
|
||||
storage_type = Document.STORAGE_TYPE_GPG
|
||||
|
||||
for record in self.manifest:
|
||||
|
||||
if not record["model"] == "documents.document":
|
||||
@ -94,6 +97,14 @@ class Command(Renderable, BaseCommand):
|
||||
document_path = os.path.join(self.source, doc_file)
|
||||
thumbnail_path = os.path.join(self.source, thumb_file)
|
||||
|
||||
document.storage_type = storage_type
|
||||
document.filename = generate_filename(document)
|
||||
|
||||
if os.path.isfile(document.source_path):
|
||||
raise FileExistsError(document.source_path)
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
if settings.PASSPHRASE:
|
||||
|
||||
with open(document_path, "rb") as unencrypted:
|
||||
@ -109,18 +120,8 @@ class Command(Renderable, BaseCommand):
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
else:
|
||||
|
||||
print("Moving {} to {}".format(document_path, document.source_path))
|
||||
shutil.copy(document_path, document.source_path)
|
||||
shutil.copy(thumbnail_path, document.thumbnail_path)
|
||||
|
||||
# Reset the storage type to whatever we've used while importing
|
||||
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
if settings.PASSPHRASE:
|
||||
storage_type = Document.STORAGE_TYPE_GPG
|
||||
|
||||
Document.objects.filter(
|
||||
pk__in=[r["pk"] for r in self.manifest]
|
||||
).update(
|
||||
storage_type=storage_type
|
||||
)
|
||||
document.save()
|
||||
|
@ -8,5 +8,5 @@ class Command(BaseCommand):
|
||||
help = "A quick & dirty way to see what's in the logs"
|
||||
|
||||
def handle(self, *args, **options):
|
||||
for l in Log.objects.order_by("pk"):
|
||||
print(l)
|
||||
for log in Log.objects.order_by("pk"):
|
||||
print(log)
|
||||
|
@ -1,7 +1,6 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.models import Document, Tag
|
||||
|
||||
from documents.models import Document
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
|
@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier):
|
||||
correspondents = Correspondent.objects.all()
|
||||
predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
|
||||
|
||||
matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
|
||||
return matched_correspondents
|
||||
return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
|
||||
|
||||
|
||||
def match_document_types(document_content, classifier):
|
||||
document_types = DocumentType.objects.all()
|
||||
predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
|
||||
|
||||
matched_document_types = [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
|
||||
return matched_document_types
|
||||
return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
|
||||
|
||||
|
||||
def match_tags(document_content, classifier):
|
||||
|
@ -1,7 +1,4 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-07 12:35
|
||||
import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
@ -9,11 +9,11 @@ from django_q.tasks import schedule
|
||||
def add_schedules(apps, schema_editor):
|
||||
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
|
||||
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
|
||||
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
|
||||
|
||||
|
||||
def remove_schedules(apps, schema_editor):
|
||||
Schedule.objects.all().delete()
|
||||
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
|
||||
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
18
src/documents/migrations/1002_auto_20201111_1105.py
Normal file
@ -0,0 +1,18 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-11 11:05
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1001_auto_20201109_1636'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='filename',
|
||||
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True),
|
||||
),
|
||||
]
|
@ -3,18 +3,15 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict, defaultdict
|
||||
from collections import OrderedDict
|
||||
|
||||
import dateutil.parser
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.dispatch import receiver
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
|
||||
class MatchingModel(models.Model):
|
||||
|
||||
MATCH_ANY = 1
|
||||
@ -116,6 +113,7 @@ class DocumentType(MatchingModel):
|
||||
|
||||
class Document(models.Model):
|
||||
|
||||
# TODO: why do we need an explicit list
|
||||
TYPE_PDF = "pdf"
|
||||
TYPE_PNG = "png"
|
||||
TYPE_JPG = "jpg"
|
||||
@ -192,7 +190,7 @@ class Document(models.Model):
|
||||
default=timezone.now, editable=False, db_index=True)
|
||||
|
||||
filename = models.FilePathField(
|
||||
max_length=256,
|
||||
max_length=1024,
|
||||
editable=False,
|
||||
default=None,
|
||||
null=True,
|
||||
@ -220,123 +218,18 @@ class Document(models.Model):
|
||||
return "{}: {}".format(created, self.correspondent or self.title)
|
||||
return str(created)
|
||||
|
||||
def find_renamed_document(self, subdirectory=""):
|
||||
suffix = "%07i.%s" % (self.pk, self.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
suffix += ".gpg"
|
||||
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
root = os.path.normpath(Document.filename_to_path(subdirectory))
|
||||
|
||||
for filename in os.listdir(root):
|
||||
if filename.endswith(suffix):
|
||||
return os.path.join(subdirectory, filename)
|
||||
|
||||
fullname = os.path.join(subdirectory, filename)
|
||||
if os.path.isdir(Document.filename_to_path(fullname)):
|
||||
return self.find_renamed_document(fullname)
|
||||
|
||||
return None
|
||||
|
||||
@property
|
||||
def source_filename(self):
|
||||
# Initial filename generation (for new documents)
|
||||
if self.filename is None:
|
||||
self.filename = self.generate_source_filename()
|
||||
|
||||
# Check if document is still available under filename
|
||||
elif not os.path.isfile(Document.filename_to_path(self.filename)):
|
||||
recovered_filename = self.find_renamed_document()
|
||||
|
||||
# If we have found the file so update the filename
|
||||
if recovered_filename is not None:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Filename of document " + str(self.id) +
|
||||
" has changed and was successfully updated")
|
||||
self.filename = recovered_filename
|
||||
|
||||
# Remove all empty subdirectories from MEDIA_ROOT
|
||||
Document.delete_all_empty_subdirectories(
|
||||
Document.filename_to_path(""))
|
||||
else:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.error("File of document " + str(self.id) + " has " +
|
||||
"gone and could not be recovered")
|
||||
|
||||
return self.filename
|
||||
|
||||
@staticmethod
|
||||
def many_to_dictionary(field):
|
||||
# Converts ManyToManyField to dictionary by assuming, that field
|
||||
# entries contain an _ or - which will be used as a delimiter
|
||||
mydictionary = dict()
|
||||
|
||||
for index, t in enumerate(field.all()):
|
||||
# Populate tag names by index
|
||||
mydictionary[index] = slugify(t.name)
|
||||
|
||||
# Find delimiter
|
||||
delimiter = t.name.find('_')
|
||||
|
||||
if delimiter == -1:
|
||||
delimiter = t.name.find('-')
|
||||
|
||||
if delimiter == -1:
|
||||
continue
|
||||
|
||||
key = t.name[:delimiter]
|
||||
value = t.name[delimiter+1:]
|
||||
|
||||
mydictionary[slugify(key)] = slugify(value)
|
||||
|
||||
return mydictionary
|
||||
|
||||
def generate_source_filename(self):
|
||||
# Create filename based on configured format
|
||||
if settings.PAPERLESS_FILENAME_FORMAT is not None:
|
||||
tags = defaultdict(lambda: slugify(None),
|
||||
self.many_to_dictionary(self.tags))
|
||||
path = settings.PAPERLESS_FILENAME_FORMAT.format(
|
||||
correspondent=slugify(self.correspondent),
|
||||
title=slugify(self.title),
|
||||
created=slugify(self.created),
|
||||
added=slugify(self.added),
|
||||
tags=tags)
|
||||
else:
|
||||
path = ""
|
||||
|
||||
# Always append the primary key to guarantee uniqueness of filename
|
||||
if len(path) > 0:
|
||||
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
|
||||
else:
|
||||
filename = "%07i.%s" % (self.pk, self.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
filename += ".gpg"
|
||||
|
||||
return filename
|
||||
|
||||
def create_source_directory(self):
|
||||
new_filename = self.generate_source_filename()
|
||||
|
||||
# Determine the full "target" path
|
||||
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
|
||||
|
||||
# Create new path
|
||||
os.makedirs(dir_new, exist_ok=True)
|
||||
|
||||
@property
|
||||
def source_path(self):
|
||||
return Document.filename_to_path(self.source_filename)
|
||||
if self.filename:
|
||||
fname = str(self.filename)
|
||||
else:
|
||||
fname = "{:07}.{}".format(self.pk, self.file_type)
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
fname += ".gpg"
|
||||
|
||||
@staticmethod
|
||||
def filename_to_path(filename):
|
||||
return os.path.join(
|
||||
settings.ORIGINALS_DIR,
|
||||
filename
|
||||
fname
|
||||
)
|
||||
|
||||
@property
|
||||
@ -362,125 +255,6 @@ class Document(models.Model):
|
||||
def thumbnail_file(self):
|
||||
return open(self.thumbnail_path, "rb")
|
||||
|
||||
def set_filename(self, filename):
|
||||
if os.path.isfile(Document.filename_to_path(filename)):
|
||||
self.filename = filename
|
||||
|
||||
@staticmethod
|
||||
def try_delete_empty_directories(directory):
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
directory = os.path.normpath(directory)
|
||||
root = os.path.normpath(Document.filename_to_path(""))
|
||||
|
||||
while directory != root:
|
||||
# Try to delete the current directory
|
||||
try:
|
||||
os.rmdir(directory)
|
||||
except os.error:
|
||||
# Directory not empty, no need to go further up
|
||||
return
|
||||
|
||||
# Cut off actual directory and go one level up
|
||||
directory, _ = os.path.split(directory)
|
||||
directory = os.path.normpath(directory)
|
||||
|
||||
@staticmethod
|
||||
def delete_all_empty_subdirectories(directory):
|
||||
# Go through all folders and try to delete all directories
|
||||
root = os.path.normpath(Document.filename_to_path(directory))
|
||||
|
||||
for filename in os.listdir(root):
|
||||
fullname = os.path.join(directory, filename)
|
||||
|
||||
if not os.path.isdir(Document.filename_to_path(fullname)):
|
||||
continue
|
||||
|
||||
# Go into subdirectory to see, if there is more to delete
|
||||
Document.delete_all_empty_subdirectories(
|
||||
os.path.join(directory, filename))
|
||||
|
||||
# Try to delete the directory
|
||||
try:
|
||||
os.rmdir(Document.filename_to_path(fullname))
|
||||
continue
|
||||
except os.error:
|
||||
# Directory not empty, no need to go further up
|
||||
continue
|
||||
|
||||
|
||||
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
|
||||
@receiver(models.signals.post_save, sender=Document)
|
||||
def update_filename(sender, instance, **kwargs):
|
||||
# Skip if document has not been saved yet
|
||||
if instance.filename is None:
|
||||
return
|
||||
|
||||
# Check is file exists and update filename otherwise
|
||||
if not os.path.isfile(Document.filename_to_path(instance.filename)):
|
||||
instance.filename = instance.source_filename
|
||||
|
||||
# Build the new filename
|
||||
new_filename = instance.generate_source_filename()
|
||||
|
||||
# If the filename is the same, then nothing needs to be done
|
||||
if instance.filename == new_filename:
|
||||
return
|
||||
|
||||
# Determine the full "target" path
|
||||
path_new = instance.filename_to_path(new_filename)
|
||||
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
|
||||
|
||||
# Create new path
|
||||
instance.create_source_directory()
|
||||
|
||||
# Determine the full "current" path
|
||||
path_current = instance.filename_to_path(instance.source_filename)
|
||||
|
||||
# Move file
|
||||
try:
|
||||
os.rename(path_current, path_new)
|
||||
except PermissionError:
|
||||
# Do not update filename in object
|
||||
return
|
||||
except FileNotFoundError:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.error("Renaming of document " + str(instance.id) + " failed " +
|
||||
"as file " + instance.filename + " was no longer present")
|
||||
return
|
||||
|
||||
# Delete empty directory
|
||||
old_dir = os.path.dirname(instance.filename)
|
||||
old_path = instance.filename_to_path(old_dir)
|
||||
Document.try_delete_empty_directories(old_path)
|
||||
|
||||
instance.filename = new_filename
|
||||
|
||||
# Save instance
|
||||
# This will not cause a cascade of post_save signals, as next time
|
||||
# nothing needs to be renamed
|
||||
instance.save()
|
||||
|
||||
|
||||
@receiver(models.signals.post_delete, sender=Document)
|
||||
def delete_files(sender, instance, **kwargs):
|
||||
if instance.filename is None:
|
||||
return
|
||||
|
||||
# Remove the document
|
||||
old_file = instance.filename_to_path(instance.filename)
|
||||
|
||||
try:
|
||||
os.remove(old_file)
|
||||
except FileNotFoundError:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Deleted document " + str(instance.id) + " but file " +
|
||||
old_file + " was no longer present")
|
||||
|
||||
# And remove the directory (if applicable)
|
||||
old_dir = os.path.dirname(instance.filename)
|
||||
old_path = instance.filename_to_path(old_dir)
|
||||
Document.try_delete_empty_directories(old_path)
|
||||
|
||||
|
||||
class Log(models.Model):
|
||||
|
||||
@ -518,7 +292,7 @@ class FileInfo:
|
||||
non_separated_word=r"([\w,. ]|([^\s]-))"
|
||||
)
|
||||
)
|
||||
|
||||
# TODO: what is this used for
|
||||
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
|
||||
REGEXES = OrderedDict([
|
||||
("created-correspondent-title-tags", re.compile(
|
||||
|
@ -20,13 +20,16 @@ from django.utils import timezone
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
# TODO: isnt there a date parsing library for this?
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
|
||||
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
|
||||
)
|
||||
|
||||
@ -39,17 +42,16 @@ def get_parser_class(doc):
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
#TODO: add a check that checks parser availability.
|
||||
|
||||
options = []
|
||||
for parser in parsers:
|
||||
result = parser(doc)
|
||||
if result:
|
||||
options.append(result)
|
||||
|
||||
# Sein letzter Befehl war: KOMMT! Und sie kamen. Alle. Sogar die Parser.
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
parser_test = parser_declaration["test"]
|
||||
|
||||
if parser_test(doc):
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
@ -59,7 +61,7 @@ def get_parser_class(doc):
|
||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
|
||||
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
|
||||
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
|
||||
environment = os.environ.copy()
|
||||
if settings.CONVERT_MEMORY_LIMIT:
|
||||
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
||||
@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False
|
||||
args += ['-trim'] if trim else []
|
||||
args += ['-type', str(type)] if type else []
|
||||
args += ['-depth', str(depth)] if depth else []
|
||||
args += [input, output]
|
||||
args += [input_file, output_file]
|
||||
|
||||
logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
|
||||
|
||||
@ -100,17 +102,17 @@ class ParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class DocumentParser:
|
||||
class DocumentParser(LoggingMixin):
|
||||
"""
|
||||
Subclass this to make your own parser. Have a look at
|
||||
`paperless_tesseract.parsers` for inspiration.
|
||||
"""
|
||||
|
||||
def __init__(self, path, logging_group, progress_callback):
|
||||
super().__init__()
|
||||
self.logging_group = logging_group
|
||||
self.document_path = path
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = logging_group
|
||||
self.progress_callback = progress_callback
|
||||
|
||||
def get_thumbnail(self):
|
||||
@ -121,16 +123,19 @@ class DocumentParser:
|
||||
|
||||
def optimise_thumbnail(self, in_path):
|
||||
|
||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||
if settings.OPTIMIZE_THUMBNAILS:
|
||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||
|
||||
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
|
||||
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
|
||||
|
||||
self.log('debug', 'Execute: ' + " ".join(args))
|
||||
self.log('debug', 'Execute: ' + " ".join(args))
|
||||
|
||||
if not subprocess.Popen(args).wait() == 0:
|
||||
raise ParseError("Optipng failed at {}".format(args))
|
||||
if not subprocess.Popen(args).wait() == 0:
|
||||
raise ParseError("Optipng failed at {}".format(args))
|
||||
|
||||
return out_path
|
||||
return out_path
|
||||
else:
|
||||
return in_path
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
return self.optimise_thumbnail(self.get_thumbnail())
|
||||
@ -222,11 +227,6 @@ class DocumentParser:
|
||||
|
||||
return date
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
|
||||
def cleanup(self):
|
||||
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
||||
shutil.rmtree(self.tempdir)
|
||||
|
@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer):
|
||||
|
||||
class LogSerializer(serializers.ModelSerializer):
|
||||
|
||||
|
||||
class Meta:
|
||||
model = Log
|
||||
fields = (
|
||||
|
@ -1,5 +1,5 @@
|
||||
from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal(providing_args=["filename"])
|
||||
document_consumption_finished = Signal(providing_args=["document"])
|
||||
document_consumer_declaration = Signal(providing_args=[])
|
||||
document_consumption_started = Signal()
|
||||
document_consumption_finished = Signal()
|
||||
document_consumer_declaration = Signal()
|
||||
|
@ -6,9 +6,13 @@ from django.conf import settings
|
||||
from django.contrib.admin.models import ADDITION, LogEntry
|
||||
from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.db import models, DatabaseError
|
||||
from django.dispatch import receiver
|
||||
from django.utils import timezone
|
||||
|
||||
from .. import index, matching
|
||||
from ..file_handling import delete_empty_directories, generate_filename, \
|
||||
create_source_path_directory
|
||||
from ..models import Document, Tag
|
||||
|
||||
|
||||
@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs):
|
||||
)).wait()
|
||||
|
||||
|
||||
@receiver(models.signals.post_delete, sender=Document)
|
||||
def cleanup_document_deletion(sender, instance, using, **kwargs):
|
||||
|
||||
if not isinstance(instance, Document):
|
||||
return
|
||||
|
||||
for f in (instance.source_path, instance.thumbnail_path):
|
||||
try:
|
||||
os.unlink(f)
|
||||
except FileNotFoundError:
|
||||
pass # The file's already gone, so we're cool with it.
|
||||
|
||||
delete_empty_directories(os.path.dirname(instance.source_path))
|
||||
|
||||
|
||||
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
|
||||
@receiver(models.signals.post_save, sender=Document)
|
||||
def update_filename_and_move_files(sender, instance, **kwargs):
|
||||
|
||||
if not instance.filename:
|
||||
# Can't update the filename if there is not filename to begin with
|
||||
# This happens after the consumer creates a new document.
|
||||
# The PK needs to be set first by saving the document once. When this
|
||||
# happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
|
||||
# renamed anyway. In all other cases, instance.filename will be set.
|
||||
return
|
||||
|
||||
old_filename = instance.filename
|
||||
old_path = instance.source_path
|
||||
new_filename = generate_filename(instance)
|
||||
|
||||
if new_filename == instance.filename:
|
||||
# Don't do anything if its the same.
|
||||
return
|
||||
|
||||
new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
|
||||
|
||||
if not os.path.isfile(old_path):
|
||||
# Can't do anything if the old file does not exist anymore.
|
||||
logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
|
||||
return
|
||||
|
||||
if os.path.isfile(new_path):
|
||||
# Can't do anything if the new file already exists. Skip updating file.
|
||||
logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
|
||||
return
|
||||
|
||||
create_source_path_directory(new_path)
|
||||
|
||||
try:
|
||||
os.rename(old_path, new_path)
|
||||
instance.filename = new_filename
|
||||
instance.save()
|
||||
|
||||
except OSError as e:
|
||||
instance.filename = old_filename
|
||||
except DatabaseError as e:
|
||||
os.rename(new_path, old_path)
|
||||
instance.filename = old_filename
|
||||
|
||||
if not os.path.isfile(old_path):
|
||||
delete_empty_directories(os.path.dirname(old_path))
|
||||
|
||||
|
||||
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
|
||||
|