Merge pull request #1240 from paperless-ngx/beta

[Beta] Paperless-ngx v1.8.0 Release Candidate 1
This commit is contained in:
shamoon
2022-07-28 15:17:30 -07:00
committed by GitHub
277 changed files with 56739 additions and 28967 deletions

View File

@@ -1,17 +0,0 @@
FROM python:3.5.1
# Install Sphinx and Pygments
RUN pip install --no-cache-dir Sphinx Pygments \
# Setup directories, copy data
&& mkdir /build
COPY . /build
WORKDIR /build/docs
# Build documentation
RUN make html
# Start webserver
WORKDIR /build/docs/_build/html
EXPOSE 8000/tcp
CMD ["python3", "-m", "http.server"]

View File

@@ -24,6 +24,7 @@ I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " livehtml to preview changes with live reload in your browser"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@@ -54,6 +55,9 @@ html:
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
livehtml:
sphinx-autobuild "./" "$(BUILDDIR)" $(O)
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo

View File

@@ -1,47 +1,47 @@
let toggleButton;
let icon;
let toggleButton
let icon
function load() {
"use strict";
'use strict'
toggleButton = document.createElement("button");
toggleButton.setAttribute("title", "Toggle dark mode");
toggleButton.classList.add("dark-mode-toggle");
icon = document.createElement("i");
icon.classList.add("fa", darkModeState ? "fa-sun-o" : "fa-moon-o");
toggleButton.appendChild(icon);
document.body.prepend(toggleButton);
toggleButton = document.createElement('button')
toggleButton.setAttribute('title', 'Toggle dark mode')
toggleButton.classList.add('dark-mode-toggle')
icon = document.createElement('i')
icon.classList.add('fa', darkModeState ? 'fa-sun-o' : 'fa-moon-o')
toggleButton.appendChild(icon)
document.body.prepend(toggleButton)
// Listen for changes in the OS settings
// addListener is used because older versions of Safari don't support addEventListener
// prefersDarkQuery set in <head>
if (prefersDarkQuery) {
prefersDarkQuery.addListener(function (evt) {
toggleDarkMode(evt.matches);
});
toggleDarkMode(evt.matches)
})
}
// Initial setting depending on the prefers-color-mode or localstorage
// darkModeState should be set in the document <head> to prevent flash
if (darkModeState == undefined) darkModeState = false;
toggleDarkMode(darkModeState);
if (darkModeState == undefined) darkModeState = false
toggleDarkMode(darkModeState)
// Toggles the "dark-mode" class on click and sets localStorage state
toggleButton.addEventListener("click", () => {
darkModeState = !darkModeState;
toggleButton.addEventListener('click', () => {
darkModeState = !darkModeState
toggleDarkMode(darkModeState);
localStorage.setItem("dark-mode", darkModeState);
});
toggleDarkMode(darkModeState)
localStorage.setItem('dark-mode', darkModeState)
})
}
function toggleDarkMode(state) {
document.documentElement.classList.toggle("dark-mode", state);
document.documentElement.classList.toggle("light-mode", !state);
icon.classList.remove("fa-sun-o");
icon.classList.remove("fa-moon-o");
icon.classList.add(state ? "fa-sun-o" : "fa-moon-o");
darkModeState = state;
document.documentElement.classList.toggle('dark-mode', state)
document.documentElement.classList.toggle('light-mode', !state)
icon.classList.remove('fa-sun-o')
icon.classList.remove('fa-moon-o')
icon.classList.add(state ? 'fa-sun-o' : 'fa-moon-o')
darkModeState = state
}
document.addEventListener("DOMContentLoaded", load);
document.addEventListener('DOMContentLoaded', load)

View File

@@ -287,6 +287,10 @@ When you use the provided docker compose script, put the export inside the
``export`` folder in your paperless source directory. Specify ``../export``
as the ``source``.
.. note::
Importing from a previous version of Paperless may work, but for best results
it is suggested to match the versions.
.. _utilities-retagger:

View File

@@ -7,12 +7,12 @@ easier.
.. _advanced-matching:
Matching tags, correspondents and document types
################################################
Matching tags, correspondents, document types, and storage paths
################################################################
Paperless will compare the matching algorithms defined by every tag and
correspondent already set in your database to see if they apply to the text in
a document. In other words, if you defined a tag called ``Home Utility``
Paperless will compare the matching algorithms defined by every tag, correspondent,
document type, and storage path in your database to see if they apply to the text
in a document. In other words, if you define a tag called ``Home Utility``
that had a ``match`` property of ``bc hydro`` and a ``matching_algorithm`` of
``literal``, Paperless will automatically tag your newly-consumed document with
your ``Home Utility`` tag so long as the text ``bc hydro`` appears in the body
@@ -22,10 +22,10 @@ The matching logic is quite powerful. It supports searching the text of your
document with different algorithms, and as such, some experimentation may be
necessary to get things right.
In order to have a tag, correspondent, or type assigned automatically to newly
consumed documents, assign a match and matching algorithm using the web
interface. These settings define when to assign correspondents, tags, and types
to documents.
In order to have a tag, correspondent, document type, or storage path assigned
automatically to newly consumed documents, assign a match and matching algorithm
using the web interface. These settings define when to assign tags, correspondents,
document types, and storage paths to documents.
The following algorithms are available:
@@ -37,7 +37,7 @@ The following algorithms are available:
* **Literal:** Matches only if the match appears exactly as provided (i.e. preserve ordering) in the PDF.
* **Regular expression:** Parses the match as a regular expression and tries to
find a match within the document.
* **Fuzzy match:** I dont know. Look at the source.
* **Fuzzy match:** I don't know. Look at the source.
* **Auto:** Tries to automatically match new documents. This does not require you
to set a match. See the notes below.
@@ -47,9 +47,9 @@ defining a match text of ``"Bank of America" BofA`` using the *any* algorithm,
will match documents that contain either "Bank of America" or "BofA", but will
not match documents containing "Bank of South America".
Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.
Then just save your tag, correspondent, document type, or storage path and run
another document through the consumer. Once complete, you should see the
newly-created document, automatically tagged with the appropriate data.
.. _advanced-automatic_matching:
@@ -58,9 +58,9 @@ Automatic matching
==================
Paperless-ngx comes with a new matching algorithm called *Auto*. This matching
algorithm tries to assign tags, correspondents, and document types to your
documents based on how you have already assigned these on existing documents. It
uses a neural network under the hood.
algorithm tries to assign tags, correspondents, document types, and storage paths
to your documents based on how you have already assigned these on existing documents.
It uses a neural network under the hood.
If, for example, all your bank statements of your account 123 at the Bank of
America are tagged with the tag "bofa_123" and the matching algorithm of this
@@ -80,20 +80,21 @@ feature:
that the neural network only learns from documents which you have correctly
tagged before.
* The matching algorithm can only work if there is a correlation between the
tag, correspondent, or document type and the document itself. Your bank
statements usually contain your bank account number and the name of the bank,
so this works reasonably well, However, tags such as "TODO" cannot be
automatically assigned.
tag, correspondent, document type, or storage path and the document itself.
Your bank statements usually contain your bank account number and the name
of the bank, so this works reasonably well, However, tags such as "TODO"
cannot be automatically assigned.
* The matching algorithm needs a reasonable number of documents to identify when
to assign tags, correspondents, and types. If one out of a thousand documents
has the correspondent "Very obscure web shop I bought something five years
ago", it will probably not assign this correspondent automatically if you buy
something from them again. The more documents, the better.
to assign tags, correspondents, storage paths, and types. If one out of a
thousand documents has the correspondent "Very obscure web shop I bought
something five years ago", it will probably not assign this correspondent
automatically if you buy something from them again. The more documents, the better.
* Paperless also needs a reasonable amount of negative examples to decide when
not to assign a certain tag, correspondent or type. This will usually be the
case as you start filling up paperless with documents. Example: If all your
documents are either from "Webshop" and "Bank", paperless will assign one of
these correspondents to ANY new document, if both are set to automatic matching.
not to assign a certain tag, correspondent, document type, or storage path. This will
usually be the case as you start filling up paperless with documents.
Example: If all your documents are either from "Webshop" and "Bank", paperless
will assign one of these correspondents to ANY new document, if both are set
to automatic matching.
Hooking into the consumption process
####################################
@@ -120,10 +121,10 @@ Pre-consumption script
======================
Executed after the consumer sees a new document in the consumption folder, but
before any processing of the document is performed. This script receives exactly
one argument:
before any processing of the document is performed. This script can access the
following relevant environment variables set:
* Document file name
* ``DOCUMENT_SOURCE_PATH``
A simple but common example for this would be creating a simple script like
this:
@@ -133,7 +134,7 @@ this:
.. code:: bash
#!/usr/bin/env bash
pdf2pdfocr.py -i ${1}
pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH}
``/etc/paperless.conf``
@@ -156,16 +157,20 @@ Post-consumption script
=======================
Executed after the consumer has successfully processed a document and has moved it
into paperless. It receives the following arguments:
into paperless. It receives the following environment variables:
* Document id
* Generated file name
* Source path
* Thumbnail path
* Download URL
* Thumbnail URL
* Correspondent
* Tags
* ``DOCUMENT_ID``
* ``DOCUMENT_FILE_NAME``
* ``DOCUMENT_CREATED``
* ``DOCUMENT_MODIFIED``
* ``DOCUMENT_ADDED``
* ``DOCUMENT_SOURCE_PATH``
* ``DOCUMENT_ARCHIVE_PATH``
* ``DOCUMENT_THUMBNAIL_PATH``
* ``DOCUMENT_DOWNLOAD_URL``
* ``DOCUMENT_THUMBNAIL_URL``
* ``DOCUMENT_CORRESPONDENT``
* ``DOCUMENT_TAGS``
The script can be in any language, but for a simple shell script
example, you can take a look at `post-consumption-example.sh`_ in this project.
@@ -268,6 +273,17 @@ If paperless detects that two documents share the same filename, paperless will
append ``_01``, ``_02``, etc to the filename. This happens if all the placeholders in a filename
evaluate to the same value.
.. hint::
You can affect how empty placeholders are treated by changing the following setting to
`true`.
.. code::
PAPERLESS_FILENAME_FORMAT_REMOVE_NONE=True
Doing this results in all empty placeholders resolving to "" instead of "none" as stated above.
Spaces before empty placeholders are removed as well, empty directories are omitted.
.. hint::
Paperless checks the filename of a document whenever it is saved. Therefore,
@@ -290,3 +306,59 @@ evaluate to the same value.
However, keep in mind that inside docker, if files get stored outside of the
predefined volumes, they will be lost after a restart of paperless.
Storage paths
#############
One of the best things in Paperless is that you can not only access the documents via the
web interface, but also via the file system.
When as single storage layout is not sufficient for your use case, storage paths come to
the rescue. Storage paths allow you to configure more precisely where each document is stored
in the file system.
- Each storage path is a `PAPERLESS_FILENAME_FORMAT` and follows the rules described above
- Each document is assigned a storage path using the matching algorithms described above, but
can be overwritten at any time
For example, you could define the following two storage paths:
1. Normal communications are put into a folder structure sorted by `year/correspondent`
2. Communications with insurance companies are stored in a flat structure with longer file names,
but containing the full date of the correspondence.
.. code::
By Year = {created_year}/{correspondent}/{title}
Insurances = Insurances/{correspondent}/{created_year}-{created_month}-{created_day} {title}
If you then map these storage paths to the documents, you might get the following result.
For simplicity, `By Year` defines the same structure as in the previous example above.
.. code:: text
2019/ # By Year
My bank/
Statement January.pdf
Statement February.pdf
Insurances/ # Insurances
Healthcare 123/
2022-01-01 Statement January.pdf
2022-02-02 Letter.pdf
2022-02-03 Letter.pdf
Dental 456/
2021-12-01 New Conditions.pdf
.. hint::
Defining a storage path is optional. If no storage path is defined for a document, the global
`PAPERLESS_FILENAME_FORMAT` is applied.
.. caution::
If you adjust the format of an existing storage path, old documents don't get relocated automatically.
You need to run the :ref:`document renamer <utilities-renamer>` to adjust their pathes.

View File

@@ -31,7 +31,8 @@ The objects served by the document endpoint contain the following fields:
* ``tags``: List of IDs of tags assigned to this document, or empty list.
* ``document_type``: Document type of this document, or null.
* ``correspondent``: Correspondent of this document or null.
* ``created``: The date at which this document was created.
* ``created``: The date time at which this document was created.
* ``created_date``: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
* ``modified``: The date at which this document was last edited in paperless. Read-only.
* ``added``: The date at which this document was added to paperless. Read-only.
* ``archive_serial_number``: The identifier of this document in a physical document archive.
@@ -240,11 +241,13 @@ be instructed to consume the document from there.
The endpoint supports the following optional form fields:
* ``title``: Specify a title that the consumer should use for the document.
* ``created``: Specify a DateTime where the document was created (e.g. "2016-04-19" or "2016-04-19 06:15:00+02:00").
* ``correspondent``: Specify the ID of a correspondent that the consumer should use for the document.
* ``document_type``: Similar to correspondent.
* ``tags``: Similar to correspondent. Specify this multiple times to have multiple tags added
to the document.
The endpoint will immediately return "OK" if the document consumption process
was started successfully. No additional status information about the consumption
process itself is available, since that happens in a different process.

View File

@@ -31,7 +31,7 @@ PAPERLESS_REDIS=<url>
PAPERLESS_DBHOST=<hostname>
By default, sqlite is used as the database backend. This can be changed here.
Set PAPERLESS_DBHOST and PostgreSQL will be used instead of mysql.
Set PAPERLESS_DBHOST and PostgreSQL will be used instead of sqlite.
PAPERLESS_DBPORT=<port>
Adjust port if necessary.
@@ -60,6 +60,13 @@ PAPERLESS_DBSSLMODE=<mode>
Default is ``prefer``.
PAPERLESS_DB_TIMEOUT=<float>
Amount of time for a database connection to wait for the database to unlock.
Mostly applicable for an sqlite based installation, consider changing to postgresql
if you need to increase this.
Defaults to unset, keeping the Django defaults.
Paths and folders
#################
@@ -111,6 +118,14 @@ PAPERLESS_FILENAME_FORMAT=<format>
Default is none, which disables this feature.
PAPERLESS_FILENAME_FORMAT_REMOVE_NONE=<bool>
Tells paperless to replace placeholders in `PAPERLESS_FILENAME_FORMAT` that would resolve
to 'none' to be omitted from the resulting filename. This also holds true for directory
names.
See :ref:`advanced-file_name_handling` for details.
Defaults to `false` which disables this feature.
PAPERLESS_LOGGING_DIR=<path>
This is where paperless will store log files.
@@ -416,14 +431,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
the produced PDF documents are A4 sized.
PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
Paperless will not OCR images that have more pixels than this limit.
This is intended to prevent decompression bombs from overloading paperless.
Increasing this limit is desired if you face a DecompressionBombError despite
the concerning file not being malicious; this could e.g. be caused by invalidly
recognized metadata.
If you have enough resources or if you are certain that your uploaded files
are not malicious you can increase this value to your needs.
The default value is 256000000, an image with more pixels than that would not be parsed.
Paperless will raise a warning when OCRing images which are over this limit and
will not OCR images which are more than twice this limit. Note this does not
prevent the document from being consumed, but could result in missing text content.
If unset, will default to the value determined by
`Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
.. note::
Increasing this limit could cause Paperless to consume additional resources
when consuming a file. Be sure you have sufficient system resources.
.. caution::
The limit is intended to prevent malicious files from consuming system resources
and causing crashes and other errors. Only increase this value if you are certain
your documents are not malicious and you need the text which was not OCRed
PAPERLESS_OCR_USER_ARGS=<json>
OCRmyPDF offers many more options. Use this parameter to specify any
@@ -519,6 +543,8 @@ PAPERLESS_TASK_WORKERS=<num>
maintain the automatic matching algorithm, check emails, consume documents,
etc. This variable specifies how many things it will do in parallel.
Defaults to 1
PAPERLESS_THREADS_PER_WORKER=<num>
Furthermore, paperless uses multiple threads when consuming documents to
@@ -590,6 +616,28 @@ PAPERLESS_CONSUMER_POLLING=<num>
Defaults to 0, which disables polling and uses filesystem notifications.
PAPERLESS_CONSUMER_POLLING_RETRY_COUNT=<num>
If consumer polling is enabled, sets the number of times paperless will check for a
file to remain unmodified.
Defaults to 5.
PAPERLESS_CONSUMER_POLLING_DELAY=<num>
If consumer polling is enabled, sets the delay in seconds between each check (above) paperless
will do while waiting for a file to remain unmodified.
Defaults to 5.
.. _configuration-inotify:
PAPERLESS_CONSUMER_INOTIFY_DELAY=<num>
Sets the time in seconds the consumer will wait for additional events
from inotify before the consumer will consider a file ready and begin consumption.
Certain scanners or network setups may generate multiple events for a single file,
leading to multiple consumers working on the same file. Configure this to
prevent that.
Defaults to 0.5 seconds.
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
When the consumer detects a duplicate document, it will not touch the
@@ -650,7 +698,6 @@ PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defaults to "PATCHT"
PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
On smaller systems, or even in the case of Very Large Documents, the consumer
may explode, complaining about how it's "unable to extend pixel cache". In
@@ -674,13 +721,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>
Default is none, which disables the temporary directory.
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
Use optipng to optimize thumbnails. This usually reduces the size of
thumbnails by about 20%, but uses considerable compute time during
consumption.
Defaults to true.
PAPERLESS_POST_CONSUME_SCRIPT=<filename>
After a document is consumed, Paperless can trigger an arbitrary script if
you like. This script will be passed a number of arguments for you to work
@@ -696,6 +736,9 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
The filename will be checked first, and if nothing is found, the document
text will be checked as normal.
A date in a filename must have some separators (`.`, `-`, `/`, etc)
for it to be parsed.
Defaults to none, which disables this feature.
PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
@@ -713,10 +756,7 @@ PAPERLESS_IGNORE_DATES=<string>
this process. This is useful for special dates (like date of birth) that appear
in documents regularly but are very unlikely to be the documents creation date.
You may specify dates in a multitude of formats supported by dateparser (see
https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates
need to be comma separated, the options are limited.
Example: "2020-12-02,22.04.1999"
The date is parsed using the order specified in PAPERLESS_DATE_ORDER
Defaults to an empty string to not ignore any dates.
@@ -751,9 +791,6 @@ PAPERLESS_CONVERT_BINARY=<path>
PAPERLESS_GS_BINARY=<path>
Defaults to "/usr/bin/gs".
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".
.. _configuration-docker:
@@ -769,9 +806,7 @@ PAPERLESS_WEBSERVER_WORKERS=<num>
also loads the entire application into memory separately, so increasing this value
will increase RAM usage.
Consider configuring this to 1 on low power devices with limited amount of RAM.
Defaults to 2.
Defaults to 1.
PAPERLESS_PORT=<port>
The port number the webserver will listen on inside the container. There are

View File

@@ -88,7 +88,7 @@ Physical scanners
.. [1] Scanners with API Integration allow to push scanned documents directly to :ref:`Paperless API <api-file_uploads>`, sometimes referred to as Webhook or Document POST.
.. [2] Canon Multi Function Printers show strange behavior over SMB. They close and reopen the file after every page. It's recommended to tune the
:ref:`polling <configuration-polling>` configuration values for your scanner. The scanner timeout is 3 minutes, so ``180`` is a good starting point.
:ref:`polling <configuration-polling>` and :ref:`inotify <configuration-inotify>` configuration values for your scanner. The scanner timeout is 3 minutes, so ``180`` is a good starting point.
Mobile phone software
=====================

View File

@@ -184,6 +184,25 @@ Install Paperless from Docker Hub
port 8000. Modifying the part before the colon will map requests on another
port to the webserver running on the default port.
**Rootless**
If you want to run Paperless as a rootless container, you will need to do the
following in your ``docker-compose.yml``:
- set the ``user`` running the container to map to the ``paperless`` user in the
container.
This value (``user_id`` below), should be the same id that ``USERMAP_UID`` and
``USERMAP_GID`` are set to in the next step.
See ``USERMAP_UID`` and ``USERMAP_GID`` :ref:`here <configuration-docker>`.
Your entry for Paperless should contain something like:
.. code::
webserver:
image: ghcr.io/paperless-ngx/paperless-ngx:latest
user: <user_id>
5. Modify ``docker-compose.env``, following the comments in the file. The
most important change is to set ``USERMAP_UID`` and ``USERMAP_GID``
to the uid and gid of your user on the host system. Use ``id -u`` and
@@ -200,6 +219,19 @@ Install Paperless from Docker Hub
You can copy any setting from the file ``paperless.conf.example`` and paste it here.
Have a look at :ref:`configuration` to see what's available.
.. note::
You can utilize Docker secrets for some configuration settings by
appending `_FILE` to some configuration values. This is supported currently
only by:
* PAPERLESS_DBUSER
* PAPERLESS_DBPASS
* PAPERLESS_SECRET_KEY
* PAPERLESS_AUTO_LOGIN_USERNAME
* PAPERLESS_ADMIN_USER
* PAPERLESS_ADMIN_MAIL
* PAPERLESS_ADMIN_PASSWORD
.. caution::
Some file systems such as NFS network shares don't support file system
@@ -286,7 +318,6 @@ writing. Windows is not and will never be supported.
* ``fonts-liberation`` for generating thumbnails for plain text files
* ``imagemagick`` >= 6 for PDF conversion
* ``optipng`` for optimizing thumbnails
* ``gnupg`` for handling encrypted documents
* ``libpq-dev`` for PostgreSQL
* ``libmagic-dev`` for mime type detection
@@ -298,7 +329,7 @@ writing. Windows is not and will never be supported.
.. code::
python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
These dependencies are required for OCRmyPDF, which is used for text recognition.
@@ -308,7 +339,7 @@ writing. Windows is not and will never be supported.
* ``qpdf``
* ``liblept5``
* ``libxml2``
* ``pngquant``
* ``pngquant`` (suggested for certain PDF image optimizations)
* ``zlib1g``
* ``tesseract-ocr`` >= 4.0.0 for OCR
* ``tesseract-ocr`` language packs (``tesseract-ocr-eng``, ``tesseract-ocr-deu``, etc)
@@ -332,6 +363,12 @@ writing. Windows is not and will never be supported.
3. Optional. Install ``postgresql`` and configure a database, user and password for paperless. If you do not wish
to use PostgreSQL, SQLite is available as well.
.. note::
On bare-metal installations using SQLite, ensure the
`JSON1 extension <https://code.djangoproject.com/wiki/JSON1Extension>`_ is enabled. This is
usually the case, but not always.
4. Get the release archive from `<https://github.com/paperless-ngx/paperless-ngx/releases>`_.
If you clone the git repo as it is, you also have to compile the front end by yourself.
Extract the archive to a place from where you wish to execute it, such as ``/opt/paperless``.
@@ -724,8 +761,6 @@ configuring some options in paperless can help improve performance immensely:
* If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``.
This will speed up OCR times and use less memory at the expense of slightly worse
OCR results.
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory.

View File

@@ -235,3 +235,85 @@ You might find messages like these in your log files:
This indicates that paperless failed to read PDF metadata from one of your documents. This happens when you
open the affected documents in paperless for editing. Paperless will continue to work, and will simply not
show the invalid metadata.
Consumer fails with a FileNotFoundError
#######################################
You might find messages like these in your log files:
.. code::
[ERROR] [paperless.consumer] Error while consuming document SCN_0001.pdf: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
Traceback (most recent call last):
File "/app/paperless/src/paperless_tesseract/parsers.py", line 261, in parse
ocrmypdf.ocr(**args)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/api.py", line 337, in ocr
return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 385, in run_pipeline
exec_concurrent(context, executor)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 302, in exec_concurrent
pdf = post_process(pdf, context, executor)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 235, in post_process
pdf_out = metadata_fixup(pdf_out, context)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_pipeline.py", line 798, in metadata_fixup
with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
File "/usr/local/lib/python3.8/dist-packages/pikepdf/_methods.py", line 923, in open
pdf = Pdf._open(
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
This probably indicates paperless tried to consume the same file twice. This can happen for a number of reasons,
depending on how documents are placed into the consume folder. If paperless is using inotify (the default) to
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
try adjusting the :ref:`polling configuration <configuration-polling>`.
Consumer fails waiting for file to remain unmodified.
#####################################################
You might find messages like these in your log files:
.. code::
[ERROR] [paperless.management.consumer] Timeout while waiting on file /usr/src/paperless/src/../consume/SCN_0001.pdf to remain unmodified.
This indicates paperless timed out while waiting for the file to be completely written to the consume folder.
Adjusting :ref:`polling configuration <configuration-polling>` values should resolve the issue.
.. note::
The user will need to manually move the file out of the consume folder and
back in, for the initial failing file to be consumed.
Consumer fails reporting "OS reports file as busy still".
#########################################################
You might find messages like these in your log files:
.. code::
[WARNING] [paperless.management.consumer] Not consuming file /usr/src/paperless/src/../consume/SCN_0001.pdf: OS reports file as busy still
This indicates paperless was unable to open the file, as the OS reported the file as still being in use. To prevent a
crash, paperless did not try to consume the file. If paperless is using inotify (the default) to
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
try adjusting the :ref:`polling configuration <configuration-polling>`.
.. note::
The user will need to manually move the file out of the consume folder and
back in, for the initial failing file to be consumed.
Log reports "Creating PaperlessTask failed".
#########################################################
You might find messages like these in your log files:
.. code::
[ERROR] [paperless.management.consumer] Creating PaperlessTask failed: db locked
You are likely using an sqlite based installation, with an increased number of workers and are running into sqlite's concurrency limitations.
Uploading or consuming multiple files at once results in many workers attempting to access the database simultaneously.
Consider changing to the PostgreSQL database if you will be processing many documents at once often. Otherwise,
try tweaking the ``PAPERLESS_DB_TIMEOUT`` setting to allow more time for the database to unlock. This may have
minor performance implications.

View File

@@ -161,6 +161,9 @@ These are as follows:
will not consume flagged mails.
* **Move to folder:** Moves consumed mails out of the way so that paperless wont
consume them again.
* **Add custom Tag:** Adds a custom tag to mails with consumed documents (the IMAP
standard calls these "keywords"). Paperless will not consume mails already tagged.
Not all mail servers support this feature!
.. caution::