mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
commit
66a81cf6ac
@ -3,6 +3,10 @@ Changelog
|
|||||||
|
|
||||||
* 0.2.0
|
* 0.2.0
|
||||||
|
|
||||||
|
* `#98`_: Added optional environment variables for ImageMagick so that it
|
||||||
|
doesn't explode when handling Very Large Documents or when it's just
|
||||||
|
running on a low-memory system. Thanks to `Florian Harr`_ for his help on
|
||||||
|
this one.
|
||||||
* Added support for guessing the date from the file name along with the
|
* Added support for guessing the date from the file name along with the
|
||||||
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
|
||||||
request that I took forever to merge and to `Pit`_ for his efforts on the
|
request that I took forever to merge and to `Pit`_ for his efforts on the
|
||||||
@ -97,6 +101,7 @@ Changelog
|
|||||||
.. _zedster: https://github.com/zedster
|
.. _zedster: https://github.com/zedster
|
||||||
.. _Martin Honermeyer: https://github.com/djmaze
|
.. _Martin Honermeyer: https://github.com/djmaze
|
||||||
.. _Tim White: https://github.com/timwhite
|
.. _Tim White: https://github.com/timwhite
|
||||||
|
.. _Florian Harr: https://github.com/evils
|
||||||
|
|
||||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||||
@ -111,3 +116,4 @@ Changelog
|
|||||||
.. _#68: https://github.com/danielquinn/paperless/issues/68
|
.. _#68: https://github.com/danielquinn/paperless/issues/68
|
||||||
.. _#71: https://github.com/danielquinn/paperless/issues/71
|
.. _#71: https://github.com/danielquinn/paperless/issues/71
|
||||||
.. _#94: https://github.com/danielquinn/paperless/issues/94
|
.. _#94: https://github.com/danielquinn/paperless/issues/94
|
||||||
|
.. _#98: https://github.com/danielquinn/paperless/issues/98
|
||||||
|
@ -3,17 +3,47 @@
|
|||||||
Troubleshooting
|
Troubleshooting
|
||||||
===============
|
===============
|
||||||
|
|
||||||
.. _troubleshooting_ocr_language_files_missing:
|
.. _troubleshooting-languagemissing:
|
||||||
|
|
||||||
Consumer warns ``OCR for XX failed``
|
Consumer warns ``OCR for XX failed``
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
If you find the OCR accuracy to be too low, and/or the document consumer warns that ``OCR for
|
If you find the OCR accuracy to be too low, and/or the document consumer warns
|
||||||
XX failed, but we're going to stick with what we've got since FORGIVING_OCR is enabled``, then you
|
that ``OCR for XX failed, but we're going to stick with what we've got since
|
||||||
might need to install the `Tesseract language files
|
FORGIVING_OCR is enabled``, then you might need to install the
|
||||||
<http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_ marching your documents languages.
|
`Tesseract language files <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_
|
||||||
|
matching your documents' languages.
|
||||||
|
|
||||||
As an example, if you are running Paperless from the Vagrant setup provided (or from any Ubuntu or Debian
|
As an example, if you are running Paperless from the Vagrant setup provided
|
||||||
box), and your documents are written in Spanish you may need to run::
|
(or from any Ubuntu or Debian box), and your documents are written in Spanish
|
||||||
|
you may need to run::
|
||||||
|
|
||||||
apt-get install -y tesseract-ocr-spa
|
apt-get install -y tesseract-ocr-spa
|
||||||
|
|
||||||
|
|
||||||
|
.. _troubleshooting-convertpixelcache:
|
||||||
|
|
||||||
|
Consumer dies with ``convert: unable to extend pixel cache``
|
||||||
|
------------------------------------------------------------
|
||||||
|
|
||||||
|
During the consumption process, Paperless invokes ImageMagick's ``convert``
|
||||||
|
program to translate the source document into something that the OCR engine can
|
||||||
|
understand and this can burn a Very Large amount of memory if the original
|
||||||
|
document is rather long. Similarly, if your system doesn't have a lot of
|
||||||
|
memory to begin with (i.e. a Raspberry Pi), then this can happen for even
|
||||||
|
medium-sized documents.
|
||||||
|
|
||||||
|
The solution is to tell ImageMagick *not* to Use All The RAM, as is its
|
||||||
|
default, and instead tell it to use a fixed amount.  ``convert`` will then
|
||||||
|
break up the job into hundreds of individual files and use them to slowly
|
||||||
|
compile the finished image. Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in
|
||||||
|
``/etc/paperless.conf`` to something like ``32000000`` and you'll limit
|
||||||
|
``convert`` to 32MB. Fiddle with this value as you like.
|
||||||
|
|
||||||
|
**HOWEVER**: Simply setting this value may not be enough on systems where
|
||||||
|
``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its
|
||||||
|
temporary files. In these cases (most Systemd machines), you need to tell
|
||||||
|
ImageMagick to use a different space for its scratch work. You do this by
|
||||||
|
setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere
|
||||||
|
that's actually on a physical disk (and writable by the user running
|
||||||
|
Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.
|
||||||
|
@ -32,7 +32,32 @@ PAPERLESS_PASSPHRASE="secret"
|
|||||||
# have a shared secret here.
|
# have a shared secret here.
|
||||||
PAPERLESS_SHARED_SECRET=""
|
PAPERLESS_SHARED_SECRET=""
|
||||||
|
|
||||||
|
#
|
||||||
|
# The following values use sensible defaults for modern systems, but if you're
|
||||||
|
# running Paperless on a low-resource machine (like a Raspberry Pi), modifying
|
||||||
|
# some of these values may be necessary.
|
||||||
|
#
|
||||||
|
|
||||||
# By default, Paperless will attempt to use all available CPU cores to process
|
# By default, Paperless will attempt to use all available CPU cores to process
|
||||||
# a document, but if you would like to limit that, you can set this value to
|
# a document, but if you would like to limit that, you can set this value to
|
||||||
# an integer:
|
# an integer:
|
||||||
#PAPERLESS_OCR_THREADS=1
|
#PAPERLESS_OCR_THREADS=1
|
||||||
|
|
||||||
|
# On smaller systems, or even in the case of Very Large Documents, the consumer
|
||||||
|
# may explode, complaining about how it's "unable to extend pixel cache".  In
|
||||||
|
# such cases, try setting this to a reasonably low value, like 32000000. The
|
||||||
|
# default is to use whatever is necessary to do everything without writing to
|
||||||
|
# disk, and units are in bytes.
|
||||||
|
#
|
||||||
|
# For more information on how to use this value, you should probably search
|
||||||
|
# the web for "MAGICK_MEMORY_LIMIT".
|
||||||
|
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||||
|
|
||||||
|
# Similar to the memory limit, if you've got a small system and your OS mounts
|
||||||
|
# /tmp as tmpfs, you should set this to a path that's on a physical disk, like
|
||||||
|
# /home/your_user/tmp or something. ImageMagick will use this as scratch space
|
||||||
|
# when crunching through very large documents.
|
||||||
|
#
|
||||||
|
# For more information on how to use this value, you should probably search
|
||||||
|
# the web for "MAGICK_TMPDIR".
|
||||||
|
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||||
|
@ -129,10 +129,13 @@ class Consumer(object):
|
|||||||
|
|
||||||
# Convert PDF to multiple PNMs
|
# Convert PDF to multiple PNMs
|
||||||
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
||||||
subprocess.Popen((
|
run_convert(
|
||||||
self.CONVERT, "-density", "300", "-depth", "8",
|
self.CONVERT,
|
||||||
"-type", "grayscale", doc, pnm
|
"-density", "300",
|
||||||
)).wait()
|
"-depth", "8",
|
||||||
|
"-type", "grayscale",
|
||||||
|
doc, pnm,
|
||||||
|
)
|
||||||
|
|
||||||
# Get a list of converted images
|
# Get a list of converted images
|
||||||
pnms = []
|
pnms = []
|
||||||
@ -159,13 +162,12 @@ class Consumer(object):
|
|||||||
|
|
||||||
self.log("info", "Generating the thumbnail")
|
self.log("info", "Generating the thumbnail")
|
||||||
|
|
||||||
subprocess.Popen((
|
run_convert(
|
||||||
self.CONVERT,
|
self.CONVERT,
|
||||||
"-scale", "500x5000",
|
"-scale", "500x5000",
|
||||||
"-alpha", "remove",
|
"-alpha", "remove",
|
||||||
doc,
|
doc, os.path.join(tempdir, "convert-%04d.png")
|
||||||
os.path.join(tempdir, "convert-%04d.png")
|
)
|
||||||
)).wait()
|
|
||||||
|
|
||||||
return os.path.join(tempdir, "convert-0000.png")
|
return os.path.join(tempdir, "convert-0000.png")
|
||||||
|
|
||||||
@ -334,6 +336,16 @@ def image_to_string(args):
|
|||||||
|
|
||||||
def run_unpaper(args):
|
def run_unpaper(args):
|
||||||
unpaper, pnm = args
|
unpaper, pnm = args
|
||||||
subprocess.Popen((
|
subprocess.Popen(
|
||||||
unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
|
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
|
||||||
)).wait()
|
|
||||||
|
|
||||||
|
def run_convert(*args):
|
||||||
|
|
||||||
|
environment = {}
|
||||||
|
if settings.CONVERT_MEMORY_LIMIT:
|
||||||
|
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
||||||
|
if settings.CONVERT_TMPDIR:
|
||||||
|
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
|
||||||
|
|
||||||
|
subprocess.Popen(args, env=environment).wait()
|
||||||
|
@ -189,6 +189,8 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
|
|||||||
|
|
||||||
# Convert is part of the ImageMagick package
|
# Convert is part of the ImageMagick package
|
||||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
|
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
|
||||||
|
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
||||||
|
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
||||||
|
|
||||||
# Unpaper
|
# Unpaper
|
||||||
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||||
@ -226,7 +228,7 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
|
|||||||
SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
|
SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
|
||||||
|
|
||||||
#
|
#
|
||||||
# TODO: Remove after 1.2
|
# TODO: Remove after 0.2
|
||||||
#
|
#
|
||||||
# This logic is here to address issue #44, wherein we were using inconsistent
|
# This logic is here to address issue #44, wherein we were using inconsistent
|
||||||
# constant names vs. environment variables. If you're using Paperless for the
|
# constant names vs. environment variables. If you're using Paperless for the
|
||||||
|
Loading…
x
Reference in New Issue
Block a user