From 9991f5a6b2d5472b69ba0a1a480952768894c716 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 25 Mar 2016 20:31:15 +0000 Subject: [PATCH 1/3] Introducing optional env vars for ImageMagick --- paperless.conf.example | 25 +++++++++++++++++++++++++ src/documents/consumer.py | 32 +++++++++++++++++++++++--------- src/paperless/settings.py | 4 +++- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index d254b7320..9ef9a1b42 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -32,7 +32,32 @@ PAPERLESS_PASSPHRASE="secret" # have a shared secret here. PAPERLESS_SHARED_SECRET="" +# +# The following values use sensible defaults for modern systems, but if you're +# running Paperless on a low-resource machine (like a Raspberry Pi), modifying +# some of these values may be necessary. +# + # By default, Paperless will attempt to use all available CPU cores to process # a document, but if you would like to limit that, you can set this value to # an integer: #PAPERLESS_OCR_THREADS=1 + +# On smaller systems, or even in the case of Very Large Documents, the consumer +# may explode, complaining about how it's "unable to extent pixel cache". In +# such cases, try setting this to a reasonably low value, like 32000000. The +# default is to use whatever is necessary to do everything without writing to +# disk, and units are in bytes. +# +# For more information on how to use this value, you should probably search +# the web for "MAGICK_MEMORY_LIMIT". +#PAPERLESS_CONVERT_MEMORY_LIMIT=0 + +# Similar to the memory limit, if you've got a small system and your OS mounts +# /tmp as tmpfs, you should set this to a path that's on a physical disk, like +# /home/your_user/tmp or something. ImageMagick will use this as scratch space +# when crunching through very large documents. +# +# For more information on how to use this value, you should probably search +# the web for "MAGICK_TMPDIR". 
+#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 45239696b..f564e96e3 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -129,10 +129,13 @@ class Consumer(object): # Convert PDF to multiple PNMs pnm = os.path.join(tempdir, "convert-%04d.pnm") - subprocess.Popen(( - self.CONVERT, "-density", "300", "-depth", "8", - "-type", "grayscale", doc, pnm - )).wait() + run_convert( + self.CONVERT, + "-density", "300", + "-depth", "8", + "-type", "grayscale", + doc, pnm, + ) # Get a list of converted images pnms = [] @@ -159,13 +162,14 @@ class Consumer(object): self.log("info", "Generating the thumbnail") - subprocess.Popen(( + run_convert( self.CONVERT, "-scale", "500x5000", "-alpha", "remove", + "-limit", "memory", "20MiB", doc, os.path.join(tempdir, "convert-%04d.png") - )).wait() + ) return os.path.join(tempdir, "convert-0000.png") @@ -334,6 +338,16 @@ def image_to_string(args): def run_unpaper(args): unpaper, pnm = args - subprocess.Popen(( - unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") - )).wait() + subprocess.Popen( + (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() + + +def run_convert(*args): + + environment = {} + if settings.CONVERT_MEMORY_LIMIT: + environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT + if settings.CONVERT_TMPDIR: + environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR + + subprocess.Popen(args, env=environment).wait() diff --git a/src/paperless/settings.py b/src/paperless/settings.py index bb1ba363b..7d9d03cd0 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -189,6 +189,8 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") # Convert is part of the ImageMagick package CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") +CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") +CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") # Unpaper UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", 
"unpaper") @@ -226,7 +228,7 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") # -# TODO: Remove after 1.2 +# TODO: Remove after 0.2 # # This logic is here to address issue #44, wherein we were using inconsistent # constant names vs. environment variables. If you're using Paperless for the From b387be6f253ae527a3bf06606e4c642588245b93 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 25 Mar 2016 20:33:00 +0000 Subject: [PATCH 2/3] I didn't mean to explicitly set -limit --- src/documents/consumer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f564e96e3..62ff7a00d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -166,9 +166,7 @@ class Consumer(object): self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - "-limit", "memory", "20MiB", - doc, - os.path.join(tempdir, "convert-%04d.png") + doc, os.path.join(tempdir, "convert-%04d.png") ) return os.path.join(tempdir, "convert-0000.png") From 23aa79f3073a5ffb04c105350f7e8c99dbd9979e Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 25 Mar 2016 20:51:22 +0000 Subject: [PATCH 3/3] Documented the new variables and updated the changelog --- docs/changelog.rst | 6 ++++++ docs/troubleshooting.rst | 44 +++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index c1397bb6c..da0d0ec56 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,10 @@ Changelog * 0.2.0 + * `#98`_: Added optional environment variables for ImageMagick so that it + doesn't explode when handling Very Large Documents or when it's just + running on a low-memory system. Thanks to `Florian Harr`_ for his help on + this one. * Added support for guessing the date from the file name along with the correspondent, title, and tags. 
Thanks to `Tikitu de Jager`_ for his pull request that I took forever to merge and to `Pit`_ for his efforts on the @@ -97,6 +101,7 @@ Changelog .. _zedster: https://github.com/zedster .. _Martin Honermeyer: https://github.com/djmaze .. _Tim White: https://github.com/timwhite +.. _Florian Harr: https://github.com/evils .. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 @@ -111,3 +116,4 @@ Changelog .. _#68: https://github.com/danielquinn/paperless/issues/68 .. _#71: https://github.com/danielquinn/paperless/issues/71 .. _#94: https://github.com/danielquinn/paperless/issues/71 +.. _#98: https://github.com/danielquinn/paperless/issues/98 diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst index 0fa7c1a29..39228ed48 100644 --- a/docs/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -3,17 +3,47 @@ Troubleshooting =============== -.. _troubleshooting_ocr_language_files_missing: +.. _troubleshooting-languagemissing: Consumer warns ``OCR for XX failed`` ------------------------------------ -If you find the OCR accuracy to be too low, and/or the document consumer warns that ``OCR for -XX failed, but we're going to stick with what we've got since FORGIVING_OCR is enabled``, then you -might need to install the `Tesseract language files -`_ marching your documents languages. +If you find the OCR accuracy to be too low, and/or the document consumer warns +that ``OCR for XX failed, but we're going to stick with what we've got since +FORGIVING_OCR is enabled``, then you might need to install the +`Tesseract language files `_ +matching your documents' languages. 
-As an example, if you are running Paperless from the Vagrant setup provided (or from any Ubuntu or Debian -box), and your documents are written in Spanish you may need to run:: +As an example, if you are running Paperless from the Vagrant setup provided +(or from any Ubuntu or Debian box), and your documents are written in Spanish +you may need to run:: apt-get install -y tesseract-ocr-spa + + +.. _troubleshooting-convertpixelcache: + +Consumer dies with ``convert: unable to extent pixel cache`` +------------------------------------------------------------ + +During the consumption process, Paperless invokes ImageMagick's ``convert`` +program to translate the source document into something that the OCR engine can +understand and this can burn a Very Large amount of memory if the original +document is rather long. Similarly, if your system doesn't have a lot of +memory to begin with (e.g. a Raspberry Pi), then this can happen for even +medium-sized documents. + +The solution is to tell ImageMagick *not* to Use All The RAM, as is its +default, and instead tell it to use a fixed amount. ``convert`` will then +break up the job into hundreds of individual files and use them to slowly +compile the finished image. Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in +``/etc/paperless.conf`` to something like ``32000000`` and you'll limit +``convert`` to 32MB. Fiddle with this value as you like. + +**HOWEVER**: Simply setting this value may not be enough on systems where +``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its +temporary files. In these cases (most Systemd machines), you need to tell +ImageMagick to use a different space for its scratch work. You do this by +setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere +that's actually on a physical disk (and writable by the user running +Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.