diff --git a/paperless.conf.example b/paperless.conf.example index c6f6ae191..85709698f 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -61,6 +61,17 @@ PAPERLESS_SHARED_SECRET="" # the web for "MAGICK_MEMORY_LIMIT". #PAPERLESS_CONVERT_MEMORY_LIMIT=0 +# By default the conversion density setting for documents is 300DPI; in some +# cases it has proven useful to configure a lower value. +# This setting has a high impact on the physical size of tmp page files and +# the speed of document conversion, and can affect the accuracy of OCR +# results. Individual results can vary and this setting should be tested +# thoroughly against the documents you are importing to see if it has any +# impact, either negative or positive. Testing on limited document sets has +# shown a setting of 200 can cut the size of tmp files by 1/3, and speed up +# conversion by up to 4x with little impact on OCR accuracy. +#PAPERLESS_CONVERT_DENSITY=300 + # Similar to the memory limit, if you've got a small system and your OS mounts # /tmp as tmpfs, you should set this to a path that's on a physical disk, like # /home/your_user/tmp or something. 
ImageMagick will use this as scratch space diff --git a/src/documents/consumer.py b/src/documents/consumer.py index bac93647e..3d66d581d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -53,6 +53,7 @@ class Consumer(object): UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None + DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE @@ -158,7 +159,7 @@ class Consumer(object): pnm = os.path.join(tempdir, "convert-%04d.pnm") run_convert( self.CONVERT, - "-density", "300", + "-density", str(self.DENSITY), "-depth", "8", "-type", "grayscale", doc, pnm, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index db74d9cea..ad6084711 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -191,6 +191,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") +CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") # Unpaper UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")