From 52c5aafb3fe3f9debc8d9f1c4f9f8318c820e38a Mon Sep 17 00:00:00 2001 From: Brian Martin <bmartin5692@gmail.com> Date: Fri, 13 May 2016 22:47:40 -0400 Subject: [PATCH 1/3] Convert Density Add settings variable for the convert density setting. If no variable is set, default to 300. --- src/documents/consumer.py | 3 ++- src/paperless/settings.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index bac93647e..df1a73cb3 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -53,6 +53,7 @@ class Consumer(object): UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None + DENSITY = int(settings.CONVERT_DENSITY) if settings.CONVERT_DENSITY else 300 DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE @@ -158,7 +159,7 @@ class Consumer(object): pnm = os.path.join(tempdir, "convert-%04d.pnm") run_convert( self.CONVERT, - "-density", "300", + "-density", self.DENSITY, "-depth", "8", "-type", "grayscale", doc, pnm, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index db74d9cea..ad6084711 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -191,6 +191,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") +CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") # Unpaper UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") From b6ae129ad181f325e2fb0253d554f2ca91939b8c Mon Sep 17 00:00:00 2001 From: Brian Martin <bmartin5692@gmail.com> Date: Fri, 13 May 2016 23:23:58 -0400 Subject: [PATCH 2/3] Sample Config and Bug Fix Update sample config to reflect new setting variable. Change consumer to handle density setting as str instead of int. 
--- paperless.conf.example | 11 +++++++++++ src/documents/consumer.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index c6f6ae191..26ca8a8fb 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -61,6 +61,17 @@ PAPERLESS_SHARED_SECRET="" # the web for "MAGICK_MEMORY_LIMIT". #PAPERLESS_CONVERT_MEMORY_LIMIT=0 +# By default the conversion density setting for documents is 300 DPI; in some +# cases it has proven useful to configure a lower value. +# This setting has a high impact on the physical size of tmp page files and +# the speed of document conversion, and can affect the accuracy of OCR +# results. Individual results can vary and this setting should be tested +# thoroughly against the documents you are importing to see if it has any +# impacts either negative or positive. Testing on limited document sets has +# shown a setting of 200 can cut the size of tmp files by 1/3, and speed up +# conversion by up to 4x with little impact to OCR accuracy. +#PAPERLESS_CONVERT_DENSITY=200 + # Similar to the memory limit, if you've got a small system and your OS mounts # /tmp as tmpfs, you should set this to a path that's on a physical disk, like # /home/your_user/tmp or something. 
ImageMagick will use this as scratch space diff --git a/src/documents/consumer.py b/src/documents/consumer.py index df1a73cb3..3d66d581d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -53,7 +53,7 @@ class Consumer(object): UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None - DENSITY = int(settings.CONVERT_DENSITY) if settings.CONVERT_DENSITY else 300 + DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE @@ -159,7 +159,7 @@ class Consumer(object): pnm = os.path.join(tempdir, "convert-%04d.pnm") run_convert( self.CONVERT, - "-density", self.DENSITY, + "-density", str(self.DENSITY), "-depth", "8", "-type", "grayscale", doc, pnm, From 9c062206e42e39638c961be3e16dc2f40fb07b74 Mon Sep 17 00:00:00 2001 From: Brian Martin <bmartin5692@gmail.com> Date: Sun, 15 May 2016 21:56:41 -0400 Subject: [PATCH 3/3] Sample Config Update Update Sample Config to show 300 density as default. --- paperless.conf.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperless.conf.example b/paperless.conf.example index 26ca8a8fb..85709698f 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -70,7 +70,7 @@ PAPERLESS_SHARED_SECRET="" # impacts either negative or positive. Testing on limited document sets has # shown a setting of 200 can cut the size of tmp files by 1/3, and speed up # conversion by up to 4x with little impact to OCR accuracy. -#PAPERLESS_CONVERT_DENSITY=200 +#PAPERLESS_CONVERT_DENSITY=300 # Similar to the memory limit, if you've got a small system and your OS mounts # /tmp as tmpfs, you should set this to a path that's on a physical disk, like