From 20b2408dbbbb64972c09029757ba4f9f945302b4 Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Sun, 14 Feb 2016 16:37:38 +0100
Subject: [PATCH 1/2] Ensure `OCR_THREADS` is integer, add documentation

---
 docs/setup.rst            | 5 +++++
 src/documents/consumer.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/setup.rst b/docs/setup.rst
index 1ca9a6ed3..24a9b9fa2 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -58,6 +58,11 @@ Standard (Bare Metal)
       passphrase from the environment, so if you don't set it to a static value
       here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
       command line whenever invoking the consumer or webserver.
+    * ``OCR_THREADS``: this is the number of threads the OCR process will spawn
+      to process document pages in parallel. The default value is read from the
+      environment variable ``PAPERLESS_OCR_THREADS``, which must be set to an
+      integer. If the variable is not set, Python determines the core count of
+      your CPU and uses that value.
 4. Initialise the database with ``./manage.py migrate``.
 5. Create a user for your Paperless instance with
    ``./manage.py createsuperuser``. Follow the prompts to create your user.
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 3f3b9e9a3..2fa0ea016 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -52,7 +52,7 @@ class Consumer(Renderable):
     SCRATCH = settings.SCRATCH_DIR
     CONVERT = settings.CONVERT_BINARY
     CONSUME = settings.CONSUMPTION_DIR
-    THREADS = settings.OCR_THREADS
+    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 
     OCR = pyocr.get_available_tools()[0]
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

From aeab9a0e81b43fca9bd338336e2d7d258928fb6c Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Sun, 14 Feb 2016 16:13:34 +0100
Subject: [PATCH 2/2] Detect language only on one page of PDF

Currently, the entire document gets processed in order to detect its
language. If the detected language differs from the default one, the
entire document is processed again with the new language.

This PR analyzes the middle page for its language and either processes
the remaining pages with the default language if it didn't differ, or
processes all pages for the new guessed language.

The number of processed pages in the worst case comes down from `2n` to
`n+1`.
---
 src/documents/consumer.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 2fa0ea016..dc4405fab 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -9,6 +9,7 @@ import random
 import re
 import subprocess
 
+import math
 import pyocr
 
 from PIL import Image
@@ -152,9 +153,14 @@ class Consumer(Renderable):
         simple language detection trial & error.
         """
 
+        if not pngs:
+            raise OCRError
+
         self._render("  OCRing the document", 2)
 
-        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
+        # int() truncates the quotient, so the index is valid for every page count, including a single page (index 0)
+        middle = int(len(pngs) / 2)
+        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
 
         guessed_language = self._guess_language(raw_text)
 
@@ -166,10 +172,14 @@ class Consumer(Renderable):
                     "with what we have.",
                     1
                 )
+                raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
+                raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
                 return raw_text
             raise OCRError
 
         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
+            raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
+            raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
             return raw_text
 
         try:
@@ -183,6 +193,8 @@ class Consumer(Renderable):
                     ),
                     0
                 )
+                raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
+                raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
                 return raw_text
             raise OCRError
 
@@ -191,6 +203,9 @@ class Consumer(Renderable):
         Performs a single OCR attempt.
         """
 
+        if not pngs:
+            return ""
+
         self._render("    Parsing for {}".format(lang), 2)
 
         with Pool(processes=self.THREADS) as pool: