Detect the language using only one page of the PDF

Currently, the entire document is processed in order to detect its language. If the detected language differs from the default one, the entire document is then processed again with the new language. This PR instead analyzes only the middle page to guess the language, and then either processes the remaining pages with the default language (if the guess matches the default) or processes all pages with the newly guessed language. The number of processed pages drops from a worst case of `2n` to a worst case of `n+1`.
2025-11-21 04:36:53 -06:00 · 2016-02-14 16:13:34 +01:00
parent 9162e41507
commit aeab9a0e81
1 changed files with 16 additions and 1 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -9,6 +9,7 @@ import random
 import re
 import subprocess
 import math
 import pyocr
 from PIL import Image
@@ -152,9 +153,14 @@ class Consumer(Renderable):
        simple language detection trial & error.
        """
        if not pngs:
            raise OCRError
        self._render("  OCRing the document", 2)
-        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
+        # int() truncates the quotient, so `middle` is always a valid index into pngs, even in the edge case of a single page (int(1 / 2) == 0)
        middle = int(len(pngs) / 2)
        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
        guessed_language = self._guess_language(raw_text)
@@ -166,10 +172,14 @@ class Consumer(Renderable):
                    "with what we have.",
                    1
                )
                raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
                raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
                return raw_text
            raise OCRError
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
            raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
            return raw_text
        try:
@@ -183,6 +193,8 @@ class Consumer(Renderable):
                    ),
                    0
                )
                raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
                raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
                return raw_text
            raise OCRError
@@ -191,6 +203,9 @@ class Consumer(Renderable):
        Performs a single OCR attempt.
        """
        if not pngs:
            return ""
        self._render("    Parsing for {}".format(lang), 2)
        with Pool(processes=self.THREADS) as pool: