mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Detect language only on one page of PDF
Currently, the entire document is processed in order to detect its language. If the detected language differs from the default one, the entire document is processed again with the new language. This PR instead analyzes only the middle page to detect the language, and then either processes the remaining pages with the default language if the detected language did not differ, or processes all pages with the newly guessed language. The number of pages processed in the worst case drops from `2n` to `n+1`.
This commit is contained in:
parent
9162e41507
commit
aeab9a0e81
@ -9,6 +9,7 @@ import random
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
import math
|
||||||
import pyocr
|
import pyocr
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -152,9 +153,14 @@ class Consumer(Renderable):
|
|||||||
simple language detection trial & error.
|
simple language detection trial & error.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not pngs:
|
||||||
|
raise OCRError
|
||||||
|
|
||||||
self._render(" OCRing the document", 2)
|
self._render(" OCRing the document", 2)
|
||||||
|
|
||||||
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
# Since the division gets rounded down by int, this calculation works for every edge-case, i.e. 1
|
||||||
|
middle = int(len(pngs) / 2)
|
||||||
|
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||||
|
|
||||||
guessed_language = self._guess_language(raw_text)
|
guessed_language = self._guess_language(raw_text)
|
||||||
|
|
||||||
@ -166,10 +172,14 @@ class Consumer(Renderable):
|
|||||||
"with what we have.",
|
"with what we have.",
|
||||||
1
|
1
|
||||||
)
|
)
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError
|
raise OCRError
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -183,6 +193,8 @@ class Consumer(Renderable):
|
|||||||
),
|
),
|
||||||
0
|
0
|
||||||
)
|
)
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError
|
raise OCRError
|
||||||
|
|
||||||
@ -191,6 +203,9 @@ class Consumer(Renderable):
|
|||||||
Performs a single OCR attempt.
|
Performs a single OCR attempt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not pngs:
|
||||||
|
return ""
|
||||||
|
|
||||||
self._render(" Parsing for {}".format(lang), 2)
|
self._render(" Parsing for {}".format(lang), 2)
|
||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=self.THREADS) as pool:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user