mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #32 from pitkley/feature/single-page-langdetect
Detect language only on first page of PDF
This commit is contained in:
commit
4689e2b975
@ -9,6 +9,7 @@ import random
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
import math
|
||||||
import pyocr
|
import pyocr
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -152,9 +153,14 @@ class Consumer(Renderable):
|
|||||||
simple language detection trial & error.
|
simple language detection trial & error.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not pngs:
|
||||||
|
raise OCRError
|
||||||
|
|
||||||
self._render(" OCRing the document", 2)
|
self._render(" OCRing the document", 2)
|
||||||
|
|
||||||
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
# Since int() rounds the division down, this is a valid index for every non-empty list, e.g. a single page gives middle == 0
|
||||||
|
middle = int(len(pngs) / 2)
|
||||||
|
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||||
|
|
||||||
guessed_language = self._guess_language(raw_text)
|
guessed_language = self._guess_language(raw_text)
|
||||||
|
|
||||||
@ -166,10 +172,14 @@ class Consumer(Renderable):
|
|||||||
"with what we have.",
|
"with what we have.",
|
||||||
1
|
1
|
||||||
)
|
)
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError
|
raise OCRError
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -183,6 +193,8 @@ class Consumer(Renderable):
|
|||||||
),
|
),
|
||||||
0
|
0
|
||||||
)
|
)
|
||||||
|
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
|
||||||
|
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError
|
raise OCRError
|
||||||
|
|
||||||
@ -191,6 +203,9 @@ class Consumer(Renderable):
|
|||||||
Performs a single OCR attempt.
|
Performs a single OCR attempt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not pngs:
|
||||||
|
return ""
|
||||||
|
|
||||||
self._render(" Parsing for {}".format(lang), 2)
|
self._render(" Parsing for {}".format(lang), 2)
|
||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=self.THREADS) as pool:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user