mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Introducing language detection
This commit is contained in:
parent
dbe2df0cad
commit
ec70d05517
@ -1,6 +1,6 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import glob
|
import glob
|
||||||
import gnupg
|
import langdetect
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
@ -44,9 +44,10 @@ class Command(BaseCommand):
|
|||||||
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
||||||
self.verbosity = 0
|
self.verbosity = 0
|
||||||
self.stats = {}
|
self.stats = {}
|
||||||
self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
|
|
||||||
BaseCommand.__init__(self, *args, **kwargs)
|
BaseCommand.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
@ -132,6 +133,16 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
self._render(" OCRing the PDF", 2)
|
self._render(" OCRing the PDF", 2)
|
||||||
|
|
||||||
|
raw_text = self._ocr(pngs, self.OCR_LANG)
|
||||||
|
|
||||||
|
guessed_language = langdetect.detect(raw_text)
|
||||||
|
if guessed_language == self.OCR_LANG:
|
||||||
|
return raw_text
|
||||||
|
|
||||||
|
return self._ocr(pngs, guessed_language)
|
||||||
|
|
||||||
|
def _ocr(self, pngs, lang):
|
||||||
|
|
||||||
r = ""
|
r = ""
|
||||||
for png in pngs:
|
for png in pngs:
|
||||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user