mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Added language detection
This commit is contained in:
parent
bcdcfbaee0
commit
fdb29f739f
@ -16,10 +16,15 @@ from django.core.management.base import BaseCommand, CommandError
|
|||||||
from django.template.defaultfilters import slugify
|
from django.template.defaultfilters import slugify
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from documents.models import Document, Sender
|
from ...languages import ISO639
|
||||||
|
from ...models import Document, Sender
|
||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
|
|
||||||
|
class OCRError(BaseException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
"""
|
"""
|
||||||
Loop over every file found in CONSUMPTION_DIR and:
|
Loop over every file found in CONSUMPTION_DIR and:
|
||||||
@ -37,7 +42,7 @@ class Command(BaseCommand):
|
|||||||
CONSUME = settings.CONSUMPTION_DIR
|
CONSUME = settings.CONSUMPTION_DIR
|
||||||
|
|
||||||
OCR = pyocr.get_available_tools()[0]
|
OCR = pyocr.get_available_tools()[0]
|
||||||
OCR_LANG = settings.TESSERACT_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
||||||
|
|
||||||
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
|
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
|
||||||
@ -47,6 +52,7 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
self.verbosity = 0
|
self.verbosity = 0
|
||||||
self.stats = {}
|
self.stats = {}
|
||||||
|
self._ignore = []
|
||||||
|
|
||||||
BaseCommand.__init__(self, *args, **kwargs)
|
BaseCommand.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
@ -81,13 +87,22 @@ class Command(BaseCommand):
|
|||||||
if not re.match(self.PARSER_REGEX_TITLE, pdf):
|
if not re.match(self.PARSER_REGEX_TITLE, pdf):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if pdf in self._ignore:
|
||||||
|
continue
|
||||||
|
|
||||||
if self._is_ready(pdf):
|
if self._is_ready(pdf):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self._render("Consuming {}".format(pdf), 1)
|
self._render("Consuming {}".format(pdf), 1)
|
||||||
|
|
||||||
pngs = self._get_greyscale(pdf)
|
pngs = self._get_greyscale(pdf)
|
||||||
text = self._get_ocr(pngs)
|
|
||||||
|
try:
|
||||||
|
text = self._get_ocr(pngs)
|
||||||
|
except OCRError:
|
||||||
|
self._ignore.append(pdf)
|
||||||
|
self._render("OCR FAILURE: {}".format(pdf), 0)
|
||||||
|
continue
|
||||||
|
|
||||||
self._store(text, pdf)
|
self._store(text, pdf)
|
||||||
self._cleanup(pngs, pdf)
|
self._cleanup(pngs, pdf)
|
||||||
@ -131,23 +146,51 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def _get_ocr(self, pngs):
|
def _get_ocr(self, pngs):
|
||||||
|
|
||||||
self._render(" OCRing the PDF", 2)
|
self._render(" OCRing the PDF", 1)
|
||||||
|
|
||||||
raw_text = self._ocr(pngs, self.OCR_LANG)
|
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
||||||
|
|
||||||
guessed_language = langdetect.detect(raw_text)
|
guessed_language = langdetect.detect(raw_text)
|
||||||
if guessed_language == self.OCR_LANG:
|
|
||||||
|
self._render(" Language detected: {}".format(guessed_language), 2)
|
||||||
|
|
||||||
|
if guessed_language not in ISO639:
|
||||||
|
self._render("Language detection failed!", 0)
|
||||||
|
if settings.FORGIVING_OCR:
|
||||||
|
self._render(
|
||||||
|
"As FORGIVING_OCR is enabled, we're going to make the best "
|
||||||
|
"with what we have.",
|
||||||
|
1
|
||||||
|
)
|
||||||
|
return raw_text
|
||||||
|
raise OCRError
|
||||||
|
|
||||||
|
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||||
return raw_text
|
return raw_text
|
||||||
|
|
||||||
return self._ocr(pngs, guessed_language)
|
try:
|
||||||
|
return self._ocr(pngs, ISO639[guessed_language])
|
||||||
|
except pyocr.pyocr.tesseract.TesseractError:
|
||||||
|
if settings.FORGIVING_OCR:
|
||||||
|
self._render(
|
||||||
|
"OCR for {} failed, but we're going to stick with what "
|
||||||
|
"we've got since FORGIVING_OCR is enabled.".format(
|
||||||
|
guessed_language
|
||||||
|
),
|
||||||
|
0
|
||||||
|
)
|
||||||
|
return raw_text
|
||||||
|
raise OCRError
|
||||||
|
|
||||||
def _ocr(self, pngs, lang):
|
def _ocr(self, pngs, lang):
|
||||||
|
|
||||||
|
self._render(" Parsing for {}".format(lang), 2)
|
||||||
|
|
||||||
r = ""
|
r = ""
|
||||||
for png in pngs:
|
for png in pngs:
|
||||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||||
self._render(" {}".format(f.filename), 3)
|
self._render(" {}".format(f.filename), 3)
|
||||||
r += self.OCR.image_to_string(f, lang=self.OCR_LANG)
|
r += self.OCR.image_to_string(f, lang=lang)
|
||||||
r += "\n\n\n\n\n\n\n\n"
|
r += "\n\n\n\n\n\n\n\n"
|
||||||
|
|
||||||
return r
|
return r
|
||||||
|
Loading…
x
Reference in New Issue
Block a user