Added a text cache to optimize performance of date detection

This commit is contained in:
Wolf-Bastian Pöttner 2018-02-02 23:52:44 +01:00
parent bef2d94374
commit 40f8ba23a4

View File

@ -35,6 +35,7 @@ class RasterisedDocumentParser(DocumentParser):
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
TEXT_CACHE = None
def get_thumbnail(self):
"""
@ -62,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser):
return False
def get_text(self):
    """
    Return the text of the document, extracting it at most once.

    The first successful extraction is stored in ``self.TEXT_CACHE`` and
    every later call returns that stored value without touching the
    document again.

    When ``OCR_ALWAYS`` is false and ``_is_ocred()`` reports true, the
    text is taken from ``get_text_from_pdf(self.document_path)``.
    Otherwise the greyscale images from ``_get_greyscale()`` are passed
    to ``_get_ocr()``.

    Raises ``ParseError`` (chained to the original error) when OCR fails
    with ``OCRError``.
    """
    # Compare against None rather than truthiness so that a legitimately
    # empty result is also served from the cache.
    if self.TEXT_CACHE is not None:
        return self.TEXT_CACHE

    if not self.OCR_ALWAYS and self._is_ocred():
        self.log("info", "Skipping OCR, using Text from PDF")
        # Assigning through ``self`` stores the value on this instance
        # only; the class-level ``TEXT_CACHE = None`` default is untouched.
        self.TEXT_CACHE = get_text_from_pdf(self.document_path)
        return self.TEXT_CACHE

    images = self._get_greyscale()

    try:
        text = self._get_ocr(images)
    except OCRError as e:
        # Nothing is cached on failure, so a later call retries the OCR.
        raise ParseError(e) from e

    self.TEXT_CACHE = text
    return self.TEXT_CACHE