mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Added a text cache to optimize performance of date detection
This commit is contained in:
parent
bef2d94374
commit
40f8ba23a4
@ -35,6 +35,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
DATE_ORDER = settings.DATE_ORDER
|
DATE_ORDER = settings.DATE_ORDER
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||||
|
TEXT_CACHE = None
|
||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
"""
|
"""
|
||||||
@ -62,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
|
if self.TEXT_CACHE is not None:
|
||||||
|
return self.TEXT_CACHE
|
||||||
|
|
||||||
if not self.OCR_ALWAYS and self._is_ocred():
|
if not self.OCR_ALWAYS and self._is_ocred():
|
||||||
self.log("info", "Skipping OCR, using Text from PDF")
|
self.log("info", "Skipping OCR, using Text from PDF")
|
||||||
return get_text_from_pdf(self.document_path)
|
self.TEXT_CACHE = get_text_from_pdf(self.document_path)
|
||||||
|
return self.TEXT_CACHE
|
||||||
|
|
||||||
images = self._get_greyscale()
|
images = self._get_greyscale()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
return self._get_ocr(images)
|
self.TEXT_CACHE = self._get_ocr(images)
|
||||||
|
return self.TEXT_CACHE
|
||||||
except OCRError as e:
|
except OCRError as e:
|
||||||
raise ParseError(e)
|
raise ParseError(e)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user