From 40f8ba23a439dc5dd75a9dbae438aa925ce69287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wolf-Bastian=20P=C3=B6ttner?= Date: Fri, 2 Feb 2018 23:52:44 +0100 Subject: [PATCH] Added a text cache to optimize performance of date detection --- src/paperless_tesseract/parsers.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 3d0f82b3e..85209dd8e 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -35,6 +35,7 @@ class RasterisedDocumentParser(DocumentParser): DATE_ORDER = settings.DATE_ORDER DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE OCR_ALWAYS = settings.OCR_ALWAYS + TEXT_CACHE = None def get_thumbnail(self): """ @@ -62,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser): return False def get_text(self): + if self.TEXT_CACHE is not None: + return self.TEXT_CACHE + if not self.OCR_ALWAYS and self._is_ocred(): self.log("info", "Skipping OCR, using Text from PDF") - return get_text_from_pdf(self.document_path) + self.TEXT_CACHE = get_text_from_pdf(self.document_path) + return self.TEXT_CACHE images = self._get_greyscale() try: - return self._get_ocr(images) + self.TEXT_CACHE = self._get_ocr(images) + return self.TEXT_CACHE except OCRError as e: raise ParseError(e)