def get_text(self):
    """
    Return the document's text, caching the first non-None result.

    A previously stored ``TEXT_CACHE`` value is returned as-is.  Otherwise,
    when ``OCR_ALWAYS`` is false and ``_is_ocred()`` is true, the text is
    taken from ``get_text_from_pdf(self.document_path)``; in every other
    case the images from ``_get_greyscale()`` are handed to ``_get_ocr()``.
    The result is stored in ``self.TEXT_CACHE`` before being returned.

    Raises:
        ParseError: if ``_get_ocr()`` raises ``OCRError``.
    """
    # getattr keeps the lookup safe even when no ``TEXT_CACHE = None``
    # default has been declared on the class.
    cached = getattr(self, "TEXT_CACHE", None)
    if cached is not None:
        return cached

    if not self.OCR_ALWAYS and self._is_ocred():
        self.log("info", "Skipping OCR, using Text from PDF")
        text = get_text_from_pdf(self.document_path)
    else:
        images = self._get_greyscale()
        try:
            # Only the OCR call sits in the try: it is the step whose
            # OCRError gets translated into a ParseError.
            text = self._get_ocr(images)
        except OCRError as e:
            # Chain explicitly so the OCR failure stays attached as the cause.
            raise ParseError(e) from e

    # Single write point for the cache; both branches above end up here.
    self.TEXT_CACHE = text
    return text