mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Added a text cache to optimize performance of date detection
This commit is contained in:
		| @@ -35,6 +35,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     DATE_ORDER = settings.DATE_ORDER | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     OCR_ALWAYS = settings.OCR_ALWAYS | ||||
|+    TEXT_CACHE = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         """ | ||||
| @@ -62,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         return False | ||||
|  | ||||
|     def get_text(self): | ||||
|+        if self.TEXT_CACHE is not None: | ||||
|+            return self.TEXT_CACHE | ||||
|+ | ||||
|         if not self.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("info", "Skipping OCR, using Text from PDF") | ||||
|-            return get_text_from_pdf(self.document_path) | ||||
|+            self.TEXT_CACHE = get_text_from_pdf(self.document_path) | ||||
|+            return self.TEXT_CACHE | ||||
|  | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
|         try: | ||||
|  | ||||
|-            return self._get_ocr(images) | ||||
|+            self.TEXT_CACHE = self._get_ocr(images) | ||||
|+            return self.TEXT_CACHE | ||||
|         except OCRError as e: | ||||
|             raise ParseError(e) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Wolf-Bastian Pöttner
					Wolf-Bastian Pöttner