diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c6cd392d8..57cfb0118 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -214,8 +214,12 @@ class RasterisedDocumentParser(DocumentParser): # This forces tesseract to use one core per page. os.environ['OMP_THREAD_LIMIT'] = "1" - text_original = self.extract_text(None, document_path) - original_has_text = text_original and len(text_original) > 50 + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = text_original and len(text_original) > 50 + else: + text_original = None + original_has_text = False if settings.OCR_MODE == "skip_noarchive" and original_has_text: self.log("debug",