From 1f707e86ccf9a64962265b3e6b4f4ec346884576 Mon Sep 17 00:00:00 2001 From: jonaswinkler <17569239+jonaswinkler@users.noreply.github.com> Date: Sun, 13 Jun 2021 12:09:16 +0200 Subject: [PATCH] fix logging getting spammed with pdfminer warnings on JPG files --- src/paperless_tesseract/parsers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c6cd392d8..57cfb0118 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -214,8 +214,12 @@ class RasterisedDocumentParser(DocumentParser): # This forces tesseract to use one core per page. os.environ['OMP_THREAD_LIMIT'] = "1" - text_original = self.extract_text(None, document_path) - original_has_text = text_original and len(text_original) > 50 + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = text_original and len(text_original) > 50 + else: + text_original = None + original_has_text = False if settings.OCR_MODE == "skip_noarchive" and original_has_text: self.log("debug",