From fb1da4834ce2632cef86d0ad1cc2680419aaf829 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sun, 18 Feb 2018 15:55:55 +0000 Subject: [PATCH] Style and removal of Python 2.7 stuff --- src/documents/consumer.py | 2 +- src/paperless_tesseract/parsers.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b7f7f61e7..77acd1ff2 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -22,7 +22,7 @@ class ConsumerError(Exception): pass -class Consumer(object): +class Consumer: """ Loop over every file found in CONSUMPTION_DIR and: 1. Convert it to a greyscale pnm diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 85209dd8e..b3f3e9613 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -52,15 +52,13 @@ class RasterisedDocumentParser(DocumentParser): return os.path.join(self.tempdir, "convert-0000.png") def _is_ocred(self): + # Extract text from PDF using pdftotext text = get_text_from_pdf(self.document_path) # We assume, that a PDF with at least 50 characters contains text # (so no OCR required) - if len(text) > 50: - return True - - return False + return len(text) > 50 def get_text(self): if self.TEXT_CACHE is not None: @@ -74,7 +72,6 @@ class RasterisedDocumentParser(DocumentParser): images = self._get_greyscale() try: - self.TEXT_CACHE = self._get_ocr(images) return self.TEXT_CACHE except OCRError as e: @@ -262,6 +259,7 @@ def image_to_string(args): def get_text_from_pdf(pdf_file): + with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f)