diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b7f7f61e7..77acd1ff2 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -22,7 +22,7 @@ class ConsumerError(Exception): pass -class Consumer(object): +class Consumer: """ Loop over every file found in CONSUMPTION_DIR and: 1. Convert it to a greyscale pnm diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 85209dd8e..b3f3e9613 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -52,15 +52,13 @@ class RasterisedDocumentParser(DocumentParser): return os.path.join(self.tempdir, "convert-0000.png") def _is_ocred(self): + # Extract text from PDF using pdftotext text = get_text_from_pdf(self.document_path) # We assume, that a PDF with at least 50 characters contains text # (so no OCR required) - if len(text) > 50: - return True - - return False + return len(text) > 50 def get_text(self): if self.TEXT_CACHE is not None: @@ -74,7 +72,6 @@ class RasterisedDocumentParser(DocumentParser): images = self._get_greyscale() try: - self.TEXT_CACHE = self._get_ocr(images) return self.TEXT_CACHE except OCRError as e: @@ -262,6 +259,7 @@ def image_to_string(args): def get_text_from_pdf(pdf_file): + with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f)