mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Style and removal of Python 2.7 stuff
This commit is contained in:
parent
9cef689106
commit
fb1da4834c
@ -22,7 +22,7 @@ class ConsumerError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Consumer(object):
|
||||
class Consumer:
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale pnm
|
||||
|
@ -52,15 +52,13 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return os.path.join(self.tempdir, "convert-0000.png")
|
||||
|
||||
def _is_ocred(self):
|
||||
|
||||
# Extract text from PDF using pdftotext
|
||||
text = get_text_from_pdf(self.document_path)
|
||||
|
||||
# We assume, that a PDF with at least 50 characters contains text
|
||||
# (so no OCR required)
|
||||
if len(text) > 50:
|
||||
return True
|
||||
|
||||
return False
|
||||
return len(text) > 50
|
||||
|
||||
def get_text(self):
|
||||
if self.TEXT_CACHE is not None:
|
||||
@ -74,7 +72,6 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
images = self._get_greyscale()
|
||||
|
||||
try:
|
||||
|
||||
self.TEXT_CACHE = self._get_ocr(images)
|
||||
return self.TEXT_CACHE
|
||||
except OCRError as e:
|
||||
@ -262,6 +259,7 @@ def image_to_string(args):
|
||||
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
|
||||
with open(pdf_file, "rb") as f:
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
|
Loading…
x
Reference in New Issue
Block a user