mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Style and removal of Python 2.7 stuff
This commit is contained in:
parent
9cef689106
commit
fb1da4834c
@ -22,7 +22,7 @@ class ConsumerError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Consumer(object):
|
class Consumer:
|
||||||
"""
|
"""
|
||||||
Loop over every file found in CONSUMPTION_DIR and:
|
Loop over every file found in CONSUMPTION_DIR and:
|
||||||
1. Convert it to a greyscale pnm
|
1. Convert it to a greyscale pnm
|
||||||
|
@ -52,15 +52,13 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return os.path.join(self.tempdir, "convert-0000.png")
|
return os.path.join(self.tempdir, "convert-0000.png")
|
||||||
|
|
||||||
def _is_ocred(self):
|
def _is_ocred(self):
|
||||||
|
|
||||||
# Extract text from PDF using pdftotext
|
# Extract text from PDF using pdftotext
|
||||||
text = get_text_from_pdf(self.document_path)
|
text = get_text_from_pdf(self.document_path)
|
||||||
|
|
||||||
# We assume, that a PDF with at least 50 characters contains text
|
# We assume, that a PDF with at least 50 characters contains text
|
||||||
# (so no OCR required)
|
# (so no OCR required)
|
||||||
if len(text) > 50:
|
return len(text) > 50
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
if self.TEXT_CACHE is not None:
|
if self.TEXT_CACHE is not None:
|
||||||
@ -74,7 +72,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
images = self._get_greyscale()
|
images = self._get_greyscale()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
self.TEXT_CACHE = self._get_ocr(images)
|
self.TEXT_CACHE = self._get_ocr(images)
|
||||||
return self.TEXT_CACHE
|
return self.TEXT_CACHE
|
||||||
except OCRError as e:
|
except OCRError as e:
|
||||||
@ -262,6 +259,7 @@ def image_to_string(args):
|
|||||||
|
|
||||||
|
|
||||||
def get_text_from_pdf(pdf_file):
|
def get_text_from_pdf(pdf_file):
|
||||||
|
|
||||||
with open(pdf_file, "rb") as f:
|
with open(pdf_file, "rb") as f:
|
||||||
try:
|
try:
|
||||||
pdf = pdftotext.PDF(f)
|
pdf = pdftotext.PDF(f)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user