Fixes bauerj/paperless_app#23 and the same problem with most other scanner apps out there.

This commit is contained in:
jonaswinkler 2020-12-12 18:25:15 +01:00
parent bf9051e44d
commit a0631413d6
2 changed files with 38 additions and 1 deletions

View File

@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
    """Estimate the DPI of an image by assuming it depicts an A4 page.

    The estimate treats the image's pixel width as spanning the width of
    an A4 sheet (210 mm) and truncates the result to a whole number.

    :param image: the image to inspect; passed unchanged to ``Image.open``.
    :return: the estimated DPI as an ``int``, or ``None`` if the image
        could not be opened or measured (a warning is logged in that case).
    """
    # A4 is 21 cm wide; 2.54 cm per inch.
    a4_width_inches = 21 / 2.54
    try:
        with Image.open(image) as im:
            # Only the width takes part in the estimate.
            width, _ = im.size
            # divide image width by A4 width (210mm) in inches.
            dpi = int(width / a4_width_inches)
            self.log(
                'debug',
                f"Estimated DPI {dpi} based on image width {width}"
            )
            return dpi
    except Exception as e:
        # Best effort: the caller treats a missing estimate as "no DPI
        # available", so report the problem and carry on.
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {e}")
        return None
def parse(self, document_path, mime_type):
mode = settings.OCR_MODE
@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
if self.is_image(mime_type):
dpi = self.get_dpi(document_path)
a4_dpi = self.calculate_a4_dpi(document_path)
if dpi:
self.log(
"debug",
@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
ocr_args['image_dpi'] = dpi
elif settings.OCR_IMAGE_DPI:
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
elif a4_dpi:
ocr_args['image_dpi'] = a4_dpi
else:
raise ParseError(
f"Cannot produce archive PDF for image {document_path}, "
@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file):
if not os.path.isfile(pdf_file):
return None
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)

View File

@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f)
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
parser = RasterisedDocumentParser(None)
def test_image_no_dpi_fail(self):
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):
m.return_value = None
parser = RasterisedDocumentParser(None)
def f():