diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 1cf6a769c..80e200f27 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser): f"Error while getting DPI from image {image}: {e}") return None + def calculate_a4_dpi(self, image): + try: + with Image.open(image) as im: + width, height = im.size + # divide image width by A4 width (210mm) in inches. + dpi = int(width / (21 / 2.54)) + self.log( + 'debug', + f"Estimated DPI {dpi} based on image width {width}" + ) + return dpi + + except Exception as e: + self.log( + 'warning', + f"Error while calculating DPI for image {image}: {e}") + return None + def parse(self, document_path, mime_type): mode = settings.OCR_MODE @@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser): if self.is_image(mime_type): dpi = self.get_dpi(document_path) + a4_dpi = self.calculate_a4_dpi(document_path) if dpi: self.log( "debug", @@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser): ocr_args['image_dpi'] = dpi elif settings.OCR_IMAGE_DPI: ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI + elif a4_dpi: + ocr_args['image_dpi'] = a4_dpi else: raise ParseError( f"Cannot produce archive PDF for image {document_path}, " @@ -241,6 +262,9 @@ def strip_excess_whitespace(text): def get_text_from_pdf(pdf_file): + if not os.path.isfile(pdf_file): + return None + with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 8834ec755..7be176663 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase): self.assertRaises(ParseError, f) + @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") + def test_image_calc_a4_dpi(self, m): + parser = RasterisedDocumentParser(None) - def test_image_no_dpi_fail(self): + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") + + m.assert_called_once() + + args, kwargs = m.call_args + + self.assertEqual(kwargs['image_dpi'], 62) + + @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") + def test_image_dpi_fail(self, m): + m.return_value = None parser = RasterisedDocumentParser(None) def f():