fixes bauerj/paperless_app#23 and most other scanner apps out there.

2026-02-18 00:29:35 -06:00 · 2020-12-12 18:25:15 +01:00
parent bf9051e44d
commit a0631413d6
2 changed files with 38 additions and 1 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
                f"Error while getting DPI from image {image}: {e}")
            return None

+    def calculate_a4_dpi(self, image):
+        """Estimate DPI assuming the image width spans an A4 page."""
+        try:
+            with Image.open(image) as im:
+                width = im.size[0]
+            # A4 is 210 mm wide; 21 cm / 2.54 gives that width in inches.
+            dpi = int(width / (21 / 2.54))
+            self.log(
+                'debug',
+                f"Estimated DPI {dpi} based on image width {width}")
+            return dpi
+        except Exception as e:
+            # Best effort: log the failure and report "no estimate".
+            self.log(
+                'warning',
+                f"Error while calculating DPI for image {image}: {e}")
+            return None
+
    def parse(self, document_path, mime_type):
        mode = settings.OCR_MODE

@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
+            a4_dpi = self.calculate_a4_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
+            elif a4_dpi:
+                ocr_args['image_dpi'] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):

 def get_text_from_pdf(pdf_file):

+    if not os.path.isfile(pdf_file):
+        return None
+
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)