added image DPI detection to the tesseract parser.

2026-01-30 23:08:59 -06:00 · 2020-11-25 19:37:48 +01:00
parent 9bfa088eb5
commit 3b655c95d9
4 changed files with 48 additions and 3 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -250,6 +250,19 @@ PAPERLESS_OCR_OUTPUT_TYPE=<type>
    If not specified, ``pdfa`` is used. Remember that paperless also keeps
    the original input file as well as the archived version.
 PAPERLESS_OCR_IMAGE_DPI=<num>
    Paperless will OCR any images you put into the system and convert them
    into PDF documents. This is useful if your scanner produces images.
    In order to do so, paperless needs to know the DPI of the image.
    Most images from scanners will have this information embedded and
    paperless will detect and use that information. In case this fails, it
    uses this value as a fallback.
    Set this to the DPI your scanner produces images at.
    There is no default. If this is not set and an image contains no DPI
    information, paperless cannot produce the archive PDF and fails to
    process that image.
 PAPERLESS_CONSUMER_POLLING=<num>
    If paperless doesn't find documents added to your consume folder, it might
    not be able to automatically detect filesystem changes. In that case,
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -40,6 +40,7 @@
 #PAPERLESS_OCR_LANGUAGE=eng
 #PAPERLESS_OCR_OUTPUT_TYPE=pdfa
 #PAPERLESS_OCR_MODE=skip
 #PAPERLESS_OCR_IMAGE_DPI=300
 #PAPERLESS_CONSUMER_POLLING=10
 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
 #PAPERLESS_CONVERT_MEMORY_LIMIT=0
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -346,6 +346,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
 # TODO: validate this.
 OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
 OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -4,6 +4,7 @@ import subprocess
 import ocrmypdf
 import pdftotext
 from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError
@@ -60,10 +61,22 @@ class RasterisedDocumentParser(DocumentParser):
        return out_path
-    def get_text(self):
+    def is_image(self, mime_type):
        return mime_type in [
            "image/png",
            "image/jpeg"
        ]
-        if self._text:
+    def get_dpi(self, image):
-            return self._text
+        try:
            with Image.open(image) as im:
                x, y = im.info['dpi']
                return x
        except Exception as e:
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None
    def parse(self, document_path, mime_type):
        archive_path = os.path.join(self.tempdir, "archive.pdf")
@@ -89,6 +102,22 @@ class RasterisedDocumentParser(DocumentParser):
        elif settings.OCR_MODE == 'force':
            ocr_args['force_ocr'] = True
        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")
        try:
            ocrmypdf.ocr(**ocr_args)
            # success! announce results