From a03315102ac1bc38ac827c34867a57278a594cf5 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 19:37:48 +0100 Subject: [PATCH] added image DPI detection to the tesseract parser. --- docs/configuration.rst | 13 +++++++++++ paperless.conf.example | 1 + src/paperless/settings.py | 2 ++ src/paperless_tesseract/parsers.py | 35 +++++++++++++++++++++++++++--- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index ad1c7c117..75d0a0b4c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -250,6 +250,19 @@ PAPERLESS_OCR_OUTPUT_TYPE= If not specified, ``pdfa`` is used. Remember that paperless also keeps the original input file as well as the archived version. +PAPERLESS_OCR_IMAGE_DPI= + Paperless will OCR any images you put into the system and convert them + into PDF documents. This is useful if your scanner produces images. + In order to do so, paperless needs to know the DPI of the image. + Most images from scanners will have this information embedded and + paperless will detect and use that information. In case this fails, it + uses this value as a fallback. + + Set this to the DPI your scanner produces images at. + + Default is none, which causes paperless to fail if no DPI information is + present in an image. + PAPERLESS_CONSUMER_POLLING= If paperless won't find documents added to your consume folder, it might not be able to automatically detect filesystem changes. In that case, diff --git a/paperless.conf.example b/paperless.conf.example index 685e09fab..e19f4ba0e 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -40,6 +40,7 @@ #PAPERLESS_OCR_LANGUAGE=eng #PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_MODE=skip +#PAPERLESS_OCR_IMAGE_DPI=300 #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false #PAPERLESS_CONVERT_MEMORY_LIMIT=0 diff --git a/src/paperless/settings.py b/src/paperless/settings.py index c49271731..79b454649 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -346,6 +346,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") # TODO: validate this. OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") +OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") + # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b72f95e2d..7e10de5c5 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -4,6 +4,7 @@ import subprocess import ocrmypdf import pdftotext +from PIL import Image from django.conf import settings from ocrmypdf import InputFileError @@ -60,10 +61,22 @@ class RasterisedDocumentParser(DocumentParser): return out_path - def get_text(self): + def is_image(self, mime_type): + return mime_type in [ + "image/png", + "image/jpeg" + ] - if self._text: - return self._text + def get_dpi(self, image): + try: + with Image.open(image) as im: + x, y = im.info['dpi'] + return x + except Exception as e: + self.log( + 'warning', + f"Error while getting DPI from image {image}: {e}") + return None def parse(self, document_path, mime_type): archive_path = os.path.join(self.tempdir, "archive.pdf") @@ -89,6 +102,22 @@ class RasterisedDocumentParser(DocumentParser): elif settings.OCR_MODE == 'force': ocr_args['force_ocr'] = True + if self.is_image(mime_type): + dpi = self.get_dpi(document_path) + if dpi: + self.log( + "debug", + f"Detected DPI for image {document_path}: {dpi}" + ) + ocr_args['image_dpi'] = dpi + elif settings.OCR_IMAGE_DPI: + ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI + else: + raise ParseError( + f"Cannot produce archive PDF for image {document_path}, " + f"no DPI information is present in this image and " + f"OCR_IMAGE_DPI is not set.") + try: ocrmypdf.ocr(**ocr_args) # success! announce results