added image DPI detection to the tesseract parser.

This commit is contained in:
Jonas Winkler 2020-11-25 19:37:48 +01:00
parent df801d17e1
commit a03315102a
4 changed files with 48 additions and 3 deletions

View File

@ -250,6 +250,19 @@ PAPERLESS_OCR_OUTPUT_TYPE=<type>
If not specified, ``pdfa`` is used. Remember that paperless also keeps
the original input file as well as the archived version.
PAPERLESS_OCR_IMAGE_DPI=<num>
Paperless will OCR any images you put into the system and convert them
into PDF documents. This is useful if your scanner produces images.
In order to do so, paperless needs to know the DPI of the image.
Most images from scanners will have this information embedded and
paperless will detect and use that information. In case this fails, it
uses this value as a fallback.
Set this to the DPI your scanner produces images at, as a whole number
(for example ``300``).
By default this is not set, which causes paperless to fail if no DPI
information is present in an image.
PAPERLESS_CONSUMER_POLLING=<num>
If paperless does not pick up documents added to your consume folder, it might
not be able to automatically detect filesystem changes. In that case,

View File

@ -40,6 +40,7 @@
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_MODE=skip
#PAPERLESS_OCR_IMAGE_DPI=300
#PAPERLESS_CONSUMER_POLLING=10
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
#PAPERLESS_CONVERT_MEMORY_LIMIT=0

View File

@ -346,6 +346,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
# Fallback DPI for images that carry no embedded DPI information.
# Environment variables are always strings, so convert to an integer here;
# an unset or empty variable means "no fallback" (None).
_ocr_image_dpi = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_IMAGE_DPI = int(_ocr_image_dpi) if _ocr_image_dpi else None
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")

View File

@ -4,6 +4,7 @@ import subprocess
import ocrmypdf
import pdftotext
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError
@ -60,10 +61,22 @@ class RasterisedDocumentParser(DocumentParser):
return out_path
def get_text(self):
def is_image(self, mime_type):
    """Tell whether *mime_type* names one of the handled image formats.

    Only PNG and JPEG are recognised; every other value yields False.
    """
    image_mime_types = ("image/png", "image/jpeg")
    return mime_type in image_mime_types
if self._text:
return self._text
def get_dpi(self, image):
    """Read the horizontal DPI embedded in an image file.

    Args:
        image: The image to inspect; handed directly to ``Image.open``.

    Returns:
        The horizontal resolution rounded to the nearest integer, or
        ``None`` when the image cannot be opened or has no ``dpi`` entry
        in its metadata. A warning is logged in the ``None`` case.
    """
    try:
        with Image.open(image) as im:
            # Only the horizontal resolution is used; the vertical value
            # is unpacked and discarded.
            x, _ = im.info['dpi']
            # The embedded value is not guaranteed to be a whole number,
            # so normalise it to an integer DPI before returning it.
            return round(x)
    except Exception as e:
        # Deliberately best-effort: any failure (unreadable file, missing
        # or malformed DPI metadata) is reported and treated as "unknown".
        self.log(
            'warning',
            f"Error while getting DPI from image {image}: {e}")
        return None
def parse(self, document_path, mime_type):
archive_path = os.path.join(self.tempdir, "archive.pdf")
@ -89,6 +102,22 @@ class RasterisedDocumentParser(DocumentParser):
elif settings.OCR_MODE == 'force':
ocr_args['force_ocr'] = True
if self.is_image(mime_type):
dpi = self.get_dpi(document_path)
if dpi:
self.log(
"debug",
f"Detected DPI for image {document_path}: {dpi}"
)
ocr_args['image_dpi'] = dpi
elif settings.OCR_IMAGE_DPI:
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
else:
raise ParseError(
f"Cannot produce archive PDF for image {document_path}, "
f"no DPI information is present in this image and "
f"OCR_IMAGE_DPI is not set.")
try:
ocrmypdf.ocr(**ocr_args)
# success! announce results