2020-11-25 19:37:48 +01:00

174 lines
5.7 KiB
Python

import os
import re
import subprocess
import ocrmypdf
import pdftotext
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError
from documents.parsers import DocumentParser, ParseError, run_convert
class RasterisedDocumentParser(DocumentParser):
    """
    This parser runs OCRmyPDF over a rasterised document, whether it's a
    PDF or another graphical format (JPEG, PNG, etc.), to produce an
    archive PDF and to extract the text contained in it.
    """

    def get_thumbnail(self, document_path, mime_type):
        """
        Render the first page of the document as a PNG thumbnail that is at
        most 500px wide.

        ImageMagick's ``convert`` is tried first. If that fails with a
        ParseError, the document is rendered to a PNG with Ghostscript and
        ``convert`` is run again on that intermediate image.

        :param document_path: path of the document to render
        :param mime_type: mime type of the document (not used here)
        :return: path of the generated thumbnail inside ``self.tempdir``
        :raises ParseError: if Ghostscript exits with a non-zero status, or
            if the second ``convert`` run fails as well
        """
        out_path = os.path.join(self.tempdir, "convert.png")

        # Run convert to get a decent thumbnail
        try:
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=True,
                        # "[0]" selects the first page / frame only
                        input_file="{}[0]".format(document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                'warning',
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [settings.GS_BINARY,
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
                   document_path]
            if subprocess.Popen(cmd).wait() != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=True,
                        input_file=gs_out_path,
                        output_file=out_path,
                        logging_group=self.logging_group)

        return out_path

    def is_image(self, mime_type):
        """
        Return True if ``mime_type`` is one of the image types for which
        ``parse`` needs to supply DPI information (PNG and JPEG).
        """
        return mime_type in [
            "image/png",
            "image/jpeg"
        ]

    def get_dpi(self, image):
        """
        Read the horizontal resolution from the metadata of an image.

        :param image: path of (or file object for) the image to inspect
        :return: the horizontal DPI rounded to the nearest integer, or
            ``None`` if the image cannot be opened or carries no usable
            DPI information (a warning is logged in that case)
        """
        try:
            with Image.open(image) as im:
                x, _y = im.info['dpi']
                # The stored resolution may be fractional (e.g. when it is
                # derived from pixels-per-metre), while the value is passed
                # on as OCRmyPDF's integer ``image_dpi`` option.
                return round(x)
        except Exception as e:
            # Deliberately best effort: any failure here just means that
            # the caller has to fall back to the configured default DPI.
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None

    def parse(self, document_path, mime_type):
        """
        OCR the document and store the results on this parser.

        On success, ``self.archive_path`` points to the generated archive
        PDF and ``self.text`` holds the text extracted from it (an empty
        string if the archive contains no text). If OCRmyPDF rejects the
        input file, the text already present in the original document is
        used instead and no archive path is set.

        :param document_path: path of the document to process
        :param mime_type: mime type of the document; for PNG and JPEG
            images a DPI value is passed on to OCRmyPDF
        :raises ParseError: if an image has no DPI information and
            ``settings.OCR_IMAGE_DPI`` is not set, if OCRmyPDF rejects the
            input and the original contains no text either, or if any
            other error occurs during OCR or text extraction
        """
        archive_path = os.path.join(self.tempdir, "archive.pdf")

        ocr_args = {
            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
            'language': settings.OCR_LANGUAGE,
            'output_type': settings.OCR_OUTPUT_TYPE,
            'progress_bar': False,
            'clean': True
        }

        # Only restrict the page range if a positive limit is configured.
        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

        # Any other OCR_MODE value leaves all three options unset.
        if settings.OCR_MODE == 'skip':
            ocr_args['skip_text'] = True
        elif settings.OCR_MODE == 'redo':
            ocr_args['redo_ocr'] = True
        elif settings.OCR_MODE == 'force':
            ocr_args['force_ocr'] = True

        if self.is_image(mime_type):
            # Prefer the DPI stored in the image itself and only then fall
            # back to the globally configured value.
            dpi = self.get_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    "no DPI information is present in this image and "
                    "OCR_IMAGE_DPI is not set.")

        try:
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)
        except InputFileError as e:
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = get_text_from_pdf(document_path)
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e) from e
        except Exception as e:
            # Anything else is probably serious. Keep the original exception
            # chained so that its traceback is not lost.
            raise ParseError(e) from e

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
                f"Document {document_path} does not have any text. "
                "This is probably an error or you tried to add an image "
                "without text.")
            # Normalise None to an empty string.
            self.text = ""
def strip_excess_whitespace(text):
    """
    Normalise the horizontal whitespace of ``text``.

    Three substitutions are applied in order:

    1. every run of whitespace other than line breaks becomes one space,
    2. whitespace directly following a run of line breaks is dropped,
    3. whitespace at the very end of the text (or right before a final
       newline) is dropped.

    Returns ``None`` if ``text`` is empty or ``None``.
    """
    if text:
        # (pattern, replacement) pairs, applied strictly in this order.
        substitutions = (
            (r"([^\S\r\n]+)", " "),
            (r"([\n\r]+)([^\S\n\r]+)", '\\1'),
            (r"([^\S\n\r]+)$", ''),
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned

    return None
def get_text_from_pdf(pdf_file):
    """
    Extract the text of a PDF file using pdftotext.

    The pages are joined with newlines and the result is passed through
    ``strip_excess_whitespace``. Returns ``None`` if pdftotext raises
    ``pdftotext.Error`` while loading the file.
    """
    with open(pdf_file, "rb") as stream:
        try:
            pages = pdftotext.PDF(stream)
        except pdftotext.Error:
            # pdftotext could not load the file: report "no text".
            return None
        else:
            raw_text = "\n".join(pages)

    return strip_excess_whitespace(raw_text)