2021-01-01 22:19:43 +01:00

238 lines
7.8 KiB
Python

import json
import os
import re
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    def extract_metadata(self, document_path, mime_type):
        """
        Read the metadata entries of a PDF document.

        Returns a list of dicts with the keys "namespace", "prefix", "key"
        and "value", one per metadata entry. For any mime type other than
        'application/pdf' the list is empty. Entries that cannot be
        processed are logged as warnings and skipped.
        """
        namespace_pattern = re.compile(r"\{(.*)\}(.*)")

        result = []
        if mime_type == 'application/pdf':
            # Open the PDF in a context manager so the underlying file is
            # closed again once the metadata has been copied into `result`.
            with pikepdf.open(document_path) as pdf:
                meta = pdf.open_metadata()
                for key, value in meta.items():
                    if isinstance(value, list):
                        value = " ".join([str(e) for e in value])
                    value = str(value)
                    try:
                        # Keys look like "{namespace}name"; a key that does
                        # not match makes `m` None and ends up in the except
                        # branch below.
                        m = namespace_pattern.match(key)
                        result.append({
                            "namespace": m.group(1),
                            "prefix": meta.REVERSE_NS[m.group(1)],
                            "key": m.group(2),
                            "value": value
                        })
                    except Exception as e:
                        self.log(
                            "warning",
                            f"Error while reading metadata {key}: {value}. "
                            f"Error: {e}"
                        )
        return result

    def get_thumbnail(self, document_path, mime_type):
        """
        Create a thumbnail for the document via make_thumbnail_from_pdf.

        mime_type is accepted for interface compatibility but not used here.
        """
        return make_thumbnail_from_pdf(
            document_path, self.tempdir, self.logging_group)

    def is_image(self, mime_type):
        """
        Return True if mime_type is one of the image types handled here.
        """
        return mime_type in [
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
        ]

    def get_dpi(self, image):
        """
        Return the horizontal DPI stored in the image file.

        Returns None (and logs a warning) if the image cannot be opened or
        carries no 'dpi' entry.
        """
        try:
            with Image.open(image) as im:
                x, y = im.info['dpi']
                return x
        except Exception as e:
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None

    def calculate_a4_dpi(self, image):
        """
        Estimate the DPI of an image assuming it shows an A4 page.

        Returns None (and logs a warning) if the image cannot be read.
        """
        try:
            with Image.open(image) as im:
                width, height = im.size
                # divide image width by A4 width (210mm) in inches.
                dpi = int(width / (21 / 2.54))
                self.log(
                    'debug',
                    f"Estimated DPI {dpi} based on image width {width}"
                )
                return dpi
        except Exception as e:
            self.log(
                'warning',
                f"Error while calculating DPI for image {image}: {e}")
            return None

    def parse(self, document_path, mime_type, file_name=None):
        """
        Run OCRmyPDF on the document and store the extracted text.

        On success self.archive_path points to the generated archive PDF
        and self.text holds the text read from it. If OCRmyPDF rejects the
        input file, the text already present in the original is used
        instead and no archive path is set.

        Raises ParseError if the configured OCR mode is invalid, if no DPI
        can be determined for an image, or if OCRmyPDF fails and there is
        no text to fall back to.
        """
        mode = settings.OCR_MODE

        text_original = get_text_from_pdf(document_path)
        # Only treat the document as containing text if there is a
        # meaningful amount of it.
        has_text = bool(text_original and len(text_original) > 50)

        if mode == "skip_noarchive" and has_text:
            self.log("debug",
                     "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

        if mode in ['skip', 'skip_noarchive'] and not has_text:
            # upgrade to redo, since there appears to be no text in the
            # document. This happens to some weird encrypted documents or
            # documents with failed OCR attempts for which OCRmyPDF will
            # still report that there actually is text in them.
            self.log("debug",
                     "No text was found in the document and skip is "
                     "specified. Upgrading OCR mode to redo.")
            mode = "redo"

        archive_path = os.path.join(self.tempdir, "archive.pdf")

        ocr_args = {
            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
            'language': settings.OCR_LANGUAGE,
            'output_type': settings.OCR_OUTPUT_TYPE,
            'progress_bar': False,
            'clean': True
        }

        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

        # Mode selection.
        if mode in ['skip', 'skip_noarchive']:
            ocr_args['skip_text'] = True
        elif mode == 'redo':
            ocr_args['redo_ocr'] = True
        elif mode == 'force':
            ocr_args['force_ocr'] = True
        else:
            raise ParseError(
                f"Invalid ocr mode: {mode}")

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            a4_dpi = self.calculate_a4_dpi(document_path)
            # Preference order: DPI stored in the image, then the
            # configured default, then the A4-based estimate.
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            elif a4_dpi:
                ocr_args['image_dpi'] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

        if settings.OCR_USER_ARGS:
            try:
                # User supplied arguments override the defaults built above.
                user_args = json.loads(settings.OCR_USER_ARGS)
                ocr_args = {**ocr_args, **user_args}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used: {e}")

        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
            self.log("debug",
                     f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

        except (InputFileError, EncryptedPdfError) as e:

            self.log("debug",
                     f"Encountered an error: {e}. Trying to use text from "
                     f"original.")
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = text_original
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e) from e

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(e) from e

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
                f"Document {document_path} does not have any text. "
                f"This is probably an error or you tried to add an image "
                f"without text, or something is wrong with this document.")
            self.text = ""
def strip_excess_whitespace(text):
    """
    Normalise the whitespace of extracted document text.

    Runs of whitespace other than line breaks are collapsed to a single
    space, whitespace directly after and directly before a line break is
    removed, and finally the whole text is stripped.

    Returns None if text is empty or None.
    """
    if not text:
        return None

    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
    # A bare "$" only anchors at the very end of the string, which left
    # trailing blanks in front of every inner line break. The lookahead
    # matches before LF and CR as well as at the end of the text.
    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+(?=[\n\r]|$)", "", no_leading_whitespace)

    # TODO: this needs a rework
    return no_trailing_whitespace.strip()
def get_text_from_pdf(pdf_file):
    """
    Extract the text of a PDF file using pdftotext.

    The text of all pages is joined with newlines and passed through
    strip_excess_whitespace. Returns None if pdf_file is not an existing
    regular file or if pdftotext cannot read it.
    """
    if os.path.isfile(pdf_file):
        with open(pdf_file, "rb") as handle:
            try:
                pages = pdftotext.PDF(handle)
            except pdftotext.Error:
                # might not be a PDF file
                return None
            joined = "\n".join(pages)
        return strip_excess_whitespace(joined)

    return None