From ce121a261d3c4f88b829002106d172e9c892f8f5 Mon Sep 17 00:00:00 2001 From: jonaswinkler <17569239+jonaswinkler@users.noreply.github.com> Date: Sun, 21 Feb 2021 00:16:57 +0100 Subject: [PATCH] completely reworked the OCRmyPDF parser. --- paperless.conf.example | 4 + src/paperless/settings.py | 8 + src/paperless_tesseract/parsers.py | 294 +++++++++++++++++------------ 3 files changed, 190 insertions(+), 116 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index 652a741b0..9c799b083 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -41,6 +41,10 @@ #PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_IMAGE_DPI=300 +#PAPERLESS_OCR_CLEAN=clean +#PAPERLESS_OCR_DESKEW=false +#PAPERLESS_OCR_ROTATE_PAGES=false +#PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD=10 #PAPERLESS_OCR_USER_ARGS={} #PAPERLESS_CONVERT_MEMORY_LIMIT=0 #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 42ddec88d..abfb1afba 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -449,6 +449,14 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") +OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") + +OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW") + +OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES") + +OCR_ROTATE_PAGES_THRESHOLD = float(os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 10.0)) + OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") # GNUPG needs a home directory for some reason diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 78c335ac3..dac364447 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -9,6 +9,10 @@ from documents.parsers import DocumentParser, ParseError, \ make_thumbnail_from_pdf +class NoTextFoundException(Exception): + pass + + class RasterisedDocumentParser(DocumentParser): """ This parser uses Tesseract to try and get some text out of a rasterised @@ -18,12 +22,13 @@ class RasterisedDocumentParser(DocumentParser): logging_name = "paperless.parsing.tesseract" def extract_metadata(self, document_path, mime_type): - import pikepdf - - namespace_pattern = re.compile(r"\{(.*)\}(.*)") result = [] if mime_type == 'application/pdf': + import pikepdf + + namespace_pattern = re.compile(r"\{(.*)\}(.*)") + pdf = pikepdf.open(document_path) meta = pdf.open_metadata() for key, value in meta.items(): @@ -88,125 +93,199 @@ class RasterisedDocumentParser(DocumentParser): f"Error while calculating DPI for image {image}: {e}") return None + def extract_text(self, sidecar_file, pdf_file): + if sidecar_file and os.path.isfile(sidecar_file): + with open(sidecar_file, "r") as f: + text = f.read() + + if "[OCR skipped on page" not in text: + # This happens when there's already text in the input file. + # The sidecar file will only contain text for OCR'ed pages. + self.log("debug", "Using text from sidecar file") + return text + else: + self.log("debug", "Incomplete sidecar file: discarding.") + + # no success with the sidecar file, try PDF + + if not os.path.isfile(pdf_file): + return None + + from pdfminer.high_level import extract_text + from pdfminer.pdftypes import PDFException + + try: + text = extract_text(pdf_file) + stripped = strip_excess_whitespace(text) + self.log("debug", f"Extracted text from PDF file {pdf_file}") + return stripped + except PDFException: + # probably not a PDF file. + return None + + def construct_ocrmypdf_parameters(self, + input_file, + mime_type, + output_file, + sidecar_file, + safe_fallback=False): + ocrmypdf_args = { + 'input_file': input_file, + 'output_file': output_file, + # need to use threads, since this will be run in daemonized + # processes by django-q. + 'use_threads': True, + 'jobs': settings.THREADS_PER_WORKER, + 'language': settings.OCR_LANGUAGE, + 'output_type': settings.OCR_OUTPUT_TYPE, + 'progress_bar': False + } + + if settings.OCR_MODE == 'force' or safe_fallback: + ocrmypdf_args['force_ocr'] = True + elif settings.OCR_MODE in ['skip', 'skip_noarchive']: + ocrmypdf_args['skip_text'] = True + elif settings.OCR_MODE == 'redo': + ocrmypdf_args['redo_ocr'] = True + else: + raise ParseError( + f"Invalid ocr mode: {settings.OCR_MODE}") + + if settings.OCR_CLEAN == 'clean': + ocrmypdf_args['clean'] = True + elif settings.OCR_CLEAN == 'clean-final': + ocrmypdf_args['clean_final'] = True + + if settings.OCR_DESKEW: + ocrmypdf_args['deskew'] = True + + if settings.OCR_ROTATE_PAGES: + ocrmypdf_args['rotate_pages'] = True + ocrmypdf_args['rotate_pages_threshold'] = settings.OCR_ROTATE_PAGES_THRESHOLD # NOQA: E501 + + if settings.OCR_PAGES > 0: + ocrmypdf_args['pages'] = f"1-{settings.OCR_PAGES}" + else: + # sidecar is incompatible with pages + ocrmypdf_args['sidecar'] = sidecar_file + + if self.is_image(mime_type): + dpi = self.get_dpi(input_file) + a4_dpi = self.calculate_a4_dpi(input_file) + if dpi: + self.log( + "debug", + f"Detected DPI for image {input_file}: {dpi}" + ) + ocrmypdf_args['image_dpi'] = dpi + elif settings.OCR_IMAGE_DPI: + ocrmypdf_args['image_dpi'] = settings.OCR_IMAGE_DPI + elif a4_dpi: + ocrmypdf_args['image_dpi'] = a4_dpi + else: + raise ParseError( + f"Cannot produce archive PDF for image {input_file}, " + f"no DPI information is present in this image and " + f"OCR_IMAGE_DPI is not set.") + + if settings.OCR_USER_ARGS and not safe_fallback: + try: + user_args = json.loads(settings.OCR_USER_ARGS) + ocrmypdf_args = {**ocrmypdf_args, **user_args} + except Exception as e: + self.log( + "warning", + f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " + f"they will not be used. Error: {e}") + + return ocrmypdf_args + def parse(self, document_path, mime_type, file_name=None): - import ocrmypdf - from ocrmypdf import InputFileError, EncryptedPdfError + # This forces tesseract to use one core per page. + os.environ['OMP_THREAD_LIMIT'] = "1" - mode = settings.OCR_MODE + text_original = self.extract_text(None, document_path) + original_has_text = text_original and len(text_original) > 50 - text_original = get_text_from_pdf(document_path) - has_text = text_original and len(text_original) > 50 - - if mode == "skip_noarchive" and has_text: + if settings.OCR_MODE == "skip_noarchive" and original_has_text: self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return - if mode in ['skip', 'skip_noarchive'] and not has_text: - # upgrade to redo, since there appears to be no text in the - # document. This happens to some weird encrypted documents or - # documents with failed OCR attempts for which OCRmyPDF will - # still report that there actually is text in them. - self.log("debug", - "No text was found in the document and skip is " - "specified. Upgrading OCR mode to redo.") - mode = "redo" + import ocrmypdf + from ocrmypdf import InputFileError, EncryptedPdfError archive_path = os.path.join(self.tempdir, "archive.pdf") + sidecar_file = os.path.join(self.tempdir, "sidecar.txt") - ocr_args = { - 'input_file': document_path, - 'output_file': archive_path, - 'use_threads': True, - 'jobs': settings.THREADS_PER_WORKER, - 'language': settings.OCR_LANGUAGE, - 'output_type': settings.OCR_OUTPUT_TYPE, - 'progress_bar': False, - 'clean': True - } - - if settings.OCR_PAGES > 0: - ocr_args['pages'] = f"1-{settings.OCR_PAGES}" - - # Mode selection. - - if mode in ['skip', 'skip_noarchive']: - ocr_args['skip_text'] = True - elif mode == 'redo': - ocr_args['redo_ocr'] = True - elif mode == 'force': - ocr_args['force_ocr'] = True - else: - raise ParseError( - f"Invalid ocr mode: {mode}") - - if self.is_image(mime_type): - dpi = self.get_dpi(document_path) - a4_dpi = self.calculate_a4_dpi(document_path) - if dpi: - self.log( - "debug", - f"Detected DPI for image {document_path}: {dpi}" - ) - ocr_args['image_dpi'] = dpi - elif settings.OCR_IMAGE_DPI: - ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI - elif a4_dpi: - ocr_args['image_dpi'] = a4_dpi - else: - raise ParseError( - f"Cannot produce archive PDF for image {document_path}, " - f"no DPI information is present in this image and " - f"OCR_IMAGE_DPI is not set.") - - if settings.OCR_USER_ARGS: - try: - user_args = json.loads(settings.OCR_USER_ARGS) - ocr_args = {**ocr_args, **user_args} - except Exception as e: - self.log( - "warning", - f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " - f"they will not be used: {e}") - - # This forces tesseract to use one core per page. - os.environ['OMP_THREAD_LIMIT'] = "1" + args = self.construct_ocrmypdf_parameters( + document_path, mime_type, archive_path, sidecar_file) try: - self.log("debug", - f"Calling OCRmyPDF with {str(ocr_args)}") - ocrmypdf.ocr(**ocr_args) - # success! announce results + self.log("debug", f"Calling OCRmyPDF with args: {args}") + ocrmypdf.ocr(**args) + self.archive_path = archive_path - self.text = get_text_from_pdf(archive_path) + self.text = self.extract_text(sidecar_file, archive_path) - except (InputFileError, EncryptedPdfError) as e: - - self.log("debug", - f"Encountered an error: {e}. Trying to use text from " - f"original.") - # This happens with some PDFs when used with the redo_ocr option. - # This is not the end of the world, we'll just use what we already - # have in the document. - self.text = text_original - # Also, no archived file. if not self.text: - # However, if we don't have anything, fail: + raise NoTextFoundException( + "No text was found in the original document") + except EncryptedPdfError: + self.log("warning", + "This file is encrypted, OCR is impossible. Using " + "any text present in the original file.") + if original_has_text: + self.text = text_original + except (NoTextFoundException, InputFileError) as e: + self.log("exception", + f"Encountered the following error while running OCR, " + f"attempting force OCR to get the text.") + + archive_path_fallback = os.path.join( + self.tempdir, "archive-fallback.pdf") + sidecar_file_fallback = os.path.join( + self.tempdir, "sidecar-fallback.txt") + + # Attempt to run OCR with safe settings. + + args = self.construct_ocrmypdf_parameters( + document_path, mime_type, + archive_path_fallback, sidecar_file_fallback, + safe_fallback=True + ) + + try: + self.log("debug", + f"Fallback: Calling OCRmyPDF with args: {args}") + ocrmypdf.ocr(**args) + + # Don't return the archived file here, since this file + # is bigger and blurry due to --force-ocr. + + self.text = self.extract_text( + sidecar_file_fallback, archive_path_fallback) + + except Exception as e: + # If this fails, we have a serious issue at hand. raise ParseError(f"{e.__class__.__name__}: {str(e)}") except Exception as e: # Anything else is probably serious. raise ParseError(f"{e.__class__.__name__}: {str(e)}") + # As a last resort, if we still don't have any text for any reason, + # try to extract the text from the original document. if not self.text: - # This may happen for files that don't have any text. - self.log( - 'warning', - f"Document {document_path} does not have any text. " - f"This is probably an error or you tried to add an image " - f"without text, or something is wrong with this document.") - self.text = "" + if original_has_text: + self.text = text_original + else: + self.log( + "warning", + f"No text was found in {document_path}, the content will " + f"be empty." + ) def strip_excess_whitespace(text): @@ -222,20 +301,3 @@ def strip_excess_whitespace(text): # TODO: this needs a rework return no_trailing_whitespace.strip() - -def get_text_from_pdf(pdf_file): - import pdftotext - - if not os.path.isfile(pdf_file): - return None - - with open(pdf_file, "rb") as f: - try: - pdf = pdftotext.PDF(f) - except pdftotext.Error: - # might not be a PDF file - return None - - text = "\n".join(pdf) - - return strip_excess_whitespace(text)