From 7e768bfe230482e59eb9eb23eef03b7519d17b24 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 28 Aug 2023 17:43:59 -0700 Subject: [PATCH] When PDF/A rendering fails, add a warning the user may want to allow it to continue --- src/paperless_tesseract/parsers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c6d066fbe..4dbebb589 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -304,6 +304,7 @@ class RasterisedDocumentParser(DocumentParser): import ocrmypdf from ocrmypdf import EncryptedPdfError from ocrmypdf import InputFileError + from ocrmypdf import SubprocessOutputError archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) @@ -333,6 +334,13 @@ class RasterisedDocumentParser(DocumentParser): ) if original_has_text: self.text = text_original + except SubprocessOutputError as e: + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", # noqa: E501 + ) + raise e except (NoTextFoundException, InputFileError) as e: self.log.warning( f"Encountered an error while running OCR: {e!s}. "