diff --git a/docs/configuration.md b/docs/configuration.md index e952ec41b..c0e8022ac 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -704,6 +704,20 @@ but could result in missing text content. this value if you are certain your documents are not malicious and you need the text which was not OCRed +#### [`PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY=`](#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY) {#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY} + +: Controls the Ghostscript color conversion strategy when creating the archive file. This setting +will only be utilized if the output is a version of PDF/A. + + Valid options are CMYK, Gray, LeaveColorUnchanged, RGB or UseDeviceIndependentColor. Defaults to RGB. + + You can find more on the settings [here](https://ghostscript.readthedocs.io/en/latest/VectorDevices.html#color-conversion-and-management) in the Ghostscript documentation. + + !!! warning + + Utilizing some of the options may result in errors when creating archive + files from PDFs. + #### [`PAPERLESS_OCR_USER_ARGS=`](#PAPERLESS_OCR_USER_ARGS) {#PAPERLESS_OCR_USER_ARGS} : OCRmyPDF offers many more options. 
Use this parameter to specify any diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 86f1f569f..9daeb8a47 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -864,6 +864,11 @@ OCR_MAX_IMAGE_PIXELS: Optional[int] = None if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")) +OCR_COLOR_CONVERSION_STRATEGY = os.getenv( + "PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY", + "RGB", +) + OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") # GNUPG needs a home directory for some reason diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 3523da7bd..babcf6bcf 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -186,6 +186,11 @@ class RasterisedDocumentParser(DocumentParser): "progress_bar": False, } + if "pdfa" in ocrmypdf_args["output_type"]: + ocrmypdf_args[ + "color_conversion_strategy" + ] = settings.OCR_COLOR_CONVERSION_STRATEGY + if settings.OCR_MODE == "force" or safe_fallback: ocrmypdf_args["force_ocr"] = True elif settings.OCR_MODE in ["skip", "skip_noarchive"]: