Adds new setting to control color conversions (#4709)

This commit is contained in:
Trenton H 2023-11-29 12:18:44 -08:00 committed by GitHub
parent e1b573adeb
commit e3f4e0b775
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 24 additions and 0 deletions

View File

@ -704,6 +704,20 @@ but could result in missing text content.
this value if you are certain your documents are not malicious and
you need the text which was not OCRed
#### [`PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY=<RGB>`](#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY) {#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY}
: Controls the Ghostscript color conversion strategy when creating the archive file. This setting
will only be utilized if the output is a version of PDF/A.
Valid options are CMYK, Gray, LeaveColorUnchanged, RGB or UseDeviceIndependentColor.
You can find more on the settings [here](https://ghostscript.readthedocs.io/en/latest/VectorDevices.html#color-conversion-and-management) in the Ghostscript documentation.
!!! warning
Utilizing some of the options may result in errors when creating archive
files from PDFs.
#### [`PAPERLESS_OCR_USER_ARGS=<json>`](#PAPERLESS_OCR_USER_ARGS) {#PAPERLESS_OCR_USER_ARGS}
: OCRmyPDF offers many more options. Use this parameter to specify any

View File

@ -864,6 +864,11 @@ OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
"PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
"RGB",
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
# GNUPG needs a home directory for some reason

View File

@ -186,6 +186,11 @@ class RasterisedDocumentParser(DocumentParser):
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = settings.OCR_COLOR_CONVERSION_STRATEGY
if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]: