Adds new setting to control color conversions (#4709)

This commit is contained in:
Trenton H 2023-11-29 12:18:44 -08:00 committed by GitHub
parent e1b573adeb
commit e3f4e0b775
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 24 additions and 0 deletions

View File

@ -704,6 +704,20 @@ but could result in missing text content.
this value if you are certain your documents are not malicious and
you need the text which was not OCRed
#### [`PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY=<RGB>`](#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY) {#PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY}
: Controls the Ghostscript color conversion strategy used when creating the archive file. This setting
only takes effect when the archive output type is a version of PDF/A.
Valid options are `CMYK`, `Gray`, `LeaveColorUnchanged`, `RGB` or `UseDeviceIndependentColor`. Defaults to `RGB`.
For details on each option, see [Color Conversion and Management](https://ghostscript.readthedocs.io/en/latest/VectorDevices.html#color-conversion-and-management) in the Ghostscript documentation.
!!! warning
Utilizing some of the options may result in errors when creating archive
files from PDFs.
#### [`PAPERLESS_OCR_USER_ARGS=<json>`](#PAPERLESS_OCR_USER_ARGS) {#PAPERLESS_OCR_USER_ARGS}
: OCRmyPDF offers many more options. Use this parameter to specify any

View File

@ -864,6 +864,11 @@ OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")) OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
# Color conversion strategy passed to OCRmyPDF for PDF/A output; "RGB" if unset.
OCR_COLOR_CONVERSION_STRATEGY = os.environ.get(
    "PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY", "RGB"
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
# GNUPG needs a home directory for some reason # GNUPG needs a home directory for some reason

View File

@ -186,6 +186,11 @@ class RasterisedDocumentParser(DocumentParser):
"progress_bar": False, "progress_bar": False,
} }
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = settings.OCR_COLOR_CONVERSION_STRATEGY
if settings.OCR_MODE == "force" or safe_fallback: if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]: elif settings.OCR_MODE in ["skip", "skip_noarchive"]: