diff --git a/docs/configuration.rst b/docs/configuration.rst
index 82a14ae52..ab35b49d4 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -389,6 +389,15 @@ PAPERLESS_OCR_IMAGE_DPI=
     Default is none, which will automatically calculate image DPI so that
     the produced PDF documents are A4 sized.
 
+PAPERLESS_OCR_MAX_IMAGE_PIXELS=
+    Paperless will not OCR images that have more pixels than this limit.
+    This is intended to prevent decompression bombs from overloading paperless.
+    Increasing this limit is desired if you face a DecompressionBombError despite
+    the concerning file not being malicious; this could e.g. be caused by invalidly
+    recognized metadata.
+    If you have enough resources or if you are certain that your uploaded files
+    are not malicious you can increase this value to your needs.
+    The default is Pillow's built-in limit (``PIL.Image.MAX_IMAGE_PIXELS``).
 
 PAPERLESS_OCR_USER_ARGS=
     OCRmyPDF offers many more options. Use this parameter to specify any
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index ad167ecf0..4065890ec 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,6 +8,12 @@
 from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
+# Pillow compares this limit numerically, so the environment value (always a
+# str) has to be converted; unset or empty keeps Pillow's current limit.
+# NOTE(review): needs `os` imported above this hunk -- confirm.
+Image.MAX_IMAGE_PIXELS = int(
+    os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS') or Image.MAX_IMAGE_PIXELS
+)
 
 class NoTextFoundException(Exception):
     pass
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index a0f19c020..e41e25e76 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -6,6 +6,12 @@
 from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont
+# Pillow compares this limit numerically, so the environment value (always a
+# str) has to be converted; unset or empty keeps Pillow's current limit.
+# NOTE(review): needs `os` imported above this hunk -- confirm.
+Image.MAX_IMAGE_PIXELS = int(
+    os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS') or Image.MAX_IMAGE_PIXELS
+)
 
 class TextDocumentParser(DocumentParser):
     """