implement PAPERLESS_OCR_MAX_IMAGE_PIXELS

2026-02-24 00:59:35 -06:00 · 2022-03-19 01:03:45 +01:00
parent 9a758fc3dc
commit a8887b211e
3 changed files with 11 additions and 0 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -389,6 +389,15 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
    Default is none, which will automatically calculate image DPI so that
    the produced PDF documents are A4 sized.

+PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
+    Paperless will not OCR images that have more pixels than this limit.
+    This is intended to prevent decompression bombs from overloading paperless.
+    Increase this limit if you encounter a DecompressionBombError for a file
+    that is not malicious; this can happen, for example, when the image's
+    metadata is read incorrectly.
+    If you have enough resources, or if you are certain that your uploaded
+    files are not malicious, you can increase this value to suit your needs.
+    If unset, Pillow's built-in ``Image.MAX_IMAGE_PIXELS`` default is used.

 PAPERLESS_OCR_USER_ARGS=<json>
    OCRmyPDF offers many more options. Use this parameter to specify any
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,6 +8,7 @@ from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image

+Image.MAX_IMAGE_PIXELS = int(os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS', Image.MAX_IMAGE_PIXELS))  # env values are str; Pillow compares this limit numerically

 class NoTextFoundException(Exception):
    pass
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -6,6 +6,7 @@ from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont

+Image.MAX_IMAGE_PIXELS = int(os.environ.get('PAPERLESS_OCR_MAX_IMAGE_PIXELS', Image.MAX_IMAGE_PIXELS))  # env values are str; Pillow compares this limit numerically

 class TextDocumentParser(DocumentParser):
    """