From fc26fe0ac08faebcc0037c36fe9565e0e7b33368 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <holmes.trenton@gmail.com>
Date: Sun, 22 May 2022 16:56:08 -0700
Subject: [PATCH 1/2] Passes the user-configured max pixel size to ocrmypdf

---
 src/paperless/settings.py          |  8 ++++----
 src/paperless_tesseract/parsers.py | 20 ++++++++++++++++++--
 src/paperless_text/parsers.py      |  4 +---
 3 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index cd3aafc25..7f484ad97 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -5,6 +5,7 @@ import multiprocessing
 import os
 import re
 from typing import Final
+from typing import Optional
 from typing import Set
 from urllib.parse import urlparse
 
@@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
     os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
 )
 
-OCR_MAX_IMAGE_PIXELS = os.environ.get(
-    "PAPERLESS_OCR_MAX_IMAGE_PIXELS",
-    256000000,
-)
+# Unset or empty leaves this as None; a limit is only applied when one is given.
+_max_pixels = os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")
+OCR_MAX_IMAGE_PIXELS: Optional[int] = int(_max_pixels) if _max_pixels else None
 
 OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 56313c5b4..f35d3a6b4 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
 
-Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
-
 
 class NoTextFoundException(Exception):
     pass
@@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
                     f"they will not be used. Error: {e}",
                 )
 
+        if settings.OCR_MAX_IMAGE_PIXELS is not None:
+            # The setting is expressed in pixels, while the ocrmypdf argument
+            # is expressed in megapixels, so convert before handing it over.
+            max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
+            if max_pixels_mpixels > 0:
+                self.log(
+                    "debug",
+                    f"Calculated {max_pixels_mpixels} megapixels for OCR",
+                )
+
+                ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+            else:
+                self.log(
+                    "warning",
+                    "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
+                    "this value must be greater than 0 if set",
+                )
+
         return ocrmypdf_args
 
     def parse(self, document_path, mime_type, file_name=None):
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index 9ef5fec40..fe7e823b3 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -6,8 +6,6 @@ from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont
 
-Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
-
 
 class TextDocumentParser(DocumentParser):
     """
@@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
         font = ImageFont.truetype(
             font=settings.THUMBNAIL_FONT_NAME,
             size=20,
-            layout_engine=ImageFont.LAYOUT_BASIC,
+            layout_engine=ImageFont.Layout.BASIC,
         )
         draw.text((5, 5), read_text(), font=font, fill="black")
 

From 985b77437808d9b468fdf1118dd6702412f46392 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <holmes.trenton@gmail.com>
Date: Mon, 23 May 2022 10:44:33 -0700
Subject: [PATCH 2/2] Reworks documentation on the max pixels setting

---
 docs/configuration.rst | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 2068a4238..b7ab978f4 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
     the produced PDF documents are A4 sized.
 
 PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
-    Paperless will not OCR images that have more pixels than this limit.
-    This is intended to prevent decompression bombs from overloading paperless.
-    Increasing this limit is desired if you face a DecompressionBombError despite
-    the concerning file not being malicious; this could e.g. be caused by invalidly
-    recognized metadata.
-    If you have enough resources or if you are certain that your uploaded files
-    are not malicious you can increase this value to your needs.
-    The default value is 256000000, an image with more pixels than that would not be parsed.
+    Paperless will raise a warning when OCRing images which are over this limit and
+    will not OCR images which are more than twice this limit.  Note this does not
+    prevent the document from being consumed, but could result in missing text content.
+
+    If unset, the limit defaults to the value determined by
+    `Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
+
+    .. note::
+
+        Increasing this limit could cause Paperless to consume additional resources
+        when consuming a file.  Be sure you have sufficient system resources.
+
+    .. caution::
+
+        The limit is intended to prevent malicious files from consuming system resources
+        and causing crashes and other errors.  Only increase this value if you are certain
+        your documents are not malicious and you need the text which was not OCRed.
 
 PAPERLESS_OCR_USER_ARGS=<json>
     OCRmyPDF offers many more options. Use this parameter to specify any