Updates to pass the user-provided max pixel size to ocrmypdf

2026-02-07 23:42:46 -06:00 · 2022-05-22 16:56:08 -07:00
parent feaf2da834
commit fc26fe0ac0
3 changed files with 23 additions and 9 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image

-Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
-

 class NoTextFoundException(Exception):
    pass
@@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
                    f"they will not be used. Error: {e}",
                )

+        if settings.OCR_MAX_IMAGE_PIXELS is not None:
+            # Convert pixels to mega-pixels and provide to ocrmypdf
+            max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
+            if max_pixels_mpixels > 0:
+
+                self.log(
+                    "debug",
+                    f"Calculated {max_pixels_mpixels} megapixels for OCR",
+                )
+
+                ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+            else:
+                self.log(
+                    "warning",
+                    "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
+                    "this value must be at least 1 megapixel if set",
+                )
+
        return ocrmypdf_args

    def parse(self, document_path, mime_type, file_name=None):