diff --git a/docs/configuration.rst b/docs/configuration.rst index 2068a4238..b7ab978f4 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI= the produced PDF documents are A4 sized. PAPERLESS_OCR_MAX_IMAGE_PIXELS= - Paperless will not OCR images that have more pixels than this limit. - This is intended to prevent decompression bombs from overloading paperless. - Increasing this limit is desired if you face a DecompressionBombError despite - the concerning file not being malicious; this could e.g. be caused by invalidly - recognized metadata. - If you have enough resources or if you are certain that your uploaded files - are not malicious you can increase this value to your needs. - The default value is 256000000, an image with more pixels than that would not be parsed. + Paperless will raise a warning when OCRing images which are over this limit and + will not OCR images which are more than twice this limit. Note this does not + prevent the document from being consumed, but could result in missing text content. + + If unset, will default to the value determined by + `Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_. + + .. note:: + + Increasing this limit could cause Paperless to consume additional resources + when consuming a file. Be sure you have sufficient system resources. + + .. caution:: + + The limit is intended to prevent malicious files from consuming system resources + and causing crashes and other errors. Only increase this value if you are certain + your documents are not malicious and you need the text which was not OCRed. PAPERLESS_OCR_USER_ARGS= OCRmyPDF offers many more options. 
Use this parameter to specify any diff --git a/src/paperless/settings.py b/src/paperless/settings.py index cd3aafc25..7f484ad97 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -5,6 +5,7 @@ import multiprocessing import os import re from typing import Final +from typing import Optional from typing import Set from urllib.parse import urlparse @@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float( os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), ) -OCR_MAX_IMAGE_PIXELS = os.environ.get( - "PAPERLESS_OCR_MAX_IMAGE_PIXELS", - 256000000, -) +OCR_MAX_IMAGE_PIXELS: Optional[int] = None +if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: + OCR_MAX_IMAGE_PIXELS = int(os.environ["PAPERLESS_OCR_MAX_IMAGE_PIXELS"]) OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 56313c5b4..f35d3a6b4 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf from documents.parsers import ParseError from PIL import Image -Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS - class NoTextFoundException(Exception): pass @@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser): f"they will not be used. 
Error: {e}", ) + if settings.OCR_MAX_IMAGE_PIXELS is not None: + # Convert pixels to mega-pixels and provide to ocrmypdf + max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 + if max_pixels_mpixels > 0: + + self.log( + "debug", + f"Calculated {max_pixels_mpixels} megapixels for OCR", + ) + + ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels + else: + self.log( + "warning", + "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " + "this value must be greater than 0 if set", + ) + return ocrmypdf_args def parse(self, document_path, mime_type, file_name=None): diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 9ef5fec40..fe7e823b3 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -6,8 +6,6 @@ from PIL import Image from PIL import ImageDraw from PIL import ImageFont -Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS - class TextDocumentParser(DocumentParser): """ @@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser): font = ImageFont.truetype( font=settings.THUMBNAIL_FONT_NAME, size=20, - layout_engine=ImageFont.LAYOUT_BASIC, + layout_engine=ImageFont.Layout.BASIC, ) draw.text((5, 5), read_text(), font=font, fill="black")