Merge pull request #1008 from paperless-ngx/bugfix-max-pixel-setting

Bugfix: Corrects the setting of max pixel size for OCR
This commit is contained in:
Quinn Casey 2022-05-26 09:12:24 -07:00 committed by GitHub
commit a4927477fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 40 additions and 17 deletions

View File

@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
the produced PDF documents are A4 sized. the produced PDF documents are A4 sized.
PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num> PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
Paperless will not OCR images that have more pixels than this limit. Paperless will raise a warning when OCRing images which are over this limit and
This is intended to prevent decompression bombs from overloading paperless. will not OCR images which are more than twice this limit. Note this does not
Increasing this limit is desired if you face a DecompressionBombError despite prevent the document from being consumed, but could result in missing text content.
the concerning file not being malicious; this could e.g. be caused by invalidly
recognized metadata. If unset, this will default to the value determined by
If you have enough resources or if you are certain that your uploaded files `Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
are not malicious you can increase this value to your needs.
The default value is 256000000, an image with more pixels than that would not be parsed. .. note::
Increasing this limit could cause Paperless to consume additional resources
when consuming a file. Be sure you have sufficient system resources.
.. caution::
The limit is intended to prevent malicious files from consuming system resources
and causing crashes and other errors. Only increase this value if you are certain
your documents are not malicious and you need the text which was not OCRed.
PAPERLESS_OCR_USER_ARGS=<json> PAPERLESS_OCR_USER_ARGS=<json>
OCRmyPDF offers many more options. Use this parameter to specify any OCRmyPDF offers many more options. Use this parameter to specify any

View File

@ -5,6 +5,7 @@ import multiprocessing
import os import os
import re import re
from typing import Final from typing import Final
from typing import Optional
from typing import Set from typing import Set
from urllib.parse import urlparse from urllib.parse import urlparse
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
) )
OCR_MAX_IMAGE_PIXELS = os.environ.get( OCR_MAX_IMAGE_PIXELS: Optional[int] = None
"PAPERLESS_OCR_MAX_IMAGE_PIXELS", if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
256000000, OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

View File

@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError from documents.parsers import ParseError
from PIL import Image from PIL import Image
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class NoTextFoundException(Exception): class NoTextFoundException(Exception):
pass pass
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
f"they will not be used. Error: {e}", f"they will not be used. Error: {e}",
) )
if settings.OCR_MAX_IMAGE_PIXELS is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0:
self.log(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR",
)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else:
self.log(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)
return ocrmypdf_args return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):

View File

@ -6,8 +6,6 @@ from PIL import Image
from PIL import ImageDraw from PIL import ImageDraw
from PIL import ImageFont from PIL import ImageFont
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class TextDocumentParser(DocumentParser): class TextDocumentParser(DocumentParser):
""" """
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
font = ImageFont.truetype( font = ImageFont.truetype(
font=settings.THUMBNAIL_FONT_NAME, font=settings.THUMBNAIL_FONT_NAME,
size=20, size=20,
layout_engine=ImageFont.LAYOUT_BASIC, layout_engine=ImageFont.Layout.BASIC,
) )
draw.text((5, 5), read_text(), font=font, fill="black") draw.text((5, 5), read_text(), font=font, fill="black")