Merge pull request #1008 from paperless-ngx/bugfix-max-pixel-setting

Bugfix: Corrects the setting of max pixel size for OCR
This commit is contained in:
Quinn Casey 2022-05-26 09:12:24 -07:00 committed by GitHub
commit a4927477fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 40 additions and 17 deletions

View File

@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
the produced PDF documents are A4 sized.
PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
Paperless will not OCR images that have more pixels than this limit.
This is intended to prevent decompression bombs from overloading paperless.
Increasing this limit is desired if you face a DecompressionBombError despite
the concerning file not being malicious; this could e.g. be caused by invalidly
recognized metadata.
If you have enough resources or if you are certain that your uploaded files
are not malicious you can increase this value to your needs.
The default value is 256000000, an image with more pixels than that would not be parsed.
Paperless will raise a warning when OCRing images which are over this limit and
will not OCR images which are more than twice this limit. Note this does not
prevent the document from being consumed, but could result in missing text content.
If unset, will default to the value determined by
`Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
.. note::
Increasing this limit could cause Paperless to consume additional resources
when consuming a file. Be sure you have sufficient system resources.
.. caution::
The limit is intended to prevent malicious files from consuming system resources
and causing crashes and other errors. Only increase this value if you are certain
your documents are not malicious and you need the text which was not OCRed
PAPERLESS_OCR_USER_ARGS=<json>
OCRmyPDF offers many more options. Use this parameter to specify any

View File

@ -5,6 +5,7 @@ import multiprocessing
import os
import re
from typing import Final
from typing import Optional
from typing import Set
from urllib.parse import urlparse
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
)
OCR_MAX_IMAGE_PIXELS = os.environ.get(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
256000000,
)
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

View File

@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from PIL import Image
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class NoTextFoundException(Exception):
pass
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
f"they will not be used. Error: {e}",
)
if settings.OCR_MAX_IMAGE_PIXELS is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0:
self.log(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR",
)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else:
self.log(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)
return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None):

View File

@ -6,8 +6,6 @@ from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class TextDocumentParser(DocumentParser):
"""
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
font = ImageFont.truetype(
font=settings.THUMBNAIL_FONT_NAME,
size=20,
layout_engine=ImageFont.LAYOUT_BASIC,
layout_engine=ImageFont.Layout.BASIC,
)
draw.text((5, 5), read_text(), font=font, fill="black")