mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #1008 from paperless-ngx/bugfix-max-pixel-setting
Bugfix: Corrects the setting of max pixel size for OCR
This commit is contained in:
commit
a4927477fb
@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
|
||||
the produced PDF documents are A4 sized.
|
||||
|
||||
PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
|
||||
Paperless will not OCR images that have more pixels than this limit.
|
||||
This is intended to prevent decompression bombs from overloading paperless.
|
||||
Increasing this limit is desired if you face a DecompressionBombError despite
|
||||
the concerning file not being malicious; this could e.g. be caused by invalidly
|
||||
recognized metadata.
|
||||
If you have enough resources or if you are certain that your uploaded files
|
||||
are not malicious you can increase this value to your needs.
|
||||
The default value is 256000000, an image with more pixels than that would not be parsed.
|
||||
Paperless will raise a warning when OCRing images which are over this limit and
|
||||
will not OCR images which are more than twice this limit. Note this does not
|
||||
prevent the document from being consumed, but could result in missing text content.
|
||||
|
||||
If unset, will default to the value determined by
|
||||
`Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.
|
||||
|
||||
.. note::
|
||||
|
||||
Increasing this limit could cause Paperless to consume additional resources
|
||||
when consuming a file. Be sure you have sufficient system resources.
|
||||
|
||||
.. caution::
|
||||
|
||||
The limit is intended to prevent malicious files from consuming system resources
|
||||
and causing crashes and other errors. Only increase this value if you are certain
|
||||
your documents are not malicious and you need the text which was not OCRed
|
||||
|
||||
PAPERLESS_OCR_USER_ARGS=<json>
|
||||
OCRmyPDF offers many more options. Use this parameter to specify any
|
||||
|
@ -5,6 +5,7 @@ import multiprocessing
|
||||
import os
|
||||
import re
|
||||
from typing import Final
|
||||
from typing import Optional
|
||||
from typing import Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
|
||||
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
|
||||
)
|
||||
|
||||
OCR_MAX_IMAGE_PIXELS = os.environ.get(
|
||||
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
|
||||
256000000,
|
||||
)
|
||||
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
|
||||
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
|
||||
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
|
||||
|
||||
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
|
||||
|
||||
|
@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.parsers import ParseError
|
||||
from PIL import Image
|
||||
|
||||
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
pass
|
||||
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"they will not be used. Error: {e}",
|
||||
)
|
||||
|
||||
if settings.OCR_MAX_IMAGE_PIXELS is not None:
|
||||
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
|
||||
if max_pixels_mpixels > 0:
|
||||
|
||||
self.log(
|
||||
"debug",
|
||||
f"Calculated {max_pixels_mpixels} megapixels for OCR",
|
||||
)
|
||||
|
||||
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
|
||||
else:
|
||||
self.log(
|
||||
"warning",
|
||||
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
|
||||
"this value must be at least 1 megapixel if set",
|
||||
)
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
|
@ -6,8 +6,6 @@ from PIL import Image
|
||||
from PIL import ImageDraw
|
||||
from PIL import ImageFont
|
||||
|
||||
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
|
||||
|
||||
|
||||
class TextDocumentParser(DocumentParser):
|
||||
"""
|
||||
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
|
||||
font = ImageFont.truetype(
|
||||
font=settings.THUMBNAIL_FONT_NAME,
|
||||
size=20,
|
||||
layout_engine=ImageFont.LAYOUT_BASIC,
|
||||
layout_engine=ImageFont.Layout.BASIC,
|
||||
)
|
||||
draw.text((5, 5), read_text(), font=font, fill="black")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user