Pass the user-provided maximum image pixel size to ocrmypdf

This commit is contained in:
Trenton Holmes 2022-05-22 16:56:08 -07:00
parent feaf2da834
commit fc26fe0ac0
No known key found for this signature in database
GPG Key ID: 4815A6E23A56B8D1
3 changed files with 23 additions and 9 deletions

View File

@ -5,6 +5,7 @@ import multiprocessing
import os
import re
from typing import Final
from typing import Optional
from typing import Set
from urllib.parse import urlparse
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
)
OCR_MAX_IMAGE_PIXELS = os.environ.get(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
256000000,
)
# Maximum image size, in pixels, read from PAPERLESS_OCR_MAX_IMAGE_PIXELS.
# Stays None when the variable is unset. A value that is not a valid integer
# makes int() raise ValueError while this module is being loaded.
# The variable is read once and the name is annotated once, so type checkers
# do not see a conflicting redefinition.
_ocr_max_image_pixels = os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")
OCR_MAX_IMAGE_PIXELS: Optional[int] = (
    int(_ocr_max_image_pixels) if _ocr_max_image_pixels is not None else None
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

View File

@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from PIL import Image
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class NoTextFoundException(Exception):
    """Exception subclass that adds no behaviour of its own.

    NOTE(review): presumably raised when no text could be extracted from a
    document; the raise sites are outside this excerpt, so confirm there.
    """

    pass
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
f"they will not be used. Error: {e}",
)
if settings.OCR_MAX_IMAGE_PIXELS is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0:
self.log(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR",
)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else:
self.log(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)
return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None):

View File

@ -6,8 +6,6 @@ from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class TextDocumentParser(DocumentParser):
"""
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
font = ImageFont.truetype(
font=settings.THUMBNAIL_FONT_NAME,
size=20,
layout_engine=ImageFont.LAYOUT_BASIC,
layout_engine=ImageFont.Layout.BASIC,
)
draw.text((5, 5), read_text(), font=font, fill="black")