Updates to pass the user-provided max pixel size to ocrmypdf

This commit is contained in:
Trenton Holmes 2022-05-22 16:56:08 -07:00
parent feaf2da834
commit fc26fe0ac0
No known key found for this signature in database
GPG Key ID: 4815A6E23A56B8D1
3 changed files with 23 additions and 9 deletions

View File

@ -5,6 +5,7 @@ import multiprocessing
import os import os
import re import re
from typing import Final from typing import Final
from typing import Optional
from typing import Set from typing import Set
from urllib.parse import urlparse from urllib.parse import urlparse
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
) )
OCR_MAX_IMAGE_PIXELS = os.environ.get( OCR_MAX_IMAGE_PIXELS: Optional[int] = None
"PAPERLESS_OCR_MAX_IMAGE_PIXELS", if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
256000000, OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

View File

@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError from documents.parsers import ParseError
from PIL import Image from PIL import Image
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class NoTextFoundException(Exception): class NoTextFoundException(Exception):
pass pass
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
f"they will not be used. Error: {e}", f"they will not be used. Error: {e}",
) )
if settings.OCR_MAX_IMAGE_PIXELS is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0:
self.log(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR",
)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else:
self.log(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)
return ocrmypdf_args return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):

View File

@ -6,8 +6,6 @@ from PIL import Image
from PIL import ImageDraw from PIL import ImageDraw
from PIL import ImageFont from PIL import ImageFont
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
class TextDocumentParser(DocumentParser): class TextDocumentParser(DocumentParser):
""" """
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
font = ImageFont.truetype( font = ImageFont.truetype(
font=settings.THUMBNAIL_FONT_NAME, font=settings.THUMBNAIL_FONT_NAME,
size=20, size=20,
layout_engine=ImageFont.LAYOUT_BASIC, layout_engine=ImageFont.Layout.BASIC,
) )
draw.text((5, 5), read_text(), font=font, fill="black") draw.text((5, 5), read_text(), font=font, fill="black")