mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Pass the user-provided maximum image pixel size through to ocrmypdf
This commit is contained in:
parent
feaf2da834
commit
fc26fe0ac0
@ -5,6 +5,7 @@ import multiprocessing
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import Final
|
from typing import Final
|
||||||
|
from typing import Optional
|
||||||
from typing import Set
|
from typing import Set
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float(
|
|||||||
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
|
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
|
||||||
)
|
)
|
||||||
|
|
||||||
OCR_MAX_IMAGE_PIXELS = os.environ.get(
|
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
|
||||||
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
|
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
|
||||||
256000000,
|
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
|
||||||
)
|
|
||||||
|
|
||||||
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
|
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
|
||||||
|
|
||||||
|
@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf
|
|||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
|
|
||||||
|
|
||||||
|
|
||||||
class NoTextFoundException(Exception):
|
class NoTextFoundException(Exception):
|
||||||
pass
|
pass
|
||||||
@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
f"they will not be used. Error: {e}",
|
f"they will not be used. Error: {e}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if settings.OCR_MAX_IMAGE_PIXELS is not None:
|
||||||
|
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||||
|
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
|
||||||
|
if max_pixels_mpixels > 0:
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
"debug",
|
||||||
|
f"Calculated {max_pixels_mpixels} megapixels for OCR",
|
||||||
|
)
|
||||||
|
|
||||||
|
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
|
||||||
|
else:
|
||||||
|
self.log(
|
||||||
|
"warning",
|
||||||
|
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
|
||||||
|
"this value must be at least 1 megapixel if set",
|
||||||
|
)
|
||||||
|
|
||||||
return ocrmypdf_args
|
return ocrmypdf_args
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
|
@ -6,8 +6,6 @@ from PIL import Image
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from PIL import ImageFont
|
from PIL import ImageFont
|
||||||
|
|
||||||
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
|
|
||||||
|
|
||||||
|
|
||||||
class TextDocumentParser(DocumentParser):
|
class TextDocumentParser(DocumentParser):
|
||||||
"""
|
"""
|
||||||
@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser):
|
|||||||
font = ImageFont.truetype(
|
font = ImageFont.truetype(
|
||||||
font=settings.THUMBNAIL_FONT_NAME,
|
font=settings.THUMBNAIL_FONT_NAME,
|
||||||
size=20,
|
size=20,
|
||||||
layout_engine=ImageFont.LAYOUT_BASIC,
|
layout_engine=ImageFont.Layout.BASIC,
|
||||||
)
|
)
|
||||||
draw.text((5, 5), read_text(), font=font, fill="black")
|
draw.text((5, 5), read_text(), font=font, fill="black")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user