mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Updates to pass the user-provided max pixel size to ocrmypdf
This commit is contained in:
		| @@ -5,6 +5,7 @@ import multiprocessing | ||||
| import os | ||||
| import re | ||||
| from typing import Final | ||||
| from typing import Optional | ||||
| from typing import Set | ||||
| from urllib.parse import urlparse | ||||
|  | ||||
| @@ -551,10 +552,9 @@ OCR_ROTATE_PAGES_THRESHOLD = float( | ||||
|     os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), | ||||
| ) | ||||
|  | ||||
| OCR_MAX_IMAGE_PIXELS = os.environ.get( | ||||
|     "PAPERLESS_OCR_MAX_IMAGE_PIXELS", | ||||
|     256000000, | ||||
| ) | ||||
| OCR_MAX_IMAGE_PIXELS: Optional[int] = None | ||||
| if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: | ||||
|     OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")) | ||||
|  | ||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||
|  | ||||
|   | ||||
| @@ -8,8 +8,6 @@ from documents.parsers import make_thumbnail_from_pdf | ||||
| from documents.parsers import ParseError | ||||
| from PIL import Image | ||||
|  | ||||
| Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS | ||||
|  | ||||
|  | ||||
| class NoTextFoundException(Exception): | ||||
|     pass | ||||
| @@ -225,6 +223,24 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                     f"they will not be used. Error: {e}", | ||||
|                 ) | ||||
|  | ||||
|         if settings.OCR_MAX_IMAGE_PIXELS is not None: | ||||
|             # Convert pixels to mega-pixels and provide to ocrmypdf | ||||
|             max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 | ||||
|             if max_pixels_mpixels > 0: | ||||
|  | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Calculated {max_pixels_mpixels} megapixels for OCR", | ||||
|                 ) | ||||
|  | ||||
|                 ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels | ||||
|             else: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " | ||||
|                     "this value must be at least 1 megapixel if set", | ||||
|                 ) | ||||
|  | ||||
|         return ocrmypdf_args | ||||
|  | ||||
|     def parse(self, document_path, mime_type, file_name=None): | ||||
|   | ||||
| @@ -6,8 +6,6 @@ from PIL import Image | ||||
| from PIL import ImageDraw | ||||
| from PIL import ImageFont | ||||
|  | ||||
| Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS | ||||
|  | ||||
|  | ||||
| class TextDocumentParser(DocumentParser): | ||||
|     """ | ||||
| @@ -28,7 +26,7 @@ class TextDocumentParser(DocumentParser): | ||||
|         font = ImageFont.truetype( | ||||
|             font=settings.THUMBNAIL_FONT_NAME, | ||||
|             size=20, | ||||
|             layout_engine=ImageFont.LAYOUT_BASIC, | ||||
|             layout_engine=ImageFont.Layout.BASIC, | ||||
|         ) | ||||
|         draw.text((5, 5), read_text(), font=font, fill="black") | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton Holmes
					Trenton Holmes