mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	added image DPI detection to the tesseract parser.
This commit is contained in:
		| @@ -250,6 +250,19 @@ PAPERLESS_OCR_OUTPUT_TYPE=<type> | ||||
|     If not specified, ``pdfa`` is used. Remember that paperless also keeps | ||||
|     the original input file as well as the archived version. | ||||
|  | ||||
| PAPERLESS_OCR_IMAGE_DPI=<num> | ||||
|     Paperless will OCR any images you put into the system and convert them | ||||
|     into PDF documents. This is useful if your scanner produces images. | ||||
|     In order to do so, paperless needs to know the DPI of the image. | ||||
|     Most images from scanners will have this information embedded and | ||||
|     paperless will detect and use that information. If detection fails, | ||||
|     paperless uses this value as a fallback. | ||||
|  | ||||
|     Set this to the DPI your scanner produces images at. | ||||
|  | ||||
|     There is no default. If this is unset, paperless fails to process any | ||||
|     image that does not contain DPI information. | ||||
|  | ||||
| PAPERLESS_CONSUMER_POLLING=<num> | ||||
|     If paperless won't find documents added to your consume folder, it might | ||||
|     not be able to automatically detect filesystem changes. In that case, | ||||
|   | ||||
| @@ -40,6 +40,7 @@ | ||||
| #PAPERLESS_OCR_LANGUAGE=eng | ||||
| #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||
| #PAPERLESS_OCR_MODE=skip | ||||
| #PAPERLESS_OCR_IMAGE_DPI=300 | ||||
| #PAPERLESS_CONSUMER_POLLING=10 | ||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||
|   | ||||
| @@ -346,6 +346,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||
| # TODO: validate this. | ||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||
|  | ||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||
|  | ||||
| # GNUPG needs a home directory for some reason | ||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
|  | ||||
|   | ||||
| @@ -4,6 +4,7 @@ import subprocess | ||||
|  | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from ocrmypdf import InputFileError | ||||
|  | ||||
| @@ -60,10 +61,22 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def get_text(self): | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
|             "image/png", | ||||
|             "image/jpeg" | ||||
|         ] | ||||
|  | ||||
|         if self._text: | ||||
|             return self._text | ||||
|     def get_dpi(self, image): | ||||
|         try: | ||||
|             with Image.open(image) as im: | ||||
|                 x, y = im.info['dpi'] | ||||
|                 return x | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||
| @@ -89,6 +102,22 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         elif settings.OCR_MODE == 'force': | ||||
|             ocr_args['force_ocr'] = True | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected DPI for image {document_path}: {dpi}" | ||||
|                 ) | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
|                     f"no DPI information is present in this image and " | ||||
|                     f"OCR_IMAGE_DPI is not set.") | ||||
|  | ||||
|         try: | ||||
|             ocrmypdf.ocr(**ocr_args) | ||||
|             # success! announce results | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler