mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	added image DPI detection to the tesseract parser.
This commit is contained in:
		| @@ -250,6 +250,19 @@ PAPERLESS_OCR_OUTPUT_TYPE=<type> | |||||||
|     If not specified, ``pdfa`` is used. Remember that paperless also keeps |     If not specified, ``pdfa`` is used. Remember that paperless also keeps | ||||||
|     the original input file as well as the archived version. |     the original input file as well as the archived version. | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_IMAGE_DPI=<num> | ||||||
|  |     Paperless will OCR any images you put into the system and convert them | ||||||
|  |     into PDF documents. This is useful if your scanner produces images. | ||||||
|  |     In order to do so, paperless needs to know the DPI of the image. | ||||||
|  |     Most images from scanners will have this information embedded and | ||||||
|  |     paperless will detect and use that information. If detection fails, | ||||||
|  |     paperless uses the value of this setting as a fallback. | ||||||
|  |  | ||||||
|  |     Set this to the DPI your scanner produces images at. | ||||||
|  |  | ||||||
|  |     Defaults to none (unset), in which case paperless fails to process any | ||||||
|  |     image that contains no DPI information. | ||||||
|  |  | ||||||
| PAPERLESS_CONSUMER_POLLING=<num> | PAPERLESS_CONSUMER_POLLING=<num> | ||||||
|     If paperless won't find documents added to your consume folder, it might |     If paperless won't find documents added to your consume folder, it might | ||||||
|     not be able to automatically detect filesystem changes. In that case, |     not be able to automatically detect filesystem changes. In that case, | ||||||
|   | |||||||
| @@ -40,6 +40,7 @@ | |||||||
| #PAPERLESS_OCR_LANGUAGE=eng | #PAPERLESS_OCR_LANGUAGE=eng | ||||||
| #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||||
| #PAPERLESS_OCR_MODE=skip | #PAPERLESS_OCR_MODE=skip | ||||||
|  | #PAPERLESS_OCR_IMAGE_DPI=300 | ||||||
| #PAPERLESS_CONSUMER_POLLING=10 | #PAPERLESS_CONSUMER_POLLING=10 | ||||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||||
|   | |||||||
| @@ -346,6 +346,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | |||||||
| # TODO: validate this. | # TODO: validate this. | ||||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||||
|  |  | ||||||
|  | OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||||
|  |  | ||||||
| # GNUPG needs a home directory for some reason | # GNUPG needs a home directory for some reason | ||||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||||
|  |  | ||||||
|   | |||||||
| @@ -4,6 +4,7 @@ import subprocess | |||||||
|  |  | ||||||
| import ocrmypdf | import ocrmypdf | ||||||
| import pdftotext | import pdftotext | ||||||
|  | from PIL import Image | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from ocrmypdf import InputFileError | from ocrmypdf import InputFileError | ||||||
|  |  | ||||||
| @@ -60,10 +61,22 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def get_text(self): |     def is_image(self, mime_type): | ||||||
|  |         return mime_type in [ | ||||||
|  |             "image/png", | ||||||
|  |             "image/jpeg" | ||||||
|  |         ] | ||||||
|  |  | ||||||
|         if self._text: |     def get_dpi(self, image): | ||||||
|             return self._text |         try: | ||||||
|  |             with Image.open(image) as im: | ||||||
|  |                 x, y = im.info['dpi'] | ||||||
|  |                 return x | ||||||
|  |         except Exception as e: | ||||||
|  |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Error while getting DPI from image {image}: {e}") | ||||||
|  |             return None | ||||||
|  |  | ||||||
|     def parse(self, document_path, mime_type): |     def parse(self, document_path, mime_type): | ||||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") |         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||||
| @@ -89,6 +102,22 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         elif settings.OCR_MODE == 'force': |         elif settings.OCR_MODE == 'force': | ||||||
|             ocr_args['force_ocr'] = True |             ocr_args['force_ocr'] = True | ||||||
|  |  | ||||||
|  |         if self.is_image(mime_type): | ||||||
|  |             dpi = self.get_dpi(document_path) | ||||||
|  |             if dpi: | ||||||
|  |                 self.log( | ||||||
|  |                     "debug", | ||||||
|  |                     f"Detected DPI for image {document_path}: {dpi}" | ||||||
|  |                 ) | ||||||
|  |                 ocr_args['image_dpi'] = dpi | ||||||
|  |             elif settings.OCR_IMAGE_DPI: | ||||||
|  |                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||||
|  |             else: | ||||||
|  |                 raise ParseError( | ||||||
|  |                     f"Cannot produce archive PDF for image {document_path}, " | ||||||
|  |                     f"no DPI information is present in this image and " | ||||||
|  |                     f"OCR_IMAGE_DPI is not set.") | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             ocrmypdf.ocr(**ocr_args) |             ocrmypdf.ocr(**ocr_args) | ||||||
|             # success! announce results |             # success! announce results | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler