mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-24 03:26:11 -05:00 
			
		
		
		
	Fixes bauerj/paperless_app#23 and the same problem with most other scanner apps out there.
This commit is contained in:
		| @@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def calculate_a4_dpi(self, image): | ||||
|         try: | ||||
|             with Image.open(image) as im: | ||||
|                 width, height = im.size | ||||
|                 # divide image width by A4 width (210mm) in inches. | ||||
|                 dpi = int(width / (21 / 2.54)) | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Estimated DPI {dpi} based on image width {width}" | ||||
|                 ) | ||||
|                 return dpi | ||||
|  | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while calculating DPI for image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         mode = settings.OCR_MODE | ||||
|  | ||||
| @@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             a4_dpi = self.calculate_a4_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
| @@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             elif a4_dpi: | ||||
|                 ocr_args['image_dpi'] = a4_dpi | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
| @@ -241,6 +262,9 @@ def strip_excess_whitespace(text): | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
|  | ||||
|     if not os.path.isfile(pdf_file): | ||||
|         return None | ||||
|  | ||||
|     with open(pdf_file, "rb") as f: | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|   | ||||
| @@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") | ||||
|     def test_image_calc_a4_dpi(self, m): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|     def test_image_no_dpi_fail(self): | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['image_dpi'], 62) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") | ||||
|     def test_image_dpi_fail(self, m): | ||||
|         m.return_value = None | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler