mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 03:36:08 -05:00 
			
		
		
		
	Fixes bauerj/paperless_app#23, and the same problem with most other scanner apps out there.
This commit is contained in:
		| @@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                 f"Error while getting DPI from image {image}: {e}") |                 f"Error while getting DPI from image {image}: {e}") | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|  |     def calculate_a4_dpi(self, image): | ||||||
|  |         try: | ||||||
|  |             with Image.open(image) as im: | ||||||
|  |                 width, height = im.size | ||||||
|  |                 # divide image width by A4 width (210mm) in inches. | ||||||
|  |                 dpi = int(width / (21 / 2.54)) | ||||||
|  |                 self.log( | ||||||
|  |                     'debug', | ||||||
|  |                     f"Estimated DPI {dpi} based on image width {width}" | ||||||
|  |                 ) | ||||||
|  |                 return dpi | ||||||
|  |  | ||||||
|  |         except Exception as e: | ||||||
|  |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Error while calculating DPI for image {image}: {e}") | ||||||
|  |             return None | ||||||
|  |  | ||||||
|     def parse(self, document_path, mime_type): |     def parse(self, document_path, mime_type): | ||||||
|         mode = settings.OCR_MODE |         mode = settings.OCR_MODE | ||||||
|  |  | ||||||
| @@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         if self.is_image(mime_type): |         if self.is_image(mime_type): | ||||||
|             dpi = self.get_dpi(document_path) |             dpi = self.get_dpi(document_path) | ||||||
|  |             a4_dpi = self.calculate_a4_dpi(document_path) | ||||||
|             if dpi: |             if dpi: | ||||||
|                 self.log( |                 self.log( | ||||||
|                     "debug", |                     "debug", | ||||||
| @@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                 ocr_args['image_dpi'] = dpi |                 ocr_args['image_dpi'] = dpi | ||||||
|             elif settings.OCR_IMAGE_DPI: |             elif settings.OCR_IMAGE_DPI: | ||||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI |                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||||
|  |             elif a4_dpi: | ||||||
|  |                 ocr_args['image_dpi'] = a4_dpi | ||||||
|             else: |             else: | ||||||
|                 raise ParseError( |                 raise ParseError( | ||||||
|                     f"Cannot produce archive PDF for image {document_path}, " |                     f"Cannot produce archive PDF for image {document_path}, " | ||||||
| @@ -241,6 +262,9 @@ def strip_excess_whitespace(text): | |||||||
|  |  | ||||||
| def get_text_from_pdf(pdf_file): | def get_text_from_pdf(pdf_file): | ||||||
|  |  | ||||||
|  |     if not os.path.isfile(pdf_file): | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     with open(pdf_file, "rb") as f: |     with open(pdf_file, "rb") as f: | ||||||
|         try: |         try: | ||||||
|             pdf = pdftotext.PDF(f) |             pdf = pdftotext.PDF(f) | ||||||
|   | |||||||
| @@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.assertRaises(ParseError, f) |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |     @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") | ||||||
|  |     def test_image_calc_a4_dpi(self, m): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|     def test_image_no_dpi_fail(self): |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||||
|  |  | ||||||
|  |         m.assert_called_once() | ||||||
|  |  | ||||||
|  |         args, kwargs = m.call_args | ||||||
|  |  | ||||||
|  |         self.assertEqual(kwargs['image_dpi'], 62) | ||||||
|  |  | ||||||
|  |     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") | ||||||
|  |     def test_image_dpi_fail(self, m): | ||||||
|  |         m.return_value = None | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|         def f(): |         def f(): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler