mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'dev' into feature-bulk-edit
This commit is contained in:
		| @@ -2,6 +2,7 @@ import logging | ||||
|  | ||||
| import tqdm | ||||
| from django.core.management.base import BaseCommand | ||||
| from django.db.models.signals import post_save | ||||
|  | ||||
| from documents.models import Document | ||||
| from ...mixins import Renderable | ||||
| @@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand): | ||||
|         logging.getLogger().handlers[0].level = logging.ERROR | ||||
|  | ||||
|         for document in tqdm.tqdm(Document.objects.all()): | ||||
|             # Saving the document again will generate a new filename and rename | ||||
|             document.save() | ||||
|             post_save.send(Document, instance=document) | ||||
|   | ||||
| @@ -7,6 +7,7 @@ from django.contrib.admin.models import ADDITION, LogEntry | ||||
| from django.contrib.auth.models import User | ||||
| from django.contrib.contenttypes.models import ContentType | ||||
| from django.db import models, DatabaseError | ||||
| from django.db.models import Q | ||||
| from django.dispatch import receiver | ||||
| from django.utils import timezone | ||||
| from filelock import FileLock | ||||
| @@ -121,11 +122,14 @@ def set_tags(sender, | ||||
|              classifier=None, | ||||
|              replace=False, | ||||
|              **kwargs): | ||||
|  | ||||
|     if replace: | ||||
|         document.tags.clear() | ||||
|         current_tags = set([]) | ||||
|     else: | ||||
|         current_tags = set(document.tags.all()) | ||||
|         Document.tags.through.objects.filter(document=document).exclude( | ||||
|             Q(tag__is_inbox_tag=True)).exclude( | ||||
|             Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO) | ||||
|         ).delete() | ||||
|  | ||||
|     current_tags = set(document.tags.all()) | ||||
|  | ||||
|     matched_tags = matching.match_tags(document.content, classifier) | ||||
|  | ||||
|   | ||||
| @@ -14,6 +14,12 @@ class TestRetagger(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY) | ||||
|         self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY) | ||||
|         self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True) | ||||
|         self.tag_no_match = Tag.objects.create(name="test2") | ||||
|  | ||||
|         self.d3.tags.add(self.tag_inbox) | ||||
|         self.d3.tags.add(self.tag_no_match) | ||||
|  | ||||
|  | ||||
|         self.correspondent_first = Correspondent.objects.create( | ||||
|             name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY) | ||||
| @@ -38,7 +44,7 @@ class TestRetagger(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertEqual(d_first.tags.count(), 1) | ||||
|         self.assertEqual(d_second.tags.count(), 1) | ||||
|         self.assertEqual(d_unrelated.tags.count(), 0) | ||||
|         self.assertEqual(d_unrelated.tags.count(), 2) | ||||
|  | ||||
|         self.assertEqual(d_first.tags.first(), self.tag_first) | ||||
|         self.assertEqual(d_second.tags.first(), self.tag_second) | ||||
| @@ -56,3 +62,17 @@ class TestRetagger(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertEqual(d_first.correspondent, self.correspondent_first) | ||||
|         self.assertEqual(d_second.correspondent, self.correspondent_second) | ||||
|  | ||||
|     def test_overwrite_preserve_inbox(self): | ||||
|         self.d1.tags.add(self.tag_second) | ||||
|  | ||||
|         call_command('document_retagger', '--tags', '--overwrite') | ||||
|  | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id)) | ||||
|  | ||||
|         self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id]) | ||||
|         self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id]) | ||||
|         self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id]) | ||||
|  | ||||
|   | ||||
| @@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def calculate_a4_dpi(self, image): | ||||
|         try: | ||||
|             with Image.open(image) as im: | ||||
|                 width, height = im.size | ||||
|                 # divide image width by A4 width (210mm) in inches. | ||||
|                 dpi = int(width / (21 / 2.54)) | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Estimated DPI {dpi} based on image width {width}" | ||||
|                 ) | ||||
|                 return dpi | ||||
|  | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while calculating DPI for image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         mode = settings.OCR_MODE | ||||
|  | ||||
| @@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             a4_dpi = self.calculate_a4_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
| @@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             elif a4_dpi: | ||||
|                 ocr_args['image_dpi'] = a4_dpi | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
| @@ -241,6 +262,9 @@ def strip_excess_whitespace(text): | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
|  | ||||
|     if not os.path.isfile(pdf_file): | ||||
|         return None | ||||
|  | ||||
|     with open(pdf_file, "rb") as f: | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|   | ||||
| @@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") | ||||
|     def test_image_calc_a4_dpi(self, m): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|     def test_image_no_dpi_fail(self): | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['image_dpi'], 62) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") | ||||
|     def test_image_dpi_fail(self, m): | ||||
|         m.return_value = None | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler