Merge branch 'dev' into feature-bulk-edit

2026-01-28 22:59:03 -06:00 · 2020-12-13 00:52:36 +01:00
parent a85792e327 1c4d19198f
commit 2374506a20
14 changed files with 152 additions and 51 deletions
--- a/src/documents/management/commands/document_renamer.py
+++ b/src/documents/management/commands/document_renamer.py
@@ -2,6 +2,7 @@ import logging

 import tqdm
 from django.core.management.base import BaseCommand
+from django.db.models.signals import post_save

 from documents.models import Document
 from ...mixins import Renderable
@@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand):
        logging.getLogger().handlers[0].level = logging.ERROR

        for document in tqdm.tqdm(Document.objects.all()):
-            # Saving the document again will generate a new filename and rename
-            document.save()
+            post_save.send(Document, instance=document)
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -7,6 +7,7 @@ from django.contrib.admin.models import ADDITION, LogEntry
 from django.contrib.auth.models import User
 from django.contrib.contenttypes.models import ContentType
 from django.db import models, DatabaseError
+from django.db.models import Q
 from django.dispatch import receiver
 from django.utils import timezone
 from filelock import FileLock
@@ -121,11 +122,14 @@ def set_tags(sender,
             classifier=None,
             replace=False,
             **kwargs):
+
    if replace:
-        document.tags.clear()
-        current_tags = set([])
-    else:
-        current_tags = set(document.tags.all())
+        Document.tags.through.objects.filter(document=document).exclude(
+            Q(tag__is_inbox_tag=True)).exclude(
+            Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO)
+        ).delete()
+
+    current_tags = set(document.tags.all())

    matched_tags = matching.match_tags(document.content, classifier)

--- a/src/documents/tests/test_management_retagger.py
+++ b/src/documents/tests/test_management_retagger.py
@@ -14,6 +14,12 @@ class TestRetagger(DirectoriesMixin, TestCase):

        self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
        self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
+        self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
+        self.tag_no_match = Tag.objects.create(name="test2")
+
+        self.d3.tags.add(self.tag_inbox)
+        self.d3.tags.add(self.tag_no_match)
+

        self.correspondent_first = Correspondent.objects.create(
            name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
@@ -38,7 +44,7 @@ class TestRetagger(DirectoriesMixin, TestCase):

        self.assertEqual(d_first.tags.count(), 1)
        self.assertEqual(d_second.tags.count(), 1)
-        self.assertEqual(d_unrelated.tags.count(), 0)
+        self.assertEqual(d_unrelated.tags.count(), 2)

        self.assertEqual(d_first.tags.first(), self.tag_first)
        self.assertEqual(d_second.tags.first(), self.tag_second)
@@ -56,3 +62,17 @@ class TestRetagger(DirectoriesMixin, TestCase):

        self.assertEqual(d_first.correspondent, self.correspondent_first)
        self.assertEqual(d_second.correspondent, self.correspondent_second)
+
+    def test_overwrite_preserve_inbox(self):
+        self.d1.tags.add(self.tag_second)
+        # tag2 was attached to d1 by hand above; now retag with --overwrite.
+        call_command('document_retagger', '--tags', '--overwrite')
+
+        d_first, d_second, d_unrelated = self.get_updated_docs()
+        # The Tag row itself must still exist; Tag.objects.get() raises if not.
+        self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
+
+        self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
+        self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
+        self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])
+
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
                f"Error while getting DPI from image {image}: {e}")
            return None

+    def calculate_a4_dpi(self, image):
+        try:
+            with Image.open(image) as im:
+                width, height = im.size
+                # Treat the image as an A4 page: 21 cm wide = 21 / 2.54 inches.
+                dpi = int(width / (21 / 2.54))
+                self.log(
+                    'debug',
+                    f"Estimated DPI {dpi} based on image width {width}"
+                )
+                return dpi
+        # Best effort: any failure is logged as a warning and None is returned.
+        except Exception as e:
+            self.log(
+                'warning',
+                f"Error while calculating DPI for image {image}: {e}")
+            return None
+
    def parse(self, document_path, mime_type):
        mode = settings.OCR_MODE

@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
+            a4_dpi = self.calculate_a4_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
+            elif a4_dpi:
+                ocr_args['image_dpi'] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):

 def get_text_from_pdf(pdf_file):

+    if not os.path.isfile(pdf_file):
+        return None
+
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):

        self.assertRaises(ParseError, f)

+    @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
+    def test_image_calc_a4_dpi(self, m):
+        parser = RasterisedDocumentParser(None)

-    def test_image_no_dpi_fail(self):
+        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
+        # ocrmypdf.ocr is mocked; only the arguments passed to it are checked.
+        m.assert_called_once()
+
+        args, kwargs = m.call_args
+        # NOTE(review): 62 is presumably int(width / (21 / 2.54)) - confirm.
+        self.assertEqual(kwargs['image_dpi'], 62)
+
+    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
+    def test_image_dpi_fail(self, m):
+        m.return_value = None
        parser = RasterisedDocumentParser(None)

        def f():