#11: automatic tagging support

2025-07-22 17:54:40 -05:00 · 2016-01-28 07:23:11 +00:00 · 2016-01-28 07:23:11 +00:00 · 0ec63ae1f9
commit 0ec63ae1f9
parent 95fd68d708
3 changed files with 99 additions and 3 deletions
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@ -19,7 +19,7 @@ from django.utils import timezone
 from paperless.db import GnuPG

 from ...languages import ISO639
-from ...models import Document, Sender
+from ...models import Document, Sender, Tag


 class OCRError(BaseException):
@ -199,13 +199,14 @@ class Command(BaseCommand):
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render("    {}".format(f.filename), 3)
                r += self.OCR.image_to_string(f, lang=lang)
-                r += "\n\n\n\n\n\n\n\n"

-        return r
+        # Collapse runs of whitespace into single spaces so matching goes more smoothly
+        return re.sub(r"\s+", " ", r)

    def _store(self, text, pdf):

        sender, title = self._parse_file_name(pdf)
+        relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]

        stats = os.stat(pdf)

@ -221,6 +222,11 @@ class Command(BaseCommand):
                datetime.datetime.fromtimestamp(stats.st_mtime))
        )

+        if relevant_tags:
+            tag_names = ", ".join([t.slug for t in relevant_tags])
+            self._render("    Tagging with {}".format(tag_names), 2)
+            doc.tags.add(*relevant_tags)
+
        with open(pdf, "rb") as unencrypted:
            with open(doc.pdf_path, "wb") as encrypted:
                self._render("  Encrypting", 3)
--- a/src/documents/migrations/0007_auto_20160126_2114.py
+++ b/src/documents/migrations/0007_auto_20160126_2114.py
@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2016-01-26 21:14
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the automatic-matching fields to ``Tag``.
+
+    Adds ``Tag.match`` and ``Tag.matching_algorithm`` and re-declares
+    ``Tag.colour`` with its thirteen-entry choice list.
+    """
+
+    dependencies = [
+        ('documents', '0006_auto_20160123_0430'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='tag',
+            name='match',
+            field=models.CharField(blank=True, max_length=256),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='matching_algorithm',
+            field=models.PositiveIntegerField(blank=True, choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  If you don\'t know what a regex is, you probably don\'t want this option.', null=True),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='colour',
+            field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1),
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -1,4 +1,5 @@
 import os
+import re

 from django.conf import settings
 from django.db import models
@ -46,7 +47,66 @@ class Tag(SluggedModel):
        (12, "#000000"),
        (13, "#cccccc")
    )
+
+    # Identifiers for the supported matching strategies.  They are the values
+    # stored in ``matching_algorithm`` and compared against in ``matches()``.
+    MATCH_ANY = 1
+    MATCH_ALL = 2
+    MATCH_LITERAL = 3
+    MATCH_REGEX = 4
+    MATCHING_ALGORITHMS = (
+        (MATCH_ANY, "Any"),
+        (MATCH_ALL, "All"),
+        (MATCH_LITERAL, "Literal"),
+        (MATCH_REGEX, "Regular Expression"),
+    )
+
    colour = models.PositiveIntegerField(choices=COLOURS, default=1)
+    # Text (words, a literal phrase or a regex, depending on the algorithm)
+    # that is looked for in a document's text.
+    match = models.CharField(max_length=256, blank=True)
+    # One of the MATCH_* values above; optional, so it may be NULL.
+    matching_algorithm = models.PositiveIntegerField(
+        choices=MATCHING_ALGORITHMS,
+        blank=True,
+        null=True,
+        help_text=(
+            "Which algorithm you want to use when matching text to the OCR'd "
+            "PDF.  Here, \"any\" looks for any occurrence of any word provided "
+            "in the PDF, while \"all\" requires that every word provided "
+            "appear in the PDF, albeit not in the order provided.  A "
+            "\"literal\" match means that the text you enter must appear in "
+            "the PDF exactly as you've entered it, and \"regular expression\" "
+            "uses a regex to match the PDF.  If you don't know what a regex "
+            "is, you probably don't want this option."
+        )
+    )
+
+    @property
+    def conditions(self):
+        """Return a one-line summary: name, match text and algorithm label."""
+        template = "{}: \"{}\" ({})"
+        summary_parts = (
+            self.name,
+            self.match,
+            self.get_matching_algorithm_display(),
+        )
+        return template.format(*summary_parts)
+
+    def matches(self, text):
+        """Return whether ``text`` satisfies this tag's matching rule.
+
+        ``text`` is compared against ``self.match`` as given; ``match`` is
+        lower-cased on save, so callers wanting case-insensitive matching
+        pass lower-cased text.
+
+        Returns ``True`` or ``False``.  A tag with an empty ``match`` or no
+        ``matching_algorithm`` never matches.  Raises ``NotImplementedError``
+        for an unrecognised algorithm value.
+        """
+
+        # Both fields are optional (blank/null).  A tag that has not been
+        # configured for automatic matching must neither match everything
+        # ("" is contained in every string) nor abort the caller's scan over
+        # all tags with an exception.
+        if self.matching_algorithm is None or not self.match.strip():
+            return False
+
+        if self.matching_algorithm == self.MATCH_ALL:
+            # split() without an argument drops the empty "words" produced by
+            # repeated spaces, which would otherwise always be found.
+            return all(word in text for word in self.match.split())
+
+        if self.matching_algorithm == self.MATCH_ANY:
+            return any(word in text for word in self.match.split())
+
+        if self.matching_algorithm == self.MATCH_LITERAL:
+            return self.match in text
+
+        if self.matching_algorithm == self.MATCH_REGEX:
+            # re.search returns a match object or None; normalise to a bool.
+            return bool(re.search(self.match, text))
+
+        raise NotImplementedError("Unsupported matching algorithm")
+
+    def save(self, *args, **kwargs):
+        """Lower-case ``match``, then save through ``SluggedModel.save``."""
+        # NOTE(review): this lower-cases regular-expression patterns too, so
+        # escapes such as \S or \W change meaning -- confirm this is intended.
+        normalised = self.match.lower()
+        self.match = normalised
+        SluggedModel.save(self, *args, **kwargs)


 class Document(models.Model):