#11: automatic tagging support

This commit is contained in:
Daniel Quinn 2016-01-28 07:23:11 +00:00
parent 95fd68d708
commit 0ec63ae1f9
3 changed files with 99 additions and 3 deletions

View File

@ -19,7 +19,7 @@ from django.utils import timezone
from paperless.db import GnuPG
from ...languages import ISO639
from ...models import Document, Sender
from ...models import Document, Sender, Tag
class OCRError(BaseException):
@ -199,13 +199,14 @@ class Command(BaseCommand):
with Image.open(os.path.join(self.SCRATCH, png)) as f:
self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f, lang=lang)
r += "\n\n\n\n\n\n\n\n"
return r
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
def _store(self, text, pdf):
sender, title = self._parse_file_name(pdf)
relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
stats = os.stat(pdf)
@ -221,6 +222,11 @@ class Command(BaseCommand):
datetime.datetime.fromtimestamp(stats.st_mtime))
)
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self._render(" Tagging with {}".format(tag_names), 2)
doc.tags.add(*relevant_tags)
with open(pdf, "rb") as unencrypted:
with open(doc.pdf_path, "wb") as encrypted:
self._render(" Encrypting", 3)

View File

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-01-26 21:14
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``match`` and ``matching_algorithm`` fields to ``Tag`` and
    re-declare its ``colour`` field with ``default=1``."""

    # This migration is ordered after 0006 of the ``documents`` app.
    dependencies = [
        ('documents', '0006_auto_20160123_0430'),
    ]

    operations = [
        # Free-text field (up to 256 characters) that may be left blank.
        migrations.AddField(
            model_name='tag',
            name='match',
            field=models.CharField(blank=True, max_length=256),
        ),
        # Optional, nullable selector with four choices: Any, All, Literal,
        # Regular Expression. The help_text literal is kept exactly as
        # recorded (including the "regexis" typo).
        migrations.AddField(
            model_name='tag',
            name='matching_algorithm',
            field=models.PositiveIntegerField(blank=True, choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regexis, you probably don\'t want this option.', null=True),
        ),
        # Thirteen colour choices keyed 1-13; the field defaults to choice 1.
        migrations.AlterField(
            model_name='tag',
            name='colour',
            field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1),
        ),
    ]

View File

@ -1,4 +1,5 @@
import os
import re
from django.conf import settings
from django.db import models
@ -46,7 +47,66 @@ class Tag(SluggedModel):
(12, "#000000"),
(13, "#cccccc")
)
# Identifiers for the supported matching strategies; these integers are the
# values stored in ``matching_algorithm``.
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCHING_ALGORITHMS = (
    (MATCH_ANY, "Any"),
    (MATCH_ALL, "All"),
    (MATCH_LITERAL, "Literal"),
    (MATCH_REGEX, "Regular Expression"),
)

colour = models.PositiveIntegerField(choices=COLOURS, default=1)

# Text the selected algorithm is applied with; may be left blank.
match = models.CharField(max_length=256, blank=True)

# Optional: both ``blank`` and ``null`` are allowed, so a tag can exist
# without any matching algorithm selected.
# NOTE: the help text previously read "regexis" because two adjacent string
# literals were joined without a separating space. Django tracks help_text
# in migration state, so ``makemigrations`` will report an AlterField for
# this wording change.
matching_algorithm = models.PositiveIntegerField(
    choices=MATCHING_ALGORITHMS,
    blank=True,
    null=True,
    help_text=(
        "Which algorithm you want to use when matching text to the OCR'd "
        "PDF. Here, \"any\" looks for any occurrence of any word provided "
        "in the PDF, while \"all\" requires that every word provided "
        "appear in the PDF, albeit not in the order provided. A "
        "\"literal\" match means that the text you enter must appear in "
        "the PDF exactly as you've entered it, and \"regular expression\" "
        "uses a regex to match the PDF. If you don't know what a regex "
        "is, you probably don't want this option."
    )
)
@property
def conditions(self):
    """Return a one-line summary of this tag's matching rule.

    The summary has the form ``<name>: "<match>" (<algorithm label>)``,
    where the label comes from ``get_matching_algorithm_display()``.
    """
    template = "{}: \"{}\" ({})"
    parts = (
        self.name,
        self.match,
        self.get_matching_algorithm_display(),
    )
    return template.format(*parts)
def matches(self, text):
    """Report whether ``text`` satisfies this tag's matching rule.

    ``text`` is searched as given; no case folding happens here.

    * ``MATCH_ANY``: at least one whitespace-separated word of ``match``
      occurs in ``text`` (substring test).
    * ``MATCH_ALL``: every whitespace-separated word of ``match`` occurs in
      ``text`` (substring test), in any order.
    * ``MATCH_LITERAL``: ``match`` occurs verbatim in ``text``.
    * ``MATCH_REGEX``: ``match``, used as a regular expression, is found
      somewhere in ``text``.

    Returns:
        bool: True when the rule matches. Always False when no algorithm
        is selected or ``match`` is empty or whitespace only.

    Raises:
        NotImplementedError: ``matching_algorithm`` holds a value that is
            not one of the ``MATCH_*`` constants.
    """
    algorithm = self.matching_algorithm

    # split() without an argument discards empty tokens. Splitting on a
    # single space produced "" for blank or double-spaced input, and
    # `"" in text` is always true, so such tags matched every document.
    words = (self.match or "").split()

    # The algorithm is optional on the model, and a tag without a usable
    # rule must simply not match rather than raise.
    if algorithm is None or not words:
        return False

    if algorithm == self.MATCH_ALL:
        return all(word in text for word in words)

    if algorithm == self.MATCH_ANY:
        return any(word in text for word in words)

    if algorithm == self.MATCH_LITERAL:
        return self.match in text

    if algorithm == self.MATCH_REGEX:
        # Coerce to bool so every branch returns the same type.
        return bool(re.search(self.match, text))

    raise NotImplementedError("Unsupported matching algorithm")
def save(self, *args, **kwargs):
    """Lower-case ``match``, then save through ``SluggedModel.save``.

    All positional and keyword arguments are forwarded unchanged.

    NOTE(review): the lower-casing is applied whatever matching algorithm
    is selected, regular expressions included. Confirm that is intended.
    """
    lowered = self.match.lower()
    self.match = lowered
    SluggedModel.save(self, *args, **kwargs)
class Document(models.Model):