mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
#11: automatic tagging support
This commit is contained in:
parent
95fd68d708
commit
0ec63ae1f9
@ -19,7 +19,7 @@ from django.utils import timezone
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...languages import ISO639
|
||||
from ...models import Document, Sender
|
||||
from ...models import Document, Sender, Tag
|
||||
|
||||
|
||||
class OCRError(BaseException):
|
||||
@ -199,13 +199,14 @@ class Command(BaseCommand):
|
||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||
self._render(" {}".format(f.filename), 3)
|
||||
r += self.OCR.image_to_string(f, lang=lang)
|
||||
r += "\n\n\n\n\n\n\n\n"
|
||||
|
||||
return r
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return re.sub(r"\s+", " ", r)
|
||||
|
||||
def _store(self, text, pdf):
|
||||
|
||||
sender, title = self._parse_file_name(pdf)
|
||||
relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
|
||||
|
||||
stats = os.stat(pdf)
|
||||
|
||||
@ -221,6 +222,11 @@ class Command(BaseCommand):
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
)
|
||||
|
||||
if relevant_tags:
|
||||
tag_names = ", ".join([t.slug for t in relevant_tags])
|
||||
self._render(" Tagging with {}".format(tag_names), 2)
|
||||
doc.tags.add(*relevant_tags)
|
||||
|
||||
with open(pdf, "rb") as unencrypted:
|
||||
with open(doc.pdf_path, "wb") as encrypted:
|
||||
self._render(" Encrypting", 3)
|
||||
|
30
src/documents/migrations/0007_auto_20160126_2114.py
Normal file
30
src/documents/migrations/0007_auto_20160126_2114.py
Normal file
@ -0,0 +1,30 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9 on 2016-01-26 21:14
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """
    Add the ``match`` and ``matching_algorithm`` fields to the ``tag`` model
    and alter its ``colour`` field.
    """

    # Must be applied on top of the previous ``documents`` migration.
    dependencies = [
        ("documents", "0006_auto_20160123_0430"),
    ]

    operations = [
        # Text pattern the tag is matched against.
        migrations.AddField(
            model_name="tag",
            name="match",
            field=models.CharField(blank=True, max_length=256),
        ),
        # How ``match`` is interpreted; nullable, so it may be left unset.
        migrations.AddField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                blank=True,
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                ],
                help_text=(
                    "Which algorithm you want to use when matching text to the OCR'd "
                    "PDF. Here, \"any\" looks for any occurrence of any word provided "
                    "in the PDF, while \"all\" requires that every word provided "
                    "appear in the PDF, albeit not in the order provided. A "
                    "\"literal\" match means that the text you enter must appear in "
                    "the PDF exactly as you've entered it, and \"regular expression\" "
                    "uses a regex to match the PDF. If you don't know what a regex"
                    "is, you probably don't want this option."
                ),
                null=True,
            ),
        ),
        # Colour is stored as the integer key of one of these hex values.
        migrations.AlterField(
            model_name="tag",
            name="colour",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "#a6cee3"),
                    (2, "#1f78b4"),
                    (3, "#b2df8a"),
                    (4, "#33a02c"),
                    (5, "#fb9a99"),
                    (6, "#e31a1c"),
                    (7, "#fdbf6f"),
                    (8, "#ff7f00"),
                    (9, "#cab2d6"),
                    (10, "#6a3d9a"),
                    (11, "#b15928"),
                    (12, "#000000"),
                    (13, "#cccccc"),
                ],
                default=1,
            ),
        ),
    ]
|
@ -1,4 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
@ -46,7 +47,66 @@ class Tag(SluggedModel):
|
||||
(12, "#000000"),
|
||||
(13, "#cccccc")
|
||||
)
|
||||
|
||||
# Identifiers for the strategies ``matches()`` can apply to ``match``.
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCHING_ALGORITHMS = (
    (MATCH_ANY, "Any"),
    (MATCH_ALL, "All"),
    (MATCH_LITERAL, "Literal"),
    (MATCH_REGEX, "Regular Expression"),
)

# Stored as the integer key of an entry in COLOURS.
colour = models.PositiveIntegerField(choices=COLOURS, default=1)

# Pattern compared against document text; may be left blank.
match = models.CharField(max_length=256, blank=True)

# How ``match`` is interpreted; nullable, so a tag may have no algorithm.
# NOTE: help_text is serialised into migrations, so rewording it makes
# ``makemigrations`` emit an AlterField for this field.
matching_algorithm = models.PositiveIntegerField(
    choices=MATCHING_ALGORITHMS,
    blank=True,
    null=True,
    help_text=(
        "Which algorithm you want to use when matching text to the OCR'd "
        "PDF. Here, \"any\" looks for any occurrence of any word provided "
        "in the PDF, while \"all\" requires that every word provided "
        "appear in the PDF, albeit not in the order provided. A "
        "\"literal\" match means that the text you enter must appear in "
        "the PDF exactly as you've entered it, and \"regular expression\" "
        "uses a regex to match the PDF. If you don't know what a regex "
        "is, you probably don't want this option."
    )
)
|
||||
|
||||
@property
def conditions(self):
    """
    Return a one-line summary of this tag's matching rule, formatted as
    ``<name>: "<match>" (<algorithm label>)``.
    """
    template = "{}: \"{}\" ({})"
    parts = (
        self.name,
        self.match,
        self.get_matching_algorithm_display(),
    )
    return template.format(*parts)
|
||||
|
||||
def matches(self, text):
    """
    Report whether ``text`` satisfies this tag's matching rule.

    ``text`` is compared as given; no case folding happens here, so the
    caller is responsible for normalising case if that is wanted.

    * MATCH_ANY: at least one whitespace-separated word of ``self.match``
      occurs in ``text`` as a substring.
    * MATCH_ALL: every whitespace-separated word of ``self.match`` occurs
      in ``text`` as a substring, in any order.
    * MATCH_LITERAL: ``self.match`` occurs verbatim in ``text``.
    * MATCH_REGEX: ``self.match``, used as a regular expression, is found
      somewhere in ``text``.

    Returns a ``bool``. A tag whose pattern is empty or whitespace-only
    never matches, whatever its algorithm.

    Raises ``NotImplementedError`` when a non-empty pattern is paired with
    an unknown (or unset) algorithm, and ``re.error`` when a MATCH_REGEX
    pattern does not compile.
    """

    # An empty pattern must not tag anything: "" is a substring of every
    # string, so without this guard a blank tag would match every document
    # (or raise, if it also has no algorithm selected).
    if not self.match.strip():
        return False

    if self.matching_algorithm == self.MATCH_ALL:
        # split() without an argument drops the empty strings that
        # repeated spaces would otherwise produce.
        return all(word in text for word in self.match.split())

    if self.matching_algorithm == self.MATCH_ANY:
        return any(word in text for word in self.match.split())

    if self.matching_algorithm == self.MATCH_LITERAL:
        return self.match in text

    if self.matching_algorithm == self.MATCH_REGEX:
        # Collapse the match object to a bool so every branch returns the
        # same type.
        return bool(re.search(self.match, text))

    raise NotImplementedError("Unsupported matching algorithm")
|
||||
|
||||
def save(self, *args, **kwargs):
    """
    Lower-case ``self.match`` and then save through ``SluggedModel.save``,
    forwarding all positional and keyword arguments unchanged.
    """
    # NOTE(review): this lower-cases the pattern for every algorithm, so a
    # regular expression containing an upper-case escape such as \S or \D
    # is stored as \s or \d. Confirm whether that is intended.
    lowered_match = self.match.lower()
    self.match = lowered_match
    SluggedModel.save(self, *args, **kwargs)
|
||||
|
||||
|
||||
class Document(models.Model):
|
||||
|
Loading…
x
Reference in New Issue
Block a user