Merge branch 'master' into feature/api

This commit is contained in:
Daniel Quinn
2016-02-20 22:55:42 +00:00
17 changed files with 678 additions and 37 deletions

View File

@@ -1,21 +1,23 @@
import datetime
import glob
import tempfile
from multiprocessing.pool import Pool
import itertools
import langdetect
import os
import random
import re
import subprocess
import pyocr
import shutil
from PIL import Image
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from logger.models import Log
from paperless.db import GnuPG
@@ -27,6 +29,12 @@ from .languages import ISO639
def image_to_string(args):
    """
    OCR a single page image and return the recognised text.

    ``args`` is one ``(consumer, png, lang)`` tuple rather than three
    separate parameters: ``consumer`` supplies ``SCRATCH`` (the directory
    ``png`` is resolved against) and ``OCR`` (the OCR tool), and ``lang`` is
    passed straight through to that tool.
    """
    consumer, filename, language = args
    image_path = os.path.join(consumer.SCRATCH, filename)

    with Image.open(image_path) as page:
        if consumer.OCR.can_detect_orientation():
            try:
                detected = consumer.OCR.detect_orientation(
                    page, lang=language)
                page = page.rotate(detected["angle"], expand=1)
            except TesseractError:
                # Orientation detection is best-effort: on failure the
                # page is OCR'd exactly as it was opened.
                pass
        return consumer.OCR.image_to_string(page, lang=language)
@@ -111,34 +119,41 @@ class Consumer(object):
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
pngs = self._get_greyscale(doc)
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc)
try:
text = self._get_ocr(pngs)
self._store(text, doc)
except OCRError:
self._ignore.append(doc)
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
self._cleanup_tempdir(tempdir)
continue
else:
self._cleanup_tempdir(tempdir)
self._cleanup_doc(doc)
self._store(text, doc)
self._cleanup(pngs, doc)
def _get_greyscale(self, tempdir, doc):
    """
    Render ``doc`` as greyscale page images inside ``tempdir``.

    Runs the external program configured as ``self.CONVERT`` and waits for
    it to finish, then returns the sorted paths of every regular file in
    ``tempdir`` whose name starts with ``convert``.
    """
    Log.debug(
        "Generating greyscale image from {}".format(doc),
        Log.COMPONENT_CONSUMER
    )

    # The "%04d" placeholder is handed to the converter untouched;
    # presumably it expands it to a per-page number -- confirm against the
    # converter's documentation.
    output_pattern = os.path.join(tempdir, "convert-%04d.jpg")

    subprocess.Popen((
        self.CONVERT, "-density", "300", "-depth", "8",
        "-type", "grayscale", doc, output_pattern
    )).wait()

    # Collect what was actually written rather than predicting file names.
    candidates = (
        os.path.join(tempdir, name)
        for name in os.listdir(tempdir)
        if name.startswith("convert")
    )
    return sorted(path for path in candidates if os.path.isfile(path))
@staticmethod
def _guess_language(text):
@@ -271,11 +286,7 @@ class Consumer(object):
def _store(self, text, doc):
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
tags = list(tags)
lower_text = text.lower()
relevant_tags = set(
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
stats = os.stat(doc)
@@ -303,14 +314,15 @@ class Consumer(object):
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
encrypted.write(GnuPG.encrypted(unencrypted))
@staticmethod
def _cleanup_tempdir(d):
    """
    Log and then recursively delete the directory ``d``.

    Any error raised by ``shutil.rmtree`` propagates to the caller.
    """
    Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
    shutil.rmtree(d)
@staticmethod
def _cleanup_doc(doc):
    """
    Log and then delete the file at path ``doc``.

    Any error raised by ``os.unlink`` propagates to the caller.
    """
    message = "Deleting document {}".format(doc)
    Log.debug(message, Log.COMPONENT_CONSUMER)
    os.unlink(doc)
def _is_ready(self, doc):
"""

View File

@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
self.verbosity = options["verbosity"]
for document in Document.objects.all():
tags = Tag.objects.exclude(
pk__in=document.tags.values_list("pk", flat=True))
for tag in tags:
if tag.matches(document.content):
print('Tagging {} with "{}"'.format(document, tag))
document.tags.add(tag)
for tag in Tag.match_all(document.content, tags):
print('Tagging {} with "{}"'.format(document, tag))
document.tags.add(tag)

View File

@@ -0,0 +1,23 @@
"""
Source:
https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
License:
MIT
Copyright (c) 2016 Baptiste Mispelon
"""
import sys
from django.core.management.commands.loaddata import Command as LoadDataCommand
class Command(LoadDataCommand):
    """
    ``loaddata`` variant that reads a JSON fixture from standard input when
    the fixture label is ``-``; every other label is handled by the stock
    command.
    """

    def parse_name(self, fixture_name):
        """
        Split a fixture name into ``(name, serialization format,
        compression format)``.

        ``-`` maps to a JSON fixture using the pseudo compression format
        ``stdin``; any other name is parsed by the parent class.
        """
        # Register a pseudo "compression format" whose opener ignores the
        # two arguments it is called with and hands back stdin.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)

        if fixture_name == '-':
            return '-', 'json', 'stdin'

        # Without this fallback the method implicitly returned None for
        # ordinary fixture names, which cannot be unpacked into the
        # three-part result.
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """
        Return the fixture candidates for ``fixture_label``.

        ``-`` short-circuits the filesystem search with a single synthetic
        entry; any other label is resolved by the parent class.
        """
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)

View File

@@ -86,28 +86,40 @@ class Tag(SluggedModel):
return "{}: \"{}\" ({})".format(
self.name, self.match, self.get_matching_algorithm_display())
@classmethod
def match_all(cls, text, tags=None):
    """
    Lazily yield each tag whose ``matches()`` accepts ``text``.

    ``text`` is lower-cased once before any tag is consulted. When ``tags``
    is ``None`` the candidates are ``cls.objects.all()``; otherwise only the
    supplied tags are tested, in the order given.
    """
    candidates = cls.objects.all() if tags is None else tags
    lowered = text.lower()

    for candidate in candidates:
        if not candidate.matches(lowered):
            continue
        yield candidate
def matches(self, text):
    """
    Return ``True`` when ``text`` satisfies this tag's ``match`` rule.

    The rule is interpreted according to ``self.matching_algorithm``:

    * ``MATCH_ALL``     - every whitespace-separated word of ``match`` occurs
      in ``text`` as a whole word.
    * ``MATCH_ANY``     - at least one such word occurs as a whole word.
    * ``MATCH_LITERAL`` - the entire ``match`` string occurs, delimited by
      word boundaries.
    * ``MATCH_REGEX``   - ``match`` is used as a regular expression and
      searched for anywhere in ``text``.

    A blank ``match`` never matches. Raises ``NotImplementedError`` for any
    other algorithm value.
    """

    # Check that match is not empty
    if self.match.strip() == "":
        return False

    def contains(fragment):
        # The fragment is user-entered plain text, not a pattern: escape it
        # so characters such as "." or "(" are matched literally instead of
        # being interpreted (or rejected) by the regex engine.
        pattern = r"\b{}\b".format(re.escape(fragment))
        return bool(re.search(pattern, text))

    if self.matching_algorithm == self.MATCH_ALL:
        # split() without an argument drops the empty "words" that repeated
        # spaces would otherwise produce.
        return all(contains(word) for word in self.match.split())

    if self.matching_algorithm == self.MATCH_ANY:
        return any(contains(word) for word in self.match.split())

    if self.matching_algorithm == self.MATCH_LITERAL:
        return contains(self.match)

    if self.matching_algorithm == self.MATCH_REGEX:
        # Here the match string is a pattern by design, so no escaping.
        return bool(re.search(re.compile(self.match), text))

    raise NotImplementedError("Unsupported matching algorithm")

View File

@@ -0,0 +1,120 @@
from django.test import TestCase
from ..models import Tag
class TestTagMatching(TestCase):
    """
    Exercise ``Tag.matches()`` under each matching algorithm, once with
    alphabetic words and (except for the regex case) once with numeric ones.
    """

    def test_match_all(self):
        """MATCH_ALL requires every word to be present as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_any(self):
        """MATCH_ANY requires at least one word present as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have alpha in me"))
        self.assertTrue(t.matches("I have charlie in me"))
        self.assertTrue(t.matches("I have gamma in me"))
        self.assertTrue(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have 12 in me"))
        self.assertTrue(t.matches("I have 34 in me"))
        self.assertTrue(t.matches("I have 56 in me"))
        self.assertTrue(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_literal(self):
        """MATCH_LITERAL requires the whole phrase, verbatim."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))
        self.assertTrue(t.matches("I have 12 34 56 in me"))

    def test_match_regex(self):
        """MATCH_REGEX treats the match string as a regular expression."""

        t = Tag.objects.create(
            name="Test 0",
            # Raw string: "\w" is a regex class, not a string escape.
            match=r"alpha\w+gamma",
            matching_algorithm=Tag.MATCH_REGEX
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
        self.assertFalse(t.matches("I have alpha and gamma in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))