Merge branch 'master' into feature/api

2025-12-14 01:21:14 -06:00 · 2016-02-20 22:55:42 +00:00
parent cebc44f2c9 224f4acdc3
commit a5124cade6
17 changed files with 678 additions and 37 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,21 +1,23 @@
 import datetime
-import glob
+import tempfile
 from multiprocessing.pool import Pool

 import itertools
+
 import langdetect
 import os
-import random
 import re
 import subprocess

 import pyocr
+import shutil

 from PIL import Image

 from django.conf import settings
 from django.utils import timezone
 from django.template.defaultfilters import slugify
+from pyocr.tesseract import TesseractError

 from logger.models import Log
 from paperless.db import GnuPG
@@ -27,6 +29,12 @@ from .languages import ISO639
 def image_to_string(args):
    self, png, lang = args
    with Image.open(os.path.join(self.SCRATCH, png)) as f:
+        if self.OCR.can_detect_orientation():
+            try:
+                orientation = self.OCR.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass  # best effort: OCR the image unrotated
        return self.OCR.image_to_string(f, lang=lang)


@@ -111,34 +119,41 @@ class Consumer(object):

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

-            pngs = self._get_greyscale(doc)
+            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
+            pngs = self._get_greyscale(tempdir, doc)

            try:
                text = self._get_ocr(pngs)
+                self._store(text, doc)
            except OCRError:
                self._ignore.append(doc)
                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self._cleanup_tempdir(tempdir)
                continue
+            else:
+                self._cleanup_tempdir(tempdir)
+                self._cleanup_doc(doc)

-            self._store(text, doc)
-            self._cleanup(pngs, doc)
-
-    def _get_greyscale(self, doc):
+    def _get_greyscale(self, tempdir, doc):

        Log.debug(
            "Generating greyscale image from {}".format(doc),
            Log.COMPONENT_CONSUMER
        )

-        i = random.randint(1000000, 9999999)
-        png = os.path.join(self.SCRATCH, "{}.png".format(i))
+        png = os.path.join(tempdir, "convert-%04d.jpg")

        subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        )).wait()

-        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
+        # Gather what convert left in tempdir; only regular files count.
+        produced = (
+            os.path.join(tempdir, name)
+            for name in os.listdir(tempdir) if name.startswith("convert")
+        )
+        return sorted(filter(os.path.isfile, produced))

    @staticmethod
    def _guess_language(text):
@@ -271,11 +286,7 @@ class Consumer(object):
    def _store(self, text, doc):

        sender, title, tags, file_type = self._guess_attributes_from_name(doc)
-        tags = list(tags)
-
-        lower_text = text.lower()
-        relevant_tags = set(
-            [t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
+        relevant_tags = set(list(Tag.match_all(text)) + list(tags))

        stats = os.stat(doc)

@@ -303,14 +314,15 @@ class Consumer(object):
                Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
                encrypted.write(GnuPG.encrypted(unencrypted))

-    def _cleanup(self, pngs, doc):
+    @staticmethod
+    def _cleanup_tempdir(d):
+        Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
+        shutil.rmtree(d)  # recursive: d and everything under it

-        png_glob = os.path.join(
-            self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
-
-        for f in list(glob.glob(png_glob)) + [doc]:
-            Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
-            os.unlink(f)
+    @staticmethod
+    def _cleanup_doc(doc):
+        Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
+        os.unlink(doc)  # removes the single file at ``doc``

    def _is_ready(self, doc):
        """
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
        self.verbosity = options["verbosity"]

        for document in Document.objects.all():
+
            tags = Tag.objects.exclude(
                pk__in=document.tags.values_list("pk", flat=True))
-            for tag in tags:
-                if tag.matches(document.content):
-                    print('Tagging {} with "{}"'.format(document, tag))
-                    document.tags.add(tag)
+
+            for tag in Tag.match_all(document.content, tags):
+                print('Tagging {} with "{}"'.format(document, tag))
+                document.tags.add(tag)
--- a/src/documents/management/commands/loaddata_stdin.py
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -0,0 +1,37 @@
+"""
+Source:
+    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
+
+License:
+    MIT
+    Copyright (c) 2016 Baptiste Mispelon
+"""
+import sys
+
+from django.core.management.commands.loaddata import Command as LoadDataCommand
+
+
+class Command(LoadDataCommand):
+    """``loaddata`` that also reads a JSON fixture from stdin (``-``)."""
+
+    def parse_name(self, fixture_name):
+        """
+        Return ``(name, serialization, compression)`` for a fixture name.
+
+        ``-`` maps to JSON read through the ``stdin`` pseudo-format; every
+        other name is handed to the base class.
+        """
+        # The opener stored under "stdin" ignores both of its arguments and
+        # returns sys.stdin.
+        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
+        if fixture_name == '-':
+            return '-', 'json', 'stdin'
+        # Without this fallback the method returned None for any other name,
+        # which callers expecting the 3-tuple above cannot unpack.
+        return super(Command, self).parse_name(fixture_name)
+
+    def find_fixtures(self, fixture_label):
+        """Return a stdin placeholder for ``-``; otherwise defer to Django."""
+        if fixture_label == '-':
+            return [('-', None, '-')]
+        return super(Command, self).find_fixtures(fixture_label)
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -86,28 +86,57 @@ class Tag(SluggedModel):
        return "{}: \"{}\" ({})".format(
            self.name, self.match, self.get_matching_algorithm_display())

+    @classmethod
+    def match_all(cls, text, tags=None):
+        """
+        Yield every tag in ``tags`` whose rule matches ``text``.
+
+        ``tags`` defaults to all tags. ``text`` is lower-cased once here
+        before it is passed to each tag's ``matches()``.
+        """
+        if tags is None:
+            tags = cls.objects.all()
+
+        text = text.lower()
+        for tag in tags:
+            if tag.matches(text):
+                yield tag
+
    def matches(self, text):
+        """
+        Return True if ``text`` satisfies this tag's matching algorithm.
+
+        Words and literal phrases are regex-escaped and must sit on word
+        boundaries; an empty match string never matches. Nothing here folds
+        case: ``match_all()`` lower-cases ``text`` before calling.
+
+        Raises NotImplementedError for an unknown matching algorithm.
+        """
        # Check that match is not empty
        if self.match.strip() == "":
            return False

        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self.match.split(" "):
-                if word not in text:
+            # split() skips the empty "words" that repeated spaces produce.
+            for word in self.match.split():
+                if not re.search(r"\b{}\b".format(re.escape(word)), text):
                    return False
            return True

        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self.match.split(" "):
-                if word in text:
+            # An empty word would become \b\b and match almost any text.
+            for word in self.match.split():
+                if re.search(r"\b{}\b".format(re.escape(word)), text):
                    return True
            return False

        if self.matching_algorithm == self.MATCH_LITERAL:
-            return self.match in text
+            # Escaped so "." or "+" in the phrase are not regex operators.
+            literal = re.escape(self.match)
+            return bool(re.search(r"\b{}\b".format(literal), text))

        if self.matching_algorithm == self.MATCH_REGEX:
-            return re.search(re.compile(self.match), text)
+            return bool(re.search(re.compile(self.match), text))

        raise NotImplementedError("Unsupported matching algorithm")

--- a/src/documents/tests/test_tags.py
+++ b/src/documents/tests/test_tags.py
@@ -0,0 +1,120 @@
+from django.test import TestCase
+
+from ..models import Tag
+
+
+class TestTagMatching(TestCase):
+
+    def test_match_all(self):
+        """MATCH_ALL needs every word present as a whole word."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_any(self):
+        """MATCH_ANY needs at least one word present as a whole word."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+
+        self.assertTrue(t.matches("I have alpha in me"))
+        self.assertTrue(t.matches("I have charlie in me"))
+        self.assertTrue(t.matches("I have gamma in me"))
+        self.assertTrue(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+        self.assertTrue(t.matches("I have 12 in me"))
+        self.assertTrue(t.matches("I have 34 in me"))
+        self.assertTrue(t.matches("I have 56 in me"))
+        self.assertTrue(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_literal(self):
+        """MATCH_LITERAL needs the exact phrase on word boundaries."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+        self.assertTrue(t.matches("I have 12 34 56 in me"))
+
+    def test_match_regex(self):
+        """MATCH_REGEX searches the text with the tag's own pattern."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match=r"alpha\w+gamma",
+            matching_algorithm=Tag.MATCH_REGEX
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
+        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
+        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
+        self.assertFalse(t.matches("I have alpha and gamma in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+