Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction

2026-02-24 00:59:35 -06:00 · 2016-03-19 15:44:35 +00:00
parent c199b0498d a22f088e28
commit cf5076bcad
3 changed files with 197 additions and 80 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError

 from paperless.db import GnuPG

-from .models import Correspondent, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log, FileInfo
 from .languages import ISO639


@@ -54,19 +54,6 @@ class Consumer(object):

    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

-    REGEX_TITLE = re.compile(
-        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-    REGEX_CORRESPONDENT_TITLE = re.compile(
-        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
-        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-
    def __init__(self):

        self.logger = logging.getLogger(__name__)
@@ -105,7 +92,7 @@ class Consumer(object):
            if not os.path.isfile(doc):
                continue

-            if not re.match(self.REGEX_TITLE, doc):
+            if not re.match(FileInfo.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
@@ -269,72 +256,20 @@ class Consumer(object):
        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", r)

-    def _guess_attributes_from_name(self, parseable):
-        """
-        We use a crude naming convention to make handling the correspondent,
-        title, and tags easier:
-          "<correspondent> - <title> - <tags>.<suffix>"
-          "<correspondent> - <title>.<suffix>"
-          "<title>.<suffix>"
-        """
-
-        def get_correspondent(correspondent_name):
-            return Correspondent.objects.get_or_create(
-                name=correspondent_name,
-                defaults={"slug": slugify(correspondent_name)}
-            )[0]
-
-        def get_tags(tags):
-            r = []
-            for t in tags.split(","):
-                r.append(
-                    Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
-            return tuple(r)
-
-        def get_suffix(suffix):
-            suffix = suffix.lower()
-            if suffix == "jpeg":
-                return "jpg"
-            return suffix
-
-        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
-        m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
-        if m:
-            return (
-                get_correspondent(m.group(1)),
-                m.group(2),
-                get_tags(m.group(3)),
-                get_suffix(m.group(4))
-            )
-
-        # Second attempt: "<correspondent> - <title>.<suffix>"
-        m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
-        if m:
-            return (
-                get_correspondent(m.group(1)),
-                m.group(2),
-                (),
-                get_suffix(m.group(3))
-            )
-
-        # That didn't work, so we assume correspondent and tags are None
-        m = re.match(self.REGEX_TITLE, parseable)
-        return None, m.group(1), (), get_suffix(m.group(2))
-
    def _store(self, text, doc, thumbnail):

-        sender, title, tags, file_type = self._guess_attributes_from_name(doc)
-        relevant_tags = set(list(Tag.match_all(text)) + list(tags))
+        file_info = FileInfo.from_path(doc)
+        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))

        stats = os.stat(doc)

        self.log("debug", "Saving record to database")

        document = Document.objects.create(
-            correspondent=sender,
-            title=title,
+            correspondent=file_info.correspondent,
+            title=file_info.title,
            content=text,
-            file_type=file_type,
+            file_type=file_info.suffix,
            created=timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime)),
            modified=timezone.make_aware(
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -12,6 +12,97 @@ from django.utils import timezone
 from .managers import LogManager


+class FileInfo(object):
+    """Title, suffix, correspondent and tags parsed from a file path."""
+
+    def __init__(self, title, suffix, correspondent=None, tags=None):
+        self._title = title
+        self._suffix = suffix
+        self._correspondent = correspondent
+        self._tags = () if tags is None else tags  # always iterable
+
+    # Captured groups use [^/] so directory names never leak into a field.
+    REGEX_TITLE = re.compile(
+        r"^.*/([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+    REGEX_CORRESPONDENT_TITLE = re.compile(
+        r"^.*/([^/]+) - ([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
+        r"^.*/([^/]*) - ([^/]*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+
+    @classmethod
+    def from_path(cls, path):
+        """
+        We use a crude naming convention to make handling the correspondent,
+        title, and tags easier:
+          "<correspondent> - <title> - <tags>.<suffix>"
+          "<correspondent> - <title>.<suffix>"
+          "<title>.<suffix>"
+
+        Patterns are tried in that order.  Correspondent and Tag rows come
+        from get_or_create(), so parsing may write to the database.
+        """
+
+        def get_correspondent(correspondent_name):
+            return Correspondent.objects.get_or_create(
+                name=correspondent_name,
+                defaults={"slug": slugify(correspondent_name)}
+            )[0]
+
+        def get_tags(tags):
+            return tuple(
+                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
+                for t in tags.split(","))
+
+        def get_suffix(suffix):
+            suffix = suffix.lower()
+            return "jpg" if suffix == "jpeg" else suffix
+
+        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
+        m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
+        if m:
+            return cls(
+                title=m.group(2),
+                correspondent=get_correspondent(m.group(1)),
+                tags=get_tags(m.group(3)),
+                suffix=get_suffix(m.group(4))
+            )
+
+        # Second attempt: "<correspondent> - <title>.<suffix>"
+        m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
+        if m:
+            return cls(
+                title=m.group(2),
+                correspondent=get_correspondent(m.group(1)),
+                tags=(),
+                suffix=get_suffix(m.group(3))
+            )
+
+        # That didn't work, so there is no correspondent and no tags
+        m = re.match(cls.REGEX_TITLE, path)
+        return cls(title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
+
+    @property
+    def title(self):
+        return self._title
+
+    @property
+    def correspondent(self):
+        return self._correspondent
+
+    @property
+    def tags(self):
+        return self._tags
+
+    @property
+    def suffix(self):
+        return self._suffix
+
 class SluggedModel(models.Model):

    name = models.CharField(max_length=128, unique=True)
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,12 +1,11 @@
 from django.test import TestCase

-from ..consumer import Consumer
+from ..models import FileInfo


 class TestAttachment(TestCase):

    TAGS = ("tag1", "tag2", "tag3")
-    CONSUMER = Consumer()
    SUFFIXES = (
        "pdf", "png", "jpg", "jpeg", "gif",
        "PDF", "PNG", "JPG", "JPEG", "GIF",
@@ -16,14 +15,14 @@ class TestAttachment(TestCase):
    def _test_guess_attributes_from_name(self, path, sender, title, tags):
        for suffix in self.SUFFIXES:
            f = path.format(suffix)
-            results = self.CONSUMER._guess_attributes_from_name(f)
-            self.assertEqual(results[0].name, sender, f)
-            self.assertEqual(results[1], title, f)
-            self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
+            file_info = FileInfo.from_path(f)
+            name = getattr(file_info.correspondent, "name", None)  # None-safe
+            self.assertEqual((name, file_info.title), (sender, title), f)
+            self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
            if suffix.lower() == "jpeg":
-                self.assertEqual(results[3], "jpg", f)
+                self.assertEqual(file_info.suffix, "jpg", f)
            else:
-                self.assertEqual(results[3], suffix.lower(), f)
+                self.assertEqual(file_info.suffix, suffix.lower(), f)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
@@ -92,3 +91,95 @@ class TestAttachment(TestCase):
            "Τιτλε",
            self.TAGS
        )
+
+    def test_guess_attributes_from_name_when_correspondent_empty(self):
+        # Expect no correspondent: the leading " - " stays in the title.
+        self._test_guess_attributes_from_name(
+            '/path/to/ - weird empty correspondent but should not break.{}',
+            sender=None,
+            title=' - weird empty correspondent but should not break',
+            tags=())
+
+    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
+        # Expect no correspondent: the leading "- " stays in the title.
+        self._test_guess_attributes_from_name(
+            '/path/to/- weird but should not break.{}',
+            sender=None,
+            title='- weird but should not break',
+            tags=())
+
+    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
+        # Expect no correspondent: the trailing " -" stays in the title.
+        self._test_guess_attributes_from_name(
+            '/path/to/weird but should not break -.{}',
+            sender=None,
+            title='weird but should not break -',
+            tags=())
+
+    def test_guess_attributes_from_name_when_title_is_empty(self):
+        # Expect the text before the trailing " - " as correspondent, title ''.
+        self._test_guess_attributes_from_name(
+            '/path/to/weird correspondent but should not break - .{}',
+            sender='weird correspondent but should not break',
+            title='',
+            tags=())
+
+
+class Permutations(TestCase):
+    # Building blocks that the tests below combine into file names.
+    valid_correspondents = ['timmy', 'Dr. McWheelie',
+                            'Dash Gor-don', 'ο Θερμαστής']
+    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
+    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
+    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']
+
+    def _test_guessed_attributes(
+            self, filename, title, suffix, correspondent=None, tags=None):
+        # Parse ``filename`` and compare every field with the expected value;
+        # ``filename`` doubles as the failure message of each assertion.
+        parsed = FileInfo.from_path(filename)
+
+        # Required
+        self.assertEqual(parsed.title, title, filename)
+        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix
+        self.assertEqual(parsed.suffix, expected_suffix, filename)
+        # Optional
+        if correspondent is not None:
+            self.assertEqual(parsed.correspondent.name,
+                             correspondent, filename)
+        else:
+            self.assertEqual(parsed.correspondent, None, filename)
+        if tags is not None:
+            self.assertEqual([tag.slug for tag in parsed.tags],
+                             tags.split(','), filename)
+        else:
+            self.assertEqual(parsed.tags, (), filename)
+
+    def test_just_title(self):
+        # File names of the form "<title>.<suffix>"
+        for title in self.valid_titles:
+            for suffix in self.valid_suffixes:
+                expected = {'title': title, 'suffix': suffix}
+                name = '/path/to/{title}.{suffix}'.format(**expected)
+                self._test_guessed_attributes(name, **expected)
+
+    def test_title_and_correspondent(self):
+        pattern = '/path/to/{correspondent} - {title}.{suffix}'
+        for sender in self.valid_correspondents:
+            for title in self.valid_titles:
+                for suffix in self.valid_suffixes:
+                    expected = {'correspondent': sender, 'title': title,
+                                'suffix': suffix}
+                    name = pattern.format(**expected)
+                    self._test_guessed_attributes(name, **expected)
+
+    def test_title_and_correspondent_and_tags(self):
+        pattern = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
+        for sender in self.valid_correspondents:
+            for title in self.valid_titles:
+                for tags in self.valid_tags:
+                    for suffix in self.valid_suffixes:
+                        expected = {'correspondent': sender, 'title': title,
+                                    'tags': tags, 'suffix': suffix}
+                        name = pattern.format(**expected)
+                        self._test_guessed_attributes(name, **expected)