Extract filename parsing into testable class

2026-01-30 23:08:59 -06:00 · 2016-03-07 21:05:04 +02:00
parent 0b34894db9
commit 1f75af0137
2 changed files with 68 additions and 66 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError

 from paperless.db import GnuPG

-from .models import Correspondent, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log, FileInfo
 from .languages import ISO639


@@ -54,19 +54,6 @@ class Consumer(object):

    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

-    REGEX_TITLE = re.compile(
-        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-    REGEX_CORRESPONDENT_TITLE = re.compile(
-        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
-        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
-        flags=re.IGNORECASE
-    )
-
    def __init__(self):

        self.logger = logging.getLogger(__name__)
@@ -105,7 +92,7 @@ class Consumer(object):
            if not os.path.isfile(doc):
                continue

-            if not re.match(self.REGEX_TITLE, doc):
+            if not re.match(FileInfo.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
@@ -270,56 +257,8 @@ class Consumer(object):
        return re.sub(r"\s+", " ", r)

    def _guess_attributes_from_name(self, parseable):
-        """
-        We use a crude naming convention to make handling the correspondent,
-        title, and tags easier:
-          "<correspondent> - <title> - <tags>.<suffix>"
-          "<correspondent> - <title>.<suffix>"
-          "<title>.<suffix>"
-        """
-
-        def get_correspondent(correspondent_name):
-            return Correspondent.objects.get_or_create(
-                name=correspondent_name,
-                defaults={"slug": slugify(correspondent_name)}
-            )[0]
-
-        def get_tags(tags):
-            r = []
-            for t in tags.split(","):
-                r.append(
-                    Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
-            return tuple(r)
-
-        def get_suffix(suffix):
-            suffix = suffix.lower()
-            if suffix == "jpeg":
-                return "jpg"
-            return suffix
-
-        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
-        m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
-        if m:
-            return (
-                get_correspondent(m.group(1)),
-                m.group(2),
-                get_tags(m.group(3)),
-                get_suffix(m.group(4))
-            )
-
-        # Second attempt: "<correspondent> - <title>.<suffix>"
-        m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
-        if m:
-            return (
-                get_correspondent(m.group(1)),
-                m.group(2),
-                (),
-                get_suffix(m.group(3))
-            )
-
-        # That didn't work, so we assume correspondent and tags are None
-        m = re.match(self.REGEX_TITLE, parseable)
-        return None, m.group(1), (), get_suffix(m.group(2))
+        file_info = FileInfo.from_path(parseable)
+        return file_info.sender, file_info.title, file_info.tags, file_info.suffix

    def _store(self, text, doc, thumbnail):

--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -23,9 +23,72 @@ class FileInfo(object):
        self._file_mtime = file_mtime
        self._path = path

+    REGEX_TITLE = re.compile(
+        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+    REGEX_CORRESPONDENT_TITLE = re.compile(
+        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
+        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
+        flags=re.IGNORECASE
+    )
+
    @classmethod
    def from_path(cls, path):
-        pass
+        """
+        We use a crude naming convention to make handling the correspondent,
+        title, and tags easier:
+          "<correspondent> - <title> - <tags>.<suffix>"
+          "<correspondent> - <title>.<suffix>"
+          "<title>.<suffix>"
+        """
+
+        def get_correspondent(correspondent_name):
+            return Correspondent.objects.get_or_create(
+                name=correspondent_name,
+                defaults={"slug": slugify(correspondent_name)}
+            )[0]
+
+        def get_tags(tags):
+            r = []
+            for t in tags.split(","):
+                r.append(
+                    Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
+            return tuple(r)
+
+        def get_suffix(suffix):
+            suffix = suffix.lower()
+            if suffix == "jpeg":
+                return "jpg"
+            return suffix
+
+        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
+        m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
+        if m:
+            return cls(
+                title=m.group(2),
+                correspondent=get_correspondent(m.group(1)),
+                tags=get_tags(m.group(3)),
+                suffix=get_suffix(m.group(4))
+            )
+
+        # Second attempt: "<correspondent> - <title>.<suffix>"
+        m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
+        if m:
+            return cls(
+                title=m.group(2),
+                correspondent=get_correspondent(m.group(1)),
+                tags=(),
+                suffix=get_suffix(m.group(3))
+            )
+
+        # That didn't work, so we assume there is no correspondent and no tags
+        m = re.match(cls.REGEX_TITLE, path)
+        return FileInfo(
+            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @classmethod
    def from_document(cls, document):