Extract filename parsing into testable class

2025-07-24 18:04:39 -05:00 · 2016-03-07 21:05:04 +02:00 · 2016-03-07 21:05:04 +02:00 · 1f75af0137
commit 1f75af0137
parent 0b34894db9
2 changed files with 68 additions and 66 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
 from paperless.db import GnuPG
-from .models import Correspondent, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log, FileInfo
 from .languages import ISO639
@ -54,19 +54,6 @@ class Consumer(object):
    # Default OCR language, read from the project settings.
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    # File-name patterns, from least to most specific. Every pattern requires
    # at least one "/" before the name and one of the listed extensions, and
    # is compiled case-insensitively.
    # NOTE(review): the enclosing hunk shrinks from 19 to 6 lines and the same
    # three patterns are defined on FileInfo, so these copies appear to be the
    # ones this change removes — confirm against the real patch.
    #
    # "<title>.<suffix>": group 1 = title, group 2 = suffix.
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title>.<suffix>":
    # group 1 = correspondent, group 2 = title, group 3 = suffix.
    # NOTE(review): "." also matches "/", so when the file name itself has no
    # " - " the match can backtrack to a " - " inside a directory name.
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title> - <tags>.<suffix>":
    # group 1 = correspondent, group 2 = title, group 3 = tags (letters,
    # digits, "-" and ","; may be empty), group 4 = suffix.
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    def __init__(self):
        self.logger = logging.getLogger(__name__)
@ -105,7 +92,7 @@ class Consumer(object):
            if not os.path.isfile(doc):
                continue
-            if not re.match(self.REGEX_TITLE, doc):
+            if not re.match(FileInfo.REGEX_TITLE, doc):
                continue
            if doc in self._ignore:
@ -270,56 +257,8 @@ class Consumer(object):
        return re.sub(r"\s+", " ", r)
    # Guess correspondent, title, tags and suffix from a file path.
    # The two "+" lines below form the new body: parsing is delegated to
    # FileInfo.from_path() and the result is returned as a 4-tuple.
    # The unmarked lines that follow appear to be the previous inline
    # implementation (the hunk header shrinks this region from 56 to 8
    # lines) — presumably removed by this change; verify against the patch.
    # NOTE(review): from_path() constructs FileInfo with `correspondent=`,
    # while the new body reads `file_info.sender`; FileInfo's attributes are
    # not visible here — confirm that `sender` exists.
    def _guess_attributes_from_name(self, parseable):
-        """
+        file_info = FileInfo.from_path(parseable)
-        We use a crude naming convention to make handling the correspondent,
+        return file_info.sender, file_info.title, file_info.tags, file_info.suffix
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"
        """
        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]
        def get_tags(tags):
            r = []
            for t in tags.split(","):
                r.append(
                    Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
            return tuple(r)
        def get_suffix(suffix):
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix
        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
        if m:
            return (
                get_correspondent(m.group(1)),
                m.group(2),
                get_tags(m.group(3)),
                get_suffix(m.group(4))
            )
        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
        if m:
            return (
                get_correspondent(m.group(1)),
                m.group(2),
                (),
                get_suffix(m.group(3))
            )
        # That didn't work, so we assume correspondent and tags are None
        m = re.match(self.REGEX_TITLE, parseable)
        return None, m.group(1), (), get_suffix(m.group(2))
    def _store(self, text, doc, thumbnail):
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -23,9 +23,72 @@ class FileInfo(object):
        self._file_mtime = file_mtime
        self._path = path
    # File-name patterns used by from_path(), listed from least to most
    # specific. Each one requires at least one "/" before the file name and
    # one of the listed extensions; matching is case-insensitive.
    #
    # "<title>.<suffix>": group 1 = title, group 2 = suffix.
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title>.<suffix>":
    # group 1 = correspondent, group 2 = title, group 3 = suffix.
    # NOTE(review): "." also matches "/", so when the file name itself has no
    # " - " the match can backtrack to a " - " inside a directory name
    # (e.g. "/a - b/c.pdf" yields correspondent "a" and title "b/c").
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title> - <tags>.<suffix>":
    # group 1 = correspondent, group 2 = title, group 3 = tags (letters,
    # digits, "-" and ","; may be empty), group 4 = suffix.
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    @classmethod
    def from_path(cls, path):
-        pass
+        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        The three class-level patterns are tried from most to least specific
        and the first one that matches decides which fields are filled in.
        Every pattern expects at least one "/" before the file name.

        Returns an instance of ``cls`` built with ``title``, ``tags`` and
        ``suffix``, plus ``correspondent`` when the name carries one.
        Raises ValueError if ``path`` matches none of the patterns.
        """
        def get_correspondent(correspondent_name):
            # Look the correspondent up by name, creating it with a slugified
            # name the first time it is seen.
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]
        def get_tags(tags):
            # Skip empty entries: the tags group may be empty or contain
            # stray commas, and "".split(",") yields [""], which would
            # otherwise create a Tag with an empty slug.
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",") if t
            )
        def get_suffix(suffix):
            # Normalise case and fold "jpeg" into "jpg".
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix
        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )
        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )
        # That didn't work, so we assume correspondent and tags are None
        m = re.match(cls.REGEX_TITLE, path)
        if m is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                "Cannot parse file name from path: {!r}".format(path))
        # Use cls (not FileInfo) so subclasses get instances of themselves,
        # consistent with the two branches above.
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
    @classmethod
    def from_document(cls, document):