From 1f75af01373543d370218b6055f9afb525937c3b Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Mon, 7 Mar 2016 21:05:04 +0200 Subject: [PATCH] Extract filename parsing into testable class --- src/documents/consumer.py | 69 +++------------------------------------ src/documents/models.py | 65 +++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 66 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fbdbbc276..74aced5c0 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Correspondent, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log, FileInfo from .languages import ISO639 @@ -54,19 +54,6 @@ class Consumer(object): DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE - REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE = re.compile( - r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - def __init__(self): self.logger = logging.getLogger(__name__) @@ -105,7 +92,7 @@ class Consumer(object): if not os.path.isfile(doc): continue - if not re.match(self.REGEX_TITLE, doc): + if not re.match(FileInfo.REGEX_TITLE, doc): continue if doc in self._ignore: @@ -270,56 +257,8 @@ class Consumer(object): return re.sub(r"\s+", " ", r) def _guess_attributes_from_name(self, parseable): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - "<correspondent> - <title> - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" - """ - - def get_correspondent(correspondent_name): - return Correspondent.objects.get_or_create( - name=correspondent_name, - defaults={"slug": slugify(correspondent_name)} - )[0] - - def 
get_tags(tags): - r = [] - for t in tags.split(","): - r.append( - Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) - return tuple(r) - - def get_suffix(suffix): - suffix = suffix.lower() - if suffix == "jpeg": - return "jpg" - return suffix - - # First attempt: "<correspondent> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - get_tags(m.group(3)), - get_suffix(m.group(4)) - ) - - # Second attempt: "<correspondent> - <title>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - (), - get_suffix(m.group(3)) - ) - - # That didn't work, so we assume correspondent and tags are None - m = re.match(self.REGEX_TITLE, parseable) - return None, m.group(1), (), get_suffix(m.group(2)) + file_info = FileInfo.from_path(parseable) + return file_info.sender, file_info.title, file_info.tags, file_info.suffix def _store(self, text, doc, thumbnail): diff --git a/src/documents/models.py b/src/documents/models.py index bfc0224bd..c8342bf4a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -23,9 +23,72 @@ class FileInfo(object): self._file_mtime = file_mtime self._path = path + REGEX_TITLE = re.compile( + r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE = re.compile( + r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( + r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + @classmethod def from_path(cls, path): - pass + """ + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" + "<title>.<suffix>" + """ + + def get_correspondent(correspondent_name): + return 
Correspondent.objects.get_or_create( + name=correspondent_name, + defaults={"slug": slugify(correspondent_name)} + )[0] + + def get_tags(tags): + r = [] + for t in tags.split(","): + r.append( + Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) + return tuple(r) + + def get_suffix(suffix): + suffix = suffix.lower() + if suffix == "jpeg": + return "jpg" + return suffix + + # First attempt: "<correspondent> - <title> - <tags>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) + if m: + return cls( + title=m.group(2), + correspondent=get_correspondent(m.group(1)), + tags=get_tags(m.group(3)), + suffix=get_suffix(m.group(4)) + ) + + # Second attempt: "<correspondent> - <title>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) + if m: + return cls( + title=m.group(2), + correspondent=get_correspondent(m.group(1)), + tags=(), + suffix=get_suffix(m.group(3)) + ) + + # That didn't work, so we assume correspondent and tags are None + m = re.match(cls.REGEX_TITLE, path) + return FileInfo( + title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) @classmethod def from_document(cls, document):