From 0b34894db9706dfaeea683fc6e0b1f6890eb2efc Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Mon, 7 Mar 2016 20:42:25 +0200 Subject: [PATCH 1/8] Add `FileInfo` class with `pass` implementations --- src/documents/models.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/documents/models.py b/src/documents/models.py index 0d79dba0a..bfc0224bd 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -12,6 +12,35 @@ from django.utils import timezone from .managers import LogManager +class FileInfo(object): + def __init__(self, title, suffix, + correspondent=None, tags=None, + file_mtime=None, path=None): + self._title = title + self._suffix = suffix + self._correspondent = correspondent + self._tags = tags + self._file_mtime = file_mtime + self._path = path + + @classmethod + def from_path(cls, path): + pass + + @classmethod + def from_document(cls, document): + pass + + def filename(self): + pass + + def kwargs_for_document_create(self): + pass + + def add_tags(self, tags): + self._tags = set(tags).union(self._tags) + + class SluggedModel(models.Model): name = models.CharField(max_length=128, unique=True) From 1f75af01373543d370218b6055f9afb525937c3b Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Mon, 7 Mar 2016 21:05:04 +0200 Subject: [PATCH 2/8] Extract filename parsing into testable class --- src/documents/consumer.py | 69 +++------------------------------------ src/documents/models.py | 65 +++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 66 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fbdbbc276..74aced5c0 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Correspondent, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log, FileInfo from .languages import ISO639 @@ -54,19 +54,6 @@ class Consumer(object): DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE - REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE = re.compile( - r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - def __init__(self): self.logger = logging.getLogger(__name__) @@ -105,7 +92,7 @@ class Consumer(object): if not os.path.isfile(doc): continue - if not re.match(self.REGEX_TITLE, doc): + if not re.match(FileInfo.REGEX_TITLE, doc): continue if doc in self._ignore: @@ -270,56 +257,8 @@ class Consumer(object): return re.sub(r"\s+", " ", r) def _guess_attributes_from_name(self, parseable): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - " - - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" - """ - - def get_correspondent(correspondent_name): - return Correspondent.objects.get_or_create( - name=correspondent_name, - defaults={"slug": slugify(correspondent_name)} - )[0] - - def get_tags(tags): - r = [] - for t in tags.split(","): - r.append( - Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) - return tuple(r) - - def get_suffix(suffix): - suffix = suffix.lower() - if suffix == "jpeg": - return "jpg" - return suffix - - # First attempt: "<correspondent> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - get_tags(m.group(3)), - get_suffix(m.group(4)) - ) - - # Second attempt: "<correspondent> - <title>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - (), - get_suffix(m.group(3)) - ) - - # That didn't work, so we assume correspondent and tags are None - m = re.match(self.REGEX_TITLE, parseable) - return None, m.group(1), (), get_suffix(m.group(2)) + file_info = FileInfo.from_path(parseable) + return file_info.sender, file_info.title, file_info.tags, file_info.suffix def _store(self, text, doc, thumbnail): diff --git a/src/documents/models.py b/src/documents/models.py index bfc0224bd..c8342bf4a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -23,9 +23,72 @@ class FileInfo(object): self._file_mtime = file_mtime self._path = path + REGEX_TITLE = re.compile( + r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE = re.compile( + r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( + r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + @classmethod def from_path(cls, path): - pass + """ + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" + "<title>.<suffix>" + """ + + def get_correspondent(correspondent_name): + return Correspondent.objects.get_or_create( + name=correspondent_name, + defaults={"slug": slugify(correspondent_name)} + )[0] + + def get_tags(tags): + r = [] + for t in tags.split(","): + r.append( + Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) + return tuple(r) + + def get_suffix(suffix): + suffix = suffix.lower() + if suffix == "jpeg": + return "jpg" + return suffix + + # First attempt: "<correspondent> - <title> - <tags>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) + if m: + return cls( + title=m.group(2), + correspondent=get_correspondent(m.group(1)), + tags=get_tags(m.group(3)), + suffix=get_suffix(m.group(4)) + ) + + # Second attempt: "<correspondent> - <title>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) + if m: + return cls( + title=m.group(2), + correspondent=get_correspondent(m.group(1)), + tags=(), + suffix=get_suffix(m.group(3)) + ) + + # That didn't work, so we assume correspondent and tags are None + m = re.match(cls.REGEX_TITLE, path) + return FileInfo( + title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) @classmethod def from_document(cls, document): From 95217e8e21201f06beccf3f368a34396ba35660d Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:08:07 +0200 Subject: [PATCH 3/8] Use FileInfo directly instead of via indirection --- src/documents/consumer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 74aced5c0..704548013 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -256,24 +256,20 @@ class Consumer(object): # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r) - def _guess_attributes_from_name(self, parseable): - file_info = FileInfo.from_path(parseable) - return file_info.sender, file_info.title, file_info.tags, file_info.suffix - def _store(self, text, doc, thumbnail): - sender, title, tags, file_type = self._guess_attributes_from_name(doc) - relevant_tags = set(list(Tag.match_all(text)) + list(tags)) + file_info = FileInfo.from_path(doc) + relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) stats = os.stat(doc) self.log("debug", "Saving record to database") document = Document.objects.create( - correspondent=sender, - title=title, + correspondent=file_info.correspondent, + title=file_info.title, content=text, - file_type=file_type, + file_type=file_info.suffix, created=timezone.make_aware( datetime.datetime.fromtimestamp(stats.st_mtime)), modified=timezone.make_aware( From 4065d14fabd9122512f09726ba272cd54540db18 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:11:34 +0200 Subject: [PATCH 4/8] Remove stuff I intended to use but never did --- src/documents/models.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index c8342bf4a..e60a699d2 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -14,14 +14,11 @@ from .managers import LogManager class FileInfo(object): def __init__(self, title, suffix, - correspondent=None, tags=None, - file_mtime=None, path=None): + correspondent=None, tags=None): self._title = title self._suffix = suffix self._correspondent = correspondent self._tags = tags - self._file_mtime = file_mtime - self._path = path REGEX_TITLE = re.compile( r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", @@ -90,19 +87,6 @@ class FileInfo(object): return FileInfo( title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) - @classmethod - def from_document(cls, document): - pass - - def filename(self): - pass - - def kwargs_for_document_create(self): - pass - - def add_tags(self, tags): - self._tags = set(tags).union(self._tags) - class SluggedModel(models.Model): From ad07eec3e1a0ead8127870967378edfb9e569b37 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:37:18 +0200 Subject: [PATCH 5/8] Make tests pass --- src/documents/models.py | 15 +++++++++++++++ src/documents/tests/test_consumer.py | 15 +++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index e60a699d2..94dc60102 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -87,6 +87,21 @@ class FileInfo(object): return FileInfo( title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) + @property + def title(self): + return self._title + + @property + def correspondent(self): + return self._correspondent + + @property + def tags(self): + return self._tags + + @property + def suffix(self): + return self._suffix class SluggedModel(models.Model): diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 04f92f98c..0e4c9d368 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,12 +1,11 @@ from django.test import TestCase -from ..consumer import Consumer +from ..models import FileInfo class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") - CONSUMER = Consumer() SUFFIXES = ( "pdf", "png", "jpg", "jpeg", "gif", "PDF", "PNG", "JPG", "JPEG", "GIF", @@ -16,14 +15,14 @@ class TestAttachment(TestCase): def _test_guess_attributes_from_name(self, path, sender, title, tags): for suffix in self.SUFFIXES: f = path.format(suffix) - results = self.CONSUMER._guess_attributes_from_name(f) - self.assertEqual(results[0].name, sender, f) - self.assertEqual(results[1], title, f) - self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) + file_info = FileInfo.from_path(f) + self.assertEqual(file_info.correspondent.name, sender, f) + self.assertEqual(file_info.title, title, f) + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) if suffix.lower() == "jpeg": - self.assertEqual(results[3], "jpg", f) + self.assertEqual(file_info.suffix, "jpg", f) else: - self.assertEqual(results[3], suffix.lower(), f) + self.assertEqual(file_info.suffix, suffix.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( From 8afdcabca8993bf2f26db504a27db74f3fe1c932 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:42:52 +0200 Subject: [PATCH 6/8] Template-based tests of combinations of valid elements --- src/documents/tests/test_consumer.py | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 0e4c9d368..37d765ac7 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -91,3 +91,63 @@ class TestAttachment(TestCase): "Τιτλε", self.TAGS ) + + +class Permutations(TestCase): + valid_correspondents = ['timmy', 'Dr. McWheelie', + 'Dash Gor-don', 'ο Θερμαστής'] + valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', ''] + valid_tags = ['tag', 'tig,tag', '-', '0,1,2', ''] + valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif'] + + def _test_guessed_attributes( + self, filename, title, suffix, correspondent=None, tags=None): + file_info = FileInfo.from_path(filename) + + # Required + self.assertEqual(file_info.title, title, filename) + if suffix == 'jpeg': + suffix = 'jpg' + self.assertEqual(file_info.suffix, suffix, filename) + # Optional + if correspondent is None: + self.assertEqual(file_info.correspondent, + correspondent, filename) + else: + self.assertEqual(file_info.correspondent.name, + correspondent, filename) + if tags is None: + self.assertEqual(file_info.tags, (), filename) + else: + self.assertEqual([t.slug for t in file_info.tags], + tags.split(','), + filename) + + def test_just_title(self): + template = '/path/to/{title}.{suffix}' + for title in self.valid_titles: + for suffix in self.valid_suffixes: + spec = dict(title=title, suffix=suffix) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) + + def test_title_and_correspondent(self): + template = '/path/to/{correspondent} - {title}.{suffix}' + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + for suffix in self.valid_suffixes: + spec = dict(correspondent=correspondent, title=title, + suffix=suffix) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) + + def test_title_and_correspondent_and_tags(self): + template = '/path/to/{correspondent} - {title} - {tags}.{suffix}' + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + for tags in self.valid_tags: + for suffix in self.valid_suffixes: + spec = dict(correspondent=correspondent, title=title, + tags=tags, suffix=suffix) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) From a22f088e28335b4bb33e9226fbb460b4b24086e7 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:48:47 +0200 Subject: [PATCH 7/8] Add some failing edge case tests --- src/documents/tests/test_consumer.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 37d765ac7..634e8c4f0 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -92,6 +92,38 @@ class TestAttachment(TestCase): self.TAGS ) + def test_guess_attributes_from_name_when_correspondent_empty(self): + self._test_guess_attributes_from_name( + '/path/to/ - weird empty correspondent but should not break.{}', + None, + ' - weird empty correspondent but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_starts_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/- weird but should not break.{}', + None, + '- weird but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_ends_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/weird but should not break -.{}', + None, + 'weird but should not break -', + () + ) + + def test_guess_attributes_from_name_when_title_is_empty(self): + self._test_guess_attributes_from_name( + '/path/to/weird correspondent but should not break - .{}', + 'weird correspondent but should not break', + '', + () + ) + class Permutations(TestCase): valid_correspondents = ['timmy', 'Dr. McWheelie', From 0aa0513004f17e81462ff7fdfd450cced83c5cae Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 24 Mar 2016 19:18:33 +0000 Subject: [PATCH 8/8] Modifications for support for dates --- src/documents/consumer.py | 7 +- .../management/commands/document_exporter.py | 21 +- src/documents/models.py | 227 +++++++++++------- src/documents/tests/test_consumer.py | 205 ++++++++++++---- 4 files changed, 314 insertions(+), 146 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 704548013..45239696b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -19,12 +19,11 @@ from PIL import Image from django.conf import settings from django.utils import timezone -from django.template.defaultfilters import slugify from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Correspondent, Tag, Document, Log, FileInfo +from .models import Tag, Document, Log, FileInfo from .languages import ISO639 @@ -92,7 +91,7 @@ class Consumer(object): if not os.path.isfile(doc): continue - if not re.match(FileInfo.REGEX_TITLE, doc): + if not re.match(FileInfo.REGEXES["title"], doc): continue if doc in self._ignore: @@ -269,7 +268,7 @@ class Consumer(object): correspondent=file_info.correspondent, title=file_info.title, content=text, - file_type=file_info.suffix, + file_type=file_info.extension, created=timezone.make_aware( datetime.datetime.fromtimestamp(stats.st_mtime)), modified=timezone.make_aware( diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 913f7ae79..1c6ac6e44 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -96,11 +96,16 @@ class Command(Renderable, BaseCommand): @staticmethod def _get_legacy_file_name(doc): - if doc.correspondent and doc.title: - tags = ",".join([t.slug for t in doc.tags.all()]) - if tags: - return "{} - {} - {}.{}".format( - doc.correspondent, doc.title, tags, doc.file_type) - return "{} - {}.{}".format( - doc.correspondent, doc.title, doc.file_type) - return os.path.basename(doc.source_path) + + if not doc.correspondent and not doc.title: + return os.path.basename(doc.source_path) + + created = doc.created.strftime("%Y%m%d%H%M%SZ") + tags = ",".join([t.slug for t in doc.tags.all()]) + + if tags: + return "{} - {} - {} - {}.{}".format( + created, doc.correspondent, doc.title, tags, doc.file_type) + + return "{} - {} - {}.{}".format( + created, doc.correspondent, doc.title, doc.file_type) diff --git a/src/documents/models.py b/src/documents/models.py index 94dc60102..8880935e3 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,8 +1,11 @@ +import dateutil.parser import logging import os import re import uuid +from collections import OrderedDict + from django.conf import settings from django.core.urlresolvers import reverse from django.db import models @@ -12,97 +15,6 @@ from django.utils import timezone from .managers import LogManager -class FileInfo(object): - def __init__(self, title, suffix, - correspondent=None, tags=None): - self._title = title - self._suffix = suffix - self._correspondent = correspondent - self._tags = tags - - REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE = re.compile( - r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - - @classmethod - def from_path(cls, path): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - "<correspondent> - <title> - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" - """ - - def get_correspondent(correspondent_name): - return Correspondent.objects.get_or_create( - name=correspondent_name, - defaults={"slug": slugify(correspondent_name)} - )[0] - - def get_tags(tags): - r = [] - for t in tags.split(","): - r.append( - Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) - return tuple(r) - - def get_suffix(suffix): - suffix = suffix.lower() - if suffix == "jpeg": - return "jpg" - return suffix - - # First attempt: "<correspondent> - <title> - <tags>.<suffix>" - m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) - if m: - return cls( - title=m.group(2), - correspondent=get_correspondent(m.group(1)), - tags=get_tags(m.group(3)), - suffix=get_suffix(m.group(4)) - ) - - # Second attempt: "<correspondent> - <title>.<suffix>" - m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) - if m: - return cls( - title=m.group(2), - correspondent=get_correspondent(m.group(1)), - tags=(), - suffix=get_suffix(m.group(3)) - ) - - # That didn't work, so we assume correspondent and tags are None - m = re.match(cls.REGEX_TITLE, path) - return FileInfo( - title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) - - @property - def title(self): - return self._title - - @property - def correspondent(self): - return self._correspondent - - @property - def tags(self): - return self._tags - - @property - def suffix(self): - return self._suffix - class SluggedModel(models.Model): name = models.CharField(max_length=128, unique=True) @@ -341,3 +253,136 @@ class Log(models.Model): self.group = uuid.uuid4() models.Model.save(self, *args, **kwargs) + + +class FileInfo(object): + + # This epic regex *almost* worked for our needs, so I'm keeping it here for + # posterity, in the hopes that we might find a way to make it work one day. + ALMOST_REGEX = re.compile( + r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?" + r"((?P<correspondent>{non_separated_word}+){separator})??" + r"(?P<title>{non_separated_word}+)" + r"({separator}(?P<tags>[a-z,0-9-]+))?" + r"\.(?P<extension>[a-zA-Z.-]+)$".format( + separator=r"\s+-\s+", + non_separated_word=r"([\w,. ]|([^\s]-))" + ) + ) + + REGEXES = OrderedDict([ + ("created-correspondent-title-tags", re.compile( + r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " + r"(?P<correspondent>.*) - " + r"(?P<title>.*) - " + r"(?P<tags>[a-z0-9\-,]*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("created-title-tags", re.compile( + r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " + r"(?P<title>.*) - " + r"(?P<tags>[a-z0-9\-,]*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("created-correspondent-title", re.compile( + r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " + r"(?P<correspondent>.*) - " + r"(?P<title>.*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("created-title", re.compile( + r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " + r"(?P<title>.*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("correspondent-title-tags", re.compile( + r"(?P<correspondent>.*) - " + r"(?P<title>.*) - " + r"(?P<tags>[a-z0-9\-,]*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("correspondent-title", re.compile( + r"(?P<correspondent>.*) - " + r"(?P<title>.*)?" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )), + ("title", re.compile( + r"(?P<title>.*)" + r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + )) + ]) + + def __init__(self, created=None, correspondent=None, title=None, tags=(), + extension=None): + + self.created = created + self.title = title + self.extension = extension + self.correspondent = correspondent + self.tags = tags + + @classmethod + def _get_created(cls, created): + return dateutil.parser.parse("{:0<14}Z".format(created[:-1])) + + @classmethod + def _get_correspondent(cls, name): + if not name: + return None + return Correspondent.objects.get_or_create(name=name, defaults={ + "slug": slugify(name) + })[0] + + @classmethod + def _get_title(cls, title): + return title + + @classmethod + def _get_tags(cls, tags): + r = [] + for t in tags.split(","): + r.append( + Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) + return tuple(r) + + @classmethod + def _get_extension(cls, extension): + r = extension.lower() + if r == "jpeg": + return "jpg" + return r + + @classmethod + def _mangle_property(cls, properties, name): + if name in properties: + properties[name] = getattr(cls, "_get_{}".format(name))( + properties[name] + ) + + @classmethod + def from_path(cls, path): + """ + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" + "<title>.<suffix>" + """ + + for regex in cls.REGEXES.values(): + m = regex.match(os.path.basename(path)) + if m: + properties = m.groupdict() + cls._mangle_property(properties, "created") + cls._mangle_property(properties, "correspondent") + cls._mangle_property(properties, "title") + cls._mangle_property(properties, "tags") + cls._mangle_property(properties, "extension") + return cls(**properties) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 634e8c4f0..48407044d 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,28 +1,36 @@ from django.test import TestCase -from ..models import FileInfo +from ..models import Document, FileInfo class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") - SUFFIXES = ( + EXTENSIONS = ( "pdf", "png", "jpg", "jpeg", "gif", "PDF", "PNG", "JPG", "JPEG", "GIF", "PdF", "PnG", "JpG", "JPeG", "GiF", ) def _test_guess_attributes_from_name(self, path, sender, title, tags): - for suffix in self.SUFFIXES: - f = path.format(suffix) + + for extension in self.EXTENSIONS: + + f = path.format(extension) file_info = FileInfo.from_path(f) - self.assertEqual(file_info.correspondent.name, sender, f) - self.assertEqual(file_info.title, title, f) - self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) - if suffix.lower() == "jpeg": - self.assertEqual(file_info.suffix, "jpg", f) + + if sender: + self.assertEqual(file_info.correspondent.name, sender, f) else: - self.assertEqual(file_info.suffix, suffix.lower(), f) + self.assertIsNone(file_info.correspondent, f) + + self.assertEqual(file_info.title, title, f) + + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) + if extension.lower() == "jpeg": + self.assertEqual(file_info.extension, "jpg", f) + else: + self.assertEqual(file_info.extension, extension.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( @@ -96,7 +104,7 @@ class TestAttachment(TestCase): self._test_guess_attributes_from_name( '/path/to/ - weird empty correspondent but should not break.{}', None, - ' - weird empty correspondent but should not break', + 'weird empty correspondent but should not break', () ) @@ -126,60 +134,171 @@ class TestAttachment(TestCase): class Permutations(TestCase): - valid_correspondents = ['timmy', 'Dr. McWheelie', - 'Dash Gor-don', 'ο Θερμαστής'] - valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', ''] - valid_tags = ['tag', 'tig,tag', '-', '0,1,2', ''] - valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif'] - def _test_guessed_attributes( - self, filename, title, suffix, correspondent=None, tags=None): - file_info = FileInfo.from_path(filename) + valid_dates = ( + "20150102030405Z", + "20150102Z", + ) + valid_correspondents = [ + "timmy", + "Dr. McWheelie", + "Dash Gor-don", + "ο Θερμαστής", + "" + ] + valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""] + valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"] + valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"] - # Required - self.assertEqual(file_info.title, title, filename) - if suffix == 'jpeg': - suffix = 'jpg' - self.assertEqual(file_info.suffix, suffix, filename) - # Optional - if correspondent is None: - self.assertEqual(file_info.correspondent, - correspondent, filename) + def _test_guessed_attributes(self, filename, created=None, + correspondent=None, title=None, + extension=None, tags=None): + + # print(filename) + info = FileInfo.from_path(filename) + + # Created + if created is None: + self.assertIsNone(info.created, filename) else: - self.assertEqual(file_info.correspondent.name, - correspondent, filename) + self.assertEqual(info.created.year, int(created[:4]), filename) + self.assertEqual(info.created.month, int(created[4:6]), filename) + self.assertEqual(info.created.day, int(created[6:8]), filename) + + # Correspondent + if correspondent: + self.assertEqual(info.correspondent.name, correspondent, filename) + else: + self.assertEqual(info.correspondent, None, filename) + + # Title + self.assertEqual(info.title, title, filename) + + # Tags if tags is None: - self.assertEqual(file_info.tags, (), filename) + self.assertEqual(info.tags, (), filename) else: - self.assertEqual([t.slug for t in file_info.tags], - tags.split(','), - filename) + self.assertEqual( + [t.slug for t in info.tags], tags.split(','), + filename + ) + + # Extension + if extension == 'jpeg': + extension = 'jpg' + self.assertEqual(info.extension, extension, filename) def test_just_title(self): - template = '/path/to/{title}.{suffix}' + template = '/path/to/{title}.{extension}' for title in self.valid_titles: - for suffix in self.valid_suffixes: - spec = dict(title=title, suffix=suffix) + for extension in self.valid_extensions: + spec = dict(title=title, extension=extension) filename = template.format(**spec) self._test_guessed_attributes(filename, **spec) def test_title_and_correspondent(self): - template = '/path/to/{correspondent} - {title}.{suffix}' + template = '/path/to/{correspondent} - {title}.{extension}' for correspondent in self.valid_correspondents: for title in self.valid_titles: - for suffix in self.valid_suffixes: + for extension in self.valid_extensions: spec = dict(correspondent=correspondent, title=title, - suffix=suffix) + extension=extension) filename = template.format(**spec) self._test_guessed_attributes(filename, **spec) def test_title_and_correspondent_and_tags(self): - template = '/path/to/{correspondent} - {title} - {tags}.{suffix}' + template = '/path/to/{correspondent} - {title} - {tags}.{extension}' for correspondent in self.valid_correspondents: for title in self.valid_titles: for tags in self.valid_tags: - for suffix in self.valid_suffixes: + for extension in self.valid_extensions: spec = dict(correspondent=correspondent, title=title, - tags=tags, suffix=suffix) + tags=tags, extension=extension) filename = template.format(**spec) self._test_guessed_attributes(filename, **spec) + + def test_created_and_correspondent_and_title_and_tags(self): + + template = ("/path/to/{created} - " + "{correspondent} - " + "{title} - " + "{tags}" + ".{extension}") + + for created in self.valid_dates: + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + for tags in self.valid_tags: + for extension in self.valid_extensions: + spec = { + "created": created, + "correspondent": correspondent, + "title": title, + "tags": tags, + "extension": extension + } + self._test_guessed_attributes( + template.format(**spec), **spec) + + def test_created_and_correspondent_and_title(self): + + template = ("/path/to/{created} - " + "{correspondent} - " + "{title}" + ".{extension}") + + for created in self.valid_dates: + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + + # Skip cases where title looks like a tag as we can't + # accommodate such cases. + if title.lower() == title: + continue + + for extension in self.valid_extensions: + spec = { + "created": created, + "correspondent": correspondent, + "title": title, + "extension": extension + } + self._test_guessed_attributes( + template.format(**spec), **spec) + + def test_created_and_title(self): + + template = ("/path/to/{created} - " + "{title}" + ".{extension}") + + for created in self.valid_dates: + for title in self.valid_titles: + for extension in self.valid_extensions: + spec = { + "created": created, + "title": title, + "extension": extension + } + self._test_guessed_attributes( + template.format(**spec), **spec) + + def test_created_and_title_and_tags(self): + + template = ("/path/to/{created} - " + "{title} - " + "{tags}" + ".{extension}") + + for created in self.valid_dates: + for title in self.valid_titles: + for tags in self.valid_tags: + for extension in self.valid_extensions: + spec = { + "created": created, + "title": title, + "tags": tags, + "extension": extension + } + self._test_guessed_attributes( + template.format(**spec), **spec)