From 0b34894db9706dfaeea683fc6e0b1f6890eb2efc Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Mon, 7 Mar 2016 20:42:25 +0200 Subject: [PATCH 1/7] Add `FileInfo` class with `pass` implementations --- src/documents/models.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/documents/models.py b/src/documents/models.py index 0d79dba0a..bfc0224bd 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -12,6 +12,35 @@ from django.utils import timezone from .managers import LogManager +class FileInfo(object): + def __init__(self, title, suffix, + correspondent=None, tags=None, + file_mtime=None, path=None): + self._title = title + self._suffix = suffix + self._correspondent = correspondent + self._tags = tags + self._file_mtime = file_mtime + self._path = path + + @classmethod + def from_path(cls, path): + pass + + @classmethod + def from_document(cls, document): + pass + + def filename(self): + pass + + def kwargs_for_document_create(self): + pass + + def add_tags(self, tags): + self._tags = set(tags).union(self._tags) + + class SluggedModel(models.Model): name = models.CharField(max_length=128, unique=True) From 1f75af01373543d370218b6055f9afb525937c3b Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Mon, 7 Mar 2016 21:05:04 +0200 Subject: [PATCH 2/7] Extract filename parsing into testable class --- src/documents/consumer.py | 69 +++------------------------------------ src/documents/models.py | 65 +++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 66 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fbdbbc276..74aced5c0 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Correspondent, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log, FileInfo from .languages import ISO639 @@ -54,19 +54,6 @@ class 
Consumer(object): DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE - REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE = re.compile( - r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - def __init__(self): self.logger = logging.getLogger(__name__) @@ -105,7 +92,7 @@ class Consumer(object): if not os.path.isfile(doc): continue - if not re.match(self.REGEX_TITLE, doc): + if not re.match(FileInfo.REGEX_TITLE, doc): continue if doc in self._ignore: @@ -270,56 +257,8 @@ class Consumer(object): return re.sub(r"\s+", " ", r) def _guess_attributes_from_name(self, parseable): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - "<correspondent> - <title> - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" - """ - - def get_correspondent(correspondent_name): - return Correspondent.objects.get_or_create( - name=correspondent_name, - defaults={"slug": slugify(correspondent_name)} - )[0] - - def get_tags(tags): - r = [] - for t in tags.split(","): - r.append( - Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) - return tuple(r) - - def get_suffix(suffix): - suffix = suffix.lower() - if suffix == "jpeg": - return "jpg" - return suffix - - # First attempt: "<correspondent> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - get_tags(m.group(3)), - get_suffix(m.group(4)) - ) - - # Second attempt: "<correspondent> - <title>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - (), - get_suffix(m.group(3)) - ) - - # That didn't work, so we assume correspondent and tags are None - m = 
re.match(self.REGEX_TITLE, parseable) - return None, m.group(1), (), get_suffix(m.group(2)) + file_info = FileInfo.from_path(parseable) + return file_info.sender, file_info.title, file_info.tags, file_info.suffix def _store(self, text, doc, thumbnail): diff --git a/src/documents/models.py b/src/documents/models.py index bfc0224bd..c8342bf4a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -23,9 +23,72 @@ class FileInfo(object): self._file_mtime = file_mtime self._path = path + REGEX_TITLE = re.compile( + r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE = re.compile( + r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( + r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", + flags=re.IGNORECASE + ) + @classmethod def from_path(cls, path): - pass + """ + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" + "<title>.<suffix>" + """ + + def get_correspondent(correspondent_name): + return Correspondent.objects.get_or_create( + name=correspondent_name, + defaults={"slug": slugify(correspondent_name)} + )[0] + + def get_tags(tags): + r = [] + for t in tags.split(","): + r.append( + Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) + return tuple(r) + + def get_suffix(suffix): + suffix = suffix.lower() + if suffix == "jpeg": + return "jpg" + return suffix + + # First attempt: "<correspondent> - <title> - <tags>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) + if m: + return cls( + title=m.group(2), + correspondent=get_correspondent(m.group(1)), + tags=get_tags(m.group(3)), + suffix=get_suffix(m.group(4)) + ) + + # Second attempt: "<correspondent> - <title>.<suffix>" + m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) + if m: + return cls( + title=m.group(2), + 
correspondent=get_correspondent(m.group(1)), + tags=(), + suffix=get_suffix(m.group(3)) + ) + + # That didn't work, so we assume correspondent and tags are None + m = re.match(cls.REGEX_TITLE, path) + return FileInfo( + title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) @classmethod def from_document(cls, document): From 95217e8e21201f06beccf3f368a34396ba35660d Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:08:07 +0200 Subject: [PATCH 3/7] Use FileInfo directly instead of via indirection --- src/documents/consumer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 74aced5c0..704548013 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -256,24 +256,20 @@ class Consumer(object): # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r) - def _guess_attributes_from_name(self, parseable): - file_info = FileInfo.from_path(parseable) - return file_info.sender, file_info.title, file_info.tags, file_info.suffix - def _store(self, text, doc, thumbnail): - sender, title, tags, file_type = self._guess_attributes_from_name(doc) - relevant_tags = set(list(Tag.match_all(text)) + list(tags)) + file_info = FileInfo.from_path(doc) + relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) stats = os.stat(doc) self.log("debug", "Saving record to database") document = Document.objects.create( - correspondent=sender, - title=title, + correspondent=file_info.correspondent, + title=file_info.title, content=text, - file_type=file_type, + file_type=file_info.suffix, created=timezone.make_aware( datetime.datetime.fromtimestamp(stats.st_mtime)), modified=timezone.make_aware( From 4065d14fabd9122512f09726ba272cd54540db18 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:11:34 +0200 Subject: [PATCH 4/7] Remove stuff I intended 
to use but never did --- src/documents/models.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index c8342bf4a..e60a699d2 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -14,14 +14,11 @@ from .managers import LogManager class FileInfo(object): def __init__(self, title, suffix, - correspondent=None, tags=None, - file_mtime=None, path=None): + correspondent=None, tags=None): self._title = title self._suffix = suffix self._correspondent = correspondent self._tags = tags - self._file_mtime = file_mtime - self._path = path REGEX_TITLE = re.compile( r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", @@ -90,19 +87,6 @@ class FileInfo(object): return FileInfo( title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) - @classmethod - def from_document(cls, document): - pass - - def filename(self): - pass - - def kwargs_for_document_create(self): - pass - - def add_tags(self, tags): - self._tags = set(tags).union(self._tags) - class SluggedModel(models.Model): From ad07eec3e1a0ead8127870967378edfb9e569b37 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:37:18 +0200 Subject: [PATCH 5/7] Make tests pass --- src/documents/models.py | 15 +++++++++++++++ src/documents/tests/test_consumer.py | 15 +++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index e60a699d2..94dc60102 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -87,6 +87,21 @@ class FileInfo(object): return FileInfo( title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) + @property + def title(self): + return self._title + + @property + def correspondent(self): + return self._correspondent + + @property + def tags(self): + return self._tags + + @property + def suffix(self): + return self._suffix class SluggedModel(models.Model): diff --git 
a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 04f92f98c..0e4c9d368 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,12 +1,11 @@ from django.test import TestCase -from ..consumer import Consumer +from ..models import FileInfo class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") - CONSUMER = Consumer() SUFFIXES = ( "pdf", "png", "jpg", "jpeg", "gif", "PDF", "PNG", "JPG", "JPEG", "GIF", @@ -16,14 +15,14 @@ class TestAttachment(TestCase): def _test_guess_attributes_from_name(self, path, sender, title, tags): for suffix in self.SUFFIXES: f = path.format(suffix) - results = self.CONSUMER._guess_attributes_from_name(f) - self.assertEqual(results[0].name, sender, f) - self.assertEqual(results[1], title, f) - self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) + file_info = FileInfo.from_path(f) + self.assertEqual(file_info.correspondent.name, sender, f) + self.assertEqual(file_info.title, title, f) + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) if suffix.lower() == "jpeg": - self.assertEqual(results[3], "jpg", f) + self.assertEqual(file_info.suffix, "jpg", f) else: - self.assertEqual(results[3], suffix.lower(), f) + self.assertEqual(file_info.suffix, suffix.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( From 8afdcabca8993bf2f26db504a27db74f3fe1c932 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:42:52 +0200 Subject: [PATCH 6/7] Template-based tests of combinations of valid elements --- src/documents/tests/test_consumer.py | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 0e4c9d368..37d765ac7 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -91,3 +91,63 @@ class TestAttachment(TestCase): 
"Τιτλε", self.TAGS ) + + +class Permutations(TestCase): + valid_correspondents = ['timmy', 'Dr. McWheelie', + 'Dash Gor-don', 'ο Θερμαστής'] + valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', ''] + valid_tags = ['tag', 'tig,tag', '-', '0,1,2', ''] + valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif'] + + def _test_guessed_attributes( + self, filename, title, suffix, correspondent=None, tags=None): + file_info = FileInfo.from_path(filename) + + # Required + self.assertEqual(file_info.title, title, filename) + if suffix == 'jpeg': + suffix = 'jpg' + self.assertEqual(file_info.suffix, suffix, filename) + # Optional + if correspondent is None: + self.assertEqual(file_info.correspondent, + correspondent, filename) + else: + self.assertEqual(file_info.correspondent.name, + correspondent, filename) + if tags is None: + self.assertEqual(file_info.tags, (), filename) + else: + self.assertEqual([t.slug for t in file_info.tags], + tags.split(','), + filename) + + def test_just_title(self): + template = '/path/to/{title}.{suffix}' + for title in self.valid_titles: + for suffix in self.valid_suffixes: + spec = dict(title=title, suffix=suffix) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) + + def test_title_and_correspondent(self): + template = '/path/to/{correspondent} - {title}.{suffix}' + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + for suffix in self.valid_suffixes: + spec = dict(correspondent=correspondent, title=title, + suffix=suffix) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) + + def test_title_and_correspondent_and_tags(self): + template = '/path/to/{correspondent} - {title} - {tags}.{suffix}' + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + for tags in self.valid_tags: + for suffix in self.valid_suffixes: + spec = dict(correspondent=correspondent, title=title, + tags=tags, suffix=suffix) + 
filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) From a22f088e28335b4bb33e9226fbb460b4b24086e7 Mon Sep 17 00:00:00 2001 From: Tikitu de Jager <tikitu@minddistrict.com> Date: Mon, 7 Mar 2016 21:48:47 +0200 Subject: [PATCH 7/7] Add some failing edge case tests --- src/documents/tests/test_consumer.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 37d765ac7..634e8c4f0 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -92,6 +92,38 @@ class TestAttachment(TestCase): self.TAGS ) + def test_guess_attributes_from_name_when_correspondent_empty(self): + self._test_guess_attributes_from_name( + '/path/to/ - weird empty correspondent but should not break.{}', + None, + ' - weird empty correspondent but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_starts_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/- weird but should not break.{}', + None, + '- weird but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_ends_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/weird but should not break -.{}', + None, + 'weird but should not break -', + () + ) + + def test_guess_attributes_from_name_when_title_is_empty(self): + self._test_guess_attributes_from_name( + '/path/to/weird correspondent but should not break - .{}', + 'weird correspondent but should not break', + '', + () + ) + class Permutations(TestCase): valid_correspondents = ['timmy', 'Dr. McWheelie',