class FileInfo(object):
    """
    Attributes of a document guessed from the name of the file it came from.

    Instances expose read-only ``title``, ``suffix``, ``correspondent`` and
    ``tags`` properties.  Use ``FileInfo.from_path`` to build one from a path.
    """

    # REGEX_TITLE is also used by callers as a "is this file consumable?"
    # filter, so it is kept exactly as it was.
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # The correspondent and title groups use [^/] rather than "." so that the
    # regex engine cannot backtrack into the directory part of the path.  With
    # "." a file such as "/path/to/ - x.pdf" was parsed as correspondent "to/",
    # and a directory named "a - b" was split into correspondent and title.
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/([^/]+) - ([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/([^/]*) - ([^/]*) - ([a-z0-9\-,]*)"
        r"\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        """
        :param title: document title taken from the file name
        :param suffix: normalised file extension, e.g. "pdf" or "jpg"
        :param correspondent: Correspondent instance, or None if unknown
        :param tags: iterable of Tag instances; None is stored as an empty
            tuple so that callers can always iterate over ``tags``
        """
        self._title = title
        self._suffix = suffix
        self._correspondent = correspondent
        self._tags = () if tags is None else tags

    @classmethod
    def from_path(cls, path):
        """
        Build an instance by parsing the file name at the end of ``path``.

        We use a crude naming convention to make handling the correspondent,
        title, and tags easier.  The patterns are tried in this order:

            CORRESPONDENT - TITLE - TAGS.SUFFIX
            CORRESPONDENT - TITLE.SUFFIX
            TITLE.SUFFIX

        TAGS is a comma separated list of tag slugs.  Correspondents and tags
        named in the file name are fetched from the database, and created if
        they do not exist yet.

        :param path: path of the file; it must contain at least one "/" and
            end in one of the supported suffixes
        :returns: a new instance of ``cls``
        :raises ValueError: if ``path`` matches none of the patterns
        """

        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]

        def get_tags(tags):
            # An empty TAGS part still yields one tag with an empty slug;
            # that is the established behaviour and is left untouched.
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",")
            )

        def get_suffix(suffix):
            suffix = suffix.lower()
            # "jpeg" and "jpg" are the same format; store a single spelling.
            if suffix == "jpeg":
                return "jpg"
            return suffix

        # First attempt: correspondent, title and tags
        m = cls.REGEX_CORRESPONDENT_TITLE_TAGS.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )

        # Second attempt: correspondent and title
        m = cls.REGEX_CORRESPONDENT_TITLE.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )

        # That didn't work, so we assume correspondent and tags are None
        m = cls.REGEX_TITLE.match(path)
        if m is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                "Cannot guess document attributes from path: "
                "{!r}".format(path)
            )
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @property
    def title(self):
        return self._title

    @property
    def correspondent(self):
        return self._correspondent

    @property
    def tags(self):
        return self._tags

    @property
    def suffix(self):
        return self._suffix
b/src/documents/tests/test_consumer.py @@ -1,12 +1,11 @@ from django.test import TestCase -from ..consumer import Consumer +from ..models import FileInfo class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") - CONSUMER = Consumer() SUFFIXES = ( "pdf", "png", "jpg", "jpeg", "gif", "PDF", "PNG", "JPG", "JPEG", "GIF", @@ -16,14 +15,14 @@ class TestAttachment(TestCase): def _test_guess_attributes_from_name(self, path, sender, title, tags): for suffix in self.SUFFIXES: f = path.format(suffix) - results = self.CONSUMER._guess_attributes_from_name(f) - self.assertEqual(results[0].name, sender, f) - self.assertEqual(results[1], title, f) - self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) + file_info = FileInfo.from_path(f) + self.assertEqual(file_info.correspondent.name, sender, f) + self.assertEqual(file_info.title, title, f) + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) if suffix.lower() == "jpeg": - self.assertEqual(results[3], "jpg", f) + self.assertEqual(file_info.suffix, "jpg", f) else: - self.assertEqual(results[3], suffix.lower(), f) + self.assertEqual(file_info.suffix, suffix.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( @@ -92,3 +91,95 @@ class TestAttachment(TestCase): "Τιτλε", self.TAGS ) + + def test_guess_attributes_from_name_when_correspondent_empty(self): + self._test_guess_attributes_from_name( + '/path/to/ - weird empty correspondent but should not break.{}', + None, + ' - weird empty correspondent but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_starts_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/- weird but should not break.{}', + None, + '- weird but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_ends_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/weird but should not break -.{}', + None, + 'weird but should not break -', + () + ) + + def 
class Permutations(TestCase):
    """
    Run ``FileInfo.from_path`` over every combination of the sample
    correspondents, titles, tags and suffixes below.
    """

    valid_correspondents = ['timmy', 'Dr. McWheelie',
                            'Dash Gor-don', 'ο Θερμαστής']
    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']

    def _test_guessed_attributes(
            self, filename, title, suffix, correspondent=None, tags=None):
        """
        Parse ``filename`` and compare each guessed attribute with the value
        that was used to build the file name.
        """
        parsed = FileInfo.from_path(filename)
        # "jpeg" is expected to be reported as "jpg"
        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix

        # Required
        self.assertEqual(parsed.title, title, filename)
        self.assertEqual(parsed.suffix, expected_suffix, filename)

        # Optional
        if correspondent is not None:
            self.assertEqual(parsed.correspondent.name,
                             correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent, None, filename)

        if tags is not None:
            slugs = [tag.slug for tag in parsed.tags]
            self.assertEqual(slugs, tags.split(','), filename)
        else:
            self.assertEqual(parsed.tags, (), filename)

    def _check_specs(self, template, specs):
        """Format ``template`` with each spec and verify the parsed result."""
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_just_title(self):
        self._check_specs(
            '/path/to/{title}.{suffix}',
            (
                dict(title=title, suffix=suffix)
                for title in self.valid_titles
                for suffix in self.valid_suffixes
            )
        )

    def test_title_and_correspondent(self):
        self._check_specs(
            '/path/to/{correspondent} - {title}.{suffix}',
            (
                dict(correspondent=correspondent, title=title, suffix=suffix)
                for correspondent in self.valid_correspondents
                for title in self.valid_titles
                for suffix in self.valid_suffixes
            )
        )

    def test_title_and_correspondent_and_tags(self):
        self._check_specs(
            '/path/to/{correspondent} - {title} - {tags}.{suffix}',
            (
                dict(correspondent=correspondent, title=title,
                     tags=tags, suffix=suffix)
                for correspondent in self.valid_correspondents
                for title in self.valid_titles
                for tags in self.valid_tags
                for suffix in self.valid_suffixes
            )
        )