diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fbdbbc276..45239696b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -19,12 +19,11 @@ from PIL import Image from django.conf import settings from django.utils import timezone -from django.template.defaultfilters import slugify from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Correspondent, Tag, Document, Log +from .models import Tag, Document, Log, FileInfo from .languages import ISO639 @@ -54,19 +53,6 @@ class Consumer(object): DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE - REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE = re.compile( - r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", - flags=re.IGNORECASE - ) - def __init__(self): self.logger = logging.getLogger(__name__) @@ -105,7 +91,7 @@ class Consumer(object): if not os.path.isfile(doc): continue - if not re.match(self.REGEX_TITLE, doc): + if not re.match(FileInfo.REGEXES["title"], doc): continue if doc in self._ignore: @@ -269,72 +255,20 @@ class Consumer(object): # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r) - def _guess_attributes_from_name(self, parseable): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - " - - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" - """ - - def get_correspondent(correspondent_name): - return Correspondent.objects.get_or_create( - name=correspondent_name, - defaults={"slug": slugify(correspondent_name)} - )[0] - - def get_tags(tags): - r = [] - for t in tags.split(","): - r.append( - Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) - return tuple(r) - - def get_suffix(suffix): - suffix = 
suffix.lower() - if suffix == "jpeg": - return "jpg" - return suffix - - # First attempt: "<correspondent> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - get_tags(m.group(3)), - get_suffix(m.group(4)) - ) - - # Second attempt: "<correspondent> - <title>.<suffix>" - m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) - if m: - return ( - get_correspondent(m.group(1)), - m.group(2), - (), - get_suffix(m.group(3)) - ) - - # That didn't work, so we assume correspondent and tags are None - m = re.match(self.REGEX_TITLE, parseable) - return None, m.group(1), (), get_suffix(m.group(2)) - def _store(self, text, doc, thumbnail): - sender, title, tags, file_type = self._guess_attributes_from_name(doc) - relevant_tags = set(list(Tag.match_all(text)) + list(tags)) + file_info = FileInfo.from_path(doc) + relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) stats = os.stat(doc) self.log("debug", "Saving record to database") document = Document.objects.create( - correspondent=sender, - title=title, + correspondent=file_info.correspondent, + title=file_info.title, content=text, - file_type=file_type, + file_type=file_info.extension, created=timezone.make_aware( datetime.datetime.fromtimestamp(stats.st_mtime)), modified=timezone.make_aware( diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 913f7ae79..1c6ac6e44 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -96,11 +96,16 @@ class Command(Renderable, BaseCommand): @staticmethod def _get_legacy_file_name(doc): - if doc.correspondent and doc.title: - tags = ",".join([t.slug for t in doc.tags.all()]) - if tags: - return "{} - {} - {}.{}".format( - doc.correspondent, doc.title, tags, doc.file_type) - return "{} - {}.{}".format( - 
class FileInfo(object):
    """
    Attributes guessed from a document's file name.

    An instance carries the ``created`` timestamp, ``correspondent``,
    ``title``, ``tags`` and ``extension`` that ``from_path`` extracted from a
    file name.  Fields that the matching pattern does not contain keep their
    defaults (``None``, or ``()`` for ``tags``).
    """

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    # It is not used by any method of this class.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Patterns are tried in insertion order by ``from_path`` and the first one
    # that matches wins, so the patterns with the most fields come first.
    # All of them are matched against the base name of the path only.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        """Store the given attributes unchanged on the instance."""

        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """
        Turn the ``created`` prefix of a file name into a datetime.

        The trailing "Z" is cut off, the remaining digits are right-padded
        with zeros to 14 characters (so a date-only stamp gets a zeroed time
        part), the "Z" is put back and the result is parsed by dateutil.
        """
        return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))

    @classmethod
    def _get_correspondent(cls, name):
        """
        Return the Correspondent called ``name``, creating it if necessary.

        An empty (or missing) name yields ``None`` instead of a nameless
        correspondent.
        """
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        """Return the title as it appeared in the file name."""
        return title

    @classmethod
    def _get_tags(cls, tags):
        """
        Return a tuple of Tag objects for a comma-separated list of slugs.

        Tags that do not exist yet are created with the slug as their name.
        Empty entries are skipped: the tag group of the patterns may match
        an empty string (e.g. "a - b - .pdf"), and stray commas produce empty
        pieces too, neither of which should create a Tag with an empty slug.
        """
        if not tags:
            return ()

        r = []
        for t in tags.split(","):
            if not t:
                continue
            r.append(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
        return tuple(r)

    @classmethod
    def _get_extension(cls, extension):
        """Return the lower-cased extension, with "jpeg" folded to "jpg"."""
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        """
        Replace ``properties[name]`` in place with the result of the matching
        ``_get_<name>`` classmethod.  Keys that are absent are left alone.
        """
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_path(cls, path):
        """
        Build a FileInfo from the base name of ``path``.

        We use a crude naming convention to make handling the created date,
        correspondent, title, and tags easier.  The following forms are
        tried in this order and the first match is used:
          "<created> - <correspondent> - <title> - <tags>.<suffix>"
          "<created> - <title> - <tags>.<suffix>"
          "<created> - <correspondent> - <title>.<suffix>"
          "<created> - <title>.<suffix>"
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        Returns ``None`` when the name matches none of them (for instance
        when the suffix is not one of the supported file types).
        """

        # Only the file name takes part in the match, never the directories.
        name = os.path.basename(path)

        for regex in cls.REGEXES.values():
            m = regex.match(name)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
                cls._mangle_property(properties, "correspondent")
                cls._mangle_property(properties, "title")
                cls._mangle_property(properties, "tags")
                cls._mangle_property(properties, "extension")
                return cls(**properties)

        return None
04f92f98c..48407044d 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,29 +1,36 @@ from django.test import TestCase -from ..consumer import Consumer +from ..models import Document, FileInfo class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") - CONSUMER = Consumer() - SUFFIXES = ( + EXTENSIONS = ( "pdf", "png", "jpg", "jpeg", "gif", "PDF", "PNG", "JPG", "JPEG", "GIF", "PdF", "PnG", "JpG", "JPeG", "GiF", ) def _test_guess_attributes_from_name(self, path, sender, title, tags): - for suffix in self.SUFFIXES: - f = path.format(suffix) - results = self.CONSUMER._guess_attributes_from_name(f) - self.assertEqual(results[0].name, sender, f) - self.assertEqual(results[1], title, f) - self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) - if suffix.lower() == "jpeg": - self.assertEqual(results[3], "jpg", f) + + for extension in self.EXTENSIONS: + + f = path.format(extension) + file_info = FileInfo.from_path(f) + + if sender: + self.assertEqual(file_info.correspondent.name, sender, f) else: - self.assertEqual(results[3], suffix.lower(), f) + self.assertIsNone(file_info.correspondent, f) + + self.assertEqual(file_info.title, title, f) + + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) + if extension.lower() == "jpeg": + self.assertEqual(file_info.extension, "jpg", f) + else: + self.assertEqual(file_info.extension, extension.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( @@ -92,3 +99,206 @@ class TestAttachment(TestCase): "Τιτλε", self.TAGS ) + + def test_guess_attributes_from_name_when_correspondent_empty(self): + self._test_guess_attributes_from_name( + '/path/to/ - weird empty correspondent but should not break.{}', + None, + 'weird empty correspondent but should not break', + () + ) + + def test_guess_attributes_from_name_when_title_starts_with_dash(self): + self._test_guess_attributes_from_name( + '/path/to/- weird but should not 
class Permutations(TestCase):
    """
    Feed FileInfo.from_path every combination of the sample values below,
    arranged in each supported file name layout, and check what it guessed.
    """

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = [
        "timmy",
        "Dr. McWheelie",
        "Dash Gor-don",
        "ο Θερμαστής",
        ""
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
                                 extension=None, tags=None):
        """
        Parse ``filename`` and compare every guessed attribute with the
        expected value; the file name is used as the failure message.
        """

        guessed = FileInfo.from_path(filename)

        # Created: only year, month and day are compared
        if created is not None:
            self.assertEqual(guessed.created.year, int(created[:4]), filename)
            self.assertEqual(
                guessed.created.month, int(created[4:6]), filename)
            self.assertEqual(guessed.created.day, int(created[6:8]), filename)
        else:
            self.assertIsNone(guessed.created, filename)

        # Correspondent: an empty name is expected to yield no correspondent
        if not correspondent:
            self.assertEqual(guessed.correspondent, None, filename)
        else:
            self.assertEqual(
                guessed.correspondent.name, correspondent, filename)

        # Title
        self.assertEqual(guessed.title, title, filename)

        # Tags
        if tags is not None:
            expected_slugs = tags.split(',')
            found_slugs = [t.slug for t in guessed.tags]
            self.assertEqual(found_slugs, expected_slugs, filename)
        else:
            self.assertEqual(guessed.tags, (), filename)

        # Extension: "jpeg" is expected to come back as "jpg"
        expected_extension = 'jpg' if extension == 'jpeg' else extension
        self.assertEqual(guessed.extension, expected_extension, filename)

    def _check_template(self, template, **spec):
        """Fill ``template`` with ``spec`` and verify the resulting name."""
        self._test_guessed_attributes(template.format(**spec), **spec)

    def test_just_title(self):
        layout = '/path/to/{title}.{extension}'
        for title in self.valid_titles:
            for extension in self.valid_extensions:
                self._check_template(
                    layout, title=title, extension=extension)

    def test_title_and_correspondent(self):
        layout = '/path/to/{correspondent} - {title}.{extension}'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    self._check_template(
                        layout,
                        correspondent=correspondent,
                        title=title,
                        extension=extension
                    )

    def test_title_and_correspondent_and_tags(self):
        layout = '/path/to/{correspondent} - {title} - {tags}.{extension}'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        self._check_template(
                            layout,
                            correspondent=correspondent,
                            title=title,
                            tags=tags,
                            extension=extension
                        )

    def test_created_and_correspondent_and_title_and_tags(self):
        layout = ("/path/to/{created} - "
                  "{correspondent} - "
                  "{title} - "
                  "{tags}"
                  ".{extension}")

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    for tags in self.valid_tags:
                        for extension in self.valid_extensions:
                            self._check_template(
                                layout,
                                created=created,
                                correspondent=correspondent,
                                title=title,
                                tags=tags,
                                extension=extension
                            )

    def test_created_and_correspondent_and_title(self):
        layout = ("/path/to/{created} - "
                  "{correspondent} - "
                  "{title}"
                  ".{extension}")

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:

                    # Only titles that do not look like a tag are exercised,
                    # as we can't accommodate the tag-like ones.
                    if title.lower() != title:
                        for extension in self.valid_extensions:
                            self._check_template(
                                layout,
                                created=created,
                                correspondent=correspondent,
                                title=title,
                                extension=extension
                            )

    def test_created_and_title(self):
        layout = ("/path/to/{created} - "
                  "{title}"
                  ".{extension}")

        for created in self.valid_dates:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    self._check_template(
                        layout,
                        created=created,
                        title=title,
                        extension=extension
                    )

    def test_created_and_title_and_tags(self):
        layout = ("/path/to/{created} - "
                  "{title} - "
                  "{tags}"
                  ".{extension}")

        for created in self.valid_dates:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        self._check_template(
                            layout,
                            created=created,
                            title=title,
                            tags=tags,
                            extension=extension
                        )