Fixed the auto-naming regexes

This commit is contained in:
Daniel Quinn 2016-02-11 22:05:55 +00:00
parent 7aadab23cc
commit a022fcb8f1
2 changed files with 85 additions and 78 deletions

View File

@@ -16,6 +16,7 @@ from django.template.defaultfilters import slugify
from paperless.db import GnuPG from paperless.db import GnuPG
from .mixins import Renderable
from .models import Sender, Tag, Document from .models import Sender, Tag, Document
from .languages import ISO639 from .languages import ISO639
@@ -28,7 +29,7 @@ class ConsumerError(Exception):
pass pass
class Consumer(object): class Consumer(Renderable):
""" """
Loop over every file found in CONSUMPTION_DIR and: Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png 1. Convert it to a greyscale png
@@ -50,11 +51,11 @@ class Consumer(object):
flags=re.IGNORECASE flags=re.IGNORECASE
) )
REGEX_SENDER_TITLE = re.compile( REGEX_SENDER_TITLE = re.compile(
r"^[^/]*/(.+) - ([^/]+)\.(pdf|jpe?g|png|gif|tiff)$", r"^.*/(.+) - ([^/]+)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE flags=re.IGNORECASE
) )
REGEX_SENDER_TITLE_TAGS = re.compile( REGEX_SENDER_TITLE_TAGS = re.compile(
r"^.*/([^/]+) - ([^/]+) - ([a-z\-,]+)\.(pdf|jpe?g|png|gif|tiff)$", r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE flags=re.IGNORECASE
) )
@@ -208,7 +209,7 @@ class Consumer(object):
for t in tags.split(","): for t in tags.split(","):
r.append( r.append(
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
return r return tuple(r)
# First attempt: "<sender> - <title> - <tags>.<suffix>" # First attempt: "<sender> - <title> - <tags>.<suffix>"
m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable) m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
@@ -223,11 +224,11 @@ class Consumer(object):
# Second attempt: "<sender> - <title>.<suffix>" # Second attempt: "<sender> - <title>.<suffix>"
m = re.match(self.REGEX_SENDER_TITLE, parseable) m = re.match(self.REGEX_SENDER_TITLE, parseable)
if m: if m:
return get_sender(m.group(1)), m.group(2), [], m.group(3) return get_sender(m.group(1)), m.group(2), (), m.group(3)
# That didn't work, so we assume sender and tags are None # That didn't work, so we assume sender and tags are None
m = re.match(self.REGEX_TITLE, parseable) m = re.match(self.REGEX_TITLE, parseable)
return None, m.group(1), [], m.group(2) return None, m.group(1), (), m.group(2)
def _store(self, text, doc): def _store(self, text, doc):
@@ -273,10 +274,6 @@ class Consumer(object):
self._render("", 2) self._render("", 2)
def _render(self, text, verbosity):
if self.verbosity >= verbosity:
print(text)
def _is_ready(self, doc): def _is_ready(self, doc):
""" """
Detect whether `doc` is ready to consume or if it's still being written Detect whether `doc` is ready to consume or if it's still being written

View File

@@ -5,72 +5,82 @@ from ..consumer import Consumer
class TestAttachment(TestCase): class TestAttachment(TestCase):
def test_guess_attributes_from_name(self): TAGS = ("tag1", "tag2", "tag3")
consumer = Consumer() CONSUMER = Consumer()
suffixes = ("pdf", "png", "jpg", "jpeg", "gif")
tests = ( def _test_guess_attributes_from_name(self, path, sender, title, tags):
{ for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
"path": "/path/to/Sender - Title - tag1,tag2,tag3.{}", f = path.format(suffix)
"result": { results = self.CONSUMER._guess_attributes_from_name(f)
"sender": "Sender", self.assertEqual(results[0].name, sender, f)
"title": "Title", self.assertEqual(results[1], title, f)
"tags": ("tag1", "tag2", "tag3") self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
}, self.assertEqual(results[3], suffix, f)
},
{ def test_guess_attributes_from_name0(self):
"path": "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}", self._test_guess_attributes_from_name(
"result": { "/path/to/Sender - Title.{}", "Sender", "Title", ())
"sender": "Spaced Sender",
"title": "Title", def test_guess_attributes_from_name1(self):
"tags": ("tag1", "tag2", "tag3") self._test_guess_attributes_from_name(
}, "/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ())
},
{ def test_guess_attributes_from_name2(self):
"path": "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}", self._test_guess_attributes_from_name(
"result": { "/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ())
"sender": "Sender",
"title": "Spaced Title", def test_guess_attributes_from_name3(self):
"tags": ("tag1", "tag2", "tag3") self._test_guess_attributes_from_name(
}, "/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ())
},
{ def test_guess_attributes_from_name4(self):
"path": "/path/to/Spaced Sender - Spaced Title - tag1,tag2.{}", self._test_guess_attributes_from_name(
"result": { "/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ())
"sender": "Spaced Sender",
"title": "Spaced Title", def test_guess_attributes_from_name5(self):
"tags": ("tag1", "tag2") self._test_guess_attributes_from_name(
}, "/path/to/Sender - Title - tag1,tag2,tag3.{}",
}, "Sender",
{ "Title",
"path": "/path/to/Dash-Sender - Title - tag1,tag2.{}", self.TAGS
"result": { )
"sender": "Dash-Sender",
"title": "Title", def test_guess_attributes_from_name6(self):
"tags": ("tag1", "tag2") self._test_guess_attributes_from_name(
}, "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
}, "Spaced Sender",
{ "Title",
"path": "/path/to/Sender - Dash-Title - tag1,tag2.{}", self.TAGS
"result": { )
"sender": "Sender",
"title": "Dash-Title", def test_guess_attributes_from_name7(self):
"tags": ("tag1", "tag2") self._test_guess_attributes_from_name(
}, "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
}, "Sender",
{ "Spaced Title",
"path": "/path/to/Dash-Sender - Dash-Title - tag1,tag2.{}", self.TAGS
"result": { )
"sender": "Dash-Sender",
"title": "Dash-Title", def test_guess_attributes_from_name8(self):
"tags": ("tag1", "tag2") self._test_guess_attributes_from_name(
}, "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
}, "Dashed-Sender",
"Title",
self.TAGS
)
def test_guess_attributes_from_name9(self):
self._test_guess_attributes_from_name(
"/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
"Sender",
"Dashed-Title",
self.TAGS
)
def test_guess_attributes_from_name10(self):
self._test_guess_attributes_from_name(
"/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
"Σενδερ",
"Τιτλε",
self.TAGS
) )
for test in tests:
for suffix in suffixes:
f = test["path"].format(suffix)
sender, title, tags, s = consumer._guess_attributes_from_name(f)
self.assertEqual(sender.name, test["result"]["sender"])
self.assertEqual(title, test["result"]["title"])
self.assertEqual(tags, test["result"]["tags"])
self.assertEqual(s, suffix)