Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction

This commit is contained in:
Daniel Quinn 2016-03-19 15:44:35 +00:00
commit cf5076bcad
3 changed files with 197 additions and 80 deletions

View File

@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
from paperless.db import GnuPG from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log from .models import Correspondent, Tag, Document, Log, FileInfo
from .languages import ISO639 from .languages import ISO639
@ -54,19 +54,6 @@ class Consumer(object):
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
# Patterns used to pull metadata out of a document's path. All three are
# case-insensitive and only accept the suffixes pdf, jpg/jpeg, png, gif, tiff.
#
# "<title>.<suffix>": group 1 is the title, group 2 the suffix.
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
# "<correspondent> - <title>.<suffix>": group 1 is the (non-empty)
# correspondent, group 2 the title, group 3 the suffix.
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
# "<correspondent> - <title> - <tags>.<suffix>": group 3 is the tag list and
# may only contain letters, digits, "-" and ","; group 4 is the suffix.
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
def __init__(self): def __init__(self):
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
@ -105,7 +92,7 @@ class Consumer(object):
if not os.path.isfile(doc): if not os.path.isfile(doc):
continue continue
if not re.match(self.REGEX_TITLE, doc): if not re.match(FileInfo.REGEX_TITLE, doc):
continue continue
if doc in self._ignore: if doc in self._ignore:
@ -269,72 +256,20 @@ class Consumer(object):
# Strip out excess white space to allow matching to go smoother # Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r) return re.sub(r"\s+", " ", r)
def _guess_attributes_from_name(self, parseable):
    """
    Derive ``(correspondent, title, tags, suffix)`` from a file path.

    The path is tried against the naming conventions below, most specific
    first:

      "<correspondent> - <title> - <tags>.<suffix>"
      "<correspondent> - <title>.<suffix>"
      "<title>.<suffix>"

    Correspondents and tags named in the path are fetched from the
    database, or created when they do not exist yet. When only the title
    form matches, the correspondent is ``None`` and the tags are ``()``.
    """

    def correspondent_for(name):
        # Look the correspondent up by name; a new one gets a slugified name.
        found = Correspondent.objects.get_or_create(
            name=name,
            defaults={"slug": slugify(name)}
        )
        return found[0]

    def tags_for(joined):
        # One Tag per comma-separated slug, kept in file-name order.
        return tuple(
            Tag.objects.get_or_create(slug=slug, defaults={"name": slug})[0]
            for slug in joined.split(",")
        )

    def normalised(suffix):
        # Suffixes are stored lower-cased, with "jpeg" folded into "jpg".
        lowered = suffix.lower()
        return "jpg" if lowered == "jpeg" else lowered

    # Most specific form: correspondent, title and tags.
    with_tags = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
    if with_tags:
        correspondent = correspondent_for(with_tags.group(1))
        title = with_tags.group(2)
        tags = tags_for(with_tags.group(3))
        return correspondent, title, tags, normalised(with_tags.group(4))

    # Next: correspondent and title, no tags.
    with_correspondent = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
    if with_correspondent:
        correspondent = correspondent_for(with_correspondent.group(1))
        title = with_correspondent.group(2)
        return correspondent, title, (), normalised(
            with_correspondent.group(3))

    # Fallback: the whole name is the title.
    title_only = re.match(self.REGEX_TITLE, parseable)
    return None, title_only.group(1), (), normalised(title_only.group(2))
def _store(self, text, doc, thumbnail): def _store(self, text, doc, thumbnail):
sender, title, tags, file_type = self._guess_attributes_from_name(doc) file_info = FileInfo.from_path(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(tags)) relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
stats = os.stat(doc) stats = os.stat(doc)
self.log("debug", "Saving record to database") self.log("debug", "Saving record to database")
document = Document.objects.create( document = Document.objects.create(
correspondent=sender, correspondent=file_info.correspondent,
title=title, title=file_info.title,
content=text, content=text,
file_type=file_type, file_type=file_info.suffix,
created=timezone.make_aware( created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)), datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware( modified=timezone.make_aware(

View File

@ -12,6 +12,97 @@ from django.utils import timezone
from .managers import LogManager from .managers import LogManager
class FileInfo(object):
    """
    Read-only bundle of the attributes deduced from a document's file name.

    Instances are normally built with :meth:`from_path`. ``title`` and
    ``suffix`` are always set; ``correspondent`` is ``None`` and ``tags`` is
    an empty tuple when the file name does not carry them.
    """

    # "<title>.<suffix>". The greedy ``^.*/`` consumes everything up to the
    # last slash, so group 1 is always taken from the final path component.
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title>.<suffix>". The groups exclude "/" so that
    # ``^.*/`` cannot backtrack to an earlier slash and let a directory name
    # such as "my - docs" be read as a correspondent.
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/([^/]+) - ([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # "<correspondent> - <title> - <tags>.<suffix>", with the same
    # restriction to the final path component.
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/([^/]*) - ([^/]*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        self._title = title
        self._suffix = suffix
        self._correspondent = correspondent
        # Callers iterate over ``tags``; never hand them ``None``.
        self._tags = () if tags is None else tags

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        Correspondents and tags named in the file name are fetched from the
        database, or created if they do not exist yet.

        Raises ``ValueError`` if ``path`` matches none of the conventions
        (for example because of an unsupported suffix).
        """

        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]

        def get_tags(tags):
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",")
            )

        def get_suffix(suffix):
            # Suffixes are stored lower-cased, with "jpeg" folded into "jpg".
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix

        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = cls.REGEX_CORRESPONDENT_TITLE_TAGS.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )

        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = cls.REGEX_CORRESPONDENT_TITLE.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )

        # That didn't work, so we assume correspondent and tags are None
        m = cls.REGEX_TITLE.match(path)
        if m is None:
            raise ValueError(
                "Cannot extract file info from path: {!r}".format(path))
        # Use ``cls`` (not ``FileInfo``) so subclasses get their own type.
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @property
    def title(self):
        return self._title

    @property
    def correspondent(self):
        return self._correspondent

    @property
    def tags(self):
        return self._tags

    @property
    def suffix(self):
        return self._suffix
class SluggedModel(models.Model): class SluggedModel(models.Model):
name = models.CharField(max_length=128, unique=True) name = models.CharField(max_length=128, unique=True)

View File

@ -1,12 +1,11 @@
from django.test import TestCase from django.test import TestCase
from ..consumer import Consumer from ..models import FileInfo
class TestAttachment(TestCase): class TestAttachment(TestCase):
TAGS = ("tag1", "tag2", "tag3") TAGS = ("tag1", "tag2", "tag3")
CONSUMER = Consumer()
SUFFIXES = ( SUFFIXES = (
"pdf", "png", "jpg", "jpeg", "gif", "pdf", "png", "jpg", "jpeg", "gif",
"PDF", "PNG", "JPG", "JPEG", "GIF", "PDF", "PNG", "JPG", "JPEG", "GIF",
@ -16,14 +15,14 @@ class TestAttachment(TestCase):
def _test_guess_attributes_from_name(self, path, sender, title, tags):
    """
    Check what ``FileInfo.from_path`` extracts for every known suffix.

    ``path`` is a template with one ``{}`` placeholder for the suffix.
    ``sender`` is the expected correspondent name, or ``None`` when no
    correspondent should be detected. ``title`` is the expected title and
    ``tags`` the expected tuple of tag slugs.
    """
    for suffix in self.SUFFIXES:
        f = path.format(suffix)
        file_info = FileInfo.from_path(f)
        if sender is None:
            # No correspondent expected: there is no ``.name`` to compare.
            self.assertIsNone(file_info.correspondent, f)
        else:
            self.assertEqual(file_info.correspondent.name, sender, f)
        self.assertEqual(file_info.title, title, f)
        self.assertEqual(tuple(t.slug for t in file_info.tags), tags, f)
        if suffix.lower() == "jpeg":
            self.assertEqual(file_info.suffix, "jpg", f)
        else:
            self.assertEqual(file_info.suffix, suffix.lower(), f)
def test_guess_attributes_from_name0(self): def test_guess_attributes_from_name0(self):
self._test_guess_attributes_from_name( self._test_guess_attributes_from_name(
@ -92,3 +91,95 @@ class TestAttachment(TestCase):
"Τιτλε", "Τιτλε",
self.TAGS self.TAGS
) )
def test_guess_attributes_from_name_when_correspondent_empty(self):
    # The file name opens with " - ", so nothing precedes the separator.
    # Expected outcome: no correspondent, the whole name is the title.
    path_template = (
        '/path/to/ - weird empty correspondent but should not break.{}')
    expected_title = ' - weird empty correspondent but should not break'
    self._test_guess_attributes_from_name(
        path_template, None, expected_title, ())
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
    # A leading "- " without a space in front of it is not a separator.
    # Expected outcome: no correspondent, the whole name is the title.
    path_template = '/path/to/- weird but should not break.{}'
    expected_title = '- weird but should not break'
    self._test_guess_attributes_from_name(
        path_template, None, expected_title, ())
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
    # A trailing " -" without a space after it is not a separator.
    # Expected outcome: no correspondent, the whole name is the title.
    path_template = '/path/to/weird but should not break -.{}'
    expected_title = 'weird but should not break -'
    self._test_guess_attributes_from_name(
        path_template, None, expected_title, ())
def test_guess_attributes_from_name_when_title_is_empty(self):
    # Nothing follows the " - " separator.
    # Expected outcome: the correspondent is found and the title is empty.
    path_template = (
        '/path/to/weird correspondent but should not break - .{}')
    expected_sender = 'weird correspondent but should not break'
    self._test_guess_attributes_from_name(
        path_template, expected_sender, '', ())
class Permutations(TestCase):
    """
    Feed ``FileInfo.from_path`` every combination of the sample
    correspondents, titles, tags and suffixes below, for each of the three
    supported file-name layouts.
    """

    valid_correspondents = ['timmy', 'Dr. McWheelie',
                            'Dash Gor-don', 'ο Θερμαστής']
    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']

    def _test_guessed_attributes(
            self, filename, title, suffix, correspondent=None, tags=None):
        """
        Parse ``filename`` and compare each extracted attribute.

        ``title`` and ``suffix`` are always checked ("jpeg" is expected to
        come back as "jpg"). ``correspondent`` is the expected name, or
        ``None`` for no correspondent. ``tags`` is the expected
        comma-separated slug list, or ``None`` for an empty tuple.
        """
        parsed = FileInfo.from_path(filename)
        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix

        # Attributes every file name provides.
        self.assertEqual(parsed.title, title, filename)
        self.assertEqual(parsed.suffix, expected_suffix, filename)

        # Attributes that depend on the file-name layout.
        if correspondent is not None:
            self.assertEqual(parsed.correspondent.name,
                             correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent,
                             correspondent, filename)

        if tags is not None:
            found_slugs = [tag.slug for tag in parsed.tags]
            self.assertEqual(found_slugs, tags.split(','), filename)
        else:
            self.assertEqual(parsed.tags, (), filename)

    def test_just_title(self):
        # Layout: "<title>.<suffix>"
        specs = [
            dict(title=title, suffix=suffix)
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            filename = '/path/to/{title}.{suffix}'.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent(self):
        # Layout: "<correspondent> - <title>.<suffix>"
        specs = [
            dict(correspondent=correspondent, title=title, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            filename = (
                '/path/to/{correspondent} - {title}.{suffix}'.format(**spec))
            self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent_and_tags(self):
        # Layout: "<correspondent> - <title> - <tags>.<suffix>"
        specs = [
            dict(correspondent=correspondent, title=title,
                 tags=tags, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for tags in self.valid_tags
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            filename = (
                '/path/to/{correspondent} - {title} - {tags}.{suffix}'
                .format(**spec))
            self._test_guessed_attributes(filename, **spec)