Merge branch 'tikitu-refactor-file-info-extraction'

This commit is contained in:
Daniel Quinn 2016-03-24 19:18:46 +00:00
commit 11e1b9783e
4 changed files with 378 additions and 93 deletions

View File

@ -19,12 +19,11 @@ from PIL import Image
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log
from .models import Tag, Document, Log, FileInfo
from .languages import ISO639
@ -54,19 +53,6 @@ class Consumer(object):
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
def __init__(self):
self.logger = logging.getLogger(__name__)
@ -105,7 +91,7 @@ class Consumer(object):
if not os.path.isfile(doc):
continue
if not re.match(self.REGEX_TITLE, doc):
if not re.match(FileInfo.REGEXES["title"], doc):
continue
if doc in self._ignore:
@ -269,72 +255,20 @@ class Consumer(object):
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
def _guess_attributes_from_name(self, parseable):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title>.<suffix>"
"<title>.<suffix>"
"""
def get_correspondent(correspondent_name):
return Correspondent.objects.get_or_create(
name=correspondent_name,
defaults={"slug": slugify(correspondent_name)}
)[0]
def get_tags(tags):
r = []
for t in tags.split(","):
r.append(
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
return tuple(r)
def get_suffix(suffix):
suffix = suffix.lower()
if suffix == "jpeg":
return "jpg"
return suffix
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
if m:
return (
get_correspondent(m.group(1)),
m.group(2),
get_tags(m.group(3)),
get_suffix(m.group(4))
)
# Second attempt: "<correspondent> - <title>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
if m:
return (
get_correspondent(m.group(1)),
m.group(2),
(),
get_suffix(m.group(3))
)
# That didn't work, so we assume correspondent and tags are None
m = re.match(self.REGEX_TITLE, parseable)
return None, m.group(1), (), get_suffix(m.group(2))
def _store(self, text, doc, thumbnail):
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
file_info = FileInfo.from_path(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
stats = os.stat(doc)
self.log("debug", "Saving record to database")
document = Document.objects.create(
correspondent=sender,
title=title,
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
file_type=file_type,
file_type=file_info.extension,
created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(

View File

@ -96,11 +96,16 @@ class Command(Renderable, BaseCommand):
@staticmethod
def _get_legacy_file_name(doc):
if doc.correspondent and doc.title:
tags = ",".join([t.slug for t in doc.tags.all()])
if tags:
return "{} - {} - {}.{}".format(
doc.correspondent, doc.title, tags, doc.file_type)
return "{} - {}.{}".format(
doc.correspondent, doc.title, doc.file_type)
return os.path.basename(doc.source_path)
if not doc.correspondent and not doc.title:
return os.path.basename(doc.source_path)
created = doc.created.strftime("%Y%m%d%H%M%SZ")
tags = ",".join([t.slug for t in doc.tags.all()])
if tags:
return "{} - {} - {} - {}.{}".format(
created, doc.correspondent, doc.title, tags, doc.file_type)
return "{} - {} - {}.{}".format(
created, doc.correspondent, doc.title, doc.file_type)

View File

@ -1,8 +1,11 @@
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
from django.conf import settings
from django.core.urlresolvers import reverse
from django.db import models
@ -250,3 +253,136 @@ class Log(models.Model):
self.group = uuid.uuid4()
models.Model.save(self, *args, **kwargs)
class FileInfo(object):
    """
    Attributes of a document guessed from its file name.

    ``from_path()`` tries the patterns in ``REGEXES`` in order against the
    base name of a path and builds an instance from the first one that
    matches.
    """

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Order matters: from_path() stops at the first pattern that matches, so
    # the patterns with the most components come first and the bare
    # "title" pattern is the fallback.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        """Store the already-converted attributes; every one is optional."""
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """
        Parse a ``created`` stamp of 8 or 14 digits followed by "Z".

        The trailing "Z" is dropped, the digits are right-padded with zeros
        to 14 characters (so a date-only stamp gets 00:00:00 as its time),
        the "Z" is re-appended and the result is handed to dateutil.
        """
        return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))

    @classmethod
    def _get_correspondent(cls, name):
        """
        Return the Correspondent called ``name``, creating it if necessary.

        An empty (or missing) name yields ``None`` instead of a record.
        """
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        """Return the title unchanged."""
        return title

    @classmethod
    def _get_tags(cls, tags):
        """
        Return a tuple of Tag objects for a comma-separated list of slugs,
        creating any tag that does not exist yet.

        Empty slugs are skipped: the tag patterns above also match an empty
        tag segment and stray commas, and those must not produce a Tag whose
        slug is the empty string.
        """
        if not tags:
            return ()
        r = []
        for t in tags.split(","):
            if not t:
                continue
            r.append(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
        return tuple(r)

    @classmethod
    def _get_extension(cls, extension):
        """Return the lower-cased extension, with "jpeg" folded into "jpg"."""
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        """
        Replace the raw matched string stored under ``name`` in
        ``properties`` with the result of the matching ``_get_<name>``
        converter.  Keys that are absent are left alone.
        """
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the created date,
        correspondent, title, and tags easier.  Only the base name of
        ``path`` is examined, and the first of these forms that matches
        wins:

          "<created> - <correspondent> - <title> - <tags>.<suffix>"
          "<created> - <title> - <tags>.<suffix>"
          "<created> - <correspondent> - <title>.<suffix>"
          "<created> - <title>.<suffix>"
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        ``<created>`` is 8 or 14 digits followed by "Z".

        Returns a FileInfo, or ``None`` when the name matches none of the
        forms (for instance because of an unsupported suffix).
        """
        # The base name does not change between patterns, so compute it once.
        name = os.path.basename(path)
        for regex in cls.REGEXES.values():
            m = regex.match(name)
            if m:
                properties = m.groupdict()
                # Convert in a fixed order so that the correspondent is
                # looked up/created before the tags.
                for key in ("created", "correspondent", "title", "tags",
                            "extension"):
                    cls._mangle_property(properties, key)
                return cls(**properties)
        return None

View File

@ -1,29 +1,36 @@
from django.test import TestCase
from ..consumer import Consumer
from ..models import Document, FileInfo
class TestAttachment(TestCase):
TAGS = ("tag1", "tag2", "tag3")
CONSUMER = Consumer()
SUFFIXES = (
EXTENSIONS = (
"pdf", "png", "jpg", "jpeg", "gif",
"PDF", "PNG", "JPG", "JPEG", "GIF",
"PdF", "PnG", "JpG", "JPeG", "GiF",
)
def _test_guess_attributes_from_name(self, path, sender, title, tags):
for suffix in self.SUFFIXES:
f = path.format(suffix)
results = self.CONSUMER._guess_attributes_from_name(f)
self.assertEqual(results[0].name, sender, f)
self.assertEqual(results[1], title, f)
self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
if suffix.lower() == "jpeg":
self.assertEqual(results[3], "jpg", f)
for extension in self.EXTENSIONS:
f = path.format(extension)
file_info = FileInfo.from_path(f)
if sender:
self.assertEqual(file_info.correspondent.name, sender, f)
else:
self.assertEqual(results[3], suffix.lower(), f)
self.assertIsNone(file_info.correspondent, f)
self.assertEqual(file_info.title, title, f)
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
if extension.lower() == "jpeg":
self.assertEqual(file_info.extension, "jpg", f)
else:
self.assertEqual(file_info.extension, extension.lower(), f)
def test_guess_attributes_from_name0(self):
self._test_guess_attributes_from_name(
@ -92,3 +99,206 @@ class TestAttachment(TestCase):
"Τιτλε",
self.TAGS
)
def test_guess_attributes_from_name_when_correspondent_empty(self):
    """A name starting with " - " must parse with no correspondent."""
    self._test_guess_attributes_from_name(
        path='/path/to/ - weird empty correspondent but should not break.{}',
        sender=None,
        title='weird empty correspondent but should not break',
        tags=()
    )
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
    """A leading "- " is expected to stay part of the title."""
    self._test_guess_attributes_from_name(
        path='/path/to/- weird but should not break.{}',
        sender=None,
        title='- weird but should not break',
        tags=()
    )
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
    """A trailing " -" is expected to stay part of the title."""
    self._test_guess_attributes_from_name(
        path='/path/to/weird but should not break -.{}',
        sender=None,
        title='weird but should not break -',
        tags=()
    )
def test_guess_attributes_from_name_when_title_is_empty(self):
    """A name ending in " - " must parse with an empty title."""
    self._test_guess_attributes_from_name(
        path='/path/to/weird correspondent but should not break - .{}',
        sender='weird correspondent but should not break',
        title='',
        tags=()
    )
class Permutations(TestCase):
    """
    Combine the sample values below into file names, one test per naming
    form, and check that FileInfo.from_path() gives every component back.
    """

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = [
        "timmy",
        "Dr. McWheelie",
        "Dash Gor-don",
        "ο Θερμαστής",
        ""
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
                                 extension=None, tags=None):
        """
        Parse ``filename`` and compare each guessed attribute with the
        expected raw value; ``filename`` doubles as the failure message.
        """
        info = FileInfo.from_path(filename)

        # Created: only the year, month and day of the stamp are compared.
        if created is not None:
            date_parts = (
                ("year", created[:4]),
                ("month", created[4:6]),
                ("day", created[6:8]),
            )
            for attribute, digits in date_parts:
                self.assertEqual(
                    getattr(info.created, attribute), int(digits), filename)
        else:
            self.assertIsNone(info.created, filename)

        # Correspondent: an empty expected name means "no correspondent".
        if not correspondent:
            self.assertEqual(info.correspondent, None, filename)
        else:
            self.assertEqual(info.correspondent.name, correspondent, filename)

        # Title
        self.assertEqual(info.title, title, filename)

        # Tags: compared by slug, in file-name order.
        if tags is not None:
            found_slugs = [tag.slug for tag in info.tags]
            self.assertEqual(found_slugs, tags.split(','), filename)
        else:
            self.assertEqual(info.tags, (), filename)

        # Extension: "jpeg" is expected to come back as "jpg".
        expected_extension = 'jpg' if extension == 'jpeg' else extension
        self.assertEqual(info.extension, expected_extension, filename)

    def _check_template(self, template, **spec):
        """Fill ``template`` with ``spec`` and verify the resulting name."""
        self._test_guessed_attributes(template.format(**spec), **spec)

    def test_just_title(self):
        for title in self.valid_titles:
            for extension in self.valid_extensions:
                self._check_template(
                    '/path/to/{title}.{extension}',
                    title=title, extension=extension)

    def test_title_and_correspondent(self):
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    self._check_template(
                        '/path/to/{correspondent} - {title}.{extension}',
                        correspondent=correspondent, title=title,
                        extension=extension)

    def test_title_and_correspondent_and_tags(self):
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        self._check_template(
                            '/path/to/{correspondent} - {title} - '
                            '{tags}.{extension}',
                            correspondent=correspondent, title=title,
                            tags=tags, extension=extension)

    def test_created_and_correspondent_and_title_and_tags(self):
        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    for tags in self.valid_tags:
                        for extension in self.valid_extensions:
                            self._check_template(
                                "/path/to/{created} - {correspondent} - "
                                "{title} - {tags}.{extension}",
                                created=created,
                                correspondent=correspondent,
                                title=title,
                                tags=tags,
                                extension=extension)

    def test_created_and_correspondent_and_title(self):
        # Skip cases where title looks like a tag as we can't
        # accommodate such cases.
        titles = [t for t in self.valid_titles if t.lower() != t]

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in titles:
                    for extension in self.valid_extensions:
                        self._check_template(
                            "/path/to/{created} - {correspondent} - "
                            "{title}.{extension}",
                            created=created,
                            correspondent=correspondent,
                            title=title,
                            extension=extension)

    def test_created_and_title(self):
        for created in self.valid_dates:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    self._check_template(
                        "/path/to/{created} - {title}.{extension}",
                        created=created,
                        title=title,
                        extension=extension)

    def test_created_and_title_and_tags(self):
        for created in self.valid_dates:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        self._check_template(
                            "/path/to/{created} - {title} - "
                            "{tags}.{extension}",
                            created=created,
                            title=title,
                            tags=tags,
                            extension=extension)