Modifications for support for dates

This commit is contained in:
Daniel Quinn 2016-03-24 19:18:33 +00:00
parent cf5076bcad
commit 0aa0513004
4 changed files with 314 additions and 146 deletions

View File

@ -19,12 +19,11 @@ from PIL import Image
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log, FileInfo
from .models import Tag, Document, Log, FileInfo
from .languages import ISO639
@ -92,7 +91,7 @@ class Consumer(object):
if not os.path.isfile(doc):
continue
if not re.match(FileInfo.REGEX_TITLE, doc):
if not re.match(FileInfo.REGEXES["title"], doc):
continue
if doc in self._ignore:
@ -269,7 +268,7 @@ class Consumer(object):
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
file_type=file_info.suffix,
file_type=file_info.extension,
created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(

View File

@ -96,11 +96,16 @@ class Command(Renderable, BaseCommand):
@staticmethod
def _get_legacy_file_name(doc):
    """
    Build the human-readable file name used when exporting ``doc``.

    The name has one of these shapes, joined with " - ":

        "<created> - <correspondent> - <title> - <tags>.<file_type>"
        "<created> - <correspondent> - <title>.<file_type>"
        "<created> - <title> - <tags>.<file_type>"
        "<created> - <title>.<file_type>"

    where ``<created>`` is ``doc.created`` formatted as
    ``YYYYMMDDHHMMSSZ`` and ``<tags>`` is the comma-separated list of tag
    slugs.  When the document has neither a correspondent nor a title,
    the base name of ``doc.source_path`` is returned instead.

    :param doc: object exposing ``correspondent``, ``title``, ``created``,
        ``tags.all()``, ``file_type`` and ``source_path``.
    :return: the file name as a string (no directory component).
    """
    if not doc.correspondent and not doc.title:
        return os.path.basename(doc.source_path)

    parts = [doc.created.strftime("%Y%m%d%H%M%SZ")]

    # A missing correspondent is left out entirely rather than being
    # rendered as the literal text "None", which would otherwise end up in
    # the file name as if it were a real correspondent.
    if doc.correspondent:
        parts.append(str(doc.correspondent))

    # The title slot is always emitted (possibly empty) so that a
    # correspondent is never mistaken for the title.
    parts.append(doc.title or "")

    tags = ",".join([t.slug for t in doc.tags.all()])
    if tags:
        parts.append(tags)

    return "{}.{}".format(" - ".join(parts), doc.file_type)

View File

@ -1,8 +1,11 @@
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
from django.conf import settings
from django.core.urlresolvers import reverse
from django.db import models
@ -12,97 +15,6 @@ from django.utils import timezone
from .managers import LogManager
class FileInfo(object):
def __init__(self, title, suffix,
correspondent=None, tags=None):
self._title = title
self._suffix = suffix
self._correspondent = correspondent
self._tags = tags
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
@classmethod
def from_path(cls, path):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title>.<suffix>"
"<title>.<suffix>"
"""
def get_correspondent(correspondent_name):
return Correspondent.objects.get_or_create(
name=correspondent_name,
defaults={"slug": slugify(correspondent_name)}
)[0]
def get_tags(tags):
r = []
for t in tags.split(","):
r.append(
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
return tuple(r)
def get_suffix(suffix):
suffix = suffix.lower()
if suffix == "jpeg":
return "jpg"
return suffix
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
if m:
return cls(
title=m.group(2),
correspondent=get_correspondent(m.group(1)),
tags=get_tags(m.group(3)),
suffix=get_suffix(m.group(4))
)
# Second attempt: "<correspondent> - <title>.<suffix>"
m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
if m:
return cls(
title=m.group(2),
correspondent=get_correspondent(m.group(1)),
tags=(),
suffix=get_suffix(m.group(3))
)
# That didn't work, so we assume correspondent and tags are None
m = re.match(cls.REGEX_TITLE, path)
return FileInfo(
title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
@property
def title(self):
return self._title
@property
def correspondent(self):
return self._correspondent
@property
def tags(self):
return self._tags
@property
def suffix(self):
return self._suffix
class SluggedModel(models.Model):
name = models.CharField(max_length=128, unique=True)
@ -341,3 +253,136 @@ class Log(models.Model):
self.group = uuid.uuid4()
models.Model.save(self, *args, **kwargs)
class FileInfo(object):
    """
    Document metadata guessed from a file name.

    ``from_path()`` matches the base name of a path against ``REGEXES`` and
    stores whatever could be extracted: ``created``, ``correspondent``,
    ``title``, ``tags`` and ``extension``.  Parts that the matching pattern
    does not contain keep the defaults from ``__init__``.
    """

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Ordering matters: from_path() uses the first pattern that matches, so
    # the patterns with the most components come first and the bare "title"
    # pattern is the final fallback.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        ))
    ])

    # Names of the captured groups that get post-processed by the matching
    # ``_get_<name>`` classmethod before being handed to ``__init__``.
    _PROPERTIES = ("created", "correspondent", "title", "tags", "extension")

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        """Store the already-converted values; no validation is performed."""
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """
        Parse a "YYYYMMDDZ" or "YYYYMMDDHHMMSSZ" stamp with dateutil.

        The trailing "Z" is dropped, the digits are right-padded with zeros
        to 14 characters (so a date-only stamp gets a 000000 time part) and
        the "Z" is put back before parsing.
        """
        return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))

    @classmethod
    def _get_correspondent(cls, name):
        """
        Return the Correspondent called ``name``, creating it if needed.

        An empty ``name`` yields ``None`` instead of a nameless record.
        """
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        """Return the title unchanged."""
        return title

    @classmethod
    def _get_tags(cls, tags):
        """
        Turn a comma-separated string of slugs into a tuple of Tag objects.

        Each slug is looked up with ``get_or_create``.  Empty entries (an
        empty tag field, or stray/doubled commas) are skipped so that no Tag
        with an empty slug is ever created.
        """
        return tuple(
            Tag.objects.get_or_create(slug=slug, defaults={"name": slug})[0]
            for slug in tags.split(",")
            if slug
        )

    @classmethod
    def _get_extension(cls, extension):
        """Lower-case the extension and normalise "jpeg" to "jpg"."""
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        """
        Replace ``properties[name]`` in place with its converted value.

        Does nothing when the matching pattern had no group called ``name``.
        """
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_path(cls, path):
        """
        Build a FileInfo from the base name of ``path``.

        We use a crude naming convention to make handling the creation date,
        correspondent, title, and tags easier.  The following forms are
        tried in this order, and the first one that matches wins:

          "<created> - <correspondent> - <title> - <tags>.<extension>"
          "<created> - <title> - <tags>.<extension>"
          "<created> - <correspondent> - <title>.<extension>"
          "<created> - <title>.<extension>"
          "<correspondent> - <title> - <tags>.<extension>"
          "<correspondent> - <title>.<extension>"
          "<title>.<extension>"

        ``<created>`` is either "YYYYMMDDZ" or "YYYYMMDDHHMMSSZ".

        :param path: file path; only its base name is inspected.
        :return: a FileInfo, or ``None`` if no pattern matches (for example
            when the extension is not one of the recognised ones).
        """
        file_name = os.path.basename(path)
        for regex in cls.REGEXES.values():
            m = regex.match(file_name)
            if m:
                properties = m.groupdict()
                for name in cls._PROPERTIES:
                    cls._mangle_property(properties, name)
                return cls(**properties)
        return None

View File

@ -1,28 +1,36 @@
from django.test import TestCase
from ..models import FileInfo
from ..models import Document, FileInfo
class TestAttachment(TestCase):
TAGS = ("tag1", "tag2", "tag3")
SUFFIXES = (
EXTENSIONS = (
"pdf", "png", "jpg", "jpeg", "gif",
"PDF", "PNG", "JPG", "JPEG", "GIF",
"PdF", "PnG", "JpG", "JPeG", "GiF",
)
def _test_guess_attributes_from_name(self, path, sender, title, tags):
for suffix in self.SUFFIXES:
f = path.format(suffix)
for extension in self.EXTENSIONS:
f = path.format(extension)
file_info = FileInfo.from_path(f)
self.assertEqual(file_info.correspondent.name, sender, f)
self.assertEqual(file_info.title, title, f)
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
if suffix.lower() == "jpeg":
self.assertEqual(file_info.suffix, "jpg", f)
if sender:
self.assertEqual(file_info.correspondent.name, sender, f)
else:
self.assertEqual(file_info.suffix, suffix.lower(), f)
self.assertIsNone(file_info.correspondent, f)
self.assertEqual(file_info.title, title, f)
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
if extension.lower() == "jpeg":
self.assertEqual(file_info.extension, "jpg", f)
else:
self.assertEqual(file_info.extension, extension.lower(), f)
def test_guess_attributes_from_name0(self):
self._test_guess_attributes_from_name(
@ -96,7 +104,7 @@ class TestAttachment(TestCase):
self._test_guess_attributes_from_name(
'/path/to/ - weird empty correspondent but should not break.{}',
None,
' - weird empty correspondent but should not break',
'weird empty correspondent but should not break',
()
)
@ -126,60 +134,171 @@ class TestAttachment(TestCase):
class Permutations(TestCase):
valid_correspondents = ['timmy', 'Dr. McWheelie',
'Dash Gor-don', 'ο Θερμαστής']
valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']
def _test_guessed_attributes(
self, filename, title, suffix, correspondent=None, tags=None):
file_info = FileInfo.from_path(filename)
valid_dates = (
"20150102030405Z",
"20150102Z",
)
valid_correspondents = [
"timmy",
"Dr. McWheelie",
"Dash Gor-don",
"ο Θερμαστής",
""
]
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]
# Required
self.assertEqual(file_info.title, title, filename)
if suffix == 'jpeg':
suffix = 'jpg'
self.assertEqual(file_info.suffix, suffix, filename)
# Optional
if correspondent is None:
self.assertEqual(file_info.correspondent,
correspondent, filename)
def _test_guessed_attributes(self, filename, created=None,
correspondent=None, title=None,
extension=None, tags=None):
# print(filename)
info = FileInfo.from_path(filename)
# Created
if created is None:
self.assertIsNone(info.created, filename)
else:
self.assertEqual(file_info.correspondent.name,
correspondent, filename)
self.assertEqual(info.created.year, int(created[:4]), filename)
self.assertEqual(info.created.month, int(created[4:6]), filename)
self.assertEqual(info.created.day, int(created[6:8]), filename)
# Correspondent
if correspondent:
self.assertEqual(info.correspondent.name, correspondent, filename)
else:
self.assertEqual(info.correspondent, None, filename)
# Title
self.assertEqual(info.title, title, filename)
# Tags
if tags is None:
self.assertEqual(file_info.tags, (), filename)
self.assertEqual(info.tags, (), filename)
else:
self.assertEqual([t.slug for t in file_info.tags],
tags.split(','),
filename)
self.assertEqual(
[t.slug for t in info.tags], tags.split(','),
filename
)
# Extension
if extension == 'jpeg':
extension = 'jpg'
self.assertEqual(info.extension, extension, filename)
def test_just_title(self):
template = '/path/to/{title}.{suffix}'
template = '/path/to/{title}.{extension}'
for title in self.valid_titles:
for suffix in self.valid_suffixes:
spec = dict(title=title, suffix=suffix)
for extension in self.valid_extensions:
spec = dict(title=title, extension=extension)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
def test_title_and_correspondent(self):
template = '/path/to/{correspondent} - {title}.{suffix}'
template = '/path/to/{correspondent} - {title}.{extension}'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for suffix in self.valid_suffixes:
for extension in self.valid_extensions:
spec = dict(correspondent=correspondent, title=title,
suffix=suffix)
extension=extension)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
def test_title_and_correspondent_and_tags(self):
template = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for tags in self.valid_tags:
for suffix in self.valid_suffixes:
for extension in self.valid_extensions:
spec = dict(correspondent=correspondent, title=title,
tags=tags, suffix=suffix)
tags=tags, extension=extension)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
def test_created_and_correspondent_and_title_and_tags(self):
    """
    Check every combination of date, correspondent, title, tags and
    extension against the fully populated file name form.
    """
    template = (
        "/path/to/{created} - {correspondent} - {title} - {tags}"
        ".{extension}"
    )
    combinations = [
        (created, correspondent, title, tags, extension)
        for created in self.valid_dates
        for correspondent in self.valid_correspondents
        for title in self.valid_titles
        for tags in self.valid_tags
        for extension in self.valid_extensions
    ]
    for created, correspondent, title, tags, extension in combinations:
        spec = dict(
            created=created,
            correspondent=correspondent,
            title=title,
            tags=tags,
            extension=extension,
        )
        filename = template.format(**spec)
        self._test_guessed_attributes(filename, **spec)
def test_created_and_correspondent_and_title(self):
    """
    Check the "<created> - <correspondent> - <title>" file name form for
    every usable combination of the valid sample values.
    """
    template = "/path/to/{created} - {correspondent} - {title}.{extension}"

    # Skip cases where title looks like a tag as we can't accommodate such
    # cases: only titles that change when lower-cased are exercised.
    usable_titles = [t for t in self.valid_titles if t.lower() != t]

    for created in self.valid_dates:
        for correspondent in self.valid_correspondents:
            for title in usable_titles:
                for extension in self.valid_extensions:
                    spec = dict(
                        created=created,
                        correspondent=correspondent,
                        title=title,
                        extension=extension,
                    )
                    filename = template.format(**spec)
                    self._test_guessed_attributes(filename, **spec)
def test_created_and_title(self):
    """
    Check the "<created> - <title>" file name form for every combination
    of valid date, title and extension.
    """
    template = "/path/to/{created} - {title}.{extension}"
    combinations = [
        (created, title, extension)
        for created in self.valid_dates
        for title in self.valid_titles
        for extension in self.valid_extensions
    ]
    for created, title, extension in combinations:
        spec = dict(created=created, title=title, extension=extension)
        filename = template.format(**spec)
        self._test_guessed_attributes(filename, **spec)
def test_created_and_title_and_tags(self):
    """
    Check the "<created> - <title> - <tags>" file name form for every
    combination of valid date, title, tags and extension.
    """
    template = "/path/to/{created} - {title} - {tags}.{extension}"
    combinations = [
        (created, title, tags, extension)
        for created in self.valid_dates
        for title in self.valid_titles
        for tags in self.valid_tags
        for extension in self.valid_extensions
    ]
    for created, title, tags, extension in combinations:
        spec = dict(
            created=created,
            title=title,
            tags=tags,
            extension=extension,
        )
        filename = template.format(**spec)
        self._test_guessed_attributes(filename, **spec)