mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Modifications for support for dates
This commit is contained in:
parent
cf5076bcad
commit
0aa0513004
@ -19,12 +19,11 @@ from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.template.defaultfilters import slugify
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from .models import Correspondent, Tag, Document, Log, FileInfo
|
||||
from .models import Tag, Document, Log, FileInfo
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
@ -92,7 +91,7 @@ class Consumer(object):
|
||||
if not os.path.isfile(doc):
|
||||
continue
|
||||
|
||||
if not re.match(FileInfo.REGEX_TITLE, doc):
|
||||
if not re.match(FileInfo.REGEXES["title"], doc):
|
||||
continue
|
||||
|
||||
if doc in self._ignore:
|
||||
@ -269,7 +268,7 @@ class Consumer(object):
|
||||
correspondent=file_info.correspondent,
|
||||
title=file_info.title,
|
||||
content=text,
|
||||
file_type=file_info.suffix,
|
||||
file_type=file_info.extension,
|
||||
created=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||
modified=timezone.make_aware(
|
||||
|
@ -96,11 +96,16 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
@staticmethod
def _get_legacy_file_name(doc):
    """
    Build a descriptive file name for ``doc``.

    The name is laid out as

        "<created> - <correspondent> - <title> - <tags>.<file_type>"

    where ``<created>`` is ``doc.created`` formatted as
    ``%Y%m%d%H%M%SZ`` and ``<tags>`` is the comma-joined slugs of
    ``doc.tags.all()``.  The correspondent and tags segments are left
    out when the document has none.

    If the document has neither a correspondent nor a title, the base
    name of ``doc.source_path`` is returned instead.
    """

    if not doc.correspondent and not doc.title:
        return os.path.basename(doc.source_path)

    segments = [doc.created.strftime("%Y%m%d%H%M%SZ")]

    # Formatting a missing correspondent with "{}" would put the literal
    # text "None" into the file name, so the segment is skipped instead.
    if doc.correspondent:
        segments.append(str(doc.correspondent))

    # Same reasoning for a missing title: keep the segment, but empty.
    segments.append(doc.title or "")

    tags = ",".join(t.slug for t in doc.tags.all())
    if tags:
        segments.append(tags)

    return "{}.{}".format(" - ".join(segments), doc.file_type)
|
||||
|
@ -1,8 +1,11 @@
|
||||
import dateutil.parser
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.urlresolvers import reverse
|
||||
from django.db import models
|
||||
@ -12,97 +15,6 @@ from django.utils import timezone
|
||||
from .managers import LogManager
|
||||
|
||||
|
||||
class FileInfo(object):
    """
    Attributes guessed from a document's path: title, suffix and,
    optionally, correspondent and tags.
    """

    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        self._title, self._suffix = title, suffix
        self._correspondent, self._tags = correspondent, tags

    @classmethod
    def from_path(cls, path):
        """
        Guess the attributes of the file at ``path`` from its name.

        Three layouts are tried, most specific first:

          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        The suffix is lower-cased and "jpeg" is stored as "jpg".
        """

        def normalise(raw_suffix):
            lowered = raw_suffix.lower()
            return "jpg" if lowered == "jpeg" else lowered

        def lookup_correspondent(name):
            found, _ = Correspondent.objects.get_or_create(
                name=name,
                defaults={"slug": slugify(name)}
            )
            return found

        def lookup_tags(joined):
            return tuple(
                Tag.objects.get_or_create(slug=slug, defaults={"name": slug})[0]
                for slug in joined.split(",")
            )

        # Most specific layout: correspondent, title and tags
        hit = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
        if hit:
            sender, title, tag_list, ext = hit.groups()
            sender = lookup_correspondent(sender)
            tag_list = lookup_tags(tag_list)
            return cls(
                title=title,
                correspondent=sender,
                tags=tag_list,
                suffix=normalise(ext)
            )

        # Next: correspondent and title only
        hit = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
        if hit:
            sender, title, ext = hit.groups()
            sender = lookup_correspondent(sender)
            return cls(
                title=title,
                correspondent=sender,
                tags=(),
                suffix=normalise(ext)
            )

        # Fall back to a bare title with no correspondent and no tags
        hit = re.match(cls.REGEX_TITLE, path)
        return FileInfo(
            title=hit.group(1), tags=(), suffix=normalise(hit.group(2)))

    @property
    def title(self):
        return self._title

    @property
    def correspondent(self):
        return self._correspondent

    @property
    def tags(self):
        return self._tags

    @property
    def suffix(self):
        return self._suffix
|
||||
|
||||
class SluggedModel(models.Model):
|
||||
|
||||
name = models.CharField(max_length=128, unique=True)
|
||||
@ -341,3 +253,136 @@ class Log(models.Model):
|
||||
self.group = uuid.uuid4()
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
|
||||
class FileInfo(object):
    """
    Attributes guessed from a document's file name: creation date,
    correspondent, title, tags and extension.

    Use ``FileInfo.from_path()`` to build an instance from a path.
    """

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Tried in this order by from_path(); the first pattern that matches the
    # file's base name wins, so the most specific layouts come first.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):

        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """
        Parse a "<digits>Z" stamp as matched by the ``created`` group.

        The trailing character is dropped, the digits are right-padded with
        zeros to 14 characters (so a date-only stamp gets a zero time part),
        and "Z" is appended again before handing the text to dateutil.
        """
        return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))

    @classmethod
    def _get_correspondent(cls, name):
        """
        Return the Correspondent called ``name``, creating it (with a
        slugified name as slug) if needed, or None when ``name`` is empty.
        """
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        """Return the title unchanged."""
        return title

    @classmethod
    def _get_tags(cls, tags):
        """
        Turn a comma-separated string of slugs into a tuple of Tag objects,
        creating any that do not exist yet.

        Empty entries are skipped: the filename patterns allow an empty tags
        segment, and "".split(",") yields [""], which would otherwise create
        a tag with an empty slug.
        """
        return tuple(
            Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
            for t in tags.split(",") if t
        )

    @classmethod
    def _get_extension(cls, extension):
        """Lower-case the extension and store "jpeg" as "jpg"."""
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        """
        Replace ``properties[name]``, if present, with the result of passing
        it through the matching ``_get_<name>`` class method.
        """
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the creation date,
        correspondent, title, and tags easier.  Only the base name of
        ``path`` is examined, and these layouts are tried in order:

          "<created> - <correspondent> - <title> - <tags>.<extension>"
          "<created> - <title> - <tags>.<extension>"
          "<created> - <correspondent> - <title>.<extension>"
          "<created> - <title>.<extension>"
          "<correspondent> - <title> - <tags>.<extension>"
          "<correspondent> - <title>.<extension>"
          "<title>.<extension>"

        ``<created>`` is 8 or 14 digits followed by "Z".

        Returns a FileInfo for the first layout that matches, or None when
        the name matches none of them.
        """

        # The base name does not change between patterns, so compute it once
        name = os.path.basename(path)

        for regex in cls.REGEXES.values():
            m = regex.match(name)
            if m:
                properties = m.groupdict()
                for key in ("created", "correspondent", "title", "tags",
                            "extension"):
                    cls._mangle_property(properties, key)
                return cls(**properties)

        return None
|
||||
|
@ -1,28 +1,36 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..models import FileInfo
|
||||
from ..models import Document, FileInfo
|
||||
|
||||
|
||||
class TestAttachment(TestCase):
|
||||
|
||||
TAGS = ("tag1", "tag2", "tag3")
|
||||
SUFFIXES = (
|
||||
EXTENSIONS = (
|
||||
"pdf", "png", "jpg", "jpeg", "gif",
|
||||
"PDF", "PNG", "JPG", "JPEG", "GIF",
|
||||
"PdF", "PnG", "JpG", "JPeG", "GiF",
|
||||
)
|
||||
|
||||
def _test_guess_attributes_from_name(self, path, sender, title, tags):
|
||||
for suffix in self.SUFFIXES:
|
||||
f = path.format(suffix)
|
||||
|
||||
for extension in self.EXTENSIONS:
|
||||
|
||||
f = path.format(extension)
|
||||
file_info = FileInfo.from_path(f)
|
||||
self.assertEqual(file_info.correspondent.name, sender, f)
|
||||
self.assertEqual(file_info.title, title, f)
|
||||
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
|
||||
if suffix.lower() == "jpeg":
|
||||
self.assertEqual(file_info.suffix, "jpg", f)
|
||||
|
||||
if sender:
|
||||
self.assertEqual(file_info.correspondent.name, sender, f)
|
||||
else:
|
||||
self.assertEqual(file_info.suffix, suffix.lower(), f)
|
||||
self.assertIsNone(file_info.correspondent, f)
|
||||
|
||||
self.assertEqual(file_info.title, title, f)
|
||||
|
||||
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
|
||||
if extension.lower() == "jpeg":
|
||||
self.assertEqual(file_info.extension, "jpg", f)
|
||||
else:
|
||||
self.assertEqual(file_info.extension, extension.lower(), f)
|
||||
|
||||
def test_guess_attributes_from_name0(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
@ -96,7 +104,7 @@ class TestAttachment(TestCase):
|
||||
self._test_guess_attributes_from_name(
|
||||
'/path/to/ - weird empty correspondent but should not break.{}',
|
||||
None,
|
||||
' - weird empty correspondent but should not break',
|
||||
'weird empty correspondent but should not break',
|
||||
()
|
||||
)
|
||||
|
||||
@ -126,60 +134,171 @@ class TestAttachment(TestCase):
|
||||
|
||||
|
||||
class Permutations(TestCase):
|
||||
valid_correspondents = ['timmy', 'Dr. McWheelie',
|
||||
'Dash Gor-don', 'ο Θερμαστής']
|
||||
valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
|
||||
valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
|
||||
valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']
|
||||
|
||||
def _test_guessed_attributes(
|
||||
self, filename, title, suffix, correspondent=None, tags=None):
|
||||
file_info = FileInfo.from_path(filename)
|
||||
valid_dates = (
|
||||
"20150102030405Z",
|
||||
"20150102Z",
|
||||
)
|
||||
valid_correspondents = [
|
||||
"timmy",
|
||||
"Dr. McWheelie",
|
||||
"Dash Gor-don",
|
||||
"ο Θερμαστής",
|
||||
""
|
||||
]
|
||||
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
|
||||
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
|
||||
valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]
|
||||
|
||||
# Required
|
||||
self.assertEqual(file_info.title, title, filename)
|
||||
if suffix == 'jpeg':
|
||||
suffix = 'jpg'
|
||||
self.assertEqual(file_info.suffix, suffix, filename)
|
||||
# Optional
|
||||
if correspondent is None:
|
||||
self.assertEqual(file_info.correspondent,
|
||||
correspondent, filename)
|
||||
def _test_guessed_attributes(self, filename, created=None,
|
||||
correspondent=None, title=None,
|
||||
extension=None, tags=None):
|
||||
|
||||
# print(filename)
|
||||
info = FileInfo.from_path(filename)
|
||||
|
||||
# Created
|
||||
if created is None:
|
||||
self.assertIsNone(info.created, filename)
|
||||
else:
|
||||
self.assertEqual(file_info.correspondent.name,
|
||||
correspondent, filename)
|
||||
self.assertEqual(info.created.year, int(created[:4]), filename)
|
||||
self.assertEqual(info.created.month, int(created[4:6]), filename)
|
||||
self.assertEqual(info.created.day, int(created[6:8]), filename)
|
||||
|
||||
# Correspondent
|
||||
if correspondent:
|
||||
self.assertEqual(info.correspondent.name, correspondent, filename)
|
||||
else:
|
||||
self.assertEqual(info.correspondent, None, filename)
|
||||
|
||||
# Title
|
||||
self.assertEqual(info.title, title, filename)
|
||||
|
||||
# Tags
|
||||
if tags is None:
|
||||
self.assertEqual(file_info.tags, (), filename)
|
||||
self.assertEqual(info.tags, (), filename)
|
||||
else:
|
||||
self.assertEqual([t.slug for t in file_info.tags],
|
||||
tags.split(','),
|
||||
filename)
|
||||
self.assertEqual(
|
||||
[t.slug for t in info.tags], tags.split(','),
|
||||
filename
|
||||
)
|
||||
|
||||
# Extension
|
||||
if extension == 'jpeg':
|
||||
extension = 'jpg'
|
||||
self.assertEqual(info.extension, extension, filename)
|
||||
|
||||
def test_just_title(self):
|
||||
template = '/path/to/{title}.{suffix}'
|
||||
template = '/path/to/{title}.{extension}'
|
||||
for title in self.valid_titles:
|
||||
for suffix in self.valid_suffixes:
|
||||
spec = dict(title=title, suffix=suffix)
|
||||
for extension in self.valid_extensions:
|
||||
spec = dict(title=title, extension=extension)
|
||||
filename = template.format(**spec)
|
||||
self._test_guessed_attributes(filename, **spec)
|
||||
|
||||
def test_title_and_correspondent(self):
|
||||
template = '/path/to/{correspondent} - {title}.{suffix}'
|
||||
template = '/path/to/{correspondent} - {title}.{extension}'
|
||||
for correspondent in self.valid_correspondents:
|
||||
for title in self.valid_titles:
|
||||
for suffix in self.valid_suffixes:
|
||||
for extension in self.valid_extensions:
|
||||
spec = dict(correspondent=correspondent, title=title,
|
||||
suffix=suffix)
|
||||
extension=extension)
|
||||
filename = template.format(**spec)
|
||||
self._test_guessed_attributes(filename, **spec)
|
||||
|
||||
def test_title_and_correspondent_and_tags(self):
|
||||
template = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
|
||||
template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
|
||||
for correspondent in self.valid_correspondents:
|
||||
for title in self.valid_titles:
|
||||
for tags in self.valid_tags:
|
||||
for suffix in self.valid_suffixes:
|
||||
for extension in self.valid_extensions:
|
||||
spec = dict(correspondent=correspondent, title=title,
|
||||
tags=tags, suffix=suffix)
|
||||
tags=tags, extension=extension)
|
||||
filename = template.format(**spec)
|
||||
self._test_guessed_attributes(filename, **spec)
|
||||
|
||||
def test_created_and_correspondent_and_title_and_tags(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{correspondent} - "
|
||||
"{title} - "
|
||||
"{tags}"
|
||||
".{extension}")
|
||||
|
||||
for created in self.valid_dates:
|
||||
for correspondent in self.valid_correspondents:
|
||||
for title in self.valid_titles:
|
||||
for tags in self.valid_tags:
|
||||
for extension in self.valid_extensions:
|
||||
spec = {
|
||||
"created": created,
|
||||
"correspondent": correspondent,
|
||||
"title": title,
|
||||
"tags": tags,
|
||||
"extension": extension
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
def test_created_and_correspondent_and_title(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{correspondent} - "
|
||||
"{title}"
|
||||
".{extension}")
|
||||
|
||||
for created in self.valid_dates:
|
||||
for correspondent in self.valid_correspondents:
|
||||
for title in self.valid_titles:
|
||||
|
||||
# Skip cases where title looks like a tag as we can't
|
||||
# accommodate such cases.
|
||||
if title.lower() == title:
|
||||
continue
|
||||
|
||||
for extension in self.valid_extensions:
|
||||
spec = {
|
||||
"created": created,
|
||||
"correspondent": correspondent,
|
||||
"title": title,
|
||||
"extension": extension
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
def test_created_and_title(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{title}"
|
||||
".{extension}")
|
||||
|
||||
for created in self.valid_dates:
|
||||
for title in self.valid_titles:
|
||||
for extension in self.valid_extensions:
|
||||
spec = {
|
||||
"created": created,
|
||||
"title": title,
|
||||
"extension": extension
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
def test_created_and_title_and_tags(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{title} - "
|
||||
"{tags}"
|
||||
".{extension}")
|
||||
|
||||
for created in self.valid_dates:
|
||||
for title in self.valid_titles:
|
||||
for tags in self.valid_tags:
|
||||
for extension in self.valid_extensions:
|
||||
spec = {
|
||||
"created": created,
|
||||
"title": title,
|
||||
"tags": tags,
|
||||
"extension": extension
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
Loading…
x
Reference in New Issue
Block a user