mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction
This commit is contained in:
commit
cf5076bcad
@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
|
|||||||
|
|
||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
from .models import Correspondent, Tag, Document, Log
|
from .models import Correspondent, Tag, Document, Log, FileInfo
|
||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
@ -54,19 +54,6 @@ class Consumer(object):
|
|||||||
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
|
||||||
REGEX_TITLE = re.compile(
|
|
||||||
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
REGEX_CORRESPONDENT_TITLE = re.compile(
|
|
||||||
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
|
|
||||||
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
@ -105,7 +92,7 @@ class Consumer(object):
|
|||||||
if not os.path.isfile(doc):
|
if not os.path.isfile(doc):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not re.match(self.REGEX_TITLE, doc):
|
if not re.match(FileInfo.REGEX_TITLE, doc):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if doc in self._ignore:
|
if doc in self._ignore:
|
||||||
@ -269,72 +256,20 @@ class Consumer(object):
|
|||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return re.sub(r"\s+", " ", r)
|
||||||
|
|
||||||
def _guess_attributes_from_name(self, parseable):
    """
    Derive document attributes from a file path.

    The file name is matched against the naming conventions below, from
    the most specific to the least specific:

      "<correspondent> - <title> - <tags>.<suffix>"
      "<correspondent> - <title>.<suffix>"
      "<title>.<suffix>"

    Returns a 4-tuple ``(correspondent, title, tags, file_type)``.  The
    correspondent is ``None`` and the tags are an empty tuple when the
    name does not carry them.  Correspondents and tags named in the file
    are fetched with ``get_or_create``, so missing ones are created.
    """

    def normalise_suffix(raw):
        # "jpeg" and "jpg" are the same format; store one spelling only.
        lowered = raw.lower()
        return "jpg" if lowered == "jpeg" else lowered

    def lookup_correspondent(name):
        correspondent, _ = Correspondent.objects.get_or_create(
            name=name,
            defaults={"slug": slugify(name)}
        )
        return correspondent

    def lookup_tags(joined):
        # Tags are comma separated and identified by their slug.
        return tuple(
            Tag.objects.get_or_create(slug=slug, defaults={"name": slug})[0]
            for slug in joined.split(",")
        )

    # Most specific form: correspondent, title and tags are all present.
    full = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
    if full:
        name, title, tags, suffix = full.group(1, 2, 3, 4)
        return (
            lookup_correspondent(name),
            title,
            lookup_tags(tags),
            normalise_suffix(suffix),
        )

    # Next: correspondent and title, but no tags.
    partial = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
    if partial:
        name, title, suffix = partial.group(1, 2, 3)
        return (
            lookup_correspondent(name),
            title,
            (),
            normalise_suffix(suffix),
        )

    # Fallback: the whole stem is the title.
    bare = re.match(self.REGEX_TITLE, parseable)
    return None, bare.group(1), (), normalise_suffix(bare.group(2))
|
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
file_info = FileInfo.from_path(doc)
|
||||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
|
||||||
|
|
||||||
stats = os.stat(doc)
|
stats = os.stat(doc)
|
||||||
|
|
||||||
self.log("debug", "Saving record to database")
|
self.log("debug", "Saving record to database")
|
||||||
|
|
||||||
document = Document.objects.create(
|
document = Document.objects.create(
|
||||||
correspondent=sender,
|
correspondent=file_info.correspondent,
|
||||||
title=title,
|
title=file_info.title,
|
||||||
content=text,
|
content=text,
|
||||||
file_type=file_type,
|
file_type=file_info.suffix,
|
||||||
created=timezone.make_aware(
|
created=timezone.make_aware(
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||||
modified=timezone.make_aware(
|
modified=timezone.make_aware(
|
||||||
|
@ -12,6 +12,97 @@ from django.utils import timezone
|
|||||||
from .managers import LogManager
|
from .managers import LogManager
|
||||||
|
|
||||||
|
|
||||||
|
class FileInfo(object):
    """
    Attributes of a document as guessed from the name of its file.

    Instances are read-only value holders exposing ``title``, ``suffix``,
    ``correspondent`` and ``tags``.  Use :meth:`from_path` to build one
    from a file path.
    """

    # In every pattern the leading "^.*/" consumes the directory part of
    # the path, so a path must contain at least one "/".
    #
    # REGEX_TITLE is greedy up to the last "/" and never needs to
    # backtrack across one, so its title group cannot contain a "/".
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # The correspondent and title groups deliberately exclude "/".  With
    # a plain "." the engine could backtrack to an earlier "/" and report
    # directory components as correspondent or title, e.g.
    # "/a - b/scan.pdf" would yield correspondent "a", title "b/scan".
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/([^/]+) - ([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/([^/]*) - ([^/]*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        """
        Store the given attributes unchanged.

        :param title: document title
        :param suffix: file type suffix
        :param correspondent: correspondent object, or None if unknown
        :param tags: iterable of tag objects, or None if not supplied
        """
        self._title = title
        self._suffix = suffix
        self._correspondent = correspondent
        self._tags = tags

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        The patterns are tried in that order and the first match wins.
        Correspondents and tags named in the file are fetched with
        ``get_or_create``, so missing ones are created as a side effect.

        :param path: path of the file, containing at least one "/"
        :return: an instance of ``cls``; ``correspondent`` is None and
                 ``tags`` is an empty tuple when the name carries none
        :raises AttributeError: if ``path`` matches none of the patterns
                 (callers are expected to check REGEX_TITLE first)
        """

        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]

        def get_tags(tags):
            # Comma-separated slugs; an empty tag section still yields one
            # (empty-slug) entry because "".split(",") == [""].
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",")
            )

        def get_suffix(suffix):
            # Normalise case and treat "jpeg" as "jpg".
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix

        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )

        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )

        # That didn't work, so we assume correspondent and tags are None.
        # Build via cls (not FileInfo) so subclasses get their own type,
        # like the two branches above.
        m = re.match(cls.REGEX_TITLE, path)
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @property
    def title(self):
        """Title of the document."""
        return self._title

    @property
    def correspondent(self):
        """Correspondent of the document, or None."""
        return self._correspondent

    @property
    def tags(self):
        """Tags of the document, as passed to the constructor."""
        return self._tags

    @property
    def suffix(self):
        """File type suffix of the document."""
        return self._suffix
|
||||||
|
|
||||||
class SluggedModel(models.Model):
|
class SluggedModel(models.Model):
|
||||||
|
|
||||||
name = models.CharField(max_length=128, unique=True)
|
name = models.CharField(max_length=128, unique=True)
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
from ..consumer import Consumer
|
from ..models import FileInfo
|
||||||
|
|
||||||
|
|
||||||
class TestAttachment(TestCase):
|
class TestAttachment(TestCase):
|
||||||
|
|
||||||
TAGS = ("tag1", "tag2", "tag3")
|
TAGS = ("tag1", "tag2", "tag3")
|
||||||
CONSUMER = Consumer()
|
|
||||||
SUFFIXES = (
|
SUFFIXES = (
|
||||||
"pdf", "png", "jpg", "jpeg", "gif",
|
"pdf", "png", "jpg", "jpeg", "gif",
|
||||||
"PDF", "PNG", "JPG", "JPEG", "GIF",
|
"PDF", "PNG", "JPG", "JPEG", "GIF",
|
||||||
@ -16,14 +15,14 @@ class TestAttachment(TestCase):
|
|||||||
def _test_guess_attributes_from_name(self, path, sender, title, tags):
    """
    Check what FileInfo.from_path() extracts for every known suffix.

    :param path: path template with one "{}" placeholder for the suffix
    :param sender: expected correspondent name, or None when no
                   correspondent is expected
    :param title: expected title
    :param tags: expected tuple of tag slugs
    """
    for suffix in self.SUFFIXES:
        f = path.format(suffix)
        file_info = FileInfo.from_path(f)

        # Compare names, but tolerate a missing correspondent on either
        # side so a mismatch is reported as an assertion failure rather
        # than an AttributeError on None.
        correspondent = file_info.correspondent
        name = None if correspondent is None else correspondent.name
        self.assertEqual(name, sender, f)

        self.assertEqual(file_info.title, title, f)
        self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)

        # "jpeg" is normalised to "jpg"; everything else is lower-cased.
        if suffix.lower() == "jpeg":
            self.assertEqual(file_info.suffix, "jpg", f)
        else:
            self.assertEqual(file_info.suffix, suffix.lower(), f)
|
||||||
|
|
||||||
def test_guess_attributes_from_name0(self):
|
def test_guess_attributes_from_name0(self):
|
||||||
self._test_guess_attributes_from_name(
|
self._test_guess_attributes_from_name(
|
||||||
@ -92,3 +91,95 @@ class TestAttachment(TestCase):
|
|||||||
"Τιτλε",
|
"Τιτλε",
|
||||||
self.TAGS
|
self.TAGS
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_guess_attributes_from_name_when_correspondent_empty(self):
    """
    A name that starts with " - " is expected to have no correspondent,
    with the whole stem (separator included) reported as the title.
    """
    path = '/path/to/ - weird empty correspondent but should not break.{}'
    title = ' - weird empty correspondent but should not break'
    self._test_guess_attributes_from_name(path, None, title, ())
|
||||||
|
|
||||||
|
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
    """
    A leading "- " is not a correspondent separator: the whole stem is
    expected to be the title, with no correspondent and no tags.
    """
    path = '/path/to/- weird but should not break.{}'
    title = '- weird but should not break'
    self._test_guess_attributes_from_name(path, None, title, ())
|
||||||
|
|
||||||
|
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
    """
    A trailing " -" is not a correspondent separator: the whole stem is
    expected to be the title, with no correspondent and no tags.
    """
    path = '/path/to/weird but should not break -.{}'
    title = 'weird but should not break -'
    self._test_guess_attributes_from_name(path, None, title, ())
|
||||||
|
|
||||||
|
def test_guess_attributes_from_name_when_title_is_empty(self):
    """
    "<correspondent> - .<suffix>" is expected to give the correspondent
    and an empty title, with no tags.
    """
    path = '/path/to/weird correspondent but should not break - .{}'
    sender = 'weird correspondent but should not break'
    self._test_guess_attributes_from_name(path, sender, '', ())
|
||||||
|
|
||||||
|
|
||||||
|
class Permutations(TestCase):
    """
    Run FileInfo.from_path() over every combination of the sample
    correspondents, titles, tags and suffixes below, once for each of
    the three supported file name layouts.
    """

    valid_correspondents = [
        'timmy', 'Dr. McWheelie', 'Dash Gor-don', 'ο Θερμαστής',
    ]
    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']

    def _test_guessed_attributes(
            self, filename, title, suffix, correspondent=None, tags=None):
        """
        Parse ``filename`` and compare the result with the expectations.

        ``title`` and ``suffix`` are always checked.  ``correspondent``
        (a name) and ``tags`` (comma-separated slugs) default to None,
        meaning "no correspondent" and "no tags" respectively.
        """
        parsed = FileInfo.from_path(filename)

        # Required attributes; "jpeg" is stored as "jpg".
        self.assertEqual(parsed.title, title, filename)
        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix
        self.assertEqual(parsed.suffix, expected_suffix, filename)

        # Optional attributes.
        if correspondent is not None:
            self.assertEqual(
                parsed.correspondent.name, correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent, correspondent, filename)

        if tags is not None:
            slugs = [t.slug for t in parsed.tags]
            self.assertEqual(slugs, tags.split(','), filename)
        else:
            self.assertEqual(parsed.tags, (), filename)

    def test_just_title(self):
        """Layout "<title>.<suffix>"."""
        template = '/path/to/{title}.{suffix}'
        specs = [
            dict(title=title, suffix=suffix)
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_title_and_correspondent(self):
        """Layout "<correspondent> - <title>.<suffix>"."""
        template = '/path/to/{correspondent} - {title}.{suffix}'
        specs = [
            dict(correspondent=correspondent, title=title, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_title_and_correspondent_and_tags(self):
        """Layout "<correspondent> - <title> - <tags>.<suffix>"."""
        template = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
        specs = [
            dict(correspondent=correspondent, title=title,
                 tags=tags, suffix=suffix)
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for tags in self.valid_tags
            for suffix in self.valid_suffixes
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user