mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
removed most of the logic that extracts data from filename patterns #156
This commit is contained in:
parent
32224f187d
commit
7f9a0204b5
@ -247,7 +247,6 @@ class Consumer(LoggingMixin):
|
|||||||
|
|
||||||
with open(self.path, "rb") as f:
|
with open(self.path, "rb") as f:
|
||||||
document = Document.objects.create(
|
document = Document.objects.create(
|
||||||
correspondent=file_info.correspondent,
|
|
||||||
title=(self.override_title or file_info.title)[:127],
|
title=(self.override_title or file_info.title)[:127],
|
||||||
content=text,
|
content=text,
|
||||||
mime_type=mime_type,
|
mime_type=mime_type,
|
||||||
@ -257,12 +256,6 @@ class Consumer(LoggingMixin):
|
|||||||
storage_type=storage_type
|
storage_type=storage_type
|
||||||
)
|
)
|
||||||
|
|
||||||
relevant_tags = set(file_info.tags)
|
|
||||||
if relevant_tags:
|
|
||||||
tag_names = ", ".join([t.name for t in relevant_tags])
|
|
||||||
self.log("debug", "Tagging with {}".format(tag_names))
|
|
||||||
document.tags.add(*relevant_tags)
|
|
||||||
|
|
||||||
self.apply_overrides(document)
|
self.apply_overrides(document)
|
||||||
|
|
||||||
document.save()
|
document.save()
|
||||||
|
@ -357,54 +357,12 @@ class SavedViewFilterRule(models.Model):
|
|||||||
# TODO: why is this in the models file?
|
# TODO: why is this in the models file?
|
||||||
class FileInfo:
|
class FileInfo:
|
||||||
|
|
||||||
# This epic regex *almost* worked for our needs, so I'm keeping it here for
|
|
||||||
# posterity, in the hopes that we might find a way to make it work one day.
|
|
||||||
ALMOST_REGEX = re.compile(
|
|
||||||
r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
|
|
||||||
r"((?P<correspondent>{non_separated_word}+){separator})??"
|
|
||||||
r"(?P<title>{non_separated_word}+)"
|
|
||||||
r"({separator}(?P<tags>[a-z,0-9-]+))?"
|
|
||||||
r"\.(?P<extension>[a-zA-Z.-]+)$".format(
|
|
||||||
separator=r"\s+-\s+",
|
|
||||||
non_separated_word=r"([\w,. ]|([^\s]-))"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
REGEXES = OrderedDict([
|
REGEXES = OrderedDict([
|
||||||
("created-correspondent-title-tags", re.compile(
|
|
||||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
|
||||||
r"(?P<correspondent>.*) - "
|
|
||||||
r"(?P<title>.*) - "
|
|
||||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)),
|
|
||||||
("created-title-tags", re.compile(
|
|
||||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
|
||||||
r"(?P<title>.*) - "
|
|
||||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)),
|
|
||||||
("created-correspondent-title", re.compile(
|
|
||||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
|
||||||
r"(?P<correspondent>.*) - "
|
|
||||||
r"(?P<title>.*)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)),
|
|
||||||
("created-title", re.compile(
|
("created-title", re.compile(
|
||||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||||
r"(?P<title>.*)$",
|
r"(?P<title>.*)$",
|
||||||
flags=re.IGNORECASE
|
flags=re.IGNORECASE
|
||||||
)),
|
)),
|
||||||
("correspondent-title-tags", re.compile(
|
|
||||||
r"(?P<correspondent>.*) - "
|
|
||||||
r"(?P<title>.*) - "
|
|
||||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)),
|
|
||||||
("correspondent-title", re.compile(
|
|
||||||
r"(?P<correspondent>.*) - "
|
|
||||||
r"(?P<title>.*)?$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)),
|
|
||||||
("title", re.compile(
|
("title", re.compile(
|
||||||
r"(?P<title>.*)$",
|
r"(?P<title>.*)$",
|
||||||
flags=re.IGNORECASE
|
flags=re.IGNORECASE
|
||||||
@ -427,23 +385,10 @@ class FileInfo:
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _get_correspondent(cls, name):
|
|
||||||
if not name:
|
|
||||||
return None
|
|
||||||
return Correspondent.objects.get_or_create(name=name)[0]
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_title(cls, title):
|
def _get_title(cls, title):
|
||||||
return title
|
return title
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _get_tags(cls, tags):
|
|
||||||
r = []
|
|
||||||
for t in tags.split(","):
|
|
||||||
r.append(Tag.objects.get_or_create(name=t)[0])
|
|
||||||
return tuple(r)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _mangle_property(cls, properties, name):
|
def _mangle_property(cls, properties, name):
|
||||||
if name in properties:
|
if name in properties:
|
||||||
@ -453,15 +398,6 @@ class FileInfo:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_filename(cls, filename):
|
def from_filename(cls, filename):
|
||||||
"""
|
|
||||||
We use a crude naming convention to make handling the correspondent,
|
|
||||||
title, and tags easier:
|
|
||||||
"<date> - <correspondent> - <title> - <tags>"
|
|
||||||
"<correspondent> - <title> - <tags>"
|
|
||||||
"<correspondent> - <title>"
|
|
||||||
"<title>"
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Mutate filename in-place before parsing its components
|
# Mutate filename in-place before parsing its components
|
||||||
# by applying at most one of the configured transformations.
|
# by applying at most one of the configured transformations.
|
||||||
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
||||||
@ -492,7 +428,5 @@ class FileInfo:
|
|||||||
if m:
|
if m:
|
||||||
properties = m.groupdict()
|
properties = m.groupdict()
|
||||||
cls._mangle_property(properties, "created")
|
cls._mangle_property(properties, "created")
|
||||||
cls._mangle_property(properties, "correspondent")
|
|
||||||
cls._mangle_property(properties, "title")
|
cls._mangle_property(properties, "title")
|
||||||
cls._mangle_property(properties, "tags")
|
|
||||||
return cls(**properties)
|
return cls(**properties)
|
||||||
|
@ -29,81 +29,6 @@ class TestAttributes(TestCase):
|
|||||||
|
|
||||||
self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)
|
self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)
|
||||||
|
|
||||||
def test_guess_attributes_from_name0(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Title.pdf", "Sender", "Title", ())
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name1(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name2(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name3(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name4(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name5(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Title - tag1,tag2,tag3.pdf",
|
|
||||||
"Sender",
|
|
||||||
"Title",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name6(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Spaced Sender - Title - tag1,tag2,tag3.pdf",
|
|
||||||
"Spaced Sender",
|
|
||||||
"Title",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name7(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Spaced Title - tag1,tag2,tag3.pdf",
|
|
||||||
"Sender",
|
|
||||||
"Spaced Title",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name8(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Dashed-Sender - Title - tag1,tag2,tag3.pdf",
|
|
||||||
"Dashed-Sender",
|
|
||||||
"Title",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name9(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Sender - Dashed-Title - tag1,tag2,tag3.pdf",
|
|
||||||
"Sender",
|
|
||||||
"Dashed-Title",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name10(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
|
|
||||||
"Σενδερ",
|
|
||||||
"Τιτλε",
|
|
||||||
self.TAGS
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name_when_correspondent_empty(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
' - weird empty correspondent but should not break.pdf',
|
|
||||||
None,
|
|
||||||
'weird empty correspondent but should not break',
|
|
||||||
()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
|
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
|
||||||
self._test_guess_attributes_from_name(
|
self._test_guess_attributes_from_name(
|
||||||
@ -121,28 +46,6 @@ class TestAttributes(TestCase):
|
|||||||
()
|
()
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_guess_attributes_from_name_when_title_is_empty(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
'weird correspondent but should not break - .pdf',
|
|
||||||
'weird correspondent but should not break',
|
|
||||||
'',
|
|
||||||
()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_case_insensitive_tag_creation(self):
|
|
||||||
"""
|
|
||||||
Tags should be detected and created as lower case.
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
|
|
||||||
filename = "Title - Correspondent - tAg1,TAG2.pdf"
|
|
||||||
self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
|
|
||||||
|
|
||||||
path = "Title - Correspondent - tag1,tag2.pdf"
|
|
||||||
self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
|
|
||||||
|
|
||||||
self.assertEqual(Tag.objects.all().count(), 2)
|
|
||||||
|
|
||||||
|
|
||||||
class TestFieldPermutations(TestCase):
|
class TestFieldPermutations(TestCase):
|
||||||
|
|
||||||
@ -199,69 +102,7 @@ class TestFieldPermutations(TestCase):
|
|||||||
filename = template.format(**spec)
|
filename = template.format(**spec)
|
||||||
self._test_guessed_attributes(filename, **spec)
|
self._test_guessed_attributes(filename, **spec)
|
||||||
|
|
||||||
def test_title_and_correspondent(self):
|
|
||||||
template = '{correspondent} - {title}.pdf'
|
|
||||||
for correspondent in self.valid_correspondents:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
spec = dict(correspondent=correspondent, title=title)
|
|
||||||
filename = template.format(**spec)
|
|
||||||
self._test_guessed_attributes(filename, **spec)
|
|
||||||
|
|
||||||
def test_title_and_correspondent_and_tags(self):
|
|
||||||
template = '{correspondent} - {title} - {tags}.pdf'
|
|
||||||
for correspondent in self.valid_correspondents:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
for tags in self.valid_tags:
|
|
||||||
spec = dict(correspondent=correspondent, title=title,
|
|
||||||
tags=tags)
|
|
||||||
filename = template.format(**spec)
|
|
||||||
self._test_guessed_attributes(filename, **spec)
|
|
||||||
|
|
||||||
def test_created_and_correspondent_and_title_and_tags(self):
|
|
||||||
|
|
||||||
template = (
|
|
||||||
"{created} - "
|
|
||||||
"{correspondent} - "
|
|
||||||
"{title} - "
|
|
||||||
"{tags}.pdf"
|
|
||||||
)
|
|
||||||
|
|
||||||
for created in self.valid_dates:
|
|
||||||
for correspondent in self.valid_correspondents:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
for tags in self.valid_tags:
|
|
||||||
spec = {
|
|
||||||
"created": created,
|
|
||||||
"correspondent": correspondent,
|
|
||||||
"title": title,
|
|
||||||
"tags": tags,
|
|
||||||
}
|
|
||||||
self._test_guessed_attributes(
|
|
||||||
template.format(**spec), **spec)
|
|
||||||
|
|
||||||
def test_created_and_correspondent_and_title(self):
|
|
||||||
|
|
||||||
template = "{created} - {correspondent} - {title}.pdf"
|
|
||||||
|
|
||||||
for created in self.valid_dates:
|
|
||||||
for correspondent in self.valid_correspondents:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
|
|
||||||
# Skip cases where title looks like a tag as we can't
|
|
||||||
# accommodate such cases.
|
|
||||||
if title.lower() == title:
|
|
||||||
continue
|
|
||||||
|
|
||||||
spec = {
|
|
||||||
"created": created,
|
|
||||||
"correspondent": correspondent,
|
|
||||||
"title": title
|
|
||||||
}
|
|
||||||
self._test_guessed_attributes(
|
|
||||||
template.format(**spec), **spec)
|
|
||||||
|
|
||||||
def test_created_and_title(self):
|
def test_created_and_title(self):
|
||||||
|
|
||||||
template = "{created} - {title}.pdf"
|
template = "{created} - {title}.pdf"
|
||||||
|
|
||||||
for created in self.valid_dates:
|
for created in self.valid_dates:
|
||||||
@ -273,21 +114,6 @@ class TestFieldPermutations(TestCase):
|
|||||||
self._test_guessed_attributes(
|
self._test_guessed_attributes(
|
||||||
template.format(**spec), **spec)
|
template.format(**spec), **spec)
|
||||||
|
|
||||||
def test_created_and_title_and_tags(self):
|
|
||||||
|
|
||||||
template = "{created} - {title} - {tags}.pdf"
|
|
||||||
|
|
||||||
for created in self.valid_dates:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
for tags in self.valid_tags:
|
|
||||||
spec = {
|
|
||||||
"created": created,
|
|
||||||
"title": title,
|
|
||||||
"tags": tags
|
|
||||||
}
|
|
||||||
self._test_guessed_attributes(
|
|
||||||
template.format(**spec), **spec)
|
|
||||||
|
|
||||||
def test_invalid_date_format(self):
|
def test_invalid_date_format(self):
|
||||||
info = FileInfo.from_filename("06112017Z - title.pdf")
|
info = FileInfo.from_filename("06112017Z - title.pdf")
|
||||||
self.assertEqual(info.title, "title")
|
self.assertEqual(info.title, "title")
|
||||||
@ -336,32 +162,6 @@ class TestFieldPermutations(TestCase):
|
|||||||
info = FileInfo.from_filename(filename)
|
info = FileInfo.from_filename(filename)
|
||||||
self.assertEqual(info.title, "anotherall")
|
self.assertEqual(info.title, "anotherall")
|
||||||
|
|
||||||
# Complex transformation without date in replacement string
|
|
||||||
with self.settings(
|
|
||||||
FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "0001")
|
|
||||||
self.assertEqual(len(info.tags), 2)
|
|
||||||
self.assertEqual(info.tags[0].name, "tag1")
|
|
||||||
self.assertEqual(info.tags[1].name, "tag2")
|
|
||||||
self.assertIsNone(info.created)
|
|
||||||
|
|
||||||
# Complex transformation with date in replacement string
|
|
||||||
with self.settings(
|
|
||||||
FILENAME_PARSE_TRANSFORMS=[
|
|
||||||
(none_patt, "none.gif"),
|
|
||||||
(exact_patt, repl2), # <-- matches
|
|
||||||
(exact_patt, repl1),
|
|
||||||
(all_patt, "all.gif")]):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "0001")
|
|
||||||
self.assertEqual(len(info.tags), 2)
|
|
||||||
self.assertEqual(info.tags[0].name, "tag1")
|
|
||||||
self.assertEqual(info.tags[1].name, "tag2")
|
|
||||||
self.assertEqual(info.created.year, 2019)
|
|
||||||
self.assertEqual(info.created.month, 9)
|
|
||||||
self.assertEqual(info.created.day, 8)
|
|
||||||
|
|
||||||
|
|
||||||
class DummyParser(DocumentParser):
|
class DummyParser(DocumentParser):
|
||||||
|
|
||||||
@ -476,15 +276,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
def testOverrideFilename(self):
|
def testOverrideFilename(self):
|
||||||
filename = self.get_test_file()
|
filename = self.get_test_file()
|
||||||
override_filename = "My Bank - Statement for November.pdf"
|
override_filename = "Statement for November.pdf"
|
||||||
|
|
||||||
document = self.consumer.try_consume_file(filename, override_filename=override_filename)
|
document = self.consumer.try_consume_file(filename, override_filename=override_filename)
|
||||||
|
|
||||||
self.assertEqual(document.correspondent.name, "My Bank")
|
|
||||||
self.assertEqual(document.title, "Statement for November")
|
self.assertEqual(document.title, "Statement for November")
|
||||||
|
|
||||||
def testOverrideTitle(self):
|
def testOverrideTitle(self):
|
||||||
|
|
||||||
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
|
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
|
||||||
self.assertEqual(document.title, "Override Title")
|
self.assertEqual(document.title, "Override Title")
|
||||||
|
|
||||||
@ -594,11 +392,10 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
|||||||
def testFilenameHandling(self):
|
def testFilenameHandling(self):
|
||||||
filename = self.get_test_file()
|
filename = self.get_test_file()
|
||||||
|
|
||||||
document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
|
document = self.consumer.try_consume_file(filename, override_title="new docs")
|
||||||
|
|
||||||
self.assertEqual(document.title, "new docs")
|
self.assertEqual(document.title, "new docs")
|
||||||
self.assertEqual(document.correspondent.name, "Bank")
|
self.assertEqual(document.filename, "none/new docs.pdf")
|
||||||
self.assertEqual(document.filename, "Bank/new docs.pdf")
|
|
||||||
|
|
||||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||||
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
||||||
@ -617,10 +414,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
Tag.objects.create(name="test", is_inbox_tag=True)
|
Tag.objects.create(name="test", is_inbox_tag=True)
|
||||||
|
|
||||||
document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
|
document = self.consumer.try_consume_file(filename, override_title="new docs")
|
||||||
|
|
||||||
self.assertEqual(document.title, "new docs")
|
self.assertEqual(document.title, "new docs")
|
||||||
self.assertEqual(document.correspondent.name, "Bank")
|
|
||||||
self.assertIsNotNone(os.path.isfile(document.title))
|
self.assertIsNotNone(os.path.isfile(document.title))
|
||||||
self.assertTrue(os.path.isfile(document.source_path))
|
self.assertTrue(os.path.isfile(document.source_path))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user