From 7f9a0204b59239088d2e47aec8d797d12d1a581a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sun, 20 Dec 2020 00:08:05 +0100 Subject: [PATCH] removed most of the logic that extracts data from filename patterns #156 --- src/documents/consumer.py | 7 - src/documents/models.py | 66 --------- src/documents/tests/test_consumer.py | 212 +-------------------------- 3 files changed, 4 insertions(+), 281 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index e4da51f1d..ab4912a36 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -247,7 +247,6 @@ class Consumer(LoggingMixin): with open(self.path, "rb") as f: document = Document.objects.create( - correspondent=file_info.correspondent, title=(self.override_title or file_info.title)[:127], content=text, mime_type=mime_type, @@ -257,12 +256,6 @@ class Consumer(LoggingMixin): storage_type=storage_type ) - relevant_tags = set(file_info.tags) - if relevant_tags: - tag_names = ", ".join([t.name for t in relevant_tags]) - self.log("debug", "Tagging with {}".format(tag_names)) - document.tags.add(*relevant_tags) - self.apply_overrides(document) document.save() diff --git a/src/documents/models.py b/src/documents/models.py index 3a6d155ed..168dd8c7b 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -357,54 +357,12 @@ class SavedViewFilterRule(models.Model): # TODO: why is this in the models file? class FileInfo: - # This epic regex *almost* worked for our needs, so I'm keeping it here for - # posterity, in the hopes that we might find a way to make it work one day. - ALMOST_REGEX = re.compile( - r"^((?P\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?" - r"((?P{non_separated_word}+){separator})??" - r"(?P{non_separated_word}+)" - r"({separator}(?P<tags>[a-z,0-9-]+))?" - r"\.(?P<extension>[a-zA-Z.-]+)$".format( - separator=r"\s+-\s+", - non_separated_word=r"([\w,. ]|([^\s]-))" - ) - ) REGEXES = OrderedDict([ - ("created-correspondent-title-tags", re.compile( - r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " - r"(?P<correspondent>.*) - " - r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)$", - flags=re.IGNORECASE - )), - ("created-title-tags", re.compile( - r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " - r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)$", - flags=re.IGNORECASE - )), - ("created-correspondent-title", re.compile( - r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " - r"(?P<correspondent>.*) - " - r"(?P<title>.*)$", - flags=re.IGNORECASE - )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*)$", flags=re.IGNORECASE )), - ("correspondent-title-tags", re.compile( - r"(?P<correspondent>.*) - " - r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)$", - flags=re.IGNORECASE - )), - ("correspondent-title", re.compile( - r"(?P<correspondent>.*) - " - r"(?P<title>.*)?$", - flags=re.IGNORECASE - )), ("title", re.compile( r"(?P<title>.*)$", flags=re.IGNORECASE @@ -427,23 +385,10 @@ class FileInfo: except ValueError: return None - @classmethod - def _get_correspondent(cls, name): - if not name: - return None - return Correspondent.objects.get_or_create(name=name)[0] - @classmethod def _get_title(cls, title): return title - @classmethod - def _get_tags(cls, tags): - r = [] - for t in tags.split(","): - r.append(Tag.objects.get_or_create(name=t)[0]) - return tuple(r) - @classmethod def _mangle_property(cls, properties, name): if name in properties: @@ -453,15 +398,6 @@ class FileInfo: @classmethod def from_filename(cls, filename): - """ - We use a crude naming convention to make handling the correspondent, - title, and tags easier: - "<date> - <correspondent> - <title> - <tags>" - "<correspondent> - <title> - <tags>" - "<correspondent> - <title>" - "<title>" - """ - # Mutate filename in-place before parsing its components # by applying at most one of the configured transformations. for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: @@ -492,7 +428,5 @@ class FileInfo: if m: properties = m.groupdict() cls._mangle_property(properties, "created") - cls._mangle_property(properties, "correspondent") cls._mangle_property(properties, "title") - cls._mangle_property(properties, "tags") return cls(**properties) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 75d6aa16b..f53981850 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -29,81 +29,6 @@ class TestAttributes(TestCase): self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename) - def test_guess_attributes_from_name0(self): - self._test_guess_attributes_from_name( - "Sender - Title.pdf", "Sender", "Title", ()) - - def test_guess_attributes_from_name1(self): - self._test_guess_attributes_from_name( - "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ()) - - def test_guess_attributes_from_name2(self): - self._test_guess_attributes_from_name( - "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ()) - - def test_guess_attributes_from_name3(self): - self._test_guess_attributes_from_name( - "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ()) - - def test_guess_attributes_from_name4(self): - self._test_guess_attributes_from_name( - "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ()) - - def test_guess_attributes_from_name5(self): - self._test_guess_attributes_from_name( - "Sender - Title - tag1,tag2,tag3.pdf", - "Sender", - "Title", - self.TAGS - ) - - def test_guess_attributes_from_name6(self): - self._test_guess_attributes_from_name( - "Spaced Sender - Title - tag1,tag2,tag3.pdf", - "Spaced Sender", - "Title", - self.TAGS - ) - - def test_guess_attributes_from_name7(self): - self._test_guess_attributes_from_name( - "Sender - Spaced Title - tag1,tag2,tag3.pdf", - "Sender", - "Spaced Title", - self.TAGS - ) - - def test_guess_attributes_from_name8(self): - self._test_guess_attributes_from_name( - "Dashed-Sender - Title - tag1,tag2,tag3.pdf", - "Dashed-Sender", - "Title", - self.TAGS - ) - - def test_guess_attributes_from_name9(self): - self._test_guess_attributes_from_name( - "Sender - Dashed-Title - tag1,tag2,tag3.pdf", - "Sender", - "Dashed-Title", - self.TAGS - ) - - def test_guess_attributes_from_name10(self): - self._test_guess_attributes_from_name( - "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf", - "Σενδερ", - "Τιτλε", - self.TAGS - ) - - def test_guess_attributes_from_name_when_correspondent_empty(self): - self._test_guess_attributes_from_name( - ' - weird empty correspondent but should not break.pdf', - None, - 'weird empty correspondent but should not break', - () - ) def test_guess_attributes_from_name_when_title_starts_with_dash(self): self._test_guess_attributes_from_name( @@ -121,28 +46,6 @@ class TestAttributes(TestCase): () ) - def test_guess_attributes_from_name_when_title_is_empty(self): - self._test_guess_attributes_from_name( - 'weird correspondent but should not break - .pdf', - 'weird correspondent but should not break', - '', - () - ) - - def test_case_insensitive_tag_creation(self): - """ - Tags should be detected and created as lower case. - :return: - """ - - filename = "Title - Correspondent - tAg1,TAG2.pdf" - self.assertEqual(len(FileInfo.from_filename(filename).tags), 2) - - path = "Title - Correspondent - tag1,tag2.pdf" - self.assertEqual(len(FileInfo.from_filename(filename).tags), 2) - - self.assertEqual(Tag.objects.all().count(), 2) - class TestFieldPermutations(TestCase): @@ -199,69 +102,7 @@ class TestFieldPermutations(TestCase): filename = template.format(**spec) self._test_guessed_attributes(filename, **spec) - def test_title_and_correspondent(self): - template = '{correspondent} - {title}.pdf' - for correspondent in self.valid_correspondents: - for title in self.valid_titles: - spec = dict(correspondent=correspondent, title=title) - filename = template.format(**spec) - self._test_guessed_attributes(filename, **spec) - - def test_title_and_correspondent_and_tags(self): - template = '{correspondent} - {title} - {tags}.pdf' - for correspondent in self.valid_correspondents: - for title in self.valid_titles: - for tags in self.valid_tags: - spec = dict(correspondent=correspondent, title=title, - tags=tags) - filename = template.format(**spec) - self._test_guessed_attributes(filename, **spec) - - def test_created_and_correspondent_and_title_and_tags(self): - - template = ( - "{created} - " - "{correspondent} - " - "{title} - " - "{tags}.pdf" - ) - - for created in self.valid_dates: - for correspondent in self.valid_correspondents: - for title in self.valid_titles: - for tags in self.valid_tags: - spec = { - "created": created, - "correspondent": correspondent, - "title": title, - "tags": tags, - } - self._test_guessed_attributes( - template.format(**spec), **spec) - - def test_created_and_correspondent_and_title(self): - - template = "{created} - {correspondent} - {title}.pdf" - - for created in self.valid_dates: - for correspondent in self.valid_correspondents: - for title in self.valid_titles: - - # Skip cases where title looks like a tag as we can't - # accommodate such cases. - if title.lower() == title: - continue - - spec = { - "created": created, - "correspondent": correspondent, - "title": title - } - self._test_guessed_attributes( - template.format(**spec), **spec) - def test_created_and_title(self): - template = "{created} - {title}.pdf" for created in self.valid_dates: @@ -273,21 +114,6 @@ class TestFieldPermutations(TestCase): self._test_guessed_attributes( template.format(**spec), **spec) - def test_created_and_title_and_tags(self): - - template = "{created} - {title} - {tags}.pdf" - - for created in self.valid_dates: - for title in self.valid_titles: - for tags in self.valid_tags: - spec = { - "created": created, - "title": title, - "tags": tags - } - self._test_guessed_attributes( - template.format(**spec), **spec) - def test_invalid_date_format(self): info = FileInfo.from_filename("06112017Z - title.pdf") self.assertEqual(info.title, "title") @@ -336,32 +162,6 @@ class TestFieldPermutations(TestCase): info = FileInfo.from_filename(filename) self.assertEqual(info.title, "anotherall") - # Complex transformation without date in replacement string - with self.settings( - FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "0001") - self.assertEqual(len(info.tags), 2) - self.assertEqual(info.tags[0].name, "tag1") - self.assertEqual(info.tags[1].name, "tag2") - self.assertIsNone(info.created) - - # Complex transformation with date in replacement string - with self.settings( - FILENAME_PARSE_TRANSFORMS=[ - (none_patt, "none.gif"), - (exact_patt, repl2), # <-- matches - (exact_patt, repl1), - (all_patt, "all.gif")]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "0001") - self.assertEqual(len(info.tags), 2) - self.assertEqual(info.tags[0].name, "tag1") - self.assertEqual(info.tags[1].name, "tag2") - self.assertEqual(info.created.year, 2019) - self.assertEqual(info.created.month, 9) - self.assertEqual(info.created.day, 8) - class DummyParser(DocumentParser): @@ -476,15 +276,13 @@ class TestConsumer(DirectoriesMixin, TestCase): def testOverrideFilename(self): filename = self.get_test_file() - override_filename = "My Bank - Statement for November.pdf" + override_filename = "Statement for November.pdf" document = self.consumer.try_consume_file(filename, override_filename=override_filename) - self.assertEqual(document.correspondent.name, "My Bank") self.assertEqual(document.title, "Statement for November") def testOverrideTitle(self): - document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title") self.assertEqual(document.title, "Override Title") @@ -594,11 +392,10 @@ class TestConsumer(DirectoriesMixin, TestCase): def testFilenameHandling(self): filename = self.get_test_file() - document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") + document = self.consumer.try_consume_file(filename, override_title="new docs") self.assertEqual(document.title, "new docs") - self.assertEqual(document.correspondent.name, "Bank") - self.assertEqual(document.filename, "Bank/new docs.pdf") + self.assertEqual(document.filename, "none/new docs.pdf") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") @mock.patch("documents.signals.handlers.generate_unique_filename") @@ -617,10 +414,9 @@ class TestConsumer(DirectoriesMixin, TestCase): Tag.objects.create(name="test", is_inbox_tag=True) - document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") + document = self.consumer.try_consume_file(filename, override_title="new docs") self.assertEqual(document.title, "new docs") - self.assertEqual(document.correspondent.name, "Bank") self.assertIsNotNone(os.path.isfile(document.title)) self.assertTrue(os.path.isfile(document.source_path))