From 3d5b66c2b77f8653758d87d432e6d379f69a5399 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Fri, 20 Nov 2020 16:18:59 +0100 Subject: [PATCH] FileType does not care about the extension anymore. --- src/documents/consumer.py | 2 +- src/documents/forms.py | 3 +- src/documents/models.py | 64 ++++---- src/documents/tests/test_consumer.py | 213 +++++++++++---------------- 4 files changed, 118 insertions(+), 164 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b8eb8cfca..175f6710f 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -197,7 +197,7 @@ class Consumer(LoggingMixin): # If someone gave us the original filename, use it instead of doc. - file_info = FileInfo.from_path(self.filename) + file_info = FileInfo.from_filename(self.filename) stats = os.stat(self.path) diff --git a/src/documents/forms.py b/src/documents/forms.py index 38a95a068..c3efc774f 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -34,8 +34,7 @@ class UploadForm(forms.Form): os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - # TODO: dont just append pdf. This is here for taht weird regex check at the start of the consumer. - with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f: + with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f: f.write(document) os.utime(f.name, times=(t, t)) diff --git a/src/documents/models.py b/src/documents/models.py index 559c395e0..6288980c5 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -269,7 +269,7 @@ class Log(models.Model): def __str__(self): return self.message - +# TODO: why is this in the models file? class FileInfo: # This epic regex *almost* worked for our needs, so I'm keeping it here for @@ -284,53 +284,44 @@ class FileInfo: non_separated_word=r"([\w,. 
]|([^\s]-))" ) ) - # TODO: what is this used for - formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" REGEXES = OrderedDict([ ("created-correspondent-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<tags>[a-z0-9\-,]*)$", flags=re.IGNORECASE )), ("created-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<tags>[a-z0-9\-,]*)$", flags=re.IGNORECASE )), ("created-correspondent-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " - r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<title>.*)$", flags=re.IGNORECASE )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " - r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<title>.*)$", flags=re.IGNORECASE )), ("correspondent-title-tags", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*) - " - r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<tags>[a-z0-9\-,]*)$", flags=re.IGNORECASE )), ("correspondent-title", re.compile( r"(?P<correspondent>.*) - " - r"(?P<title>.*)?" 
- r"\.(?P<extension>{})$".format(formats), + r"(?P<title>.*)?$", flags=re.IGNORECASE )), ("title", re.compile( - r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"(?P<title>.*)$", flags=re.IGNORECASE )) ]) @@ -373,15 +364,6 @@ class FileInfo: )[0]) return tuple(r) - @classmethod - def _get_extension(cls, extension): - r = extension.lower() - if r == "jpeg": - return "jpg" - if r == "tif": - return "tiff" - return r - @classmethod def _mangle_property(cls, properties, name): if name in properties: @@ -390,18 +372,16 @@ class FileInfo: ) @classmethod - def from_path(cls, path): + def from_filename(cls, filename): """ We use a crude naming convention to make handling the correspondent, title, and tags easier: - "<date> - <correspondent> - <title> - <tags>.<suffix>" - "<correspondent> - <title> - <tags>.<suffix>" - "<correspondent> - <title>.<suffix>" - "<title>.<suffix>" + "<date> - <correspondent> - <title> - <tags>" + "<correspondent> - <title> - <tags>" + "<correspondent> - <title>" + "<title>" """ - filename = os.path.basename(path) - # Mutate filename in-place before parsing its components # by applying at most one of the configured transformations. for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: @@ -409,6 +389,23 @@ class FileInfo: if count: break + # do this after the transforms so that the transforms can do whatever + # with the file extension. + filename_no_ext = os.path.splitext(filename)[0] + + if filename_no_ext == filename and filename.startswith("."): + # This is a very special case where there is no text before the + # file type. + # TODO: this should be handled better. The ext is not removed + # because usually, files like '.pdf' are just hidden files + # with the name pdf, but in our case, its more likely that + # there's just no name to begin with. + filename = "" + # This isn't too bad either, since we'll just not match anything + # and return an empty title. TODO: actually, this is kinda bad. 
+ else: + filename = filename_no_ext + # Parse filename components. for regex in cls.REGEXES.values(): m = regex.match(filename) @@ -418,5 +415,4 @@ class FileInfo: cls._mangle_property(properties, "correspondent") cls._mangle_property(properties, "title") cls._mangle_property(properties, "tags") - cls._mangle_property(properties, "extension") return cls(**properties) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a89bd75ae..6dab98d02 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -15,57 +15,42 @@ from ..parsers import DocumentParser, ParseError class TestAttributes(TestCase): TAGS = ("tag1", "tag2", "tag3") - EXTENSIONS = ( - "pdf", "png", "jpg", "jpeg", "gif", "tiff", "tif", - "PDF", "PNG", "JPG", "JPEG", "GIF", "TIFF", "TIF", - "PdF", "PnG", "JpG", "JPeG", "GiF", "TiFf", "TiF", - ) - def _test_guess_attributes_from_name(self, path, sender, title, tags): + def _test_guess_attributes_from_name(self, filename, sender, title, tags): + file_info = FileInfo.from_filename(filename) - for extension in self.EXTENSIONS: + if sender: + self.assertEqual(file_info.correspondent.name, sender, filename) + else: + self.assertIsNone(file_info.correspondent, filename) - f = path.format(extension) - file_info = FileInfo.from_path(f) + self.assertEqual(file_info.title, title, filename) - if sender: - self.assertEqual(file_info.correspondent.name, sender, f) - else: - self.assertIsNone(file_info.correspondent, f) - - self.assertEqual(file_info.title, title, f) - - self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) - if extension.lower() == "jpeg": - self.assertEqual(file_info.extension, "jpg", f) - elif extension.lower() == "tif": - self.assertEqual(file_info.extension, "tiff", f) - else: - self.assertEqual(file_info.extension, extension.lower(), f) + self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename) def test_guess_attributes_from_name0(self): 
self._test_guess_attributes_from_name( - "/path/to/Sender - Title.{}", "Sender", "Title", ()) + "Sender - Title.pdf", "Sender", "Title", ()) def test_guess_attributes_from_name1(self): self._test_guess_attributes_from_name( - "/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ()) + "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ()) def test_guess_attributes_from_name2(self): self._test_guess_attributes_from_name( - "/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ()) + "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ()) def test_guess_attributes_from_name3(self): self._test_guess_attributes_from_name( - "/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ()) + "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ()) def test_guess_attributes_from_name4(self): self._test_guess_attributes_from_name( - "/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ()) + "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ()) def test_guess_attributes_from_name5(self): self._test_guess_attributes_from_name( - "/path/to/Sender - Title - tag1,tag2,tag3.{}", + "Sender - Title - tag1,tag2,tag3.pdf", "Sender", "Title", self.TAGS @@ -73,7 +58,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name6(self): self._test_guess_attributes_from_name( - "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}", + "Spaced Sender - Title - tag1,tag2,tag3.pdf", "Spaced Sender", "Title", self.TAGS @@ -81,7 +66,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name7(self): self._test_guess_attributes_from_name( - "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}", + "Sender - Spaced Title - tag1,tag2,tag3.pdf", "Sender", "Spaced Title", self.TAGS @@ -89,7 +74,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name8(self): self._test_guess_attributes_from_name( - "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}", + "Dashed-Sender - Title - tag1,tag2,tag3.pdf", "Dashed-Sender", 
"Title", self.TAGS @@ -97,7 +82,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name9(self): self._test_guess_attributes_from_name( - "/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}", + "Sender - Dashed-Title - tag1,tag2,tag3.pdf", "Sender", "Dashed-Title", self.TAGS @@ -105,7 +90,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name10(self): self._test_guess_attributes_from_name( - "/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}", + "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf", "Σενδερ", "Τιτλε", self.TAGS @@ -113,7 +98,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name_when_correspondent_empty(self): self._test_guess_attributes_from_name( - '/path/to/ - weird empty correspondent but should not break.{}', + ' - weird empty correspondent but should not break.pdf', None, 'weird empty correspondent but should not break', () @@ -121,7 +106,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name_when_title_starts_with_dash(self): self._test_guess_attributes_from_name( - '/path/to/- weird but should not break.{}', + '- weird but should not break.pdf', None, '- weird but should not break', () @@ -129,7 +114,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name_when_title_ends_with_dash(self): self._test_guess_attributes_from_name( - '/path/to/weird but should not break -.{}', + 'weird but should not break -.pdf', None, 'weird but should not break -', () @@ -137,7 +122,7 @@ class TestAttributes(TestCase): def test_guess_attributes_from_name_when_title_is_empty(self): self._test_guess_attributes_from_name( - '/path/to/weird correspondent but should not break - .{}', + 'weird correspondent but should not break - .pdf', 'weird correspondent but should not break', '', () @@ -149,11 +134,11 @@ class TestAttributes(TestCase): :return: """ - path = "Title - Correspondent - tAg1,TAG2.pdf" - self.assertEqual(len(FileInfo.from_path(path).tags), 2) + filename = "Title - Correspondent - 
tAg1,TAG2.pdf" + self.assertEqual(len(FileInfo.from_filename(filename).tags), 2) path = "Title - Correspondent - tag1,tag2.pdf" - self.assertEqual(len(FileInfo.from_path(path).tags), 2) + self.assertEqual(len(FileInfo.from_filename(filename).tags), 2) self.assertEqual(Tag.objects.all().count(), 2) @@ -173,13 +158,12 @@ class TestFieldPermutations(TestCase): ] valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""] valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"] - valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"] def _test_guessed_attributes(self, filename, created=None, correspondent=None, title=None, - extension=None, tags=None): + tags=None): - info = FileInfo.from_path(filename) + info = FileInfo.from_filename(filename) # Created if created is None: @@ -207,68 +191,56 @@ class TestFieldPermutations(TestCase): filename ) - # Extension - if extension == 'jpeg': - extension = 'jpg' - self.assertEqual(info.extension, extension, filename) - def test_just_title(self): - template = '/path/to/{title}.{extension}' + template = '{title}.pdf' for title in self.valid_titles: - for extension in self.valid_extensions: - spec = dict(title=title, extension=extension) + spec = dict(title=title) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) + + def test_title_and_correspondent(self): + template = '{correspondent} - {title}.pdf' + for correspondent in self.valid_correspondents: + for title in self.valid_titles: + spec = dict(correspondent=correspondent, title=title) filename = template.format(**spec) self._test_guessed_attributes(filename, **spec) - def test_title_and_correspondent(self): - template = '/path/to/{correspondent} - {title}.{extension}' - for correspondent in self.valid_correspondents: - for title in self.valid_titles: - for extension in self.valid_extensions: - spec = dict(correspondent=correspondent, title=title, - extension=extension) - filename = template.format(**spec) - 
self._test_guessed_attributes(filename, **spec) - def test_title_and_correspondent_and_tags(self): - template = '/path/to/{correspondent} - {title} - {tags}.{extension}' + template = '{correspondent} - {title} - {tags}.pdf' for correspondent in self.valid_correspondents: for title in self.valid_titles: for tags in self.valid_tags: - for extension in self.valid_extensions: - spec = dict(correspondent=correspondent, title=title, - tags=tags, extension=extension) - filename = template.format(**spec) - self._test_guessed_attributes(filename, **spec) + spec = dict(correspondent=correspondent, title=title, + tags=tags) + filename = template.format(**spec) + self._test_guessed_attributes(filename, **spec) def test_created_and_correspondent_and_title_and_tags(self): template = ( - "/path/to/{created} - " + "{created} - " "{correspondent} - " "{title} - " - "{tags}" - ".{extension}" + "{tags}.pdf" ) for created in self.valid_dates: for correspondent in self.valid_correspondents: for title in self.valid_titles: for tags in self.valid_tags: - for extension in self.valid_extensions: - spec = { - "created": created, - "correspondent": correspondent, - "title": title, - "tags": tags, - "extension": extension - } - self._test_guessed_attributes( - template.format(**spec), **spec) + spec = { + "created": created, + "correspondent": correspondent, + "title": title, + "tags": tags, + } + self._test_guessed_attributes( + template.format(**spec), **spec) def test_created_and_correspondent_and_title(self): - template = "/path/to/{created} - {correspondent} - {title}.{extension}" + template = "{created} - {correspondent} - {title}.pdf" for created in self.valid_dates: for correspondent in self.valid_correspondents: @@ -279,56 +251,50 @@ class TestFieldPermutations(TestCase): if title.lower() == title: continue - for extension in self.valid_extensions: - spec = { - "created": created, - "correspondent": correspondent, - "title": title, - "extension": extension - } - 
self._test_guessed_attributes( - template.format(**spec), **spec) - - def test_created_and_title(self): - - template = "/path/to/{created} - {title}.{extension}" - - for created in self.valid_dates: - for title in self.valid_titles: - for extension in self.valid_extensions: spec = { "created": created, - "title": title, - "extension": extension + "correspondent": correspondent, + "title": title } self._test_guessed_attributes( template.format(**spec), **spec) + def test_created_and_title(self): + + template = "{created} - {title}.pdf" + + for created in self.valid_dates: + for title in self.valid_titles: + spec = { + "created": created, + "title": title + } + self._test_guessed_attributes( + template.format(**spec), **spec) + def test_created_and_title_and_tags(self): - template = "/path/to/{created} - {title} - {tags}.{extension}" + template = "{created} - {title} - {tags}.pdf" for created in self.valid_dates: for title in self.valid_titles: for tags in self.valid_tags: - for extension in self.valid_extensions: - spec = { - "created": created, - "title": title, - "tags": tags, - "extension": extension - } - self._test_guessed_attributes( - template.format(**spec), **spec) + spec = { + "created": created, + "title": title, + "tags": tags + } + self._test_guessed_attributes( + template.format(**spec), **spec) def test_invalid_date_format(self): - info = FileInfo.from_path("/path/to/06112017Z - title.pdf") + info = FileInfo.from_filename("06112017Z - title.pdf") self.assertEqual(info.title, "title") self.assertIsNone(info.created) def test_filename_parse_transforms(self): - path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf" + filename = "tag1,tag2_20190908_180610_0001.pdf" all_patt = re.compile("^.*$") none_patt = re.compile("$a") exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.") @@ -336,50 +302,44 @@ class TestFieldPermutations(TestCase): repl2 = "\\2Z - " + repl1 # creation date + repl1 # No transformations configured (= default) - info 
= FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - self.assertEqual(info.extension, "pdf") self.assertEqual(info.tags, ()) self.assertIsNone(info.created) # Pattern doesn't match (filename unaltered) with self.settings( FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]): - info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - self.assertEqual(info.extension, "pdf") # Simple transformation (match all) with self.settings( FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]): - info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "all") - self.assertEqual(info.extension, "gif") # Multiple transformations configured (first pattern matches) with self.settings( FILENAME_PARSE_TRANSFORMS=[ (all_patt, "all.gif"), (all_patt, "anotherall.gif")]): - info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "all") - self.assertEqual(info.extension, "gif") # Multiple transformations configured (second pattern matches) with self.settings( FILENAME_PARSE_TRANSFORMS=[ (none_patt, "none.gif"), (all_patt, "anotherall.gif")]): - info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "anotherall") - self.assertEqual(info.extension, "gif") # Complex transformation without date in replacement string with self.settings( FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]): - info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "0001") - self.assertEqual(info.extension, "pdf") self.assertEqual(len(info.tags), 2) self.assertEqual(info.tags[0].slug, "tag1") self.assertEqual(info.tags[1].slug, "tag2") @@ -392,9 +352,8 @@ class TestFieldPermutations(TestCase): (exact_patt, repl2), # <-- matches (exact_patt, repl1), (all_patt, "all.gif")]): - 
info = FileInfo.from_path(path) + info = FileInfo.from_filename(filename) self.assertEqual(info.title, "0001") - self.assertEqual(info.extension, "pdf") self.assertEqual(len(info.tags), 2) self.assertEqual(info.tags[0].slug, "tag1") self.assertEqual(info.tags[1].slug, "tag2")