From f205c4d0e25e380ef9b609867662b6b909e7979b Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 4 Mar 2025 13:49:47 -0800 Subject: [PATCH] Removes undocumented FileInfo (#9298) --- src/documents/consumer.py | 12 +-- src/documents/models.py | 89 ----------------- src/documents/tests/test_consumer.py | 140 --------------------------- src/paperless/settings.py | 6 -- 4 files changed, 5 insertions(+), 242 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 81739fa7a..4bf9ab89b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -26,7 +26,6 @@ from documents.models import CustomField from documents.models import CustomFieldInstance from documents.models import Document from documents.models import DocumentType -from documents.models import FileInfo from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowTrigger @@ -705,8 +704,6 @@ class ConsumerPlugin( ) -> Document: # If someone gave us the original filename, use it instead of doc. 
- file_info = FileInfo.from_filename(self.filename) - self.log.debug("Saving record to database") if self.metadata.created is not None: @@ -714,9 +711,6 @@ class ConsumerPlugin( self.log.debug( f"Creation date from post_documents parameter: {create_date}", ) - elif file_info.created is not None: - create_date = file_info.created - self.log.debug(f"Creation date from FileInfo: {create_date}") elif date is not None: create_date = date self.log.debug(f"Creation date from parse_date: {create_date}") @@ -729,7 +723,11 @@ class ConsumerPlugin( storage_type = Document.STORAGE_TYPE_UNENCRYPTED - title = file_info.title + if self.metadata.filename: + title = Path(self.metadata.filename).stem + else: + title = self.input_doc.original_file.stem + if self.metadata.title is not None: try: title = self._parse_title_placeholders(self.metadata.title) diff --git a/src/documents/models.py b/src/documents/models.py index 57ff96df1..e40ee8115 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,11 +1,7 @@ import datetime -import os -import re -from collections import OrderedDict from pathlib import Path from typing import Final -import dateutil.parser import pathvalidate from celery import states from django.conf import settings @@ -517,91 +513,6 @@ class SavedViewFilterRule(models.Model): return f"SavedViewFilterRule: {self.rule_type} : {self.value}" -# TODO: why is this in the models file? -# TODO: how about, what is this and where is it documented? 
-# It appears to parsing JSON from an environment variable to get a title and date from -# the filename, if possible, as a higher priority than either document filename or -# content parsing -class FileInfo: - REGEXES = OrderedDict( - [ - ( - "created-title", - re.compile( - r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$", - flags=re.IGNORECASE, - ), - ), - ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)), - ], - ) - - def __init__( - self, - created=None, - correspondent=None, - title=None, - tags=(), - extension=None, - ): - self.created = created - self.title = title - self.extension = extension - self.correspondent = correspondent - self.tags = tags - - @classmethod - def _get_created(cls, created): - try: - return dateutil.parser.parse(f"{created[:-1]:0<14}Z") - except ValueError: - return None - - @classmethod - def _get_title(cls, title): - return title - - @classmethod - def _mangle_property(cls, properties, name): - if name in properties: - properties[name] = getattr(cls, f"_get_{name}")(properties[name]) - - @classmethod - def from_filename(cls, filename) -> "FileInfo": - # Mutate filename in-place before parsing its components - # by applying at most one of the configured transformations. - for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS: - (filename, count) = pattern.subn(repl, filename) - if count: - break - - # do this after the transforms so that the transforms can do whatever - # with the file extension. - filename_no_ext = os.path.splitext(filename)[0] - - if filename_no_ext == filename and filename.startswith("."): - # This is a very special case where there is no text before the - # file type. - # TODO: this should be handled better. The ext is not removed - # because usually, files like '.pdf' are just hidden files - # with the name pdf, but in our case, its more likely that - # there's just no name to begin with. - filename = "" - # This isn't too bad either, since we'll just not match anything - # and return an empty title. 
TODO: actually, this is kinda bad. - else: - filename = filename_no_ext - - # Parse filename components. - for regex in cls.REGEXES.values(): - m = regex.match(filename) - if m: - properties = m.groupdict() - cls._mangle_property(properties, "created") - cls._mangle_property(properties, "title") - return cls(**properties) - - # Extending User Model Using a One-To-One Link class UiSettings(models.Model): user = models.OneToOneField( diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 6f576ab24..ff684804e 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,12 +1,10 @@ import datetime import os -import re import shutil import stat import tempfile import zoneinfo from pathlib import Path -from unittest import TestCase as UnittestTestCase from unittest import mock from unittest.mock import MagicMock @@ -26,7 +24,6 @@ from documents.models import Correspondent from documents.models import CustomField from documents.models import Document from documents.models import DocumentType -from documents.models import FileInfo from documents.models import StoragePath from documents.models import Tag from documents.parsers import DocumentParser @@ -40,143 +37,6 @@ from paperless_mail.models import MailRule from paperless_mail.parsers import MailDocumentParser -class TestAttributes(UnittestTestCase): - TAGS = ("tag1", "tag2", "tag3") - - def _test_guess_attributes_from_name(self, filename, sender, title, tags): - file_info = FileInfo.from_filename(filename) - - if sender: - self.assertEqual(file_info.correspondent.name, sender, filename) - else: - self.assertIsNone(file_info.correspondent, filename) - - self.assertEqual(file_info.title, title, filename) - - self.assertEqual(tuple(t.name for t in file_info.tags), tags, filename) - - def test_guess_attributes_from_name_when_title_starts_with_dash(self): - self._test_guess_attributes_from_name( - "- weird but should not break.pdf", - None, - "- 
weird but should not break", - (), - ) - - def test_guess_attributes_from_name_when_title_ends_with_dash(self): - self._test_guess_attributes_from_name( - "weird but should not break -.pdf", - None, - "weird but should not break -", - (), - ) - - -class TestFieldPermutations(TestCase): - valid_dates = ( - "20150102030405Z", - "20150102Z", - ) - valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""] - valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""] - valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"] - - def _test_guessed_attributes( - self, - filename, - created=None, - correspondent=None, - title=None, - tags=None, - ): - info = FileInfo.from_filename(filename) - - # Created - if created is None: - self.assertIsNone(info.created, filename) - else: - self.assertEqual(info.created.year, int(created[:4]), filename) - self.assertEqual(info.created.month, int(created[4:6]), filename) - self.assertEqual(info.created.day, int(created[6:8]), filename) - - # Correspondent - if correspondent: - self.assertEqual(info.correspondent.name, correspondent, filename) - else: - self.assertEqual(info.correspondent, None, filename) - - # Title - self.assertEqual(info.title, title, filename) - - # Tags - if tags is None: - self.assertEqual(info.tags, (), filename) - else: - self.assertEqual([t.name for t in info.tags], tags.split(","), filename) - - def test_just_title(self): - template = "{title}.pdf" - for title in self.valid_titles: - spec = dict(title=title) - filename = template.format(**spec) - self._test_guessed_attributes(filename, **spec) - - def test_created_and_title(self): - template = "{created} - {title}.pdf" - - for created in self.valid_dates: - for title in self.valid_titles: - spec = {"created": created, "title": title} - self._test_guessed_attributes(template.format(**spec), **spec) - - def test_invalid_date_format(self): - info = FileInfo.from_filename("06112017Z - title.pdf") - self.assertEqual(info.title, 
"title") - self.assertIsNone(info.created) - - def test_filename_parse_transforms(self): - filename = "tag1,tag2_20190908_180610_0001.pdf" - all_patt = re.compile("^.*$") - none_patt = re.compile("$a") - re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.") - - # No transformations configured (= default) - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - self.assertEqual(info.tags, ()) - self.assertIsNone(info.created) - - # Pattern doesn't match (filename unaltered) - with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - - # Simple transformation (match all) - with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "all") - - # Multiple transformations configured (first pattern matches) - with self.settings( - FILENAME_PARSE_TRANSFORMS=[ - (all_patt, "all.gif"), - (all_patt, "anotherall.gif"), - ], - ): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "all") - - # Multiple transformations configured (second pattern matches) - with self.settings( - FILENAME_PARSE_TRANSFORMS=[ - (none_patt, "none.gif"), - (all_patt, "anotherall.gif"), - ], - ): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "anotherall") - - class _BaseTestParser(DocumentParser): def get_settings(self): """ diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 0c8c71ab9..ff1829528 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -3,7 +3,6 @@ import json import math import multiprocessing import os -import re import tempfile from os import PathLike from pathlib import Path @@ -1089,11 +1088,6 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") # fewer dates shown. 
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) -# Transformations applied before filename parsing -FILENAME_PARSE_TRANSFORMS = [] -for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): - FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) - # Specify the filename format for out files FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")