mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Removes undocumented FileInfo (#9298)
This commit is contained in:
		@@ -26,7 +26,6 @@ from documents.models import CustomField
 | 
			
		||||
from documents.models import CustomFieldInstance
 | 
			
		||||
from documents.models import Document
 | 
			
		||||
from documents.models import DocumentType
 | 
			
		||||
from documents.models import FileInfo
 | 
			
		||||
from documents.models import StoragePath
 | 
			
		||||
from documents.models import Tag
 | 
			
		||||
from documents.models import WorkflowTrigger
 | 
			
		||||
@@ -705,8 +704,6 @@ class ConsumerPlugin(
 | 
			
		||||
    ) -> Document:
 | 
			
		||||
        # If someone gave us the original filename, use it instead of doc.
 | 
			
		||||
 | 
			
		||||
        file_info = FileInfo.from_filename(self.filename)
 | 
			
		||||
 | 
			
		||||
        self.log.debug("Saving record to database")
 | 
			
		||||
 | 
			
		||||
        if self.metadata.created is not None:
 | 
			
		||||
@@ -714,9 +711,6 @@ class ConsumerPlugin(
 | 
			
		||||
            self.log.debug(
 | 
			
		||||
                f"Creation date from post_documents parameter: {create_date}",
 | 
			
		||||
            )
 | 
			
		||||
        elif file_info.created is not None:
 | 
			
		||||
            create_date = file_info.created
 | 
			
		||||
            self.log.debug(f"Creation date from FileInfo: {create_date}")
 | 
			
		||||
        elif date is not None:
 | 
			
		||||
            create_date = date
 | 
			
		||||
            self.log.debug(f"Creation date from parse_date: {create_date}")
 | 
			
		||||
@@ -729,7 +723,11 @@ class ConsumerPlugin(
 | 
			
		||||
 | 
			
		||||
        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 | 
			
		||||
 | 
			
		||||
        title = file_info.title
 | 
			
		||||
        if self.metadata.filename:
 | 
			
		||||
            title = Path(self.metadata.filename).stem
 | 
			
		||||
        else:
 | 
			
		||||
            title = self.input_doc.original_file.stem
 | 
			
		||||
 | 
			
		||||
        if self.metadata.title is not None:
 | 
			
		||||
            try:
 | 
			
		||||
                title = self._parse_title_placeholders(self.metadata.title)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,7 @@
 | 
			
		||||
import datetime
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Final
 | 
			
		||||
 | 
			
		||||
import dateutil.parser
 | 
			
		||||
import pathvalidate
 | 
			
		||||
from celery import states
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
@@ -517,91 +513,6 @@ class SavedViewFilterRule(models.Model):
 | 
			
		||||
        return f"SavedViewFilterRule: {self.rule_type} : {self.value}"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# TODO: why is this in the models file?
 | 
			
		||||
# TODO: how about, what is this and where is it documented?
 | 
			
		||||
# It appears to be parsing JSON from an environment variable to get a title and date from
 | 
			
		||||
# the filename, if possible, as a higher priority than either document filename or
 | 
			
		||||
# content parsing
 | 
			
		||||
class FileInfo:
    """Title and creation date guessed from a document's file name.

    ``from_filename`` is the intended constructor: it applies at most one of
    the ``settings.FILENAME_PARSE_TRANSFORMS`` substitutions, drops the file
    extension and matches what is left against ``REGEXES``.

    ``correspondent``, ``tags`` and ``extension`` are accepted by the
    constructor, but none of the patterns in ``REGEXES`` captures them, so
    ``from_filename`` always leaves them at their defaults.
    """

    # Tried in insertion order by from_filename(); the first match wins.
    REGEXES = OrderedDict(
        [
            (
                "created-title",
                re.compile(
                    r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$",
                    flags=re.IGNORECASE,
                ),
            ),
            ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)),
        ],
    )

    def __init__(
        self,
        created=None,
        correspondent=None,
        title=None,
        tags=(),
        extension=None,
    ):
        """Store the given attributes unchanged on the instance."""
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """Parse a ``YYYYMMDD[hhmmss]Z`` stamp captured by ``REGEXES``.

        The trailing ``Z`` is dropped, the digits are right-padded with
        zeros to 14 characters (so a bare date gets a midnight time) and
        ``Z`` is appended again before the string is handed to dateutil.

        Returns:
            The parsed value, or ``None`` when dateutil rejects the string.
        """
        try:
            return dateutil.parser.parse(f"{created[:-1]:0<14}Z")
        except (ValueError, OverflowError):
            # dateutil documents both exception types for parse(); either
            # one means the stamp is not a usable date.
            return None

    @classmethod
    def _get_title(cls, title):
        """Return the captured title unchanged."""
        return title

    @classmethod
    def _mangle_property(cls, properties, name):
        """Replace ``properties[name]`` in place with ``_get_<name>(value)``.

        Does nothing when ``name`` is not a key of ``properties``.
        """
        if name in properties:
            properties[name] = getattr(cls, f"_get_{name}")(properties[name])

    @classmethod
    def from_filename(cls, filename) -> "FileInfo":
        """Build a ``FileInfo`` from ``filename``.

        Args:
            filename: File name to analyse, including its extension.

        Returns:
            A ``FileInfo`` holding the captured ``title`` and, when the name
            starts with a ``YYYYMMDD[hhmmss]Z - `` prefix, ``created``. If
            no pattern matches at all, the extension-less name is used as
            the title so callers never receive ``None``.
        """
        # Rewrite the name before parsing its components by applying at most
        # one of the configured transformations (the first that substitutes).
        for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS:
            filename, count = pattern.subn(repl, filename)
            if count:
                break

        # Do this after the transforms so that the transforms can do whatever
        # they want with the file extension.
        filename_no_ext = os.path.splitext(filename)[0]

        if filename_no_ext == filename and filename.startswith("."):
            # Very special case: there is no text before the file type.
            # splitext() keeps names like '.pdf' whole because they are
            # usually hidden files called "pdf", but here it's more likely
            # that there is simply no name, so parse an empty string and end
            # up with an empty title.
            # TODO: this should be handled better.
            filename = ""
        else:
            filename = filename_no_ext

        # Parse filename components.
        for regex in cls.REGEXES.values():
            m = regex.match(filename)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
                cls._mangle_property(properties, "title")
                return cls(**properties)

        # Neither pattern matched. "." does not match a newline, so this is
        # reached for names with a line break before their last character.
        # Previously the method fell off the end and returned None.
        return cls(title=filename)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Extending User Model Using a One-To-One Link
 | 
			
		||||
class UiSettings(models.Model):
 | 
			
		||||
    user = models.OneToOneField(
 | 
			
		||||
 
 | 
			
		||||
@@ -1,12 +1,10 @@
 | 
			
		||||
import datetime
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import shutil
 | 
			
		||||
import stat
 | 
			
		||||
import tempfile
 | 
			
		||||
import zoneinfo
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from unittest import TestCase as UnittestTestCase
 | 
			
		||||
from unittest import mock
 | 
			
		||||
from unittest.mock import MagicMock
 | 
			
		||||
 | 
			
		||||
@@ -26,7 +24,6 @@ from documents.models import Correspondent
 | 
			
		||||
from documents.models import CustomField
 | 
			
		||||
from documents.models import Document
 | 
			
		||||
from documents.models import DocumentType
 | 
			
		||||
from documents.models import FileInfo
 | 
			
		||||
from documents.models import StoragePath
 | 
			
		||||
from documents.models import Tag
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
@@ -40,143 +37,6 @@ from paperless_mail.models import MailRule
 | 
			
		||||
from paperless_mail.parsers import MailDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestAttributes(UnittestTestCase):
    """Checks what FileInfo.from_filename guesses for unusual file names."""

    TAGS = ("tag1", "tag2", "tag3")

    def _test_guess_attributes_from_name(self, filename, sender, title, tags):
        """Parse ``filename`` and compare the result with the expectations.

        ``filename`` doubles as the assertion message so a failure names the
        input that caused it.
        """
        parsed = FileInfo.from_filename(filename)

        if not sender:
            self.assertIsNone(parsed.correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent.name, sender, filename)

        self.assertEqual(parsed.title, title, filename)

        tag_names = tuple(tag.name for tag in parsed.tags)
        self.assertEqual(tag_names, tags, filename)

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        """A leading dash is kept as part of the title."""
        self._test_guess_attributes_from_name(
            filename="- weird but should not break.pdf",
            sender=None,
            title="- weird but should not break",
            tags=(),
        )

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        """A trailing dash is kept as part of the title."""
        self._test_guess_attributes_from_name(
            filename="weird but should not break -.pdf",
            sender=None,
            title="weird but should not break -",
            tags=(),
        )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestFieldPermutations(TestCase):
    """Exercises FileInfo.from_filename over combinations of date and title."""

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]

    def _test_guessed_attributes(
        self,
        filename,
        created=None,
        correspondent=None,
        title=None,
        tags=None,
    ):
        """Parse ``filename`` and compare every attribute with the expectation.

        Args:
            filename: Name handed to ``FileInfo.from_filename``; also used as
                the message of every assertion.
            created: Expected stamp as a digit string (``YYYYMMDD...``), or
                ``None`` when no date is expected.
            correspondent: Expected correspondent name; falsy means the
                parsed correspondent must be ``None``.
            title: Expected title.
            tags: Expected tag names as one comma-separated string, or
                ``None`` when the parsed tags must be the empty tuple.
        """
        info = FileInfo.from_filename(filename)

        # Created: only year, month and day are compared.
        if created is None:
            self.assertIsNone(info.created, filename)
        else:
            self.assertEqual(info.created.year, int(created[:4]), filename)
            self.assertEqual(info.created.month, int(created[4:6]), filename)
            self.assertEqual(info.created.day, int(created[6:8]), filename)

        # Correspondent
        if correspondent:
            self.assertEqual(info.correspondent.name, correspondent, filename)
        else:
            self.assertEqual(info.correspondent, None, filename)

        # Title
        self.assertEqual(info.title, title, filename)

        # Tags
        if tags is None:
            self.assertEqual(info.tags, (), filename)
        else:
            self.assertEqual([t.name for t in info.tags], tags.split(","), filename)

    def test_just_title(self):
        """A name without a date prefix yields only a title."""
        template = "{title}.pdf"
        for title in self.valid_titles:
            spec = {"title": title}
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_created_and_title(self):
        """A ``<stamp> - <title>`` name yields both attributes."""
        template = "{created} - {title}.pdf"

        for created in self.valid_dates:
            for title in self.valid_titles:
                spec = {"created": created, "title": title}
                self._test_guessed_attributes(template.format(**spec), **spec)

    def test_invalid_date_format(self):
        """A stamp with the right shape but no valid date gives ``created=None``."""
        info = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
        self.assertIsNone(info.created)

    def test_filename_parse_transforms(self):
        """At most one configured transform, the first that matches, is applied."""
        filename = "tag1,tag2_20190908_180610_0001.pdf"
        all_patt = re.compile("^.*$")
        none_patt = re.compile("$a")

        # No transformations configured (= default)
        info = FileInfo.from_filename(filename)
        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
        self.assertEqual(info.tags, ())
        self.assertIsNone(info.created)

        # Pattern doesn't match (filename unaltered)
        with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")

        # Simple transformation (match all)
        with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")

        # Multiple transformations configured (first pattern matches)
        with self.settings(
            FILENAME_PARSE_TRANSFORMS=[
                (all_patt, "all.gif"),
                (all_patt, "anotherall.gif"),
            ],
        ):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")

        # Multiple transformations configured (second pattern matches)
        with self.settings(
            FILENAME_PARSE_TRANSFORMS=[
                (none_patt, "none.gif"),
                (all_patt, "anotherall.gif"),
            ],
        ):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "anotherall")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class _BaseTestParser(DocumentParser):
 | 
			
		||||
    def get_settings(self):
 | 
			
		||||
        """
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,6 @@ import json
 | 
			
		||||
import math
 | 
			
		||||
import multiprocessing
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import tempfile
 | 
			
		||||
from os import PathLike
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
@@ -1089,11 +1088,6 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 | 
			
		||||
# fewer dates shown.
 | 
			
		||||
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3)
 | 
			
		||||
 | 
			
		||||
# Transformations applied before filename parsing.
# PAPERLESS_FILENAME_PARSE_TRANSFORMS holds a JSON list of objects; each one
# supplies a "pattern" (compiled as a regular expression) and a "repl" (the
# replacement paired with it). Unset means no transformations.
# Built as a single expression so that no loop variable is left behind as a
# stray module-level name.
FILENAME_PARSE_TRANSFORMS = [
    (re.compile(transform["pattern"]), transform["repl"])
    for transform in json.loads(
        os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]"),
    )
]
 | 
			
		||||
 | 
			
		||||
# Specify the filename format for out files
 | 
			
		||||
FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user