mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Removes undocumented FileInfo (#9298)
This commit is contained in:
parent
344b2bc0eb
commit
f205c4d0e2
@ -26,7 +26,6 @@ from documents.models import CustomField
|
|||||||
from documents.models import CustomFieldInstance
|
from documents.models import CustomFieldInstance
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import FileInfo
|
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
@ -705,8 +704,6 @@ class ConsumerPlugin(
|
|||||||
) -> Document:
|
) -> Document:
|
||||||
# If someone gave us the original filename, use it instead of doc.
|
# If someone gave us the original filename, use it instead of doc.
|
||||||
|
|
||||||
file_info = FileInfo.from_filename(self.filename)
|
|
||||||
|
|
||||||
self.log.debug("Saving record to database")
|
self.log.debug("Saving record to database")
|
||||||
|
|
||||||
if self.metadata.created is not None:
|
if self.metadata.created is not None:
|
||||||
@ -714,9 +711,6 @@ class ConsumerPlugin(
|
|||||||
self.log.debug(
|
self.log.debug(
|
||||||
f"Creation date from post_documents parameter: {create_date}",
|
f"Creation date from post_documents parameter: {create_date}",
|
||||||
)
|
)
|
||||||
elif file_info.created is not None:
|
|
||||||
create_date = file_info.created
|
|
||||||
self.log.debug(f"Creation date from FileInfo: {create_date}")
|
|
||||||
elif date is not None:
|
elif date is not None:
|
||||||
create_date = date
|
create_date = date
|
||||||
self.log.debug(f"Creation date from parse_date: {create_date}")
|
self.log.debug(f"Creation date from parse_date: {create_date}")
|
||||||
@ -729,7 +723,11 @@ class ConsumerPlugin(
|
|||||||
|
|
||||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||||
|
|
||||||
title = file_info.title
|
if self.metadata.filename:
|
||||||
|
title = Path(self.metadata.filename).stem
|
||||||
|
else:
|
||||||
|
title = self.input_doc.original_file.stem
|
||||||
|
|
||||||
if self.metadata.title is not None:
|
if self.metadata.title is not None:
|
||||||
try:
|
try:
|
||||||
title = self._parse_title_placeholders(self.metadata.title)
|
title = self._parse_title_placeholders(self.metadata.title)
|
||||||
|
@ -1,11 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import os
|
|
||||||
import re
|
|
||||||
from collections import OrderedDict
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Final
|
from typing import Final
|
||||||
|
|
||||||
import dateutil.parser
|
|
||||||
import pathvalidate
|
import pathvalidate
|
||||||
from celery import states
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@ -517,91 +513,6 @@ class SavedViewFilterRule(models.Model):
|
|||||||
return f"SavedViewFilterRule: {self.rule_type} : {self.value}"
|
return f"SavedViewFilterRule: {self.rule_type} : {self.value}"
|
||||||
|
|
||||||
|
|
||||||
# TODO: why is this in the models file?
|
|
||||||
# TODO: how about, what is this and where is it documented?
|
|
||||||
# It appears to be parsing JSON from an environment variable to get a title and date from
|
|
||||||
# the filename, if possible, as a higher priority than either document filename or
|
|
||||||
# content parsing
|
|
||||||
class FileInfo:
|
|
||||||
REGEXES = OrderedDict(
|
|
||||||
[
|
|
||||||
(
|
|
||||||
"created-title",
|
|
||||||
re.compile(
|
|
||||||
r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$",
|
|
||||||
flags=re.IGNORECASE,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
created=None,
|
|
||||||
correspondent=None,
|
|
||||||
title=None,
|
|
||||||
tags=(),
|
|
||||||
extension=None,
|
|
||||||
):
|
|
||||||
self.created = created
|
|
||||||
self.title = title
|
|
||||||
self.extension = extension
|
|
||||||
self.correspondent = correspondent
|
|
||||||
self.tags = tags
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _get_created(cls, created):
|
|
||||||
try:
|
|
||||||
return dateutil.parser.parse(f"{created[:-1]:0<14}Z")
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _get_title(cls, title):
|
|
||||||
return title
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _mangle_property(cls, properties, name):
|
|
||||||
if name in properties:
|
|
||||||
properties[name] = getattr(cls, f"_get_{name}")(properties[name])
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_filename(cls, filename) -> "FileInfo":
|
|
||||||
# Mutate filename in-place before parsing its components
|
|
||||||
# by applying at most one of the configured transformations.
|
|
||||||
for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS:
|
|
||||||
(filename, count) = pattern.subn(repl, filename)
|
|
||||||
if count:
|
|
||||||
break
|
|
||||||
|
|
||||||
# do this after the transforms so that the transforms can do whatever
|
|
||||||
# with the file extension.
|
|
||||||
filename_no_ext = os.path.splitext(filename)[0]
|
|
||||||
|
|
||||||
if filename_no_ext == filename and filename.startswith("."):
|
|
||||||
# This is a very special case where there is no text before the
|
|
||||||
# file type.
|
|
||||||
# TODO: this should be handled better. The ext is not removed
|
|
||||||
# because usually, files like '.pdf' are just hidden files
|
|
||||||
# with the name pdf, but in our case, it's more likely that
|
|
||||||
# there's just no name to begin with.
|
|
||||||
filename = ""
|
|
||||||
# This isn't too bad either, since we'll just not match anything
|
|
||||||
# and return an empty title. TODO: actually, this is kinda bad.
|
|
||||||
else:
|
|
||||||
filename = filename_no_ext
|
|
||||||
|
|
||||||
# Parse filename components.
|
|
||||||
for regex in cls.REGEXES.values():
|
|
||||||
m = regex.match(filename)
|
|
||||||
if m:
|
|
||||||
properties = m.groupdict()
|
|
||||||
cls._mangle_property(properties, "created")
|
|
||||||
cls._mangle_property(properties, "title")
|
|
||||||
return cls(**properties)
|
|
||||||
|
|
||||||
|
|
||||||
# Extending User Model Using a One-To-One Link
|
# Extending User Model Using a One-To-One Link
|
||||||
class UiSettings(models.Model):
|
class UiSettings(models.Model):
|
||||||
user = models.OneToOneField(
|
user = models.OneToOneField(
|
||||||
|
@ -1,12 +1,10 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import stat
|
import stat
|
||||||
import tempfile
|
import tempfile
|
||||||
import zoneinfo
|
import zoneinfo
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest import TestCase as UnittestTestCase
|
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
@ -26,7 +24,6 @@ from documents.models import Correspondent
|
|||||||
from documents.models import CustomField
|
from documents.models import CustomField
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import FileInfo
|
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
@ -40,143 +37,6 @@ from paperless_mail.models import MailRule
|
|||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class TestAttributes(UnittestTestCase):
|
|
||||||
TAGS = ("tag1", "tag2", "tag3")
|
|
||||||
|
|
||||||
def _test_guess_attributes_from_name(self, filename, sender, title, tags):
|
|
||||||
file_info = FileInfo.from_filename(filename)
|
|
||||||
|
|
||||||
if sender:
|
|
||||||
self.assertEqual(file_info.correspondent.name, sender, filename)
|
|
||||||
else:
|
|
||||||
self.assertIsNone(file_info.correspondent, filename)
|
|
||||||
|
|
||||||
self.assertEqual(file_info.title, title, filename)
|
|
||||||
|
|
||||||
self.assertEqual(tuple(t.name for t in file_info.tags), tags, filename)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"- weird but should not break.pdf",
|
|
||||||
None,
|
|
||||||
"- weird but should not break",
|
|
||||||
(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
|
|
||||||
self._test_guess_attributes_from_name(
|
|
||||||
"weird but should not break -.pdf",
|
|
||||||
None,
|
|
||||||
"weird but should not break -",
|
|
||||||
(),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestFieldPermutations(TestCase):
|
|
||||||
valid_dates = (
|
|
||||||
"20150102030405Z",
|
|
||||||
"20150102Z",
|
|
||||||
)
|
|
||||||
valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""]
|
|
||||||
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""]
|
|
||||||
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
|
|
||||||
|
|
||||||
def _test_guessed_attributes(
|
|
||||||
self,
|
|
||||||
filename,
|
|
||||||
created=None,
|
|
||||||
correspondent=None,
|
|
||||||
title=None,
|
|
||||||
tags=None,
|
|
||||||
):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
|
|
||||||
# Created
|
|
||||||
if created is None:
|
|
||||||
self.assertIsNone(info.created, filename)
|
|
||||||
else:
|
|
||||||
self.assertEqual(info.created.year, int(created[:4]), filename)
|
|
||||||
self.assertEqual(info.created.month, int(created[4:6]), filename)
|
|
||||||
self.assertEqual(info.created.day, int(created[6:8]), filename)
|
|
||||||
|
|
||||||
# Correspondent
|
|
||||||
if correspondent:
|
|
||||||
self.assertEqual(info.correspondent.name, correspondent, filename)
|
|
||||||
else:
|
|
||||||
self.assertEqual(info.correspondent, None, filename)
|
|
||||||
|
|
||||||
# Title
|
|
||||||
self.assertEqual(info.title, title, filename)
|
|
||||||
|
|
||||||
# Tags
|
|
||||||
if tags is None:
|
|
||||||
self.assertEqual(info.tags, (), filename)
|
|
||||||
else:
|
|
||||||
self.assertEqual([t.name for t in info.tags], tags.split(","), filename)
|
|
||||||
|
|
||||||
def test_just_title(self):
|
|
||||||
template = "{title}.pdf"
|
|
||||||
for title in self.valid_titles:
|
|
||||||
spec = dict(title=title)
|
|
||||||
filename = template.format(**spec)
|
|
||||||
self._test_guessed_attributes(filename, **spec)
|
|
||||||
|
|
||||||
def test_created_and_title(self):
|
|
||||||
template = "{created} - {title}.pdf"
|
|
||||||
|
|
||||||
for created in self.valid_dates:
|
|
||||||
for title in self.valid_titles:
|
|
||||||
spec = {"created": created, "title": title}
|
|
||||||
self._test_guessed_attributes(template.format(**spec), **spec)
|
|
||||||
|
|
||||||
def test_invalid_date_format(self):
|
|
||||||
info = FileInfo.from_filename("06112017Z - title.pdf")
|
|
||||||
self.assertEqual(info.title, "title")
|
|
||||||
self.assertIsNone(info.created)
|
|
||||||
|
|
||||||
def test_filename_parse_transforms(self):
|
|
||||||
filename = "tag1,tag2_20190908_180610_0001.pdf"
|
|
||||||
all_patt = re.compile("^.*$")
|
|
||||||
none_patt = re.compile("$a")
|
|
||||||
re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
|
|
||||||
|
|
||||||
# No transformations configured (= default)
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
|
||||||
self.assertEqual(info.tags, ())
|
|
||||||
self.assertIsNone(info.created)
|
|
||||||
|
|
||||||
# Pattern doesn't match (filename unaltered)
|
|
||||||
with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
|
||||||
|
|
||||||
# Simple transformation (match all)
|
|
||||||
with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "all")
|
|
||||||
|
|
||||||
# Multiple transformations configured (first pattern matches)
|
|
||||||
with self.settings(
|
|
||||||
FILENAME_PARSE_TRANSFORMS=[
|
|
||||||
(all_patt, "all.gif"),
|
|
||||||
(all_patt, "anotherall.gif"),
|
|
||||||
],
|
|
||||||
):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "all")
|
|
||||||
|
|
||||||
# Multiple transformations configured (second pattern matches)
|
|
||||||
with self.settings(
|
|
||||||
FILENAME_PARSE_TRANSFORMS=[
|
|
||||||
(none_patt, "none.gif"),
|
|
||||||
(all_patt, "anotherall.gif"),
|
|
||||||
],
|
|
||||||
):
|
|
||||||
info = FileInfo.from_filename(filename)
|
|
||||||
self.assertEqual(info.title, "anotherall")
|
|
||||||
|
|
||||||
|
|
||||||
class _BaseTestParser(DocumentParser):
|
class _BaseTestParser(DocumentParser):
|
||||||
def get_settings(self):
|
def get_settings(self):
|
||||||
"""
|
"""
|
||||||
|
@ -3,7 +3,6 @@ import json
|
|||||||
import math
|
import math
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import tempfile
|
import tempfile
|
||||||
from os import PathLike
|
from os import PathLike
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -1089,11 +1088,6 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
|||||||
# fewer dates shown.
|
# fewer dates shown.
|
||||||
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3)
|
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3)
|
||||||
|
|
||||||
# Transformations applied before filename parsing
|
|
||||||
FILENAME_PARSE_TRANSFORMS = []
|
|
||||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
|
||||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
|
||||||
|
|
||||||
# Specify the filename format for out files
|
# Specify the filename format for out files
|
||||||
FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user