mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Removes undocumented FileInfo (#9298)
This commit is contained in:
		| @@ -26,7 +26,6 @@ from documents.models import CustomField | |||||||
| from documents.models import CustomFieldInstance | from documents.models import CustomFieldInstance | ||||||
| from documents.models import Document | from documents.models import Document | ||||||
| from documents.models import DocumentType | from documents.models import DocumentType | ||||||
| from documents.models import FileInfo |  | ||||||
| from documents.models import StoragePath | from documents.models import StoragePath | ||||||
| from documents.models import Tag | from documents.models import Tag | ||||||
| from documents.models import WorkflowTrigger | from documents.models import WorkflowTrigger | ||||||
| @@ -705,8 +704,6 @@ class ConsumerPlugin( | |||||||
|     ) -> Document: |     ) -> Document: | ||||||
|         # If someone gave us the original filename, use it instead of doc. |         # If someone gave us the original filename, use it instead of doc. | ||||||
|  |  | ||||||
|         file_info = FileInfo.from_filename(self.filename) |  | ||||||
|  |  | ||||||
|         self.log.debug("Saving record to database") |         self.log.debug("Saving record to database") | ||||||
|  |  | ||||||
|         if self.metadata.created is not None: |         if self.metadata.created is not None: | ||||||
| @@ -714,9 +711,6 @@ class ConsumerPlugin( | |||||||
|             self.log.debug( |             self.log.debug( | ||||||
|                 f"Creation date from post_documents parameter: {create_date}", |                 f"Creation date from post_documents parameter: {create_date}", | ||||||
|             ) |             ) | ||||||
|         elif file_info.created is not None: |  | ||||||
|             create_date = file_info.created |  | ||||||
|             self.log.debug(f"Creation date from FileInfo: {create_date}") |  | ||||||
|         elif date is not None: |         elif date is not None: | ||||||
|             create_date = date |             create_date = date | ||||||
|             self.log.debug(f"Creation date from parse_date: {create_date}") |             self.log.debug(f"Creation date from parse_date: {create_date}") | ||||||
| @@ -729,7 +723,11 @@ class ConsumerPlugin( | |||||||
|  |  | ||||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|         title = file_info.title |         if self.metadata.filename: | ||||||
|  |             title = Path(self.metadata.filename).stem | ||||||
|  |         else: | ||||||
|  |             title = self.input_doc.original_file.stem | ||||||
|  |  | ||||||
|         if self.metadata.title is not None: |         if self.metadata.title is not None: | ||||||
|             try: |             try: | ||||||
|                 title = self._parse_title_placeholders(self.metadata.title) |                 title = self._parse_title_placeholders(self.metadata.title) | ||||||
|   | |||||||
| @@ -1,11 +1,7 @@ | |||||||
| import datetime | import datetime | ||||||
| import os |  | ||||||
| import re |  | ||||||
| from collections import OrderedDict |  | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import Final | from typing import Final | ||||||
|  |  | ||||||
| import dateutil.parser |  | ||||||
| import pathvalidate | import pathvalidate | ||||||
| from celery import states | from celery import states | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| @@ -517,91 +513,6 @@ class SavedViewFilterRule(models.Model): | |||||||
|         return f"SavedViewFilterRule: {self.rule_type} : {self.value}" |         return f"SavedViewFilterRule: {self.rule_type} : {self.value}" | ||||||
|  |  | ||||||
|  |  | ||||||
| # TODO: why is this in the models file? |  | ||||||
| # TODO: how about, what is this and where is it documented? |  | ||||||
| # It appears to parsing JSON from an environment variable to get a title and date from |  | ||||||
| # the filename, if possible, as a higher priority than either document filename or |  | ||||||
| # content parsing |  | ||||||
| class FileInfo: |  | ||||||
|     REGEXES = OrderedDict( |  | ||||||
|         [ |  | ||||||
|             ( |  | ||||||
|                 "created-title", |  | ||||||
|                 re.compile( |  | ||||||
|                     r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$", |  | ||||||
|                     flags=re.IGNORECASE, |  | ||||||
|                 ), |  | ||||||
|             ), |  | ||||||
|             ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)), |  | ||||||
|         ], |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     def __init__( |  | ||||||
|         self, |  | ||||||
|         created=None, |  | ||||||
|         correspondent=None, |  | ||||||
|         title=None, |  | ||||||
|         tags=(), |  | ||||||
|         extension=None, |  | ||||||
|     ): |  | ||||||
|         self.created = created |  | ||||||
|         self.title = title |  | ||||||
|         self.extension = extension |  | ||||||
|         self.correspondent = correspondent |  | ||||||
|         self.tags = tags |  | ||||||
|  |  | ||||||
|     @classmethod |  | ||||||
|     def _get_created(cls, created): |  | ||||||
|         try: |  | ||||||
|             return dateutil.parser.parse(f"{created[:-1]:0<14}Z") |  | ||||||
|         except ValueError: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     @classmethod |  | ||||||
|     def _get_title(cls, title): |  | ||||||
|         return title |  | ||||||
|  |  | ||||||
|     @classmethod |  | ||||||
|     def _mangle_property(cls, properties, name): |  | ||||||
|         if name in properties: |  | ||||||
|             properties[name] = getattr(cls, f"_get_{name}")(properties[name]) |  | ||||||
|  |  | ||||||
|     @classmethod |  | ||||||
|     def from_filename(cls, filename) -> "FileInfo": |  | ||||||
|         # Mutate filename in-place before parsing its components |  | ||||||
|         # by applying at most one of the configured transformations. |  | ||||||
|         for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS: |  | ||||||
|             (filename, count) = pattern.subn(repl, filename) |  | ||||||
|             if count: |  | ||||||
|                 break |  | ||||||
|  |  | ||||||
|         # do this after the transforms so that the transforms can do whatever |  | ||||||
|         # with the file extension. |  | ||||||
|         filename_no_ext = os.path.splitext(filename)[0] |  | ||||||
|  |  | ||||||
|         if filename_no_ext == filename and filename.startswith("."): |  | ||||||
|             # This is a very special case where there is no text before the |  | ||||||
|             # file type. |  | ||||||
|             # TODO: this should be handled better. The ext is not removed |  | ||||||
|             #  because usually, files like '.pdf' are just hidden files |  | ||||||
|             #  with the name pdf, but in our case, its more likely that |  | ||||||
|             #  there's just no name to begin with. |  | ||||||
|             filename = "" |  | ||||||
|             # This isn't too bad either, since we'll just not match anything |  | ||||||
|             # and return an empty title. TODO: actually, this is kinda bad. |  | ||||||
|         else: |  | ||||||
|             filename = filename_no_ext |  | ||||||
|  |  | ||||||
|         # Parse filename components. |  | ||||||
|         for regex in cls.REGEXES.values(): |  | ||||||
|             m = regex.match(filename) |  | ||||||
|             if m: |  | ||||||
|                 properties = m.groupdict() |  | ||||||
|                 cls._mangle_property(properties, "created") |  | ||||||
|                 cls._mangle_property(properties, "title") |  | ||||||
|                 return cls(**properties) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Extending User Model Using a One-To-One Link | # Extending User Model Using a One-To-One Link | ||||||
| class UiSettings(models.Model): | class UiSettings(models.Model): | ||||||
|     user = models.OneToOneField( |     user = models.OneToOneField( | ||||||
|   | |||||||
| @@ -1,12 +1,10 @@ | |||||||
| import datetime | import datetime | ||||||
| import os | import os | ||||||
| import re |  | ||||||
| import shutil | import shutil | ||||||
| import stat | import stat | ||||||
| import tempfile | import tempfile | ||||||
| import zoneinfo | import zoneinfo | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from unittest import TestCase as UnittestTestCase |  | ||||||
| from unittest import mock | from unittest import mock | ||||||
| from unittest.mock import MagicMock | from unittest.mock import MagicMock | ||||||
|  |  | ||||||
| @@ -26,7 +24,6 @@ from documents.models import Correspondent | |||||||
| from documents.models import CustomField | from documents.models import CustomField | ||||||
| from documents.models import Document | from documents.models import Document | ||||||
| from documents.models import DocumentType | from documents.models import DocumentType | ||||||
| from documents.models import FileInfo |  | ||||||
| from documents.models import StoragePath | from documents.models import StoragePath | ||||||
| from documents.models import Tag | from documents.models import Tag | ||||||
| from documents.parsers import DocumentParser | from documents.parsers import DocumentParser | ||||||
| @@ -40,143 +37,6 @@ from paperless_mail.models import MailRule | |||||||
| from paperless_mail.parsers import MailDocumentParser | from paperless_mail.parsers import MailDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestAttributes(UnittestTestCase): |  | ||||||
|     TAGS = ("tag1", "tag2", "tag3") |  | ||||||
|  |  | ||||||
|     def _test_guess_attributes_from_name(self, filename, sender, title, tags): |  | ||||||
|         file_info = FileInfo.from_filename(filename) |  | ||||||
|  |  | ||||||
|         if sender: |  | ||||||
|             self.assertEqual(file_info.correspondent.name, sender, filename) |  | ||||||
|         else: |  | ||||||
|             self.assertIsNone(file_info.correspondent, filename) |  | ||||||
|  |  | ||||||
|         self.assertEqual(file_info.title, title, filename) |  | ||||||
|  |  | ||||||
|         self.assertEqual(tuple(t.name for t in file_info.tags), tags, filename) |  | ||||||
|  |  | ||||||
|     def test_guess_attributes_from_name_when_title_starts_with_dash(self): |  | ||||||
|         self._test_guess_attributes_from_name( |  | ||||||
|             "- weird but should not break.pdf", |  | ||||||
|             None, |  | ||||||
|             "- weird but should not break", |  | ||||||
|             (), |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def test_guess_attributes_from_name_when_title_ends_with_dash(self): |  | ||||||
|         self._test_guess_attributes_from_name( |  | ||||||
|             "weird but should not break -.pdf", |  | ||||||
|             None, |  | ||||||
|             "weird but should not break -", |  | ||||||
|             (), |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestFieldPermutations(TestCase): |  | ||||||
|     valid_dates = ( |  | ||||||
|         "20150102030405Z", |  | ||||||
|         "20150102Z", |  | ||||||
|     ) |  | ||||||
|     valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""] |  | ||||||
|     valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""] |  | ||||||
|     valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"] |  | ||||||
|  |  | ||||||
|     def _test_guessed_attributes( |  | ||||||
|         self, |  | ||||||
|         filename, |  | ||||||
|         created=None, |  | ||||||
|         correspondent=None, |  | ||||||
|         title=None, |  | ||||||
|         tags=None, |  | ||||||
|     ): |  | ||||||
|         info = FileInfo.from_filename(filename) |  | ||||||
|  |  | ||||||
|         # Created |  | ||||||
|         if created is None: |  | ||||||
|             self.assertIsNone(info.created, filename) |  | ||||||
|         else: |  | ||||||
|             self.assertEqual(info.created.year, int(created[:4]), filename) |  | ||||||
|             self.assertEqual(info.created.month, int(created[4:6]), filename) |  | ||||||
|             self.assertEqual(info.created.day, int(created[6:8]), filename) |  | ||||||
|  |  | ||||||
|         # Correspondent |  | ||||||
|         if correspondent: |  | ||||||
|             self.assertEqual(info.correspondent.name, correspondent, filename) |  | ||||||
|         else: |  | ||||||
|             self.assertEqual(info.correspondent, None, filename) |  | ||||||
|  |  | ||||||
|         # Title |  | ||||||
|         self.assertEqual(info.title, title, filename) |  | ||||||
|  |  | ||||||
|         # Tags |  | ||||||
|         if tags is None: |  | ||||||
|             self.assertEqual(info.tags, (), filename) |  | ||||||
|         else: |  | ||||||
|             self.assertEqual([t.name for t in info.tags], tags.split(","), filename) |  | ||||||
|  |  | ||||||
|     def test_just_title(self): |  | ||||||
|         template = "{title}.pdf" |  | ||||||
|         for title in self.valid_titles: |  | ||||||
|             spec = dict(title=title) |  | ||||||
|             filename = template.format(**spec) |  | ||||||
|             self._test_guessed_attributes(filename, **spec) |  | ||||||
|  |  | ||||||
|     def test_created_and_title(self): |  | ||||||
|         template = "{created} - {title}.pdf" |  | ||||||
|  |  | ||||||
|         for created in self.valid_dates: |  | ||||||
|             for title in self.valid_titles: |  | ||||||
|                 spec = {"created": created, "title": title} |  | ||||||
|                 self._test_guessed_attributes(template.format(**spec), **spec) |  | ||||||
|  |  | ||||||
|     def test_invalid_date_format(self): |  | ||||||
|         info = FileInfo.from_filename("06112017Z - title.pdf") |  | ||||||
|         self.assertEqual(info.title, "title") |  | ||||||
|         self.assertIsNone(info.created) |  | ||||||
|  |  | ||||||
|     def test_filename_parse_transforms(self): |  | ||||||
|         filename = "tag1,tag2_20190908_180610_0001.pdf" |  | ||||||
|         all_patt = re.compile("^.*$") |  | ||||||
|         none_patt = re.compile("$a") |  | ||||||
|         re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.") |  | ||||||
|  |  | ||||||
|         # No transformations configured (= default) |  | ||||||
|         info = FileInfo.from_filename(filename) |  | ||||||
|         self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") |  | ||||||
|         self.assertEqual(info.tags, ()) |  | ||||||
|         self.assertIsNone(info.created) |  | ||||||
|  |  | ||||||
|         # Pattern doesn't match (filename unaltered) |  | ||||||
|         with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]): |  | ||||||
|             info = FileInfo.from_filename(filename) |  | ||||||
|             self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") |  | ||||||
|  |  | ||||||
|         # Simple transformation (match all) |  | ||||||
|         with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]): |  | ||||||
|             info = FileInfo.from_filename(filename) |  | ||||||
|             self.assertEqual(info.title, "all") |  | ||||||
|  |  | ||||||
|         # Multiple transformations configured (first pattern matches) |  | ||||||
|         with self.settings( |  | ||||||
|             FILENAME_PARSE_TRANSFORMS=[ |  | ||||||
|                 (all_patt, "all.gif"), |  | ||||||
|                 (all_patt, "anotherall.gif"), |  | ||||||
|             ], |  | ||||||
|         ): |  | ||||||
|             info = FileInfo.from_filename(filename) |  | ||||||
|             self.assertEqual(info.title, "all") |  | ||||||
|  |  | ||||||
|         # Multiple transformations configured (second pattern matches) |  | ||||||
|         with self.settings( |  | ||||||
|             FILENAME_PARSE_TRANSFORMS=[ |  | ||||||
|                 (none_patt, "none.gif"), |  | ||||||
|                 (all_patt, "anotherall.gif"), |  | ||||||
|             ], |  | ||||||
|         ): |  | ||||||
|             info = FileInfo.from_filename(filename) |  | ||||||
|             self.assertEqual(info.title, "anotherall") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class _BaseTestParser(DocumentParser): | class _BaseTestParser(DocumentParser): | ||||||
|     def get_settings(self): |     def get_settings(self): | ||||||
|         """ |         """ | ||||||
|   | |||||||
| @@ -3,7 +3,6 @@ import json | |||||||
| import math | import math | ||||||
| import multiprocessing | import multiprocessing | ||||||
| import os | import os | ||||||
| import re |  | ||||||
| import tempfile | import tempfile | ||||||
| from os import PathLike | from os import PathLike | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| @@ -1089,11 +1088,6 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") | |||||||
| # fewer dates shown. | # fewer dates shown. | ||||||
| NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) | NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) | ||||||
|  |  | ||||||
| # Transformations applied before filename parsing |  | ||||||
| FILENAME_PARSE_TRANSFORMS = [] |  | ||||||
| for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): |  | ||||||
|     FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) |  | ||||||
|  |  | ||||||
| # Specify the filename format for out files | # Specify the filename format for out files | ||||||
| FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H