mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge pull request #542 from grembo/master
Allow configuring transformations to be applied to the filename before
This commit is contained in:
		@@ -54,6 +54,34 @@ filename as described above.
 | 
			
		||||
 | 
			
		||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
 | 
			
		||||
 | 
			
		||||
Transforming filenames for parsing
 | 
			
		||||
----------------------------------
 | 
			
		||||
Some devices can't produce filenames that can be parsed by the default
 | 
			
		||||
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
 | 
			
		||||
``paperless.conf`` one can add transformations that are applied to the filename
 | 
			
		||||
before it's parsed.
 | 
			
		||||
 | 
			
		||||
The option contains a list of dictionaries of regular expressions (key:
 | 
			
		||||
``pattern``) and replacements (key: ``repl``) in JSON format, which are
 | 
			
		||||
applied in order by passing them to ``re.subn``. Transformation stops
 | 
			
		||||
after the first match, so at most one transformation is applied. The general
 | 
			
		||||
syntax is
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
   [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
 | 
			
		||||
 | 
			
		||||
The example below is for a Brother ADS-2400N, a scanner that allows
 | 
			
		||||
assigning different names to different hardware buttons (useful for handling
 | 
			
		||||
multiple entities in one instance), but insists on adding ``_<count>``
 | 
			
		||||
to the filename.
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
   # Brother profile configuration, support "Name_Date_Count" (the default
 | 
			
		||||
   # setting) and "Name_Count" (use "Name" as tag and "Count" as title).
 | 
			
		||||
   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
 | 
			
		||||
 | 
			
		||||
.. _guesswork-content:
 | 
			
		||||
 | 
			
		||||
Reading the Document Contents
 | 
			
		||||
 
 | 
			
		||||
@@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
 | 
			
		||||
# as normal.
 | 
			
		||||
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
 | 
			
		||||
 | 
			
		||||
# Sometimes devices won't create filenames which can be parsed properly
 | 
			
		||||
# by the filename parser (see
 | 
			
		||||
# https://paperless.readthedocs.io/en/latest/guesswork.html).
 | 
			
		||||
#
 | 
			
		||||
# This setting allows you to specify a list of transformations
 | 
			
		||||
# in regular expression syntax, which are passed in order to re.subn.
 | 
			
		||||
# Transformation stops after the first match, so at most one transformation
 | 
			
		||||
# is applied.
 | 
			
		||||
#
 | 
			
		||||
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
 | 
			
		||||
# as keys.
 | 
			
		||||
#
 | 
			
		||||
# The example below transforms filenames created by a Brother ADS-2400N
 | 
			
		||||
# document scanner in its standard configuration `Name_Date_Count', so that
 | 
			
		||||
# count is used as title, name as tag and date can be parsed by paperless.
 | 
			
		||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# The following values use sensible defaults for modern systems, but if you're
 | 
			
		||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
 | 
			
		||||
 
 | 
			
		||||
@@ -483,8 +483,18 @@ class FileInfo:
 | 
			
		||||
          "<title>.<suffix>"
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        filename = os.path.basename(path)
 | 
			
		||||
 | 
			
		||||
        # Rewrite the filename before parsing its components
 | 
			
		||||
        # by applying at most one of the configured transformations.
 | 
			
		||||
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
 | 
			
		||||
            (filename, count) = pattern.subn(repl, filename)
 | 
			
		||||
            if count:
 | 
			
		||||
                break
 | 
			
		||||
 | 
			
		||||
        # Parse filename components.
 | 
			
		||||
        for regex in cls.REGEXES.values():
 | 
			
		||||
            m = regex.match(os.path.basename(path))
 | 
			
		||||
            m = regex.match(filename)
 | 
			
		||||
            if m:
 | 
			
		||||
                properties = m.groupdict()
 | 
			
		||||
                cls._mangle_property(properties, "created")
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,5 @@
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from unittest import mock
 | 
			
		||||
from tempfile import TemporaryDirectory
 | 
			
		||||
@@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
 | 
			
		||||
        info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
 | 
			
		||||
        self.assertEqual(info.title, "title")
 | 
			
		||||
        self.assertIsNone(info.created)
 | 
			
		||||
 | 
			
		||||
    def test_filename_parse_transforms(self):
 | 
			
		||||
 | 
			
		||||
        path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
 | 
			
		||||
        all_patt = re.compile("^.*$")
 | 
			
		||||
        none_patt = re.compile("$a")
 | 
			
		||||
        exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
 | 
			
		||||
        repl1 = " - \\4 - \\1."    # (empty) correspondent, title and tags
 | 
			
		||||
        repl2 = "\\2Z - " + repl1  # creation date + repl1
 | 
			
		||||
 | 
			
		||||
        # No transformations configured (= default)
 | 
			
		||||
        info = FileInfo.from_path(path)
 | 
			
		||||
        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
 | 
			
		||||
        self.assertEqual(info.extension, "pdf")
 | 
			
		||||
        self.assertEqual(info.tags, ())
 | 
			
		||||
        self.assertIsNone(info.created)
 | 
			
		||||
 | 
			
		||||
        # Pattern doesn't match (filename unaltered)
 | 
			
		||||
        with self.settings(
 | 
			
		||||
                FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
 | 
			
		||||
            self.assertEqual(info.extension, "pdf")
 | 
			
		||||
 | 
			
		||||
        # Simple transformation (match all)
 | 
			
		||||
        with self.settings(
 | 
			
		||||
                FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "all")
 | 
			
		||||
            self.assertEqual(info.extension, "gif")
 | 
			
		||||
 | 
			
		||||
        # Multiple transformations configured (first pattern matches)
 | 
			
		||||
        with self.settings(
 | 
			
		||||
                FILENAME_PARSE_TRANSFORMS=[
 | 
			
		||||
                    (all_patt, "all.gif"),
 | 
			
		||||
                    (all_patt, "anotherall.gif")]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "all")
 | 
			
		||||
            self.assertEqual(info.extension, "gif")
 | 
			
		||||
 | 
			
		||||
        # Multiple transformations configured (second pattern matches)
 | 
			
		||||
        with self.settings(
 | 
			
		||||
                FILENAME_PARSE_TRANSFORMS=[
 | 
			
		||||
                    (none_patt, "none.gif"),
 | 
			
		||||
                    (all_patt, "anotherall.gif")]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "anotherall")
 | 
			
		||||
            self.assertEqual(info.extension, "gif")
 | 
			
		||||
 | 
			
		||||
        # Complex transformation without date in replacement string
 | 
			
		||||
        with self.settings(
 | 
			
		||||
                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "0001")
 | 
			
		||||
            self.assertEqual(info.extension, "pdf")
 | 
			
		||||
            self.assertEqual(len(info.tags), 2)
 | 
			
		||||
            self.assertEqual(info.tags[0].slug, "tag1")
 | 
			
		||||
            self.assertEqual(info.tags[1].slug, "tag2")
 | 
			
		||||
            self.assertIsNone(info.created)
 | 
			
		||||
 | 
			
		||||
        # Complex transformation with date in replacement string
 | 
			
		||||
        with self.settings(
 | 
			
		||||
            FILENAME_PARSE_TRANSFORMS=[
 | 
			
		||||
                (none_patt, "none.gif"),
 | 
			
		||||
                (exact_patt, repl2),    # <-- matches
 | 
			
		||||
                (exact_patt, repl1),
 | 
			
		||||
                (all_patt, "all.gif")]):
 | 
			
		||||
            info = FileInfo.from_path(path)
 | 
			
		||||
            self.assertEqual(info.title, "0001")
 | 
			
		||||
            self.assertEqual(info.extension, "pdf")
 | 
			
		||||
            self.assertEqual(len(info.tags), 2)
 | 
			
		||||
            self.assertEqual(info.tags[0].slug, "tag1")
 | 
			
		||||
            self.assertEqual(info.tags[1].slug, "tag2")
 | 
			
		||||
            self.assertEqual(info.created.year, 2019)
 | 
			
		||||
            self.assertEqual(info.created.month, 9)
 | 
			
		||||
            self.assertEqual(info.created.day, 8)
 | 
			
		||||
 
 | 
			
		||||
@@ -10,7 +10,9 @@ For the full list of settings and their values, see
 | 
			
		||||
https://docs.djangoproject.com/en/1.10/ref/settings/
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from dotenv import load_dotenv
 | 
			
		||||
 | 
			
		||||
@@ -322,6 +324,11 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 | 
			
		||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 | 
			
		||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 | 
			
		||||
 | 
			
		||||
# Transformations applied before filename parsing
 | 
			
		||||
FILENAME_PARSE_TRANSFORMS = []
 | 
			
		||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
 | 
			
		||||
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
 | 
			
		||||
 | 
			
		||||
# Specify for how many years a correspondent is considered recent. Recent
 | 
			
		||||
# correspondents will be shown in a separate "Recent correspondents" filter as
 | 
			
		||||
# well. Set to 0 to disable this filter.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user