Merge pull request #542 from grembo/master

Allow configuring transformations to be applied to the filename before
2026-02-14 00:09:35 -06:00 · 2019-09-09 20:53:45 +01:00
parent ebd9f918d2 4f85d9ed9f
commit 1c956652f3
5 changed files with 141 additions and 1 deletions
--- a/docs/guesswork.rst
+++ b/docs/guesswork.rst
@@ -54,6 +54,34 @@ filename as described above.
 .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
 Transforming filenames for parsing
 ----------------------------------
 Some devices can't produce filenames that can be parsed by the default
 parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
 ``paperless.conf`` one can add transformations that are applied to the filename
 before it's parsed.
 The option contains a list of dictionaries of regular expressions (key:
 ``pattern``) and replacements (key: ``repl``) in JSON format, which are
 applied in order by passing them to ``re.subn``. Transformation stops
 after the first match, so at most one transformation is applied. The general
 syntax is
 .. code:: python
   [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
 The example below is for a Brother ADS-2400N, a scanner that allows
 assigning different names to different hardware buttons (useful for
 handling multiple entities in one instance), but insists on adding
 ``_<count>`` to the filename.
 .. code:: python
   # Brother profile configuration, support "Name_Date_Count" (the default
   # setting) and "Name_Count" (use "Name" as tag and "Count" as title).
   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
 .. _guesswork-content:
 Reading the Document Contents
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
 # as normal.
 #PAPERLESS_FILENAME_DATE_ORDER="YMD"
 # Sometimes devices won't create filenames which can be parsed properly
 # by the filename parser (see
 # https://paperless.readthedocs.io/en/latest/guesswork.html).
 #
 # This setting allows you to specify a list of transformations
 # in regular expression syntax, which are passed in order to re.subn.
 # Transformation stops after the first match, so at most one transformation
 # is applied.
 #
 # Syntax is a JSON array of dictionaries containing "pattern" and "repl"
 # as keys.
 #
 # The example below transforms filenames created by a Brother ADS-2400N
 # document scanner in its standard configuration `Name_Date_Count', so that
 # count is used as title, name as tag and date can be parsed by paperless.
 #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource device (like a Raspberry Pi), modifying
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -483,8 +483,18 @@ class FileInfo:
          "<title>.<suffix>"
        """
        filename = os.path.basename(path)
        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
            (filename, count) = pattern.subn(repl, filename)
            if count:
                break
        # Parse filename components.
        for regex in cls.REGEXES.values():
-            m = regex.match(os.path.basename(path))
+            m = regex.match(filename)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,3 +1,5 @@
 import re
 from django.test import TestCase
 from unittest import mock
 from tempfile import TemporaryDirectory
@@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
        info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
        self.assertIsNone(info.created)
    def test_filename_parse_transforms(self):
        # FileInfo.from_path() is run against one sample path while
        # settings.FILENAME_PARSE_TRANSFORMS is overridden with different
        # lists of (compiled pattern, replacement) pairs.
        scanned = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
        raw_title = "tag1,tag2_20190908_180610_0001"
        matches_everything = re.compile(r"^.*$")
        matches_nothing = re.compile(r"$a")
        matches_fields = re.compile(
            r"^([a-z0-9,]+)_(\d{8})_(\d{6})_([0-9]+)\.")
        # Replacement producing an (empty) correspondent, a title and tags
        undated = r" - \4 - \1."
        # Same replacement, prefixed with the creation date
        dated = r"\2Z - " + undated
        # Default: no transformations configured
        parsed = FileInfo.from_path(scanned)
        self.assertEqual(parsed.title, raw_title)
        self.assertEqual(parsed.extension, "pdf")
        self.assertEqual(parsed.tags, ())
        self.assertIsNone(parsed.created)
        # A pattern that does not match leaves the filename unaltered
        never = [(matches_nothing, "none.gif")]
        with self.settings(FILENAME_PARSE_TRANSFORMS=never):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, raw_title)
            self.assertEqual(parsed.extension, "pdf")
        # A match-all pattern replaces the whole filename
        always = [(matches_everything, "all.gif")]
        with self.settings(FILENAME_PARSE_TRANSFORMS=always):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, "all")
            self.assertEqual(parsed.extension, "gif")
        # Several transformations, the first one matches: the second
        # must not be applied on top of it
        first_wins = [
            (matches_everything, "all.gif"),
            (matches_everything, "anotherall.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=first_wins):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, "all")
            self.assertEqual(parsed.extension, "gif")
        # Several transformations, only the second one matches
        second_wins = [
            (matches_nothing, "none.gif"),
            (matches_everything, "anotherall.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=second_wins):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, "anotherall")
            self.assertEqual(parsed.extension, "gif")
        # Group-based transformation whose replacement carries no date
        without_date = [(matches_fields, undated)]
        with self.settings(FILENAME_PARSE_TRANSFORMS=without_date):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, "0001")
            self.assertEqual(parsed.extension, "pdf")
            self.assertEqual(len(parsed.tags), 2)
            for index, slug in enumerate(("tag1", "tag2")):
                self.assertEqual(parsed.tags[index].slug, slug)
            self.assertIsNone(parsed.created)
        # Group-based transformation whose replacement carries the date;
        # the second entry is the one expected to match
        with_date = [
            (matches_nothing, "none.gif"),
            (matches_fields, dated),
            (matches_fields, undated),
            (matches_everything, "all.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=with_date):
            parsed = FileInfo.from_path(scanned)
            self.assertEqual(parsed.title, "0001")
            self.assertEqual(parsed.extension, "pdf")
            self.assertEqual(len(parsed.tags), 2)
            for index, slug in enumerate(("tag1", "tag2")):
                self.assertEqual(parsed.tags[index].slug, slug)
            created = parsed.created
            self.assertEqual(created.year, 2019)
            self.assertEqual(created.month, 9)
            self.assertEqual(created.day, 8)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -10,7 +10,9 @@ For the full list of settings and their values, see
 https://docs.djangoproject.com/en/1.10/ref/settings/
 """
 import json
 import os
 import re
 from dotenv import load_dotenv
@@ -322,6 +324,11 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 # Transformations applied before filename parsing.
 # PAPERLESS_FILENAME_PARSE_TRANSFORMS holds a JSON array of objects with the
 # keys "pattern" and "repl"; each entry becomes a (compiled regex,
 # replacement) tuple, kept in the configured order. When the variable is
 # unset the list is empty. A comprehension is used so that no loop variable
 # is left behind in the settings module namespace.
 FILENAME_PARSE_TRANSFORMS = [
     (re.compile(transform["pattern"]), transform["repl"])
     for transform in json.loads(
         os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")
     )
 ]
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as
 # well. Set to 0 to disable this filter.