Merge pull request #542 from grembo/master

Allow configuring transformations to be applied to the filename before
2026-02-03 23:22:42 -06:00 · 2019-09-09 20:53:45 +01:00
parent d1cc5bb256 658e73d79a
commit 882c3b7083
5 changed files with 141 additions and 1 deletions
--- a/docs/guesswork.rst
+++ b/docs/guesswork.rst
@@ -54,6 +54,34 @@ filename as described above.

 .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings

+Transforming filenames for parsing
+----------------------------------
+Some devices can't produce filenames that can be parsed by the default
+parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
+``paperless.conf`` one can add transformations that are applied to the filename
+before it's parsed.
+
+The option contains a list of dictionaries of regular expressions (key:
+``pattern``) and replacements (key: ``repl``) in JSON format, which are
+applied in order by passing them to ``re.subn``. Transformation stops
+after the first match, so at most one transformation is applied. The general
+syntax is
+
+.. code:: python
+
+   [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
+
+The example below is for a Brother ADS-2400N, a scanner that allows assigning
+different names to different hardware buttons (useful for handling
+multiple entities in one instance), but insists on adding ``_<count>``
+to the filename.
+
+.. code:: python
+
+   # Brother profile configuration, support "Name_Date_Count" (the default
+   # setting) and "Name_Count" (use "Name" as tag and "Count" as title).
+   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
+
 .. _guesswork-content:

 Reading the Document Contents
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
 # as normal.
 #PAPERLESS_FILENAME_DATE_ORDER="YMD"

+# Sometimes devices won't create filenames which can be parsed properly
+# by the filename parser (see
+# https://paperless.readthedocs.io/en/latest/guesswork.html).
+#
+# This setting allows you to specify a list of transformations
+# in regular expression syntax, which are passed in order to re.subn.
+# Transformation stops after the first match, so at most one transformation
+# is applied.
+#
+# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
+# as keys.
+#
+# The example below transforms filenames created by a Brother ADS-2400N
+# document scanner in its standard configuration `Name_Date_Count', so that
+# count is used as title, name as tag and date can be parsed by paperless.
+#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
+
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource device (like a Raspberry Pi), modifying
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -483,8 +483,18 @@ class FileInfo:
          "<title>.<suffix>"
        """

+        filename = os.path.basename(path)
+
+        # Mutate filename in-place before parsing its components
+        # by applying at most one of the configured transformations.
+        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
+            (filename, count) = pattern.subn(repl, filename)
+            if count:
+                break
+
+        # Parse filename components.
        for regex in cls.REGEXES.values():
-            m = regex.match(os.path.basename(path))
+            m = regex.match(filename)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,3 +1,5 @@
+import re
+
 from django.test import TestCase
 from unittest import mock
 from tempfile import TemporaryDirectory
@@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
        info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
        self.assertIsNone(info.created)
+
+    def test_filename_parse_transforms(self):
+
+        path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
+        all_patt = re.compile("^.*$")
+        none_patt = re.compile("$a")
+        exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
+        repl1 = " - \\4 - \\1."    # (empty) correspondent, title and tags
+        repl2 = "\\2Z - " + repl1  # creation date + repl1
+
+        # No transformations configured (= default)
+        info = FileInfo.from_path(path)
+        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+        self.assertEqual(info.extension, "pdf")
+        self.assertEqual(info.tags, ())
+        self.assertIsNone(info.created)
+
+        # Pattern doesn't match (filename unaltered)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+            self.assertEqual(info.extension, "pdf")
+
+        # Simple transformation (match all)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (first pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (all_patt, "all.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (second pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (none_patt, "none.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "anotherall")
+            self.assertEqual(info.extension, "gif")
+
+        # Complex transformation without date in replacement string
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertIsNone(info.created)
+
+        # Complex transformation with date in replacement string
+        with self.settings(
+            FILENAME_PARSE_TRANSFORMS=[
+                (none_patt, "none.gif"),
+                (exact_patt, repl2),    # <-- matches
+                (exact_patt, repl1),
+                (all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertEqual(info.created.year, 2019)
+            self.assertEqual(info.created.month, 9)
+            self.assertEqual(info.created.day, 8)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -10,7 +10,9 @@ For the full list of settings and their values, see
 https://docs.djangoproject.com/en/1.10/ref/settings/
 """

+import json
 import os
+import re

 from dotenv import load_dotenv

@@ -322,6 +324,11 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

+# Transformations applied before filename parsing
+FILENAME_PARSE_TRANSFORMS = []
+for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
+    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
+
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as
 # well. Set to 0 to disable this filter.