From f0320fb72d7059736ba1fc84c6781861c8415e49 Mon Sep 17 00:00:00 2001
From: Michael Gmelin <freebsd@grem.de>
Date: Sat, 18 May 2019 19:25:50 +0200
Subject: [PATCH 1/3] Allow configuring transformations to be applied to the
 filename before parsing. The motivation was that files produced by a Brother
 scanner wouldn't match paperless' expectations. At most one transformation is
 applied (first matching). It won't affect the filename on disk.

This is generic enough so that it is useful for various purposes.
In my case it allows me to use the different hardware buttons on
the scanner to use different profiles, feeding one instance of
paperless with documents of multiple entities and tagging them
accordingly.

Example:

PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."},{"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
---
 docs/guesswork.rst        | 28 ++++++++++++++++++++++++++++
 paperless.conf.example    | 17 +++++++++++++++++
 src/documents/models.py   |  8 +++++++-
 src/paperless/settings.py | 11 +++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/docs/guesswork.rst b/docs/guesswork.rst
index 0e728d7af..c12ecd0c4 100644
--- a/docs/guesswork.rst
+++ b/docs/guesswork.rst
@@ -54,6 +54,34 @@ filename as described above.
 
 .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
 
+Transforming filenames for parsing
+----------------------------------
+Some devices can't produce filenames that can be parsed by the default
+parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
+``paperless.conf`` one can add transformations that are applied to the filename
+before it's parsed.
+
+The option contains a list of dictionaries of regular expressions (key:
+``pattern``) and replacements (key: ``repl``) in JSON format, which are
+applied in order by passing them to ``re.subn``. Transformation stops
+after the first match, so at most one transformation is applied. The general
+syntax is
+
+.. code:: python
+
+   [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
+
+The example below is for a Brother ADS-2400N, a scanner that allows assigning
+different names to different hardware buttons (useful for handling
+multiple entities in one instance), but insists on appending ``_<count>``
+to the filename.
+
+.. code:: python
+
+   # Brother profile configuration, support "Name_Date_Count" (the default
+   # setting) and "Name_Count" (use "Name" as tag and "Count" as title).
+   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
+
 .. _guesswork-content:
 
 Reading the Document Contents
diff --git a/paperless.conf.example b/paperless.conf.example
index 05a6c9cca..b04e93f94 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
 # as normal.
 #PAPERLESS_FILENAME_DATE_ORDER="YMD"
 
+# Sometimes devices won't create filenames which can be parsed properly
+# by the filename parser (see
+# https://paperless.readthedocs.io/en/latest/guesswork.html).
+#
+# This setting allows specifying a list of transformations
+# in regular expression syntax, which are passed in order to re.subn.
+# Transformation stops after the first match, so at most one transformation
+# is applied.
+#
+# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
+# as keys.
+#
+# The example below transforms filenames created by a Brother ADS-2400N
+# document scanner in its standard configuration `Name_Date_Count', so that
+# count is used as title, name as tag and date can be parsed by paperless.
+#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
+
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource device (like a Raspberry Pi), modifying
diff --git a/src/documents/models.py b/src/documents/models.py
index 37c1cfdbf..9647fbacd 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -483,8 +483,14 @@ class FileInfo:
           "<title>.<suffix>"
         """
 
+        filename = os.path.basename(path)
+        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
+            (filename, count) = pattern.subn(repl, filename)
+            if count:
+                break
+
         for regex in cls.REGEXES.values():
-            m = regex.match(os.path.basename(path))
+            m = regex.match(filename)
             if m:
                 properties = m.groupdict()
                 cls._mangle_property(properties, "created")
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 917d1e64f..ad110d7bf 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -10,7 +10,9 @@ For the full list of settings and their values, see
 https://docs.djangoproject.com/en/1.10/ref/settings/
 """
 
+import json
 import os
+import re
 
 from dotenv import load_dotenv
 
@@ -317,6 +319,15 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 
+# Transformations applied before filename parsing
+FILENAME_PARSE_TRANSFORMS = []
+_filename_parse_transforms = os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS")
+if _filename_parse_transforms:
+    FILENAME_PARSE_TRANSFORMS = [(
+            re.compile(t["pattern"]), t["repl"])
+        for t in json.loads(_filename_parse_transforms)
+    ]
+
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as
 # well. Set to 0 to disable this filter.

From 3b88d6722a32c983062a1fd4c4fed8642e39f809 Mon Sep 17 00:00:00 2001
From: Michael Gmelin <freebsd@grem.de>
Date: Sun, 8 Sep 2019 17:00:02 +0200
Subject: [PATCH 2/3] Address review comments by @danielquinn

---
 src/documents/models.py   | 4 ++++
 src/paperless/settings.py | 8 ++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/documents/models.py b/src/documents/models.py
index 9647fbacd..c6fc8191e 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -484,11 +484,15 @@ class FileInfo:
         """
 
         filename = os.path.basename(path)
+
+        # Rewrite the local filename (not the file on disk) before parsing it
+        # by applying at most one of the configured transformations.
         for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
             (filename, count) = pattern.subn(repl, filename)
             if count:
                 break
 
+        # Parse filename components.
         for regex in cls.REGEXES.values():
             m = regex.match(filename)
             if m:
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index ad110d7bf..b6bc67ef0 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -321,12 +321,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 
 # Transformations applied before filename parsing
 FILENAME_PARSE_TRANSFORMS = []
-_filename_parse_transforms = os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS")
-if _filename_parse_transforms:
-    FILENAME_PARSE_TRANSFORMS = [(
-            re.compile(t["pattern"]), t["repl"])
-        for t in json.loads(_filename_parse_transforms)
-    ]
+for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS") or "[]"):
+    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
 
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as

From 4f85d9ed9f450e10afb390ed0ff989ca40cede6d Mon Sep 17 00:00:00 2001
From: Michael Gmelin <freebsd@grem.de>
Date: Sun, 8 Sep 2019 20:24:58 +0200
Subject: [PATCH 3/3] Add unit test for PAPERLESS_FILENAME_PARSE_TRANSFORMS
 feature.

---
 src/documents/tests/test_consumer.py | 78 ++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 3f5c69774..512447741 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,3 +1,5 @@
+import re
+
 from django.test import TestCase
 from unittest import mock
 from tempfile import TemporaryDirectory
@@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
         info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
         self.assertEqual(info.title, "title")
         self.assertIsNone(info.created)
+
+    def test_filename_parse_transforms(self):
+
+        path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
+        all_patt = re.compile("^.*$")
+        none_patt = re.compile("$a")
+        exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
+        repl1 = " - \\4 - \\1."    # (empty) correspondent, title and tags
+        repl2 = "\\2Z - " + repl1  # creation date + repl1
+
+        # No transformations configured (= default)
+        info = FileInfo.from_path(path)
+        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+        self.assertEqual(info.extension, "pdf")
+        self.assertEqual(info.tags, ())
+        self.assertIsNone(info.created)
+
+        # Pattern doesn't match (filename unaltered)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+            self.assertEqual(info.extension, "pdf")
+
+        # Simple transformation (match all)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (first pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (all_patt, "all.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (second pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (none_patt, "none.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "anotherall")
+            self.assertEqual(info.extension, "gif")
+
+        # Complex transformation without date in replacement string
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertIsNone(info.created)
+
+        # Complex transformation with date in replacement string
+        with self.settings(
+            FILENAME_PARSE_TRANSFORMS=[
+                (none_patt, "none.gif"),
+                (exact_patt, repl2),    # <-- matches
+                (exact_patt, repl1),
+                (all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertEqual(info.created.year, 2019)
+            self.assertEqual(info.created.month, 9)
+            self.assertEqual(info.created.day, 8)