Allow configuring transformations to be applied to the filename before

parsing. The motivation was that files produced by a Brother scanner
didn't match paperless' filename expectations. At most one transformation
is applied: the first whose pattern matches. The filename on disk is not
changed.

This is generic enough to be useful for various purposes.
In my case it allows me to select different profiles with the
scanner's hardware buttons, feeding one instance of paperless
with documents of multiple entities and tagging them accordingly.

Example:

PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."},{"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
This commit is contained in:
Michael Gmelin
2019-05-18 19:25:50 +02:00
parent f7a4f0575e
commit 14b81c613c
4 changed files with 63 additions and 1 deletions

View File

@@ -483,8 +483,14 @@ class FileInfo:
"<title>.<suffix>"
"""
filename = os.path.basename(path)
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
(filename, count) = pattern.subn(repl, filename)
if count:
break
for regex in cls.REGEXES.values():
m = regex.match(os.path.basename(path))
m = regex.match(filename)
if m:
properties = m.groupdict()
cls._mangle_property(properties, "created")

View File

@@ -10,7 +10,9 @@ For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.10/ref/settings/
"""
import json
import os
import re
from dotenv import load_dotenv
@@ -317,6 +319,15 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
# Transformations applied before filename parsing
def _parse_filename_parse_transforms(raw):
    """Parse the JSON value of PAPERLESS_FILENAME_PARSE_TRANSFORMS.

    ``raw`` must be a JSON array of objects, each with a string "pattern"
    (a regular expression) and a string "repl" (its replacement template).

    Returns a list of ``(compiled_pattern, repl)`` tuples in the order
    given.

    Raises ValueError, naming the environment variable and the offending
    entry, when the value is not valid JSON, is not an array of objects,
    lacks a key, holds a non-string value, or contains a pattern that does
    not compile. Failing here keeps a configuration typo from surfacing as
    an unexplained JSONDecodeError/KeyError/re.error during settings import.
    """
    env_name = "PAPERLESS_FILENAME_PARSE_TRANSFORMS"

    try:
        entries = json.loads(raw)
    except ValueError as err:  # json.JSONDecodeError subclasses ValueError
        raise ValueError(
            "{} is not valid JSON: {}".format(env_name, err)
        ) from err

    if not isinstance(entries, list):
        raise ValueError(
            "{} must be a JSON array of objects with \"pattern\" and "
            "\"repl\" keys".format(env_name)
        )

    transforms = []
    for index, entry in enumerate(entries):
        if not isinstance(entry, dict):
            raise ValueError(
                "{}: entry {} must be a JSON object".format(env_name, index)
            )
        for key in ("pattern", "repl"):
            if key not in entry:
                raise ValueError(
                    "{}: entry {} is missing the \"{}\" key".format(
                        env_name, index, key
                    )
                )
            # A non-string value would otherwise only fail later, when the
            # transformation is applied to a filename.
            if not isinstance(entry[key], str):
                raise ValueError(
                    "{}: entry {} has a non-string \"{}\"".format(
                        env_name, index, key
                    )
                )
        try:
            compiled = re.compile(entry["pattern"])
        except re.error as err:
            raise ValueError(
                "{}: entry {} has an invalid pattern: {}".format(
                    env_name, index, err
                )
            ) from err
        transforms.append((compiled, entry["repl"]))
    return transforms


FILENAME_PARSE_TRANSFORMS = []
_filename_parse_transforms = os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS")
# An unset or empty variable leaves the list empty.
if _filename_parse_transforms:
    FILENAME_PARSE_TRANSFORMS = _parse_filename_parse_transforms(
        _filename_parse_transforms
    )
# Specify for how many years a correspondent is considered recent. Recent
# correspondents will be shown in a separate "Recent correspondents" filter as
# well. Set to 0 to disable this filter.