diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 0e728d7af..c12ecd0c4 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -54,6 +54,34 @@ filename as described above. .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings +Transforming filenames for parsing +---------------------------------- +Some devices can't produce filenames that can be parsed by the default +parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in +``paperless.conf`` one can add transformations that are applied to the filename +before it's parsed. + +The option contains a list of dictionaries of regular expressions (key: +``pattern``) and replacements (key: ``repl``) in JSON format, which are +applied in order by passing them to ``re.subn``. Transformation stops +after the first match, so at most one transformation is applied. The general +syntax is + +.. code:: python + + [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}] + +The example below is for a Brother ADS-2400N, a scanner that allows assigning +different names to different hardware buttons (useful for handling +multiple entities in one instance), but insists on adding ``_<count>`` +to the filename. + +.. code:: python + + # Brother profile configuration, supports "Name_Date_Count" (the default + # setting) and "Name_Count" (use "Name" as tag and "Count" as title). + PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}] + .. _guesswork-content: Reading the Document Contents diff --git a/paperless.conf.example b/paperless.conf.example index 05a6c9cca..b04e93f94 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET="" # as normal. 
#PAPERLESS_FILENAME_DATE_ORDER="YMD" +# Sometimes devices won't create filenames which can be parsed properly +# by the filename parser (see +# https://paperless.readthedocs.io/en/latest/guesswork.html). +# +# This setting allows specifying a list of transformations +# in regular expression syntax, which are passed in order to re.subn. +# Transformation stops after the first match, so at most one transformation +# is applied. +# +# Syntax is a JSON array of dictionaries containing "pattern" and "repl" +# as keys. +# +# The example below transforms filenames created by a Brother ADS-2400N +# document scanner in its standard configuration `Name_Date_Count', so that +# count is used as title, name as tag and date can be parsed by paperless. +#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}] + # # The following values use sensible defaults for modern systems, but if you're # running Paperless on a low-resource device (like a Raspberry Pi), modifying diff --git a/src/documents/models.py b/src/documents/models.py index 37c1cfdbf..9647fbacd 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -483,8 +483,14 @@ class FileInfo: "<title>.<suffix>" """ + filename = os.path.basename(path) + for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: + (filename, count) = pattern.subn(repl, filename) + if count: + break + for regex in cls.REGEXES.values(): - m = regex.match(os.path.basename(path)) + m = regex.match(filename) if m: properties = m.groupdict() cls._mangle_property(properties, "created") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 917d1e64f..ad110d7bf 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -10,7 +10,9 @@ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.10/ref/settings/ """ +import json import os +import re from dotenv import load_dotenv @@ -317,6 +319,15 @@ FY_END = 
os.getenv("PAPERLESS_FINANCIAL_YEAR_END") DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") +# Transformations applied before filename parsing +FILENAME_PARSE_TRANSFORMS = [] +_filename_parse_transforms = os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS") +if _filename_parse_transforms: + FILENAME_PARSE_TRANSFORMS = [( + re.compile(t["pattern"]), t["repl"]) + for t in json.loads(_filename_parse_transforms) + ] + # Specify for how many years a correspondent is considered recent. Recent # correspondents will be shown in a separate "Recent correspondents" filter as # well. Set to 0 to disable this filter.