mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Allow configuring transformations to be applied to the filename before
parsing. The motivation was that files produced by a Brother scanner wouldn't match paperless' expectations. At most one transformation is applied (first matching). It won't affect the filename on disk. This is generic enough so that it is useful for various purposes. In my case it allows me to use the different hardware buttons on the scanner to use different profiles, feeding one instance of paperless with documents of multiple entities and tagging them accordingly. Example: PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."},{"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
This commit is contained in:
parent
8e6d7cba13
commit
f0320fb72d
@ -54,6 +54,34 @@ filename as described above.
|
||||
|
||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
||||
|
||||
Transforming filenames for parsing
|
||||
----------------------------------
|
||||
Some devices can't produce filenames that can be parsed by the default
|
||||
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
|
||||
``paperless.conf`` one can add transformations that are applied to the filename
|
||||
before it's parsed.
|
||||
|
||||
The option contains a list of dictionaries of regular expressions (key:
|
||||
``pattern``) and replacements (key: ``repl``) in JSON format, which are
|
||||
applied in order by passing them to ``re.subn``. Transformation stops
|
||||
after the first match, so at most one transformation is applied. The general
|
||||
syntax is
|
||||
|
||||
.. code:: python
|
||||
|
||||
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
|
||||
|
||||
The example below is for a Brother ADS-2400N, a scanner that allows
|
||||
different names to different hardware buttons (useful for handling
|
||||
multiple entities in one instance), but insists on adding ``_<count>``
|
||||
to the filename.
|
||||
|
||||
.. code:: python
|
||||
|
||||
# Brother profile configuration, support "Name_Date_Count" (the default
|
||||
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
|
||||
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
|
||||
|
||||
.. _guesswork-content:
|
||||
|
||||
Reading the Document Contents
|
||||
|
@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
|
||||
# as normal.
|
||||
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
|
||||
|
||||
# Sometimes devices won't create filenames which can be parsed properly
|
||||
# by the filename parser (see
|
||||
# https://paperless.readthedocs.io/en/latest/guesswork.html).
|
||||
#
|
||||
# This setting allows to specify a list of transformations
|
||||
# in regular expression syntax, which are passed in order to re.sub.
|
||||
# Transformation stops after the first match, so at most one transformation
|
||||
# is applied.
|
||||
#
|
||||
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
|
||||
# as keys.
|
||||
#
|
||||
# The example below transforms filenames created by a Brother ADS-2400N
|
||||
# document scanner in its standard configuration `Name_Date_Count', so that
|
||||
# count is used as title, name as tag and date can be parsed by paperless.
|
||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
|
||||
|
||||
#
|
||||
# The following values use sensible defaults for modern systems, but if you're
|
||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
||||
|
@ -483,8 +483,14 @@ class FileInfo:
|
||||
"<title>.<suffix>"
|
||||
"""
|
||||
|
||||
filename = os.path.basename(path)
|
||||
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
||||
(filename, count) = pattern.subn(repl, filename)
|
||||
if count:
|
||||
break
|
||||
|
||||
for regex in cls.REGEXES.values():
|
||||
m = regex.match(os.path.basename(path))
|
||||
m = regex.match(filename)
|
||||
if m:
|
||||
properties = m.groupdict()
|
||||
cls._mangle_property(properties, "created")
|
||||
|
@ -10,7 +10,9 @@ For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/1.10/ref/settings/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@ -317,6 +319,15 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
# Transformations applied before filename parsing
|
||||
FILENAME_PARSE_TRANSFORMS = []
|
||||
_filename_parse_transforms = os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS")
|
||||
if _filename_parse_transforms:
|
||||
FILENAME_PARSE_TRANSFORMS = [(
|
||||
re.compile(t["pattern"]), t["repl"])
|
||||
for t in json.loads(_filename_parse_transforms)
|
||||
]
|
||||
|
||||
# Specify for how many years a correspondent is considered recent. Recent
|
||||
# correspondents will be shown in a separate "Recent correspondents" filter as
|
||||
# well. Set to 0 to disable this filter.
|
||||
|
Loading…
x
Reference in New Issue
Block a user