mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Merge pull request #542 from grembo/master
Allow configuring transformations to be applied to the filename before
This commit is contained in:
commit
1c956652f3
@ -54,6 +54,34 @@ filename as described above.
|
|||||||
|
|
||||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
||||||
|
|
||||||
|
Transforming filenames for parsing
|
||||||
|
----------------------------------
|
||||||
|
Some devices can't produce filenames that can be parsed by the default
|
||||||
|
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
|
||||||
|
``paperless.conf`` one can add transformations that are applied to the filename
|
||||||
|
before it's parsed.
|
||||||
|
|
||||||
|
The option contains a list of dictionaries of regular expressions (key:
|
||||||
|
``pattern``) and replacements (key: ``repl``) in JSON format, which are
|
||||||
|
applied in order by passing them to ``re.subn``. Transformation stops
|
||||||
|
after the first match, so at most one transformation is applied. The general
|
||||||
|
syntax is
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
|
||||||
|
|
||||||
|
The example below is for a Brother ADS-2400N, a scanner that allows assigning
|
||||||
|
different names to different hardware buttons (useful for handling
|
||||||
|
multiple entities in one instance), but insists on adding ``_<count>``
|
||||||
|
to the filename.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
# Brother profile configuration, supports "Name_Date_Count" (the default
|
||||||
|
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
|
||||||
|
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
|
||||||
|
|
||||||
.. _guesswork-content:
|
.. _guesswork-content:
|
||||||
|
|
||||||
Reading the Document Contents
|
Reading the Document Contents
|
||||||
|
@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
|
|||||||
# as normal.
|
# as normal.
|
||||||
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
|
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
|
||||||
|
|
||||||
|
# Sometimes devices won't create filenames which can be parsed properly
|
||||||
|
# by the filename parser (see
|
||||||
|
# https://paperless.readthedocs.io/en/latest/guesswork.html).
|
||||||
|
#
|
||||||
|
# This setting allows you to specify a list of transformations
|
||||||
|
# in regular expression syntax, which are passed in order to re.subn.
|
||||||
|
# Transformation stops after the first match, so at most one transformation
|
||||||
|
# is applied.
|
||||||
|
#
|
||||||
|
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
|
||||||
|
# as keys.
|
||||||
|
#
|
||||||
|
# The example below transforms filenames created by a Brother ADS-2400N
|
||||||
|
# document scanner in its standard configuration "Name_Date_Count", so that
|
||||||
|
# count is used as title, name as tag and date can be parsed by paperless.
|
||||||
|
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
|
||||||
|
|
||||||
#
|
#
|
||||||
# The following values use sensible defaults for modern systems, but if you're
|
# The following values use sensible defaults for modern systems, but if you're
|
||||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
||||||
|
@ -483,8 +483,18 @@ class FileInfo:
|
|||||||
"<title>.<suffix>"
|
"<title>.<suffix>"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
filename = os.path.basename(path)
|
||||||
|
|
||||||
|
# Mutate filename in-place before parsing its components
|
||||||
|
# by applying at most one of the configured transformations.
|
||||||
|
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
||||||
|
(filename, count) = pattern.subn(repl, filename)
|
||||||
|
if count:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Parse filename components.
|
||||||
for regex in cls.REGEXES.values():
|
for regex in cls.REGEXES.values():
|
||||||
m = regex.match(os.path.basename(path))
|
m = regex.match(filename)
|
||||||
if m:
|
if m:
|
||||||
properties = m.groupdict()
|
properties = m.groupdict()
|
||||||
cls._mangle_property(properties, "created")
|
cls._mangle_property(properties, "created")
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
|
|||||||
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
|
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
|
||||||
self.assertEqual(info.title, "title")
|
self.assertEqual(info.title, "title")
|
||||||
self.assertIsNone(info.created)
|
self.assertIsNone(info.created)
|
||||||
|
|
||||||
|
def test_filename_parse_transforms(self):
|
||||||
|
|
||||||
|
path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
|
||||||
|
all_patt = re.compile("^.*$")
|
||||||
|
none_patt = re.compile("$a")
|
||||||
|
exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
|
||||||
|
repl1 = " - \\4 - \\1." # (empty) correspondent, title and tags
|
||||||
|
repl2 = "\\2Z - " + repl1 # creation date + repl1
|
||||||
|
|
||||||
|
# No transformations configured (= default)
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
||||||
|
self.assertEqual(info.extension, "pdf")
|
||||||
|
self.assertEqual(info.tags, ())
|
||||||
|
self.assertIsNone(info.created)
|
||||||
|
|
||||||
|
# Pattern doesn't match (filename unaltered)
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
||||||
|
self.assertEqual(info.extension, "pdf")
|
||||||
|
|
||||||
|
# Simple transformation (match all)
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "all")
|
||||||
|
self.assertEqual(info.extension, "gif")
|
||||||
|
|
||||||
|
# Multiple transformations configured (first pattern matches)
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[
|
||||||
|
(all_patt, "all.gif"),
|
||||||
|
(all_patt, "anotherall.gif")]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "all")
|
||||||
|
self.assertEqual(info.extension, "gif")
|
||||||
|
|
||||||
|
# Multiple transformations configured (second pattern matches)
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[
|
||||||
|
(none_patt, "none.gif"),
|
||||||
|
(all_patt, "anotherall.gif")]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "anotherall")
|
||||||
|
self.assertEqual(info.extension, "gif")
|
||||||
|
|
||||||
|
# Complex transformation without date in replacement string
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "0001")
|
||||||
|
self.assertEqual(info.extension, "pdf")
|
||||||
|
self.assertEqual(len(info.tags), 2)
|
||||||
|
self.assertEqual(info.tags[0].slug, "tag1")
|
||||||
|
self.assertEqual(info.tags[1].slug, "tag2")
|
||||||
|
self.assertIsNone(info.created)
|
||||||
|
|
||||||
|
# Complex transformation with date in replacement string
|
||||||
|
with self.settings(
|
||||||
|
FILENAME_PARSE_TRANSFORMS=[
|
||||||
|
(none_patt, "none.gif"),
|
||||||
|
(exact_patt, repl2), # <-- matches
|
||||||
|
(exact_patt, repl1),
|
||||||
|
(all_patt, "all.gif")]):
|
||||||
|
info = FileInfo.from_path(path)
|
||||||
|
self.assertEqual(info.title, "0001")
|
||||||
|
self.assertEqual(info.extension, "pdf")
|
||||||
|
self.assertEqual(len(info.tags), 2)
|
||||||
|
self.assertEqual(info.tags[0].slug, "tag1")
|
||||||
|
self.assertEqual(info.tags[1].slug, "tag2")
|
||||||
|
self.assertEqual(info.created.year, 2019)
|
||||||
|
self.assertEqual(info.created.month, 9)
|
||||||
|
self.assertEqual(info.created.day, 8)
|
||||||
|
@ -10,7 +10,9 @@ For the full list of settings and their values, see
|
|||||||
https://docs.djangoproject.com/en/1.10/ref/settings/
|
https://docs.djangoproject.com/en/1.10/ref/settings/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
@ -322,6 +324,11 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
|||||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||||
|
|
||||||
|
# Transformations applied before filename parsing
|
||||||
|
FILENAME_PARSE_TRANSFORMS = []
|
||||||
|
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||||
|
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||||
|
|
||||||
# Specify for how many years a correspondent is considered recent. Recent
|
# Specify for how many years a correspondent is considered recent. Recent
|
||||||
# correspondents will be shown in a separate "Recent correspondents" filter as
|
# correspondents will be shown in a separate "Recent correspondents" filter as
|
||||||
# well. Set to 0 to disable this filter.
|
# well. Set to 0 to disable this filter.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user