mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #542 from grembo/master
Allow configuring transformations to be applied to the filename before
This commit is contained in:
commit
1c956652f3
@ -54,6 +54,34 @@ filename as described above.
|
||||
|
||||
.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
|
||||
|
||||
Transforming filenames for parsing
|
||||
----------------------------------
|
||||
Some devices can't produce filenames that can be parsed by the default
|
||||
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
|
||||
``paperless.conf`` one can add transformations that are applied to the filename
|
||||
before it's parsed.
|
||||
|
||||
The option contains a list of dictionaries of regular expressions (key:
|
||||
``pattern``) and replacements (key: ``repl``) in JSON format, which are
|
||||
applied in order by passing them to ``re.subn``. Transformation stops
|
||||
after the first match, so at most one transformation is applied. The general
|
||||
syntax is
|
||||
|
||||
.. code:: python
|
||||
|
||||
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
|
||||
|
||||
The example below is for a Brother ADS-2400N, a scanner that allows
|
||||
different names to different hardware buttons (useful for handling
|
||||
multiple entities in one instance), but insists on adding ``_<count>``
|
||||
to the filename.
|
||||
|
||||
.. code:: python
|
||||
|
||||
# Brother profile configuration, support "Name_Date_Count" (the default
|
||||
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
|
||||
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
|
||||
|
||||
.. _guesswork-content:
|
||||
|
||||
Reading the Document Contents
|
||||
|
@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
|
||||
# as normal.
|
||||
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
|
||||
|
||||
# Sometimes devices won't create filenames which can be parsed properly
|
||||
# by the filename parser (see
|
||||
# https://paperless.readthedocs.io/en/latest/guesswork.html).
|
||||
#
|
||||
# This setting allows to specify a list of transformations
|
||||
# in regular expression syntax, which are passed in order to re.sub.
|
||||
# Transformation stops after the first match, so at most one transformation
|
||||
# is applied.
|
||||
#
|
||||
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
|
||||
# as keys.
|
||||
#
|
||||
# The example below transforms filenames created by a Brother ADS-2400N
|
||||
# document scanner in its standard configuration `Name_Date_Count', so that
|
||||
# count is used as title, name as tag and date can be parsed by paperless.
|
||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
|
||||
|
||||
#
|
||||
# The following values use sensible defaults for modern systems, but if you're
|
||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
||||
|
@ -483,8 +483,18 @@ class FileInfo:
|
||||
"<title>.<suffix>"
|
||||
"""
|
||||
|
||||
filename = os.path.basename(path)
|
||||
|
||||
# Mutate filename in-place before parsing its components
|
||||
# by applying at most one of the configured transformations.
|
||||
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
||||
(filename, count) = pattern.subn(repl, filename)
|
||||
if count:
|
||||
break
|
||||
|
||||
# Parse filename components.
|
||||
for regex in cls.REGEXES.values():
|
||||
m = regex.match(os.path.basename(path))
|
||||
m = regex.match(filename)
|
||||
if m:
|
||||
properties = m.groupdict()
|
||||
cls._mangle_property(properties, "created")
|
||||
|
@ -1,3 +1,5 @@
|
||||
import re
|
||||
|
||||
from django.test import TestCase
|
||||
from unittest import mock
|
||||
from tempfile import TemporaryDirectory
|
||||
@ -372,3 +374,79 @@ class TestFieldPermutations(TestCase):
|
||||
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
|
||||
self.assertEqual(info.title, "title")
|
||||
self.assertIsNone(info.created)
|
||||
|
||||
def test_filename_parse_transforms(self):
|
||||
|
||||
path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
|
||||
all_patt = re.compile("^.*$")
|
||||
none_patt = re.compile("$a")
|
||||
exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
|
||||
repl1 = " - \\4 - \\1." # (empty) corrspondent, title and tags
|
||||
repl2 = "\\2Z - " + repl1 # creation date + repl1
|
||||
|
||||
# No transformations configured (= default)
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
||||
self.assertEqual(info.extension, "pdf")
|
||||
self.assertEqual(info.tags, ())
|
||||
self.assertIsNone(info.created)
|
||||
|
||||
# Pattern doesn't match (filename unaltered)
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
|
||||
self.assertEqual(info.extension, "pdf")
|
||||
|
||||
# Simple transformation (match all)
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "all")
|
||||
self.assertEqual(info.extension, "gif")
|
||||
|
||||
# Multiple transformations configured (first pattern matches)
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[
|
||||
(all_patt, "all.gif"),
|
||||
(all_patt, "anotherall.gif")]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "all")
|
||||
self.assertEqual(info.extension, "gif")
|
||||
|
||||
# Multiple transformations configured (second pattern matches)
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[
|
||||
(none_patt, "none.gif"),
|
||||
(all_patt, "anotherall.gif")]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "anotherall")
|
||||
self.assertEqual(info.extension, "gif")
|
||||
|
||||
# Complex transformation without date in replacement string
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "0001")
|
||||
self.assertEqual(info.extension, "pdf")
|
||||
self.assertEqual(len(info.tags), 2)
|
||||
self.assertEqual(info.tags[0].slug, "tag1")
|
||||
self.assertEqual(info.tags[1].slug, "tag2")
|
||||
self.assertIsNone(info.created)
|
||||
|
||||
# Complex transformation with date in replacement string
|
||||
with self.settings(
|
||||
FILENAME_PARSE_TRANSFORMS=[
|
||||
(none_patt, "none.gif"),
|
||||
(exact_patt, repl2), # <-- matches
|
||||
(exact_patt, repl1),
|
||||
(all_patt, "all.gif")]):
|
||||
info = FileInfo.from_path(path)
|
||||
self.assertEqual(info.title, "0001")
|
||||
self.assertEqual(info.extension, "pdf")
|
||||
self.assertEqual(len(info.tags), 2)
|
||||
self.assertEqual(info.tags[0].slug, "tag1")
|
||||
self.assertEqual(info.tags[1].slug, "tag2")
|
||||
self.assertEqual(info.created.year, 2019)
|
||||
self.assertEqual(info.created.month, 9)
|
||||
self.assertEqual(info.created.day, 8)
|
||||
|
@ -10,7 +10,9 @@ For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/1.10/ref/settings/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@ -322,6 +324,11 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
# Transformations applied before filename parsing
|
||||
FILENAME_PARSE_TRANSFORMS = []
|
||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||
|
||||
# Specify for how many years a correspondent is considered recent. Recent
|
||||
# correspondents will be shown in a separate "Recent correspondents" filter as
|
||||
# well. Set to 0 to disable this filter.
|
||||
|
Loading…
x
Reference in New Issue
Block a user