Feature: Allow tagging by putting barcode stickers on documents (#5580)

This commit is contained in:
pkrahmer 2024-02-05 18:38:19 +01:00 committed by GitHub
parent c7e0c32226
commit fb82aa0ee1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 248 additions and 1 deletions

View File

@ -1173,6 +1173,55 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0.
Defaults to "300"
#### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=<bool>`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE}
: Enables the detection of barcodes in the scanned document and
assigns or creates tags if a properly formatted barcode is detected.
The barcode must match one of the (configurable) regular expressions.
If the barcode text contains ',' (comma), it is split into multiple
barcodes which are individually processed for tagging.
Matching is case insensitive.
Defaults to false.
#### [`PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING=<json dict>`](#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING) {#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING}
: Defines a dictionary of filter regex and substitute expressions.
Syntax: {"<regex>": "<substitute>" [,...]]}
A barcode is considered for tagging if the barcode text matches
at least one of the provided <regex> pattern.
If a match is found, the <substitute> rule is applied. This allows very
versatile reformatting and mapping of barcode pattern to tag values.
If a tag is not found it will be created.
Defaults to:
{"TAG:(.*)": "\\g<1>"} which defines
- a regex TAG:(.*) which includes barcodes beginning with TAG:
followed by any text that gets stored into match group #1 and
- a substitute \\g<1> that replaces the original barcode text
by the content in match group #1.
Consequently, the tag is the barcode text without its TAG: prefix.
More examples:
{"ASN12.*": "JOHN", "ASN13.*": "SMITH"} for example maps
- ASN12nnnn barcodes to the tag JOHN and
- ASN13nnnn barcodes to the tag SMITH.
{"T-J": "JOHN", "T-S": "SMITH", "T-D": "DOE"} directly maps
- T-J barcodes to the tag JOHN,
- T-S barcodes to the tag SMITH and
- T-D barcodes to the tag DOE.
Please refer to the Python regex documentation for more information.
## Audit Trail
#### [`PAPERLESS_AUDIT_LOG_ENABLED=<bool>`](#PAPERLESS_AUDIT_LOG_ENABLED) {#PAPERLESS_AUDIT_LOG_ENABLED}

View File

@ -68,6 +68,8 @@
#PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
#PAPERLESS_CONSUMER_BARCODE_UPSCALE=0.0
#PAPERLESS_CONSUMER_BARCODE_DPI=300
#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=false
#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"}
#PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=false
#PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided
#PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false

View File

@ -14,6 +14,7 @@ from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions
@ -65,7 +66,9 @@ class BarcodePlugin(ConsumeTaskPlugin):
supported_mimes = {"application/pdf"}
return (
settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
settings.CONSUMER_ENABLE_ASN_BARCODE
or settings.CONSUMER_ENABLE_BARCODES
or settings.CONSUMER_ENABLE_TAG_BARCODE
) and self.input_doc.mime_type in supported_mimes
def setup(self):
@ -90,6 +93,16 @@ class BarcodePlugin(ConsumeTaskPlugin):
logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn
# try reading tags from barcodes
if settings.CONSUMER_ENABLE_TAG_BARCODE:
tags = self.tags
if tags is not None and len(tags) > 0:
if self.metadata.tag_ids:
self.metadata.tag_ids += tags
else:
self.metadata.tag_ids = tags
logger.info(f"Found tags in barcode: {tags}")
separator_pages = self.get_separation_pages()
if not separator_pages:
return "No pages to split on!"
@ -279,6 +292,53 @@ class BarcodePlugin(ConsumeTaskPlugin):
return asn
@property
def tags(self) -> Optional[list[int]]:
"""
Search the parsed barcodes for any tags.
Returns the detected tag ids (or empty list)
"""
tags = []
# Ensure the barcodes have been read
self.detect()
for x in self.barcodes:
tag_texts = x.value
for raw in tag_texts.split(","):
try:
tag = None
for regex in settings.CONSUMER_TAG_BARCODE_MAPPING:
if re.match(regex, raw, flags=re.IGNORECASE):
sub = settings.CONSUMER_TAG_BARCODE_MAPPING[regex]
tag = (
re.sub(regex, sub, raw, flags=re.IGNORECASE)
if sub
else raw
)
break
if tag:
tag = Tag.objects.get_or_create(
name__iexact=tag,
defaults={"name": tag},
)[0]
logger.debug(
f"Found Tag Barcode '{raw}', substituted "
f"to '{tag}' and mapped to "
f"tag #{tag.pk}.",
)
tags.append(tag.pk)
except Exception as e:
logger.error(
f"Failed to find or create TAG '{raw}' because: {e}",
)
return tags
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page

View File

@ -14,6 +14,7 @@ from documents.barcodes import BarcodePlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
from documents.tests.utils import DummyProgressManager
@ -741,3 +742,125 @@ class TestBarcodeZxing(TestBarcode):
@override_settings(CONSUMER_BARCODE_SCANNER="ZXING")
class TestAsnBarcodesZxing(TestAsnBarcode):
pass
class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase):
@contextmanager
def get_reader(self, filepath: Path) -> BarcodePlugin:
reader = BarcodePlugin(
ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
DocumentMetadataOverrides(),
DummyProgressManager(filepath.name, None),
self.dirs.scratch_dir,
"task-id",
)
reader.setup()
yield reader
reader.cleanup()
@override_settings(CONSUMER_ENABLE_TAG_BARCODE=True)
def test_scan_file_without_matching_barcodes(self):
"""
GIVEN:
- PDF containing tag barcodes but none with matching prefix (default "TAG:")
WHEN:
- File is scanned for barcodes
THEN:
- No TAG has been created
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
with self.get_reader(test_file) as reader:
reader.run()
tags = reader.metadata.tag_ids
self.assertEqual(tags, None)
@override_settings(
CONSUMER_ENABLE_TAG_BARCODE=False,
CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"},
)
def test_scan_file_with_matching_barcode_but_function_disabled(self):
"""
GIVEN:
- PDF containing a tag barcode with matching custom prefix
- The tag barcode functionality is disabled
WHEN:
- File is scanned for barcodes
THEN:
- No TAG has been created
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
with self.get_reader(test_file) as reader:
reader.run()
tags = reader.metadata.tag_ids
self.assertEqual(tags, None)
@override_settings(
CONSUMER_ENABLE_TAG_BARCODE=True,
CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"},
)
def test_scan_file_for_tag_custom_prefix(self):
"""
GIVEN:
- PDF containing a tag barcode with custom prefix
- The barcode mapping accepts this prefix and removes it from the mapped tag value
- The created tag is the non-prefixed values
WHEN:
- File is scanned for barcodes
THEN:
- The TAG is located
- One TAG has been created
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
with self.get_reader(test_file) as reader:
reader.metadata.tag_ids = [99]
reader.run()
self.assertEqual(reader.pdf_file, test_file)
tags = reader.metadata.tag_ids
self.assertEqual(len(tags), 2)
self.assertEqual(tags[0], 99)
self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[1])
@override_settings(
CONSUMER_ENABLE_TAG_BARCODE=True,
CONSUMER_TAG_BARCODE_MAPPING={"ASN(.*)": "\\g<1>"},
)
def test_scan_file_for_many_custom_tags(self):
"""
GIVEN:
- PDF containing multiple tag barcode with custom prefix
- The barcode mapping accepts this prefix and removes it from the mapped tag value
- The created tags are the non-prefixed values
WHEN:
- File is scanned for barcodes
THEN:
- The TAG is located
- File Tags have been created
"""
test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf"
with self.get_reader(test_file) as reader:
reader.run()
tags = reader.metadata.tag_ids
self.assertEqual(len(tags), 5)
self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[0])
self.assertEqual(Tag.objects.get(name__iexact="00124").pk, tags[1])
self.assertEqual(Tag.objects.get(name__iexact="00125").pk, tags[2])
self.assertEqual(Tag.objects.get(name__iexact="00126").pk, tags[3])
self.assertEqual(Tag.objects.get(name__iexact="00127").pk, tags[4])
@override_settings(
CONSUMER_ENABLE_TAG_BARCODE=True,
CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<3>"},
)
def test_scan_file_for_tag_raises_value_error(self):
"""
GIVEN:
- Any error occurs during tag barcode processing
THEN:
- The processing should be skipped and not break the import
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
with self.get_reader(test_file) as reader:
reader.run()
# expect error to be caught and logged only
tags = reader.metadata.tag_ids
self.assertEqual(tags, None)

View File

@ -853,6 +853,19 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE",
)
CONSUMER_TAG_BARCODE_MAPPING = dict(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING",
'{"TAG:(.*)": "\\\\g<1>"}',
),
),
)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
)