diff --git a/docs/configuration.md b/docs/configuration.md index f473921cb..e99e0a085 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1173,6 +1173,55 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0. Defaults to "300" +#### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE} + +: Enables the detection of barcodes in the scanned document and +assigns or creates tags if a properly formatted barcode is detected. + + The barcode must match one of the (configurable) regular expressions. + If the barcode text contains ',' (comma), it is split into multiple + barcodes which are individually processed for tagging. + + Matching is case-insensitive. + + Defaults to false. + +#### [`PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING=`](#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING) {#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING} + +: Defines a dictionary of filter regex and substitute expressions. + + Syntax: {"<regex>": "<sub>" [,...]} + + A barcode is considered for tagging if the barcode text matches + at least one of the provided patterns. + + If a match is found, the rule is applied. This allows very + versatile reformatting and mapping of barcode patterns to tag values. + + If a tag is not found it will be created. + + Defaults to: + + {"TAG:(.*)": "\\g<1>"} which defines + - a regex TAG:(.*) which includes barcodes beginning with TAG: + followed by any text that gets stored into match group #1 and + - a substitute \\g<1> that replaces the original barcode text + by the content in match group #1. + Consequently, the tag is the barcode text without its TAG: prefix. + + More examples: + + {"ASN12.*": "JOHN", "ASN13.*": "SMITH"} for example maps + - ASN12nnnn barcodes to the tag JOHN and + - ASN13nnnn barcodes to the tag SMITH. + + {"T-J": "JOHN", "T-S": "SMITH", "T-D": "DOE"} directly maps + - T-J barcodes to the tag JOHN, + - T-S barcodes to the tag SMITH and + - T-D barcodes to the tag DOE. 
+ + Please refer to the Python regex documentation for more information. + ## Audit Trail #### [`PAPERLESS_AUDIT_LOG_ENABLED=`](#PAPERLESS_AUDIT_LOG_ENABLED) {#PAPERLESS_AUDIT_LOG_ENABLED} diff --git a/paperless.conf.example b/paperless.conf.example index 1610dcda9..db557a7b6 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -68,6 +68,8 @@ #PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT #PAPERLESS_CONSUMER_BARCODE_UPSCALE=0.0 #PAPERLESS_CONSUMER_BARCODE_DPI=300 +#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=false +#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"} #PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=false #PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided #PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 606451f84..4bfb9b791 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -14,6 +14,7 @@ from PIL import Image from documents.converters import convert_from_tiff_to_pdf from documents.data_models import ConsumableDocument +from documents.models import Tag from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import StopConsumeTaskError from documents.plugins.helpers import ProgressStatusOptions @@ -65,7 +66,9 @@ class BarcodePlugin(ConsumeTaskPlugin): supported_mimes = {"application/pdf"} return ( - settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES + settings.CONSUMER_ENABLE_ASN_BARCODE + or settings.CONSUMER_ENABLE_BARCODES + or settings.CONSUMER_ENABLE_TAG_BARCODE ) and self.input_doc.mime_type in supported_mimes def setup(self): @@ -90,6 +93,16 @@ class BarcodePlugin(ConsumeTaskPlugin): logger.info(f"Found ASN in barcode: {located_asn}") self.metadata.asn = located_asn + # try reading tags from barcodes + if settings.CONSUMER_ENABLE_TAG_BARCODE: + tags = self.tags + if tags is not None and len(tags) > 0: + if self.metadata.tag_ids: + self.metadata.tag_ids += tags + 
else: + self.metadata.tag_ids = tags + logger.info(f"Found tags in barcode: {tags}") + separator_pages = self.get_separation_pages() if not separator_pages: return "No pages to split on!" @@ -279,6 +292,53 @@ class BarcodePlugin(ConsumeTaskPlugin): return asn + @property + def tags(self) -> Optional[list[int]]: + """ + Search the parsed barcodes for any tags. + Returns the detected tag ids (or empty list) + """ + tags = [] + + # Ensure the barcodes have been read + self.detect() + + for x in self.barcodes: + tag_texts = x.value + + for raw in tag_texts.split(","): + try: + tag = None + for regex in settings.CONSUMER_TAG_BARCODE_MAPPING: + if re.match(regex, raw, flags=re.IGNORECASE): + sub = settings.CONSUMER_TAG_BARCODE_MAPPING[regex] + tag = ( + re.sub(regex, sub, raw, flags=re.IGNORECASE) + if sub + else raw + ) + break + + if tag: + tag = Tag.objects.get_or_create( + name__iexact=tag, + defaults={"name": tag}, + )[0] + + logger.debug( + f"Found Tag Barcode '{raw}', substituted " + f"to '{tag}' and mapped to " + f"tag #{tag.pk}.", + ) + tags.append(tag.pk) + + except Exception as e: + logger.error( + f"Failed to find or create TAG '{raw}' because: {e}", + ) + + return tags + def get_separation_pages(self) -> dict[int, bool]: """ Search the parsed barcodes for separators and returns a dict of page diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 4552a2b77..3dd6d62ff 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -14,6 +14,7 @@ from documents.barcodes import BarcodePlugin from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource +from documents.models import Tag from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DocumentConsumeDelayMixin from documents.tests.utils import DummyProgressManager @@ -741,3 +742,125 @@ class 
TestBarcodeZxing(TestBarcode): @override_settings(CONSUMER_BARCODE_SCANNER="ZXING") class TestAsnBarcodesZxing(TestAsnBarcode): pass + + +class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase): + @contextmanager + def get_reader(self, filepath: Path) -> BarcodePlugin: + reader = BarcodePlugin( + ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath), + DocumentMetadataOverrides(), + DummyProgressManager(filepath.name, None), + self.dirs.scratch_dir, + "task-id", + ) + reader.setup() + yield reader + reader.cleanup() + + @override_settings(CONSUMER_ENABLE_TAG_BARCODE=True) + def test_scan_file_without_matching_barcodes(self): + """ + GIVEN: + - PDF containing tag barcodes but none with matching prefix (default "TAG:") + WHEN: + - File is scanned for barcodes + THEN: + - No TAG has been created + """ + test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" + with self.get_reader(test_file) as reader: + reader.run() + tags = reader.metadata.tag_ids + self.assertEqual(tags, None) + + @override_settings( + CONSUMER_ENABLE_TAG_BARCODE=False, + CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"}, + ) + def test_scan_file_with_matching_barcode_but_function_disabled(self): + """ + GIVEN: + - PDF containing a tag barcode with matching custom prefix + - The tag barcode functionality is disabled + WHEN: + - File is scanned for barcodes + THEN: + - No TAG has been created + """ + test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" + with self.get_reader(test_file) as reader: + reader.run() + tags = reader.metadata.tag_ids + self.assertEqual(tags, None) + + @override_settings( + CONSUMER_ENABLE_TAG_BARCODE=True, + CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"}, + ) + def test_scan_file_for_tag_custom_prefix(self): + """ + GIVEN: + - PDF containing a tag barcode with custom prefix + - The barcode mapping accepts this prefix and removes it from the mapped tag value + - 
The created tag is the non-prefixed values + WHEN: + - File is scanned for barcodes + THEN: + - The TAG is located + - One TAG has been created + """ + test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" + with self.get_reader(test_file) as reader: + reader.metadata.tag_ids = [99] + reader.run() + self.assertEqual(reader.pdf_file, test_file) + tags = reader.metadata.tag_ids + self.assertEqual(len(tags), 2) + self.assertEqual(tags[0], 99) + self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[1]) + + @override_settings( + CONSUMER_ENABLE_TAG_BARCODE=True, + CONSUMER_TAG_BARCODE_MAPPING={"ASN(.*)": "\\g<1>"}, + ) + def test_scan_file_for_many_custom_tags(self): + """ + GIVEN: + - PDF containing multiple tag barcode with custom prefix + - The barcode mapping accepts this prefix and removes it from the mapped tag value + - The created tags are the non-prefixed values + WHEN: + - File is scanned for barcodes + THEN: + - The TAG is located + - File Tags have been created + """ + test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf" + with self.get_reader(test_file) as reader: + reader.run() + tags = reader.metadata.tag_ids + self.assertEqual(len(tags), 5) + self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[0]) + self.assertEqual(Tag.objects.get(name__iexact="00124").pk, tags[1]) + self.assertEqual(Tag.objects.get(name__iexact="00125").pk, tags[2]) + self.assertEqual(Tag.objects.get(name__iexact="00126").pk, tags[3]) + self.assertEqual(Tag.objects.get(name__iexact="00127").pk, tags[4]) + + @override_settings( + CONSUMER_ENABLE_TAG_BARCODE=True, + CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<3>"}, + ) + def test_scan_file_for_tag_raises_value_error(self): + """ + GIVEN: + - Any error occurs during tag barcode processing + THEN: + - The processing should be skipped and not break the import + """ + test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" + with self.get_reader(test_file) as 
reader: + reader.run() + # expect error to be caught and logged only + tags = reader.metadata.tag_ids + self.assertEqual(tags, None) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 7179f0358..4f7894acc 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -853,6 +853,19 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float( CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300) +CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean( + "PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE", +) + +CONSUMER_TAG_BARCODE_MAPPING = dict( + json.loads( + os.getenv( + "PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING", + '{"TAG:(.*)": "\\\\g<1>"}', + ), + ), +) + CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED", )