diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index e0664f906..46a96061b 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -2,10 +2,11 @@ import logging import os import shutil import tempfile +from dataclasses import dataclass from functools import lru_cache +from pathlib import Path from typing import List from typing import Optional -from typing import Tuple import magic from django.conf import settings @@ -25,6 +26,42 @@ class BarcodeImageFormatError(Exception): pass +@dataclass(frozen=True) +class Barcode: + """ + Holds the information about a single barcode and its location + """ + + page: int + value: str + + @property + def is_separator(self) -> bool: + """ + Returns True if the barcode value equals the configured separation value, + False otherwise + """ + return self.value == settings.CONSUMER_BARCODE_STRING + + @property + def is_asn(self) -> bool: + """ + Returns True if the barcode value matches the configured ASN prefix, + False otherwise + """ + return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX) + + +@dataclass +class DocumentBarcodeInfo: + """ + Describes a single document's barcode status + """ + + pdf_path: Path + barcodes: List[Barcode] + + @lru_cache(maxsize=8) def supported_file_type(mime_type) -> bool: """ @@ -109,14 +146,14 @@ def convert_from_tiff_to_pdf(filepath: str) -> str: def scan_file_for_barcodes( filepath: str, -) -> Tuple[Optional[str], List[Tuple[int, str]]]: +) -> DocumentBarcodeInfo: """ Scan the provided pdf file for any barcodes - Returns a PDF filepath and a list of - (page_number, barcode_text) tuples + Returns a DocumentBarcodeInfo holding the PDF filepath and + the list of Barcode objects detected in the file """ - def _pikepdf_barcode_scan(pdf_filepath: str): + def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]: detected_barcodes = [] with Pdf.open(pdf_filepath) as pdf: for page_num, page in enumerate(pdf.pages): @@ -135,22 +172,21 @@ def scan_file_for_barcodes( # raise an exception, triggering fallback pillow_img = pdfimage.as_pil_image() - barcodes_on_page = 
barcode_reader(pillow_img) - detected_barcodes.extend( - [(page_num, text) for text in barcodes_on_page], - ) + for barcode_value in barcode_reader(pillow_img): + detected_barcodes.append(Barcode(page_num, barcode_value)) + return detected_barcodes - def _pdf2image_barcode_scan(pdf_filepath: str): + def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: detected_barcodes = [] # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: pages_from_path = convert_from_path(pdf_filepath, output_folder=path) for current_page_number, page in enumerate(pages_from_path): - barcodes_on_page = barcode_reader(page) - detected_barcodes.extend( - [(current_page_number, text) for text in barcodes_on_page], - ) + for barcode_value in barcode_reader(page): + detected_barcodes.append( + Barcode(current_page_number, barcode_value), + ) return detected_barcodes pdf_filepath = None @@ -191,26 +227,22 @@ def scan_file_for_barcodes( f"Unsupported file format for barcode reader: {str(mime_type)}", ) - return pdf_filepath, barcodes + return DocumentBarcodeInfo(pdf_filepath, barcodes) -def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]: +def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]: """ Search the parsed barcodes for separators - and returns a list of pagenumbers, which + and returns a list of page numbers, which separate the file into new files """ # filter all barcodes for the separator string - separator_barcodes = list( - filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes), - ) # get the page numbers of the separating barcodes - separator_page_numbers = [page for page, _ in separator_barcodes] - return separator_page_numbers + return [bc.page for bc in barcodes if bc.is_separator] -def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]: +def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: """ Search the parsed 
barcodes for any ASNs. The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX @@ -219,11 +251,9 @@ def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]: """ asn = None - # only the barcode text is important here -> discard the page number - barcodes = [text for _, text in barcodes] # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX asn_text = next( - (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)), + (x.value for x in barcodes if x.is_asn), None, ) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 7f4c8e125..77b48e33d 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -112,17 +112,20 @@ def consume_file( # read all barcodes in the current document if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: - pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path) + doc_barcode_info = barcodes.scan_file_for_barcodes(path) # split document by separator pages, if enabled if settings.CONSUMER_ENABLE_BARCODES: - separators = barcodes.get_separating_barcodes(parsed_barcodes) + separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes) if len(separators) > 0: logger.debug( f"Pages with separators found in: {str(path)}", ) - document_list = barcodes.separate_pages(pdf_filepath, separators) + document_list = barcodes.separate_pages( + doc_barcode_info.pdf_path, + separators, + ) if document_list: for n, document in enumerate(document_list): @@ -151,10 +154,10 @@ def consume_file( ) # Delete the PDF file which was split - os.remove(pdf_filepath) + os.remove(doc_barcode_info.pdf_path) # If the original was a TIFF, remove the original file as well - if str(pdf_filepath) != str(path): + if str(doc_barcode_info.pdf_path) != str(path): logger.debug(f"Deleting file {path}") os.unlink(path) @@ -181,7 +184,7 @@ def consume_file( # try reading the ASN from barcode if settings.CONSUMER_ENABLE_ASN_BARCODE: - asn = 
barcodes.get_asn_from_barcodes(parsed_barcodes) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) if asn: logger.info(f"Found ASN in barcode: {asn}") diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 15e7efd94..dba4afc99 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -191,26 +191,26 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [0]) def test_scan_file_for_separating_barcodes_none_present(self): test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) def test_scan_file_for_separating_barcodes3(self): @@ -218,14 +218,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) def test_scan_file_for_separating_barcodes4(self): @@ 
-233,14 +233,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "several-patcht-codes.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [2, 5]) def test_scan_file_for_separating_barcodes_upsidedown(self): @@ -248,14 +248,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t-middle_reverse.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) def test_scan_file_for_barcodes_pillow_transcode_error(self): @@ -312,7 +312,7 @@ class TestBarcode(DirectoriesMixin, TestCase): with mock.patch("documents.barcodes.barcode_reader") as reader: reader.return_value = list() - _, _ = barcodes.scan_file_for_barcodes( + _ = barcodes.scan_file_for_barcodes( str(device_n_pdf.name), ) @@ -331,14 +331,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-fax-image.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) def test_scan_file_for_separating_qr_barcodes(self): @@ -346,14 +346,14 @@ 
class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t-qr.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [0]) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @@ -362,14 +362,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [0]) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @@ -378,14 +378,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [0]) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @@ -394,14 +394,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-128-custom.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - 
parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [0]) def test_scan_file_for_separating_wrong_qr_barcodes(self): @@ -409,14 +409,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) def test_separate_pages(self): @@ -507,14 +507,14 @@ class TestBarcode(DirectoriesMixin, TestCase): ) tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(test_file, pdf_file) + self.assertEqual(test_file, doc_barcode_info.pdf_path) self.assertTrue(len(separator_page_numbers) > 0) document_list = barcodes.separate_pages(test_file, separator_page_numbers) @@ -622,14 +622,14 @@ class TestBarcode(DirectoriesMixin, TestCase): - Scanning handle the exception without exception """ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) separator_page_numbers = barcodes.get_separating_barcodes( - parsed_barcodes, + doc_barcode_info.barcodes, ) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) def 
test_scan_file_for_asn_barcode(self): @@ -637,12 +637,12 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-39-asn-123.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) - asn = barcodes.get_asn_from_barcodes(parsed_barcodes) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, 123) def test_scan_file_for_asn_not_existing(self): @@ -650,12 +650,12 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) - asn = barcodes.get_asn_from_barcodes(parsed_barcodes) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, None) def test_scan_file_for_asn_barcode_invalid(self): @@ -663,13 +663,13 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-39-asn-invalid.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) - asn = barcodes.get_asn_from_barcodes(parsed_barcodes) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, None) @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") @@ -678,10 +678,10 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "barcode-39-asn-custom-prefix.pdf", ) - pdf_file, parsed_barcodes = barcodes.scan_file_for_barcodes( + doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) - asn = 
barcodes.get_asn_from_barcodes(parsed_barcodes) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - self.assertEqual(pdf_file, test_file) + self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, 123)