diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 6e3ecfe05..9adb8aeea 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -5,6 +5,7 @@ import tempfile from dataclasses import dataclass from functools import lru_cache from pathlib import Path +from typing import Dict from typing import List from typing import Optional @@ -201,16 +202,25 @@ def scan_file_for_barcodes( return DocumentBarcodeInfo(pdf_filepath, barcodes) -def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]: +def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]: """ Search the parsed barcodes for separators - and returns a list of page numbers, which - separate the file into new files. + and returns a dict of page numbers, which + separate the file into new files, together + with the information whether to keep the page. """ # filter all barcodes for the separator string # get the page numbers of the separating barcodes + separator_pages = {bc.page: False for bc in barcodes if bc.is_separator} + if not settings.CONSUMER_ENABLE_ASN_BARCODE: + return separator_pages - return list({bc.page for bc in barcodes if bc.is_separator}) + # add the page numbers of the ASN barcodes + # (except for first page, that might lead to infinite loops). + return { + **separator_pages, + **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0}, + } def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: @@ -242,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: return asn -def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: +def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]: """ Separate the provided pdf file on the pages_to_split_on. - The pages which are defined by page_numbers will be removed. + The pages which are defined by the keys in pages_to_split_on + will be removed if the corresponding value is False. 
Returns a list of (temporary) filepaths to consume. These will need to be deleted later. """ @@ -261,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: fname = os.path.splitext(os.path.basename(filepath))[0] pdf = Pdf.open(filepath) + # Start with an empty document + current_document: List[Page] = [] # A list of documents, ie a list of lists of pages - documents: List[List[Page]] = [] - # A single document, ie a list of pages - document: List[Page] = [] + documents: List[List[Page]] = [current_document] for idx, page in enumerate(pdf.pages): # Keep building the new PDF as long as it is not a # separator index if idx not in pages_to_split_on: - document.append(page) - # Make sure to append the very last document to the documents - if idx == (len(pdf.pages) - 1): - documents.append(document) - document = [] - else: - # This is a split index, save the current PDF pages, and restart - # a new destination page listing - logger.debug(f"Starting new document at idx {idx}") - documents.append(document) - document = [] + current_document.append(page) + continue + + # This is a split index + # Start a new destination page listing + logger.debug(f"Starting new document at idx {idx}") + current_document = [] + documents.append(current_document) + keep_page = pages_to_split_on[idx] + if keep_page: + # Keep the page + # (new document is started by asn barcode) + current_document.append(page) documents = [x for x in documents if len(x)] diff --git a/src/documents/tests/samples/barcodes/split-by-asn-1.pdf b/src/documents/tests/samples/barcodes/split-by-asn-1.pdf new file mode 100644 index 000000000..82374b9d2 Binary files /dev/null and b/src/documents/tests/samples/barcodes/split-by-asn-1.pdf differ diff --git a/src/documents/tests/samples/barcodes/split-by-asn-2.pdf b/src/documents/tests/samples/barcodes/split-by-asn-2.pdf new file mode 100644 index 000000000..05cc16abe Binary files /dev/null and 
b/src/documents/tests/samples/barcodes/split-by-asn-2.pdf differ diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 4f7f1278a..86c53755b 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -294,7 +294,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) + self.assertDictEqual(separator_page_numbers, {0: False}) def test_scan_file_for_separating_barcodes_none_present(self): """ @@ -314,7 +314,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) + self.assertDictEqual(separator_page_numbers, {}) def test_scan_file_for_separating_barcodes_middle_page(self): """ @@ -337,7 +337,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) + self.assertDictEqual(separator_page_numbers, {1: False}) def test_scan_file_for_separating_barcodes_multiple_pages(self): """ @@ -360,7 +360,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [2, 5]) + self.assertDictEqual(separator_page_numbers, {2: False, 5: False}) def test_scan_file_for_separating_barcodes_upside_down(self): """ @@ -384,7 +384,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) + self.assertDictEqual(separator_page_numbers, {1: False}) def test_scan_file_for_separating_barcodes_fax_decode(self): """ @@ -407,7 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) + 
self.assertDictEqual(separator_page_numbers, {1: False}) def test_scan_file_for_separating_qr_barcodes(self): """ @@ -431,7 +431,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_barcodes(self): @@ -456,7 +456,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_qr_barcodes(self): @@ -482,7 +482,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_128_barcodes(self): @@ -508,7 +508,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) + self.assertDictEqual(separator_page_numbers, {0: False}) def test_scan_file_for_separating_wrong_qr_barcodes(self): """ @@ -533,7 +533,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) + self.assertDictEqual(separator_page_numbers, {}) @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC") def test_scan_file_for_separating_qr_barcodes(self): @@ -558,7 +558,7 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertGreater(len(doc_barcode_info.barcodes), 0) - self.assertListEqual(separator_page_numbers, [1]) + 
self.assertDictEqual(separator_page_numbers, {1: False}) def test_separate_pages(self): """ @@ -573,7 +573,7 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - documents = barcodes.separate_pages(test_file, [1]) + documents = barcodes.separate_pages(test_file, {1: False}) self.assertEqual(len(documents), 2) @@ -591,7 +591,7 @@ class TestBarcode(DirectoriesMixin, TestCase): self.BARCODE_SAMPLE_DIR, "patch-code-t-double.pdf", ) - pages = barcodes.separate_pages(test_file, [1, 2]) + pages = barcodes.separate_pages(test_file, {1: False, 2: False}) self.assertEqual(len(pages), 2) @@ -610,7 +610,7 @@ class TestBarcode(DirectoriesMixin, TestCase): "patch-code-t-middle.pdf", ) with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - pages = barcodes.separate_pages(test_file, []) + pages = barcodes.separate_pages(test_file, {}) self.assertEqual(pages, []) self.assertEqual( cm.output, @@ -858,7 +858,88 @@ class TestBarcode(DirectoriesMixin, TestCase): ) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) + self.assertDictEqual(separator_page_numbers, {}) + + @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_ENABLE_ASN_BARCODE=True, + ) + def test_separate_pages_by_asn_barcodes_and_patcht(self): + """ + GIVEN: + - Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11 + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced, split correctly by correct pages + """ + test_file = os.path.join( + os.path.dirname(__file__), + self.BARCODE_SAMPLE_DIR, + "split-by-asn-2.pdf", + ) + + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(test_file, doc_barcode_info.pdf_path) + self.assertDictEqual( + separator_page_numbers, + { + 2: False, + 4: True, + 5: True, + 8: True, 
+ 10: True, + }, + ) + + document_list = barcodes.separate_pages(test_file, separator_page_numbers) + self.assertEqual(len(document_list), 6) + + @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_ENABLE_ASN_BARCODE=True, + ) + def test_separate_pages_by_asn_barcodes(self): + """ + GIVEN: + - Input PDF with ASN barcodes on pages 1,3,4,7,9 + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced, split correctly by correct pages + """ + test_file = os.path.join( + os.path.dirname(__file__), + self.BARCODE_SAMPLE_DIR, + "split-by-asn-1.pdf", + ) + + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(test_file, doc_barcode_info.pdf_path) + self.assertDictEqual( + separator_page_numbers, + { + 2: True, + 3: True, + 6: True, + 8: True, + }, + ) + + document_list = barcodes.separate_pages(test_file, separator_page_numbers) + self.assertEqual(len(document_list), 5) class TestAsnBarcodes(DirectoriesMixin, TestCase):