mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: split documents on ASN barcode (#2554)
* also split documents when an ASN barcode is found
* linter
* fix test case parameters
* avoid pre-python-3.9 features
* simplify dict-creation in tests
* simplify dict-creation in tests for empty dicts
* Add test cases for the splitting by ASN barcode feature
* deleted supporting files for test case construction
This commit is contained in:
parent
61f7e73961
commit
658d372cd2
@ -5,6 +5,7 @@ import tempfile
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Dict
|
||||||
from typing import List
|
from typing import List
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@ -201,16 +202,25 @@ def scan_file_for_barcodes(
|
|||||||
return DocumentBarcodeInfo(pdf_filepath, barcodes)
|
return DocumentBarcodeInfo(pdf_filepath, barcodes)
|
||||||
|
|
||||||
|
|
||||||
def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
|
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
|
||||||
"""
|
"""
|
||||||
Search the parsed barcodes for separators
|
Search the parsed barcodes for separators
|
||||||
and returns a list of page numbers, which
|
and returns a dict of page numbers, which
|
||||||
separate the file into new files.
|
separate the file into new files, together
|
||||||
|
with the information whether to keep the page.
|
||||||
"""
|
"""
|
||||||
# filter all barcodes for the separator string
|
# filter all barcodes for the separator string
|
||||||
# get the page numbers of the separating barcodes
|
# get the page numbers of the separating barcodes
|
||||||
|
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
|
||||||
|
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
|
||||||
|
return separator_pages
|
||||||
|
|
||||||
return list({bc.page for bc in barcodes if bc.is_separator})
|
# add the page numbers of the ASN barcodes
|
||||||
|
# (except for first page, that might lead to infinite loops).
|
||||||
|
return {
|
||||||
|
**separator_pages,
|
||||||
|
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
|
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
|
||||||
@ -242,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
|
|||||||
return asn
|
return asn
|
||||||
|
|
||||||
|
|
||||||
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Separate the provided pdf file on the pages_to_split_on.
|
Separate the provided pdf file on the pages_to_split_on.
|
||||||
The pages which are defined by page_numbers will be removed.
|
The pages which are defined by the keys in pages_to_split_on
|
||||||
|
will be removed if the corresponding value is false.
|
||||||
Returns a list of (temporary) filepaths to consume.
|
Returns a list of (temporary) filepaths to consume.
|
||||||
These will need to be deleted later.
|
These will need to be deleted later.
|
||||||
"""
|
"""
|
||||||
@ -261,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
|||||||
fname = os.path.splitext(os.path.basename(filepath))[0]
|
fname = os.path.splitext(os.path.basename(filepath))[0]
|
||||||
pdf = Pdf.open(filepath)
|
pdf = Pdf.open(filepath)
|
||||||
|
|
||||||
|
# Start with an empty document
|
||||||
|
current_document: List[Page] = []
|
||||||
# A list of documents, ie a list of lists of pages
|
# A list of documents, ie a list of lists of pages
|
||||||
documents: List[List[Page]] = []
|
documents: List[List[Page]] = [current_document]
|
||||||
# A single document, ie a list of pages
|
|
||||||
document: List[Page] = []
|
|
||||||
|
|
||||||
for idx, page in enumerate(pdf.pages):
|
for idx, page in enumerate(pdf.pages):
|
||||||
# Keep building the new PDF as long as it is not a
|
# Keep building the new PDF as long as it is not a
|
||||||
# separator index
|
# separator index
|
||||||
if idx not in pages_to_split_on:
|
if idx not in pages_to_split_on:
|
||||||
document.append(page)
|
current_document.append(page)
|
||||||
# Make sure to append the very last document to the documents
|
continue
|
||||||
if idx == (len(pdf.pages) - 1):
|
|
||||||
documents.append(document)
|
# This is a split index
|
||||||
document = []
|
# Start a new destination page listing
|
||||||
else:
|
logger.debug(f"Starting new document at idx {idx}")
|
||||||
# This is a split index, save the current PDF pages, and restart
|
current_document = []
|
||||||
# a new destination page listing
|
documents.append(current_document)
|
||||||
logger.debug(f"Starting new document at idx {idx}")
|
keep_page = pages_to_split_on[idx]
|
||||||
documents.append(document)
|
if keep_page:
|
||||||
document = []
|
# Keep the page
|
||||||
|
# (new document is started by asn barcode)
|
||||||
|
current_document.append(page)
|
||||||
|
|
||||||
documents = [x for x in documents if len(x)]
|
documents = [x for x in documents if len(x)]
|
||||||
|
|
||||||
|
BIN
src/documents/tests/samples/barcodes/split-by-asn-1.pdf
Normal file
BIN
src/documents/tests/samples/barcodes/split-by-asn-1.pdf
Normal file
Binary file not shown.
BIN
src/documents/tests/samples/barcodes/split-by-asn-2.pdf
Normal file
BIN
src/documents/tests/samples/barcodes/split-by-asn-2.pdf
Normal file
Binary file not shown.
@ -294,7 +294,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [0])
|
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_none_present(self):
|
def test_scan_file_for_separating_barcodes_none_present(self):
|
||||||
"""
|
"""
|
||||||
@ -314,7 +314,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [])
|
self.assertDictEqual(separator_page_numbers, {})
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_middle_page(self):
|
def test_scan_file_for_separating_barcodes_middle_page(self):
|
||||||
"""
|
"""
|
||||||
@ -337,7 +337,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [1])
|
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_multiple_pages(self):
|
def test_scan_file_for_separating_barcodes_multiple_pages(self):
|
||||||
"""
|
"""
|
||||||
@ -360,7 +360,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [2, 5])
|
self.assertDictEqual(separator_page_numbers, {2: False, 5: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_upside_down(self):
|
def test_scan_file_for_separating_barcodes_upside_down(self):
|
||||||
"""
|
"""
|
||||||
@ -384,7 +384,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [1])
|
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes_fax_decode(self):
|
def test_scan_file_for_separating_barcodes_fax_decode(self):
|
||||||
"""
|
"""
|
||||||
@ -407,7 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [1])
|
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_qr_barcodes(self):
|
def test_scan_file_for_separating_qr_barcodes(self):
|
||||||
"""
|
"""
|
||||||
@ -431,7 +431,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [0])
|
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||||
|
|
||||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||||
def test_scan_file_for_separating_custom_barcodes(self):
|
def test_scan_file_for_separating_custom_barcodes(self):
|
||||||
@ -456,7 +456,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [0])
|
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||||
|
|
||||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||||
def test_scan_file_for_separating_custom_qr_barcodes(self):
|
def test_scan_file_for_separating_custom_qr_barcodes(self):
|
||||||
@ -482,7 +482,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [0])
|
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||||
|
|
||||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||||
def test_scan_file_for_separating_custom_128_barcodes(self):
|
def test_scan_file_for_separating_custom_128_barcodes(self):
|
||||||
@ -508,7 +508,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [0])
|
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||||
|
|
||||||
def test_scan_file_for_separating_wrong_qr_barcodes(self):
|
def test_scan_file_for_separating_wrong_qr_barcodes(self):
|
||||||
"""
|
"""
|
||||||
@ -533,7 +533,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [])
|
self.assertDictEqual(separator_page_numbers, {})
|
||||||
|
|
||||||
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
|
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
|
||||||
def test_scan_file_for_separating_qr_barcodes(self):
|
def test_scan_file_for_separating_qr_barcodes(self):
|
||||||
@ -558,7 +558,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertGreater(len(doc_barcode_info.barcodes), 0)
|
self.assertGreater(len(doc_barcode_info.barcodes), 0)
|
||||||
self.assertListEqual(separator_page_numbers, [1])
|
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||||
|
|
||||||
def test_separate_pages(self):
|
def test_separate_pages(self):
|
||||||
"""
|
"""
|
||||||
@ -573,7 +573,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
self.BARCODE_SAMPLE_DIR,
|
self.BARCODE_SAMPLE_DIR,
|
||||||
"patch-code-t-middle.pdf",
|
"patch-code-t-middle.pdf",
|
||||||
)
|
)
|
||||||
documents = barcodes.separate_pages(test_file, [1])
|
documents = barcodes.separate_pages(test_file, {1: False})
|
||||||
|
|
||||||
self.assertEqual(len(documents), 2)
|
self.assertEqual(len(documents), 2)
|
||||||
|
|
||||||
@ -591,7 +591,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
self.BARCODE_SAMPLE_DIR,
|
self.BARCODE_SAMPLE_DIR,
|
||||||
"patch-code-t-double.pdf",
|
"patch-code-t-double.pdf",
|
||||||
)
|
)
|
||||||
pages = barcodes.separate_pages(test_file, [1, 2])
|
pages = barcodes.separate_pages(test_file, {1: False, 2: False})
|
||||||
|
|
||||||
self.assertEqual(len(pages), 2)
|
self.assertEqual(len(pages), 2)
|
||||||
|
|
||||||
@ -610,7 +610,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
"patch-code-t-middle.pdf",
|
"patch-code-t-middle.pdf",
|
||||||
)
|
)
|
||||||
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
|
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
|
||||||
pages = barcodes.separate_pages(test_file, [])
|
pages = barcodes.separate_pages(test_file, {})
|
||||||
self.assertEqual(pages, [])
|
self.assertEqual(pages, [])
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
cm.output,
|
cm.output,
|
||||||
@ -858,7 +858,88 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||||
self.assertListEqual(separator_page_numbers, [])
|
self.assertDictEqual(separator_page_numbers, {})
|
||||||
|
|
||||||
|
@override_settings(
|
||||||
|
CONSUMER_ENABLE_BARCODES=True,
|
||||||
|
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||||
|
)
|
||||||
|
def test_separate_pages_by_asn_barcodes_and_patcht(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11
|
||||||
|
WHEN:
|
||||||
|
- Input file is split on barcodes
|
||||||
|
THEN:
|
||||||
|
- Correct number of files produced, split correctly by correct pages
|
||||||
|
"""
|
||||||
|
test_file = os.path.join(
|
||||||
|
os.path.dirname(__file__),
|
||||||
|
self.BARCODE_SAMPLE_DIR,
|
||||||
|
"split-by-asn-2.pdf",
|
||||||
|
)
|
||||||
|
|
||||||
|
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||||
|
test_file,
|
||||||
|
)
|
||||||
|
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||||
|
doc_barcode_info.barcodes,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||||
|
self.assertDictEqual(
|
||||||
|
separator_page_numbers,
|
||||||
|
{
|
||||||
|
2: False,
|
||||||
|
4: True,
|
||||||
|
5: True,
|
||||||
|
8: True,
|
||||||
|
10: True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||||
|
self.assertEqual(len(document_list), 6)
|
||||||
|
|
||||||
|
@override_settings(
|
||||||
|
CONSUMER_ENABLE_BARCODES=True,
|
||||||
|
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||||
|
)
|
||||||
|
def test_separate_pages_by_asn_barcodes(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Input PDF with ASN barcodes on pages 1,3,4,7,9
|
||||||
|
WHEN:
|
||||||
|
- Input file is split on barcodes
|
||||||
|
THEN:
|
||||||
|
- Correct number of files produced, split correctly by correct pages
|
||||||
|
"""
|
||||||
|
test_file = os.path.join(
|
||||||
|
os.path.dirname(__file__),
|
||||||
|
self.BARCODE_SAMPLE_DIR,
|
||||||
|
"split-by-asn-1.pdf",
|
||||||
|
)
|
||||||
|
|
||||||
|
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||||
|
test_file,
|
||||||
|
)
|
||||||
|
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||||
|
doc_barcode_info.barcodes,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||||
|
self.assertDictEqual(
|
||||||
|
separator_page_numbers,
|
||||||
|
{
|
||||||
|
2: True,
|
||||||
|
3: True,
|
||||||
|
6: True,
|
||||||
|
8: True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||||
|
self.assertEqual(len(document_list), 5)
|
||||||
|
|
||||||
|
|
||||||
class TestAsnBarcodes(DirectoriesMixin, TestCase):
|
class TestAsnBarcodes(DirectoriesMixin, TestCase):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user