From 122e4141b0722262ce97c3bed739ab05d349a2ad Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 15 Dec 2023 09:17:25 -0800 Subject: [PATCH] Fix: Document metadata is lost during barcode splitting (#4982) * Fixes barcode splitting dropping metadata that might be needed for the round 2 --- src/documents/barcodes.py | 54 ++-- src/documents/tasks.py | 2 +- src/documents/tests/test_barcodes.py | 402 +++++++++------------------ src/documents/tests/utils.py | 12 +- 4 files changed, 168 insertions(+), 302 deletions(-) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index cac02e3e9..3e2b309b6 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -14,6 +14,8 @@ from pikepdf import Pdf from PIL import Image from documents.converters import convert_from_tiff_to_pdf +from documents.data_models import ConsumableDocument +from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats @@ -53,6 +55,7 @@ class BarcodeReader: self.mime: Final[str] = mime_type self.pdf_file: Path = self.file self.barcodes: list[Barcode] = [] + self._tiff_conversion_done = False self.temp_dir: Optional[tempfile.TemporaryDirectory] = None if settings.CONSUMER_BARCODE_TIFF_SUPPORT: @@ -150,12 +153,14 @@ class BarcodeReader: def convert_from_tiff_to_pdf(self): """ - May convert a TIFF image into a PDF, if the input is a TIFF + May convert a TIFF image into a PDF, if the input is a TIFF and + the TIFF has not been made into a PDF """ # Nothing to do, pdf_file is already assigned correctly - if self.mime != "image/tiff": + if self.mime != "image/tiff" or self._tiff_conversion_done: return + self._tiff_conversion_done = True self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name)) def detect(self) -> None: @@ -167,6 +172,9 @@ class BarcodeReader: if self.barcodes: return + # No op if not a TIFF + self.convert_from_tiff_to_pdf() + # Choose the library for reading if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": reader = self.read_barcodes_pyzbar @@ -240,7 +248,7 @@ class BarcodeReader: """ document_paths = [] - fname = self.file.with_suffix("").name + fname = self.file.stem with Pdf.open(self.pdf_file) as input_pdf: # Start with an empty document current_document: list[Page] = [] @@ -290,7 +298,7 @@ class BarcodeReader: def separate( self, source: DocumentSource, - override_name: Optional[str] = None, + overrides: DocumentMetadataOverrides, ) -> bool: """ Separates the document, based on barcodes and configuration, creating new @@ -316,27 +324,23 @@ class BarcodeReader: logger.warning("No pages to split on!") return False - # Create the split documents - doc_paths = self.separate_pages(separator_pages) + tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve() - # Save the new documents to correct folder - if source != DocumentSource.ConsumeFolder: - # The given file is somewhere in SCRATCH_DIR, - # and new documents must be moved to the CONSUMPTION_DIR - # for the consumer to notice them - save_to_dir = settings.CONSUMPTION_DIR - else: - # The given file is somewhere in CONSUMPTION_DIR, - # and may be some levels down for recursive tagging - # so use the file's parent to preserve any metadata - save_to_dir = self.file.parent + from documents import tasks - for idx, document_path in enumerate(doc_paths): - if override_name is not None: - newname = f"{idx}_{override_name}" - dest = save_to_dir / newname - else: - dest = save_to_dir - logger.info(f"Saving {document_path} to {dest}") - copy_file_with_basic_stats(document_path, dest) + # Create the split document tasks + for new_document in self.separate_pages(separator_pages): + copy_file_with_basic_stats(new_document, tmp_dir / new_document.name) + + tasks.consume_file.delay( + ConsumableDocument( + # Same source, for templates + source=source, + # Can't use same folder or the consume might grab it again + original_file=(tmp_dir / new_document.name).resolve(), + ), + # All the same metadata + overrides, + ) + logger.info("Barcode splitting complete!") return True diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 10a44a8fe..d0728a719 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -140,7 +140,7 @@ def consume_file( with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: if settings.CONSUMER_ENABLE_BARCODES and reader.separate( input_doc.source, - overrides.filename, + overrides, ): # notify the sender, otherwise the progress bar # in the UI stays stuck diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 70f7807cc..e4d8ccc57 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1,5 +1,4 @@ import shutil -from pathlib import Path from unittest import mock import pytest @@ -11,10 +10,13 @@ from documents import tasks from documents.barcodes import BarcodeReader from documents.consumer import ConsumerError from documents.data_models import ConsumableDocument +from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource from documents.models import Document from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import DocumentConsumeDelayMixin from documents.tests.utils import FileSystemAssertsMixin +from documents.tests.utils import SampleDirMixin try: import zxingcpp # noqa: F401 @@ -25,11 +27,7 @@ except ImportError: @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR") -class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - SAMPLE_DIR = Path(__file__).parent / "samples" - - BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" - +class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase): def test_scan_file_for_separating_barcodes(self): """ GIVEN: @@ -48,6 +46,46 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(reader.pdf_file, test_file) self.assertDictEqual(separator_page_numbers, {0: False}) + @override_settings( + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + def test_scan_tiff_for_separating_barcodes(self): + """ + GIVEN: + - TIFF image containing barcodes + WHEN: + - Consume task returns + THEN: + - The file was split + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff" + + with BarcodeReader(test_file, "image/tiff") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() + + self.assertDictEqual(separator_page_numbers, {1: False}) + + @override_settings( + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + def test_scan_tiff_with_alpha_for_separating_barcodes(self): + """ + GIVEN: + - TIFF image containing barcodes + WHEN: + - Consume task returns + THEN: + - The file was split + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff" + + with BarcodeReader(test_file, "image/tiff") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() + + self.assertDictEqual(separator_page_numbers, {1: False}) + def test_scan_file_for_separating_barcodes_none_present(self): """ GIVEN: @@ -285,6 +323,28 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertGreater(len(reader.barcodes), 0) self.assertDictEqual(separator_page_numbers, {1: False}) + def test_scan_file_for_separating_barcodes_password(self): + """ + GIVEN: + - Password protected PDF + WHEN: + - File is scanned for barcode + THEN: + - Scanning handles the exception without crashing + """ + test_file = self.SAMPLE_DIR / "password-is-test.pdf" + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + warning = cm.output[0] + expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" + self.assertTrue(warning.startswith(expected_str)) + + separator_page_numbers = reader.get_separation_pages() + + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {}) + def test_separate_pages(self): """ GIVEN: @@ -332,8 +392,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): with self.assertLogs("paperless.barcodes", level="WARNING") as cm: with BarcodeReader(test_file, "application/pdf") as reader: - success = reader.separate(DocumentSource.ApiUpload) - self.assertFalse(success) + self.assertFalse( + reader.separate( + DocumentSource.ApiUpload, + DocumentMetadataOverrides(), + ), + ) self.assertEqual( cm.output, [ @@ -341,215 +405,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ], ) - def test_save_to_dir_given_name(self): - """ - GIVEN: - - File to save to a directory - - There is a name override - WHEN: - - The file is saved - THEN: - - The file exists - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: - reader.separate(DocumentSource.ApiUpload, "newname.pdf") - - self.assertEqual(reader.pdf_file, test_file) - target_file1 = settings.CONSUMPTION_DIR / "0_newname.pdf" - target_file2 = settings.CONSUMPTION_DIR / "1_newname.pdf" - self.assertIsFile(target_file1) - self.assertIsFile(target_file2) - - def test_barcode_splitter_api_upload(self): - """ - GIVEN: - - Input file containing barcodes - WHEN: - - Input file is split on barcodes - THEN: - - Correct number of files produced - """ - sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - test_file = settings.SCRATCH_DIR / "patch-code-t-middle.pdf" - shutil.copy(sample_file, test_file) - - with BarcodeReader(test_file, "application/pdf") as reader: - reader.separate(DocumentSource.ApiUpload) - - self.assertEqual(reader.pdf_file, test_file) - - target_file1 = ( - settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf" - ) - - target_file2 = ( - settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf" - ) - - self.assertIsFile(target_file1) - self.assertIsFile(target_file2) - - def test_barcode_splitter_consume_dir(self): - """ - GIVEN: - - Input file containing barcodes - WHEN: - - Input file is split on barcodes - THEN: - - Correct number of files produced - """ - sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - test_file = settings.CONSUMPTION_DIR / "patch-code-t-middle.pdf" - shutil.copy(sample_file, test_file) - - with BarcodeReader(test_file, "application/pdf") as reader: - reader.detect() - reader.separate(DocumentSource.ConsumeFolder) - - self.assertEqual(reader.pdf_file, test_file) - - target_file1 = ( - settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf" - ) - - target_file2 = ( - settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf" - ) - - self.assertIsFile(target_file1) - self.assertIsFile(target_file2) - - def test_barcode_splitter_consume_dir_recursive(self): - """ - GIVEN: - - Input file containing barcodes - - Input file is within a directory structure of the consume folder - WHEN: - - Input file is split on barcodes - THEN: - - Correct number of files produced - - Output files are within the same directory structure - """ - sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - test_file = ( - settings.CONSUMPTION_DIR / "tag1" / "tag2" / "patch-code-t-middle.pdf" - ) - test_file.parent.mkdir(parents=True) - shutil.copy(sample_file, test_file) - - with BarcodeReader(test_file, "application/pdf") as reader: - reader.separate(DocumentSource.ConsumeFolder) - - self.assertEqual(reader.pdf_file, test_file) - - target_file1 = ( - settings.CONSUMPTION_DIR - / "tag1" - / "tag2" - / "patch-code-t-middle_document_0.pdf" - ) - - target_file2 = ( - settings.CONSUMPTION_DIR - / "tag1" - / "tag2" - / "patch-code-t-middle_document_1.pdf" - ) - - self.assertIsFile(target_file1) - self.assertIsFile(target_file2) - - @override_settings(CONSUMER_ENABLE_BARCODES=True) - def test_consume_barcode_file(self): - """ - GIVEN: - - Input file with barcodes given to consume task - WHEN: - - Consume task returns - THEN: - - The file was split - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - - dst = settings.SCRATCH_DIR / "patch-code-t-middle.pdf" - shutil.copy(test_file, dst) - - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual( - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ), - "File successfully split", - ) - - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - def test_consume_barcode_tiff_file(self): - """ - GIVEN: - - TIFF image containing barcodes - WHEN: - - Consume task returns - THEN: - - The file was split - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff" - - dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff" - shutil.copy(test_file, dst) - - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual( - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ), - "File successfully split", - ) - self.assertIsNotFile(dst) - - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - def test_consume_barcode_tiff_file_with_alpha(self): - """ - GIVEN: - - TIFF image containing barcodes - - TIFF image has an alpha layer - WHEN: - - Consume task handles the alpha layer and returns - THEN: - - The file was split without issue - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff" - - dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff" - shutil.copy(test_file, dst) - - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual( - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ), - "File successfully split", - ) - self.assertIsNotFile(dst) - @override_settings( CONSUMER_ENABLE_BARCODES=True, CONSUMER_BARCODE_TIFF_SUPPORT=True, @@ -597,60 +452,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNone(kwargs["override_document_type_id"]) self.assertIsNone(kwargs["override_tag_ids"]) - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - def test_consume_barcode_supported_no_extension_file(self): - """ - GIVEN: - - TIFF image containing barcodes - - TIFF file is given without extension - WHEN: - - Consume task returns - THEN: - - The file was split - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff" - - dst = settings.SCRATCH_DIR / "patch-code-t-middle" - shutil.copy(test_file, dst) - - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual( - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ), - "File successfully split", - ) - self.assertIsNotFile(dst) - - def test_scan_file_for_separating_barcodes_password(self): - """ - GIVEN: - - Password protected PDF - WHEN: - - File is scanned for barcode - THEN: - - Scanning handles the exception without crashing - """ - test_file = self.SAMPLE_DIR / "password-is-test.pdf" - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - with BarcodeReader(test_file, "application/pdf") as reader: - reader.detect() - warning = cm.output[0] - expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" - self.assertTrue(warning.startswith(expected_str)) - - separator_page_numbers = reader.get_separation_pages() - - self.assertEqual(reader.pdf_file, test_file) - self.assertDictEqual(separator_page_numbers, {}) - @override_settings( CONSUMER_ENABLE_BARCODES=True, CONSUMER_ENABLE_ASN_BARCODE=True, @@ -722,11 +523,64 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(len(document_list), 5) -class TestAsnBarcode(DirectoriesMixin, TestCase): - SAMPLE_DIR = Path(__file__).parent / "samples" +@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR") +class TestBarcodeNewConsume( + DirectoriesMixin, + FileSystemAssertsMixin, + SampleDirMixin, + DocumentConsumeDelayMixin, + TestCase, +): + @override_settings(CONSUMER_ENABLE_BARCODES=True) + def test_consume_barcode_file(self): + """ + GIVEN: + - Incoming file with at 1 barcode producing 2 documents + - Document includes metadata override information + WHEN: + - The document is split + THEN: + - Two new consume tasks are created + - Metadata overrides are preserved for the new consume + - The document source is unchanged (for consume templates) + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + temp_copy = self.dirs.scratch_dir / test_file.name + shutil.copy(test_file, temp_copy) - BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" + overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9]) + with mock.patch("documents.tasks.async_to_sync") as progress_mocker: + self.assertEqual( + tasks.consume_file( + ConsumableDocument( + source=DocumentSource.ConsumeFolder, + original_file=temp_copy, + ), + overrides, + ), + "File successfully split", + ) + # We let the consumer know progress is done + progress_mocker.assert_called_once() + # 2 new document consume tasks created + self.assertEqual(self.consume_file_mock.call_count, 2) + + self.assertIsNotFile(temp_copy) + + # Check the split files exist + # Check the source is unchanged + # Check the overrides are unchanged + for ( + new_input_doc, + new_doc_overrides, + ) in self.get_all_consume_delay_call_args(): + self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder) + self.assertIsFile(new_input_doc.original_file) + self.assertEqual(overrides, new_doc_overrides) + + +class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") def test_scan_file_for_asn_custom_prefix(self): """ diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index b1e9c0523..fe7dbb059 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -235,8 +235,10 @@ class DocumentConsumeDelayMixin: """ Iterates over all calls to the async task and returns the arguments """ + # Must be at least 1 call + self.consume_file_mock.assert_called() - for args, _ in self.consume_file_mock.call_args_list: + for args, kwargs in self.consume_file_mock.call_args_list: input_doc, overrides = args yield (input_doc, overrides) @@ -244,7 +246,7 @@ class DocumentConsumeDelayMixin: def get_specific_consume_delay_call_args( self, index: int, - ) -> Iterator[tuple[ConsumableDocument, DocumentMetadataOverrides]]: + ) -> tuple[ConsumableDocument, DocumentMetadataOverrides]: """ Returns the arguments of a specific call to the async task """ @@ -299,3 +301,9 @@ class TestMigrations(TransactionTestCase): def setUpBeforeMigration(self, apps): pass + + +class SampleDirMixin: + SAMPLE_DIR = Path(__file__).parent / "samples" + + BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"