Fix: Document metadata is lost during barcode splitting (#4982)

* Fixes barcode splitting dropping metadata that might be needed for the round 2
This commit is contained in:
Trenton H 2023-12-15 09:17:25 -08:00 committed by GitHub
parent be2de4f15d
commit 122e4141b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 168 additions and 302 deletions

View File

@ -14,6 +14,8 @@ from pikepdf import Pdf
from PIL import Image from PIL import Image
from documents.converters import convert_from_tiff_to_pdf from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
@ -53,6 +55,7 @@ class BarcodeReader:
self.mime: Final[str] = mime_type self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file self.pdf_file: Path = self.file
self.barcodes: list[Barcode] = [] self.barcodes: list[Barcode] = []
self._tiff_conversion_done = False
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT: if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
@ -150,12 +153,14 @@ class BarcodeReader:
def convert_from_tiff_to_pdf(self): def convert_from_tiff_to_pdf(self):
""" """
May convert a TIFF image into a PDF, if the input is a TIFF May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
""" """
# Nothing to do, pdf_file is already assigned correctly # Nothing to do, pdf_file is already assigned correctly
if self.mime != "image/tiff": if self.mime != "image/tiff" or self._tiff_conversion_done:
return return
self._tiff_conversion_done = True
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name)) self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None: def detect(self) -> None:
@ -167,6 +172,9 @@ class BarcodeReader:
if self.barcodes: if self.barcodes:
return return
# No op if not a TIFF
self.convert_from_tiff_to_pdf()
# Choose the library for reading # Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader = self.read_barcodes_pyzbar reader = self.read_barcodes_pyzbar
@ -240,7 +248,7 @@ class BarcodeReader:
""" """
document_paths = [] document_paths = []
fname = self.file.with_suffix("").name fname = self.file.stem
with Pdf.open(self.pdf_file) as input_pdf: with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document # Start with an empty document
current_document: list[Page] = [] current_document: list[Page] = []
@ -290,7 +298,7 @@ class BarcodeReader:
def separate( def separate(
self, self,
source: DocumentSource, source: DocumentSource,
override_name: Optional[str] = None, overrides: DocumentMetadataOverrides,
) -> bool: ) -> bool:
""" """
Separates the document, based on barcodes and configuration, creating new Separates the document, based on barcodes and configuration, creating new
@ -316,27 +324,23 @@ class BarcodeReader:
logger.warning("No pages to split on!") logger.warning("No pages to split on!")
return False return False
# Create the split documents tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
doc_paths = self.separate_pages(separator_pages)
# Save the new documents to correct folder from documents import tasks
if source != DocumentSource.ConsumeFolder:
# The given file is somewhere in SCRATCH_DIR,
# and new documents must be moved to the CONSUMPTION_DIR
# for the consumer to notice them
save_to_dir = settings.CONSUMPTION_DIR
else:
# The given file is somewhere in CONSUMPTION_DIR,
# and may be some levels down for recursive tagging
# so use the file's parent to preserve any metadata
save_to_dir = self.file.parent
for idx, document_path in enumerate(doc_paths): # Create the split document tasks
if override_name is not None: for new_document in self.separate_pages(separator_pages):
newname = f"{idx}_{override_name}" copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
dest = save_to_dir / newname
else: tasks.consume_file.delay(
dest = save_to_dir ConsumableDocument(
logger.info(f"Saving {document_path} to {dest}") # Same source, for templates
copy_file_with_basic_stats(document_path, dest) source=source,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
overrides,
)
logger.info("Barcode splitting complete!")
return True return True

View File

@ -140,7 +140,7 @@ def consume_file(
with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
if settings.CONSUMER_ENABLE_BARCODES and reader.separate( if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
input_doc.source, input_doc.source,
overrides.filename, overrides,
): ):
# notify the sender, otherwise the progress bar # notify the sender, otherwise the progress bar
# in the UI stays stuck # in the UI stays stuck

View File

@ -1,5 +1,4 @@
import shutil import shutil
from pathlib import Path
from unittest import mock from unittest import mock
import pytest import pytest
@ -11,10 +10,13 @@ from documents import tasks
from documents.barcodes import BarcodeReader from documents.barcodes import BarcodeReader
from documents.consumer import ConsumerError from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource from documents.data_models import DocumentSource
from documents.models import Document from documents.models import Document
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
try: try:
import zxingcpp # noqa: F401 import zxingcpp # noqa: F401
@ -25,11 +27,7 @@ except ImportError:
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR") @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
def test_scan_file_for_separating_barcodes(self): def test_scan_file_for_separating_barcodes(self):
""" """
GIVEN: GIVEN:
@ -48,6 +46,46 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(reader.pdf_file, test_file) self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {0: False}) self.assertDictEqual(separator_page_numbers, {0: False})
@override_settings(
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_scan_tiff_for_separating_barcodes(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
self.assertDictEqual(separator_page_numbers, {1: False})
@override_settings(
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_scan_tiff_with_alpha_for_separating_barcodes(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_none_present(self): def test_scan_file_for_separating_barcodes_none_present(self):
""" """
GIVEN: GIVEN:
@ -285,6 +323,28 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertGreater(len(reader.barcodes), 0) self.assertGreater(len(reader.barcodes), 0)
self.assertDictEqual(separator_page_numbers, {1: False}) self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
WHEN:
- File is scanned for barcode
THEN:
- Scanning handles the exception without crashing
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
self.assertTrue(warning.startswith(expected_str))
separator_page_numbers = reader.get_separation_pages()
self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {})
def test_separate_pages(self): def test_separate_pages(self):
""" """
GIVEN: GIVEN:
@ -332,8 +392,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertLogs("paperless.barcodes", level="WARNING") as cm: with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader: with BarcodeReader(test_file, "application/pdf") as reader:
success = reader.separate(DocumentSource.ApiUpload) self.assertFalse(
self.assertFalse(success) reader.separate(
DocumentSource.ApiUpload,
DocumentMetadataOverrides(),
),
)
self.assertEqual( self.assertEqual(
cm.output, cm.output,
[ [
@ -341,215 +405,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
], ],
) )
def test_save_to_dir_given_name(self):
"""
GIVEN:
- File to save to a directory
- There is a name override
WHEN:
- The file is saved
THEN:
- The file exists
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ApiUpload, "newname.pdf")
self.assertEqual(reader.pdf_file, test_file)
target_file1 = settings.CONSUMPTION_DIR / "0_newname.pdf"
target_file2 = settings.CONSUMPTION_DIR / "1_newname.pdf"
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_api_upload(self):
"""
GIVEN:
- Input file containing barcodes
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ApiUpload)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_consume_dir(self):
"""
GIVEN:
- Input file containing barcodes
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = settings.CONSUMPTION_DIR / "patch-code-t-middle.pdf"
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
reader.separate(DocumentSource.ConsumeFolder)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_consume_dir_recursive(self):
"""
GIVEN:
- Input file containing barcodes
- Input file is within a directory structure of the consume folder
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
- Output files are within the same directory structure
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = (
settings.CONSUMPTION_DIR / "tag1" / "tag2" / "patch-code-t-middle.pdf"
)
test_file.parent.mkdir(parents=True)
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ConsumeFolder)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR
/ "tag1"
/ "tag2"
/ "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR
/ "tag1"
/ "tag2"
/ "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
"""
GIVEN:
- Input file with barcodes given to consume task
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file_with_alpha(self):
"""
GIVEN:
- TIFF image containing barcodes
- TIFF image has an alpha layer
WHEN:
- Consume task handles the alpha layer and returns
THEN:
- The file was split without issue
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
@override_settings( @override_settings(
CONSUMER_ENABLE_BARCODES=True, CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True, CONSUMER_BARCODE_TIFF_SUPPORT=True,
@ -597,60 +452,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNone(kwargs["override_document_type_id"]) self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"]) self.assertIsNone(kwargs["override_tag_ids"])
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_supported_no_extension_file(self):
"""
GIVEN:
- TIFF image containing barcodes
- TIFF file is given without extension
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
WHEN:
- File is scanned for barcode
THEN:
- Scanning handles the exception without crashing
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
self.assertTrue(warning.startswith(expected_str))
separator_page_numbers = reader.get_separation_pages()
self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {})
@override_settings( @override_settings(
CONSUMER_ENABLE_BARCODES=True, CONSUMER_ENABLE_BARCODES=True,
CONSUMER_ENABLE_ASN_BARCODE=True, CONSUMER_ENABLE_ASN_BARCODE=True,
@ -722,11 +523,64 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(len(document_list), 5) self.assertEqual(len(document_list), 5)
class TestAsnBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
SAMPLE_DIR = Path(__file__).parent / "samples" class TestBarcodeNewConsume(
DirectoriesMixin,
FileSystemAssertsMixin,
SampleDirMixin,
DocumentConsumeDelayMixin,
TestCase,
):
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
"""
GIVEN:
- Incoming file with at 1 barcode producing 2 documents
- Document includes metadata override information
WHEN:
- The document is split
THEN:
- Two new consume tasks are created
- Metadata overrides are preserved for the new consume
- The document source is unchanged (for consume templates)
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
temp_copy = self.dirs.scratch_dir / test_file.name
shutil.copy(test_file, temp_copy)
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=temp_copy,
),
overrides,
),
"File successfully split",
)
# We let the consumer know progress is done
progress_mocker.assert_called_once()
# 2 new document consume tasks created
self.assertEqual(self.consume_file_mock.call_count, 2)
self.assertIsNotFile(temp_copy)
# Check the split files exist
# Check the source is unchanged
# Check the overrides are unchanged
for (
new_input_doc,
new_doc_overrides,
) in self.get_all_consume_delay_call_args():
self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
self.assertIsFile(new_input_doc.original_file)
self.assertEqual(overrides, new_doc_overrides)
class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
@override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
def test_scan_file_for_asn_custom_prefix(self): def test_scan_file_for_asn_custom_prefix(self):
""" """

View File

@ -235,8 +235,10 @@ class DocumentConsumeDelayMixin:
""" """
Iterates over all calls to the async task and returns the arguments Iterates over all calls to the async task and returns the arguments
""" """
# Must be at least 1 call
self.consume_file_mock.assert_called()
for args, _ in self.consume_file_mock.call_args_list: for args, kwargs in self.consume_file_mock.call_args_list:
input_doc, overrides = args input_doc, overrides = args
yield (input_doc, overrides) yield (input_doc, overrides)
@ -244,7 +246,7 @@ class DocumentConsumeDelayMixin:
def get_specific_consume_delay_call_args( def get_specific_consume_delay_call_args(
self, self,
index: int, index: int,
) -> Iterator[tuple[ConsumableDocument, DocumentMetadataOverrides]]: ) -> tuple[ConsumableDocument, DocumentMetadataOverrides]:
""" """
Returns the arguments of a specific call to the async task Returns the arguments of a specific call to the async task
""" """
@ -299,3 +301,9 @@ class TestMigrations(TransactionTestCase):
def setUpBeforeMigration(self, apps): def setUpBeforeMigration(self, apps):
pass pass
class SampleDirMixin:
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"