Fix: Document metadata is lost during barcode splitting (#4982)

* Fixes barcode splitting dropping metadata that might be needed for the round 2
This commit is contained in:
Trenton H 2023-12-15 09:17:25 -08:00 committed by GitHub
parent be2de4f15d
commit 122e4141b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 168 additions and 302 deletions

View File

@ -14,6 +14,8 @@ from pikepdf import Pdf
from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
@ -53,6 +55,7 @@ class BarcodeReader:
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: list[Barcode] = []
self._tiff_conversion_done = False
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
@ -150,12 +153,14 @@ class BarcodeReader:
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.mime != "image/tiff":
if self.mime != "image/tiff" or self._tiff_conversion_done:
return
self._tiff_conversion_done = True
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
@ -167,6 +172,9 @@ class BarcodeReader:
if self.barcodes:
return
# No op if not a TIFF
self.convert_from_tiff_to_pdf()
# Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader = self.read_barcodes_pyzbar
@ -240,7 +248,7 @@ class BarcodeReader:
"""
document_paths = []
fname = self.file.with_suffix("").name
fname = self.file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
@ -290,7 +298,7 @@ class BarcodeReader:
def separate(
self,
source: DocumentSource,
override_name: Optional[str] = None,
overrides: DocumentMetadataOverrides,
) -> bool:
"""
Separates the document, based on barcodes and configuration, creating new
@ -316,27 +324,23 @@ class BarcodeReader:
logger.warning("No pages to split on!")
return False
# Create the split documents
doc_paths = self.separate_pages(separator_pages)
tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
# Save the new documents to correct folder
if source != DocumentSource.ConsumeFolder:
# The given file is somewhere in SCRATCH_DIR,
# and new documents must be moved to the CONSUMPTION_DIR
# for the consumer to notice them
save_to_dir = settings.CONSUMPTION_DIR
else:
# The given file is somewhere in CONSUMPTION_DIR,
# and may be some levels down for recursive tagging
# so use the file's parent to preserve any metadata
save_to_dir = self.file.parent
from documents import tasks
for idx, document_path in enumerate(doc_paths):
if override_name is not None:
newname = f"{idx}_{override_name}"
dest = save_to_dir / newname
else:
dest = save_to_dir
logger.info(f"Saving {document_path} to {dest}")
copy_file_with_basic_stats(document_path, dest)
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=source,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
overrides,
)
logger.info("Barcode splitting complete!")
return True

View File

@ -140,7 +140,7 @@ def consume_file(
with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
input_doc.source,
overrides.filename,
overrides,
):
# notify the sender, otherwise the progress bar
# in the UI stays stuck

View File

@ -1,5 +1,4 @@
import shutil
from pathlib import Path
from unittest import mock
import pytest
@ -11,10 +10,13 @@ from documents import tasks
from documents.barcodes import BarcodeReader
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
try:
import zxingcpp # noqa: F401
@ -25,11 +27,7 @@ except ImportError:
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
def test_scan_file_for_separating_barcodes(self):
"""
GIVEN:
@ -48,6 +46,46 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {0: False})
@override_settings(
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_scan_tiff_for_separating_barcodes(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
self.assertDictEqual(separator_page_numbers, {1: False})
@override_settings(
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_scan_tiff_with_alpha_for_separating_barcodes(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_none_present(self):
"""
GIVEN:
@ -285,6 +323,28 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertGreater(len(reader.barcodes), 0)
self.assertDictEqual(separator_page_numbers, {1: False})
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
WHEN:
- File is scanned for barcode
THEN:
- Scanning handles the exception without crashing
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
self.assertTrue(warning.startswith(expected_str))
separator_page_numbers = reader.get_separation_pages()
self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {})
def test_separate_pages(self):
"""
GIVEN:
@ -332,8 +392,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
success = reader.separate(DocumentSource.ApiUpload)
self.assertFalse(success)
self.assertFalse(
reader.separate(
DocumentSource.ApiUpload,
DocumentMetadataOverrides(),
),
)
self.assertEqual(
cm.output,
[
@ -341,215 +405,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
],
)
def test_save_to_dir_given_name(self):
"""
GIVEN:
- File to save to a directory
- There is a name override
WHEN:
- The file is saved
THEN:
- The file exists
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ApiUpload, "newname.pdf")
self.assertEqual(reader.pdf_file, test_file)
target_file1 = settings.CONSUMPTION_DIR / "0_newname.pdf"
target_file2 = settings.CONSUMPTION_DIR / "1_newname.pdf"
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_api_upload(self):
"""
GIVEN:
- Input file containing barcodes
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ApiUpload)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_consume_dir(self):
"""
GIVEN:
- Input file containing barcodes
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = settings.CONSUMPTION_DIR / "patch-code-t-middle.pdf"
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
reader.separate(DocumentSource.ConsumeFolder)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
def test_barcode_splitter_consume_dir_recursive(self):
"""
GIVEN:
- Input file containing barcodes
- Input file is within a directory structure of the consume folder
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
- Output files are within the same directory structure
"""
sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
test_file = (
settings.CONSUMPTION_DIR / "tag1" / "tag2" / "patch-code-t-middle.pdf"
)
test_file.parent.mkdir(parents=True)
shutil.copy(sample_file, test_file)
with BarcodeReader(test_file, "application/pdf") as reader:
reader.separate(DocumentSource.ConsumeFolder)
self.assertEqual(reader.pdf_file, test_file)
target_file1 = (
settings.CONSUMPTION_DIR
/ "tag1"
/ "tag2"
/ "patch-code-t-middle_document_0.pdf"
)
target_file2 = (
settings.CONSUMPTION_DIR
/ "tag1"
/ "tag2"
/ "patch-code-t-middle_document_1.pdf"
)
self.assertIsFile(target_file1)
self.assertIsFile(target_file2)
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
"""
GIVEN:
- Input file with barcodes given to consume task
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file_with_alpha(self):
"""
GIVEN:
- TIFF image containing barcodes
- TIFF image has an alpha layer
WHEN:
- Consume task handles the alpha layer and returns
THEN:
- The file was split without issue
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
@ -597,60 +452,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_supported_no_extension_file(self):
"""
GIVEN:
- TIFF image containing barcodes
- TIFF file is given without extension
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
dst = settings.SCRATCH_DIR / "patch-code-t-middle"
shutil.copy(test_file, dst)
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
"File successfully split",
)
self.assertIsNotFile(dst)
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
WHEN:
- File is scanned for barcode
THEN:
- Scanning handles the exception without crashing
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
self.assertTrue(warning.startswith(expected_str))
separator_page_numbers = reader.get_separation_pages()
self.assertEqual(reader.pdf_file, test_file)
self.assertDictEqual(separator_page_numbers, {})
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_ENABLE_ASN_BARCODE=True,
@ -722,11 +523,64 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(len(document_list), 5)
class TestAsnBarcode(DirectoriesMixin, TestCase):
SAMPLE_DIR = Path(__file__).parent / "samples"
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
class TestBarcodeNewConsume(
DirectoriesMixin,
FileSystemAssertsMixin,
SampleDirMixin,
DocumentConsumeDelayMixin,
TestCase,
):
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
"""
GIVEN:
- Incoming file with at 1 barcode producing 2 documents
- Document includes metadata override information
WHEN:
- The document is split
THEN:
- Two new consume tasks are created
- Metadata overrides are preserved for the new consume
- The document source is unchanged (for consume templates)
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
temp_copy = self.dirs.scratch_dir / test_file.name
shutil.copy(test_file, temp_copy)
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=temp_copy,
),
overrides,
),
"File successfully split",
)
# We let the consumer know progress is done
progress_mocker.assert_called_once()
# 2 new document consume tasks created
self.assertEqual(self.consume_file_mock.call_count, 2)
self.assertIsNotFile(temp_copy)
# Check the split files exist
# Check the source is unchanged
# Check the overrides are unchanged
for (
new_input_doc,
new_doc_overrides,
) in self.get_all_consume_delay_call_args():
self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
self.assertIsFile(new_input_doc.original_file)
self.assertEqual(overrides, new_doc_overrides)
class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
@override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
def test_scan_file_for_asn_custom_prefix(self):
"""

View File

@ -235,8 +235,10 @@ class DocumentConsumeDelayMixin:
"""
Iterates over all calls to the async task and returns the arguments
"""
# Must be at least 1 call
self.consume_file_mock.assert_called()
for args, _ in self.consume_file_mock.call_args_list:
for args, kwargs in self.consume_file_mock.call_args_list:
input_doc, overrides = args
yield (input_doc, overrides)
@ -244,7 +246,7 @@ class DocumentConsumeDelayMixin:
def get_specific_consume_delay_call_args(
self,
index: int,
) -> Iterator[tuple[ConsumableDocument, DocumentMetadataOverrides]]:
) -> tuple[ConsumableDocument, DocumentMetadataOverrides]:
"""
Returns the arguments of a specific call to the async task
"""
@ -299,3 +301,9 @@ class TestMigrations(TransactionTestCase):
def setUpBeforeMigration(self, apps):
pass
class SampleDirMixin:
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"