Fix: Document metadata is lost during barcode splitting (#4982)

* Fixes barcode splitting dropping metadata that might be needed for the round 2
This commit is contained in:
Trenton H
2023-12-15 09:17:25 -08:00
committed by GitHub
parent fff8674b56
commit e8877c2c0e
4 changed files with 168 additions and 302 deletions

View File

@@ -14,6 +14,8 @@ from pikepdf import Pdf
from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
@@ -53,6 +55,7 @@ class BarcodeReader:
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: list[Barcode] = []
self._tiff_conversion_done = False
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
@@ -150,12 +153,14 @@ class BarcodeReader:
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.mime != "image/tiff":
if self.mime != "image/tiff" or self._tiff_conversion_done:
return
self._tiff_conversion_done = True
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
@@ -167,6 +172,9 @@ class BarcodeReader:
if self.barcodes:
return
# No op if not a TIFF
self.convert_from_tiff_to_pdf()
# Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader = self.read_barcodes_pyzbar
@@ -240,7 +248,7 @@ class BarcodeReader:
"""
document_paths = []
fname = self.file.with_suffix("").name
fname = self.file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
@@ -290,7 +298,7 @@ class BarcodeReader:
def separate(
self,
source: DocumentSource,
override_name: Optional[str] = None,
overrides: DocumentMetadataOverrides,
) -> bool:
"""
Separates the document, based on barcodes and configuration, creating new
@@ -316,27 +324,23 @@ class BarcodeReader:
logger.warning("No pages to split on!")
return False
# Create the split documents
doc_paths = self.separate_pages(separator_pages)
tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
# Save the new documents to correct folder
if source != DocumentSource.ConsumeFolder:
# The given file is somewhere in SCRATCH_DIR,
# and new documents must be moved to the CONSUMPTION_DIR
# for the consumer to notice them
save_to_dir = settings.CONSUMPTION_DIR
else:
# The given file is somewhere in CONSUMPTION_DIR,
# and may be some levels down for recursive tagging
# so use the file's parent to preserve any metadata
save_to_dir = self.file.parent
from documents import tasks
for idx, document_path in enumerate(doc_paths):
if override_name is not None:
newname = f"{idx}_{override_name}"
dest = save_to_dir / newname
else:
dest = save_to_dir
logger.info(f"Saving {document_path} to {dest}")
copy_file_with_basic_stats(document_path, dest)
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=source,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
overrides,
)
logger.info("Barcode splitting complete!")
return True