add TIFF barcode support

Signed-off-by: Florian Brandes <florian.brandes@posteo.de>
This commit is contained in:
Florian Brandes 2022-04-16 21:56:10 +02:00
parent cc93616019
commit ad5188a280
No known key found for this signature in database
GPG Key ID: 074048E893713170
6 changed files with 138 additions and 37 deletions

View File

@ -626,6 +626,12 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Defaults to false. Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.
This will automatically convert any TIFF image(s) to PDF files for later
processing. The original TIFF file is deleted once the conversion succeeds.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defines the string to be detected as a separator barcode. Defines the string to be detected as a separator barcode.

View File

@ -22,6 +22,8 @@ from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path from pdf2image import convert_from_path
from pikepdf import Pdf from pikepdf import Pdf
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar from pyzbar import pyzbar
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@ -93,6 +95,41 @@ def barcode_reader(image) -> List[str]:
return barcodes return barcodes
def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Convert a (possibly multi-page) TIFF image file to a PDF.

    The PDF is written next to the source file, using the same file name
    with the extension swapped for ``.pdf``. Once the PDF has been saved,
    the original TIFF file is deleted.

    :param filepath: path of the ``.tif`` / ``.tiff`` file to convert
        (the extension is matched case-insensitively).
    :return: the path of the PDF file, or an empty string when the file
        extension is not a TIFF extension. If saving the PDF or deleting
        the original fails with an ``OSError``, a warning is logged and
        the intended PDF path is still returned; it may not exist then.
    :raises OSError: if the source image cannot be opened or decoded.
    """
    # Only swap the trailing extension. str.replace() would also rewrite
    # any ".tif" that happens to occur in a directory or base name.
    base_path, file_extension = os.path.splitext(filepath)
    if file_extension.lower() not in (".tif", ".tiff"):
        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
        return ""
    newpath = base_path + ".pdf"

    # convert() returns fully loaded copies, so the source handle can be
    # released before the original file is removed.
    with Image.open(filepath) as image:
        pages = [page.convert("RGB") for page in ImageSequence.Iterator(image)]

    try:
        if len(pages) == 1:
            pages[0].save(newpath)
        else:
            pages[0].save(newpath, save_all=True, append_images=pages[1:])
        os.unlink(filepath)
    except OSError as e:
        logger.warning(
            f"Could not save the file as pdf. "
            f"The original image file was not deleted. Error: "
            f"{str(e)}",
        )
    return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]: def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
""" """
Scan the provided file for page separating barcodes Scan the provided file for page separating barcodes
@ -195,42 +232,56 @@ def consume_file(
if settings.CONSUMER_ENABLE_BARCODES: if settings.CONSUMER_ENABLE_BARCODES:
separators = [] separators = []
document_list = [] document_list = []
separators = scan_file_for_separating_barcodes(path) if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
if separators: supported_extensions = [".pdf", ".tiff", ".tif"]
logger.debug(f"Pages with separators found in: {str(path)}") else:
document_list = separate_pages(path, separators) supported_extensions = [".pdf"]
if document_list: file_extension = os.path.splitext(os.path.basename(path))[1]
for n, document in enumerate(document_list): if file_extension not in supported_extensions:
# save to consumption dir logger.warning(
# rename it to the original filename with number prefix f"Unsupported file format for barcode reader: {str(file_extension)}",
if override_filename: )
newname = f"{str(n)}_" + override_filename else:
else: if file_extension == ".tif" or file_extension == ".tiff":
newname = None path = convert_from_tiff_to_pdf(path)
save_to_dir(document, newname=newname) separators = scan_file_for_separating_barcodes(path)
# if we got here, the document was successfully split if separators:
# and can safely be deleted logger.debug(f"Pages with separators found in: {str(path)}")
logger.debug("Deleting file {}".format(path)) document_list = separate_pages(path, separators)
os.unlink(path) if document_list:
# notify the sender, otherwise the progress bar for n, document in enumerate(document_list):
# in the UI stays stuck # save to consumption dir
payload = { # rename it to the original filename with number prefix
"filename": override_filename, if override_filename:
"task_id": task_id, newname = f"{str(n)}_" + override_filename
"current_progress": 100, else:
"max_progress": 100, newname = None
"status": "SUCCESS", save_to_dir(document, newname=newname)
"message": "finished", # if we got here, the document was successfully split
} # and can safely be deleted
try: logger.debug("Deleting file {}".format(path))
async_to_sync(get_channel_layer().group_send)( os.unlink(path)
"status_updates", # notify the sender, otherwise the progress bar
{"type": "status_update", "data": payload}, # in the UI stays stuck
) payload = {
except OSError as e: "filename": override_filename,
logger.warning("OSError. It could be, the broker cannot be reached.") "task_id": task_id,
logger.warning(str(e)) "current_progress": 100,
return "File successfully split" "max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except OSError as e:
logger.warning(
"OSError. It could be, the broker cannot be reached.",
)
logger.warning(str(e))
return "File successfully split"
# continue with consumption if no barcode was found # continue with consumption if no barcode was found
document = Consumer().try_consume_file( document = Consumer().try_consume_file(

Binary file not shown.

View File

@ -204,6 +204,30 @@ class TestTasks(DirectoriesMixin, TestCase):
img = Image.open(test_file) img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_convert_from_tiff_to_pdf(self):
    """A TIFF copied to the scratch dir is converted into an existing .pdf file."""
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    source = os.path.join(samples_dir, "simple.tiff")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
    shutil.copy(source, scratch_copy)

    converted = tasks.convert_from_tiff_to_pdf(scratch_copy)
    _, suffix = os.path.splitext(os.path.basename(converted))

    # The returned path must point at a real file carrying a pdf extension.
    self.assertTrue(os.path.isfile(converted))
    self.assertEqual(suffix, ".pdf")
def test_convert_error_from_pdf_to_pdf(self):
    """Passing a non-TIFF file (a pdf) must not yield a path to an existing file."""
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    source = os.path.join(samples_dir, "simple.pdf")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
    shutil.copy(source, scratch_copy)

    result = tasks.convert_from_tiff_to_pdf(scratch_copy)

    self.assertFalse(os.path.isfile(result))
def test_scan_file_for_separating_barcodes(self): def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join( test_file = os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
@ -400,7 +424,23 @@ class TestTasks(DirectoriesMixin, TestCase):
"barcodes", "barcodes",
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
) )
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd") dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
    CONSUMER_ENABLE_BARCODES=True,
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file(self):
    """With TIFF barcode support enabled, a TIFF containing a separator is split."""
    barcode_samples = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "barcodes",
    )
    source = os.path.join(barcode_samples, "patch-code-t-middle.tiff")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
    shutil.copy(source, scratch_copy)

    outcome = tasks.consume_file(scratch_copy)
    self.assertEqual(outcome, "File successfully split")

View File

@ -502,6 +502,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES", "PAPERLESS_CONSUMER_ENABLE_BARCODES",
) )
# Opt-in switch read from the PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT
# environment variable (documented as defaulting to false). When enabled
# together with CONSUMER_ENABLE_BARCODES, .tif/.tiff files are converted to
# PDF before being scanned for separator barcodes.
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")