add TIFF barcode support

Signed-off-by: Florian Brandes <florian.brandes@posteo.de>
This commit is contained in:
Florian Brandes 2022-04-16 21:56:10 +02:00
parent cc93616019
commit ad5188a280
No known key found for this signature in database
GPG Key ID: 074048E893713170
6 changed files with 138 additions and 37 deletions

View File

@ -626,6 +626,12 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Defaults to false. Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.
When enabled, TIFF images are automatically converted to PDF for later
processing.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defines the string to be detected as a separator barcode. Defines the string to be detected as a separator barcode.

View File

@ -22,6 +22,8 @@ from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path from pdf2image import convert_from_path
from pikepdf import Pdf from pikepdf import Pdf
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar from pyzbar import pyzbar
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@ -93,6 +95,41 @@ def barcode_reader(image) -> List[str]:
return barcodes return barcodes
def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Convert a TIFF image file (single- or multi-page) to a PDF.

    The PDF is written next to the source file, using the same name with the
    extension swapped for ".pdf". The original TIFF is deleted once the PDF
    has been saved.

    :param filepath: path to a file ending in ".tif" or ".tiff"
    :return: path of the PDF file, or an empty string if the extension is
        not a supported TIFF extension. If saving fails with an OSError a
        warning is logged, the original file is kept, and the intended PDF
        path is still returned.
    """
    root, file_extension = os.path.splitext(filepath)
    if file_extension not in (".tif", ".tiff"):
        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
        return ""
    # Swap only the trailing extension. A plain str.replace() would also
    # rewrite ".tif" occurring earlier in the path (e.g. in a directory name).
    newpath = root + ".pdf"
    # The context manager closes the source image even if a page fails to
    # convert.
    with Image.open(filepath) as image:
        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]
        try:
            if len(images) == 1:
                images[0].save(newpath)
            else:
                images[0].save(newpath, save_all=True, append_images=images[1:])
            # Only remove the source once the PDF was written successfully.
            os.unlink(filepath)
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. "
                f"The original image file was not deleted. Error: "
                f"{str(e)}",
            )
    return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]: def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
""" """
Scan the provided file for page separating barcodes Scan the provided file for page separating barcodes
@ -195,6 +232,18 @@ def consume_file(
if settings.CONSUMER_ENABLE_BARCODES: if settings.CONSUMER_ENABLE_BARCODES:
separators = [] separators = []
document_list = [] document_list = []
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
supported_extensions = [".pdf", ".tiff", ".tif"]
else:
supported_extensions = [".pdf"]
file_extension = os.path.splitext(os.path.basename(path))[1]
if file_extension not in supported_extensions:
logger.warning(
f"Unsupported file format for barcode reader: {str(file_extension)}",
)
else:
if file_extension == ".tif" or file_extension == ".tiff":
path = convert_from_tiff_to_pdf(path)
separators = scan_file_for_separating_barcodes(path) separators = scan_file_for_separating_barcodes(path)
if separators: if separators:
logger.debug(f"Pages with separators found in: {str(path)}") logger.debug(f"Pages with separators found in: {str(path)}")
@ -228,7 +277,9 @@ def consume_file(
{"type": "status_update", "data": payload}, {"type": "status_update", "data": payload},
) )
except OSError as e: except OSError as e:
logger.warning("OSError. It could be, the broker cannot be reached.") logger.warning(
"OSError. It could be, the broker cannot be reached.",
)
logger.warning(str(e)) logger.warning(str(e))
return "File successfully split" return "File successfully split"

Binary file not shown.

View File

@ -204,6 +204,30 @@ class TestTasks(DirectoriesMixin, TestCase):
img = Image.open(test_file) img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_convert_from_tiff_to_pdf(self):
    """
    Converting a copy of the sample TIFF yields an existing file whose
    extension is ".pdf".
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    source_tiff = os.path.join(samples_dir, "simple.tiff")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
    shutil.copy(source_tiff, scratch_copy)

    converted = tasks.convert_from_tiff_to_pdf(scratch_copy)
    _, suffix = os.path.splitext(os.path.basename(converted))

    self.assertTrue(os.path.isfile(converted))
    self.assertEqual(suffix, ".pdf")
def test_convert_error_from_pdf_to_pdf(self):
    """
    Passing a PDF to the TIFF converter must not produce a file: the
    returned path does not point at an existing file.
    """
    sample_pdf = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "simple.pdf",
    )
    scratch_pdf = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
    shutil.copy(sample_pdf, scratch_pdf)

    result_path = tasks.convert_from_tiff_to_pdf(scratch_pdf)

    self.assertFalse(os.path.isfile(result_path))
def test_scan_file_for_separating_barcodes(self): def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join( test_file = os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
@ -400,7 +424,23 @@ class TestTasks(DirectoriesMixin, TestCase):
"barcodes", "barcodes",
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
) )
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd") dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
    CONSUMER_ENABLE_BARCODES=True,
)
def test_consume_barcode_tiff_file(self):
    """
    With barcode scanning and TIFF support both enabled, consuming the
    sample TIFF with a separator barcode reports a successful split.
    """
    barcode_samples = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "barcodes",
    )
    test_file = os.path.join(barcode_samples, "patch-code-t-middle.tiff")
    dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
    shutil.copy(test_file, dst)

    outcome = tasks.consume_file(dst)

    self.assertEqual(outcome, "File successfully split")

View File

@ -502,6 +502,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES", "PAPERLESS_CONSUMER_ENABLE_BARCODES",
) )
# Whether TIFF files are converted to PDF and scanned for separator barcodes
# by the consumer. Read from PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT.
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")