Merge pull request #766 from paperless-ngx/feature-barcode-tiff-support

Feature barcode tiff support
This commit is contained in:
Quinn Casey 2022-04-27 19:46:16 -07:00 committed by GitHub
commit 8c8f366e0f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 191 additions and 38 deletions

View File

@ -629,8 +629,19 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
If no barcodes are detected in the uploaded file, no page separation
will happen.
The original document will be removed and the separated pages will be
saved as pdf.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.
If enabled, TIFF images are automatically converted to PDF files
before further processing.
This setting only takes effect if PAPERLESS_CONSUMER_ENABLE_BARCODES
is enabled.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defines the string to be detected as a separator barcode.

View File

@ -22,6 +22,8 @@ from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path
from pikepdf import Pdf
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
from whoosh.writing import AsyncWriter
@ -93,9 +95,41 @@ def barcode_reader(image) -> List[str]:
return barcodes
def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Convert a TIFF image file into a pdf stored in a fresh temp. directory.

    Every frame of a multi-page TIFF becomes one page of the pdf. The pdf
    keeps the base name of the source file and is written into a new
    "paperless-" directory below settings.SCRATCH_DIR.

    Returns the path of the new pdf file, or None when the file extension
    is not .tif/.tiff or when the pdf could not be written (OSError).
    """
    file_name, file_extension = os.path.splitext(os.path.basename(filepath))
    file_extension = file_extension.lower()

    # Reject unsupported formats *before* creating the temp. directory,
    # otherwise every rejected file leaves an empty directory behind.
    if file_extension not in {".tif", ".tiff"}:
        logger.warning(f"Cannot convert from {file_extension} to pdf.")
        return None

    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    # use old file name with pdf extension
    newpath = os.path.join(tempdir, file_name + ".pdf")

    with Image.open(filepath) as image:
        # One RGB copy per frame of the TIFF
        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]
        try:
            if len(images) == 1:
                images[0].save(newpath)
            else:
                # first frame is the base page, the rest are appended
                images[0].save(newpath, save_all=True, append_images=images[1:])
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
            return None
    return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
"""
Scan the provided file for page separating barcodes
Scan the provided pdf file for page separating barcodes
Returns a list of pagenumbers, which separate the file
"""
separator_page_numbers = []
@ -112,7 +146,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
"""
Separate the provided file on the pages_to_split_on.
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
@ -195,10 +229,31 @@ def consume_file(
if settings.CONSUMER_ENABLE_BARCODES:
separators = []
document_list = []
separators = scan_file_for_separating_barcodes(path)
converted_tiff = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
supported_extensions = [".pdf", ".tiff", ".tif"]
else:
supported_extensions = [".pdf"]
file_extension = os.path.splitext(os.path.basename(path))[1].lower()
if file_extension not in supported_extensions:
# if not supported, skip this routine
logger.warning(
f"Unsupported file format for barcode reader: {str(file_extension)}",
)
else:
if file_extension in {".tif", ".tiff"}:
file_to_process = convert_from_tiff_to_pdf(path)
else:
file_to_process = path
separators = scan_file_for_separating_barcodes(file_to_process)
if separators:
logger.debug(f"Pages with separators found in: {str(path)}")
document_list = separate_pages(path, separators)
logger.debug(
f"Pages with separators found in: {str(path)}",
)
document_list = separate_pages(file_to_process, separators)
if document_list:
for n, document in enumerate(document_list):
# save to consumption dir
@ -210,6 +265,9 @@ def consume_file(
save_to_dir(document, newname=newname)
# if we got here, the document was successfully split
# and can safely be deleted
if converted_tiff:
logger.debug("Deleting file {}".format(file_to_process))
os.unlink(file_to_process)
logger.debug("Deleting file {}".format(path))
os.unlink(path)
# notify the sender, otherwise the progress bar
@ -228,8 +286,12 @@ def consume_file(
{"type": "status_update", "data": payload},
)
except OSError as e:
logger.warning("OSError. It could be, the broker cannot be reached.")
logger.warning(
"OSError. It could be, the broker cannot be reached.",
)
logger.warning(str(e))
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
# continue with consumption if no barcode was found

Binary file not shown.

View File

@ -204,6 +204,29 @@ class TestTasks(DirectoriesMixin, TestCase):
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_convert_from_tiff_to_pdf(self):
    """
    Converting the sample TIFF must yield an existing file whose
    extension is ".pdf".
    """
    tiff_name = "simple.tiff"
    sample = os.path.join(os.path.dirname(__file__), "samples", tiff_name)
    scratch_copy = os.path.join(settings.SCRATCH_DIR, tiff_name)
    shutil.copy(sample, scratch_copy)

    converted = tasks.convert_from_tiff_to_pdf(scratch_copy)
    _, suffix = os.path.splitext(os.path.basename(converted))

    self.assertTrue(os.path.isfile(converted))
    self.assertEqual(suffix, ".pdf")
def test_convert_error_from_pdf_to_pdf(self):
    """
    Handing a pdf to the TIFF converter must be rejected with None.
    """
    pdf_name = "simple.pdf"
    sample = os.path.join(os.path.dirname(__file__), "samples", pdf_name)
    scratch_copy = os.path.join(settings.SCRATCH_DIR, pdf_name)
    shutil.copy(sample, scratch_copy)

    result = tasks.convert_from_tiff_to_pdf(scratch_copy)
    self.assertIsNone(result)
def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
@ -400,11 +423,64 @@ class TestTasks(DirectoriesMixin, TestCase):
"barcodes",
"patch-code-t-middle.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd")
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
    CONSUMER_ENABLE_BARCODES=True,
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_tiff_file(self):
    """
    With barcode and TIFF support enabled, consuming the sample TIFF
    that has a separator page in the middle must report a split.
    """
    tiff_name = "patch-code-t-middle.tiff"
    sample = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "barcodes",
        tiff_name,
    )
    scratch_copy = os.path.join(settings.SCRATCH_DIR, tiff_name)
    shutil.copy(sample, scratch_copy)

    outcome = tasks.consume_file(scratch_copy)
    self.assertEqual(outcome, "File successfully split")
@override_settings(
    CONSUMER_ENABLE_BARCODES=True,
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
@mock.patch("documents.consumer.Consumer.try_consume_file")
def test_consume_barcode_unsupported_jpg_file(self, m):
    """
    Barcode and TIFF support are both enabled and the user uploads an
    image format the barcode reader does not handle (here: jpg).
    The barcode scan must be skipped with a single warning and the
    file must be consumed as is, without any overrides.
    """
    jpg_name = "simple.jpg"
    sample = os.path.join(os.path.dirname(__file__), "samples", jpg_name)
    scratch_copy = os.path.join(settings.SCRATCH_DIR, jpg_name)
    shutil.copy(sample, scratch_copy)

    expected_warnings = [
        "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg",
    ]
    with self.assertLogs("paperless.tasks", level="WARNING") as captured:
        outcome = tasks.consume_file(scratch_copy)
        self.assertIn("Success", outcome)
    self.assertEqual(captured.output, expected_warnings)

    # the (mocked) consumer must have been invoked exactly once ...
    m.assert_called_once()
    _, call_kwargs = m.call_args
    # ... and none of the override arguments may be set
    for override in (
        "override_filename",
        "override_title",
        "override_correspondent_id",
        "override_document_type_id",
        "override_tag_ids",
    ):
        self.assertIsNone(call_kwargs[override])
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()

View File

@ -503,6 +503,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
)
# Whether TIFF images are also scanned for separator barcodes; read from
# the PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT environment variable.
# Per the configuration docs this only matters when
# CONSUMER_ENABLE_BARCODES is enabled.
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")