mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #766 from paperless-ngx/feature-barcode-tiff-support
Feature barcode tiff support
This commit is contained in:
commit
8c8f366e0f
@ -629,8 +629,19 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
|
|||||||
If no barcodes are detected in the uploaded file, no page separation
|
If no barcodes are detected in the uploaded file, no page separation
|
||||||
will happen.
|
will happen.
|
||||||
|
|
||||||
|
The original document will be removed and the separated pages will be
|
||||||
|
saved as pdf.
|
||||||
|
|
||||||
Defaults to false.
|
Defaults to false.
|
||||||
|
|
||||||
|
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
|
||||||
|
Whether TIFF image files should be scanned for barcodes.
|
||||||
|
This will automatically convert any TIFF image(s) to pdfs for later
|
||||||
|
processing.
|
||||||
|
This only has an effect if PAPERLESS_CONSUMER_ENABLE_BARCODES has been
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
Defaults to false.
|
||||||
|
|
||||||
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
|
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
|
||||||
Defines the string to be detected as a separator barcode.
|
Defines the string to be detected as a separator barcode.
|
||||||
|
@ -22,6 +22,8 @@ from documents.models import Tag
|
|||||||
from documents.sanity_checker import SanityCheckFailedException
|
from documents.sanity_checker import SanityCheckFailedException
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
from pikepdf import Pdf
|
from pikepdf import Pdf
|
||||||
|
from PIL import Image
|
||||||
|
from PIL import ImageSequence
|
||||||
from pyzbar import pyzbar
|
from pyzbar import pyzbar
|
||||||
from whoosh.writing import AsyncWriter
|
from whoosh.writing import AsyncWriter
|
||||||
|
|
||||||
@ -93,9 +95,41 @@ def barcode_reader(image) -> List[str]:
|
|||||||
return barcodes
|
return barcodes
|
||||||
|
|
||||||
|
|
||||||
|
def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Converts a given TIFF image file to pdf into a temp. directory.

    The pdf keeps the base name of the TIFF and is written into a newly
    created "paperless-" directory below settings.SCRATCH_DIR. Every frame
    of the TIFF is converted to RGB and becomes one page of the pdf.

    Returns the path of the new pdf file, or None (after logging a warning)
    if the file extension is not .tif/.tiff or the pdf could not be saved.
    NOTE: the "-> str" annotation does not cover the None result; callers
    have to check for it.
    """
    # function-scope import: only needed to clean up on the failure path
    import shutil

    file_name, file_extension = os.path.splitext(os.path.basename(filepath))
    file_extension = file_extension.lower()

    # reject unsupported input before creating anything on disk, so that no
    # empty temp directory is left behind in the scratch dir
    if file_extension not in (".tif", ".tiff"):
        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
        return None

    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    # use old file name with pdf extension
    newpath = os.path.join(tempdir, file_name + ".pdf")

    with Image.open(filepath) as image:
        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]
        remaining_pages = images[1:]
        try:
            if remaining_pages:
                # multi-page TIFF: append the other frames as further pages
                images[0].save(
                    newpath,
                    save_all=True,
                    append_images=remaining_pages,
                )
            else:
                images[0].save(newpath)
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
            # remove the temp directory together with any partially written
            # pdf; nobody else knows about this path once we return None
            shutil.rmtree(tempdir, ignore_errors=True)
            return None
    return newpath
|
||||||
|
|
||||||
|
|
||||||
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
|
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Scan the provided file for page separating barcodes
|
Scan the provided pdf file for page separating barcodes
|
||||||
Returns a list of pagenumbers, which separate the file
|
Returns a list of pagenumbers, which separate the file
|
||||||
"""
|
"""
|
||||||
separator_page_numbers = []
|
separator_page_numbers = []
|
||||||
@ -112,7 +146,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
|
|||||||
|
|
||||||
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Separate the provided file on the pages_to_split_on.
|
Separate the provided pdf file on the pages_to_split_on.
|
||||||
The pages which are defined by page_numbers will be removed.
|
The pages which are defined by page_numbers will be removed.
|
||||||
Returns a list of (temporary) filepaths to consume.
|
Returns a list of (temporary) filepaths to consume.
|
||||||
These will need to be deleted later.
|
These will need to be deleted later.
|
||||||
@ -195,42 +229,70 @@ def consume_file(
|
|||||||
if settings.CONSUMER_ENABLE_BARCODES:
|
if settings.CONSUMER_ENABLE_BARCODES:
|
||||||
separators = []
|
separators = []
|
||||||
document_list = []
|
document_list = []
|
||||||
separators = scan_file_for_separating_barcodes(path)
|
converted_tiff = None
|
||||||
if separators:
|
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
|
||||||
logger.debug(f"Pages with separators found in: {str(path)}")
|
supported_extensions = [".pdf", ".tiff", ".tif"]
|
||||||
document_list = separate_pages(path, separators)
|
else:
|
||||||
if document_list:
|
supported_extensions = [".pdf"]
|
||||||
for n, document in enumerate(document_list):
|
file_extension = os.path.splitext(os.path.basename(path))[1].lower()
|
||||||
# save to consumption dir
|
if file_extension not in supported_extensions:
|
||||||
# rename it to the original filename with number prefix
|
# if not supported, skip this routine
|
||||||
if override_filename:
|
logger.warning(
|
||||||
newname = f"{str(n)}_" + override_filename
|
f"Unsupported file format for barcode reader: {str(file_extension)}",
|
||||||
else:
|
)
|
||||||
newname = None
|
else:
|
||||||
save_to_dir(document, newname=newname)
|
if file_extension in {".tif", ".tiff"}:
|
||||||
# if we got here, the document was successfully split
|
file_to_process = convert_from_tiff_to_pdf(path)
|
||||||
# and can safely be deleted
|
else:
|
||||||
logger.debug("Deleting file {}".format(path))
|
file_to_process = path
|
||||||
os.unlink(path)
|
|
||||||
# notify the sender, otherwise the progress bar
|
separators = scan_file_for_separating_barcodes(file_to_process)
|
||||||
# in the UI stays stuck
|
|
||||||
payload = {
|
if separators:
|
||||||
"filename": override_filename,
|
logger.debug(
|
||||||
"task_id": task_id,
|
f"Pages with separators found in: {str(path)}",
|
||||||
"current_progress": 100,
|
|
||||||
"max_progress": 100,
|
|
||||||
"status": "SUCCESS",
|
|
||||||
"message": "finished",
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
async_to_sync(get_channel_layer().group_send)(
|
|
||||||
"status_updates",
|
|
||||||
{"type": "status_update", "data": payload},
|
|
||||||
)
|
)
|
||||||
except OSError as e:
|
document_list = separate_pages(file_to_process, separators)
|
||||||
logger.warning("OSError. It could be, the broker cannot be reached.")
|
|
||||||
logger.warning(str(e))
|
if document_list:
|
||||||
return "File successfully split"
|
for n, document in enumerate(document_list):
|
||||||
|
# save to consumption dir
|
||||||
|
# rename it to the original filename with number prefix
|
||||||
|
if override_filename:
|
||||||
|
newname = f"{str(n)}_" + override_filename
|
||||||
|
else:
|
||||||
|
newname = None
|
||||||
|
save_to_dir(document, newname=newname)
|
||||||
|
# if we got here, the document was successfully split
|
||||||
|
# and can safely be deleted
|
||||||
|
if converted_tiff:
|
||||||
|
logger.debug("Deleting file {}".format(file_to_process))
|
||||||
|
os.unlink(file_to_process)
|
||||||
|
logger.debug("Deleting file {}".format(path))
|
||||||
|
os.unlink(path)
|
||||||
|
# notify the sender, otherwise the progress bar
|
||||||
|
# in the UI stays stuck
|
||||||
|
payload = {
|
||||||
|
"filename": override_filename,
|
||||||
|
"task_id": task_id,
|
||||||
|
"current_progress": 100,
|
||||||
|
"max_progress": 100,
|
||||||
|
"status": "SUCCESS",
|
||||||
|
"message": "finished",
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
async_to_sync(get_channel_layer().group_send)(
|
||||||
|
"status_updates",
|
||||||
|
{"type": "status_update", "data": payload},
|
||||||
|
)
|
||||||
|
except OSError as e:
|
||||||
|
logger.warning(
|
||||||
|
"OSError. It could be, the broker cannot be reached.",
|
||||||
|
)
|
||||||
|
logger.warning(str(e))
|
||||||
|
# consuming stops here, since the original document with
|
||||||
|
# the barcodes has been split and will be consumed separately
|
||||||
|
return "File successfully split"
|
||||||
|
|
||||||
# continue with consumption if no barcode was found
|
# continue with consumption if no barcode was found
|
||||||
document = Consumer().try_consume_file(
|
document = Consumer().try_consume_file(
|
||||||
|
BIN
src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
Normal file
BIN
src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
Normal file
Binary file not shown.
BIN
src/documents/tests/samples/simple.tiff
Normal file
BIN
src/documents/tests/samples/simple.tiff
Normal file
Binary file not shown.
@ -204,6 +204,29 @@ class TestTasks(DirectoriesMixin, TestCase):
|
|||||||
img = Image.open(test_file)
|
img = Image.open(test_file)
|
||||||
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
|
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
|
||||||
|
|
||||||
|
def test_convert_from_tiff_to_pdf(self):
    """
    A TIFF sample copied into the scratch dir is converted into an
    existing file whose name ends in .pdf.
    """
    sample_path = os.path.join(os.path.dirname(__file__), "samples", "simple.tiff")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
    shutil.copy(sample_path, scratch_copy)

    converted = tasks.convert_from_tiff_to_pdf(scratch_copy)
    _, suffix = os.path.splitext(os.path.basename(converted))

    self.assertTrue(os.path.isfile(converted))
    self.assertEqual(suffix, ".pdf")
|
||||||
|
|
||||||
|
def test_convert_error_from_pdf_to_pdf(self):
    """
    Handing a pdf to the TIFF converter yields None instead of a path.
    """
    sample_path = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
    shutil.copy(sample_path, scratch_copy)

    outcome = tasks.convert_from_tiff_to_pdf(scratch_copy)

    self.assertIsNone(outcome)
|
||||||
|
|
||||||
def test_scan_file_for_separating_barcodes(self):
|
def test_scan_file_for_separating_barcodes(self):
|
||||||
test_file = os.path.join(
|
test_file = os.path.join(
|
||||||
os.path.dirname(__file__),
|
os.path.dirname(__file__),
|
||||||
@ -400,11 +423,64 @@ class TestTasks(DirectoriesMixin, TestCase):
|
|||||||
"barcodes",
|
"barcodes",
|
||||||
"patch-code-t-middle.pdf",
|
"patch-code-t-middle.pdf",
|
||||||
)
|
)
|
||||||
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd")
|
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
|
||||||
shutil.copy(test_file, dst)
|
shutil.copy(test_file, dst)
|
||||||
|
|
||||||
self.assertEqual(tasks.consume_file(dst), "File successfully split")
|
self.assertEqual(tasks.consume_file(dst), "File successfully split")
|
||||||
|
|
||||||
|
@override_settings(CONSUMER_ENABLE_BARCODES=True, CONSUMER_BARCODE_TIFF_SUPPORT=True)
def test_consume_barcode_tiff_file(self):
    """
    With barcode and TIFF support switched on, consuming the TIFF sample
    that carries a separator page reports a successful split.
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples", "barcodes")
    sample_path = os.path.join(samples_dir, "patch-code-t-middle.tiff")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
    shutil.copy(sample_path, scratch_copy)

    result = tasks.consume_file(scratch_copy)

    self.assertEqual(result, "File successfully split")
|
||||||
|
|
||||||
|
@override_settings(CONSUMER_ENABLE_BARCODES=True, CONSUMER_BARCODE_TIFF_SUPPORT=True)
@mock.patch("documents.consumer.Consumer.try_consume_file")
def test_consume_barcode_unsupported_jpg_file(self, m):
    """
    This test assumes barcode and TIFF support are enabled and
    the user uploads an unsupported image file (e.g. jpg)

    The function shouldn't try to scan for separating barcodes
    and continue archiving the file as is.
    """
    sample_path = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
    scratch_copy = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
    shutil.copy(sample_path, scratch_copy)

    expected_log = [
        "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg",
    ]
    with self.assertLogs("paperless.tasks", level="WARNING") as cm:
        self.assertIn("Success", tasks.consume_file(scratch_copy))
    self.assertEqual(cm.output, expected_log)

    # the regular consumer must have been invoked exactly once ...
    m.assert_called_once()

    # ... and without any override passed along
    _, passed_kwargs = m.call_args
    for override_key in (
        "override_filename",
        "override_title",
        "override_correspondent_id",
        "override_document_type_id",
        "override_tag_ids",
    ):
        self.assertIsNone(passed_kwargs[override_key])
|
||||||
|
|
||||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||||
def test_sanity_check_success(self, m):
|
def test_sanity_check_success(self, m):
|
||||||
m.return_value = SanityCheckMessages()
|
m.return_value = SanityCheckMessages()
|
||||||
|
@ -503,6 +503,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean(
|
|||||||
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
|
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
|
||||||
|
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
|
||||||
|
)
|
||||||
|
|
||||||
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
|
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
|
||||||
|
|
||||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user