Updates how barcodes are detected, using pikepdf images, instead of converting each page to an image

This commit is contained in:
Trenton Holmes
2022-09-14 11:49:22 -07:00
committed by Trenton H
parent d9b345ffd9
commit b21f64de8a
5 changed files with 178 additions and 179 deletions

View File

@@ -3,13 +3,15 @@ import os
import shutil
import tempfile
from functools import lru_cache
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
from typing import List
from typing import Optional
from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -32,7 +34,7 @@ def supported_file_type(mime_type) -> bool:
return mime_type in supported_mime
def barcode_reader(image) -> List[str]:
def barcode_reader(image: Image) -> List[str]:
"""
Read any barcodes contained in image
Returns a list containing all found barcodes
@@ -99,21 +101,39 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
"""
Scan the provided pdf file for page separating barcodes
Returns a list of pagenumbers, which separate the file
Returns a the PDF filepath and a list of pagenumbers,
which separate the file into new files
"""
separator_page_numbers = []
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
# use a temporary directory in case the file os too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if separator_barcode in current_barcodes:
separator_page_numbers.append(current_page_number)
return separator_page_numbers
pdf_filepath = None
mime_type = get_file_mime_type(filepath)
if supported_file_type(mime_type):
pdf_filepath = filepath
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
pdf = Pdf.open(pdf_filepath)
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
separator_page_numbers.append(page_num)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
)
return pdf_filepath, separator_page_numbers
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:

View File

@@ -96,29 +96,13 @@ def consume_file(
# check for separators in current document
if settings.CONSUMER_ENABLE_BARCODES:
mime_type = barcodes.get_file_mime_type(path)
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
if not barcodes.supported_file_type(mime_type):
# if not supported, skip this routine
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
else:
separators = []
document_list = []
if mime_type == "image/tiff":
file_to_process = barcodes.convert_from_tiff_to_pdf(path)
else:
file_to_process = path
separators = barcodes.scan_file_for_separating_barcodes(file_to_process)
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
document_list = barcodes.separate_pages(file_to_process, separators)
document_list = barcodes.separate_pages(pdf_filepath, separators)
if document_list:
for n, document in enumerate(document_list):
@@ -134,15 +118,13 @@ def consume_file(
target_dir=path.parent,
)
# if we got here, the document was successfully split
# and can safely be deleted
if mime_type == "image/tiff":
# Remove the TIFF converted to PDF file
logger.debug(f"Deleting file {file_to_process}")
os.unlink(file_to_process)
# Remove the original file (new file is saved above)
logger.debug(f"Deleting file {path}")
os.unlink(path)
# Delete the PDF file which was split
os.remove(pdf_filepath)
# If the original was a TIFF, remove the original file as well
if str(pdf_filepath) != str(path):
logger.debug(f"Deleting file {path}")
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck

View File

@@ -13,22 +13,23 @@ from PIL import Image
class TestBarcode(DirectoriesMixin, TestCase):
SAMPLE_DIR = os.path.join(
os.path.dirname(__file__),
"samples",
)
BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
def test_barcode_reader(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT.png",
)
test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png")
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pbm",
)
img = Image.open(test_file)
@@ -37,9 +38,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion.png",
)
img = Image.open(test_file)
@@ -48,9 +47,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion2.png",
)
img = Image.open(test_file)
@@ -59,9 +56,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_unreadable(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-unreadable.png",
)
img = Image.open(test_file)
@@ -69,9 +64,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_qr(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"qr-code-PATCHT.png",
)
img = Image.open(test_file)
@@ -80,9 +73,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_128(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-PATCHT.png",
)
img = Image.open(test_file)
@@ -90,15 +81,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png")
test_file = os.path.join(self.SAMPLE_DIR, "simple.png")
img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.png",
)
img = Image.open(test_file)
@@ -106,9 +95,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_qr_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.png",
)
img = Image.open(test_file)
@@ -116,9 +103,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_128_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
img = Image.open(test_file)
@@ -126,19 +111,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_get_mime_type(self):
tiff_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.tiff",
)
pdf_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
png_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
@@ -173,8 +154,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_convert_error_from_pdf_to_pdf(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
@@ -183,107 +163,127 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes2(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_separating_barcodes3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes4(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"several-patcht-codes.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [2, 5])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle_reverse.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-qr.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_separate_pages(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.separate_pages(test_file, [1])
@@ -311,9 +311,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_separate_pages_no_list(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
@@ -328,9 +326,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -340,9 +336,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
nonexistingdir = "/nowhere"
@@ -360,9 +354,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -372,31 +364,36 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_splitter(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
separators = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertTrue(separators)
document_list = barcodes.separate_pages(test_file, separators)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(test_file, pdf_file)
self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
self.assertTrue(document_list)
for document in document_list:
barcodes.save_to_dir(document, target_dir=tempdir)
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
@@ -408,9 +405,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
)
def test_consume_barcode_tiff_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
@@ -432,18 +427,17 @@ class TestBarcode(DirectoriesMixin, TestCase):
and continue archiving the file as is.
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.jpg",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
shutil.copy(test_file, dst)
with self.assertLogs("paperless.tasks", level="WARNING") as cm:
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
self.assertIn("Success", tasks.consume_file(dst))
self.assertListEqual(
cm.output,
[
"WARNING:paperless.tasks:Unsupported file format for barcode reader: image/jpeg",
"WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
],
)
m.assert_called_once()
@@ -465,9 +459,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
the user uploads a supported image file, but without extension
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")