Removes pikepdf based scanning, fixes up unit testing (+ commenting)

This commit is contained in:
Trenton H 2023-01-27 08:34:00 -08:00
parent 94ad290e14
commit 2ab77fbaf7
4 changed files with 352 additions and 169 deletions

View File

@ -4,7 +4,6 @@ import shutil
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from math import ceil
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from typing import Optional from typing import Optional
@ -12,10 +11,9 @@ from typing import Optional
import magic import magic
from django.conf import settings from django.conf import settings
from pdf2image import convert_from_path from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image from PIL import Image
from PIL import ImageSequence from PIL import ImageSequence
from pyzbar import pyzbar from pyzbar import pyzbar
@ -154,52 +152,15 @@ def scan_file_for_barcodes(
(page_number, barcode_text) tuples (page_number, barcode_text) tuples
""" """
def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
# Scale the image down
# See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
# TLDR: zbar has issues with larger images
width, height = pillow_img.size
if width > 1024:
scaler = ceil(width / 1024)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
width, height = pillow_img.size
if height > 2048:
scaler = ceil(height / 2048)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
for barcode_value in barcode_reader(pillow_img):
detected_barcodes.append(Barcode(page_num, barcode_value))
return detected_barcodes
def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = [] detected_barcodes = []
# use a temporary directory in case the file is too big to handle in memory # use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path: with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path) pages_from_path = convert_from_path(
pdf_filepath,
dpi=300,
output_folder=path,
)
for current_page_number, page in enumerate(pages_from_path): for current_page_number, page in enumerate(pages_from_path):
for barcode_value in barcode_reader(page): for barcode_value in barcode_reader(page):
detected_barcodes.append( detected_barcodes.append(
@ -219,27 +180,19 @@ def scan_file_for_barcodes(
# Scan by rasterizing each page with pdf2image; pikepdf-based image
# extraction is no longer attempted first
try: try:
barcodes = _pikepdf_barcode_scan(pdf_filepath) barcodes = _pdf2image_barcode_scan(pdf_filepath)
# Password protected files can't be checked # Password protected files can't be checked
except PasswordError as e: # This is the exception raised for those
except PDFPageCountError as e:
logger.warning( logger.warning(
f"File is likely password protected, not checking for barcodes: {e}", f"File is likely password protected, not checking for barcodes: {e}",
) )
# Handle pikepdf related image decoding issues with a fallback to page
# by page conversion to images in a temporary directory
except Exception as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue # This file is really borked, allow the consumption to continue
# but it may fail further on # but it may fail further on
except Exception as e: # pragma: no cover except Exception as e: # pragma: no cover
logger.warning( logger.warning(
f"Exception during barcode scanning: {e}", f"Exception during barcode scanning: {e}",
) )
else: else:
logger.warning( logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}", f"Unsupported file format for barcode reader: {str(mime_type)}",

View File

@ -3,7 +3,6 @@ import shutil
import tempfile import tempfile
from unittest import mock from unittest import mock
import pikepdf
from django.conf import settings from django.conf import settings
from django.test import override_settings from django.test import override_settings
from django.test import TestCase from django.test import TestCase
@ -23,13 +22,29 @@ class TestBarcode(DirectoriesMixin, TestCase):
BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
def test_barcode_reader(self): def test_barcode_reader_png(self):
"""
GIVEN:
- PNG file with separator barcode
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png") test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png")
img = Image.open(test_file) img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING) separator_barcode = settings.CONSUMER_BARCODE_STRING
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader2(self): def test_barcode_reader_pbm(self):
"""
GIVEN:
- Netpbm bitmap file with separator barcode
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t.pbm", "patch-code-t.pbm",
@ -38,25 +53,49 @@ class TestBarcode(DirectoriesMixin, TestCase):
separator_barcode = str(settings.CONSUMER_BARCODE_STRING) separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_distorsion(self): def test_barcode_reader_distortion_scratchy(self):
"""
GIVEN:
- Image containing high noise
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion.png", "barcode-39-PATCHT-distortion.png",
) )
img = Image.open(test_file) img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING) separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_distorsion2(self): def test_barcode_reader_distortion_stretched(self):
"""
GIVEN:
- Image with a stretched barcode
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion2.png", "barcode-39-PATCHT-distortion2.png",
) )
img = Image.open(test_file) img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING) separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_unreadable(self): def test_barcode_reader_unreadable(self):
"""
GIVEN:
- Image with a truly unreadable barcode
WHEN:
- Image is scanned for codes
THEN:
- No barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-unreadable.png", "barcode-39-PATCHT-unreadable.png",
@ -65,6 +104,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), []) self.assertEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_qr(self): def test_barcode_reader_qr(self):
"""
GIVEN:
- Image file with QR separator barcode
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"qr-code-PATCHT.png", "qr-code-PATCHT.png",
@ -74,6 +121,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_128(self): def test_barcode_reader_128(self):
"""
GIVEN:
- Image file with 128 style separator barcode
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-128-PATCHT.png", "barcode-128-PATCHT.png",
@ -83,11 +138,27 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self): def test_barcode_reader_no_barcode(self):
"""
GIVEN:
- Image file with no barcode
WHEN:
- Image is scanned for codes
THEN:
- No barcode is detected
"""
test_file = os.path.join(self.SAMPLE_DIR, "simple.png") test_file = os.path.join(self.SAMPLE_DIR, "simple.png")
img = Image.open(test_file) img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), []) self.assertListEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self): def test_barcode_reader_custom_separator(self):
"""
GIVEN:
- Image file with custom separator barcode value
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.png", "barcode-39-custom.png",
@ -96,6 +167,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_qr_separator(self): def test_barcode_reader_custom_qr_separator(self):
"""
GIVEN:
- Image file with custom separator barcode value as a QR code
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.png", "barcode-qr-custom.png",
@ -104,6 +183,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_128_separator(self): def test_barcode_reader_custom_128_separator(self):
"""
GIVEN:
- Image file with custom separator 128 barcode value
WHEN:
- Image is scanned for codes
THEN:
- The barcode is detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png", "barcode-128-custom.png",
@ -164,6 +251,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"])
def test_get_mime_type(self): def test_get_mime_type(self):
"""
GIVEN:
-
WHEN:
-
THEN:
-
"""
tiff_file = os.path.join( tiff_file = os.path.join(
self.SAMPLE_DIR, self.SAMPLE_DIR,
"simple.tiff", "simple.tiff",
@ -194,6 +289,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png") self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png")
def test_convert_from_tiff_to_pdf(self): def test_convert_from_tiff_to_pdf(self):
"""
GIVEN:
-
WHEN:
-
THEN:
-
"""
test_file = os.path.join( test_file = os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
"samples", "samples",
@ -207,6 +310,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(file_extension, ".pdf") self.assertEqual(file_extension, ".pdf")
def test_convert_error_from_pdf_to_pdf(self): def test_convert_error_from_pdf_to_pdf(self):
"""
GIVEN:
- PDF file given to the TIFF to PDF conversion
WHEN:
- Conversion is attempted
THEN:
- None is returned
"""
test_file = os.path.join( test_file = os.path.join(
self.SAMPLE_DIR, self.SAMPLE_DIR,
"simple.pdf", "simple.pdf",
@ -216,6 +327,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst)) self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst))
def test_scan_file_for_separating_barcodes(self): def test_scan_file_for_separating_barcodes(self):
"""
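GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 0 (zero indexed)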
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf", "patch-code-t.pdf",
@ -231,6 +350,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertListEqual(separator_page_numbers, [0]) self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes_none_present(self): def test_scan_file_for_separating_barcodes_none_present(self):
"""
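GIVEN:
- PDF file containing no separator barcodes
WHEN:
- File is scanned for barcodes
THEN:
- No separator pages are detected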
"""
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
doc_barcode_info = barcodes.scan_file_for_barcodes( doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file, test_file,
@ -242,7 +369,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, []) self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_separating_barcodes3(self): def test_scan_file_for_separating_barcodes_middle_page(self):
"""
GIVEN:
- PDF file containing a separator on page 1 (zero indexed)
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 1 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
@ -257,7 +392,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [1]) self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes4(self): def test_scan_file_for_separating_barcodes_multiple_pages(self):
"""
GIVEN:
- PDF file containing a separator on pages 2 and 5 (zero indexed)
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on pages 2 and 5 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"several-patcht-codes.pdf", "several-patcht-codes.pdf",
@ -272,7 +415,16 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [2, 5]) self.assertListEqual(separator_page_numbers, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self): def test_scan_file_for_separating_barcodes_upside_down(self):
"""
GIVEN:
- PDF file containing a separator on page 1 (zero indexed)
- The barcode is upside down
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 1 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle_reverse.pdf", "patch-code-t-middle_reverse.pdf",
@ -287,66 +439,6 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, [1]) self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_barcodes_pillow_transcode_error(self):
"""
GIVEN:
- A PDF containing an image which cannot be transcoded to a PIL image
WHEN:
- The image tries to be transcoded to a PIL image, but fails
THEN:
- The barcode reader is still called
"""
def _build_device_n_pdf(self, save_path: str):
# Based on the pikepdf tests
# https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
pdf = pikepdf.new()
pdf.add_blank_page(page_size=(72, 72))
imobj = pikepdf.Stream(
pdf,
bytes(range(0, 256)),
BitsPerComponent=8,
ColorSpace=pikepdf.Array(
[
pikepdf.Name.DeviceN,
pikepdf.Array([pikepdf.Name.Black]),
pikepdf.Name.DeviceCMYK,
pikepdf.Stream(
pdf,
b"{0 0 0 4 -1 roll}", # Colorspace conversion function
FunctionType=4,
Domain=[0.0, 1.0],
Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
),
],
),
Width=16,
Height=16,
Type=pikepdf.Name.XObject,
Subtype=pikepdf.Name.Image,
)
pim = pikepdf.PdfImage(imobj)
self.assertEqual(pim.mode, "DeviceN")
self.assertTrue(pim.is_device_n)
pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
pdf.pages[0].Resources = pikepdf.Dictionary(
XObject=pikepdf.Dictionary(Im0=imobj),
)
pdf.save(save_path)
with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
# Build an offending file
_build_device_n_pdf(self, str(device_n_pdf.name))
with mock.patch("documents.barcodes.barcode_reader") as reader:
reader.return_value = list()
_ = barcodes.scan_file_for_barcodes(
str(device_n_pdf.name),
)
reader.assert_called()
def test_scan_file_for_separating_barcodes_fax_decode(self): def test_scan_file_for_separating_barcodes_fax_decode(self):
""" """
GIVEN: GIVEN:
@ -371,6 +463,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertListEqual(separator_page_numbers, [1]) self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self): def test_scan_file_for_separating_qr_barcodes(self):
"""
GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
- The barcode is a QR code
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 0 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-qr.pdf", "patch-code-t-qr.pdf",
@ -387,6 +488,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self): def test_scan_file_for_separating_custom_barcodes(self):
"""
GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
- The barcode separation value is customized
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 0 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf", "barcode-39-custom.pdf",
@ -403,6 +513,16 @@ class TestBarcode(DirectoriesMixin, TestCase):
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self): def test_scan_file_for_separating_custom_qr_barcodes(self):
"""
GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
- The barcode separation value is customized
- The barcode is a QR code
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 0 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.pdf", "barcode-qr-custom.pdf",
@ -419,6 +539,16 @@ class TestBarcode(DirectoriesMixin, TestCase):
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self): def test_scan_file_for_separating_custom_128_barcodes(self):
"""
GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
- The barcode separation value is customized
- The barcode is a 128 code
WHEN:
- File is scanned for barcodes
THEN:
- Barcode is detected on page 0 (zero indexed)
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.pdf", "barcode-128-custom.pdf",
@ -434,6 +564,16 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertListEqual(separator_page_numbers, [0]) self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self): def test_scan_file_for_separating_wrong_qr_barcodes(self):
"""
GIVEN:
- PDF file containing a separator on page 0 (zero indexed)
- The barcode value is customized
- The separation value is NOT customized
WHEN:
- File is scanned for barcodes
THEN:
- No split pages are detected
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf", "barcode-39-custom.pdf",
@ -474,13 +614,21 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertListEqual(separator_page_numbers, [1]) self.assertListEqual(separator_page_numbers, [1])
def test_separate_pages(self): def test_separate_pages(self):
"""
GIVEN:
- Input PDF which splits into 2 documents after separation
WHEN:
- The input file is separated at the barcode
THEN:
- Two new documents are produced
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
) )
pages = barcodes.separate_pages(test_file, [1]) documents = barcodes.separate_pages(test_file, [1])
self.assertEqual(len(pages), 2) self.assertEqual(len(documents), 2)
def test_separate_pages_double_code(self): def test_separate_pages_double_code(self):
""" """
@ -493,8 +641,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
""" """
test_file = os.path.join( test_file = os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
"samples", self.BARCODE_SAMPLE_DIR,
"barcodes",
"patch-code-t-double.pdf", "patch-code-t-double.pdf",
) )
pages = barcodes.separate_pages(test_file, [1, 2]) pages = barcodes.separate_pages(test_file, [1, 2])
@ -502,6 +649,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(len(pages), 2) self.assertEqual(len(pages), 2)
def test_separate_pages_no_list(self): def test_separate_pages_no_list(self):
"""
GIVEN:
- Input file to separate
WHEN:
- No separation pages are provided
THEN:
- No new documents are produced
- A warning is logged
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
@ -517,16 +673,32 @@ class TestBarcode(DirectoriesMixin, TestCase):
) )
def test_save_to_dir(self): def test_save_to_dir(self):
"""
GIVEN:
- File to save to a directory
WHEN:
- The file is saved
THEN:
- The file exists
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf", "patch-code-t.pdf",
) )
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR)
barcodes.save_to_dir(test_file, target_dir=tempdir) target_file = os.path.join(settings.SCRATCH_DIR, "patch-code-t.pdf")
target_file = os.path.join(tempdir, "patch-code-t.pdf")
self.assertTrue(os.path.isfile(target_file)) self.assertTrue(os.path.isfile(target_file))
def test_save_to_dir2(self): def test_save_to_dir_not_existing(self):
"""
GIVEN:
- File to save to a directory
- The directory doesn't exist
WHEN:
- The file is saved
THEN:
- The file exists
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf", "patch-code-t.pdf",
@ -534,7 +706,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
nonexistingdir = "/nowhere" nonexistingdir = "/nowhere"
if os.path.isdir(nonexistingdir): if os.path.isdir(nonexistingdir):
self.fail("non-existing dir exists") self.fail("non-existing dir exists")
else:
with self.assertLogs("paperless.barcodes", level="WARNING") as cm: with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
barcodes.save_to_dir(test_file, target_dir=nonexistingdir) barcodes.save_to_dir(test_file, target_dir=nonexistingdir)
self.assertEqual( self.assertEqual(
@ -544,22 +716,41 @@ class TestBarcode(DirectoriesMixin, TestCase):
], ],
) )
def test_save_to_dir3(self): def test_save_to_dir_given_name(self):
"""
GIVEN:
- File to save to a directory
- There is a name override
WHEN:
- The file is saved
THEN:
- The file exists
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf", "patch-code-t.pdf",
) )
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) barcodes.save_to_dir(
barcodes.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) test_file,
target_file = os.path.join(tempdir, "newname.pdf") newname="newname.pdf",
target_dir=settings.SCRATCH_DIR,
)
target_file = os.path.join(settings.SCRATCH_DIR, "newname.pdf")
self.assertTrue(os.path.isfile(target_file)) self.assertTrue(os.path.isfile(target_file))
def test_barcode_splitter(self): def test_barcode_splitter(self):
"""
GIVEN:
- Input file containing barcodes
WHEN:
- Input file is split on barcodes
THEN:
- Correct number of files produced
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
) )
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
doc_barcode_info = barcodes.scan_file_for_barcodes( doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file, test_file,
@ -572,18 +763,33 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertTrue(len(separator_page_numbers) > 0) self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers) document_list = barcodes.separate_pages(test_file, separator_page_numbers)
self.assertTrue(document_list) self.assertGreater(len(document_list), 0)
for document in document_list:
barcodes.save_to_dir(document, target_dir=tempdir)
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") for document in document_list:
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR)
target_file1 = os.path.join(
settings.SCRATCH_DIR,
"patch-code-t-middle_document_0.pdf",
)
target_file2 = os.path.join(
settings.SCRATCH_DIR,
"patch-code-t-middle_document_1.pdf",
)
self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2)) self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True) @override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self): def test_consume_barcode_file(self):
"""
GIVEN:
- Input file with barcodes given to consume task
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf", "patch-code-t-middle.pdf",
@ -600,6 +806,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
CONSUMER_BARCODE_TIFF_SUPPORT=True, CONSUMER_BARCODE_TIFF_SUPPORT=True,
) )
def test_consume_barcode_tiff_file(self): def test_consume_barcode_tiff_file(self):
"""
GIVEN:
- TIFF image containing barcodes
WHEN:
- Consume task returns
THEN:
- The file was split
"""
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff", "patch-code-t-middle.tiff",
@ -617,11 +831,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
@mock.patch("documents.consumer.Consumer.try_consume_file") @mock.patch("documents.consumer.Consumer.try_consume_file")
def test_consume_barcode_unsupported_jpg_file(self, m): def test_consume_barcode_unsupported_jpg_file(self, m):
""" """
This test assumes barcode and TIFF support are enabled and GIVEN:
the user uploads an unsupported image file (e.g. jpg) - JPEG image as input
WHEN:
The function shouldn't try to scan for separating barcodes - Consume task returns
and continue archiving the file as is. THEN:
- Barcode reader reported warning
- Consumption continued with the file
""" """
test_file = os.path.join( test_file = os.path.join(
self.SAMPLE_DIR, self.SAMPLE_DIR,
@ -629,8 +845,10 @@ class TestBarcode(DirectoriesMixin, TestCase):
) )
dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
shutil.copy(test_file, dst) shutil.copy(test_file, dst)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm: with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
self.assertIn("Success", tasks.consume_file(dst)) self.assertIn("Success", tasks.consume_file(dst))
self.assertListEqual( self.assertListEqual(
cm.output, cm.output,
[ [
@ -652,8 +870,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
) )
def test_consume_barcode_supported_no_extension_file(self): def test_consume_barcode_supported_no_extension_file(self):
""" """
This test assumes barcode and TIFF support are enabled and GIVEN:
the user uploads a supported image file, but without extension - TIFF image containing barcodes
- TIFF file is given without extension
WHEN:
- Consume task returns
THEN:
- The file was split
""" """
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,
@ -669,11 +892,10 @@ class TestBarcode(DirectoriesMixin, TestCase):
""" """
GIVEN: GIVEN:
- Password protected PDF - Password protected PDF
- pikepdf based scanning
WHEN: WHEN:
- File is scanned for barcode - File is scanned for barcode
THEN: THEN:
- Scanning handles the exception without exception - Scanning handles the exception without crashing
""" """
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
doc_barcode_info = barcodes.scan_file_for_barcodes( doc_barcode_info = barcodes.scan_file_for_barcodes(
@ -808,7 +1030,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
@override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
def test_asn_too_large(self): def test_asn_too_large(self):
"""
GIVEN:
- ASN from barcode enabled
- Barcode contains too large an ASN value
WHEN:
- ASN from barcode checked for correctness
THEN:
- Exception is raised regarding size limits
"""
src = os.path.join( src = os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
"samples", "samples",