diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 46a96061b..597f228f3 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -4,6 +4,7 @@ import shutil
 import tempfile
 from dataclasses import dataclass
 from functools import lru_cache
+from math import ceil
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -172,6 +173,24 @@ def scan_file_for_barcodes(
                         # raise an exception, triggering fallback
                         pillow_img = pdfimage.as_pil_image()
 
+                        # Scale the image down
+                        # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
+                        # TLDR: zbar has issues with larger images
+                        width, height = pillow_img.size
+                        if width > 512:
+                            scaler = ceil(width / 512)
+                            new_width = int(width / scaler)
+                            new_height = int(height / scaler)
+                            pillow_img = pillow_img.resize((new_width, new_height))
+
+                        width, height = pillow_img.size
+
+                        if height > 1024:
+                            scaler = ceil(height / 1024)
+                            new_width = int(width / scaler)
+                            new_height = int(height / scaler)
+                            pillow_img = pillow_img.resize((new_width, new_height))
+
                         for barcode_value in barcode_reader(pillow_img):
                             detected_barcodes.append(Barcode(page_num, barcode_value))
 
@@ -234,12 +253,13 @@ def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
     """
     Search the parsed barcodes for separators
     and returns a list of page numbers, which
-    separate the file into new files
+    separate the file into new files.
     """
     # filter all barcodes for the separator string
     # get the page numbers of the separating barcodes
 
-    return [bc.page for bc in barcodes if bc.is_separator]
+    # the set drops duplicate pages; sorting makes the page order deterministic
+    return sorted({bc.page for bc in barcodes if bc.is_separator})
 
 
 def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@@ -266,7 +286,7 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
         try:
             asn = int(asn_text)
         except ValueError as e:
-            logger.warn(f"Failed to parse ASN number because: {e}")
+            logger.warning(f"Failed to parse ASN number because: {e}")
 
     return asn
 
diff --git a/src/documents/tests/samples/barcodes/many-qr-codes.pdf b/src/documents/tests/samples/barcodes/many-qr-codes.pdf
new file mode 100644
index 000000000..f5d3f4a29
Binary files /dev/null and b/src/documents/tests/samples/barcodes/many-qr-codes.pdf differ
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index b2d0824ed..1dc2a88bc 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -447,6 +447,31 @@ class TestBarcode(DirectoriesMixin, TestCase):
         self.assertEqual(doc_barcode_info.pdf_path, test_file)
         self.assertListEqual(separator_page_numbers, [])
 
+    @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
+    def test_scan_file_for_separating_qr_barcodes(self):
+        """
+        GIVEN:
+            - Input PDF with certain QR codes that aren't detected at current size
+        WHEN:
+            - The input file is scanned for barcodes
+        THEN:
+            - QR codes are detected
+        """
+        test_file = os.path.join(
+            self.BARCODE_SAMPLE_DIR,
+            "many-qr-codes.pdf",
+        )
+
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
+            test_file,
+        )
+        separator_page_numbers = barcodes.get_separating_barcodes(
+            doc_barcode_info.barcodes,
+        )
+
+        self.assertGreater(len(doc_barcode_info.barcodes), 0)
+        self.assertListEqual(separator_page_numbers, [1])
+
     def test_separate_pages(self):
         test_file = os.path.join(
             self.BARCODE_SAMPLE_DIR,