Rescales images from PDFs so zbar can better find barcodes in them

This commit is contained in:
Trenton H 2023-01-18 06:56:51 -08:00
parent a0c1c48dca
commit 4195d5746f
3 changed files with 47 additions and 3 deletions

View File

@ -4,6 +4,7 @@ import shutil
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from math import ceil
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from typing import Optional from typing import Optional
@ -172,6 +173,24 @@ def scan_file_for_barcodes(
# raise an exception, triggering fallback # raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image() pillow_img = pdfimage.as_pil_image()
# Scale the image down
# See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
# TLDR: zbar has issues with larger images
width, height = pillow_img.size
if width > 512:
scaler = ceil(width / 512)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
width, height = pillow_img.size
if height > 1024:
scaler = ceil(height / 1024)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
for barcode_value in barcode_reader(pillow_img): for barcode_value in barcode_reader(pillow_img):
detected_barcodes.append(Barcode(page_num, barcode_value)) detected_barcodes.append(Barcode(page_num, barcode_value))
@ -234,12 +253,12 @@ def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
""" """
Search the parsed barcodes for separators Search the parsed barcodes for separators
and returns a list of page numbers, which and returns a list of page numbers, which
separate the file into new files separate the file into new files.
""" """
# filter all barcodes for the separator string # filter all barcodes for the separator string
# get the page numbers of the separating barcodes # get the page numbers of the separating barcodes
return [bc.page for bc in barcodes if bc.is_separator] return list({bc.page for bc in barcodes if bc.is_separator})
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@ -266,7 +285,7 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
try: try:
asn = int(asn_text) asn = int(asn_text)
except ValueError as e: except ValueError as e:
logger.warn(f"Failed to parse ASN number because: {e}") logger.warning(f"Failed to parse ASN number because: {e}")
return asn return asn

Binary file not shown.

View File

@ -447,6 +447,31 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertListEqual(separator_page_numbers, []) self.assertListEqual(separator_page_numbers, [])
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
def test_scan_file_for_separating_qr_barcodes(self):
"""
GIVEN:
- Input PDF with certain QR codes that aren't detected at current size
WHEN:
- The input file is scanned for barcodes
THEN:
- QR codes are detected
"""
# Sample document located under the barcode sample directory
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"many-qr-codes.pdf",
)
# Scan the PDF; the result exposes the detected barcodes via .barcodes
doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
# Reduce the detected barcodes to the page numbers that act as separators
separator_page_numbers = barcodes.get_separating_barcodes(
doc_barcode_info.barcodes,
)
# At least one barcode must be found, and the only separator page number is 1
self.assertGreater(len(doc_barcode_info.barcodes), 0)
self.assertListEqual(separator_page_numbers, [1])
def test_separate_pages(self): def test_separate_pages(self):
test_file = os.path.join( test_file = os.path.join(
self.BARCODE_SAMPLE_DIR, self.BARCODE_SAMPLE_DIR,