Refactor: performance and storage optimization of barcode scanning (#7646)

---------

Co-authored-by: Lukas Metzger <1814751+loewexy@users.noreply.github.com>
Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Lukas Metzger 2024-09-08 01:11:36 +02:00 committed by GitHub
parent e98d52830f
commit cc25cbc026
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 53 additions and 7 deletions

View File

@ -1289,6 +1289,15 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0.
Defaults to "300"
#### [`PAPERLESS_CONSUMER_BARCODE_MAX_PAGES=<int>`](#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES) {#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES}
: Because barcode detection is a computationally-intensive operation, this setting
limits barcode detection to the first pages of a document, up to the given number.
If your scanner has a limit on the number of pages it can scan in one go, it is
sensible to use that same limit here.
Defaults to "0", allowing all pages to be checked for barcodes.
#### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=<bool>`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE}
: Enables the detection of barcodes in the scanned document and

View File

@ -7,8 +7,8 @@ from typing import Optional
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from PIL import Image
@ -231,13 +231,41 @@ class BarcodePlugin(ConsumeTaskPlugin):
logger.debug("Scanning for barcodes using ZXING")
try:
pages_from_path = convert_from_path(
self.pdf_file,
dpi=settings.CONSUMER_BARCODE_DPI,
output_folder=self.temp_dir.name,
# Read number of pages from pdf
with Pdf.open(self.pdf_file) as pdf:
num_of_pages = len(pdf.pages)
logger.debug(f"PDF has {num_of_pages} pages")
# Get limit from configuration
barcode_max_pages = (
num_of_pages
if settings.CONSUMER_BARCODE_MAX_PAGES == 0
else settings.CONSUMER_BARCODE_MAX_PAGES
)
for current_page_number, page in enumerate(pages_from_path):
if barcode_max_pages < num_of_pages: # pragma: no cover
logger.debug(
f"Barcodes detection will be limited to the first {barcode_max_pages} pages",
)
# Loop over all pages to be scanned
for current_page_number in range(min(num_of_pages, barcode_max_pages)):
logger.debug(f"Processing page {current_page_number}")
# Convert page to image
page = convert_from_path(
self.pdf_file,
dpi=settings.CONSUMER_BARCODE_DPI,
output_folder=self.temp_dir.name,
first_page=current_page_number + 1,
last_page=current_page_number + 1,
)[0]
# Remember filename, since it is lost by upscaling
page_filepath = Path(page.filename)
logger.debug(f"Image is at {page_filepath}")
# Upscale image if configured
factor = settings.CONSUMER_BARCODE_UPSCALE
if factor > 1.0:
logger.debug(
@ -248,14 +276,18 @@ class BarcodePlugin(ConsumeTaskPlugin):
(int(round(x * factor)), (int(round(y * factor)))),
)
# Detect barcodes
for barcode_value in reader(page):
self.barcodes.append(
Barcode(current_page_number, barcode_value),
)
# Delete temporary image file
page_filepath.unlink()
# Password protected files can't be checked
# This is the exception raised for those
except PDFPageCountError as e:
except PasswordError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)

View File

@ -925,6 +925,11 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
# DPI passed to the PDF-to-image conversion used for barcode scanning (default 300).
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
# Upper bound on how many leading pages of a document are scanned for barcodes.
# The default of 0 means "no limit": every page of the document is checked.
CONSUMER_BARCODE_MAX_PAGES: Final[int] = __get_int(
"PAPERLESS_CONSUMER_BARCODE_MAX_PAGES",
0,
)
# Boolean switch read from PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE; the fallback
# value is decided by __get_boolean, whose definition is not visible here.
CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE",
)