From cc25cbc02632913935dde27d0c6099538d02dd58 Mon Sep 17 00:00:00 2001 From: Lukas Metzger <1814751+loewexy@users.noreply.github.com> Date: Sun, 8 Sep 2024 01:11:36 +0200 Subject: [PATCH] Refactor: performance and storage optimization of barcode scanning (#7646) --------- Co-authored-by: Lukas Metzger <1814751+loewexy@users.noreply.github.com> Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- docs/configuration.md | 9 ++++++++ src/documents/barcodes.py | 46 +++++++++++++++++++++++++++++++++------ src/paperless/settings.py | 5 +++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 7172afcb3..d8ec27d2c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1289,6 +1289,15 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0. Defaults to "300" +#### [`PAPERLESS_CONSUMER_BARCODE_MAX_PAGES=`](#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES) {#PAPERLESS_CONSUMER_BARCODE_MAX_PAGES} + +: Because barcode detection is a computationally-intensive operation, this setting +limits barcode detection to the given number of leading pages. If your scanner has +a limit for the number of pages that can be scanned, it would be sensible to set this +as the limit here. + + Defaults to "0", allowing all pages to be checked for barcodes. 
+ #### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE} : Enables the detection of barcodes in the scanned document and diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 2e290a61b..97177cbf6 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -7,8 +7,8 @@ from typing import Optional from django.conf import settings from pdf2image import convert_from_path -from pdf2image.exceptions import PDFPageCountError from pikepdf import Page +from pikepdf import PasswordError from pikepdf import Pdf from PIL import Image @@ -231,13 +231,41 @@ class BarcodePlugin(ConsumeTaskPlugin): logger.debug("Scanning for barcodes using ZXING") try: - pages_from_path = convert_from_path( - self.pdf_file, - dpi=settings.CONSUMER_BARCODE_DPI, - output_folder=self.temp_dir.name, + # Read number of pages from pdf + with Pdf.open(self.pdf_file) as pdf: + num_of_pages = len(pdf.pages) + logger.debug(f"PDF has {num_of_pages} pages") + + # Get limit from configuration + barcode_max_pages = ( + num_of_pages + if settings.CONSUMER_BARCODE_MAX_PAGES == 0 + else settings.CONSUMER_BARCODE_MAX_PAGES ) - for current_page_number, page in enumerate(pages_from_path): + if barcode_max_pages < num_of_pages: # pragma: no cover + logger.debug( + f"Barcodes detection will be limited to the first {barcode_max_pages} pages", + ) + + # Loop over all pages + for current_page_number in range(min(num_of_pages, barcode_max_pages)): + logger.debug(f"Processing page {current_page_number}") + + # Convert page to image + page = convert_from_path( + self.pdf_file, + dpi=settings.CONSUMER_BARCODE_DPI, + output_folder=self.temp_dir.name, + first_page=current_page_number + 1, + last_page=current_page_number + 1, + )[0] + + # Remember filename, since it is lost by upscaling + page_filepath = Path(page.filename) + logger.debug(f"Image is at {page_filepath}") + + # Upscale image if configured factor = 
settings.CONSUMER_BARCODE_UPSCALE if factor > 1.0: logger.debug( @@ -248,14 +276,18 @@ class BarcodePlugin(ConsumeTaskPlugin): (int(round(x * factor)), (int(round(y * factor)))), ) + # Detect barcodes for barcode_value in reader(page): self.barcodes.append( Barcode(current_page_number, barcode_value), ) + # Delete temporary image file + page_filepath.unlink() + # Password protected files can't be checked # This is the exception raised for those - except PDFPageCountError as e: + except PasswordError as e: logger.warning( f"File is likely password protected, not checking for barcodes: {e}", ) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index ee6110732..ebe64ba9e 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -925,6 +925,11 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float( CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300) +CONSUMER_BARCODE_MAX_PAGES: Final[int] = __get_int( + "PAPERLESS_CONSUMER_BARCODE_MAX_PAGES", + 0, +) + CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE", )