Allows using pdf2image instead of pikepdf if desired

2026-02-20 00:39:32 -06:00 · 2022-10-24 08:40:33 -07:00
parent 0a19ad4edb
commit f8ce6285df
4 changed files with 70 additions and 12 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -701,6 +701,17 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
    Defaults to false.
 PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
    Enables the legacy method of detecting barcodes.  By default, images are
    extracted directly from the PDF structure for barcode detection.  If this
    configuration value is set, images of the whole PDF page will be used instead.
    This is a slower and more memory-intensive process, but it may be required for
    certain files, depending on how they were produced and how their images are encoded.
    Defaults to false.
 PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
    Whether TIFF image files should be scanned for barcodes.
    This will automatically convert any TIFF image(s) to pdfs for later
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -150,16 +150,20 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
        if mime_type == "image/tiff":
            pdf_filepath = convert_from_tiff_to_pdf(filepath)
-        try:
+        if settings.CONSUMER_USE_LEGACY_DETECTION:
            _pikepdf_barcode_scan(pdf_filepath)
        except Exception as e:
            logger.warning(
                f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
            )
            # Reset this incase pikepdf got part way through
            separator_page_numbers = []
            _pdf2image_barcode_scan(pdf_filepath)
        else:
            try:
                _pikepdf_barcode_scan(pdf_filepath)
            except Exception as e:
                logger.warning(
                    f"Exception using pikepdf for barcodes,"
                    f" falling back to pdf2image: {e}",
                )
                # Reset this in case pikepdf got part way through
                separator_page_numbers = []
                _pdf2image_barcode_scan(pdf_filepath)
    else:
        logger.warning(
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -468,6 +468,41 @@ class TestBarcode(DirectoriesMixin, TestCase):
        self.assertTrue(os.path.isfile(target_file1))
        self.assertTrue(os.path.isfile(target_file2))
    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
    def test_barcode_splitter_legacy_fallback(self):
        """
        GIVEN:
            - File containing barcode
            - Legacy method of detection is enabled
        WHEN:
            - File is scanned for barcodes
        THEN:
            - Barcodes are properly detected
            - The file is split and both resulting documents are saved
        """
        test_file = os.path.join(
            self.BARCODE_SAMPLE_DIR,
            "patch-code-t-middle.pdf",
        )
        tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)

        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
            test_file,
        )

        # The scan is expected to report the input path unchanged
        self.assertEqual(test_file, pdf_file)
        # assertGreater reports the actual length on failure, unlike assertTrue
        self.assertGreater(len(separator_page_numbers), 0)

        document_list = barcodes.separate_pages(test_file, separator_page_numbers)
        self.assertTrue(document_list)
        for document in document_list:
            barcodes.save_to_dir(document, target_dir=tempdir)

        # Both split documents must exist in the target directory
        for index in range(2):
            target_file = os.path.join(
                tempdir,
                f"patch-code-t-middle_document_{index}.pdf",
            )
            self.assertTrue(
                os.path.isfile(target_file),
                f"Expected split document was not created: {target_file}",
            )
    @override_settings(CONSUMER_ENABLE_BARCODES=True)
    def test_consume_barcode_file(self):
        test_file = os.path.join(
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -558,15 +558,23 @@ CONSUMER_IGNORE_PATTERNS = list(
 CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
-CONSUMER_ENABLE_BARCODES = __get_boolean(
+CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_ENABLE_BARCODES",
 )
-CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
+CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
 )
-CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
+CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
    "NO",
 )
 CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
    "PAPERLESS_CONSUMER_BARCODE_STRING",
    "PATCHT",
 )
 OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))