Allows using pdf2image instead of pikepdf if desired

This commit is contained in:
Trenton H 2022-10-24 08:40:33 -07:00
parent 0a19ad4edb
commit f8ce6285df
4 changed files with 70 additions and 12 deletions

View File

@ -701,6 +701,17 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Defaults to false.
PAPERLESS_CONSUMER_USE_LEGACY_DETECTION=<bool>
Enables the legacy method of detecting barcodes. By default, images are
extracted directly from the PDF structure for barcode detection. If this
configuration value is set, images of the whole PDF page will be used instead.
This is a slower and more memory-intensive process, but may be required for
certain files, depending on how they are produced and how their images are encoded.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool>
Whether TIFF image files should be scanned for barcodes.
This will automatically convert any TIFF image(s) to PDFs for later

View File

@ -150,16 +150,20 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
try:
_pikepdf_barcode_scan(pdf_filepath)
except Exception as e:
logger.warning(
f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
)
# Reset this in case pikepdf got part way through
separator_page_numbers = []
if settings.CONSUMER_USE_LEGACY_DETECTION:
_pdf2image_barcode_scan(pdf_filepath)
else:
try:
_pikepdf_barcode_scan(pdf_filepath)
except Exception as e:
logger.warning(
f"Exception using pikepdf for barcodes,"
f" falling back to pdf2image: {e}",
)
# Reset this in case pikepdf got part way through
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
else:
logger.warning(

View File

@ -468,6 +468,41 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_barcode_splitter_legacy_fallback(self):
    """
    GIVEN:
        - File containing barcode
        - Legacy method of detection is enabled
    WHEN:
        - File is scanned for barcodes
    THEN:
        - Barcodes are properly detected
        - Splitting on the detected pages yields two saved documents
    """
    test_file = os.path.join(
        self.BARCODE_SAMPLE_DIR,
        "patch-code-t-middle.pdf",
    )
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)

    pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
        test_file,
    )

    # The scan is expected to hand back the input path unchanged for this PDF
    self.assertEqual(test_file, pdf_file)
    # assertGreater reports the actual count on failure, unlike assertTrue
    self.assertGreater(len(separator_page_numbers), 0)

    document_list = barcodes.separate_pages(test_file, separator_page_numbers)
    self.assertTrue(document_list)
    for document in document_list:
        barcodes.save_to_dir(document, target_dir=tempdir)

    # One separator in the middle of the sample should produce two output files
    for expected_name in (
        "patch-code-t-middle_document_0.pdf",
        "patch-code-t-middle_document_1.pdf",
    ):
        self.assertTrue(os.path.isfile(os.path.join(tempdir, expected_name)))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(

View File

@ -558,15 +558,23 @@ CONSUMER_IGNORE_PATTERNS = list(
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
CONSUMER_ENABLE_BARCODES = __get_boolean(
CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
)
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
CONSUMER_USE_LEGACY_DETECTION: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_USE_LEGACY_DETECTION",
"NO",
)
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_STRING",
"PATCHT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))