diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 8d114d68c..79fa2746f 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -1,12 +1,11 @@ import logging -import os import shutil import tempfile from dataclasses import dataclass -from functools import lru_cache from pathlib import Path from subprocess import run from typing import Dict +from typing import Final from typing import List from typing import Optional @@ -18,13 +17,11 @@ from pikepdf import Page from pikepdf import Pdf from PIL import Image +from documents.data_models import DocumentSource + logger = logging.getLogger("paperless.barcodes") -class BarcodeImageFormatError(Exception): - pass - - @dataclass(frozen=True) class Barcode: """ @@ -51,56 +48,72 @@ class Barcode: return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX) -@dataclass -class DocumentBarcodeInfo: - """ - Describes a single document's barcode status - """ +class BarcodeReader: + def __init__(self, filepath: Path, mime_type: str) -> None: + self.file: Final[Path] = filepath + self.mime: Final[str] = mime_type + self.pdf_file: Path = self.file + self.barcodes: List[Barcode] = [] + self.temp_dir: Optional[Path] = None - pdf_path: Path - barcodes: List[Barcode] + if settings.CONSUMER_BARCODE_TIFF_SUPPORT: + self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"} + else: + self.SUPPORTED_FILE_MIMES = {"application/pdf"} + def __enter__(self): + if self.supported_mime_type: + self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes") + return self -@lru_cache(maxsize=8) -def supported_file_type(mime_type: str) -> bool: - """ - Determines if the file is valid for barcode - processing, based on MIME type and settings + def __exit__(self, exc_type, exc_val, exc_tb): + if self.temp_dir is not None: + self.temp_dir.cleanup() + self.temp_dir = None - :return: True if the file is supported, False otherwise - """ - supported_mime = ["application/pdf"] - if 
settings.CONSUMER_BARCODE_TIFF_SUPPORT: - supported_mime += ["image/tiff"] + @property + def supported_mime_type(self) -> bool: + """ + Return True if the given mime type is supported for barcodes, false otherwise + """ + return self.mime in self.SUPPORTED_FILE_MIMES - return mime_type in supported_mime + @property + def asn(self) -> Optional[int]: + """ + Search the parsed barcodes for any ASNs. + The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + is considered the ASN to be used. + Returns the detected ASN (or None) + """ + asn = None + # Ensure the barcodes have been read + self.detect() -def barcode_reader(image: Image) -> List[str]: - """ - Read any barcodes contained in image - Returns a list containing all found barcodes - """ - barcodes = [] + # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + asn_text = next( + (x.value for x in self.barcodes if x.is_asn), + None, + ) - if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": - logger.debug("Scanning for barcodes using PYZBAR") - from pyzbar import pyzbar + if asn_text: + logger.debug(f"Found ASN Barcode: {asn_text}") + # remove the prefix and remove whitespace + asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() - # Decode the barcode image - detected_barcodes = pyzbar.decode(image) + # now, try parsing the ASN number + try: + asn = int(asn_text) + except ValueError as e: + logger.warning(f"Failed to parse ASN number because: {e}") + + return asn + + @staticmethod + def read_barcodes_zxing(image: Image) -> List[str]: + barcodes = [] - if detected_barcodes: - # Traverse through all the detected barcodes in image - for barcode in detected_barcodes: - if barcode.data: - decoded_barcode = barcode.data.decode("utf-8") - barcodes.append(decoded_barcode) - logger.debug( - f"Barcode of type {str(barcode.type)} found: {decoded_barcode}", - ) - elif settings.CONSUMER_BARCODE_SCANNER == "ZXING": - logger.debug("Scanning for barcodes using ZXING") import zxingcpp 
detected_barcodes = zxingcpp.read_barcodes(image) @@ -111,74 +124,92 @@ def barcode_reader(image: Image) -> List[str]: f"Barcode of type {str(barcode.format)} found: {barcode.text}", ) - return barcodes + return barcodes + @staticmethod + def read_barcodes_pyzbar(image: Image) -> List[str]: + barcodes = [] -def convert_from_tiff_to_pdf(filepath: Path) -> Path: - """ - converts a given TIFF image file to pdf into a temporary directory. + from pyzbar import pyzbar - Returns the new pdf file. - """ - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - # use old file name with pdf extension - newpath = Path(tempdir) / Path(filepath.name).with_suffix(".pdf") + # Decode the barcode image + detected_barcodes = pyzbar.decode(image) - with Image.open(filepath) as im: - has_alpha_layer = im.mode in ("RGBA", "LA") - if has_alpha_layer: - run( - [ - settings.CONVERT_BINARY, - "-alpha", - "off", - filepath, - filepath, - ], - ) - with filepath.open("rb") as img_file, newpath.open("wb") as pdf_file: - pdf_file.write(img2pdf.convert(img_file)) - return newpath + # Traverse through all the detected barcodes in image + for barcode in detected_barcodes: + if barcode.data: + decoded_barcode = barcode.data.decode("utf-8") + barcodes.append(decoded_barcode) + logger.debug( + f"Barcode of type {str(barcode.type)} found: {decoded_barcode}", + ) + return barcodes -def scan_file_for_barcodes( - filepath: Path, - mime_type: str, -) -> DocumentBarcodeInfo: - """ - Scan the provided pdf file for any barcodes - Returns a PDF filepath and a list of - (page_number, barcode_text) tuples - """ + def convert_from_tiff_to_pdf(self): + """ + May convert a TIFF image into a PDF, if the input is a TIFF + """ + # Nothing to do, pdf_file is already assigned correctly + if self.mime != "image/tiff": + return - def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: - detected_barcodes = [] - # use a temporary directory in case the file is too big to handle in memory - with 
tempfile.TemporaryDirectory() as path: - pages_from_path = convert_from_path( - pdf_filepath, - dpi=300, - output_folder=path, + with Image.open(self.file) as im: + has_alpha_layer = im.mode in ("RGBA", "LA") + if has_alpha_layer: + # Note the save into the temp folder, so as not to trigger a new + # consume + scratch_image = Path(self.temp_dir.name) / Path(self.file.name) + run( + [ + settings.CONVERT_BINARY, + "-alpha", + "off", + self.file, + scratch_image, + ], ) + else: + # Not modifying the original, safe to use in place + scratch_image = self.file + + self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix( + ".pdf", + ) + + with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file: + pdf_file.write(img2pdf.convert(img_file)) + + def detect(self) -> None: + """ + Scan all pages of the PDF as images, updating barcodes and the pages + found on as we go + """ + # Bail if barcodes already exist + if self.barcodes: + return + + # Choose the library for reading + if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": + reader = self.read_barcodes_pyzbar + logger.debug("Scanning for barcodes using PYZBAR") + else: + reader = self.read_barcodes_zxing + logger.debug("Scanning for barcodes using ZXING") + + try: + pages_from_path = convert_from_path( + self.pdf_file, + dpi=300, + output_folder=self.temp_dir.name, + ) + for current_page_number, page in enumerate(pages_from_path): - for barcode_value in barcode_reader(page): - detected_barcodes.append( + for barcode_value in reader(page): + self.barcodes.append( Barcode(current_page_number, barcode_value), ) - return detected_barcodes - pdf_filepath = None - barcodes = [] - - if supported_file_type(mime_type): - pdf_filepath = filepath - if mime_type == "image/tiff": - pdf_filepath = convert_from_tiff_to_pdf(filepath) - - # Always try pikepdf first, it's usually fine, faster and - # uses less memory - try: - barcodes = _pdf2image_barcode_scan(pdf_filepath) # Password protected files 
can't be checked # This is the exception raised for those except PDFPageCountError as e: @@ -191,141 +222,130 @@ def scan_file_for_barcodes( logger.warning( f"Exception during barcode scanning: {e}", ) - else: - logger.warning( - f"Unsupported file format for barcode reader: {str(mime_type)}", - ) - return DocumentBarcodeInfo(pdf_filepath, barcodes) + def get_separation_pages(self) -> Dict[int, bool]: + """ + Search the parsed barcodes for separators and returns a dict of page + numbers, which separate the file into new files, together with the + information whether to keep the page. + """ + # filter all barcodes for the separator string + # get the page numbers of the separating barcodes + separator_pages = {bc.page: False for bc in self.barcodes if bc.is_separator} + if not settings.CONSUMER_ENABLE_ASN_BARCODE: + return separator_pages + # add the page numbers of the ASN barcodes + # (except for first page, that might lead to infinite loops). + return { + **separator_pages, + **{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0}, + } -def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]: - """ - Search the parsed barcodes for separators - and returns a dict of page numbers, which - separate the file into new files, together - with the information whether to keep the page. - """ - # filter all barcodes for the separator string - # get the page numbers of the separating barcodes - separator_pages = {bc.page: False for bc in barcodes if bc.is_separator} - if not settings.CONSUMER_ENABLE_ASN_BARCODE: - return separator_pages + def separate_pages(self, pages_to_split_on: Dict[int, bool]) -> List[Path]: + """ + Separate the provided pdf file on the pages_to_split_on. + The pages which are defined by the keys in pages_to_split_on + will be removed if the corresponding value is false. + Returns a list of (temporary) filepaths to consume. + These will need to be deleted later. 
+ """ - # add the page numbers of the ASN barcodes - # (except for first page, that might lead to infinite loops). - return { - **separator_pages, - **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0}, - } + document_paths = [] + fname = self.file.with_suffix("").name + with Pdf.open(self.pdf_file) as input_pdf: + # Start with an empty document + current_document: List[Page] = [] + # A list of documents, ie a list of lists of pages + documents: List[List[Page]] = [current_document] + for idx, page in enumerate(input_pdf.pages): + # Keep building the new PDF as long as it is not a + # separator index + if idx not in pages_to_split_on: + current_document.append(page) + continue -def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: - """ - Search the parsed barcodes for any ASNs. - The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - is considered the ASN to be used. - Returns the detected ASN (or None) - """ - asn = None + # This is a split index + # Start a new destination page listing + logger.debug(f"Starting new document at idx {idx}") + current_document = [] + documents.append(current_document) + keep_page = pages_to_split_on[idx] + if keep_page: + # Keep the page + # (new document is started by asn barcode) + current_document.append(page) - # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - asn_text = next( - (x.value for x in barcodes if x.is_asn), - None, - ) + documents = [x for x in documents if len(x)] - if asn_text: - logger.debug(f"Found ASN Barcode: {asn_text}") - # remove the prefix and remove whitespace - asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() + logger.debug(f"Split into {len(documents)} new documents") - # now, try parsing the ASN number - try: - asn = int(asn_text) - except ValueError as e: - logger.warning(f"Failed to parse ASN number because: {e}") + # Write the new documents out + for doc_idx, document in enumerate(documents): + dst = Pdf.new() + 
dst.pages.extend(document) - return asn + output_filename = f"{fname}_document_{doc_idx}.pdf" + logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages") + savepath = Path(self.temp_dir.name) / output_filename + with open(savepath, "wb") as out: + dst.save(out) + document_paths.append(savepath) -def separate_pages(filepath: Path, pages_to_split_on: Dict[int, bool]) -> List[Path]: - """ - Separate the provided pdf file on the pages_to_split_on. - The pages which are defined by the keys in page_numbers - will be removed if the corresponding value is false. - Returns a list of (temporary) filepaths to consume. - These will need to be deleted later. - """ + return document_paths - document_paths = [] + def separate( + self, + source: DocumentSource, + override_name: Optional[str] = None, + ) -> bool: + """ + Separates the document, based on barcodes and configuration, creating new + documents as required in the appropriate location. - if not pages_to_split_on: - logger.warning("No pages to split on!") - return document_paths + Returns True if a split happened, False otherwise + """ + # Do nothing + if not self.supported_mime_type: + logger.warning(f"Unsupported file format for barcode reader: {self.mime}") + return False - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)) - fname = filepath.with_suffix("").name - pdf = Pdf.open(filepath) + # Does nothing unless needed + self.convert_from_tiff_to_pdf() - # Start with an empty document - current_document: List[Page] = [] - # A list of documents, ie a list of lists of pages - documents: List[List[Page]] = [current_document] + # Actually read the codes, if any + self.detect() - for idx, page in enumerate(pdf.pages): - # Keep building the new PDF as long as it is not a - # separator index - if idx not in pages_to_split_on: - current_document.append(page) - continue + separator_pages = self.get_separation_pages() - # This is a split index - # 
Start a new destination page listing - logger.debug(f"Starting new document at idx {idx}") - current_document = [] - documents.append(current_document) - keep_page = pages_to_split_on[idx] - if keep_page: - # Keep the page - # (new document is started by asn barcode) - current_document.append(page) + # Also do nothing + if not separator_pages: + logger.warning("No pages to split on!") + return False - documents = [x for x in documents if len(x)] + # Create the split documents + doc_paths = self.separate_pages(separator_pages) - logger.debug(f"Split into {len(documents)} new documents") + # Save the new documents to correct folder + if source != DocumentSource.ConsumeFolder: + # The given file is somewhere in SCRATCH_DIR, + # and new documents must be moved to the CONSUMPTION_DIR + # for the consumer to notice them + save_to_dir = settings.CONSUMPTION_DIR + else: + # The given file is somewhere in CONSUMPTION_DIR, + # and may be some levels down for recursive tagging + # so use the file's parent to preserve any metadata + save_to_dir = self.file.parent - # Write the new documents out - for doc_idx, document in enumerate(documents): - dst = Pdf.new() - dst.pages.extend(document) - - output_filename = f"{fname}_document_{doc_idx}.pdf" - - logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages") - savepath = tempdir / output_filename - with open(savepath, "wb") as out: - dst.save(out) - document_paths.append(savepath) - - return document_paths - - -def save_to_dir( - filepath: Path, - newname: str = None, - target_dir: Path = settings.CONSUMPTION_DIR, -): - """ - Copies filepath to target_dir. - Optionally rename the file. 
- """ - if filepath.is_file() and target_dir.is_dir(): - dest = target_dir - if newname is not None: - dest = dest / newname - shutil.copy(filepath, dest) - logging.debug(f"saved {str(filepath)} to {str(dest)}") - else: - logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") + for idx, document_path in enumerate(doc_paths): + if override_name is not None: + newname = f"{str(idx)}_{override_name}" + dest = save_to_dir / newname + else: + dest = save_to_dir + logger.info(f"Saving {document_path} to {dest}") + shutil.copy2(document_path, dest) + return True diff --git a/src/documents/tasks.py b/src/documents/tasks.py index f51fa9828..1603a1359 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -16,16 +16,15 @@ from filelock import FileLock from redis.exceptions import ConnectionError from whoosh.writing import AsyncWriter -from documents import barcodes from documents import index from documents import sanity_checker +from documents.barcodes import BarcodeReader from documents.classifier import DocumentClassifier from documents.classifier import load_classifier from documents.consumer import Consumer from documents.consumer import ConsumerError from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides -from documents.data_models import DocumentSource from documents.file_handling import create_source_path_directory from documents.file_handling import generate_unique_filename from documents.models import Correspondent @@ -96,95 +95,39 @@ def consume_file( # read all barcodes in the current document if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: - doc_barcode_info = barcodes.scan_file_for_barcodes( - input_doc.original_file, - input_doc.mime_type, - ) + with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: + if settings.CONSUMER_ENABLE_BARCODES and reader.separate( + input_doc.source, + overrides.filename, + ): + # notify the sender, 
otherwise the progress bar + # in the UI stays stuck + payload = { + "filename": overrides.filename or input_doc.original_file.name, + "task_id": None, + "current_progress": 100, + "max_progress": 100, + "status": "SUCCESS", + "message": "finished", + } + try: + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + except ConnectionError as e: + logger.warning(f"ConnectionError on status send: {str(e)}") + # consuming stops here, since the original document with + # the barcodes has been split and will be consumed separately - # split document by separator pages, if enabled - if settings.CONSUMER_ENABLE_BARCODES: - separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes) + input_doc.original_file.unlink() + return "File successfully split" - if len(separators) > 0: - logger.debug( - f"Pages with separators found in: {input_doc.original_file}", - ) - document_list = barcodes.separate_pages( - doc_barcode_info.pdf_path, - separators, - ) - - if document_list: - # If the file is an upload, it's in the scratch directory - # Move it to consume directory to be picked up - # Otherwise, use the current parent to keep possible tags - # from subdirectories - if input_doc.source != DocumentSource.ConsumeFolder: - save_to_dir = settings.CONSUMPTION_DIR - else: - # Note this uses the original file, because it's in the - # consume folder already and may include additional path - # components for tagging - # the .path is somewhere in scratch in this case - save_to_dir = input_doc.original_file.parent - - for n, document in enumerate(document_list): - # save to consumption dir - # rename it to the original filename with number prefix - if overrides.filename is not None: - newname = f"{str(n)}_{overrides.filename}" - else: - newname = None - - barcodes.save_to_dir( - document, - newname=newname, - target_dir=save_to_dir, - ) - - # Split file has been copied safely, remove it - document.unlink() - - # 
And clean up the directory as well, now it's empty - shutil.rmtree(document_list[0].parent) - - # This file has been split into multiple files without issue - # remove the original and working copy - input_doc.original_file.unlink() - - # If the original file was a TIFF, remove the PDF generated from it - if input_doc.mime_type == "image/tiff": - logger.debug( - f"Deleting file {doc_barcode_info.pdf_path}", - ) - doc_barcode_info.pdf_path.unlink() - - # notify the sender, otherwise the progress bar - # in the UI stays stuck - payload = { - "filename": overrides.filename or input_doc.original_file.name, - "task_id": None, - "current_progress": 100, - "max_progress": 100, - "status": "SUCCESS", - "message": "finished", - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - except ConnectionError as e: - logger.warning(f"ConnectionError on status send: {str(e)}") - # consuming stops here, since the original document with - # the barcodes has been split and will be consumed separately - return "File successfully split" - - # try reading the ASN from barcode - if settings.CONSUMER_ENABLE_ASN_BARCODE: - overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - if overrides.asn: - logger.info(f"Found ASN in barcode: {overrides.asn}") + # try reading the ASN from barcode + if settings.CONSUMER_ENABLE_ASN_BARCODE: + overrides.asn = reader.asn + if overrides.asn: + logger.info(f"Found ASN in barcode: {overrides.asn}") # continue with consumption if no barcode was found document = Consumer().try_consume_file( diff --git a/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png b/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png deleted file mode 100644 index 80517d56d..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-128-custom.png 
b/src/documents/tests/samples/barcodes/barcode-128-custom.png deleted file mode 100644 index c3f1b803a..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-128-custom.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png deleted file mode 100644 index 3f858f6ad..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png deleted file mode 100644 index cc81f8e36..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png deleted file mode 100644 index 1e24b4d84..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png deleted file mode 100644 index 0078026c8..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-asn-123.png b/src/documents/tests/samples/barcodes/barcode-39-asn-123.png deleted file mode 100644 index e0f735960..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-asn-123.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-asn-custom-prefix.png b/src/documents/tests/samples/barcodes/barcode-39-asn-custom-prefix.png deleted file mode 100644 index 5712c6920..000000000 Binary files 
a/src/documents/tests/samples/barcodes/barcode-39-asn-custom-prefix.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-asn-invalid.png b/src/documents/tests/samples/barcodes/barcode-39-asn-invalid.png deleted file mode 100644 index cc7f0d453..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-asn-invalid.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-custom.png b/src/documents/tests/samples/barcodes/barcode-39-custom.png deleted file mode 100644 index 5c2d7b4f7..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-39-custom.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/barcode-qr-custom.png b/src/documents/tests/samples/barcodes/barcode-qr-custom.png deleted file mode 100644 index 6574638bc..000000000 Binary files a/src/documents/tests/samples/barcodes/barcode-qr-custom.png and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle-alpha.tiff b/src/documents/tests/samples/barcodes/patch-code-t-middle-alpha.tiff new file mode 100644 index 000000000..c8ee6f08b Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle-alpha.tiff differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle-distorted.pdf b/src/documents/tests/samples/barcodes/patch-code-t-middle-distorted.pdf new file mode 100755 index 000000000..1a88b5aa9 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle-distorted.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle-fuzzy.pdf b/src/documents/tests/samples/barcodes/patch-code-t-middle-fuzzy.pdf new file mode 100755 index 000000000..01f4c08d7 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle-fuzzy.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle_reverse.pdf 
b/src/documents/tests/samples/barcodes/patch-code-t-middle-reverse.pdf similarity index 100% rename from src/documents/tests/samples/barcodes/patch-code-t-middle_reverse.pdf rename to src/documents/tests/samples/barcodes/patch-code-t-middle-reverse.pdf diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle-unreadable.pdf b/src/documents/tests/samples/barcodes/patch-code-t-middle-unreadable.pdf new file mode 100755 index 000000000..a67580643 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle-unreadable.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t.pbm b/src/documents/tests/samples/barcodes/patch-code-t.pbm deleted file mode 100644 index 7e7214070..000000000 Binary files a/src/documents/tests/samples/barcodes/patch-code-t.pbm and /dev/null differ diff --git a/src/documents/tests/samples/barcodes/qr-code-PATCHT.png b/src/documents/tests/samples/barcodes/qr-code-PATCHT.png deleted file mode 100644 index 6f1d587ff..000000000 Binary files a/src/documents/tests/samples/barcodes/qr-code-PATCHT.png and /dev/null differ diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 5a6c3edf9..838671256 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -6,10 +6,9 @@ import pytest from django.conf import settings from django.test import TestCase from django.test import override_settings -from PIL import Image -from documents import barcodes from documents import tasks +from documents.barcodes import BarcodeReader from documents.consumer import ConsumerError from documents.data_models import ConsumableDocument from documents.data_models import DocumentSource @@ -30,178 +29,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" - def test_barcode_reader_png(self): - """ - GIVEN: - - PNG file with separator barcode - WHEN: - - Image is scanned for codes - THEN: 
- - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-PATCHT.png" - img = Image.open(test_file) - separator_barcode = settings.CONSUMER_BARCODE_STRING - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_pbm(self): - """ - GIVEN: - - Netpbm bitmap file with separator barcode - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pbm" - - img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_distortion_scratchy(self): - """ - GIVEN: - - Image containing high noise - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-PATCHT-distortion.png" - img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_distortion_stretched(self): - """ - GIVEN: - - Image with a stretched barcode - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-PATCHT-distortion2.png" - img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_unreadable(self): - """ - GIVEN: - - Image with a truly unreadable barcode - WHEN: - - Image is scanned for codes - THEN: - - No barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-PATCHT-unreadable.png" - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), []) - - def test_barcode_reader_qr(self): - """ - GIVEN: - - Image file with QR separator barcode - WHEN: - - Image is scanned for codes - THEN: - - The barcode is 
detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "qr-code-PATCHT.png" - img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_128(self): - """ - GIVEN: - - Image file with 128 style separator barcode - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-PATCHT.png" - - img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) - self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - - def test_barcode_reader_no_barcode(self): - """ - GIVEN: - - Image file with no barcode - WHEN: - - Image is scanned for codes - THEN: - - No barcode is detected - """ - test_file = self.SAMPLE_DIR / "simple.png" - img = Image.open(test_file) - self.assertListEqual(barcodes.barcode_reader(img), []) - - def test_barcode_reader_custom_separator(self): - """ - GIVEN: - - Image file with custom separator barcode value - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.png" - - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) - - def test_barcode_reader_custom_qr_separator(self): - """ - GIVEN: - - Image file with custom separator barcode value as a QR code - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.png" - - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) - - def test_barcode_reader_custom_128_separator(self): - """ - GIVEN: - - Image file with custom separator 128 barcode value - WHEN: - - Image is scanned for codes - THEN: - - The barcode is detected - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.png" - - img = Image.open(test_file) - 
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) - - def test_convert_from_tiff_to_pdf(self): - """ - GIVEN: - - Multi-page TIFF image - WHEN: - - Conversion to PDF - THEN: - - The file converts without error - """ - test_file = self.SAMPLE_DIR / "simple.tiff" - - dst = settings.SCRATCH_DIR / "simple.tiff" - shutil.copy(test_file, dst) - target_file = barcodes.convert_from_tiff_to_pdf(dst) - - self.assertIsFile(target_file) - self.assertEqual(target_file.suffix, ".pdf") - def test_scan_file_for_separating_barcodes(self): """ GIVEN: @@ -213,16 +40,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {0: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {0: False}) def test_scan_file_for_separating_barcodes_none_present(self): """ @@ -235,16 +58,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - No pages to split on """ test_file = self.SAMPLE_DIR / "simple.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {}) + self.assertEqual(reader.pdf_file, test_file) + 
self.assertDictEqual(separator_page_numbers, {}) def test_scan_file_for_separating_barcodes_middle_page(self): """ @@ -257,16 +76,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {1: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {1: False}) def test_scan_file_for_separating_barcodes_multiple_pages(self): """ @@ -279,39 +94,56 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "several-patcht-codes.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {2: False, 5: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {2: False, 5: False}) - def test_scan_file_for_separating_barcodes_upside_down(self): + def test_scan_file_for_separating_barcodes_hard_to_detect(self): """ GIVEN: - PDF file containing a separator on page 1 (zero indexed) - - The barcode is upside down + - The barcode is upside down, fuzzy or distorted WHEN: - File is scanned for barcodes THEN: - Barcode is detected on page 1 (zero indexed) """ - 
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle_reverse.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + for test_file in [ + "patch-code-t-middle-reverse.pdf", + "patch-code-t-middle-distorted.pdf", + "patch-code-t-middle-fuzzy.pdf", + ]: + test_file = self.BARCODE_SAMPLE_DIR / test_file - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {1: False}) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() + + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {1: False}) + + def test_scan_file_for_separating_barcodes_unreadable(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + - The barcode is not readable + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-unreadable.pdf" + + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() + + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {}) def test_scan_file_for_separating_barcodes_fax_decode(self): """ @@ -324,16 +156,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-fax-image.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, 
test_file) - self.assertDictEqual(separator_page_numbers, {1: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {1: False}) def test_scan_file_for_separating_qr_barcodes(self): """ @@ -347,16 +175,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-qr.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {0: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_barcodes(self): @@ -371,16 +195,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {0: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_qr_barcodes(self): @@ -396,16 +216,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = 
self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {0: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {0: False}) @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_128_barcodes(self): @@ -421,16 +237,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {0: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {0: False}) def test_scan_file_for_separating_wrong_qr_barcodes(self): """ @@ -445,16 +257,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - 
self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {}) @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC") def test_scan_file_qr_barcodes_was_problem(self): @@ -468,16 +276,13 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "many-qr-codes.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertGreater(len(doc_barcode_info.barcodes), 0) - self.assertDictEqual(separator_page_numbers, {1: False}) + self.assertEqual(reader.pdf_file, test_file) + self.assertGreater(len(reader.barcodes), 0) + self.assertDictEqual(separator_page_numbers, {1: False}) def test_separate_pages(self): """ @@ -490,9 +295,11 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - documents = barcodes.separate_pages(test_file, {1: False}) + with BarcodeReader(test_file, "application/pdf") as reader: + documents = reader.separate_pages({1: False}) - self.assertEqual(len(documents), 2) + self.assertEqual(reader.pdf_file, test_file) + self.assertEqual(len(documents), 2) def test_separate_pages_double_code(self): """ @@ -505,9 +312,10 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-double.pdf" - pages = barcodes.separate_pages(test_file, {1: False, 2: False}) + with BarcodeReader(test_file, "application/pdf") as reader: + documents = reader.separate_pages({1: False, 2: False}) - self.assertEqual(len(pages), 2) + 
self.assertEqual(len(documents), 2) def test_separate_pages_no_list(self): """ @@ -519,56 +327,18 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - No new documents are produced - A warning is logged """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + test_file = self.SAMPLE_DIR / "simple.pdf" with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - pages = barcodes.separate_pages(test_file, {}) - self.assertEqual(pages, []) - self.assertEqual( - cm.output, - [ - "WARNING:paperless.barcodes:No pages to split on!", - ], - ) - - def test_save_to_dir(self): - """ - GIVEN: - - File to save to a directory - WHEN: - - The file is saved - THEN: - - The file exists - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - - barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR) - target_file = settings.SCRATCH_DIR / "patch-code-t.pdf" - self.assertIsFile(target_file) - - def test_save_to_dir_not_existing(self): - """ - GIVEN: - - File to save to a directory - - The directory doesn't exist - WHEN: - - The file is saved - THEN: - - The file exists - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - - nonexistingdir = Path("/nowhere") - self.assertIsNotDir(nonexistingdir) - - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - barcodes.save_to_dir(test_file, target_dir=nonexistingdir) - self.assertEqual( - cm.output, - [ - f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", - ], - ) + with BarcodeReader(test_file, "application/pdf") as reader: + success = reader.separate(DocumentSource.ApiUpload) + self.assertFalse(success) + self.assertEqual( + cm.output, + [ + "WARNING:paperless.barcodes:No pages to split on!", + ], + ) def test_save_to_dir_given_name(self): """ @@ -580,17 +350,17 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): THEN: - The file exists """ - test_file = self.BARCODE_SAMPLE_DIR / 
"patch-code-t.pdf" + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + with BarcodeReader(test_file, "application/pdf") as reader: + reader.separate(DocumentSource.ApiUpload, "newname.pdf") - barcodes.save_to_dir( - test_file, - newname="newname.pdf", - target_dir=settings.SCRATCH_DIR, - ) - target_file = settings.SCRATCH_DIR / "newname.pdf" - self.assertIsFile(target_file) + self.assertEqual(reader.pdf_file, test_file) + target_file1 = settings.CONSUMPTION_DIR / "0_newname.pdf" + target_file2 = settings.CONSUMPTION_DIR / "1_newname.pdf" + self.assertIsFile(target_file1) + self.assertIsFile(target_file2) - def test_barcode_splitter(self): + def test_barcode_splitter_api_upload(self): """ GIVEN: - Input file containing barcodes @@ -599,31 +369,95 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): THEN: - Correct number of files produced """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + test_file = settings.SCRATCH_DIR / "patch-code-t-middle.pdf" + shutil.copy(sample_file, test_file) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, + with BarcodeReader(test_file, "application/pdf") as reader: + reader.separate(DocumentSource.ApiUpload) + + self.assertEqual(reader.pdf_file, test_file) + + target_file1 = ( + settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf" + ) + + target_file2 = ( + settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf" + ) + + self.assertIsFile(target_file1) + self.assertIsFile(target_file2) + + def test_barcode_splitter_consume_dir(self): + """ + GIVEN: + - Input file containing barcodes + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced + """ + sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + test_file = 
settings.CONSUMPTION_DIR / "patch-code-t-middle.pdf" + shutil.copy(sample_file, test_file) + + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + reader.separate(DocumentSource.ConsumeFolder) + + self.assertEqual(reader.pdf_file, test_file) + + target_file1 = ( + settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf" + ) + + target_file2 = ( + settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf" + ) + + self.assertIsFile(target_file1) + self.assertIsFile(target_file2) + + def test_barcode_splitter_consume_dir_recursive(self): + """ + GIVEN: + - Input file containing barcodes + - Input file is within a directory structure of the consume folder + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced + - Output files are within the same directory structure + """ + sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" + test_file = ( + settings.CONSUMPTION_DIR / "tag1" / "tag2" / "patch-code-t-middle.pdf" ) + test_file.parent.mkdir(parents=True) + shutil.copy(sample_file, test_file) - self.assertEqual(test_file, doc_barcode_info.pdf_path) - self.assertTrue(len(separator_page_numbers) > 0) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.separate(DocumentSource.ConsumeFolder) - document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertGreater(len(document_list), 0) + self.assertEqual(reader.pdf_file, test_file) - for document in document_list: - barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR) + target_file1 = ( + settings.CONSUMPTION_DIR + / "tag1" + / "tag2" + / "patch-code-t-middle_document_0.pdf" + ) - target_file1 = settings.SCRATCH_DIR / "patch-code-t-middle_document_0.pdf" + target_file2 = ( + settings.CONSUMPTION_DIR + / "tag1" + / "tag2" + / "patch-code-t-middle_document_1.pdf" + ) - target_file2 = settings.SCRATCH_DIR / "patch-code-t-middle_document_1.pdf" - - self.assertIsFile(target_file1) - 
self.assertIsFile(target_file2) + self.assertIsFile(target_file1) + self.assertIsFile(target_file2) @override_settings(CONSUMER_ENABLE_BARCODES=True) def test_consume_barcode_file(self): @@ -681,7 +515,39 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ), "File successfully split", ) - self.assertFalse(dst.exists()) + self.assertIsNotFile(dst) + + @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + def test_consume_barcode_tiff_file_with_alpha(self): + """ + GIVEN: + - TIFF image containing barcodes + - TIFF image has an alpha layer + WHEN: + - Consume task handles the alpha layer and returns + THEN: + - The file was split without issue + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff" + + dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff" + shutil.copy(test_file, dst) + + with mock.patch("documents.tasks.async_to_sync"): + self.assertEqual( + tasks.consume_file( + ConsumableDocument( + source=DocumentSource.ConsumeFolder, + original_file=dst, + ), + None, + ), + "File successfully split", + ) + self.assertIsNotFile(dst) @override_settings( CONSUMER_ENABLE_BARCODES=True, @@ -760,7 +626,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ), "File successfully split", ) - self.assertFalse(dst.exists()) + self.assertIsNotFile(dst) def test_scan_file_for_separating_barcodes_password(self): """ @@ -773,20 +639,16 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.SAMPLE_DIR / "password-is-test.pdf" with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - warning = cm.output[0] - expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" - self.assertTrue(warning.startswith(expected_str)) + with BarcodeReader(test_file, "application/pdf") as reader: + 
reader.detect() + warning = cm.output[0] + expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" + self.assertTrue(warning.startswith(expected_str)) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertDictEqual(separator_page_numbers, {}) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual(separator_page_numbers, {}) @override_settings( CONSUMER_ENABLE_BARCODES=True, @@ -803,28 +665,27 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-2.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(test_file, doc_barcode_info.pdf_path) - self.assertDictEqual( - separator_page_numbers, - { - 2: False, - 4: True, - 5: True, - 8: True, - 10: True, - }, - ) + self.assertEqual( + reader.pdf_file, + test_file, + ) + self.assertDictEqual( + separator_page_numbers, + { + 2: False, + 4: True, + 5: True, + 8: True, + 10: True, + }, + ) - document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertEqual(len(document_list), 6) + document_list = reader.separate_pages(separator_page_numbers) + self.assertEqual(len(document_list), 6) @override_settings( CONSUMER_ENABLE_BARCODES=True, @@ -841,27 +702,23 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - separator_page_numbers = 
barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + with BarcodeReader(test_file, "application/pdf") as reader: + reader.detect() + separator_page_numbers = reader.get_separation_pages() - self.assertEqual(test_file, doc_barcode_info.pdf_path) - self.assertDictEqual( - separator_page_numbers, - { - 2: True, - 3: True, - 6: True, - 8: True, - }, - ) + self.assertEqual(reader.pdf_file, test_file) + self.assertDictEqual( + separator_page_numbers, + { + 2: True, + 3: True, + 6: True, + 8: True, + }, + ) - document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertEqual(len(document_list), 5) + document_list = reader.separate_pages(separator_page_numbers) + self.assertEqual(len(document_list), 5) class TestAsnBarcode(DirectoriesMixin, TestCase): @@ -869,52 +726,6 @@ class TestAsnBarcode(DirectoriesMixin, TestCase): BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" - def test_barcode_reader_asn_normal(self): - """ - GIVEN: - - Image containing standard ASNxxxxx barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.png" - - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASN00123"]) - - def test_barcode_reader_asn_invalid(self): - """ - GIVEN: - - Image containing invalid ASNxxxxx barcode - - The number portion of the ASN is not a number - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.png" - - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASNXYZXYZ"]) - - def test_barcode_reader_asn_custom_prefix(self): - """ - GIVEN: - - Image containing custom prefix barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = self.BARCODE_SAMPLE_DIR 
/ "barcode-39-asn-custom-prefix.png" - - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) - @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") def test_scan_file_for_asn_custom_prefix(self): """ @@ -928,15 +739,47 @@ class TestAsnBarcode(DirectoriesMixin, TestCase): - The ASN integer value is correct """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" + with BarcodeReader(test_file, "application/pdf") as reader: + asn = reader.asn - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + self.assertEqual(reader.pdf_file, test_file) + self.assertEqual(asn, 123) - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, 123) + def test_scan_file_for_asn_barcode(self): + """ + GIVEN: + - PDF containing an ASN barcode + - The ASN value is 123 + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN integer value is correct + """ + test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf" + + with BarcodeReader(test_file, "application/pdf") as reader: + asn = reader.asn + + self.assertEqual(reader.pdf_file, test_file) + self.assertEqual(asn, 123) + + def test_scan_file_for_asn_not_existing(self): + """ + GIVEN: + - PDF without an ASN barcode + WHEN: + - File is scanned for barcodes + THEN: + - No ASN is retrieved from the document + """ + test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" + + with BarcodeReader(test_file, "application/pdf") as reader: + asn = reader.asn + + self.assertEqual(reader.pdf_file, test_file) + self.assertEqual(asn, None) def test_scan_file_for_asn_barcode_invalid(self): """ @@ -951,15 +794,13 @@ class TestAsnBarcode(DirectoriesMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.pdf" - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - 
"application/pdf", - ) + with BarcodeReader(test_file, "application/pdf") as reader: + asn = reader.asn - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + self.assertEqual(reader.pdf_file, test_file) - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, None) + self.assertEqual(reader.pdf_file, test_file) + self.assertEqual(asn, None) @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_consume_barcode_file_asn_assignment(self): @@ -992,48 +833,6 @@ class TestAsnBarcode(DirectoriesMixin, TestCase): self.assertEqual(kwargs["override_asn"], 123) - def test_scan_file_for_asn_barcode(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - """ - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf" - - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, 123) - - def test_scan_file_for_asn_not_existing(self): - """ - GIVEN: - - PDF without an ASN barcode - WHEN: - - File is scanned for barcodes - THEN: - - No ASN is retrieved from the document - """ - test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - "application/pdf", - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, None) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): """