diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 5a2c3381a..606451f84 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -3,7 +3,6 @@ import re import tempfile from dataclasses import dataclass from pathlib import Path -from typing import Final from typing import Optional from django.conf import settings @@ -15,8 +14,9 @@ from PIL import Image from documents.converters import convert_from_tiff_to_pdf from documents.data_models import ConsumableDocument -from documents.data_models import DocumentMetadataOverrides -from documents.data_models import DocumentSource +from documents.plugins.base import ConsumeTaskPlugin +from documents.plugins.base import StopConsumeTaskError +from documents.plugins.helpers import ProgressStatusOptions from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats @@ -26,7 +26,7 @@ logger = logging.getLogger("paperless.barcodes") @dataclass(frozen=True) class Barcode: """ - Holds the information about a single barcode and its location + Holds the information about a single barcode and its location in a document """ page: int @@ -49,77 +49,111 @@ class Barcode: return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX) -class BarcodeReader: - def __init__(self, filepath: Path, mime_type: str) -> None: - self.file: Final[Path] = filepath - self.mime: Final[str] = mime_type - self.pdf_file: Path = self.file - self.barcodes: list[Barcode] = [] - self._tiff_conversion_done = False - self.temp_dir: Optional[tempfile.TemporaryDirectory] = None +class BarcodePlugin(ConsumeTaskPlugin): + NAME: str = "BarcodePlugin" + @property + def able_to_run(self) -> bool: + """ + Able to run if: + - ASN from barcode detection is enabled or + - Barcode support is enabled and the mime type is supported + """ if settings.CONSUMER_BARCODE_TIFF_SUPPORT: - self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"} + supported_mimes = {"application/pdf", "image/tiff"} else: - self.SUPPORTED_FILE_MIMES = {"application/pdf"} + supported_mimes = {"application/pdf"} - def __enter__(self): - if self.supported_mime_type: - self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes") - return self + return ( + settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES + ) and self.input_doc.mime_type in supported_mimes - def __exit__(self, exc_type, exc_val, exc_tb): - if self.temp_dir is not None: - self.temp_dir.cleanup() - self.temp_dir = None + def setup(self): + self.temp_dir = tempfile.TemporaryDirectory( + dir=self.base_tmp_dir, + prefix="barcode", + ) + self.pdf_file = self.input_doc.original_file + self._tiff_conversion_done = False + self.barcodes: list[Barcode] = [] - @property - def supported_mime_type(self) -> bool: - """ - Return True if the given mime type is supported for barcodes, false otherwise - """ - return self.mime in self.SUPPORTED_FILE_MIMES + def run(self) -> Optional[str]: + # Maybe do the conversion of TIFF to PDF + self.convert_from_tiff_to_pdf() - @property - def asn(self) -> Optional[int]: - """ - Search the parsed barcodes for any ASNs. - The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - is considered the ASN to be used. - Returns the detected ASN (or None) - """ - asn = None - - if not self.supported_mime_type: - return None - - # Ensure the barcodes have been read + # Locate any barcodes in the files self.detect() - # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - asn_text = next( - (x.value for x in self.barcodes if x.is_asn), - None, + # Update/overwrite an ASN if possible + located_asn = self.asn + if located_asn is not None: + logger.info(f"Found ASN in barcode: {located_asn}") + self.metadata.asn = located_asn + + separator_pages = self.get_separation_pages() + if not separator_pages: + return "No pages to split on!" + + # We have pages to split against + + # Note this does NOT use the base_temp_dir, as that will be removed + tmp_dir = Path( + tempfile.mkdtemp( + dir=settings.SCRATCH_DIR, + prefix="paperless-barcode-split-", + ), + ).resolve() + + from documents import tasks + + # Create the split document tasks + for new_document in self.separate_pages(separator_pages): + copy_file_with_basic_stats(new_document, tmp_dir / new_document.name) + + task = tasks.consume_file.delay( + ConsumableDocument( + # Same source, for templates + source=self.input_doc.source, + mailrule_id=self.input_doc.mailrule_id, + # Can't use same folder or the consume might grab it again + original_file=(tmp_dir / new_document.name).resolve(), + ), + # All the same metadata + self.metadata, + ) + logger.info(f"Created new task {task.id} for {new_document.name}") + + # This file is now two or more files + self.input_doc.original_file.unlink() + + msg = "Barcode splitting complete!" + + # Update the progress to complete + self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100) + + # Request the consume task stops + raise StopConsumeTaskError(msg) + + def cleanup(self) -> None: + self.temp_dir.cleanup() + + def convert_from_tiff_to_pdf(self): + """ + May convert a TIFF image into a PDF, if the input is a TIFF and + the TIFF has not been made into a PDF + """ + # Nothing to do, pdf_file is already assigned correctly + if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done: + return + + self.pdf_file = convert_from_tiff_to_pdf( + self.input_doc.original_file, + Path(self.temp_dir.name), ) - - if asn_text: - logger.debug(f"Found ASN Barcode: {asn_text}") - # remove the prefix and remove whitespace - asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() - - # remove non-numeric parts of the remaining string - asn_text = re.sub(r"\D", "", asn_text) - - # now, try parsing the ASN number - try: - asn = int(asn_text) - except ValueError as e: - logger.warning(f"Failed to parse ASN number because: {e}") - - return asn + self._tiff_conversion_done = True @staticmethod - def read_barcodes_zxing(image: Image) -> list[str]: + def read_barcodes_zxing(image: Image.Image) -> list[str]: barcodes = [] import zxingcpp @@ -135,7 +169,7 @@ class BarcodeReader: return barcodes @staticmethod - def read_barcodes_pyzbar(image: Image) -> list[str]: + def read_barcodes_pyzbar(image: Image.Image) -> list[str]: barcodes = [] from pyzbar import pyzbar @@ -154,18 +188,6 @@ class BarcodeReader: return barcodes - def convert_from_tiff_to_pdf(self): - """ - May convert a TIFF image into a PDF, if the input is a TIFF and - the TIFF has not been made into a PDF - """ - # Nothing to do, pdf_file is already assigned correctly - if self.mime != "image/tiff" or self._tiff_conversion_done: - return - - self._tiff_conversion_done = True - self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name)) - def detect(self) -> None: """ Scan all pages of the PDF as images, updating barcodes and the pages @@ -218,10 +240,45 @@ class BarcodeReader: # This file is really borked, allow the consumption to continue # but it may fail further on except Exception as e: # pragma: no cover - logger.exception( + logger.warning( f"Exception during barcode scanning: {e}", ) + @property + def asn(self) -> Optional[int]: + """ + Search the parsed barcodes for any ASNs. + The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + is considered the ASN to be used. + Returns the detected ASN (or None) + """ + asn = None + + # Ensure the barcodes have been read + self.detect() + + # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + asn_text = next( + (x.value for x in self.barcodes if x.is_asn), + None, + ) + + if asn_text: + logger.debug(f"Found ASN Barcode: {asn_text}") + # remove the prefix and remove whitespace + asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() + + # remove non-numeric parts of the remaining string + asn_text = re.sub(r"\D", "", asn_text) + + # now, try parsing the ASN number + try: + asn = int(asn_text) + except ValueError as e: + logger.warning(f"Failed to parse ASN number because: {e}") + + return asn + def get_separation_pages(self) -> dict[int, bool]: """ Search the parsed barcodes for separators and returns a dict of page @@ -251,7 +308,7 @@ class BarcodeReader: """ document_paths = [] - fname = self.file.stem + fname = self.input_doc.original_file.stem with Pdf.open(self.pdf_file) as input_pdf: # Start with an empty document current_document: list[Page] = [] @@ -292,58 +349,8 @@ class BarcodeReader: with open(savepath, "wb") as out: dst.save(out) - copy_basic_file_stats(self.file, savepath) + copy_basic_file_stats(self.input_doc.original_file, savepath) document_paths.append(savepath) return document_paths - - def separate( - self, - source: DocumentSource, - overrides: DocumentMetadataOverrides, - ) -> bool: - """ - Separates the document, based on barcodes and configuration, creating new - documents as required in the appropriate location. - - Returns True if a split happened, False otherwise - """ - # Do nothing - if not self.supported_mime_type: - logger.warning(f"Unsupported file format for barcode reader: {self.mime}") - return False - - # Does nothing unless needed - self.convert_from_tiff_to_pdf() - - # Actually read the codes, if any - self.detect() - - separator_pages = self.get_separation_pages() - - # Also do nothing - if not separator_pages: - logger.warning("No pages to split on!") - return False - - tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve() - - from documents import tasks - - # Create the split document tasks - for new_document in self.separate_pages(separator_pages): - copy_file_with_basic_stats(new_document, tmp_dir / new_document.name) - - tasks.consume_file.delay( - ConsumableDocument( - # Same source, for templates - source=source, - # Can't use same folder or the consume might grab it again - original_file=(tmp_dir / new_document.name).resolve(), - ), - # All the same metadata - overrides, - ) - logger.info("Barcode splitting complete!") - return True diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 06e9f68fc..01b25edea 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -21,7 +21,6 @@ from filelock import FileLock from rest_framework.reverse import reverse from documents.classifier import load_classifier -from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.file_handling import create_source_path_directory from documents.file_handling import generate_unique_filename @@ -42,12 +41,83 @@ from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type from documents.parsers import parse_date from documents.permissions import set_permissions_for_object +from documents.plugins.base import AlwaysRunPluginMixin +from documents.plugins.base import ConsumeTaskPlugin +from documents.plugins.base import NoCleanupPluginMixin +from documents.plugins.base import NoSetupPluginMixin from documents.signals import document_consumption_finished from documents.signals import document_consumption_started from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats +class WorkflowTriggerPlugin( + NoCleanupPluginMixin, + NoSetupPluginMixin, + AlwaysRunPluginMixin, + ConsumeTaskPlugin, +): + NAME: str = "WorkflowTriggerPlugin" + + def run(self) -> Optional[str]: + """ + Get overrides from matching workflows + """ + overrides = DocumentMetadataOverrides() + for workflow in Workflow.objects.filter(enabled=True).order_by("order"): + template_overrides = DocumentMetadataOverrides() + + if document_matches_workflow( + self.input_doc, + workflow, + WorkflowTrigger.WorkflowTriggerType.CONSUMPTION, + ): + for action in workflow.actions.all(): + if action.assign_title is not None: + template_overrides.title = action.assign_title + if action.assign_tags is not None: + template_overrides.tag_ids = [ + tag.pk for tag in action.assign_tags.all() + ] + if action.assign_correspondent is not None: + template_overrides.correspondent_id = ( + action.assign_correspondent.pk + ) + if action.assign_document_type is not None: + template_overrides.document_type_id = ( + action.assign_document_type.pk + ) + if action.assign_storage_path is not None: + template_overrides.storage_path_id = ( + action.assign_storage_path.pk + ) + if action.assign_owner is not None: + template_overrides.owner_id = action.assign_owner.pk + if action.assign_view_users is not None: + template_overrides.view_users = [ + user.pk for user in action.assign_view_users.all() + ] + if action.assign_view_groups is not None: + template_overrides.view_groups = [ + group.pk for group in action.assign_view_groups.all() + ] + if action.assign_change_users is not None: + template_overrides.change_users = [ + user.pk for user in action.assign_change_users.all() + ] + if action.assign_change_groups is not None: + template_overrides.change_groups = [ + group.pk for group in action.assign_change_groups.all() + ] + if action.assign_custom_fields is not None: + template_overrides.custom_field_ids = [ + field.pk for field in action.assign_custom_fields.all() + ] + + overrides.update(template_overrides) + self.metadata.update(overrides) + + class ConsumerError(Exception): pass @@ -602,70 +672,6 @@ class Consumer(LoggingMixin): return document - def get_workflow_overrides( - self, - input_doc: ConsumableDocument, - ) -> DocumentMetadataOverrides: - """ - Get overrides from matching workflows - """ - overrides = DocumentMetadataOverrides() - for workflow in Workflow.objects.filter(enabled=True).order_by("order"): - template_overrides = DocumentMetadataOverrides() - - if document_matches_workflow( - input_doc, - workflow, - WorkflowTrigger.WorkflowTriggerType.CONSUMPTION, - ): - for action in workflow.actions.all(): - self.log.info( - f"Applying overrides in {action} from {workflow}", - ) - if action.assign_title is not None: - template_overrides.title = action.assign_title - if action.assign_tags is not None: - template_overrides.tag_ids = [ - tag.pk for tag in action.assign_tags.all() - ] - if action.assign_correspondent is not None: - template_overrides.correspondent_id = ( - action.assign_correspondent.pk - ) - if action.assign_document_type is not None: - template_overrides.document_type_id = ( - action.assign_document_type.pk - ) - if action.assign_storage_path is not None: - template_overrides.storage_path_id = ( - action.assign_storage_path.pk - ) - if action.assign_owner is not None: - template_overrides.owner_id = action.assign_owner.pk - if action.assign_view_users is not None: - template_overrides.view_users = [ - user.pk for user in action.assign_view_users.all() - ] - if action.assign_view_groups is not None: - template_overrides.view_groups = [ - group.pk for group in action.assign_view_groups.all() - ] - if action.assign_change_users is not None: - template_overrides.change_users = [ - user.pk for user in action.assign_change_users.all() - ] - if action.assign_change_groups is not None: - template_overrides.change_groups = [ - group.pk for group in action.assign_change_groups.all() - ] - if action.assign_custom_fields is not None: - template_overrides.custom_field_ids = [ - field.pk for field in action.assign_custom_fields.all() - ] - - overrides.update(template_overrides) - return overrides - def _parse_title_placeholders(self, title: str) -> str: local_added = timezone.localtime(timezone.now()) diff --git a/src/documents/double_sided.py b/src/documents/double_sided.py index 5acde1597..bfe66f4fe 100644 --- a/src/documents/double_sided.py +++ b/src/documents/double_sided.py @@ -3,127 +3,145 @@ import logging import os import shutil from pathlib import Path +from typing import Final +from typing import Optional from django.conf import settings from pikepdf import Pdf from documents.consumer import ConsumerError from documents.converters import convert_from_tiff_to_pdf -from documents.data_models import ConsumableDocument +from documents.plugins.base import ConsumeTaskPlugin +from documents.plugins.base import NoCleanupPluginMixin +from documents.plugins.base import NoSetupPluginMixin +from documents.plugins.base import StopConsumeTaskError logger = logging.getLogger("paperless.double_sided") # Hardcoded for now, could be made a configurable setting if needed -TIMEOUT_MINUTES = 30 +TIMEOUT_MINUTES: Final[int] = 30 +TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60 # Used by test cases STAGING_FILE_NAME = "double-sided-staging.pdf" -def collate(input_doc: ConsumableDocument) -> str: - """ - Tries to collate pages from 2 single sided scans of a double sided - document. +class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin): + NAME: str = "CollatePlugin" - When called with a file, it checks whether or not a staging file - exists, if not, the current file is turned into that staging file - containing the odd numbered pages. - - If a staging file exists, and it is not too old, the current file is - considered to be the second part (the even numbered pages) and it will - collate the pages of both, the pages of the second file will be added - in reverse order, since the ADF will have scanned the pages from bottom - to top. - - Returns a status message on success, or raises a ConsumerError - in case of failure. - """ - - # Make sure scratch dir exists, Consumer might not have run yet - settings.SCRATCH_DIR.mkdir(exist_ok=True) - - if input_doc.mime_type == "application/pdf": - pdf_file = input_doc.original_file - elif ( - input_doc.mime_type == "image/tiff" - and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT - ): - pdf_file = convert_from_tiff_to_pdf( - input_doc.original_file, - settings.SCRATCH_DIR, - ) - input_doc.original_file.unlink() - else: - raise ConsumerError("Unsupported file type for collation of double-sided scans") - - staging = settings.SCRATCH_DIR / STAGING_FILE_NAME - - valid_staging_exists = False - if staging.exists(): - stats = os.stat(str(staging)) - # if the file is older than the timeout, we don't consider - # it valid - if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60: - logger.warning("Outdated double sided staging file exists, deleting it") - os.unlink(str(staging)) - else: - valid_staging_exists = True - - if valid_staging_exists: - try: - # Collate pages from second PDF in reverse order - with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2: - pdf2.pages.reverse() - try: - for i, page in enumerate(pdf2.pages): - pdf1.pages.insert(2 * i + 1, page) - except IndexError: - raise ConsumerError( - "This second file (even numbered pages) contains more " - "pages than the first/odd numbered one. This means the " - "two uploaded files don't belong to the same double-" - "sided scan. Please retry, starting with the odd " - "numbered pages again.", - ) - # Merged file has the same path, but without the - # double-sided subdir. Therefore, it is also in the - # consumption dir and will be picked up for processing - old_file = input_doc.original_file - new_file = Path( - *( - part - for part in old_file.with_name( - f"{old_file.stem}-collated.pdf", - ).parts - if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME - ), - ) - # If the user didn't create the subdirs yet, do it for them - new_file.parent.mkdir(parents=True, exist_ok=True) - pdf1.save(new_file) - logger.info("Collated documents into new file %s", new_file) - return ( - "Success. Even numbered pages of double sided scan collated " - "with odd pages" - ) - finally: - # Delete staging and recently uploaded file no matter what. - # If any error occurs, the user needs to be able to restart - # the process from scratch; after all, the staging file - # with the odd numbered pages might be the culprit - pdf_file.unlink() - staging.unlink() - - else: - shutil.move(pdf_file, staging) - # update access to modification time so we know if the file - # is outdated when another file gets uploaded - os.utime(staging, (dt.datetime.now().timestamp(),) * 2) - logger.info( - "Got scan with odd numbered pages of double-sided scan, moved it to %s", - staging, - ) + @property + def able_to_run(self) -> bool: return ( - "Received odd numbered pages of double sided scan, waiting up to " - f"{TIMEOUT_MINUTES} minutes for even numbered pages" + settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED + and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME + in self.input_doc.original_file.parts ) + + def run(self) -> Optional[str]: + """ + Tries to collate pages from 2 single sided scans of a double sided + document. + + When called with a file, it checks whether or not a staging file + exists, if not, the current file is turned into that staging file + containing the odd numbered pages. + + If a staging file exists, and it is not too old, the current file is + considered to be the second part (the even numbered pages) and it will + collate the pages of both, the pages of the second file will be added + in reverse order, since the ADF will have scanned the pages from bottom + to top. + + Returns a status message on success, or raises a ConsumerError + in case of failure. + """ + + if self.input_doc.mime_type == "application/pdf": + pdf_file = self.input_doc.original_file + elif ( + self.input_doc.mime_type == "image/tiff" + and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT + ): + pdf_file = convert_from_tiff_to_pdf( + self.input_doc.original_file, + self.base_tmp_dir, + ) + self.input_doc.original_file.unlink() + else: + raise ConsumerError( + "Unsupported file type for collation of double-sided scans", + ) + + staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME + + valid_staging_exists = False + if staging.exists(): + stats = staging.stat() + # if the file is older than the timeout, we don't consider + # it valid + if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS: + logger.warning("Outdated double sided staging file exists, deleting it") + staging.unlink() + else: + valid_staging_exists = True + + if valid_staging_exists: + try: + # Collate pages from second PDF in reverse order + with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2: + pdf2.pages.reverse() + try: + for i, page in enumerate(pdf2.pages): + pdf1.pages.insert(2 * i + 1, page) + except IndexError: + raise ConsumerError( + "This second file (even numbered pages) contains more " + "pages than the first/odd numbered one. This means the " + "two uploaded files don't belong to the same double-" + "sided scan. Please retry, starting with the odd " + "numbered pages again.", + ) + # Merged file has the same path, but without the + # double-sided subdir. Therefore, it is also in the + # consumption dir and will be picked up for processing + old_file = self.input_doc.original_file + new_file = Path( + *( + part + for part in old_file.with_name( + f"{old_file.stem}-collated.pdf", + ).parts + if part + != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME + ), + ) + # If the user didn't create the subdirs yet, do it for them + new_file.parent.mkdir(parents=True, exist_ok=True) + pdf1.save(new_file) + logger.info("Collated documents into new file %s", new_file) + raise StopConsumeTaskError( + "Success. Even numbered pages of double sided scan collated " + "with odd pages", + ) + finally: + # Delete staging and recently uploaded file no matter what. + # If any error occurs, the user needs to be able to restart + # the process from scratch; after all, the staging file + # with the odd numbered pages might be the culprit + pdf_file.unlink() + staging.unlink() + + else: + shutil.move(pdf_file, staging) + # update access to modification time so we know if the file + # is outdated when another file gets uploaded + timestamp = dt.datetime.now().timestamp() + os.utime(staging, (timestamp, timestamp)) + logger.info( + "Got scan with odd numbered pages of double-sided scan, moved it to %s", + staging, + ) + raise StopConsumeTaskError( + "Received odd numbered pages of double sided scan, waiting up to " + f"{TIMEOUT_MINUTES} minutes for even numbered pages", + ) diff --git a/src/documents/plugins/__init__.py b/src/documents/plugins/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/documents/plugins/base.py b/src/documents/plugins/base.py new file mode 100644 index 000000000..aec4887be --- /dev/null +++ b/src/documents/plugins/base.py @@ -0,0 +1,131 @@ +import abc +from pathlib import Path +from typing import Final +from typing import Optional + +from documents.data_models import ConsumableDocument +from documents.data_models import DocumentMetadataOverrides +from documents.plugins.helpers import ProgressManager + + +class StopConsumeTaskError(Exception): + """ + A plugin setup or run may raise this to exit the asynchronous consume task. + + Most likely, this means it has created one or more new tasks to execute instead, + such as when a barcode has been used to create new documents + """ + + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + +class ConsumeTaskPlugin(abc.ABC): + """ + Defines the interface for a plugin for the document consume task + Meanings as per RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119) + + Plugin Implementation + + The plugin SHALL implement property able_to_run and methods setup, run and cleanup. + The plugin property able_to_run SHALL return True if the plugin is able to run, given the conditions, settings and document information. + The plugin property able_to_run MAY be hardcoded to return True. + The plugin setup SHOULD perform any resource creation or additional initialization needed to run the document. + The plugin setup MAY be a non-operation. + The plugin cleanup SHOULD perform resource cleanup, including in the event of an error. + The plugin cleanup MAY be a non-operation. + The plugin run SHALL perform any operations against the document or system state required for the plugin. + The plugin run MAY update the document metadata. + The plugin run MAY return an informational message. + The plugin run MAY raise StopConsumeTaskError to cease any further operations against the document. + + Plugin Manager Implementation + + The plugin manager SHALL provide the plugin with the input document, document metadata, progress manager and a created temporary directory. + The plugin manager SHALL execute the plugin setup, run and cleanup, in that order IF the plugin property able_to_run is True. + The plugin manager SHOULD log the return message of executing a plugin's run. + The plugin manager SHALL always execute the plugin cleanup, IF the plugin property able_to_run is True. + The plugin manager SHALL cease calling plugins and exit the task IF a plugin raises StopConsumeTaskError. + The plugin manager SHOULD return the StopConsumeTaskError message IF a plugin raises StopConsumeTaskError. + """ + + NAME: str = "ConsumeTaskPlugin" + + def __init__( + self, + input_doc: ConsumableDocument, + metadata: DocumentMetadataOverrides, + status_mgr: ProgressManager, + base_tmp_dir: Path, + task_id: str, + ) -> None: + super().__init__() + self.input_doc = input_doc + self.metadata = metadata + self.base_tmp_dir: Final = base_tmp_dir + self.status_mgr = status_mgr + self.task_id: Final = task_id + + @abc.abstractproperty + def able_to_run(self) -> bool: + """ + Return True if the conditions are met for the plugin to run, False otherwise + + If False, setup(), run() and cleanup() will not be called + """ + + @abc.abstractmethod + def setup(self) -> None: + """ + Allows the plugin to perform any additional setup it may need, such as creating + a temporary directory, copying a file somewhere, etc. + + Executed before run() + + In general, this should be the "light" work, not the bulk of processing + """ + + @abc.abstractmethod + def run(self) -> Optional[str]: + """ + The bulk of plugin processing, this does whatever action the plugin is for. + + Executed after setup() and before cleanup() + """ + + @abc.abstractmethod + def cleanup(self) -> None: + """ + Allows the plugin to execute any cleanup it may require + + Executed after run(), even in the case of error + """ + + +class AlwaysRunPluginMixin(ConsumeTaskPlugin): + """ + A plugin which is always able to run + """ + + @property + def able_to_run(self) -> bool: + return True + + +class NoSetupPluginMixin(ConsumeTaskPlugin): + """ + A plugin which requires no setup + """ + + def setup(self) -> None: + pass + + +class NoCleanupPluginMixin(ConsumeTaskPlugin): + """ + A plugin which needs to clean up no files + """ + + def cleanup(self) -> None: + pass diff --git a/src/documents/plugins/helpers.py b/src/documents/plugins/helpers.py new file mode 100644 index 000000000..92fe1255b --- /dev/null +++ b/src/documents/plugins/helpers.py @@ -0,0 +1,82 @@ +import enum +from typing import TYPE_CHECKING +from typing import Optional +from typing import Union + +from asgiref.sync import async_to_sync +from channels.layers import get_channel_layer +from channels_redis.pubsub import RedisPubSubChannelLayer + + +class ProgressStatusOptions(str, enum.Enum): + STARTED = "STARTED" + WORKING = "WORKING" + SUCCESS = "SUCCESS" + FAILED = "FAILED" + + +class ProgressManager: + """ + Handles sending of progress information via the channel layer, with proper management + of the open/close of the layer to ensure messages go out and everything is cleaned up + """ + + def __init__(self, filename: str, task_id: Optional[str] = None) -> None: + self.filename = filename + self._channel: Optional[RedisPubSubChannelLayer] = None + self.task_id = task_id + + def __enter__(self): + self.open() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def open(self) -> None: + """ + If not already opened, gets the default channel layer + opened and ready to send messages + """ + if self._channel is None: + self._channel = get_channel_layer() + + def close(self) -> None: + """ + If it was opened, flushes the channel layer + """ + if self._channel is not None: + async_to_sync(self._channel.flush) + self._channel = None + + def send_progress( + self, + status: ProgressStatusOptions, + message: str, + current_progress: int, + max_progress: int, + extra_args: Optional[dict[str, Union[str, int]]] = None, + ) -> None: + # Ensure the layer is open + self.open() + + # Just for IDEs + if TYPE_CHECKING: + assert self._channel is not None + + payload = { + "type": "status_update", + "data": { + "filename": self.filename, + "task_id": self.task_id, + "current_progress": current_progress, + "max_progress": max_progress, + "status": status, + "message": message, + }, + } + if extra_args is not None: + payload["data"].update(extra_args) + + # Construct and send the update + async_to_sync(self._channel.group_send)("status_updates", payload) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index abb9cd39d..a83c2e6cd 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -2,30 +2,30 @@ import hashlib import logging import shutil import uuid +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional import tqdm -from asgiref.sync import async_to_sync from celery import Task from celery import shared_task -from channels.layers import get_channel_layer from django.conf import settings from django.db import transaction from django.db.models.signals import post_save from filelock import FileLock -from redis.exceptions import ConnectionError from whoosh.writing import AsyncWriter from documents import index from documents import sanity_checker -from documents.barcodes import BarcodeReader +from documents.barcodes import BarcodePlugin from documents.classifier import DocumentClassifier from documents.classifier import load_classifier from documents.consumer import Consumer from documents.consumer import ConsumerError +from documents.consumer import WorkflowTriggerPlugin from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides -from documents.double_sided import collate +from documents.double_sided import CollatePlugin from documents.file_handling import create_source_path_directory from documents.file_handling import generate_unique_filename from documents.models import Correspondent @@ -35,6 +35,10 @@ from documents.models import StoragePath from documents.models import Tag from documents.parsers import DocumentParser from documents.parsers import get_parser_class_for_mime_type +from documents.plugins.base import ConsumeTaskPlugin +from documents.plugins.base import ProgressManager +from documents.plugins.base import StopConsumeTaskError +from documents.plugins.helpers import ProgressStatusOptions from documents.sanity_checker import SanityCheckFailedException from documents.signals import document_updated @@ -102,70 +106,60 @@ def consume_file( input_doc: ConsumableDocument, overrides: Optional[DocumentMetadataOverrides] = None, ): - def send_progress(status="SUCCESS", message="finished"): - payload = { - "filename": overrides.filename or input_doc.original_file.name, - "task_id": None, - "current_progress": 100, - "max_progress": 100, - "status": status, - "message": message, - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - except ConnectionError as e: - logger.warning(f"ConnectionError on status send: {e!s}") - # Default no overrides if overrides is None: overrides = DocumentMetadataOverrides() - # Handle collation of double-sided documents scanned in two parts - if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and ( - settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME - in input_doc.original_file.parts - ): - try: - msg = collate(input_doc) - send_progress(message=msg) - return msg - except ConsumerError as e: - send_progress(status="FAILURE", message=e.args[0]) - raise e + plugins: list[type[ConsumeTaskPlugin]] = [ + CollatePlugin, + BarcodePlugin, + WorkflowTriggerPlugin, + ] - # read all barcodes in the current document - if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: - with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: - if settings.CONSUMER_ENABLE_BARCODES and reader.separate( - input_doc.source, + with ProgressManager( + overrides.filename or input_doc.original_file.name, + self.request.id, + ) as status_mgr, TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir: + tmp_dir = Path(tmp_dir) + for plugin_class in plugins: + plugin_name = plugin_class.NAME + + plugin = plugin_class( + input_doc, overrides, - ): - # notify the sender, otherwise the progress bar - # in the UI stays stuck - send_progress() - # consuming stops here, since the original document with - # the barcodes has been split and will be consumed separately - input_doc.original_file.unlink() - return "File successfully split" + status_mgr, + tmp_dir, + self.request.id, + ) - # try reading the ASN from barcode - if ( - settings.CONSUMER_ENABLE_ASN_BARCODE - and (located_asn := reader.asn) is not None - ): - # Note this will take precedence over an API provided ASN - # But it's from a physical barcode, so that's good - overrides.asn = located_asn - logger.info(f"Found ASN in barcode: {overrides.asn}") + if not plugin.able_to_run: + logger.debug(f"Skipping plugin {plugin_name}") + continue - template_overrides = Consumer().get_workflow_overrides( - input_doc=input_doc, - ) + try: + logger.debug(f"Executing plugin {plugin_name}") + plugin.setup() - overrides.update(template_overrides) + msg = plugin.run() + + if msg is not None: + logger.info(f"{plugin_name} completed with: {msg}") + else: + logger.info(f"{plugin_name} completed with no message") + + overrides = plugin.metadata + + except StopConsumeTaskError as e: + logger.info(f"{plugin_name} requested task exit: {e.message}") + return e.message + + except Exception as e: + logger.exception(f"{plugin_name} failed: {e}") + status_mgr.send_progress(ProgressStatusOptions.FAILED, f"{e}", 100, 100) + raise + + finally: + plugin.cleanup() # continue with consumption if no barcode was found document = Consumer().try_consume_file( diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index e4d8ccc57..4552a2b77 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1,4 +1,7 @@ import shutil +from collections.abc import Generator +from contextlib import contextmanager +from pathlib import Path from unittest import mock import pytest @@ -7,14 +10,13 @@ from django.test import TestCase from django.test import override_settings from documents import tasks -from documents.barcodes import BarcodeReader -from documents.consumer import ConsumerError +from documents.barcodes import BarcodePlugin from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource -from documents.models import Document from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DocumentConsumeDelayMixin +from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import SampleDirMixin @@ -26,8 +28,29 @@ except ImportError: HAS_ZXING_LIB = False +class GetReaderPluginMixin: + @contextmanager + def get_reader(self, filepath: Path) -> Generator[BarcodePlugin, None, None]: + reader = BarcodePlugin( + ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath), + DocumentMetadataOverrides(), + DummyProgressManager(filepath.name, None), + self.dirs.scratch_dir, + "task-id", + ) + reader.setup() + yield reader + reader.cleanup() + + @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR") -class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase): +class TestBarcode( + DirectoriesMixin, + FileSystemAssertsMixin, + SampleDirMixin, + GetReaderPluginMixin, + TestCase, +): def test_scan_file_for_separating_barcodes(self): """ GIVEN: @@ -39,7 +62,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -60,7 +83,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff" - with BarcodeReader(test_file, "image/tiff") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -80,7 +103,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff" - with BarcodeReader(test_file, "image/tiff") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -97,7 +120,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test - No pages to split on """ test_file = self.SAMPLE_DIR / "simple.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -115,7 +138,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -133,7 +156,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "several-patcht-codes.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -158,7 +181,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test ]: test_file = self.BARCODE_SAMPLE_DIR / test_file - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -177,7 +200,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-unreadable.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -195,7 +218,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-fax-image.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -214,7 +237,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-qr.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -234,7 +257,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -255,7 +278,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -276,7 +299,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -296,7 +319,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -315,7 +338,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "many-qr-codes.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -334,7 +357,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.SAMPLE_DIR / "password-is-test.pdf" with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() warning = cm.output[0] expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" @@ -356,7 +379,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: documents = reader.separate_pages({1: False}) self.assertEqual(reader.pdf_file, test_file) @@ -373,7 +396,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-double.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: documents = reader.separate_pages({1: False, 2: False}) self.assertEqual(len(documents), 2) @@ -385,32 +408,18 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test WHEN: - No separation pages are provided THEN: - - No new documents are produced - - A warning is logged + - Nothing happens """ test_file = self.SAMPLE_DIR / "simple.pdf" - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - with BarcodeReader(test_file, "application/pdf") as reader: - self.assertFalse( - reader.separate( - DocumentSource.ApiUpload, - DocumentMetadataOverrides(), - ), - ) - self.assertEqual( - cm.output, - [ - "WARNING:paperless.barcodes:No pages to split on!", - ], - ) + with self.get_reader(test_file) as reader: + self.assertEqual("No pages to split on!", reader.run()) @override_settings( CONSUMER_ENABLE_BARCODES=True, CONSUMER_BARCODE_TIFF_SUPPORT=True, ) - @mock.patch("documents.consumer.Consumer.try_consume_file") - def test_consume_barcode_unsupported_jpg_file(self, m): + def test_consume_barcode_unsupported_jpg_file(self): """ GIVEN: - JPEG image as input @@ -422,35 +431,8 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.SAMPLE_DIR / "simple.jpg" - dst = settings.SCRATCH_DIR / "simple.jpg" - shutil.copy(test_file, dst) - - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - self.assertIn( - "Success", - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ), - ) - - self.assertListEqual( - cm.output, - [ - "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg", - ], - ) - m.assert_called_once() - - args, kwargs = m.call_args - self.assertIsNone(kwargs["override_filename"]) - self.assertIsNone(kwargs["override_title"]) - self.assertIsNone(kwargs["override_correspondent_id"]) - self.assertIsNone(kwargs["override_document_type_id"]) - self.assertIsNone(kwargs["override_tag_ids"]) + with self.get_reader(test_file) as reader: + self.assertFalse(reader.able_to_run) @override_settings( CONSUMER_ENABLE_BARCODES=True, @@ -467,7 +449,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-2.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -504,7 +486,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test """ test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() separator_page_numbers = reader.get_separation_pages() @@ -550,7 +532,7 @@ class TestBarcodeNewConsume( overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9]) - with mock.patch("documents.tasks.async_to_sync") as progress_mocker: + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): self.assertEqual( tasks.consume_file( ConsumableDocument( @@ -559,10 +541,8 @@ class TestBarcodeNewConsume( ), overrides, ), - "File successfully split", + "Barcode splitting complete!", ) - # We let the consumer know progress is done - progress_mocker.assert_called_once() # 2 new document consume tasks created self.assertEqual(self.consume_file_mock.call_count, 2) @@ -580,7 +560,20 @@ class TestBarcodeNewConsume( self.assertEqual(overrides, new_doc_overrides) -class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): +class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase): + @contextmanager + def get_reader(self, filepath: Path) -> BarcodePlugin: + reader = BarcodePlugin( + ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath), + DocumentMetadataOverrides(), + DummyProgressManager(filepath.name, None), + self.dirs.scratch_dir, + "task-id", + ) + reader.setup() + yield reader + reader.cleanup() + @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") def test_scan_file_for_asn_custom_prefix(self): """ @@ -594,7 +587,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): - The ASN integer value is correct """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: asn = reader.asn self.assertEqual(reader.pdf_file, test_file) @@ -613,7 +606,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: asn = reader.asn self.assertEqual(reader.pdf_file, test_file) @@ -630,55 +623,12 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: asn = reader.asn self.assertEqual(reader.pdf_file, test_file) self.assertEqual(asn, None) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) - def test_scan_file_for_asn_already_exists(self): - """ - GIVEN: - - PDF with an ASN barcode - - ASN value already exists - WHEN: - - File is scanned for barcodes - THEN: - - ASN is retrieved from the document - - Consumption fails - """ - - Document.objects.create( - title="WOW", - content="the content", - archive_serial_number=123, - checksum="456", - mime_type="application/pdf", - ) - - test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf" - - dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf" - shutil.copy(test_file, dst) - - with mock.patch("documents.consumer.Consumer._send_progress"): - with self.assertRaises(ConsumerError) as cm, self.assertLogs( - "paperless.consumer", - level="ERROR", - ) as logs_cm: - tasks.consume_file( - ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ), - None, - ) - self.assertIn("Not consuming barcode-39-asn-123.pdf", str(cm.exception)) - error_str = logs_cm.output[0] - expected_str = "ERROR:paperless.consumer:Not consuming barcode-39-asn-123.pdf: Given ASN already exists!" - self.assertEqual(expected_str, error_str) - def test_scan_file_for_asn_barcode_invalid(self): """ GIVEN: @@ -692,7 +642,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): """ test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: asn = reader.asn self.assertEqual(reader.pdf_file, test_file) @@ -718,7 +668,9 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf" shutil.copy(test_file, dst) - with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: + with mock.patch( + "documents.consumer.Consumer.try_consume_file", + ) as mocked_consumer: tasks.consume_file( ConsumableDocument( source=DocumentSource.ConsumeFolder, @@ -726,40 +678,11 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): ), None, ) - - args, kwargs = mocked_call.call_args + mocked_consumer.assert_called_once() + args, kwargs = mocked_consumer.call_args self.assertEqual(kwargs["override_asn"], 123) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) - def test_asn_too_large(self): - """ - GIVEN: - - ASN from barcode enabled - - Barcode contains too large an ASN value - WHEN: - - ASN from barcode checked for correctness - THEN: - - Exception is raised regarding size limits - """ - src = self.BARCODE_SAMPLE_DIR / "barcode-128-asn-too-large.pdf" - - dst = self.dirs.scratch_dir / "barcode-128-asn-too-large.pdf" - shutil.copy(src, dst) - - input_doc = ConsumableDocument( - source=DocumentSource.ConsumeFolder, - original_file=dst, - ) - - with mock.patch("documents.consumer.Consumer._send_progress"): - self.assertRaisesMessage( - ConsumerError, - "Given ASN 4294967296 is out of range [0, 4,294,967,295]", - tasks.consume_file, - input_doc, - ) - @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR") def test_scan_file_for_qrcode_without_upscale(self): """ @@ -774,7 +697,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() self.assertEqual(len(reader.barcodes), 0) @@ -796,7 +719,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase): test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf" - with BarcodeReader(test_file, "application/pdf") as reader: + with self.get_reader(test_file) as reader: reader.detect() self.assertEqual(len(reader.barcodes), 1) self.assertEqual(reader.asn, 123) diff --git a/src/documents/tests/test_double_sided.py b/src/documents/tests/test_double_sided.py index 88cbe7d87..c66594491 100644 --- a/src/documents/tests/test_double_sided.py +++ b/src/documents/tests/test_double_sided.py @@ -17,6 +17,7 @@ from documents.data_models import DocumentSource from documents.double_sided import STAGING_FILE_NAME from documents.double_sided import TIMEOUT_MINUTES from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin @@ -42,9 +43,10 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase): dst = self.dirs.double_sided_dir / dstname dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy(src, dst) - with mock.patch("documents.tasks.async_to_sync"), mock.patch( - "documents.consumer.async_to_sync", - ): + with mock.patch( + "documents.tasks.ProgressManager", + DummyProgressManager, + ), mock.patch("documents.consumer.async_to_sync"): msg = tasks.consume_file( ConsumableDocument( source=DocumentSource.ConsumeFolder, @@ -211,7 +213,7 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf") self.assertIsNotFile(self.staging_file) - self.assertRegex(msg, "Success. New document .* created") + self.assertRegex(msg, r"Success. New document id \d+ created") def test_subdirectory_upload(self): """ @@ -250,4 +252,4 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ msg = self.consume_file("simple.pdf") self.assertIsNotFile(self.staging_file) - self.assertRegex(msg, "Success. New document .* created") + self.assertRegex(msg, r"Success. New document id \d+ created") diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py index b688eecc9..e92a00682 100644 --- a/src/documents/tests/test_workflows.py +++ b/src/documents/tests/test_workflows.py @@ -24,6 +24,7 @@ from documents.models import WorkflowAction from documents.models import WorkflowTrigger from documents.signals import document_consumption_finished from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin from paperless_mail.models import MailAccount from paperless_mail.models import MailRule @@ -126,7 +127,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="INFO") as cm: tasks.consume_file( ConsumableDocument( @@ -203,7 +204,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): w.save() test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="INFO") as cm: tasks.consume_file( ConsumableDocument( @@ -294,7 +295,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="INFO") as cm: tasks.consume_file( ConsumableDocument( @@ -356,7 +357,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="DEBUG") as cm: tasks.consume_file( ConsumableDocument( @@ -407,7 +408,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="DEBUG") as cm: tasks.consume_file( ConsumableDocument( @@ -468,7 +469,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="DEBUG") as cm: tasks.consume_file( ConsumableDocument( @@ -529,7 +530,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="DEBUG") as cm: tasks.consume_file( ConsumableDocument( @@ -591,7 +592,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="DEBUG") as cm: tasks.consume_file( ConsumableDocument( @@ -686,7 +687,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase): test_file = self.SAMPLE_DIR / "simple.pdf" - with mock.patch("documents.tasks.async_to_sync"): + with mock.patch("documents.tasks.ProgressManager", DummyProgressManager): with self.assertLogs("paperless.matching", level="INFO") as cm: tasks.consume_file( ConsumableDocument( diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index 0b6d8fcad..4c3305d13 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -9,6 +9,7 @@ from os import PathLike from pathlib import Path from typing import Any from typing import Callable +from typing import Optional from typing import Union from unittest import mock @@ -23,6 +24,7 @@ from django.test import override_settings from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.parsers import ParseError +from documents.plugins.helpers import ProgressStatusOptions def setup_directories(): @@ -146,6 +148,11 @@ def util_call_with_backoff( class DirectoriesMixin: + """ + Creates and overrides settings for all folders and paths, then ensures + they are cleaned up on exit + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dirs = None @@ -160,6 +167,10 @@ class DirectoriesMixin: class FileSystemAssertsMixin: + """ + Utilities for checks various state information of the file system + """ + def assertIsFile(self, path: Union[PathLike, str]): self.assertTrue(Path(path).resolve().is_file(), f"File does not exist: {path}") @@ -188,6 +199,11 @@ class FileSystemAssertsMixin: class ConsumerProgressMixin: + """ + Mocks the Consumer _send_progress, preventing attempts to connect to Redis + and allowing access to its calls for verification + """ + def setUp(self) -> None: self.send_progress_patcher = mock.patch( "documents.consumer.Consumer._send_progress", @@ -310,3 +326,59 @@ class SampleDirMixin: SAMPLE_DIR = Path(__file__).parent / "samples" BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes" + + +class DummyProgressManager: + """ + A dummy handler for progress management that doesn't actually try to + connect to Redis. Payloads are stored for test assertions if needed. + + Use it with + mock.patch("documents.tasks.ProgressManager", DummyProgressManager) + """ + + def __init__(self, filename: str, task_id: Optional[str] = None) -> None: + self.filename = filename + self.task_id = task_id + print("hello world") + self.payloads = [] + + def __enter__(self): + self.open() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def open(self) -> None: + pass + + def close(self) -> None: + pass + + def send_progress( + self, + status: ProgressStatusOptions, + message: str, + current_progress: int, + max_progress: int, + extra_args: Optional[dict[str, Union[str, int]]] = None, + ) -> None: + # Ensure the layer is open + self.open() + + payload = { + "type": "status_update", + "data": { + "filename": self.filename, + "task_id": self.task_id, + "current_progress": current_progress, + "max_progress": max_progress, + "status": status, + "message": message, + }, + } + if extra_args is not None: + payload["data"].update(extra_args) + + self.payloads.append(payload)