Refactor file consumption task to allow beginnings of a plugin system (#5367)

This commit is contained in:
Trenton H 2024-01-13 08:11:14 -08:00 committed by GitHub
parent 4dbf8d7969
commit 2da5e46386
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 767 additions and 531 deletions

View File

@ -3,7 +3,6 @@ import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Final
from typing import Optional
from django.conf import settings
@ -15,8 +14,9 @@ from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
@ -26,7 +26,7 @@ logger = logging.getLogger("paperless.barcodes")
@dataclass(frozen=True)
class Barcode:
"""
Holds the information about a single barcode and its location
Holds the information about a single barcode and its location in a document
"""
page: int
@ -49,77 +49,111 @@ class Barcode:
return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
class BarcodeReader:
def __init__(self, filepath: Path, mime_type: str) -> None:
self.file: Final[Path] = filepath
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: list[Barcode] = []
self._tiff_conversion_done = False
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
class BarcodePlugin(ConsumeTaskPlugin):
NAME: str = "BarcodePlugin"
@property
def able_to_run(self) -> bool:
"""
Able to run if:
- ASN from barcode detection is enabled or
- Barcode support is enabled and the mime type is supported
"""
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
supported_mimes = {"application/pdf", "image/tiff"}
else:
self.SUPPORTED_FILE_MIMES = {"application/pdf"}
supported_mimes = {"application/pdf"}
def __enter__(self):
if self.supported_mime_type:
self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes")
return self
return (
settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
) and self.input_doc.mime_type in supported_mimes
def __exit__(self, exc_type, exc_val, exc_tb):
if self.temp_dir is not None:
self.temp_dir.cleanup()
self.temp_dir = None
def setup(self):
self.temp_dir = tempfile.TemporaryDirectory(
dir=self.base_tmp_dir,
prefix="barcode",
)
self.pdf_file = self.input_doc.original_file
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
@property
def supported_mime_type(self) -> bool:
"""
Return True if the given mime type is supported for barcodes, false otherwise
"""
return self.mime in self.SUPPORTED_FILE_MIMES
def run(self) -> Optional[str]:
# Maybe do the conversion of TIFF to PDF
self.convert_from_tiff_to_pdf()
@property
def asn(self) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
if not self.supported_mime_type:
return None
# Ensure the barcodes have been read
# Locate any barcodes in the files
self.detect()
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
asn_text = next(
(x.value for x in self.barcodes if x.is_asn),
None,
# Update/overwrite an ASN if possible
located_asn = self.asn
if located_asn is not None:
logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn
separator_pages = self.get_separation_pages()
if not separator_pages:
return "No pages to split on!"
# We have pages to split against
# Note this does NOT use the base_temp_dir, as that will be removed
tmp_dir = Path(
tempfile.mkdtemp(
dir=settings.SCRATCH_DIR,
prefix="paperless-barcode-split-",
),
).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
task = tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=self.input_doc.source,
mailrule_id=self.input_doc.mailrule_id,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
self.metadata,
)
logger.info(f"Created new task {task.id} for {new_document.name}")
# This file is now two or more files
self.input_doc.original_file.unlink()
msg = "Barcode splitting complete!"
# Update the progress to complete
self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
# Request the consume task stops
raise StopConsumeTaskError(msg)
def cleanup(self) -> None:
self.temp_dir.cleanup()
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
return
self.pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
Path(self.temp_dir.name),
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
self._tiff_conversion_done = True
@staticmethod
def read_barcodes_zxing(image: Image) -> list[str]:
def read_barcodes_zxing(image: Image.Image) -> list[str]:
barcodes = []
import zxingcpp
@ -135,7 +169,7 @@ class BarcodeReader:
return barcodes
@staticmethod
def read_barcodes_pyzbar(image: Image) -> list[str]:
def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
barcodes = []
from pyzbar import pyzbar
@ -154,18 +188,6 @@ class BarcodeReader:
return barcodes
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.mime != "image/tiff" or self._tiff_conversion_done:
return
self._tiff_conversion_done = True
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
@ -218,10 +240,45 @@ class BarcodeReader:
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.exception(
logger.warning(
f"Exception during barcode scanning: {e}",
)
@property
def asn(self) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
# Ensure the barcodes have been read
self.detect()
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
asn_text = next(
(x.value for x in self.barcodes if x.is_asn),
None,
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
@ -251,7 +308,7 @@ class BarcodeReader:
"""
document_paths = []
fname = self.file.stem
fname = self.input_doc.original_file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
@ -292,58 +349,8 @@ class BarcodeReader:
with open(savepath, "wb") as out:
dst.save(out)
copy_basic_file_stats(self.file, savepath)
copy_basic_file_stats(self.input_doc.original_file, savepath)
document_paths.append(savepath)
return document_paths
def separate(
self,
source: DocumentSource,
overrides: DocumentMetadataOverrides,
) -> bool:
"""
Separates the document, based on barcodes and configuration, creating new
documents as required in the appropriate location.
Returns True if a split happened, False otherwise
"""
# Do nothing
if not self.supported_mime_type:
logger.warning(f"Unsupported file format for barcode reader: {self.mime}")
return False
# Does nothing unless needed
self.convert_from_tiff_to_pdf()
# Actually read the codes, if any
self.detect()
separator_pages = self.get_separation_pages()
# Also do nothing
if not separator_pages:
logger.warning("No pages to split on!")
return False
tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=source,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
overrides,
)
logger.info("Barcode splitting complete!")
return True

View File

@ -21,7 +21,6 @@ from filelock import FileLock
from rest_framework.reverse import reverse
from documents.classifier import load_classifier
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
@ -42,12 +41,83 @@ from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object
from documents.plugins.base import AlwaysRunPluginMixin
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.signals import document_consumption_finished
from documents.signals import document_consumption_started
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
class WorkflowTriggerPlugin(
NoCleanupPluginMixin,
NoSetupPluginMixin,
AlwaysRunPluginMixin,
ConsumeTaskPlugin,
):
NAME: str = "WorkflowTriggerPlugin"
def run(self) -> Optional[str]:
"""
Get overrides from matching workflows
"""
overrides = DocumentMetadataOverrides()
for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
template_overrides = DocumentMetadataOverrides()
if document_matches_workflow(
self.input_doc,
workflow,
WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
):
for action in workflow.actions.all():
if action.assign_title is not None:
template_overrides.title = action.assign_title
if action.assign_tags is not None:
template_overrides.tag_ids = [
tag.pk for tag in action.assign_tags.all()
]
if action.assign_correspondent is not None:
template_overrides.correspondent_id = (
action.assign_correspondent.pk
)
if action.assign_document_type is not None:
template_overrides.document_type_id = (
action.assign_document_type.pk
)
if action.assign_storage_path is not None:
template_overrides.storage_path_id = (
action.assign_storage_path.pk
)
if action.assign_owner is not None:
template_overrides.owner_id = action.assign_owner.pk
if action.assign_view_users is not None:
template_overrides.view_users = [
user.pk for user in action.assign_view_users.all()
]
if action.assign_view_groups is not None:
template_overrides.view_groups = [
group.pk for group in action.assign_view_groups.all()
]
if action.assign_change_users is not None:
template_overrides.change_users = [
user.pk for user in action.assign_change_users.all()
]
if action.assign_change_groups is not None:
template_overrides.change_groups = [
group.pk for group in action.assign_change_groups.all()
]
if action.assign_custom_fields is not None:
template_overrides.custom_field_ids = [
field.pk for field in action.assign_custom_fields.all()
]
overrides.update(template_overrides)
self.metadata.update(overrides)
class ConsumerError(Exception):
pass
@ -602,70 +672,6 @@ class Consumer(LoggingMixin):
return document
def get_workflow_overrides(
self,
input_doc: ConsumableDocument,
) -> DocumentMetadataOverrides:
"""
Get overrides from matching workflows
"""
overrides = DocumentMetadataOverrides()
for workflow in Workflow.objects.filter(enabled=True).order_by("order"):
template_overrides = DocumentMetadataOverrides()
if document_matches_workflow(
input_doc,
workflow,
WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
):
for action in workflow.actions.all():
self.log.info(
f"Applying overrides in {action} from {workflow}",
)
if action.assign_title is not None:
template_overrides.title = action.assign_title
if action.assign_tags is not None:
template_overrides.tag_ids = [
tag.pk for tag in action.assign_tags.all()
]
if action.assign_correspondent is not None:
template_overrides.correspondent_id = (
action.assign_correspondent.pk
)
if action.assign_document_type is not None:
template_overrides.document_type_id = (
action.assign_document_type.pk
)
if action.assign_storage_path is not None:
template_overrides.storage_path_id = (
action.assign_storage_path.pk
)
if action.assign_owner is not None:
template_overrides.owner_id = action.assign_owner.pk
if action.assign_view_users is not None:
template_overrides.view_users = [
user.pk for user in action.assign_view_users.all()
]
if action.assign_view_groups is not None:
template_overrides.view_groups = [
group.pk for group in action.assign_view_groups.all()
]
if action.assign_change_users is not None:
template_overrides.change_users = [
user.pk for user in action.assign_change_users.all()
]
if action.assign_change_groups is not None:
template_overrides.change_groups = [
group.pk for group in action.assign_change_groups.all()
]
if action.assign_custom_fields is not None:
template_overrides.custom_field_ids = [
field.pk for field in action.assign_custom_fields.all()
]
overrides.update(template_overrides)
return overrides
def _parse_title_placeholders(self, title: str) -> str:
local_added = timezone.localtime(timezone.now())

View File

@ -3,127 +3,145 @@ import logging
import os
import shutil
from pathlib import Path
from typing import Final
from typing import Optional
from django.conf import settings
from pikepdf import Pdf
from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.base import StopConsumeTaskError
logger = logging.getLogger("paperless.double_sided")
# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES = 30
TIMEOUT_MINUTES: Final[int] = 30
TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60
# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
def collate(input_doc: ConsumableDocument) -> str:
"""
Tries to collate pages from 2 single sided scans of a double sided
document.
class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
NAME: str = "CollatePlugin"
When called with a file, it checks whether or not a staging file
exists, if not, the current file is turned into that staging file
containing the odd numbered pages.
If a staging file exists, and it is not too old, the current file is
considered to be the second part (the even numbered pages) and it will
collate the pages of both, the pages of the second file will be added
in reverse order, since the ADF will have scanned the pages from bottom
to top.
Returns a status message on success, or raises a ConsumerError
in case of failure.
"""
# Make sure scratch dir exists, Consumer might not have run yet
settings.SCRATCH_DIR.mkdir(exist_ok=True)
if input_doc.mime_type == "application/pdf":
pdf_file = input_doc.original_file
elif (
input_doc.mime_type == "image/tiff"
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
):
pdf_file = convert_from_tiff_to_pdf(
input_doc.original_file,
settings.SCRATCH_DIR,
)
input_doc.original_file.unlink()
else:
raise ConsumerError("Unsupported file type for collation of double-sided scans")
staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
valid_staging_exists = False
if staging.exists():
stats = os.stat(str(staging))
# if the file is older than the timeout, we don't consider
# it valid
if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
logger.warning("Outdated double sided staging file exists, deleting it")
os.unlink(str(staging))
else:
valid_staging_exists = True
if valid_staging_exists:
try:
# Collate pages from second PDF in reverse order
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
pdf2.pages.reverse()
try:
for i, page in enumerate(pdf2.pages):
pdf1.pages.insert(2 * i + 1, page)
except IndexError:
raise ConsumerError(
"This second file (even numbered pages) contains more "
"pages than the first/odd numbered one. This means the "
"two uploaded files don't belong to the same double-"
"sided scan. Please retry, starting with the odd "
"numbered pages again.",
)
# Merged file has the same path, but without the
# double-sided subdir. Therefore, it is also in the
# consumption dir and will be picked up for processing
old_file = input_doc.original_file
new_file = Path(
*(
part
for part in old_file.with_name(
f"{old_file.stem}-collated.pdf",
).parts
if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
),
)
# If the user didn't create the subdirs yet, do it for them
new_file.parent.mkdir(parents=True, exist_ok=True)
pdf1.save(new_file)
logger.info("Collated documents into new file %s", new_file)
return (
"Success. Even numbered pages of double sided scan collated "
"with odd pages"
)
finally:
# Delete staging and recently uploaded file no matter what.
# If any error occurs, the user needs to be able to restart
# the process from scratch; after all, the staging file
# with the odd numbered pages might be the culprit
pdf_file.unlink()
staging.unlink()
else:
shutil.move(pdf_file, staging)
# update access to modification time so we know if the file
# is outdated when another file gets uploaded
os.utime(staging, (dt.datetime.now().timestamp(),) * 2)
logger.info(
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
staging,
)
@property
def able_to_run(self) -> bool:
return (
"Received odd numbered pages of double sided scan, waiting up to "
f"{TIMEOUT_MINUTES} minutes for even numbered pages"
settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
in self.input_doc.original_file.parts
)
def run(self) -> Optional[str]:
"""
Tries to collate pages from 2 single sided scans of a double sided
document.
When called with a file, it checks whether or not a staging file
exists, if not, the current file is turned into that staging file
containing the odd numbered pages.
If a staging file exists, and it is not too old, the current file is
considered to be the second part (the even numbered pages) and it will
collate the pages of both, the pages of the second file will be added
in reverse order, since the ADF will have scanned the pages from bottom
to top.
Returns a status message on success, or raises a ConsumerError
in case of failure.
"""
if self.input_doc.mime_type == "application/pdf":
pdf_file = self.input_doc.original_file
elif (
self.input_doc.mime_type == "image/tiff"
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
):
pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
self.base_tmp_dir,
)
self.input_doc.original_file.unlink()
else:
raise ConsumerError(
"Unsupported file type for collation of double-sided scans",
)
staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME
valid_staging_exists = False
if staging.exists():
stats = staging.stat()
# if the file is older than the timeout, we don't consider
# it valid
if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
logger.warning("Outdated double sided staging file exists, deleting it")
staging.unlink()
else:
valid_staging_exists = True
if valid_staging_exists:
try:
# Collate pages from second PDF in reverse order
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
pdf2.pages.reverse()
try:
for i, page in enumerate(pdf2.pages):
pdf1.pages.insert(2 * i + 1, page)
except IndexError:
raise ConsumerError(
"This second file (even numbered pages) contains more "
"pages than the first/odd numbered one. This means the "
"two uploaded files don't belong to the same double-"
"sided scan. Please retry, starting with the odd "
"numbered pages again.",
)
# Merged file has the same path, but without the
# double-sided subdir. Therefore, it is also in the
# consumption dir and will be picked up for processing
old_file = self.input_doc.original_file
new_file = Path(
*(
part
for part in old_file.with_name(
f"{old_file.stem}-collated.pdf",
).parts
if part
!= settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
),
)
# If the user didn't create the subdirs yet, do it for them
new_file.parent.mkdir(parents=True, exist_ok=True)
pdf1.save(new_file)
logger.info("Collated documents into new file %s", new_file)
raise StopConsumeTaskError(
"Success. Even numbered pages of double sided scan collated "
"with odd pages",
)
finally:
# Delete staging and recently uploaded file no matter what.
# If any error occurs, the user needs to be able to restart
# the process from scratch; after all, the staging file
# with the odd numbered pages might be the culprit
pdf_file.unlink()
staging.unlink()
else:
shutil.move(pdf_file, staging)
# update access to modification time so we know if the file
# is outdated when another file gets uploaded
timestamp = dt.datetime.now().timestamp()
os.utime(staging, (timestamp, timestamp))
logger.info(
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
staging,
)
raise StopConsumeTaskError(
"Received odd numbered pages of double sided scan, waiting up to "
f"{TIMEOUT_MINUTES} minutes for even numbered pages",
)

View File

View File

@ -0,0 +1,131 @@
import abc
from pathlib import Path
from typing import Final
from typing import Optional
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.plugins.helpers import ProgressManager
class StopConsumeTaskError(Exception):
"""
A plugin setup or run may raise this to exit the asynchronous consume task.
Most likely, this means it has created one or more new tasks to execute instead,
such as when a barcode has been used to create new documents
"""
def __init__(self, message: str) -> None:
self.message = message
super().__init__(message)
class ConsumeTaskPlugin(abc.ABC):
"""
Defines the interface for a plugin for the document consume task
Meanings as per RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119)
Plugin Implementation
The plugin SHALL implement property able_to_run and methods setup, run and cleanup.
The plugin property able_to_run SHALL return True if the plugin is able to run, given the conditions, settings and document information.
The plugin property able_to_run MAY be hardcoded to return True.
The plugin setup SHOULD perform any resource creation or additional initialization needed to run the document.
The plugin setup MAY be a non-operation.
The plugin cleanup SHOULD perform resource cleanup, including in the event of an error.
The plugin cleanup MAY be a non-operation.
The plugin run SHALL perform any operations against the document or system state required for the plugin.
The plugin run MAY update the document metadata.
The plugin run MAY return an informational message.
The plugin run MAY raise StopConsumeTaskError to cease any further operations against the document.
Plugin Manager Implementation
The plugin manager SHALL provide the plugin with the input document, document metadata, progress manager and a created temporary directory.
The plugin manager SHALL execute the plugin setup, run and cleanup, in that order IF the plugin property able_to_run is True.
The plugin manager SHOULD log the return message of executing a plugin's run.
The plugin manager SHALL always execute the plugin cleanup, IF the plugin property able_to_run is True.
The plugin manager SHALL cease calling plugins and exit the task IF a plugin raises StopConsumeTaskError.
The plugin manager SHOULD return the StopConsumeTaskError message IF a plugin raises StopConsumeTaskError.
"""
NAME: str = "ConsumeTaskPlugin"
def __init__(
self,
input_doc: ConsumableDocument,
metadata: DocumentMetadataOverrides,
status_mgr: ProgressManager,
base_tmp_dir: Path,
task_id: str,
) -> None:
super().__init__()
self.input_doc = input_doc
self.metadata = metadata
self.base_tmp_dir: Final = base_tmp_dir
self.status_mgr = status_mgr
self.task_id: Final = task_id
@abc.abstractproperty
def able_to_run(self) -> bool:
"""
Return True if the conditions are met for the plugin to run, False otherwise
If False, setup(), run() and cleanup() will not be called
"""
@abc.abstractmethod
def setup(self) -> None:
"""
Allows the plugin to perform any additional setup it may need, such as creating
a temporary directory, copying a file somewhere, etc.
Executed before run()
In general, this should be the "light" work, not the bulk of processing
"""
@abc.abstractmethod
def run(self) -> Optional[str]:
"""
The bulk of plugin processing, this does whatever action the plugin is for.
Executed after setup() and before cleanup()
"""
@abc.abstractmethod
def cleanup(self) -> None:
"""
Allows the plugin to execute any cleanup it may require
Executed after run(), even in the case of error
"""
class AlwaysRunPluginMixin(ConsumeTaskPlugin):
"""
A plugin which is always able to run
"""
@property
def able_to_run(self) -> bool:
return True
class NoSetupPluginMixin(ConsumeTaskPlugin):
"""
A plugin which requires no setup
"""
def setup(self) -> None:
pass
class NoCleanupPluginMixin(ConsumeTaskPlugin):
"""
A plugin which needs to clean up no files
"""
def cleanup(self) -> None:
pass

View File

@ -0,0 +1,82 @@
import enum
from typing import TYPE_CHECKING
from typing import Optional
from typing import Union
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from channels_redis.pubsub import RedisPubSubChannelLayer
class ProgressStatusOptions(str, enum.Enum):
STARTED = "STARTED"
WORKING = "WORKING"
SUCCESS = "SUCCESS"
FAILED = "FAILED"
class ProgressManager:
"""
Handles sending of progress information via the channel layer, with proper management
of the open/close of the layer to ensure messages go out and everything is cleaned up
"""
def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
self.filename = filename
self._channel: Optional[RedisPubSubChannelLayer] = None
self.task_id = task_id
def __enter__(self):
self.open()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def open(self) -> None:
"""
If not already opened, gets the default channel layer
opened and ready to send messages
"""
if self._channel is None:
self._channel = get_channel_layer()
def close(self) -> None:
"""
If it was opened, flushes the channel layer
"""
if self._channel is not None:
async_to_sync(self._channel.flush)
self._channel = None
def send_progress(
self,
status: ProgressStatusOptions,
message: str,
current_progress: int,
max_progress: int,
extra_args: Optional[dict[str, Union[str, int]]] = None,
) -> None:
# Ensure the layer is open
self.open()
# Just for IDEs
if TYPE_CHECKING:
assert self._channel is not None
payload = {
"type": "status_update",
"data": {
"filename": self.filename,
"task_id": self.task_id,
"current_progress": current_progress,
"max_progress": max_progress,
"status": status,
"message": message,
},
}
if extra_args is not None:
payload["data"].update(extra_args)
# Construct and send the update
async_to_sync(self._channel.group_send)("status_updates", payload)

View File

@ -2,30 +2,30 @@ import hashlib
import logging
import shutil
import uuid
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional
import tqdm
from asgiref.sync import async_to_sync
from celery import Task
from celery import shared_task
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models.signals import post_save
from filelock import FileLock
from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter
from documents import index
from documents import sanity_checker
from documents.barcodes import BarcodeReader
from documents.barcodes import BarcodePlugin
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import collate
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
@ -35,6 +35,10 @@ from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import ProgressManager
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions
from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
@ -102,70 +106,60 @@ def consume_file(
input_doc: ConsumableDocument,
overrides: Optional[DocumentMetadataOverrides] = None,
):
def send_progress(status="SUCCESS", message="finished"):
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": status,
"message": message,
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {e!s}")
# Default no overrides
if overrides is None:
overrides = DocumentMetadataOverrides()
# Handle collation of double-sided documents scanned in two parts
if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
in input_doc.original_file.parts
):
try:
msg = collate(input_doc)
send_progress(message=msg)
return msg
except ConsumerError as e:
send_progress(status="FAILURE", message=e.args[0])
raise e
plugins: list[type[ConsumeTaskPlugin]] = [
CollatePlugin,
BarcodePlugin,
WorkflowTriggerPlugin,
]
# read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
input_doc.source,
with ProgressManager(
overrides.filename or input_doc.original_file.name,
self.request.id,
) as status_mgr, TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
tmp_dir = Path(tmp_dir)
for plugin_class in plugins:
plugin_name = plugin_class.NAME
plugin = plugin_class(
input_doc,
overrides,
):
# notify the sender, otherwise the progress bar
# in the UI stays stuck
send_progress()
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
input_doc.original_file.unlink()
return "File successfully split"
status_mgr,
tmp_dir,
self.request.id,
)
# try reading the ASN from barcode
if (
settings.CONSUMER_ENABLE_ASN_BARCODE
and (located_asn := reader.asn) is not None
):
# Note this will take precedence over an API provided ASN
# But it's from a physical barcode, so that's good
overrides.asn = located_asn
logger.info(f"Found ASN in barcode: {overrides.asn}")
if not plugin.able_to_run:
logger.debug(f"Skipping plugin {plugin_name}")
continue
template_overrides = Consumer().get_workflow_overrides(
input_doc=input_doc,
)
try:
logger.debug(f"Executing plugin {plugin_name}")
plugin.setup()
overrides.update(template_overrides)
msg = plugin.run()
if msg is not None:
logger.info(f"{plugin_name} completed with: {msg}")
else:
logger.info(f"{plugin_name} completed with no message")
overrides = plugin.metadata
except StopConsumeTaskError as e:
logger.info(f"{plugin_name} requested task exit: {e.message}")
return e.message
except Exception as e:
logger.exception(f"{plugin_name} failed: {e}")
status_mgr.send_progress(ProgressStatusOptions.FAILED, f"{e}", 100, 100)
raise
finally:
plugin.cleanup()
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(

View File

@ -1,4 +1,7 @@
import shutil
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from unittest import mock
import pytest
@ -7,14 +10,13 @@ from django.test import TestCase
from django.test import override_settings
from documents import tasks
from documents.barcodes import BarcodeReader
from documents.consumer import ConsumerError
from documents.barcodes import BarcodePlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
@ -26,8 +28,29 @@ except ImportError:
HAS_ZXING_LIB = False
class GetReaderPluginMixin:
@contextmanager
def get_reader(self, filepath: Path) -> Generator[BarcodePlugin, None, None]:
reader = BarcodePlugin(
ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
DocumentMetadataOverrides(),
DummyProgressManager(filepath.name, None),
self.dirs.scratch_dir,
"task-id",
)
reader.setup()
yield reader
reader.cleanup()
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
class TestBarcode(
DirectoriesMixin,
FileSystemAssertsMixin,
SampleDirMixin,
GetReaderPluginMixin,
TestCase,
):
def test_scan_file_for_separating_barcodes(self):
"""
GIVEN:
@ -39,7 +62,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -60,7 +83,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -80,7 +103,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
with BarcodeReader(test_file, "image/tiff") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -97,7 +120,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
- No pages to split on
"""
test_file = self.SAMPLE_DIR / "simple.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -115,7 +138,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -133,7 +156,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "several-patcht-codes.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -158,7 +181,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
]:
test_file = self.BARCODE_SAMPLE_DIR / test_file
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -177,7 +200,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-unreadable.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -195,7 +218,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-fax-image.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -214,7 +237,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-qr.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -234,7 +257,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -255,7 +278,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-custom.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -276,7 +299,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -296,7 +319,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-custom.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -315,7 +338,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "many-qr-codes.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -334,7 +357,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.SAMPLE_DIR / "password-is-test.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
warning = cm.output[0]
expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
@ -356,7 +379,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
documents = reader.separate_pages({1: False})
self.assertEqual(reader.pdf_file, test_file)
@ -373,7 +396,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-double.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
documents = reader.separate_pages({1: False, 2: False})
self.assertEqual(len(documents), 2)
@ -385,32 +408,18 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
WHEN:
- No separation pages are provided
THEN:
- No new documents are produced
- A warning is logged
- Nothing happens
"""
test_file = self.SAMPLE_DIR / "simple.pdf"
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
with BarcodeReader(test_file, "application/pdf") as reader:
self.assertFalse(
reader.separate(
DocumentSource.ApiUpload,
DocumentMetadataOverrides(),
),
)
self.assertEqual(
cm.output,
[
"WARNING:paperless.barcodes:No pages to split on!",
],
)
with self.get_reader(test_file) as reader:
self.assertEqual("No pages to split on!", reader.run())
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
@mock.patch("documents.consumer.Consumer.try_consume_file")
def test_consume_barcode_unsupported_jpg_file(self, m):
def test_consume_barcode_unsupported_jpg_file(self):
"""
GIVEN:
- JPEG image as input
@ -422,35 +431,8 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.SAMPLE_DIR / "simple.jpg"
dst = settings.SCRATCH_DIR / "simple.jpg"
shutil.copy(test_file, dst)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
self.assertIn(
"Success",
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
),
)
self.assertListEqual(
cm.output,
[
"WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
],
)
m.assert_called_once()
args, kwargs = m.call_args
self.assertIsNone(kwargs["override_filename"])
self.assertIsNone(kwargs["override_title"])
self.assertIsNone(kwargs["override_correspondent_id"])
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
with self.get_reader(test_file) as reader:
self.assertFalse(reader.able_to_run)
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
@ -467,7 +449,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-2.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -504,7 +486,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, Test
"""
test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
separator_page_numbers = reader.get_separation_pages()
@ -550,7 +532,7 @@ class TestBarcodeNewConsume(
overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
self.assertEqual(
tasks.consume_file(
ConsumableDocument(
@ -559,10 +541,8 @@ class TestBarcodeNewConsume(
),
overrides,
),
"File successfully split",
"Barcode splitting complete!",
)
# We let the consumer know progress is done
progress_mocker.assert_called_once()
# 2 new document consume tasks created
self.assertEqual(self.consume_file_mock.call_count, 2)
@ -580,7 +560,20 @@ class TestBarcodeNewConsume(
self.assertEqual(overrides, new_doc_overrides)
class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase):
@contextmanager
def get_reader(self, filepath: Path) -> BarcodePlugin:
reader = BarcodePlugin(
ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
DocumentMetadataOverrides(),
DummyProgressManager(filepath.name, None),
self.dirs.scratch_dir,
"task-id",
)
reader.setup()
yield reader
reader.cleanup()
@override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
def test_scan_file_for_asn_custom_prefix(self):
"""
@ -594,7 +587,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
- The ASN integer value is correct
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
@ -613,7 +606,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
@ -630,55 +623,12 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
"""
test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
self.assertEqual(asn, None)
@override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
def test_scan_file_for_asn_already_exists(self):
"""
GIVEN:
- PDF with an ASN barcode
- ASN value already exists
WHEN:
- File is scanned for barcodes
THEN:
- ASN is retrieved from the document
- Consumption fails
"""
Document.objects.create(
title="WOW",
content="the content",
archive_serial_number=123,
checksum="456",
mime_type="application/pdf",
)
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
shutil.copy(test_file, dst)
with mock.patch("documents.consumer.Consumer._send_progress"):
with self.assertRaises(ConsumerError) as cm, self.assertLogs(
"paperless.consumer",
level="ERROR",
) as logs_cm:
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
)
self.assertIn("Not consuming barcode-39-asn-123.pdf", str(cm.exception))
error_str = logs_cm.output[0]
expected_str = "ERROR:paperless.consumer:Not consuming barcode-39-asn-123.pdf: Given ASN already exists!"
self.assertEqual(expected_str, error_str)
def test_scan_file_for_asn_barcode_invalid(self):
"""
GIVEN:
@ -692,7 +642,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
"""
test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-invalid.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
asn = reader.asn
self.assertEqual(reader.pdf_file, test_file)
@ -718,7 +668,9 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
shutil.copy(test_file, dst)
with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
with mock.patch(
"documents.consumer.Consumer.try_consume_file",
) as mocked_consumer:
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
@ -726,40 +678,11 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
),
None,
)
args, kwargs = mocked_call.call_args
mocked_consumer.assert_called_once()
args, kwargs = mocked_consumer.call_args
self.assertEqual(kwargs["override_asn"], 123)
@override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
def test_asn_too_large(self):
"""
GIVEN:
- ASN from barcode enabled
- Barcode contains too large an ASN value
WHEN:
- ASN from barcode checked for correctness
THEN:
- Exception is raised regarding size limits
"""
src = self.BARCODE_SAMPLE_DIR / "barcode-128-asn-too-large.pdf"
dst = self.dirs.scratch_dir / "barcode-128-asn-too-large.pdf"
shutil.copy(src, dst)
input_doc = ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
)
with mock.patch("documents.consumer.Consumer._send_progress"):
self.assertRaisesMessage(
ConsumerError,
"Given ASN 4294967296 is out of range [0, 4,294,967,295]",
tasks.consume_file,
input_doc,
)
@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
def test_scan_file_for_qrcode_without_upscale(self):
"""
@ -774,7 +697,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
self.assertEqual(len(reader.barcodes), 0)
@ -796,7 +719,7 @@ class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
test_file = self.BARCODE_SAMPLE_DIR / "barcode-qr-asn-000123-upscale-dpi.pdf"
with BarcodeReader(test_file, "application/pdf") as reader:
with self.get_reader(test_file) as reader:
reader.detect()
self.assertEqual(len(reader.barcodes), 1)
self.assertEqual(reader.asn, 123)

View File

@ -17,6 +17,7 @@ from documents.data_models import DocumentSource
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
@ -42,9 +43,10 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
dst = self.dirs.double_sided_dir / dstname
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(src, dst)
with mock.patch("documents.tasks.async_to_sync"), mock.patch(
"documents.consumer.async_to_sync",
):
with mock.patch(
"documents.tasks.ProgressManager",
DummyProgressManager,
), mock.patch("documents.consumer.async_to_sync"):
msg = tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
@ -211,7 +213,7 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")
self.assertRegex(msg, r"Success. New document id \d+ created")
def test_subdirectory_upload(self):
"""
@ -250,4 +252,4 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
msg = self.consume_file("simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")
self.assertRegex(msg, r"Success. New document id \d+ created")

View File

@ -24,6 +24,7 @@ from documents.models import WorkflowAction
from documents.models import WorkflowTrigger
from documents.signals import document_consumption_finished
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@ -126,7 +127,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
@ -203,7 +204,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
w.save()
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
@ -294,7 +295,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(
@ -356,7 +357,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
@ -407,7 +408,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
@ -468,7 +469,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
@ -529,7 +530,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
@ -591,7 +592,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="DEBUG") as cm:
tasks.consume_file(
ConsumableDocument(
@ -686,7 +687,7 @@ class TestWorkflows(DirectoriesMixin, FileSystemAssertsMixin, APITestCase):
test_file = self.SAMPLE_DIR / "simple.pdf"
with mock.patch("documents.tasks.async_to_sync"):
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
with self.assertLogs("paperless.matching", level="INFO") as cm:
tasks.consume_file(
ConsumableDocument(

View File

@ -9,6 +9,7 @@ from os import PathLike
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Optional
from typing import Union
from unittest import mock
@ -23,6 +24,7 @@ from django.test import override_settings
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.parsers import ParseError
from documents.plugins.helpers import ProgressStatusOptions
def setup_directories():
@ -146,6 +148,11 @@ def util_call_with_backoff(
class DirectoriesMixin:
"""
Creates and overrides settings for all folders and paths, then ensures
they are cleaned up on exit
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.dirs = None
@ -160,6 +167,10 @@ class DirectoriesMixin:
class FileSystemAssertsMixin:
"""
Utilities for checks various state information of the file system
"""
def assertIsFile(self, path: Union[PathLike, str]):
self.assertTrue(Path(path).resolve().is_file(), f"File does not exist: {path}")
@ -188,6 +199,11 @@ class FileSystemAssertsMixin:
class ConsumerProgressMixin:
"""
Mocks the Consumer _send_progress, preventing attempts to connect to Redis
and allowing access to its calls for verification
"""
def setUp(self) -> None:
self.send_progress_patcher = mock.patch(
"documents.consumer.Consumer._send_progress",
@ -310,3 +326,59 @@ class SampleDirMixin:
SAMPLE_DIR = Path(__file__).parent / "samples"
BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
class DummyProgressManager:
"""
A dummy handler for progress management that doesn't actually try to
connect to Redis. Payloads are stored for test assertions if needed.
Use it with
mock.patch("documents.tasks.ProgressManager", DummyProgressManager)
"""
def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
self.filename = filename
self.task_id = task_id
print("hello world")
self.payloads = []
def __enter__(self):
self.open()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def open(self) -> None:
pass
def close(self) -> None:
pass
def send_progress(
self,
status: ProgressStatusOptions,
message: str,
current_progress: int,
max_progress: int,
extra_args: Optional[dict[str, Union[str, int]]] = None,
) -> None:
# Ensure the layer is open
self.open()
payload = {
"type": "status_update",
"data": {
"filename": self.filename,
"task_id": self.task_id,
"current_progress": current_progress,
"max_progress": max_progress,
"status": status,
"message": message,
},
}
if extra_args is not None:
payload["data"].update(extra_args)
self.payloads.append(payload)