Mirror of https://github.com/paperless-ngx/paperless-ngx.git
Chore: refactor consumer plugin checks to a pre-flight plugin (#9994)
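This change moves the four pre_check_* validations out of ConsumerPlugin.run() into a shared ConsumerPluginMixin and a new ConsumerPreflightPlugin whose run() performs only those checks. As a rough sketch of the intended ordering (preflight first, consumer second), here is a self-contained stand-in; the runner loop and the *StandIn classes below are hypothetical and only mimic the run()/ConsumerError contract visible in this diff, not the repository's actual task code:

from __future__ import annotations


class ConsumerError(Exception):
    """Raised when a pre-flight check rejects the file (via _fail() in the diff)."""


class PreflightStandIn:
    """Stand-in for ConsumerPreflightPlugin: run() only validates the input."""

    def run(self) -> None:
        # The real plugin calls pre_check_file_exists(), pre_check_duplicate(),
        # pre_check_directories() and pre_check_asn_value(); each raises
        # ConsumerError through _fail() when a precondition is not met.
        raise ConsumerError("example failure: duplicate document")


class ConsumerStandIn:
    """Stand-in for ConsumerPlugin: assumes the preflight checks already passed."""

    def run(self) -> None:
        print("consuming document ...")


def consume(plugins: list[PreflightStandIn | ConsumerStandIn]) -> None:
    # Hypothetical runner: execute plugins in order and stop at the first failure,
    # so the consumer never runs if the preflight rejects the file.
    for plugin in plugins:
        try:
            plugin.run()
        except ConsumerError as err:
            print(f"stopped before consumption: {err}")
            return


if __name__ == "__main__":
    consume([PreflightStandIn(), ConsumerStandIn()])

In the commit itself, ConsumerPlugin.run() simply notes that the preflight has already run (see the "# Preflight has already run" hunk below) instead of repeating the checks.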
@@ -98,15 +98,7 @@ class ConsumerStatusShortMessage(str, Enum):
     FAILED = "failed"
 
 
-class ConsumerPlugin(
-    AlwaysRunPluginMixin,
-    NoSetupPluginMixin,
-    NoCleanupPluginMixin,
-    LoggingMixin,
-    ConsumeTaskPlugin,
-):
-    logging_name = "paperless.consumer"
-
+class ConsumerPluginMixin:
     def __init__(
         self,
         input_doc: ConsumableDocument,
@@ -155,88 +147,16 @@ class ConsumerPlugin(
         self.log.error(log_message or message, exc_info=exc_info)
         raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
 
-    def pre_check_file_exists(self):
-        """
-        Confirm the input file still exists where it should
-        """
-        if TYPE_CHECKING:
-            assert isinstance(self.input_doc.original_file, Path), (
-                self.input_doc.original_file
-            )
-        if not self.input_doc.original_file.is_file():
-            self._fail(
-                ConsumerStatusShortMessage.FILE_NOT_FOUND,
-                f"Cannot consume {self.input_doc.original_file}: File not found.",
-            )
-
-    def pre_check_duplicate(self):
-        """
-        Using the MD5 of the file, check this exact file doesn't already exist
-        """
-        with Path(self.input_doc.original_file).open("rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
-        existing_doc = Document.global_objects.filter(
-            Q(checksum=checksum) | Q(archive_checksum=checksum),
-        )
-        if existing_doc.exists():
-            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
-            log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
-
-            if existing_doc.first().deleted_at is not None:
-                msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
-                log_msg += " Note: existing document is in the trash."
-
-            if settings.CONSUMER_DELETE_DUPLICATES:
-                Path(self.input_doc.original_file).unlink()
-            self._fail(
-                msg,
-                log_msg,
-            )
-
-    def pre_check_directories(self):
-        """
-        Ensure all required directories exist before attempting to use them
-        """
-        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
-        settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
-        settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
-        settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
-
-    def pre_check_asn_value(self):
-        """
-        Check that if override_asn is given, it is unique and within a valid range
-        """
-        if self.metadata.asn is None:
-            # check not necessary in case no ASN gets set
-            return
-        # Validate the range is above zero and less than uint32_t max
-        # otherwise, Whoosh can't handle it in the index
-        if (
-            self.metadata.asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
-            or self.metadata.asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
-        ):
-            self._fail(
-                ConsumerStatusShortMessage.ASN_RANGE,
-                f"Not consuming {self.filename}: "
-                f"Given ASN {self.metadata.asn} is out of range "
-                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
-                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
-            )
-        existing_asn_doc = Document.global_objects.filter(
-            archive_serial_number=self.metadata.asn,
-        )
-        if existing_asn_doc.exists():
-            msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS
-            log_msg = f"Not consuming {self.filename}: Given ASN {self.metadata.asn} already exists!"
-
-            if existing_asn_doc.first().deleted_at is not None:
-                msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS_IN_TRASH
-                log_msg += " Note: existing document is in the trash."
-
-            self._fail(
-                msg,
-                log_msg,
-            )
+
+class ConsumerPlugin(
+    AlwaysRunPluginMixin,
+    NoSetupPluginMixin,
+    NoCleanupPluginMixin,
+    LoggingMixin,
+    ConsumerPluginMixin,
+    ConsumeTaskPlugin,
+):
+    logging_name = "paperless.consumer"
 
     def run_pre_consume_script(self):
         """
@@ -366,20 +286,7 @@ class ConsumerPlugin(
         tempdir = None
 
         try:
-            self._send_progress(
-                0,
-                100,
-                ProgressStatusOptions.STARTED,
-                ConsumerStatusShortMessage.NEW_FILE,
-            )
-
-            # Make sure that preconditions for consuming the file are met.
-
-            self.pre_check_file_exists()
-            self.pre_check_directories()
-            self.pre_check_duplicate()
-            self.pre_check_asn_value()
-
+            # Preflight has already run including progress update to 0%
             self.log.info(f"Consuming {self.filename}")
 
             # For the actual work, copy the file into a tempdir
@@ -837,3 +744,113 @@ class ConsumerPlugin(
             copy_basic_file_stats(source, target)
         except Exception:  # pragma: no cover
             pass
+
+
+class ConsumerPreflightPlugin(
+    NoCleanupPluginMixin,
+    NoSetupPluginMixin,
+    AlwaysRunPluginMixin,
+    LoggingMixin,
+    ConsumerPluginMixin,
+    ConsumeTaskPlugin,
+):
+    NAME: str = "ConsumerPreflightPlugin"
+    logging_name = "paperless.consumer"
+
+    def pre_check_file_exists(self):
+        """
+        Confirm the input file still exists where it should
+        """
+        if TYPE_CHECKING:
+            assert isinstance(self.input_doc.original_file, Path), (
+                self.input_doc.original_file
+            )
+        if not self.input_doc.original_file.is_file():
+            self._fail(
+                ConsumerStatusShortMessage.FILE_NOT_FOUND,
+                f"Cannot consume {self.input_doc.original_file}: File not found.",
+            )
+
+    def pre_check_duplicate(self):
+        """
+        Using the MD5 of the file, check this exact file doesn't already exist
+        """
+        with Path(self.input_doc.original_file).open("rb") as f:
+            checksum = hashlib.md5(f.read()).hexdigest()
+        existing_doc = Document.global_objects.filter(
+            Q(checksum=checksum) | Q(archive_checksum=checksum),
+        )
+        if existing_doc.exists():
+            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
+            log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
+
+            if existing_doc.first().deleted_at is not None:
+                msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
+                log_msg += " Note: existing document is in the trash."
+
+            if settings.CONSUMER_DELETE_DUPLICATES:
+                Path(self.input_doc.original_file).unlink()
+            self._fail(
+                msg,
+                log_msg,
+            )
+
+    def pre_check_directories(self):
+        """
+        Ensure all required directories exist before attempting to use them
+        """
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
+        settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
+        settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
+
+    def pre_check_asn_value(self):
+        """
+        Check that if override_asn is given, it is unique and within a valid range
+        """
+        if self.metadata.asn is None:
+            # check not necessary in case no ASN gets set
+            return
+        # Validate the range is above zero and less than uint32_t max
+        # otherwise, Whoosh can't handle it in the index
+        if (
+            self.metadata.asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
+            or self.metadata.asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
+        ):
+            self._fail(
+                ConsumerStatusShortMessage.ASN_RANGE,
+                f"Not consuming {self.filename}: "
+                f"Given ASN {self.metadata.asn} is out of range "
+                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
+                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
+            )
+        existing_asn_doc = Document.global_objects.filter(
+            archive_serial_number=self.metadata.asn,
+        )
+        if existing_asn_doc.exists():
+            msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS
+            log_msg = f"Not consuming {self.filename}: Given ASN {self.metadata.asn} already exists!"
+
+            if existing_asn_doc.first().deleted_at is not None:
+                msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS_IN_TRASH
+                log_msg += " Note: existing document is in the trash."
+
+            self._fail(
+                msg,
+                log_msg,
+            )
+
+    def run(self) -> None:
+        self._send_progress(
+            0,
+            100,
+            ProgressStatusOptions.STARTED,
+            ConsumerStatusShortMessage.NEW_FILE,
+        )
+
+        # Make sure that preconditions for consuming the file are met.
+
+        self.pre_check_file_exists()
+        self.pre_check_duplicate()
+        self.pre_check_directories()
+        self.pre_check_asn_value()