Merge branch 'dev' into feature-document-versions-1218

shamoon
2026-02-09 23:41:44 -08:00
724 changed files with 146173 additions and 72126 deletions


@@ -5,6 +5,7 @@ import tempfile
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Final
import magic
from django.conf import settings
@@ -32,12 +33,12 @@ from documents.models import WorkflowTrigger
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object
from documents.plugins.base import AlwaysRunPluginMixin
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.date_parsing import get_date_parser
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.signals import document_consumption_finished
@@ -49,6 +50,8 @@ from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
class WorkflowTriggerPlugin(
NoCleanupPluginMixin,
@@ -126,7 +129,7 @@ class ConsumerPluginMixin:
status: ProgressStatusOptions,
message: ConsumerStatusShortMessage | str | None = None,
document_id=None,
): # pragma: no cover
) -> None: # pragma: no cover
self.status_mgr.send_progress(
status,
message,
@@ -162,9 +165,9 @@ class ConsumerPlugin(
ConsumerPluginMixin,
ConsumeTaskPlugin,
):
logging_name = "paperless.consumer"
logging_name = LOGGING_NAME
def run_pre_consume_script(self):
def run_pre_consume_script(self) -> None:
"""
If one is configured and exists, run the pre-consume script and
handle its output and/or errors
@@ -207,7 +210,7 @@ class ConsumerPlugin(
exception=e,
)
def run_post_consume_script(self, document: Document):
def run_post_consume_script(self, document: Document) -> None:
"""
If one is configured and exists, run the post-consume script and
handle its output and/or errors
@@ -367,7 +370,10 @@ class ConsumerPlugin(
tempdir.cleanup()
raise
def progress_callback(current_progress, max_progress): # pragma: no cover
def progress_callback(
current_progress,
max_progress,
) -> None: # pragma: no cover
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
@@ -432,7 +438,8 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
date = parse_date(self.filename, text)
with get_date_parser() as date_parser:
date = next(date_parser.parse(self.filename, text), None)
archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(self.working_copy, mime_type)
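Date extraction above switches from the removed parse_date() helper to a context-managed parser whose parse() yields candidate dates, with the call site taking the first match or None. A minimal sketch of the interface implied by that call site; everything apart from the get_date_parser name is hypothetical, and the real documents.plugins.date_parsing module may differ:

    from collections.abc import Iterator
    from contextlib import contextmanager
    from datetime import datetime

    class _SketchDateParser:
        def parse(self, filename: str, text: str) -> Iterator[datetime]:
            # Yield candidate dates found in the filename and text, best match first.
            yield from ()

    @contextmanager
    def get_date_parser():
        parser = _SketchDateParser()
        try:
            yield parser
        finally:
            pass  # release whatever resources the real parser holds

    # Call sites then take the first candidate, or None when nothing matched:
    with get_date_parser() as date_parser:
        date = next(date_parser.parse("scan.pdf", "Invoice dated 2024-01-15"), None)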
@@ -535,7 +542,6 @@ class ConsumerPlugin(
create_source_path_directory(document.source_path)
self._write(
document.storage_type,
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy,
@@ -543,7 +549,6 @@ class ConsumerPlugin(
)
self._write(
document.storage_type,
thumbnail,
document.thumbnail_path,
)
@@ -555,7 +560,6 @@ class ConsumerPlugin(
)
create_source_path_directory(document.archive_path)
self._write(
document.storage_type,
archive_path,
document.archive_path,
)
@@ -675,8 +679,6 @@ class ConsumerPlugin(
)
self.log.debug(f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if self.metadata.filename:
title = Path(self.metadata.filename).stem
else:
@@ -703,7 +705,6 @@ class ConsumerPlugin(
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
created=create_date,
modified=create_date,
storage_type=storage_type,
page_count=page_count,
original_filename=self.filename,
)
@@ -714,7 +715,7 @@ class ConsumerPlugin(
return document
def apply_overrides(self, document):
def apply_overrides(self, document) -> None:
if self.metadata.correspondent_id:
document.correspondent = Correspondent.objects.get(
pk=self.metadata.correspondent_id,
@@ -774,7 +775,7 @@ class ConsumerPlugin(
}
CustomFieldInstance.objects.create(**args) # adds to document
def _write(self, storage_type, source, target):
def _write(self, source, target) -> None:
with (
Path(source).open("rb") as read_file,
Path(target).open("wb") as write_file,
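The _write signature above drops the storage_type parameter, and the hunk cuts off before the copy body. The sketch below only illustrates the slimmed-down shape as a plain binary copy; that is an assumption, and the real method body may do more:

    import shutil
    from pathlib import Path

    def _write(source, target) -> None:
        # With storage_type no longer threaded through, the helper reduces to a
        # plain binary copy (sketch; the real body is truncated in the hunk above).
        with (
            Path(source).open("rb") as read_file,
            Path(target).open("wb") as write_file,
        ):
            shutil.copyfileobj(read_file, write_file)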
@@ -797,9 +798,9 @@ class ConsumerPreflightPlugin(
ConsumeTaskPlugin,
):
NAME: str = "ConsumerPreflightPlugin"
logging_name = "paperless.consumer"
logging_name = LOGGING_NAME
def pre_check_file_exists(self):
def pre_check_file_exists(self) -> None:
"""
Confirm the input file still exists where it should
"""
@@ -813,7 +814,7 @@ class ConsumerPreflightPlugin(
f"Cannot consume {self.input_doc.original_file}: File not found.",
)
def pre_check_duplicate(self):
def pre_check_duplicate(self) -> None:
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
@@ -823,21 +824,47 @@ class ConsumerPreflightPlugin(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)
if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
if settings.CONSUMER_DELETE_DUPLICATES:
Path(self.input_doc.original_file).unlink()
self._fail(
msg,
log_msg,
existing_doc = existing_doc.order_by("-created")
duplicates_in_trash = existing_doc.filter(deleted_at__isnull=False)
log_msg = (
f"Consuming duplicate {self.filename}: "
f"{existing_doc.count()} existing document(s) share the same content."
)
def pre_check_directories(self):
if duplicates_in_trash.exists():
log_msg += " Note: at least one existing document is in the trash."
self.log.warning(log_msg)
if settings.CONSUMER_DELETE_DUPLICATES:
duplicate = existing_doc.first()
duplicate_label = (
duplicate.title
or duplicate.original_filename
or (Path(duplicate.filename).name if duplicate.filename else None)
or str(duplicate.pk)
)
Path(self.input_doc.original_file).unlink()
failure_msg = (
f"Not consuming {self.filename}: "
f"It is a duplicate of {duplicate_label} (#{duplicate.pk})"
)
status_msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
if duplicates_in_trash.exists():
status_msg = (
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
)
failure_msg += " Note: existing document is in the trash."
self._fail(
status_msg,
failure_msg,
)
def pre_check_directories(self) -> None:
"""
Ensure all required directories exist before attempting to use them
"""
@@ -846,12 +873,38 @@ class ConsumerPreflightPlugin(
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
def pre_check_asn_value(self):
def run(self) -> None:
self._send_progress(
0,
100,
ProgressStatusOptions.STARTED,
ConsumerStatusShortMessage.NEW_FILE,
)
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_duplicate()
self.pre_check_directories()
class AsnCheckPlugin(
NoCleanupPluginMixin,
NoSetupPluginMixin,
AlwaysRunPluginMixin,
LoggingMixin,
ConsumerPluginMixin,
ConsumeTaskPlugin,
):
NAME: str = "AsnCheckPlugin"
logging_name = LOGGING_NAME
def pre_check_asn_value(self) -> None:
"""
Check that if override_asn is given, it is unique and within a valid range
"""
if self.metadata.asn is None:
# check not necessary if no ASN gets set (ASN is None)
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
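The comparison itself is cut off by the hunk, but the comment pins the valid range to positive values no larger than the uint32_t maximum, since Whoosh cannot index anything beyond that. A hedged sketch of the bound; the constant names and the inclusive limits are assumptions, not the real check:

    ASN_MIN = 1
    ASN_MAX = 2**32 - 1  # uint32_t max, beyond which Whoosh cannot index the value

    def asn_in_range(asn: int) -> bool:
        return ASN_MIN <= asn <= ASN_MAX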
@@ -883,16 +936,4 @@ class ConsumerPreflightPlugin(
)
def run(self) -> None:
self._send_progress(
0,
100,
ProgressStatusOptions.STARTED,
ConsumerStatusShortMessage.NEW_FILE,
)
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_duplicate()
self.pre_check_directories()
self.pre_check_asn_value()
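The removed run() above was ConsumerPreflightPlugin's old entry point and included the ASN check; its replacement earlier in the diff stops after pre_check_directories, leaving ASN validation to the new AsnCheckPlugin. That plugin's own run() is not shown in these hunks, so the following is a purely hypothetical sketch of the split:

    # Purely hypothetical: the new AsnCheckPlugin's run() sits outside the hunks
    # shown here; presumably it only performs the relocated ASN validation.
    class AsnCheckPluginSketch:
        def pre_check_asn_value(self) -> None:
            ...  # the range/uniqueness check shown earlier in the diff

        def run(self) -> None:
            self.pre_check_asn_value()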