mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-11 23:59:31 -06:00
Merge branch 'dev' into feature-document-versions-1218
This commit is contained in:
@@ -5,6 +5,7 @@ import tempfile
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
@@ -32,12 +33,12 @@ from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import parse_date
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.base import AlwaysRunPluginMixin
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import NoCleanupPluginMixin
|
||||
from documents.plugins.base import NoSetupPluginMixin
|
||||
from documents.plugins.date_parsing import get_date_parser
|
||||
from documents.plugins.helpers import ProgressManager
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.signals import document_consumption_finished
|
||||
@@ -49,6 +50,8 @@ from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
|
||||
class WorkflowTriggerPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
@@ -126,7 +129,7 @@ class ConsumerPluginMixin:
|
||||
status: ProgressStatusOptions,
|
||||
message: ConsumerStatusShortMessage | str | None = None,
|
||||
document_id=None,
|
||||
): # pragma: no cover
|
||||
) -> None: # pragma: no cover
|
||||
self.status_mgr.send_progress(
|
||||
status,
|
||||
message,
|
||||
@@ -162,9 +165,9 @@ class ConsumerPlugin(
|
||||
ConsumerPluginMixin,
|
||||
ConsumeTaskPlugin,
|
||||
):
|
||||
logging_name = "paperless.consumer"
|
||||
logging_name = LOGGING_NAME
|
||||
|
||||
def run_pre_consume_script(self):
|
||||
def run_pre_consume_script(self) -> None:
|
||||
"""
|
||||
If one is configured and exists, run the pre-consume script and
|
||||
handle its output and/or errors
|
||||
@@ -207,7 +210,7 @@ class ConsumerPlugin(
|
||||
exception=e,
|
||||
)
|
||||
|
||||
def run_post_consume_script(self, document: Document):
|
||||
def run_post_consume_script(self, document: Document) -> None:
|
||||
"""
|
||||
If one is configured and exists, run the pre-consume script and
|
||||
handle its output and/or errors
|
||||
@@ -367,7 +370,10 @@ class ConsumerPlugin(
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
def progress_callback(current_progress, max_progress): # pragma: no cover
|
||||
def progress_callback(
|
||||
current_progress,
|
||||
max_progress,
|
||||
) -> None: # pragma: no cover
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 50 + 20)
|
||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
||||
@@ -432,7 +438,8 @@ class ConsumerPlugin(
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
date = parse_date(self.filename, text)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
|
||||
@@ -535,7 +542,6 @@ class ConsumerPlugin(
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
document.storage_type,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
@@ -543,7 +549,6 @@ class ConsumerPlugin(
|
||||
)
|
||||
|
||||
self._write(
|
||||
document.storage_type,
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
@@ -555,7 +560,6 @@ class ConsumerPlugin(
|
||||
)
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
document.storage_type,
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
@@ -675,8 +679,6 @@ class ConsumerPlugin(
|
||||
)
|
||||
self.log.debug(f"Creation date from st_mtime: {create_date}")
|
||||
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
if self.metadata.filename:
|
||||
title = Path(self.metadata.filename).stem
|
||||
else:
|
||||
@@ -703,7 +705,6 @@ class ConsumerPlugin(
|
||||
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
storage_type=storage_type,
|
||||
page_count=page_count,
|
||||
original_filename=self.filename,
|
||||
)
|
||||
@@ -714,7 +715,7 @@ class ConsumerPlugin(
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document):
|
||||
def apply_overrides(self, document) -> None:
|
||||
if self.metadata.correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(
|
||||
pk=self.metadata.correspondent_id,
|
||||
@@ -774,7 +775,7 @@ class ConsumerPlugin(
|
||||
}
|
||||
CustomFieldInstance.objects.create(**args) # adds to document
|
||||
|
||||
def _write(self, storage_type, source, target):
|
||||
def _write(self, source, target) -> None:
|
||||
with (
|
||||
Path(source).open("rb") as read_file,
|
||||
Path(target).open("wb") as write_file,
|
||||
@@ -797,9 +798,9 @@ class ConsumerPreflightPlugin(
|
||||
ConsumeTaskPlugin,
|
||||
):
|
||||
NAME: str = "ConsumerPreflightPlugin"
|
||||
logging_name = "paperless.consumer"
|
||||
logging_name = LOGGING_NAME
|
||||
|
||||
def pre_check_file_exists(self):
|
||||
def pre_check_file_exists(self) -> None:
|
||||
"""
|
||||
Confirm the input file still exists where it should
|
||||
"""
|
||||
@@ -813,7 +814,7 @@ class ConsumerPreflightPlugin(
|
||||
f"Cannot consume {self.input_doc.original_file}: File not found.",
|
||||
)
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
def pre_check_duplicate(self) -> None:
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
@@ -823,21 +824,47 @@ class ConsumerPreflightPlugin(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
if existing_doc.exists():
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
||||
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
|
||||
|
||||
if existing_doc.first().deleted_at is not None:
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
||||
log_msg += " Note: existing document is in the trash."
|
||||
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
Path(self.input_doc.original_file).unlink()
|
||||
self._fail(
|
||||
msg,
|
||||
log_msg,
|
||||
existing_doc = existing_doc.order_by("-created")
|
||||
duplicates_in_trash = existing_doc.filter(deleted_at__isnull=False)
|
||||
log_msg = (
|
||||
f"Consuming duplicate {self.filename}: "
|
||||
f"{existing_doc.count()} existing document(s) share the same content."
|
||||
)
|
||||
|
||||
def pre_check_directories(self):
|
||||
if duplicates_in_trash.exists():
|
||||
log_msg += " Note: at least one existing document is in the trash."
|
||||
|
||||
self.log.warning(log_msg)
|
||||
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
duplicate = existing_doc.first()
|
||||
duplicate_label = (
|
||||
duplicate.title
|
||||
or duplicate.original_filename
|
||||
or (Path(duplicate.filename).name if duplicate.filename else None)
|
||||
or str(duplicate.pk)
|
||||
)
|
||||
|
||||
Path(self.input_doc.original_file).unlink()
|
||||
|
||||
failure_msg = (
|
||||
f"Not consuming {self.filename}: "
|
||||
f"It is a duplicate of {duplicate_label} (#{duplicate.pk})"
|
||||
)
|
||||
status_msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
||||
|
||||
if duplicates_in_trash.exists():
|
||||
status_msg = (
|
||||
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
||||
)
|
||||
failure_msg += " Note: existing document is in the trash."
|
||||
|
||||
self._fail(
|
||||
status_msg,
|
||||
failure_msg,
|
||||
)
|
||||
|
||||
def pre_check_directories(self) -> None:
|
||||
"""
|
||||
Ensure all required directories exist before attempting to use them
|
||||
"""
|
||||
@@ -846,12 +873,38 @@ class ConsumerPreflightPlugin(
|
||||
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def pre_check_asn_value(self):
|
||||
def run(self) -> None:
|
||||
self._send_progress(
|
||||
0,
|
||||
100,
|
||||
ProgressStatusOptions.STARTED,
|
||||
ConsumerStatusShortMessage.NEW_FILE,
|
||||
)
|
||||
|
||||
# Make sure that preconditions for consuming the file are met.
|
||||
|
||||
self.pre_check_file_exists()
|
||||
self.pre_check_duplicate()
|
||||
self.pre_check_directories()
|
||||
|
||||
|
||||
class AsnCheckPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
NoSetupPluginMixin,
|
||||
AlwaysRunPluginMixin,
|
||||
LoggingMixin,
|
||||
ConsumerPluginMixin,
|
||||
ConsumeTaskPlugin,
|
||||
):
|
||||
NAME: str = "AsnCheckPlugin"
|
||||
logging_name = LOGGING_NAME
|
||||
|
||||
def pre_check_asn_value(self) -> None:
|
||||
"""
|
||||
Check that if override_asn is given, it is unique and within a valid range
|
||||
"""
|
||||
if self.metadata.asn is None:
|
||||
# check not necessary in case no ASN gets set
|
||||
# if ASN is None
|
||||
return
|
||||
# Validate the range is above zero and less than uint32_t max
|
||||
# otherwise, Whoosh can't handle it in the index
|
||||
@@ -883,16 +936,4 @@ class ConsumerPreflightPlugin(
|
||||
)
|
||||
|
||||
def run(self) -> None:
|
||||
self._send_progress(
|
||||
0,
|
||||
100,
|
||||
ProgressStatusOptions.STARTED,
|
||||
ConsumerStatusShortMessage.NEW_FILE,
|
||||
)
|
||||
|
||||
# Make sure that preconditions for consuming the file are met.
|
||||
|
||||
self.pre_check_file_exists()
|
||||
self.pre_check_duplicate()
|
||||
self.pre_check_directories()
|
||||
self.pre_check_asn_value()
|
||||
|
||||
Reference in New Issue
Block a user