Changes the consumer to work on a temporary copy and provides that copy to the pre-consume script for modifications

This commit is contained in:
Trenton H 2023-01-28 09:32:40 -08:00
parent 9784ea4a60
commit 7dd9a4e089
2 changed files with 50 additions and 11 deletions

View File

@ -1,7 +1,10 @@
import datetime import datetime
import hashlib import hashlib
import os import os
import shutil
import tempfile
import uuid import uuid
from pathlib import Path
from subprocess import CompletedProcess from subprocess import CompletedProcess
from subprocess import run from subprocess import run
from typing import Optional from typing import Optional
@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.path = None self.path: Optional[Path] = None
self.original_path: Optional[Path] = None
self.filename = None self.filename = None
self.override_title = None self.override_title = None
self.override_correspondent_id = None self.override_correspondent_id = None
@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
filepath_arg = os.path.normpath(self.path) working_file_path = str(self.path)
original_file_path = str(self.original_path)
script_env = os.environ.copy() script_env = os.environ.copy()
script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
script_env["DOCUMENT_WORKING_PATH"] = working_file_path
try: try:
completed_proc = run( completed_proc = run(
args=[ args=[
settings.PRE_CONSUME_SCRIPT, settings.PRE_CONSUME_SCRIPT,
filepath_arg, original_file_path,
], ],
env=script_env, env=script_env,
capture_output=True, capture_output=True,
@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
exception=e, exception=e,
) )
def run_post_consume_script(self, document): def run_post_consume_script(self, document: Document):
if not settings.POST_CONSUME_SCRIPT: if not settings.POST_CONSUME_SCRIPT:
return return
@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
Return the document object if it was successfully created. Return the document object if it was successfully created.
""" """
self.path = path self.path = Path(path).resolve()
self.filename = override_filename or os.path.basename(path) self.filename = override_filename or self.path.name
self.override_title = override_title self.override_title = override_title
self.override_correspondent_id = override_correspondent_id self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id self.override_document_type_id = override_document_type_id
@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
self.log("info", f"Consuming {self.filename}") self.log("info", f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir
self.original_path = self.path
tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy(self.original_path, self.path)
# Determine the parser class. # Determine the parser class.
mime_type = magic.from_file(self.path, mime=True) mime_type = magic.from_file(self.path, mime=True)
@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed # Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}") self.log("debug", f"Deleting file {self.path}")
os.unlink(self.path) os.unlink(self.path)
self.original_path.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join( shadow_file = os.path.join(
os.path.dirname(self.path), os.path.dirname(self.original_path),
"._" + os.path.basename(self.path), "._" + os.path.basename(self.original_path),
) )
if os.path.isfile(shadow_file): if os.path.isfile(shadow_file):
@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
) )
finally: finally:
document_parser.cleanup() document_parser.cleanup()
tempdir.cleanup()
self.run_post_consume_script(document) self.run_post_consume_script(document)

View File

@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
with tempfile.NamedTemporaryFile() as script: with tempfile.NamedTemporaryFile() as script:
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer() c = Consumer()
c.path = "path-to-file" c.original_path = "path-to-file"
c.path = "/tmp/somewhere/path-to-file"
c.run_pre_consume_script() c.run_pre_consume_script()
m.assert_called_once() m.assert_called_once()
@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
args, kwargs = m.call_args args, kwargs = m.call_args
command = kwargs["args"] command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name) self.assertEqual(command[0], script.name)
self.assertEqual(command[1], "path-to-file") self.assertEqual(command[1], "path-to-file")
self.assertDictContainsSubset(
{
"DOCUMENT_SOURCE_PATH": c.original_path,
"DOCUMENT_WORKING_PATH": c.path,
},
environment,
)
@mock.patch("documents.consumer.Consumer.log") @mock.patch("documents.consumer.Consumer.log")
def test_script_with_output(self, mocked_log): def test_script_with_output(self, mocked_log):
""" """
@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once() m.assert_called_once()
args, kwargs = m.call_args _, kwargs = m.call_args
command = kwargs["args"] command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name) self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk)) self.assertEqual(command[1], str(doc.pk))
@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
self.assertEqual(command[7], "my_bank") self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"]) self.assertCountEqual(command[8].split(","), ["a", "b"])
self.assertDictContainsSubset(
{
"DOCUMENT_ID": str(doc.pk),
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
"DOCUMENT_CORRESPONDENT": "my_bank",
"DOCUMENT_TAGS": "a,b",
},
environment,
)
def test_script_exit_non_zero(self): def test_script_exit_non_zero(self):
""" """
GIVEN: GIVEN: