Fix: Allows pre-consume scripts to modify the working path again (#5260)

* Allows pre-consume scripts to modify the working path again and generally cleans up some confusion about working copy vs original
This commit is contained in:
Trenton H 2024-01-05 21:01:57 -08:00 committed by GitHub
parent 3115106dc1
commit a82e3771ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 37 additions and 28 deletions

View File

@ -136,6 +136,11 @@ script can access the following relevant environment variables set:
be triggered, leading to failures as two tasks work on the be triggered, leading to failures as two tasks work on the
same document path same document path
!!! warning
If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic
way, this may allow duplicate documents to be stored
A simple but common example for this would be creating a simple script A simple but common example for this would be creating a simple script
like this: like this:

View File

@ -135,24 +135,24 @@ class Consumer(LoggingMixin):
""" """
Confirm the input file still exists where it should Confirm the input file still exists where it should
""" """
if not os.path.isfile(self.path): if not os.path.isfile(self.original_path):
self._fail( self._fail(
ConsumerStatusShortMessage.FILE_NOT_FOUND, ConsumerStatusShortMessage.FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found.", f"Cannot consume {self.original_path}: File not found.",
) )
def pre_check_duplicate(self): def pre_check_duplicate(self):
""" """
Using the MD5 of the file, check this exact file doesn't already exist Using the MD5 of the file, check this exact file doesn't already exist
""" """
with open(self.path, "rb") as f: with open(self.original_path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest() checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.objects.filter( existing_doc = Document.objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum), Q(checksum=checksum) | Q(archive_checksum=checksum),
) )
if existing_doc.exists(): if existing_doc.exists():
if settings.CONSUMER_DELETE_DUPLICATES: if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path) os.unlink(self.original_path)
self._fail( self._fail(
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS, ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate of" f"Not consuming {self.filename}: It is a duplicate of"
@ -211,7 +211,7 @@ class Consumer(LoggingMixin):
self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
working_file_path = str(self.path) working_file_path = str(self.working_copy)
original_file_path = str(self.original_path) original_file_path = str(self.original_path)
script_env = os.environ.copy() script_env = os.environ.copy()
@ -343,8 +343,8 @@ class Consumer(LoggingMixin):
Return the document object if it was successfully created. Return the document object if it was successfully created.
""" """
self.path = Path(path).resolve() self.original_path = Path(path).resolve()
self.filename = override_filename or self.path.name self.filename = override_filename or self.original_path.name
self.override_title = override_title self.override_title = override_title
self.override_correspondent_id = override_correspondent_id self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id self.override_document_type_id = override_document_type_id
@ -377,17 +377,16 @@ class Consumer(LoggingMixin):
self.log.info(f"Consuming {self.filename}") self.log.info(f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir # For the actual work, copy the file into a tempdir
self.original_path = self.path
tempdir = tempfile.TemporaryDirectory( tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx", prefix="paperless-ngx",
dir=settings.SCRATCH_DIR, dir=settings.SCRATCH_DIR,
) )
self.path = Path(tempdir.name) / Path(self.filename) self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.original_path, self.path) copy_file_with_basic_stats(self.original_path, self.working_copy)
# Determine the parser class. # Determine the parser class.
mime_type = magic.from_file(self.path, mime=True) mime_type = magic.from_file(self.working_copy, mime=True)
self.log.debug(f"Detected mime type: {mime_type}") self.log.debug(f"Detected mime type: {mime_type}")
@ -406,7 +405,7 @@ class Consumer(LoggingMixin):
document_consumption_started.send( document_consumption_started.send(
sender=self.__class__, sender=self.__class__,
filename=self.path, filename=self.working_copy,
logging_group=self.logging_group, logging_group=self.logging_group,
) )
@ -444,7 +443,7 @@ class Consumer(LoggingMixin):
ConsumerStatusShortMessage.PARSING_DOCUMENT, ConsumerStatusShortMessage.PARSING_DOCUMENT,
) )
self.log.debug(f"Parsing {self.filename}...") self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.path, mime_type, self.filename) document_parser.parse(self.working_copy, mime_type, self.filename)
self.log.debug(f"Generating thumbnail for {self.filename}...") self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress( self._send_progress(
@ -454,7 +453,7 @@ class Consumer(LoggingMixin):
ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
) )
thumbnail = document_parser.get_thumbnail( thumbnail = document_parser.get_thumbnail(
self.path, self.working_copy,
mime_type, mime_type,
self.filename, self.filename,
) )
@ -527,7 +526,7 @@ class Consumer(LoggingMixin):
self._write( self._write(
document.storage_type, document.storage_type,
self.original_path, self.working_copy,
document.source_path, document.source_path,
) )
@ -560,9 +559,9 @@ class Consumer(LoggingMixin):
document.save() document.save()
# Delete the file only if it was successfully consumed # Delete the file only if it was successfully consumed
self.log.debug(f"Deleting file {self.path}") self.log.debug(f"Deleting file {self.working_copy}")
os.unlink(self.path)
self.original_path.unlink() self.original_path.unlink()
self.working_copy.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join( shadow_file = os.path.join(
@ -735,7 +734,7 @@ class Consumer(LoggingMixin):
)[:127], )[:127],
content=text, content=text,
mime_type=mime_type, mime_type=mime_type,
checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(), checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(),
created=create_date, created=create_date,
modified=create_date, modified=create_date,
storage_type=storage_type, storage_type=storage_type,

View File

@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin):
self.logging_group = logging_group self.logging_group = logging_group
self.settings = self.get_settings() self.settings = self.get_settings()
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) self.tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
)
self.archive_path = None self.archive_path = None
self.text = None self.text = None

View File

@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase):
@override_settings(PRE_CONSUME_SCRIPT=None) @override_settings(PRE_CONSUME_SCRIPT=None)
def test_no_pre_consume_script(self, m): def test_no_pre_consume_script(self, m):
c = Consumer() c = Consumer()
c.path = "path-to-file" c.working_copy = "path-to-file"
c.run_pre_consume_script() c.run_pre_consume_script()
m.assert_not_called() m.assert_not_called()
@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase):
def test_pre_consume_script_not_found(self, m, m2): def test_pre_consume_script_not_found(self, m, m2):
c = Consumer() c = Consumer()
c.filename = "somefile.pdf" c.filename = "somefile.pdf"
c.path = "path-to-file" c.working_copy = "path-to-file"
self.assertRaises(ConsumerError, c.run_pre_consume_script) self.assertRaises(ConsumerError, c.run_pre_consume_script)
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run")
@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase):
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer() c = Consumer()
c.original_path = "path-to-file" c.original_path = "path-to-file"
c.path = "/tmp/somewhere/path-to-file" c.working_copy = "/tmp/somewhere/path-to-file"
c.task_id = str(uuid.uuid4()) c.task_id = str(uuid.uuid4())
c.run_pre_consume_script() c.run_pre_consume_script()
@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase):
subset = { subset = {
"DOCUMENT_SOURCE_PATH": c.original_path, "DOCUMENT_SOURCE_PATH": c.original_path,
"DOCUMENT_WORKING_PATH": c.path, "DOCUMENT_WORKING_PATH": c.working_copy,
"TASK_ID": c.task_id, "TASK_ID": c.task_id,
} }
self.assertDictEqual(environment, {**environment, **subset}) self.assertDictEqual(environment, {**environment, **subset})
@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase):
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
with self.assertLogs("paperless.consumer", level="INFO") as cm: with self.assertLogs("paperless.consumer", level="INFO") as cm:
c = Consumer() c = Consumer()
c.path = "path-to-file" c.working_copy = "path-to-file"
c.run_pre_consume_script() c.run_pre_consume_script()
self.assertIn( self.assertIn(
@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase):
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer() c = Consumer()
c.path = "path-to-file" c.working_copy = "path-to-file"
self.assertRaises( self.assertRaises(
ConsumerError, ConsumerError,
c.run_pre_consume_script, c.run_pre_consume_script,

View File

@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser):
with Image.open(image) as im: with Image.open(image) as im:
return im.mode in ("RGBA", "LA") return im.mode in ("RGBA", "LA")
def remove_alpha(self, image_path: str): def remove_alpha(self, image_path: str) -> Path:
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
subprocess.run( subprocess.run(
[ [
settings.CONVERT_BINARY, settings.CONVERT_BINARY,
"-alpha", "-alpha",
"off", "off",
image_path, image_path,
image_path, no_alpha_image,
], ],
) )
return no_alpha_image
def get_dpi(self, image) -> Optional[int]: def get_dpi(self, image) -> Optional[int]:
try: try:
@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
f"Removing alpha layer from {input_file} " f"Removing alpha layer from {input_file} "
"for compatibility with img2pdf", "for compatibility with img2pdf",
) )
self.remove_alpha(input_file) # Replace the input file with the non-alpha
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
if dpi: if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}") self.log.debug(f"Detected DPI for image {input_file}: {dpi}")