mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fix: Allows pre-consume scripts to modify the working path again (#5260)
* Allows pre-consume scripts to modify the working path again and generally cleans up some confusion about working copy vs original
This commit is contained in:
parent
3115106dc1
commit
a82e3771ae
@ -136,6 +136,11 @@ script can access the following relevant environment variables set:
|
||||
be triggered, leading to failures as two tasks work on the
|
||||
same document path
|
||||
|
||||
!!! warning
|
||||
|
||||
If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic
|
||||
way, this may allow duplicate documents to be stored
|
||||
|
||||
A simple but common example for this would be creating a simple script
|
||||
like this:
|
||||
|
||||
|
@ -135,24 +135,24 @@ class Consumer(LoggingMixin):
|
||||
"""
|
||||
Confirm the input file still exists where it should
|
||||
"""
|
||||
if not os.path.isfile(self.path):
|
||||
if not os.path.isfile(self.original_path):
|
||||
self._fail(
|
||||
ConsumerStatusShortMessage.FILE_NOT_FOUND,
|
||||
f"Cannot consume {self.path}: File not found.",
|
||||
f"Cannot consume {self.original_path}: File not found.",
|
||||
)
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with open(self.path, "rb") as f:
|
||||
with open(self.original_path, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
if existing_doc.exists():
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
os.unlink(self.path)
|
||||
os.unlink(self.original_path)
|
||||
self._fail(
|
||||
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
|
||||
f"Not consuming {self.filename}: It is a duplicate of"
|
||||
@ -211,7 +211,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
|
||||
|
||||
working_file_path = str(self.path)
|
||||
working_file_path = str(self.working_copy)
|
||||
original_file_path = str(self.original_path)
|
||||
|
||||
script_env = os.environ.copy()
|
||||
@ -343,8 +343,8 @@ class Consumer(LoggingMixin):
|
||||
Return the document object if it was successfully created.
|
||||
"""
|
||||
|
||||
self.path = Path(path).resolve()
|
||||
self.filename = override_filename or self.path.name
|
||||
self.original_path = Path(path).resolve()
|
||||
self.filename = override_filename or self.original_path.name
|
||||
self.override_title = override_title
|
||||
self.override_correspondent_id = override_correspondent_id
|
||||
self.override_document_type_id = override_document_type_id
|
||||
@ -377,17 +377,16 @@ class Consumer(LoggingMixin):
|
||||
self.log.info(f"Consuming {self.filename}")
|
||||
|
||||
# For the actual work, copy the file into a tempdir
|
||||
self.original_path = self.path
|
||||
tempdir = tempfile.TemporaryDirectory(
|
||||
prefix="paperless-ngx",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
)
|
||||
self.path = Path(tempdir.name) / Path(self.filename)
|
||||
copy_file_with_basic_stats(self.original_path, self.path)
|
||||
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
||||
copy_file_with_basic_stats(self.original_path, self.working_copy)
|
||||
|
||||
# Determine the parser class.
|
||||
|
||||
mime_type = magic.from_file(self.path, mime=True)
|
||||
mime_type = magic.from_file(self.working_copy, mime=True)
|
||||
|
||||
self.log.debug(f"Detected mime type: {mime_type}")
|
||||
|
||||
@ -406,7 +405,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
document_consumption_started.send(
|
||||
sender=self.__class__,
|
||||
filename=self.path,
|
||||
filename=self.working_copy,
|
||||
logging_group=self.logging_group,
|
||||
)
|
||||
|
||||
@ -444,7 +443,7 @@ class Consumer(LoggingMixin):
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
document_parser.parse(self.path, mime_type, self.filename)
|
||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
@ -454,7 +453,7 @@ class Consumer(LoggingMixin):
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.path,
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
)
|
||||
@ -527,7 +526,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
self._write(
|
||||
document.storage_type,
|
||||
self.original_path,
|
||||
self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
@ -560,9 +559,9 @@ class Consumer(LoggingMixin):
|
||||
document.save()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(f"Deleting file {self.path}")
|
||||
os.unlink(self.path)
|
||||
self.log.debug(f"Deleting file {self.working_copy}")
|
||||
self.original_path.unlink()
|
||||
self.working_copy.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = os.path.join(
|
||||
@ -735,7 +734,7 @@ class Consumer(LoggingMixin):
|
||||
)[:127],
|
||||
content=text,
|
||||
mime_type=mime_type,
|
||||
checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(),
|
||||
checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
storage_type=storage_type,
|
||||
|
@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin):
|
||||
self.logging_group = logging_group
|
||||
self.settings = self.get_settings()
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
|
||||
self.archive_path = None
|
||||
self.text = None
|
||||
|
@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase):
|
||||
@override_settings(PRE_CONSUME_SCRIPT=None)
|
||||
def test_no_pre_consume_script(self, m):
|
||||
c = Consumer()
|
||||
c.path = "path-to-file"
|
||||
c.working_copy = "path-to-file"
|
||||
c.run_pre_consume_script()
|
||||
m.assert_not_called()
|
||||
|
||||
@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase):
|
||||
def test_pre_consume_script_not_found(self, m, m2):
|
||||
c = Consumer()
|
||||
c.filename = "somefile.pdf"
|
||||
c.path = "path-to-file"
|
||||
c.working_copy = "path-to-file"
|
||||
self.assertRaises(ConsumerError, c.run_pre_consume_script)
|
||||
|
||||
@mock.patch("documents.consumer.run")
|
||||
@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase):
|
||||
with override_settings(PRE_CONSUME_SCRIPT=script.name):
|
||||
c = Consumer()
|
||||
c.original_path = "path-to-file"
|
||||
c.path = "/tmp/somewhere/path-to-file"
|
||||
c.working_copy = "/tmp/somewhere/path-to-file"
|
||||
c.task_id = str(uuid.uuid4())
|
||||
c.run_pre_consume_script()
|
||||
|
||||
@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase):
|
||||
|
||||
subset = {
|
||||
"DOCUMENT_SOURCE_PATH": c.original_path,
|
||||
"DOCUMENT_WORKING_PATH": c.path,
|
||||
"DOCUMENT_WORKING_PATH": c.working_copy,
|
||||
"TASK_ID": c.task_id,
|
||||
}
|
||||
self.assertDictEqual(environment, {**environment, **subset})
|
||||
@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase):
|
||||
with override_settings(PRE_CONSUME_SCRIPT=script.name):
|
||||
with self.assertLogs("paperless.consumer", level="INFO") as cm:
|
||||
c = Consumer()
|
||||
c.path = "path-to-file"
|
||||
c.working_copy = "path-to-file"
|
||||
|
||||
c.run_pre_consume_script()
|
||||
self.assertIn(
|
||||
@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase):
|
||||
|
||||
with override_settings(PRE_CONSUME_SCRIPT=script.name):
|
||||
c = Consumer()
|
||||
c.path = "path-to-file"
|
||||
c.working_copy = "path-to-file"
|
||||
self.assertRaises(
|
||||
ConsumerError,
|
||||
c.run_pre_consume_script,
|
||||
|
@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
def remove_alpha(self, image_path: str):
|
||||
def remove_alpha(self, image_path: str) -> Path:
|
||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||
subprocess.run(
|
||||
[
|
||||
settings.CONVERT_BINARY,
|
||||
"-alpha",
|
||||
"off",
|
||||
image_path,
|
||||
image_path,
|
||||
no_alpha_image,
|
||||
],
|
||||
)
|
||||
return no_alpha_image
|
||||
|
||||
def get_dpi(self, image) -> Optional[int]:
|
||||
try:
|
||||
@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Removing alpha layer from {input_file} "
|
||||
"for compatibility with img2pdf",
|
||||
)
|
||||
self.remove_alpha(input_file)
|
||||
# Replace the input file with the non-alpha
|
||||
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
|
||||
|
||||
if dpi:
|
||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||
|
Loading…
x
Reference in New Issue
Block a user