paperless-ngx/src/documents/tests/test_management_consumer.py

421 lines
13 KiB
Python

import filecmp
import os
import shutil
from pathlib import Path
from threading import Thread
from time import sleep
from unittest import mock
from django.conf import settings
from django.core.management import CommandError
from django.core.management import call_command
from django.test import TransactionTestCase
from django.test import override_settings
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.management.commands import document_consumer
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DocumentConsumeDelayMixin
class ConsumerThread(Thread):
def __init__(self):
super().__init__()
self.cmd = document_consumer.Command()
self.cmd.stop_flag.clear()
def run(self) -> None:
self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False, testing=True)
def stop(self):
# Consumer checks this every second.
self.cmd.stop_flag.set()
def chunked(size, source):
for i in range(0, len(source), size):
yield source[i : i + size]
class ConsumerThreadMixin(DocumentConsumeDelayMixin):
"""
Provides a thread which runs the consumer management command at setUp
and stops it at tearDown
"""
sample_file: Path = (
Path(__file__).parent / Path("samples") / Path("simple.pdf")
).resolve()
def setUp(self) -> None:
super().setUp()
self.t = None
def t_start(self):
self.t = ConsumerThread()
self.t.start()
# give the consumer some time to do initial work
sleep(1)
def tearDown(self) -> None:
if self.t:
# set the stop flag
self.t.stop()
# wait for the consumer to exit.
self.t.join()
self.t = None
super().tearDown()
def wait_for_task_mock_call(self, expected_call_count=1):
n = 0
while n < 50:
if self.consume_file_mock.call_count >= expected_call_count:
# give task_mock some time to finish and raise errors
sleep(1)
return
n += 1
sleep(0.1)
# A bogus async_task that will simply check the file for
# completeness and raise an exception otherwise.
def bogus_task(
self,
input_doc: ConsumableDocument,
overrides=None,
):
eq = filecmp.cmp(input_doc.original_file, self.sample_file, shallow=False)
if not eq:
print("Consumed an INVALID file.")
raise ConsumerError("Incomplete File READ FAILED")
else:
print("Consumed a perfectly valid file.")
def slow_write_file(self, target, incomplete=False):
with open(self.sample_file, "rb") as f:
pdf_bytes = f.read()
if incomplete:
pdf_bytes = pdf_bytes[: len(pdf_bytes) - 100]
with open(target, "wb") as f:
# this will take 2 seconds, since the file is about 20k.
print("Start writing file.")
for b in chunked(1000, pdf_bytes):
f.write(b)
sleep(0.1)
print("file completed.")
@override_settings(
CONSUMER_INOTIFY_DELAY=0.01,
)
class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
def test_consume_file(self):
self.t_start()
f = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_called_once()
input_doc, _ = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, f)
def test_consume_file_invalid_ext(self):
self.t_start()
f = os.path.join(self.dirs.consumption_dir, "my_file.wow")
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_not_called()
def test_consume_existing_file(self):
f = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
shutil.copy(self.sample_file, f)
self.t_start()
self.consume_file_mock.assert_called_once()
input_doc, _ = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, f)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_pdf(self, error_logger):
self.consume_file_mock.side_effect = self.bogus_task
self.t_start()
fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
self.slow_write_file(fname)
self.wait_for_task_mock_call()
error_logger.assert_not_called()
self.consume_file_mock.assert_called_once()
input_doc, _ = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, fname)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_and_move(self, error_logger):
self.consume_file_mock.side_effect = self.bogus_task
self.t_start()
fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.~df"))
fname2 = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
self.slow_write_file(fname)
shutil.move(fname, fname2)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_called_once()
input_doc, _ = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, fname2)
error_logger.assert_not_called()
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_incomplete(self, error_logger):
self.consume_file_mock.side_effect = self.bogus_task
self.t_start()
fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
self.slow_write_file(fname, incomplete=True)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_called_once()
input_doc, _ = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, fname)
# assert that we have an error logged with this invalid file.
error_logger.assert_called_once()
@override_settings(CONSUMPTION_DIR="does_not_exist")
def test_consumption_directory_invalid(self):
self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")
@override_settings(CONSUMPTION_DIR="")
def test_consumption_directory_unset(self):
self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")
def test_mac_write(self):
self.consume_file_mock.side_effect = self.bogus_task
self.t_start()
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, ".DS_STORE"),
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "my_file.pdf"),
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "._my_file.pdf"),
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "my_second_file.pdf"),
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "._my_second_file.pdf"),
)
sleep(5)
self.wait_for_task_mock_call(expected_call_count=2)
self.assertEqual(2, self.consume_file_mock.call_count)
consumed_files = []
for input_doc, _ in self.get_all_consume_delay_call_args():
consumed_files.append(input_doc.original_file.name)
self.assertCountEqual(consumed_files, ["my_file.pdf", "my_second_file.pdf"])
def test_is_ignored(self):
test_paths = [
{
"path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".DS_STORE"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".DS_Store"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
"ignore": False,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
".stversions",
"foo.pdf",
),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
"ignore": True,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
"@eaDir",
"SYNO@.fileindexdb",
"_1jk.fnm",
),
"ignore": True,
},
]
for test_setup in test_paths:
filepath = test_setup["path"]
expected_ignored_result = test_setup["ignore"]
self.assertEqual(
expected_ignored_result,
document_consumer._is_ignored(filepath),
f'_is_ignored("{filepath}") != {expected_ignored_result}',
)
@mock.patch("documents.management.commands.document_consumer.open")
def test_consume_file_busy(self, open_mock):
# Calling this mock always raises this
open_mock.side_effect = OSError
self.t_start()
f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_not_called()
@override_settings(
CONSUMER_POLLING=1,
# please leave the delay here and down below
# see https://github.com/paperless-ngx/paperless-ngx/pull/66
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
class TestConsumerPolling(TestConsumer):
# just do all the tests with polling
pass
@override_settings(CONSUMER_INOTIFY_DELAY=0.01, CONSUMER_RECURSIVE=True)
class TestConsumerRecursive(TestConsumer):
# just do all the tests with recursive
pass
@override_settings(
CONSUMER_RECURSIVE=True,
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
class TestConsumerRecursivePolling(TestConsumer):
# just do all the tests with polling and recursive
pass
class TestConsumerTags(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_SUBDIRS_AS_TAGS=True)
def test_consume_file_with_path_tags(self):
tag_names = ("existingTag", "Space Tag")
# Create a Tag prior to consuming a file using it in path
tag_ids = [
Tag.objects.create(name="existingtag").pk,
]
self.t_start()
path = os.path.join(self.dirs.consumption_dir, *tag_names)
os.makedirs(path, exist_ok=True)
f = Path(os.path.join(path, "my_file.pdf"))
# Wait at least inotify read_delay for recursive watchers
# to be created for the new directories
sleep(1)
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.consume_file_mock.assert_called_once()
# Add the pk of the Tag created by _consume()
tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)
input_doc, overrides = self.get_last_consume_delay_call_args()
self.assertEqual(input_doc.original_file, f)
# assertCountEqual has a bad name, but test that the first
# sequence contains the same elements as second, regardless of
# their order.
self.assertCountEqual(overrides.tag_ids, tag_ids)
@override_settings(
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
def test_consume_file_with_path_tags_polling(self):
self.test_consume_file_with_path_tags()