diff --git a/docs/configuration.rst b/docs/configuration.rst index 3a4960f82..675a929bf 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -590,6 +590,14 @@ PAPERLESS_CONSUMER_POLLING= Defaults to 0, which disables polling and uses filesystem notifications. +PAPERLESS_CONSUMER_INOTIFY_DELAY= + Sets the time in seconds the consumer will wait for additional events + from inotify before the consumer will consider a file ready and begin consumption. + Certain scanners or network setups may generate multiple events for a single file, + leading to multiple consumers working on the same file. Configure this to + prevent that. + + Defaults to 0.5 seconds. PAPERLESS_CONSUMER_DELETE_DUPLICATES= When the consumer detects a duplicate document, it will not touch the diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 87e3af2c3..bddb566aa 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -216,7 +216,7 @@ class Command(BaseCommand): try: - inotify_debounce: Final[float] = 0.5 + inotify_debounce: Final[float] = settings.CONSUMER_INOTIFY_DELAY notified_files = {} while not self.stop_flag: @@ -234,10 +234,23 @@ class Command(BaseCommand): for filepath in notified_files: # Time of the last inotify event for this file last_event_time = notified_files[filepath] - if (monotonic() - last_event_time) > inotify_debounce: + + # Current time - last time over the configured timeout + waited_long_enough = ( + monotonic() - last_event_time + ) > inotify_debounce + + # Also make sure the file exists still, some scanners might write a + # temporary file first + file_still_exists = os.path.exists(filepath) and os.path.isfile( + filepath, + ) + + if waited_long_enough and file_still_exists: _consume(filepath) - else: + elif file_still_exists: still_waiting[filepath] = last_event_time + # These files are still waiting to hit 
the timeout notified_files = still_waiting diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index e8463ab64..32b04282b 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -98,6 +98,9 @@ class ConsumerMixin: print("file completed.") +@override_settings( + CONSUMER_INOTIFY_DELAY=0.01, +) class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase): def test_consume_file(self): self.t_start() @@ -286,7 +289,7 @@ class TestConsumerPolling(TestConsumer): pass -@override_settings(CONSUMER_RECURSIVE=True) +@override_settings(CONSUMER_INOTIFY_DELAY=0.01, CONSUMER_RECURSIVE=True) class TestConsumerRecursive(TestConsumer): # just do all the tests with recursive pass diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b5be6c420..9b779d575 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -48,6 +48,13 @@ def __get_int(key: str, default: int) -> int: return int(os.getenv(key, default)) +def __get_float(key: str, default: float) -> float: + """ + Return a float value based on the environment variable or a default + """ + return float(os.getenv(key, default)) + + # NEVER RUN WITH DEBUG IN PRODUCTION. DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") @@ -485,6 +492,11 @@ CONSUMER_POLLING_RETRY_COUNT = int( os.getenv("PAPERLESS_CONSUMER_POLLING_RETRY_COUNT", 5), ) +CONSUMER_INOTIFY_DELAY: Final[float] = __get_float( + "PAPERLESS_CONSUMER_INOTIFY_DELAY", + 0.5, +) + CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")