Merge pull request #953 from paperless-ngx/bugfix-intoify-debounce

Bugfix: Adds configurable inotify debounce time
This commit is contained in:
Trenton Holmes 2022-05-17 09:25:29 -07:00 committed by GitHub
commit 9ae20a6bec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 117 additions and 4 deletions

View File

@ -590,6 +590,28 @@ PAPERLESS_CONSUMER_POLLING=<num>
Defaults to 0, which disables polling and uses filesystem notifications. Defaults to 0, which disables polling and uses filesystem notifications.
PAPERLESS_CONSUMER_POLLING_RETRY_COUNT=<num>
If consumer polling is enabled, sets the number of times paperless will check for a
file to remain unmodified.
Defaults to 5.
PAPERLESS_CONSUMER_POLLING_DELAY=<num>
If consumer polling is enabled, sets the delay in seconds between each of the checks
(see above) that paperless performs while waiting for a file to remain unmodified.
Defaults to 5.
.. _configuration-inotify:
PAPERLESS_CONSUMER_INOTIFY_DELAY=<num>
Sets the time in seconds the consumer will wait for additional events
from inotify before the consumer will consider a file ready and begin consumption.
Certain scanners or network setups may generate multiple events for a single file,
leading to multiple consumers working on the same file. Configure this to
prevent that.
Defaults to 0.5 seconds.
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool> PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
When the consumer detects a duplicate document, it will not touch the When the consumer detects a duplicate document, it will not touch the

View File

@ -235,3 +235,66 @@ You might find messages like these in your log files:
This indicates that paperless failed to read PDF metadata from one of your documents. This happens when you This indicates that paperless failed to read PDF metadata from one of your documents. This happens when you
open the affected documents in paperless for editing. Paperless will continue to work, and will simply not open the affected documents in paperless for editing. Paperless will continue to work, and will simply not
show the invalid metadata. show the invalid metadata.
Consumer fails with a FileNotFoundError
#######################################
You might find messages like these in your log files:
.. code::
[ERROR] [paperless.consumer] Error while consuming document SCN_0001.pdf: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
Traceback (most recent call last):
File "/app/paperless/src/paperless_tesseract/parsers.py", line 261, in parse
ocrmypdf.ocr(**args)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/api.py", line 337, in ocr
return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 385, in run_pipeline
exec_concurrent(context, executor)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 302, in exec_concurrent
pdf = post_process(pdf, context, executor)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 235, in post_process
pdf_out = metadata_fixup(pdf_out, context)
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_pipeline.py", line 798, in metadata_fixup
with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
File "/usr/local/lib/python3.8/dist-packages/pikepdf/_methods.py", line 923, in open
pdf = Pdf._open(
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
This probably indicates paperless tried to consume the same file twice. This can happen for a number of reasons,
depending on how documents are placed into the consume folder. If paperless is using inotify (the default) to
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
try adjusting the :ref:`polling configuration <configuration-polling>`.
Consumer fails waiting for file to remain unmodified.
#####################################################
You might find messages like these in your log files:
.. code::
[ERROR] [paperless.management.consumer] Timeout while waiting on file /usr/src/paperless/src/../consume/SCN_0001.pdf to remain unmodified.
This indicates paperless timed out while waiting for the file to be completely written to the consume folder.
Adjusting :ref:`polling configuration <configuration-polling>` values should resolve the issue.
.. note::
For the file that initially failed to be consumed, the user will need to
manually move it out of the consume folder and back in.
Consumer fails reporting "OS reports file as busy still".
#########################################################
You might find messages like these in your log files:
.. code::
[WARNING] [paperless.management.consumer] Not consuming file /usr/src/paperless/src/../consume/SCN_0001.pdf: OS reports file as busy still
This indicates paperless was unable to open the file, as the OS reported the file as still being in use. To prevent a
crash, paperless did not try to consume the file. If paperless is using inotify (the default) to
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
try adjusting the :ref:`polling configuration <configuration-polling>`.
.. note::
For the file that initially failed to be consumed, the user will need to
manually move it out of the consume folder and back in.

View File

@ -216,7 +216,7 @@ class Command(BaseCommand):
try: try:
inotify_debounce: Final[float] = 0.5 inotify_debounce: Final[float] = settings.CONSUMER_INOTIFY_DELAY
notified_files = {} notified_files = {}
while not self.stop_flag: while not self.stop_flag:
@ -234,10 +234,23 @@ class Command(BaseCommand):
for filepath in notified_files: for filepath in notified_files:
# Time of the last inotify event for this file # Time of the last inotify event for this file
last_event_time = notified_files[filepath] last_event_time = notified_files[filepath]
if (monotonic() - last_event_time) > inotify_debounce:
# Current time - last time over the configured timeout
waited_long_enough = (
monotonic() - last_event_time
) > inotify_debounce
# Also make sure the file exists still, some scanners might write a
# temporary file first
file_still_exists = os.path.exists(filepath) and os.path.isfile(
filepath,
)
if waited_long_enough and file_still_exists:
_consume(filepath) _consume(filepath)
else: elif file_still_exists:
still_waiting[filepath] = last_event_time still_waiting[filepath] = last_event_time
# These files are still waiting to hit the timeout # These files are still waiting to hit the timeout
notified_files = still_waiting notified_files = still_waiting

View File

@ -98,6 +98,9 @@ class ConsumerMixin:
print("file completed.") print("file completed.")
@override_settings(
CONSUMER_INOTIFY_DELAY=0.01,
)
class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase): class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
def test_consume_file(self): def test_consume_file(self):
self.t_start() self.t_start()
@ -286,7 +289,7 @@ class TestConsumerPolling(TestConsumer):
pass pass
@override_settings(CONSUMER_RECURSIVE=True) @override_settings(CONSUMER_INOTIFY_DELAY=0.01, CONSUMER_RECURSIVE=True)
class TestConsumerRecursive(TestConsumer): class TestConsumerRecursive(TestConsumer):
# just do all the tests with recursive # just do all the tests with recursive
pass pass

View File

@ -48,6 +48,13 @@ def __get_int(key: str, default: int) -> int:
return int(os.getenv(key, default)) return int(os.getenv(key, default))
def __get_float(key: str, default: float) -> float:
"""
Return a float value based on the environment variable or a default

The value of the environment variable named by key (or default, when the
variable is not set) is passed through float(), so text that float()
cannot parse raises ValueError.
"""
return float(os.getenv(key, default))
# NEVER RUN WITH DEBUG IN PRODUCTION. # NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
@ -485,6 +492,11 @@ CONSUMER_POLLING_RETRY_COUNT = int(
os.getenv("PAPERLESS_CONSUMER_POLLING_RETRY_COUNT", 5), os.getenv("PAPERLESS_CONSUMER_POLLING_RETRY_COUNT", 5),
) )
CONSUMER_INOTIFY_DELAY: Final[float] = __get_float(
"PAPERLESS_CONSUMER_INOTIFY_DELAY",
0.5,
)
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE") CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")