mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #953 from paperless-ngx/bugfix-intoify-debounce
Bugfix: Adds configurable intoify debounce time
This commit is contained in:
commit
9ae20a6bec
@ -590,6 +590,28 @@ PAPERLESS_CONSUMER_POLLING=<num>
|
||||
|
||||
Defaults to 0, which disables polling and uses filesystem notifications.
|
||||
|
||||
PAPERLESS_CONSUMER_POLLING_RETRY_COUNT=<num>
|
||||
If consumer polling is enabled, sets the number of times paperless will check for a
|
||||
file to remain unmodified.
|
||||
|
||||
Defaults to 5.
|
||||
|
||||
PAPERLESS_CONSUMER_POLLING_DELAY=<num>
|
||||
If consumer polling is enabled, sets the delay in seconds between each check (above) paperless
|
||||
will do while waiting for a file to remain unmodified.
|
||||
|
||||
Defaults to 5.
|
||||
|
||||
.. _configuration-inotify:
|
||||
|
||||
PAPERLESS_CONSUMER_INOTIFY_DELAY=<num>
|
||||
Sets the time in seconds the consumer will wait for additional events
|
||||
from inotify before the consumer will consider a file ready and begin consumption.
|
||||
Certain scanners or network setups may generate multiple events for a single file,
|
||||
leading to multiple consumers working on the same file. Configure this to
|
||||
prevent that.
|
||||
|
||||
Defaults to 0.5 seconds.
|
||||
|
||||
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
|
||||
When the consumer detects a duplicate document, it will not touch the
|
||||
|
@ -235,3 +235,66 @@ You might find messages like these in your log files:
|
||||
This indicates that paperless failed to read PDF metadata from one of your documents. This happens when you
|
||||
open the affected documents in paperless for editing. Paperless will continue to work, and will simply not
|
||||
show the invalid metadata.
|
||||
|
||||
Consumer fails with a FileNotFoundError
|
||||
############################
|
||||
|
||||
You might find messages like these in your log files:
|
||||
|
||||
.. code::
|
||||
[ERROR] [paperless.consumer] Error while consuming document SCN_0001.pdf: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
|
||||
Traceback (most recent call last):
|
||||
File "/app/paperless/src/paperless_tesseract/parsers.py", line 261, in parse
|
||||
ocrmypdf.ocr(**args)
|
||||
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/api.py", line 337, in ocr
|
||||
return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
|
||||
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 385, in run_pipeline
|
||||
exec_concurrent(context, executor)
|
||||
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 302, in exec_concurrent
|
||||
pdf = post_process(pdf, context, executor)
|
||||
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 235, in post_process
|
||||
pdf_out = metadata_fixup(pdf_out, context)
|
||||
File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_pipeline.py", line 798, in metadata_fixup
|
||||
with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
|
||||
File "/usr/local/lib/python3.8/dist-packages/pikepdf/_methods.py", line 923, in open
|
||||
pdf = Pdf._open(
|
||||
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf'
|
||||
|
||||
This probably indicates paperless tried to consume the same file twice. This can happen for a number of reasons,
|
||||
depending on how documents are placed into the consume folder. If paperless is using inotify (the default) to
|
||||
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
|
||||
try adjusting the :ref:`polling configuration <configuration-polling>`.
|
||||
|
||||
Consumer fails waiting for file to remain unmodified.
|
||||
############################
|
||||
|
||||
You might find messages like these in your log files:
|
||||
|
||||
.. code::
|
||||
[ERROR] [paperless.management.consumer] Timeout while waiting on file /usr/src/paperless/src/../consume/SCN_0001.pdf to remain unmodified.
|
||||
|
||||
This indicates paperless timed out while waiting for the file to be completely written to the consume folder.
|
||||
Adjusting :ref:`polling configuration <configuration-polling>` values should resolve the issue.
|
||||
|
||||
.. note::
|
||||
|
||||
The user will need to manually move the file out of the consume folder and
|
||||
back in, for the initial failing file to be consumed.
|
||||
|
||||
Consumer fails reporting "OS reports file as busy still".
|
||||
############################
|
||||
|
||||
You might find messages like these in your log files:
|
||||
|
||||
.. code::
|
||||
[WARNING] [paperless.management.consumer] Not consuming file /usr/src/paperless/src/../consume/SCN_0001.pdf: OS reports file as busy still
|
||||
|
||||
This indicates paperless was unable to open the file, as the OS reported the file as still being in use. To prevent a
|
||||
crash, paperless did not try to consume the file. If paperless is using inotify (the default) to
|
||||
check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled,
|
||||
try adjusting the :ref:`polling configuration <configuration-polling>`.
|
||||
|
||||
.. note::
|
||||
|
||||
The user will need to manually move the file out of the consume folder and
|
||||
back in, for the initial failing file to be consumed.
|
||||
|
@ -216,7 +216,7 @@ class Command(BaseCommand):
|
||||
|
||||
try:
|
||||
|
||||
inotify_debounce: Final[float] = 0.5
|
||||
inotify_debounce: Final[float] = settings.CONSUMER_INOTIFY_DELAY
|
||||
notified_files = {}
|
||||
|
||||
while not self.stop_flag:
|
||||
@ -234,10 +234,23 @@ class Command(BaseCommand):
|
||||
for filepath in notified_files:
|
||||
# Time of the last inotify event for this file
|
||||
last_event_time = notified_files[filepath]
|
||||
if (monotonic() - last_event_time) > inotify_debounce:
|
||||
|
||||
# Current time - last time over the configured timeout
|
||||
waited_long_enough = (
|
||||
monotonic() - last_event_time
|
||||
) > inotify_debounce
|
||||
|
||||
# Also make sure the file exists still, some scanners might write a
|
||||
# temporary file first
|
||||
file_still_exists = os.path.exists(filepath) and os.path.isfile(
|
||||
filepath,
|
||||
)
|
||||
|
||||
if waited_long_enough and file_still_exists:
|
||||
_consume(filepath)
|
||||
else:
|
||||
elif file_still_exists:
|
||||
still_waiting[filepath] = last_event_time
|
||||
|
||||
# These files are still waiting to hit the timeout
|
||||
notified_files = still_waiting
|
||||
|
||||
|
@ -98,6 +98,9 @@ class ConsumerMixin:
|
||||
print("file completed.")
|
||||
|
||||
|
||||
@override_settings(
|
||||
CONSUMER_INOTIFY_DELAY=0.01,
|
||||
)
|
||||
class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
||||
def test_consume_file(self):
|
||||
self.t_start()
|
||||
@ -286,7 +289,7 @@ class TestConsumerPolling(TestConsumer):
|
||||
pass
|
||||
|
||||
|
||||
@override_settings(CONSUMER_RECURSIVE=True)
|
||||
@override_settings(CONSUMER_INOTIFY_DELAY=0.01, CONSUMER_RECURSIVE=True)
|
||||
class TestConsumerRecursive(TestConsumer):
|
||||
# just do all the tests with recursive
|
||||
pass
|
||||
|
@ -48,6 +48,13 @@ def __get_int(key: str, default: int) -> int:
|
||||
return int(os.getenv(key, default))
|
||||
|
||||
|
||||
def __get_float(key: str, default: float) -> float:
|
||||
"""
|
||||
Return an integer value based on the environment variable or a default
|
||||
"""
|
||||
return float(os.getenv(key, default))
|
||||
|
||||
|
||||
# NEVER RUN WITH DEBUG IN PRODUCTION.
|
||||
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
|
||||
|
||||
@ -485,6 +492,11 @@ CONSUMER_POLLING_RETRY_COUNT = int(
|
||||
os.getenv("PAPERLESS_CONSUMER_POLLING_RETRY_COUNT", 5),
|
||||
)
|
||||
|
||||
CONSUMER_INOTIFY_DELAY: Final[float] = __get_float(
|
||||
"PAPERLESS_CONSUMER_INOTIFY_DELAY",
|
||||
0.5,
|
||||
)
|
||||
|
||||
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
||||
|
||||
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
|
||||
|
Loading…
x
Reference in New Issue
Block a user