diff --git a/docs/configuration.rst b/docs/configuration.rst index 3a4960f82..3d57236e1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -590,6 +590,28 @@ PAPERLESS_CONSUMER_POLLING= Defaults to 0, which disables polling and uses filesystem notifications. +PAPERLESS_CONSUMER_POLLING_RETRY_COUNT= + If consumer polling is enabled, sets the number of times paperless will check whether a + file has remained unmodified. + + Defaults to 5. + +PAPERLESS_CONSUMER_POLLING_DELAY= + If consumer polling is enabled, sets the delay in seconds between each of the checks (above) that + paperless performs while waiting for a file to remain unmodified. + + Defaults to 5. + +.. _configuration-inotify: + +PAPERLESS_CONSUMER_INOTIFY_DELAY= + Sets the time in seconds the consumer will wait for additional events + from inotify before the consumer will consider a file ready and begin consumption. + Certain scanners or network setups may generate multiple events for a single file, + leading to multiple consumers working on the same file. Configure this to + prevent that. + + Defaults to 0.5 seconds. PAPERLESS_CONSUMER_DELETE_DUPLICATES= When the consumer detects a duplicate document, it will not touch the diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst index b1f41f096..08cc916b0 100644 --- a/docs/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -235,3 +235,66 @@ You might find messages like these in your log files: This indicates that paperless failed to read PDF metadata from one of your documents. This happens when you open the affected documents in paperless for editing. Paperless will continue to work, and will simply not show the invalid metadata. + +Consumer fails with a FileNotFoundError +####################################### + +You might find messages like these in your log files: + +.. 
code:: + [ERROR] [paperless.consumer] Error while consuming document SCN_0001.pdf: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf' + Traceback (most recent call last): + File "/app/paperless/src/paperless_tesseract/parsers.py", line 261, in parse + ocrmypdf.ocr(**args) + File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/api.py", line 337, in ocr + return run_pipeline(options=options, plugin_manager=plugin_manager, api=True) + File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 385, in run_pipeline + exec_concurrent(context, executor) + File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 302, in exec_concurrent + pdf = post_process(pdf, context, executor) + File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_sync.py", line 235, in post_process + pdf_out = metadata_fixup(pdf_out, context) + File "/usr/local/lib/python3.8/dist-packages/ocrmypdf/_pipeline.py", line 798, in metadata_fixup + with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf: + File "/usr/local/lib/python3.8/dist-packages/pikepdf/_methods.py", line 923, in open + pdf = Pdf._open( + FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ocrmypdf.io.yhk3zbv0/origin.pdf' + +This probably indicates paperless tried to consume the same file twice. This can happen for a number of reasons, +depending on how documents are placed into the consume folder. If paperless is using inotify (the default) to +check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled, +try adjusting the :ref:`polling configuration <configuration-polling>`. + +Consumer fails waiting for file to remain unmodified. +##################################################### + +You might find messages like these in your log files: + +.. code:: + [ERROR] [paperless.management.consumer] Timeout while waiting on file /usr/src/paperless/src/../consume/SCN_0001.pdf to remain unmodified. 
+ +This indicates paperless timed out while waiting for the file to be completely written to the consume folder. +Adjusting :ref:`polling configuration <configuration-polling>` values should resolve the issue. + +.. note:: + + The user will need to manually move the file out of the consume folder and + back in, for the initial failing file to be consumed. + +Consumer fails reporting "OS reports file as busy still". +######################################################### + +You might find messages like these in your log files: + +.. code:: + [WARNING] [paperless.management.consumer] Not consuming file /usr/src/paperless/src/../consume/SCN_0001.pdf: OS reports file as busy still + +This indicates paperless was unable to open the file, as the OS reported the file as still being in use. To prevent a +crash, paperless did not try to consume the file. If paperless is using inotify (the default) to +check for documents, try adjusting the :ref:`inotify configuration <configuration-inotify>`. If polling is enabled, +try adjusting the :ref:`polling configuration <configuration-polling>`. + +.. note:: + + The user will need to manually move the file out of the consume folder and + back in, for the initial failing file to be consumed. 
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 87e3af2c3..bddb566aa 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -216,7 +216,7 @@ class Command(BaseCommand): try: - inotify_debounce: Final[float] = 0.5 + inotify_debounce: Final[float] = settings.CONSUMER_INOTIFY_DELAY notified_files = {} while not self.stop_flag: @@ -234,10 +234,23 @@ class Command(BaseCommand): for filepath in notified_files: # Time of the last inotify event for this file last_event_time = notified_files[filepath] - if (monotonic() - last_event_time) > inotify_debounce: + + # Has the debounce window passed since the last inotify event? + waited_long_enough = ( + monotonic() - last_event_time + ) > inotify_debounce + + # Also make sure the file still exists; some scanners might write a + # temporary file first. Files that vanished are simply dropped + file_still_exists = os.path.exists(filepath) and os.path.isfile( + filepath, + ) + + if waited_long_enough and file_still_exists: _consume(filepath) - else: + elif file_still_exists: still_waiting[filepath] = last_event_time + # These files are still waiting to hit the timeout notified_files = still_waiting diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index e8463ab64..32b04282b 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -98,6 +98,9 @@ class ConsumerMixin: print("file completed.") +@override_settings( + CONSUMER_INOTIFY_DELAY=0.01, +) class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase): def test_consume_file(self): self.t_start() @@ -286,7 +289,7 @@ class TestConsumerPolling(TestConsumer): pass -@override_settings(CONSUMER_RECURSIVE=True) +@override_settings(CONSUMER_INOTIFY_DELAY=0.01, CONSUMER_RECURSIVE=True) class TestConsumerRecursive(TestConsumer): # just do all the 
tests with recursive pass diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b5be6c420..9b779d575 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -48,6 +48,13 @@ def __get_int(key: str, default: int) -> int: return int(os.getenv(key, default)) +def __get_float(key: str, default: float) -> float: + """ + Return a float value based on the environment variable or a default + """ + return float(os.getenv(key, default)) + + # NEVER RUN WITH DEBUG IN PRODUCTION. DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") @@ -485,6 +492,11 @@ CONSUMER_POLLING_RETRY_COUNT = int( os.getenv("PAPERLESS_CONSUMER_POLLING_RETRY_COUNT", 5), ) +CONSUMER_INOTIFY_DELAY: Final[float] = __get_float( + "PAPERLESS_CONSUMER_INOTIFY_DELAY", + 0.5, +) + CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")