mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	updated consumer: now using watchdog
This commit is contained in:
		| @@ -4,10 +4,8 @@ import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import time | ||||
| import uuid | ||||
|  | ||||
| from operator import itemgetter | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| @@ -36,17 +34,12 @@ class Consumer: | ||||
|       5. Delete the document and image(s) | ||||
|     """ | ||||
|  | ||||
|     # Files are considered ready for consumption if they have been unmodified | ||||
|     # for this duration | ||||
|     FILES_MIN_UNMODIFIED_DURATION = 0.5 | ||||
|  | ||||
|     def __init__(self, consume=settings.CONSUMPTION_DIR, | ||||
|                  scratch=settings.SCRATCH_DIR): | ||||
|  | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|         self.logging_group = None | ||||
|  | ||||
|         self._ignore = [] | ||||
|         self.consume = consume | ||||
|         self.scratch = scratch | ||||
|  | ||||
| @@ -83,43 +76,6 @@ class Consumer: | ||||
|             "group": self.logging_group | ||||
|         }) | ||||
|  | ||||
|     def consume_new_files(self): | ||||
|         """ | ||||
|         Find non-ignored files in consumption dir and consume them if they have | ||||
|         been unmodified for FILES_MIN_UNMODIFIED_DURATION. | ||||
|         """ | ||||
|         ignored_files = [] | ||||
|         files = [] | ||||
|         for entry in os.scandir(self.consume): | ||||
|             if entry.is_file(): | ||||
|                 file = (entry.path, entry.stat().st_mtime) | ||||
|                 if file in self._ignore: | ||||
|                     ignored_files.append(file) | ||||
|                 else: | ||||
|                     files.append(file) | ||||
|             else: | ||||
|                 self.logger.warning( | ||||
|                     "Skipping %s as it is not a file", | ||||
|                     entry.path | ||||
|                 ) | ||||
|  | ||||
|         if not files: | ||||
|             return | ||||
|  | ||||
|         # Set _ignore to only include files that still exist. | ||||
|         # This keeps it from growing indefinitely. | ||||
|         self._ignore[:] = ignored_files | ||||
|  | ||||
|         files_old_to_new = sorted(files, key=itemgetter(1)) | ||||
|  | ||||
|         time.sleep(self.FILES_MIN_UNMODIFIED_DURATION) | ||||
|  | ||||
|         for file, mtime in files_old_to_new: | ||||
|             if mtime == os.path.getmtime(file): | ||||
|                 # File has not been modified and can be consumed | ||||
|                 if not self.try_consume_file(file): | ||||
|                     self._ignore.append((file, mtime)) | ||||
|  | ||||
|     @transaction.atomic | ||||
|     def try_consume_file(self, file): | ||||
|         """ | ||||
|   | ||||
| @@ -1,12 +1,13 @@ | ||||
| import logging | ||||
| import os | ||||
| import time | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| from django.core.management.base import BaseCommand | ||||
|  | ||||
| from ...consumer import Consumer, ConsumerError | ||||
| from ...mail import MailFetcher, MailFetcherError | ||||
| from watchdog.observers import Observer | ||||
| from watchdog.events import FileSystemEventHandler | ||||
|  | ||||
| from documents.consumer import Consumer | ||||
|  | ||||
| try: | ||||
|     from inotify_simple import INotify, flags | ||||
| @@ -14,6 +15,15 @@ except ImportError: | ||||
|     INotify = flags = None | ||||
|  | ||||
|  | ||||
| class Handler(FileSystemEventHandler): | ||||
|  | ||||
|     def __init__(self, consumer): | ||||
|         self.consumer = consumer | ||||
|  | ||||
|     def on_created(self, event): | ||||
|         self.consumer.try_consume_file(event.src_path) | ||||
|  | ||||
|  | ||||
| class Command(BaseCommand): | ||||
|     """ | ||||
|     On every iteration of an infinite loop, consume what we can from the | ||||
| @@ -29,6 +39,8 @@ class Command(BaseCommand): | ||||
|         self.mail_fetcher = None | ||||
|         self.first_iteration = True | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
| @@ -38,111 +50,34 @@ class Command(BaseCommand): | ||||
|             nargs="?", | ||||
|             help="The consumption directory." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--loop-time", | ||||
|             default=settings.CONSUMER_LOOP_TIME, | ||||
|             type=int, | ||||
|             help="Wait time between each loop (in seconds)." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--mail-delta", | ||||
|             default=10, | ||||
|             type=int, | ||||
|             help="Wait time between each mail fetch (in minutes)." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--oneshot", | ||||
|             action="store_true", | ||||
|             help="Run only once." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--no-inotify", | ||||
|             action="store_true", | ||||
|             help="Don't use inotify, even if it's available.", | ||||
|             default=False | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|         directory = options["directory"] | ||||
|         loop_time = options["loop_time"] | ||||
|         mail_delta = options["mail_delta"] * 60 | ||||
|         use_inotify = INotify is not None and options["no_inotify"] is False | ||||
|  | ||||
|         try: | ||||
|             self.file_consumer = Consumer(consume=directory) | ||||
|             self.mail_fetcher = MailFetcher(consume=directory) | ||||
|         except (ConsumerError, MailFetcherError) as e: | ||||
|             raise CommandError(e) | ||||
|  | ||||
|         for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR): | ||||
|             os.makedirs(d, exist_ok=True) | ||||
|  | ||||
|         logging.getLogger(__name__).info( | ||||
|             "Starting document consumer at {}{}".format( | ||||
|                 directory, | ||||
|                 " with inotify" if use_inotify else "" | ||||
|             "Starting document consumer at {}".format( | ||||
|                 directory | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|         if options["oneshot"]: | ||||
|             self.loop_step(mail_delta) | ||||
|         else: | ||||
|             try: | ||||
|                 if use_inotify: | ||||
|                     self.loop_inotify(mail_delta) | ||||
|                 else: | ||||
|                     self.loop(loop_time, mail_delta) | ||||
|             except KeyboardInterrupt: | ||||
|                 print("Exiting") | ||||
|         # Consume all files as this is not done initially by the watchdog | ||||
|         for entry in os.scandir(directory): | ||||
|             if entry.is_file(): | ||||
|                 self.consumer.try_consume_file(entry.path) | ||||
|  | ||||
|     def loop(self, loop_time, mail_delta): | ||||
|         while True: | ||||
|             start_time = time.time() | ||||
|             if self.verbosity > 1: | ||||
|                 print(".", int(start_time)) | ||||
|             self.loop_step(mail_delta, start_time) | ||||
|             # Sleep until the start of the next loop step | ||||
|             time.sleep(max(0, start_time + loop_time - time.time())) | ||||
|  | ||||
|     def loop_step(self, mail_delta, time_now=None): | ||||
|  | ||||
|         # Occasionally fetch mail and store it to be consumed on the next loop | ||||
|         # We fetch email when we first start up so that it is not necessary to | ||||
|         # wait for 10 minutes after making changes to the config file. | ||||
|         next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
|         if self.first_iteration or time_now > next_mail_time: | ||||
|             self.first_iteration = False | ||||
|             self.mail_fetcher.pull() | ||||
|  | ||||
|         self.file_consumer.consume_new_files() | ||||
|  | ||||
|     def loop_inotify(self, mail_delta): | ||||
|         directory = self.file_consumer.consume | ||||
|         inotify = INotify() | ||||
|         inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO) | ||||
|  | ||||
|         # Run initial mail fetch and consume all currently existing documents | ||||
|         self.loop_step(mail_delta) | ||||
|         next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
|  | ||||
|         while True: | ||||
|             # Consume documents until next_mail_time | ||||
|             while True: | ||||
|                 delta = next_mail_time - time.time() | ||||
|                 if delta > 0: | ||||
|                     for event in inotify.read(timeout=delta): | ||||
|                         file = os.path.join(directory, event.name) | ||||
|                         if os.path.isfile(file): | ||||
|                             self.file_consumer.try_consume_file(file) | ||||
|                         else: | ||||
|                             self.logger.warning( | ||||
|                                 "Skipping %s as it is not a file", | ||||
|                                 file | ||||
|                             ) | ||||
|                 else: | ||||
|                     break | ||||
|  | ||||
|             self.mail_fetcher.pull() | ||||
|             next_mail_time = self.mail_fetcher.last_checked + mail_delta | ||||
|         # Start the watchdog. Woof! | ||||
|         observer = Observer() | ||||
|         event_handler = Handler(self.consumer) | ||||
|         observer.schedule(event_handler, directory, recursive=True) | ||||
|         observer.start() | ||||
|         try: | ||||
|             while observer.is_alive(): | ||||
|                 observer.join(1) | ||||
|         except KeyboardInterrupt: | ||||
|             observer.stop() | ||||
|         observer.join() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler