	Merge branch 'dev' into mail_rework
		| @@ -12,7 +12,7 @@ from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from .file_handling import generate_filename, create_source_path_directory | ||||
| from .models import Document, FileInfo | ||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||
| from .parsers import ParseError, get_parser_class | ||||
| from .signals import ( | ||||
|     document_consumption_finished, | ||||
| @@ -25,130 +25,198 @@ class ConsumerError(Exception): | ||||
|  | ||||
|  | ||||
| class Consumer: | ||||
|     """ | ||||
|     Loop over every file found in CONSUMPTION_DIR and: | ||||
|       1. Convert it to a greyscale pnm | ||||
|       2. Use tesseract on the pnm | ||||
|       3. Store the document in the MEDIA_ROOT with optional encryption | ||||
|       4. Store the OCR'd text in the database | ||||
|       5. Delete the document and image(s) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, consume=settings.CONSUMPTION_DIR, | ||||
|                  scratch=settings.SCRATCH_DIR): | ||||
|     def __init__(self): | ||||
|  | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|         self.logging_group = None | ||||
|  | ||||
|         self.consume = consume | ||||
|         self.scratch = scratch | ||||
|  | ||||
|         self.classifier = DocumentClassifier() | ||||
|  | ||||
|         os.makedirs(self.scratch, exist_ok=True) | ||||
|  | ||||
|         self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         if settings.PASSPHRASE: | ||||
|             self.storage_type = Document.STORAGE_TYPE_GPG | ||||
|  | ||||
|         if not self.consume: | ||||
|     @staticmethod | ||||
|     def pre_check_file_exists(filename): | ||||
|         if not os.path.isfile(filename): | ||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||
|                 filename)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def pre_check_consumption_dir(): | ||||
|         if not settings.CONSUMPTION_DIR: | ||||
|             raise ConsumerError( | ||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||
|                 "set." | ||||
|                 "set.") | ||||
|  | ||||
|         if not os.path.isdir(settings.CONSUMPTION_DIR): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format( | ||||
|                     settings.CONSUMPTION_DIR)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def pre_check_regex(filename): | ||||
|         if not re.match(FileInfo.REGEXES["title"], filename): | ||||
|             raise ConsumerError( | ||||
|                 "Filename {} does not seem to be safe to " | ||||
|                 "consume".format(filename)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def pre_check_duplicate(filename): | ||||
|         with open(filename, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         if Document.objects.filter(checksum=checksum).exists(): | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(filename) | ||||
|             raise ConsumerError( | ||||
|                 "Not consuming {}: It is a duplicate.".format(filename) | ||||
|             ) | ||||
|  | ||||
|         if not os.path.exists(self.consume): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format(self.consume)) | ||||
|     @staticmethod | ||||
|     def pre_check_directories(): | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
|         }) | ||||
|  | ||||
|     @transaction.atomic | ||||
|     def try_consume_file(self, file): | ||||
|     def try_consume_file(self, | ||||
|                          filename, | ||||
|                          original_filename=None, | ||||
|                          force_title=None, | ||||
|                          force_correspondent_id=None, | ||||
|                          force_document_type_id=None, | ||||
|                          force_tag_ids=None): | ||||
|         """ | ||||
|         Return True if file was consumed | ||||
|         Return the document object if it was successfully created. | ||||
|         """ | ||||
|  | ||||
|         # this is for grouping logging entries for this particular file | ||||
|         # together. | ||||
|  | ||||
|         self.logging_group = uuid.uuid4() | ||||
|  | ||||
|         if not re.match(FileInfo.REGEXES["title"], file): | ||||
|             return False | ||||
|         # Make sure that preconditions for consuming the file are met. | ||||
|  | ||||
|         doc = file | ||||
|         self.pre_check_file_exists(filename) | ||||
|         self.pre_check_consumption_dir() | ||||
|         self.pre_check_directories() | ||||
|         self.pre_check_regex(filename) | ||||
|         self.pre_check_duplicate(filename) | ||||
|  | ||||
|         if self._is_duplicate(doc): | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "Skipping {} as it appears to be a duplicate".format(doc) | ||||
|             ) | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 self._cleanup_doc(doc) | ||||
|             return False | ||||
|         self.log("info", "Consuming {}".format(filename)) | ||||
|  | ||||
|         self.log("info", "Consuming {}".format(doc)) | ||||
|         # Determine the parser class. | ||||
|  | ||||
|         parser_class = get_parser_class(doc) | ||||
|         parser_class = get_parser_class(original_filename or filename) | ||||
|         if not parser_class: | ||||
|             self.log( | ||||
|                 "error", "No parsers could be found for {}".format(doc)) | ||||
|             return False | ||||
|             raise ConsumerError("No parsers available for {}".format(filename)) | ||||
|         else: | ||||
|             self.log("info", "Parser: {}".format(parser_class.__name__)) | ||||
|             self.log("debug", "Parser: {}".format(parser_class.__name__)) | ||||
|  | ||||
|         # Notify all listeners that we're going to do some work. | ||||
|  | ||||
|         document_consumption_started.send( | ||||
|             sender=self.__class__, | ||||
|             filename=doc, | ||||
|             filename=filename, | ||||
|             logging_group=self.logging_group | ||||
|         ) | ||||
|  | ||||
|         document_parser = parser_class(doc, self.logging_group) | ||||
|         # This doesn't parse the document yet, but gives us a parser. | ||||
|  | ||||
|         document_parser = parser_class(filename, self.logging_group) | ||||
|  | ||||
|         # However, this already created working directories which we have to | ||||
|         # clean up. | ||||
|  | ||||
|         # Parse the document. This may take some time. | ||||
|  | ||||
|         try: | ||||
|             self.log("info", "Generating thumbnail for {}...".format(doc)) | ||||
|             self.log("debug", "Generating thumbnail for {}...".format(filename)) | ||||
|             thumbnail = document_parser.get_optimised_thumbnail() | ||||
|             self.log("debug", "Parsing {}...".format(filename)) | ||||
|             text = document_parser.get_text() | ||||
|             date = document_parser.get_date() | ||||
|             document = self._store( | ||||
|                 text, | ||||
|                 doc, | ||||
|                 thumbnail, | ||||
|                 date | ||||
|             ) | ||||
|         except ParseError as e: | ||||
|             self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|             document_parser.cleanup() | ||||
|             return False | ||||
|         else: | ||||
|             document_parser.cleanup() | ||||
|             self._cleanup_doc(doc) | ||||
|             raise ConsumerError(e) | ||||
|  | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Document {} consumption finished".format(document) | ||||
|             ) | ||||
|         # Prepare the document classifier. | ||||
|  | ||||
|         # TODO: I don't really like to do this here, but this way we avoid | ||||
|         #   reloading the classifier multiple times, since there are multiple | ||||
|         #   post-consume hooks that all require the classifier. | ||||
|  | ||||
|         try: | ||||
|             classifier = DocumentClassifier() | ||||
|             classifier.reload() | ||||
|         except (FileNotFoundError, IncompatibleClassifierVersionError) as e: | ||||
|             logging.getLogger(__name__).warning( | ||||
|                 "Cannot classify documents: {}.".format(e)) | ||||
|             classifier = None | ||||
|  | ||||
|             try: | ||||
|                 self.classifier.reload() | ||||
|                 classifier = self.classifier | ||||
|             except (FileNotFoundError, IncompatibleClassifierVersionError) as e: | ||||
|                 logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e)) | ||||
|         # now that everything is done, we can start to store the document | ||||
|         # in the system. This will be a transaction and reasonably fast. | ||||
|         try: | ||||
|             with transaction.atomic(): | ||||
|  | ||||
|             document_consumption_finished.send( | ||||
|                 sender=self.__class__, | ||||
|                 document=document, | ||||
|                 logging_group=self.logging_group, | ||||
|                 classifier=classifier | ||||
|             ) | ||||
|             return True | ||||
|                 # store the document. | ||||
|                 document = self._store( | ||||
|                     text=text, | ||||
|                     doc=filename, | ||||
|                     thumbnail=thumbnail, | ||||
|                     date=date, | ||||
|                     original_filename=original_filename, | ||||
|                     force_title=force_title, | ||||
|                     force_correspondent_id=force_correspondent_id, | ||||
|                     force_document_type_id=force_document_type_id, | ||||
|                     force_tag_ids=force_tag_ids | ||||
|                 ) | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail, date): | ||||
|                 # If we get here, it was successful. Proceed with post-consume | ||||
|                 # hooks. If they fail, nothing will get changed. | ||||
|  | ||||
|         file_info = FileInfo.from_path(doc) | ||||
|                 document_consumption_finished.send( | ||||
|                     sender=self.__class__, | ||||
|                     document=document, | ||||
|                     logging_group=self.logging_group, | ||||
|                     classifier=classifier | ||||
|                 ) | ||||
|  | ||||
|                 # After everything is in the database, copy the files into | ||||
|                 # place. If this fails, we'll also rollback the transaction. | ||||
|  | ||||
|                 create_source_path_directory(document.source_path) | ||||
|                 self._write(document, filename, document.source_path) | ||||
|                 self._write(document, thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
|                 self.log("debug", "Deleting document {}".format(filename)) | ||||
|                 os.unlink(filename) | ||||
|         except Exception as e: | ||||
|             raise ConsumerError(e) | ||||
|         finally: | ||||
|             document_parser.cleanup() | ||||
|  | ||||
|         self.log( | ||||
|             "info", | ||||
|             "Document {} consumption finished".format(document) | ||||
|         ) | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail, date, | ||||
|                original_filename=None, | ||||
|                force_title=None, | ||||
|                force_correspondent_id=None, | ||||
|                force_document_type_id=None, | ||||
|                force_tag_ids=None): | ||||
|  | ||||
|         # If someone gave us the original filename, use it instead of doc. | ||||
|  | ||||
|         file_info = FileInfo.from_path(original_filename or doc) | ||||
|  | ||||
|         stats = os.stat(doc) | ||||
|  | ||||
| @@ -175,13 +243,21 @@ class Consumer: | ||||
|             self.log("debug", "Tagging with {}".format(tag_names)) | ||||
|             document.tags.add(*relevant_tags) | ||||
|  | ||||
|         if force_title: | ||||
|             document.title = force_title | ||||
|  | ||||
|         if force_correspondent_id: | ||||
|             document.correspondent = Correspondent.objects.get(pk=force_correspondent_id) | ||||
|  | ||||
|         if force_document_type_id: | ||||
|             document.document_type = DocumentType.objects.get(pk=force_document_type_id) | ||||
|  | ||||
|         if force_tag_ids: | ||||
|             for tag_id in force_tag_ids: | ||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||
|  | ||||
|         document.filename = generate_filename(document) | ||||
|  | ||||
|         create_source_path_directory(document.source_path) | ||||
|  | ||||
|         self._write(document, doc, document.source_path) | ||||
|         self._write(document, thumbnail, document.thumbnail_path) | ||||
|  | ||||
|         # We need to save the document twice, since we need the PK of the | ||||
|         # document in order to create its filename above. | ||||
|         document.save() | ||||
| @@ -196,13 +272,3 @@ class Consumer: | ||||
|                     return | ||||
|                 self.log("debug", "Encrypting") | ||||
|                 write_file.write(GnuPG.encrypted(read_file)) | ||||
|  | ||||
|     def _cleanup_doc(self, doc): | ||||
|         self.log("debug", "Deleting document {}".format(doc)) | ||||
|         os.unlink(doc) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _is_duplicate(doc): | ||||
|         with open(doc, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         return Document.objects.filter(checksum=checksum).exists() | ||||
|   | ||||
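The reworked consumer raises ConsumerError on every failure path and returns the created Document on success, instead of the old True/False contract. A minimal sketch of driving the new API directly, assuming the signatures shown in this diff; the filenames and primary keys are hypothetical:

import logging

from documents.consumer import Consumer, ConsumerError

def consume_with_overrides(path):
    consumer = Consumer()
    try:
        # original_filename drives parser selection and FileInfo parsing;
        # the force_* arguments override whatever the classifier would assign.
        return consumer.try_consume_file(
            path,
            original_filename="Bank - Statement.pdf",  # hypothetical upload name
            force_title="Statement",
            force_tag_ids=[1, 2],  # hypothetical Tag primary keys
        )
    except ConsumerError as e:
        # Pre-check failures, parse errors and post-consume write errors all land here.
        logging.getLogger(__name__).error("Consumption failed: %s", e)
        return None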
| @@ -1,9 +1,11 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| from django import forms | ||||
| from django.conf import settings | ||||
| from django_q.tasks import async_task | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
|  | ||||
|  | ||||
| @@ -18,15 +20,6 @@ class UploadForm(forms.Form): | ||||
|             raise forms.ValidationError("That filename is suspicious.") | ||||
|         return self.cleaned_data.get("document") | ||||
|  | ||||
|     def get_filename(self, i=None): | ||||
|         return os.path.join( | ||||
|             settings.CONSUMPTION_DIR, | ||||
|             "{}_{}".format( | ||||
|                 str(i), | ||||
|                 self.cleaned_data.get("document").name | ||||
|             ) if i else self.cleaned_data.get("document").name | ||||
|         ) | ||||
|  | ||||
|     def save(self): | ||||
|         """ | ||||
|         Since the consumer already does a lot of work, it's easier just to save | ||||
| @@ -35,15 +28,13 @@ class UploadForm(forms.Form): | ||||
|         """ | ||||
|  | ||||
|         document = self.cleaned_data.get("document").read() | ||||
|         original_filename = self.cleaned_data.get("document").name | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
|         file_name = self.get_filename() | ||||
|         i = 0 | ||||
|         while os.path.exists(file_name): | ||||
|             i += 1 | ||||
|             file_name = self.get_filename(i) | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f: | ||||
|  | ||||
|         with open(file_name, "wb") as f: | ||||
|             f.write(document) | ||||
|             os.utime(file_name, times=(t, t)) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", f.name, original_filename, task_name=os.path.basename(original_filename)) | ||||
|   | ||||
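The form now writes the upload to a named temporary file in SCRATCH_DIR with delete=False, so the file outlives the request and can be handed to the background task by path rather than by content. A sketch of that hand-off pattern, with a hypothetical scratch directory:

import os
import tempfile

SCRATCH_DIR = "/tmp/paperless-scratch"  # hypothetical stand-in for settings.SCRATCH_DIR
os.makedirs(SCRATCH_DIR, exist_ok=True)

with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf",
                                 dir=SCRATCH_DIR, delete=False) as f:
    f.write(b"%PDF-1.4 ...")  # the uploaded bytes
    path = f.name  # persists after the block; the consumer unlinks it when done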
| @@ -3,10 +3,10 @@ import os | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from django_q.tasks import async_task | ||||
| from watchdog.events import FileSystemEventHandler | ||||
| from watchdog.observers import Observer | ||||
|  | ||||
| from documents.consumer import Consumer | ||||
| from watchdog.observers.polling import PollingObserver | ||||
|  | ||||
| try: | ||||
|     from inotify_simple import INotify, flags | ||||
| @@ -16,13 +16,10 @@ except ImportError: | ||||
|  | ||||
| class Handler(FileSystemEventHandler): | ||||
|  | ||||
|     def __init__(self, consumer): | ||||
|         self.consumer = consumer | ||||
|  | ||||
|     def _consume(self, file): | ||||
|         if os.path.isfile(file): | ||||
|             try: | ||||
|                 self.consumer.try_consume_file(file) | ||||
|                 async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file)) | ||||
|             except Exception as e: | ||||
|                 # Catch all so that the consumer won't crash. | ||||
|                 logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) | ||||
| @@ -45,7 +42,9 @@ class Command(BaseCommand): | ||||
|         self.verbosity = 0 | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|         self.file_consumer = None | ||||
|         self.mail_fetcher = None | ||||
|         self.first_iteration = True | ||||
|  | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
| @@ -62,9 +61,6 @@ class Command(BaseCommand): | ||||
|         self.verbosity = options["verbosity"] | ||||
|         directory = options["directory"] | ||||
|  | ||||
|         for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR): | ||||
|             os.makedirs(d, exist_ok=True) | ||||
|  | ||||
|         logging.getLogger(__name__).info( | ||||
|             "Starting document consumer at {}".format( | ||||
|                 directory | ||||
| @@ -74,11 +70,16 @@ class Command(BaseCommand): | ||||
|         # Consume all files as this is not done initially by the watchdog | ||||
|         for entry in os.scandir(directory): | ||||
|             if entry.is_file(): | ||||
|                 self.consumer.try_consume_file(entry.path) | ||||
|                 async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path)) | ||||
|  | ||||
|         # Start the watchdog. Woof! | ||||
|         observer = Observer() | ||||
|         event_handler = Handler(self.consumer) | ||||
|         if settings.CONSUMER_POLLING > 0: | ||||
|             logging.getLogger(__name__).info('Using polling instead of file' | ||||
|                                              'system notifications.') | ||||
|             observer = PollingObserver(timeout=settings.CONSUMER_POLLING) | ||||
|         else: | ||||
|             observer = Observer() | ||||
|         event_handler = Handler() | ||||
|         observer.schedule(event_handler, directory, recursive=True) | ||||
|         observer.start() | ||||
|         try: | ||||
|   | ||||
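The polling fallback matters when the consumption directory sits on a network mount (NFS, CIFS and the like), where inotify-style notifications are often never delivered. A standalone sketch of the same observer selection, assuming watchdog is installed:

from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

def make_observer(polling_interval):
    # PollingObserver stats the directory tree on a timer instead of relying
    # on OS file-system events, which many network mounts do not deliver.
    if polling_interval > 0:
        return PollingObserver(timeout=polling_interval)
    return Observer()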
| @@ -41,15 +41,16 @@ def get_parser_class(doc): | ||||
|     Determine the appropriate parser class based on the file | ||||
|     """ | ||||
|  | ||||
|     parsers = [] | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parsers.append(response[1]) | ||||
|  | ||||
|     options = [] | ||||
|     for parser in parsers: | ||||
|         result = parser(doc) | ||||
|         if result: | ||||
|             options.append(result) | ||||
|  | ||||
|     # His last command was: COME! And they came. All of them. Even the parsers. | ||||
|  | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         parser_test = parser_declaration["test"] | ||||
|  | ||||
|         if parser_test(doc): | ||||
|             options.append(parser_declaration) | ||||
|  | ||||
|     if not options: | ||||
|         return None | ||||
|   | ||||
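Under the new protocol a declaration handler returns a dict rather than a callable: get_parser_class() runs each declaration's "test" against the filename and collects the matching declarations as options, picking a winner by weight. A sketch of a third-party declaration; the parser class and file type are hypothetical:

import re

from documents.signals import document_consumer_declaration

MATCHING_FILES = re.compile(r"^.*\.log$")  # hypothetical file type

class LogDocumentParser:  # stand-in; a real declaration names a DocumentParser subclass
    pass

def log_consumer_declaration(sender, **kwargs):
    return {
        "parser": LogDocumentParser,
        "weight": 5,  # higher weight wins when several parsers match a file
        "test": lambda doc: bool(MATCHING_FILES.match(doc.lower())),
    }

document_consumer_declaration.connect(log_consumer_declaration)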
| @@ -6,6 +6,7 @@ from whoosh.writing import AsyncWriter | ||||
| from documents import index | ||||
| from documents.classifier import DocumentClassifier, \ | ||||
|     IncompatibleClassifierVersionError | ||||
| from documents.consumer import Consumer, ConsumerError | ||||
| from documents.models import Document | ||||
|  | ||||
|  | ||||
| @@ -49,3 +50,27 @@ def train_classifier(): | ||||
|         logging.getLogger(__name__).error( | ||||
|             "Classifier error: " + str(e) | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def consume_file(file, | ||||
|                  original_filename=None, | ||||
|                  force_title=None, | ||||
|                  force_correspondent_id=None, | ||||
|                  force_document_type_id=None, | ||||
|                  force_tag_ids=None): | ||||
|  | ||||
|     document = Consumer().try_consume_file( | ||||
|         file, | ||||
|         original_filename=original_filename, | ||||
|         force_title=force_title, | ||||
|         force_correspondent_id=force_correspondent_id, | ||||
|         force_document_type_id=force_document_type_id, | ||||
|         force_tag_ids=force_tag_ids) | ||||
|  | ||||
|     if document: | ||||
|         return "Success. New document id {} created".format( | ||||
|             document.pk | ||||
|         ) | ||||
|     else: | ||||
|         raise ConsumerError("Unknown error: Returned document was None, but " | ||||
|                             "no error message was given.") | ||||
|   | ||||
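Both the upload form and the consumer command dispatch this function through django-q instead of consuming inline; a ConsumerError raised here marks the queued task as failed, with the message as its result. A minimal dispatch, with a hypothetical scratch path and tag id:

from django_q.tasks import async_task

async_task(
    "documents.tasks.consume_file",
    "/tmp/paperless-upload-abc123.pdf",  # hypothetical scratch file
    original_filename="invoice.pdf",
    force_tag_ids=[3],                   # hypothetical Tag primary key
    task_name="invoice.pdf",             # django-q option, not a consume_file argument
)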
| @@ -1,8 +1,17 @@ | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
| from unittest.mock import MagicMock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import FileInfo, Tag | ||||
| from ..consumer import Consumer, ConsumerError | ||||
| from ..models import FileInfo, Tag, Correspondent, DocumentType, Document | ||||
| from ..parsers import DocumentParser, ParseError | ||||
|  | ||||
|  | ||||
| class TestAttributes(TestCase): | ||||
| @@ -394,3 +403,251 @@ class TestFieldPermutations(TestCase): | ||||
|             self.assertEqual(info.created.year, 2019) | ||||
|             self.assertEqual(info.created.month, 9) | ||||
|             self.assertEqual(info.created.day, 8) | ||||
|  | ||||
|  | ||||
| class DummyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(DummyParser, self).__init__(path, logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|         return "The Text" | ||||
|  | ||||
|  | ||||
| class FaultyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(path, logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|         raise ParseError("Does not compute.") | ||||
|  | ||||
|  | ||||
| class TestConsumer(TestCase): | ||||
|  | ||||
|     def make_dummy_parser(self, path, logging_group): | ||||
|         return DummyParser(path, logging_group, self.scratch_dir) | ||||
|  | ||||
|     def make_faulty_parser(self, path, logging_group): | ||||
|         return FaultyParser(path, logging_group, self.scratch_dir) | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch_dir = tempfile.mkdtemp() | ||||
|         self.media_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch_dir, | ||||
|             MEDIA_ROOT=self.media_dir, | ||||
|             ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"), | ||||
|             THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails") | ||||
|         ).enable() | ||||
|  | ||||
|         patcher = mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|         m = patcher.start() | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_dummy_parser, | ||||
|             "test": lambda _: True, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
|         self.addCleanup(patcher.stop) | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.media_dir, ignore_errors=True) | ||||
|  | ||||
|     def get_test_file(self): | ||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir) | ||||
|         os.close(fd)  # mkstemp hands back an open descriptor; we only need the path | ||||
|         return f | ||||
|  | ||||
|     def testNormalOperation(self): | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
|         document = self.consumer.try_consume_file(filename) | ||||
|  | ||||
|         self.assertEqual(document.content, "The Text") | ||||
|         self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0]) | ||||
|         self.assertIsNone(document.correspondent) | ||||
|         self.assertIsNone(document.document_type) | ||||
|         self.assertEqual(document.filename, "0000001.pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile( | ||||
|             document.source_path | ||||
|         )) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile( | ||||
|             document.thumbnail_path | ||||
|         )) | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(filename)) | ||||
|  | ||||
|     def testOverrideFilename(self): | ||||
|         filename = self.get_test_file() | ||||
|         overrideFilename = "My Bank - Statement for November.pdf" | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, original_filename=overrideFilename) | ||||
|  | ||||
|         self.assertEqual(document.correspondent.name, "My Bank") | ||||
|         self.assertEqual(document.title, "Statement for November") | ||||
|  | ||||
|     def testOverrideTitle(self): | ||||
|  | ||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_title="Override Title") | ||||
|         self.assertEqual(document.title, "Override Title") | ||||
|  | ||||
|     def testOverrideCorrespondent(self): | ||||
|         c = Correspondent.objects.create(name="test") | ||||
|  | ||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_correspondent_id=c.pk) | ||||
|         self.assertEqual(document.correspondent.id, c.id) | ||||
|  | ||||
|     def testOverrideDocumentType(self): | ||||
|         dt = DocumentType.objects.create(name="test") | ||||
|  | ||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_document_type_id=dt.pk) | ||||
|         self.assertEqual(document.document_type.id, dt.id) | ||||
|  | ||||
|     def testOverrideTags(self): | ||||
|         t1 = Tag.objects.create(name="t1") | ||||
|         t2 = Tag.objects.create(name="t2") | ||||
|         t3 = Tag.objects.create(name="t3") | ||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_tag_ids=[t1.id, t3.id]) | ||||
|  | ||||
|         self.assertIn(t1, document.tags.all()) | ||||
|         self.assertNotIn(t2, document.tags.all()) | ||||
|         self.assertIn(t3, document.tags.all()) | ||||
|  | ||||
|     def testNotAFile(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file("non-existing-file") | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).endswith('It is not a file')) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR=None) | ||||
|     def testConsumptionDirUnset(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="asd") | ||||
|     def testNoConsumptionDir(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "Consumption directory asd does not exist") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def testNoParsers(self, m): | ||||
|         m.return_value = [] | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).startswith("No parsers available")) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def testFaultyParser(self, m): | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_faulty_parser, | ||||
|             "test": lambda _: True, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "Does not compute.") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception.") | ||||
|  | ||||
|     @mock.patch("documents.consumer.Consumer._write") | ||||
|     def testPostSaveError(self, m): | ||||
|         filename = self.get_test_file() | ||||
|         m.side_effect = OSError("NO.") | ||||
|         try: | ||||
|             self.consumer.try_consume_file(filename) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "NO.") | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|         # file not deleted | ||||
|         self.assertTrue(os.path.isfile(filename)) | ||||
|  | ||||
|         # Database empty | ||||
|         self.assertEqual(len(Document.objects.all()), 0) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def testFilenameHandling(self): | ||||
|         filename = self.get_test_file() | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, original_filename="Bank - Test.pdf", force_title="new docs") | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "bank/new-docs-0000001.pdf") | ||||
|  | ||||
|     @mock.patch("documents.consumer.DocumentClassifier") | ||||
|     def testClassifyDocument(self, m): | ||||
|         correspondent = Correspondent.objects.create(name="test") | ||||
|         dtype = DocumentType.objects.create(name="test") | ||||
|         t1 = Tag.objects.create(name="t1") | ||||
|         t2 = Tag.objects.create(name="t2") | ||||
|  | ||||
|         m.return_value = MagicMock() | ||||
|         m.return_value.predict_correspondent.return_value = correspondent.pk | ||||
|         m.return_value.predict_document_type.return_value = dtype.pk | ||||
|         m.return_value.predict_tags.return_value = [t1.pk] | ||||
|  | ||||
|         document = self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         self.assertEqual(document.correspondent, correspondent) | ||||
|         self.assertEqual(document.document_type, dtype) | ||||
|         self.assertIn(t1, document.tags.all()) | ||||
|         self.assertNotIn(t2, document.tags.all()) | ||||
|   | ||||
| @@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, lambda _: {"weight": 0, "parser": DummyParser}), | ||||
|             (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, lambda _: {"weight": 0, "parser": DummyParser1}), | ||||
|             (None, lambda _: {"weight": 1, "parser": DummyParser2}), | ||||
|             (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}), | ||||
|             (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase): | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def test__get_parser_class_0_parsers(self, m, *args): | ||||
|         m.return_value = ((None, lambda _: None),) | ||||
|         m.return_value = [] | ||||
|         with TemporaryDirectory() as tmpdir: | ||||
|             self.assertIsNone( | ||||
|                 get_parser_class("doc.pdf") | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| import json | ||||
| import math | ||||
| import multiprocessing | ||||
| import os | ||||
| import re | ||||
| @@ -263,24 +264,58 @@ LOGGING = { | ||||
| # Task queue                                                                  # | ||||
| ############################################################################### | ||||
|  | ||||
|  | ||||
| # Sensible defaults for multitasking: | ||||
| # use a fair balance between worker processes and threads per worker so that | ||||
| # both consuming many documents in parallel and consuming large documents | ||||
| # remain reasonably fast. | ||||
| # Favors threads per worker on smaller systems and never exceeds cpu_count() | ||||
| # in total. | ||||
|  | ||||
| def default_task_workers(): | ||||
|     try: | ||||
|         return max( | ||||
|             math.floor(math.sqrt(multiprocessing.cpu_count())), | ||||
|             1 | ||||
|         ) | ||||
|     except NotImplementedError: | ||||
|         return 1 | ||||
|  | ||||
|  | ||||
| TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers())) | ||||
|  | ||||
| Q_CLUSTER = { | ||||
|     'name': 'paperless', | ||||
|     'catch_up': False, | ||||
|     'workers': TASK_WORKERS, | ||||
|     'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379") | ||||
| } | ||||
|  | ||||
|  | ||||
| def default_threads_per_worker(): | ||||
|     try: | ||||
|         return max( | ||||
|             math.floor(multiprocessing.cpu_count() / TASK_WORKERS), | ||||
|             1 | ||||
|         ) | ||||
|     except NotImplementedError: | ||||
|         return 1 | ||||
|  | ||||
|  | ||||
| THREADS_PER_WORKER = int(os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())) | ||||
|  | ||||
| ############################################################################### | ||||
| # Paperless Specific Settings                                                 # | ||||
| ############################################################################### | ||||
|  | ||||
| CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0)) | ||||
|  | ||||
| CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") | ||||
|  | ||||
| # The default language that tesseract will attempt to use when parsing | ||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
|  | ||||
| # The amount of threads to use for OCR | ||||
| OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count())) | ||||
|  | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | ||||
| @@ -325,5 +360,6 @@ FILENAME_PARSE_TRANSFORMS = [] | ||||
| for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): | ||||
|     FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) | ||||
|  | ||||
| # TODO: this should not have a prefix. | ||||
| # Specify the filename format for output files | ||||
| PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | ||||
|   | ||||
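To see how the two defaults interact: workers defaults to floor(sqrt(cpu_count())) and threads per worker to floor(cpu_count() / workers), so their product never exceeds the core count while small machines still get at least one of each. A quick worked check over a few hypothetical core counts:

import math

for cpu in (1, 4, 16):
    workers = max(math.floor(math.sqrt(cpu)), 1)
    threads = max(math.floor(cpu / workers), 1)
    assert workers * threads <= cpu
    print(cpu, workers, threads)  # 1 -> 1x1, 4 -> 2x2, 16 -> 4x4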
| @@ -1,5 +1,7 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
| from paperless_tesseract.signals import tesseract_consumer_declaration | ||||
|  | ||||
|  | ||||
| class PaperlessTesseractConfig(AppConfig): | ||||
|  | ||||
| @@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig): | ||||
|  | ||||
|         from documents.signals import document_consumer_declaration | ||||
|  | ||||
|         from .signals import ConsumerDeclaration | ||||
|  | ||||
|         document_consumer_declaration.connect(ConsumerDeclaration.handle) | ||||
|         document_consumer_declaration.connect(tesseract_consumer_declaration) | ||||
|  | ||||
|         AppConfig.ready(self) | ||||
|   | ||||
| @@ -2,7 +2,7 @@ import itertools | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| from multiprocessing.pool import ThreadPool | ||||
|  | ||||
| import langdetect | ||||
| import pdftotext | ||||
| @@ -151,7 +151,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         self.log("info", "Running unpaper on {} pages...".format(len(pnms))) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with Pool(processes=settings.OCR_THREADS) as pool: | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             pnms = pool.map(run_unpaper, pnms) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
| @@ -166,7 +166,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|         self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) | ||||
|         with Pool(processes=settings.OCR_THREADS) as pool: | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             return r | ||||
|  | ||||
|   | ||||
| @@ -3,21 +3,16 @@ import re | ||||
| from .parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class ConsumerDeclaration: | ||||
| def tesseract_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": RasterisedDocumentParser, | ||||
|         "weight": 0, | ||||
|         "test": tesseract_consumer_test | ||||
|     } | ||||
|  | ||||
|     MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def handle(cls, sender, **kwargs): | ||||
|         return cls.test | ||||
| MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def test(cls, doc): | ||||
|  | ||||
|         if cls.MATCHING_FILES.match(doc.lower()): | ||||
|             return { | ||||
|                 "parser": RasterisedDocumentParser, | ||||
|                 "weight": 0 | ||||
|             } | ||||
|  | ||||
|         return None | ||||
| def tesseract_consumer_test(doc): | ||||
|     return MATCHING_FILES.match(doc.lower()) | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| from django.test import TestCase | ||||
|  | ||||
| from ..signals import ConsumerDeclaration | ||||
| from paperless_tesseract.signals import tesseract_consumer_test | ||||
|  | ||||
|  | ||||
| class SignalsTestCase(TestCase): | ||||
| @@ -20,7 +20,7 @@ class SignalsTestCase(TestCase): | ||||
|         for prefix in prefixes: | ||||
|             for suffix in suffixes: | ||||
|                 name = "{}.{}".format(prefix, suffix) | ||||
|                 self.assertTrue(ConsumerDeclaration.test(name)) | ||||
|                 self.assertTrue(tesseract_consumer_test(name)) | ||||
|  | ||||
|     def test_test_handles_various_file_names_false(self): | ||||
|  | ||||
| @@ -30,7 +30,7 @@ class SignalsTestCase(TestCase): | ||||
|         for prefix in prefixes: | ||||
|             for suffix in suffixes: | ||||
|                 name = "{}.{}".format(prefix, suffix) | ||||
|                 self.assertFalse(ConsumerDeclaration.test(name)) | ||||
|                 self.assertFalse(tesseract_consumer_test(name)) | ||||
|  | ||||
|         self.assertFalse(ConsumerDeclaration.test("")) | ||||
|         self.assertFalse(ConsumerDeclaration.test("doc")) | ||||
|         self.assertFalse(tesseract_consumer_test("")) | ||||
|         self.assertFalse(tesseract_consumer_test("doc")) | ||||
|   | ||||
| @@ -1,5 +1,7 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
| from paperless_text.signals import text_consumer_declaration | ||||
|  | ||||
|  | ||||
| class PaperlessTextConfig(AppConfig): | ||||
|  | ||||
| @@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig): | ||||
|  | ||||
|         from documents.signals import document_consumer_declaration | ||||
|  | ||||
|         from .signals import ConsumerDeclaration | ||||
|  | ||||
|         document_consumer_declaration.connect(ConsumerDeclaration.handle) | ||||
|         document_consumer_declaration.connect(text_consumer_declaration) | ||||
|  | ||||
|         AppConfig.ready(self) | ||||
|   | ||||
| @@ -3,21 +3,16 @@ import re | ||||
| from .parsers import TextDocumentParser | ||||
|  | ||||
|  | ||||
| class ConsumerDeclaration: | ||||
| def text_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": TextDocumentParser, | ||||
|         "weight": 10, | ||||
|         "test": text_consumer_test | ||||
|     } | ||||
|  | ||||
|     MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def handle(cls, sender, **kwargs): | ||||
|         return cls.test | ||||
| MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def test(cls, doc): | ||||
|  | ||||
|         if cls.MATCHING_FILES.match(doc.lower()): | ||||
|             return { | ||||
|                 "parser": TextDocumentParser, | ||||
|                 "weight": 10 | ||||
|             } | ||||
|  | ||||
|         return None | ||||
| def text_consumer_test(doc): | ||||
|     return MATCHING_FILES.match(doc.lower()) | ||||
|   | ||||
| @@ -6,7 +6,6 @@ ignore = E501 | ||||
| DJANGO_SETTINGS_MODULE=paperless.settings | ||||
| addopts = --pythonwarnings=all | ||||
| env = | ||||
|   PAPERLESS_PASSPHRASE=THISISNOTASECRET | ||||
|   PAPERLESS_SECRET=paperless | ||||
|   PAPERLESS_EMAIL_SECRET=paperless | ||||
|  | ||||
| @@ -15,4 +14,4 @@ env = | ||||
| source = | ||||
|   ./ | ||||
| omit = | ||||
|   */tests | ||||
|   */tests/* | ||||
|   | ||||