From 4e144183fa98b3564be7e1172f6dd820b2a13d5e Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Thu, 29 Oct 2020 22:36:24 +0100 Subject: [PATCH 001/101] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ef90456a8..b5755d80f 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents. This project extends on the project and modernizes many things. +This project is still under development. There also is no automatic way yet to migrate your current paperless setup to this version. I'm working on that. + # How it Works Paperless does not control your scanner, it only helps you deal with what your scanner produces. From e56c715d7f91a0b2630d4434030678e3747f931f Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Wed, 4 Nov 2020 19:41:24 +0100 Subject: [PATCH 002/101] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a4edbb87b..ce4277642 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ +# Paperless-ng + [Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents. Paperless-ng is a fork of the original project, adding a new interface and many other changes under the hood. For a detailed list of changes, see below. This project is still in development and some things may not work as expected. -This project is still under development. There also is no automatic way yet to migrate your current paperless setup to this version. I'm working on that. - # How it Works Paperless does not control your scanner, it only helps you deal with what your scanner produces. From 54f04650d16278f23e9239af196493dbf52f92e1 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Tue, 10 Nov 2020 01:47:35 +0100 Subject: [PATCH 003/101] fixed an issue with the searcher. --- src/documents/index.py | 9 +++++++-- src/documents/views.py | 13 ++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index a099f670c..82a35a63e 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,4 +1,5 @@ import logging +from contextlib import contextmanager from django.db import models from django.dispatch import receiver @@ -99,15 +100,19 @@ def remove_document_from_index(document): remove_document(writer, document) +@contextmanager def query_page(ix, query, page): - with ix.searcher() as searcher: + searcher = ix.searcher() + try: query_parser = MultifieldParser(["content", "title", "correspondent"], ix.schema).parse(query) result_page = searcher.search_page(query_parser, page) result_page.results.fragmenter = highlight.ContextFragmenter( surround=50) result_page.results.formatter = JsonFormatter() - return result_page + yield result_page + finally: + searcher.close() def autocomplete(ix, term, limit=10): diff --git a/src/documents/views.py b/src/documents/views.py index b3d6012f1..8cc330141 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -191,13 +191,12 @@ class SearchView(APIView): except (ValueError, TypeError): page = 1 - result_page = index.query_page(self.ix, query, page) - - return Response( - {'count': len(result_page), - 'page': result_page.pagenum, - 'page_count': result_page.pagecount, - 'results': list(map(self.add_infos_to_hit, result_page))}) + with index.query_page(self.ix, query, page) as result_page: + return Response( + {'count': len(result_page), + 'page': result_page.pagenum, + 'page_count': result_page.pagecount, + 'results': list(map(self.add_infos_to_hit, result_page))}) else: return Response({ From 83f82f3caf6ec7bc68624d36c96bb220e7cf3ab6 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Tue, 10 Nov 2020 01:47:58 +0100 Subject: [PATCH 004/101] added a setting: delete duplicate documents --- paperless.conf.example | 4 ++++ src/documents/consumer.py | 2 ++ src/paperless/settings.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/paperless.conf.example b/paperless.conf.example index 48df40ab2..1c62256ab 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -143,6 +143,10 @@ PAPERLESS_EMAIL_SECRET="" #### Software Tweaks #### ############################################################################### +# When the consumer detects a duplicate document, it will not touch the +# original document. This default behavior can be changed here. +#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false" + # After a document is consumed, Paperless can trigger an arbitrary script if # you like. This script will be passed a number of arguments for you to work # with. The default is blank, which means nothing will be executed. For more diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f61d11136..75e6f6120 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -84,6 +84,8 @@ class Consumer: "warning", "Skipping {} as it appears to be a duplicate".format(doc) ) + if settings.CONSUMER_DELETE_DUPLICATES: + self._cleanup_doc(doc) return False self.log("info", "Consuming {}".format(doc)) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 2c96350dc..06dfdcd84 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -258,6 +258,8 @@ Q_CLUSTER = { # Paperless Specific Settings # ############################################################################### +CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") + # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") From 6795580739f26931f9ea1ee27ceaac3e65c8edce Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:13:54 +0100 Subject: [PATCH 005/101] remove only automatically created schedules (almost) --- src/documents/migrations/1001_auto_20201109_1636.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/migrations/1001_auto_20201109_1636.py b/src/documents/migrations/1001_auto_20201109_1636.py index 8d6a0f584..138de6f91 100644 --- a/src/documents/migrations/1001_auto_20201109_1636.py +++ b/src/documents/migrations/1001_auto_20201109_1636.py @@ -13,7 +13,9 @@ def add_schedules(apps, schema_editor): def remove_schedules(apps, schema_editor): - Schedule.objects.all().delete() + Schedule.objects.filter(func='documents.tasks.train_classifier').delete() + Schedule.objects.filter(func='documents.tasks.index_optimize').delete() + Schedule.objects.filter(func='documents.tasks.consume_mail').delete() class Migration(migrations.Migration): From 02ef7cb0388970d894f85000563fe2249a967f9e Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:14:21 +0100 Subject: [PATCH 006/101] small consumer fixes --- src/documents/consumer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 75e6f6120..96aad7d49 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -60,7 +60,6 @@ class Consumer: raise ConsumerError( "Consumption directory {} does not exist".format(self.consume)) - def log(self, level, message): getattr(self.logger, level)(message, extra={ "group": self.logging_group @@ -98,7 +97,6 @@ class Consumer: else: self.log("info", "Parser: {}".format(parser_class.__name__)) - document_consumption_started.send( sender=self.__class__, filename=doc, @@ -110,9 +108,10 @@ class Consumer: try: self.log("info", "Generating thumbnail for {}...".format(doc)) thumbnail = document_parser.get_optimised_thumbnail() + text = document_parser.get_text() date = document_parser.get_date() document = self._store( - document_parser.get_text(), + text, doc, thumbnail, date From 734da28b69a1e402ab6823efbf8f2c12c91b4d26 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:21:33 +0100 Subject: [PATCH 007/101] fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. --- src/documents/apps.py | 3 - src/documents/consumer.py | 8 +- src/documents/file_handling.py | 92 ++++ .../management/commands/document_importer.py | 27 +- .../migrations/1002_auto_20201111_1105.py | 18 + src/documents/models.py | 245 +---------- src/documents/signals/handlers.py | 60 ++- src/documents/tests/test_file_handling.py | 411 ++++-------------- 8 files changed, 287 insertions(+), 577 deletions(-) create mode 100644 src/documents/file_handling.py create mode 100644 src/documents/migrations/1002_auto_20201111_1105.py diff --git a/src/documents/apps.py b/src/documents/apps.py index 83e671d07..6cf815122 100644 --- a/src/documents/apps.py +++ b/src/documents/apps.py @@ -14,7 +14,6 @@ class DocumentsConfig(AppConfig): add_inbox_tags, run_pre_consume_script, run_post_consume_script, - cleanup_document_deletion, set_log_entry, set_correspondent, set_document_type, @@ -33,6 +32,4 @@ class DocumentsConfig(AppConfig): document_consumption_finished.connect(add_to_index) document_consumption_finished.connect(run_post_consume_script) - post_delete.connect(cleanup_document_deletion) - AppConfig.ready(self) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 96aad7d49..2e8c5493f 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -11,6 +11,7 @@ from django.utils import timezone from paperless.db import GnuPG from .classifier import DocumentClassifier, IncompatibleClassifierVersionError +from .file_handling import generate_filename, create_source_path_directory from .models import Document, FileInfo from .parsers import ParseError, get_parser_class from .signals import ( @@ -174,10 +175,15 @@ class Consumer: self.log("debug", "Tagging with {}".format(tag_names)) document.tags.add(*relevant_tags) + document.filename = generate_filename(document) + + create_source_path_directory(document.source_path) + self._write(document, doc, document.source_path) self._write(document, thumbnail, document.thumbnail_path) - #TODO: why do we need to save the document again? + # We need to save the document twice, since we need the PK of the + # document in order to create its filename above. document.save() return document diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py new file mode 100644 index 000000000..cac317d4c --- /dev/null +++ b/src/documents/file_handling.py @@ -0,0 +1,92 @@ +import os +from collections import defaultdict + +from django.conf import settings +from django.template.defaultfilters import slugify + + +def create_source_path_directory(source_path): + os.makedirs(os.path.dirname(source_path), exist_ok=True) + + +def delete_empty_directories(directory): + # Go up in the directory hierarchy and try to delete all directories + directory = os.path.normpath(directory) + root = os.path.normpath(settings.ORIGINALS_DIR) + + if not directory.startswith(root + os.path.sep): + # don't do anything outside our originals folder. + + # append os.path.set so that we avoid these cases: + # directory = /home/originals2/test + # root = /home/originals ("/" gets appended and startswith fails) + return + + while directory != root: + if not os.listdir(directory): + # it's empty + try: + os.rmdir(directory) + except OSError: + # whatever. empty directories aren't that bad anyway. + return + else: + # it's not empty. + return + + # go one level up + directory = os.path.normpath(os.path.dirname(directory)) + + +def many_to_dictionary(field): + # Converts ManyToManyField to dictionary by assuming, that field + # entries contain an _ or - which will be used as a delimiter + mydictionary = dict() + + for index, t in enumerate(field.all()): + # Populate tag names by index + mydictionary[index] = slugify(t.name) + + # Find delimiter + delimiter = t.name.find('_') + + if delimiter == -1: + delimiter = t.name.find('-') + + if delimiter == -1: + continue + + key = t.name[:delimiter] + value = t.name[delimiter + 1:] + + mydictionary[slugify(key)] = slugify(value) + + return mydictionary + + +def generate_filename(document): + # Create filename based on configured format + if settings.PAPERLESS_FILENAME_FORMAT is not None: + tags = defaultdict(lambda: slugify(None), + many_to_dictionary(document.tags)) + path = settings.PAPERLESS_FILENAME_FORMAT.format( + correspondent=slugify(document.correspondent), + title=slugify(document.title), + created=document.created.date(), + added=slugify(document.added), + tags=tags, + ) + else: + path = "" + + # Always append the primary key to guarantee uniqueness of filename + if len(path) > 0: + filename = "%s-%07i.%s" % (path, document.pk, document.file_type) + else: + filename = "%07i.%s" % (document.pk, document.file_type) + + # Append .gpg for encrypted files + if document.storage_type == document.STORAGE_TYPE_GPG: + filename += ".gpg" + + return filename diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index ae5c1853f..ef3eaafc0 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -8,6 +8,7 @@ from django.core.management import call_command from documents.models import Document from paperless.db import GnuPG +from ...file_handling import generate_filename, create_source_path_directory from ...mixins import Renderable @@ -82,6 +83,10 @@ class Command(Renderable, BaseCommand): def _import_files_from_manifest(self): + storage_type = Document.STORAGE_TYPE_UNENCRYPTED + if settings.PASSPHRASE: + storage_type = Document.STORAGE_TYPE_GPG + for record in self.manifest: if not record["model"] == "documents.document": @@ -94,6 +99,14 @@ class Command(Renderable, BaseCommand): document_path = os.path.join(self.source, doc_file) thumbnail_path = os.path.join(self.source, thumb_file) + document.storage_type = storage_type + document.filename = generate_filename(document) + + if os.path.isfile(document.source_path): + raise FileExistsError(document.source_path) + + create_source_path_directory(document.source_path) + if settings.PASSPHRASE: with open(document_path, "rb") as unencrypted: @@ -109,18 +122,8 @@ class Command(Renderable, BaseCommand): encrypted.write(GnuPG.encrypted(unencrypted)) else: - + print("Moving {} to {}".format(document_path, document.source_path)) shutil.copy(document_path, document.source_path) shutil.copy(thumbnail_path, document.thumbnail_path) - # Reset the storage type to whatever we've used while importing - - storage_type = Document.STORAGE_TYPE_UNENCRYPTED - if settings.PASSPHRASE: - storage_type = Document.STORAGE_TYPE_GPG - - Document.objects.filter( - pk__in=[r["pk"] for r in self.manifest] - ).update( - storage_type=storage_type - ) + document.save() diff --git a/src/documents/migrations/1002_auto_20201111_1105.py b/src/documents/migrations/1002_auto_20201111_1105.py new file mode 100644 index 000000000..7f6bae50b --- /dev/null +++ b/src/documents/migrations/1002_auto_20201111_1105.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2020-11-11 11:05 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1001_auto_20201109_1636'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='filename', + field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 88598b5f6..ab3262fb5 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -3,18 +3,15 @@ import logging import os import re -from collections import OrderedDict, defaultdict +from collections import OrderedDict import dateutil.parser from django.conf import settings from django.db import models -from django.dispatch import receiver -from django.template.defaultfilters import slugify from django.utils import timezone from django.utils.text import slugify - class MatchingModel(models.Model): MATCH_ANY = 1 @@ -192,7 +189,7 @@ class Document(models.Model): default=timezone.now, editable=False, db_index=True) filename = models.FilePathField( - max_length=256, + max_length=1024, editable=False, default=None, null=True, @@ -220,123 +217,18 @@ class Document(models.Model): return "{}: {}".format(created, self.correspondent or self.title) return str(created) - def find_renamed_document(self, subdirectory=""): - suffix = "%07i.%s" % (self.pk, self.file_type) - - # Append .gpg for encrypted files - if self.storage_type == self.STORAGE_TYPE_GPG: - suffix += ".gpg" - - # Go up in the directory hierarchy and try to delete all directories - root = os.path.normpath(Document.filename_to_path(subdirectory)) - - for filename in os.listdir(root): - if filename.endswith(suffix): - return os.path.join(subdirectory, filename) - - fullname = os.path.join(subdirectory, filename) - if os.path.isdir(Document.filename_to_path(fullname)): - return self.find_renamed_document(fullname) - - return None - - @property - def source_filename(self): - # Initial filename generation (for new documents) - if self.filename is None: - self.filename = self.generate_source_filename() - - # Check if document is still available under filename - elif not os.path.isfile(Document.filename_to_path(self.filename)): - recovered_filename = self.find_renamed_document() - - # If we have found the file so update the filename - if recovered_filename is not None: - logger = logging.getLogger(__name__) - logger.warning("Filename of document " + str(self.id) + - " has changed and was successfully updated") - self.filename = recovered_filename - - # Remove all empty subdirectories from MEDIA_ROOT - Document.delete_all_empty_subdirectories( - Document.filename_to_path("")) - else: - logger = logging.getLogger(__name__) - logger.error("File of document " + str(self.id) + " has " + - "gone and could not be recovered") - - return self.filename - - @staticmethod - def many_to_dictionary(field): - # Converts ManyToManyField to dictionary by assuming, that field - # entries contain an _ or - which will be used as a delimiter - mydictionary = dict() - - for index, t in enumerate(field.all()): - # Populate tag names by index - mydictionary[index] = slugify(t.name) - - # Find delimiter - delimiter = t.name.find('_') - - if delimiter == -1: - delimiter = t.name.find('-') - - if delimiter == -1: - continue - - key = t.name[:delimiter] - value = t.name[delimiter+1:] - - mydictionary[slugify(key)] = slugify(value) - - return mydictionary - - def generate_source_filename(self): - # Create filename based on configured format - if settings.PAPERLESS_FILENAME_FORMAT is not None: - tags = defaultdict(lambda: slugify(None), - self.many_to_dictionary(self.tags)) - path = settings.PAPERLESS_FILENAME_FORMAT.format( - correspondent=slugify(self.correspondent), - title=slugify(self.title), - created=slugify(self.created), - added=slugify(self.added), - tags=tags) - else: - path = "" - - # Always append the primary key to guarantee uniqueness of filename - if len(path) > 0: - filename = "%s-%07i.%s" % (path, self.pk, self.file_type) - else: - filename = "%07i.%s" % (self.pk, self.file_type) - - # Append .gpg for encrypted files - if self.storage_type == self.STORAGE_TYPE_GPG: - filename += ".gpg" - - return filename - - def create_source_directory(self): - new_filename = self.generate_source_filename() - - # Determine the full "target" path - dir_new = Document.filename_to_path(os.path.dirname(new_filename)) - - # Create new path - os.makedirs(dir_new, exist_ok=True) - @property def source_path(self): - return Document.filename_to_path(self.source_filename) + if self.filename: + fname = str(self.filename) + else: + fname = "{:07}.{}".format(self.pk, self.file_type) + if self.storage_type == self.STORAGE_TYPE_GPG: + fname += ".gpg" - @staticmethod - def filename_to_path(filename): return os.path.join( settings.ORIGINALS_DIR, - filename + fname ) @property @@ -362,125 +254,6 @@ class Document(models.Model): def thumbnail_file(self): return open(self.thumbnail_path, "rb") - def set_filename(self, filename): - if os.path.isfile(Document.filename_to_path(filename)): - self.filename = filename - - @staticmethod - def try_delete_empty_directories(directory): - # Go up in the directory hierarchy and try to delete all directories - directory = os.path.normpath(directory) - root = os.path.normpath(Document.filename_to_path("")) - - while directory != root: - # Try to delete the current directory - try: - os.rmdir(directory) - except os.error: - # Directory not empty, no need to go further up - return - - # Cut off actual directory and go one level up - directory, _ = os.path.split(directory) - directory = os.path.normpath(directory) - - @staticmethod - def delete_all_empty_subdirectories(directory): - # Go through all folders and try to delete all directories - root = os.path.normpath(Document.filename_to_path(directory)) - - for filename in os.listdir(root): - fullname = os.path.join(directory, filename) - - if not os.path.isdir(Document.filename_to_path(fullname)): - continue - - # Go into subdirectory to see, if there is more to delete - Document.delete_all_empty_subdirectories( - os.path.join(directory, filename)) - - # Try to delete the directory - try: - os.rmdir(Document.filename_to_path(fullname)) - continue - except os.error: - # Directory not empty, no need to go further up - continue - - -@receiver(models.signals.m2m_changed, sender=Document.tags.through) -@receiver(models.signals.post_save, sender=Document) -def update_filename(sender, instance, **kwargs): - # Skip if document has not been saved yet - if instance.filename is None: - return - - # Check is file exists and update filename otherwise - if not os.path.isfile(Document.filename_to_path(instance.filename)): - instance.filename = instance.source_filename - - # Build the new filename - new_filename = instance.generate_source_filename() - - # If the filename is the same, then nothing needs to be done - if instance.filename == new_filename: - return - - # Determine the full "target" path - path_new = instance.filename_to_path(new_filename) - dir_new = instance.filename_to_path(os.path.dirname(new_filename)) - - # Create new path - instance.create_source_directory() - - # Determine the full "current" path - path_current = instance.filename_to_path(instance.source_filename) - - # Move file - try: - os.rename(path_current, path_new) - except PermissionError: - # Do not update filename in object - return - except FileNotFoundError: - logger = logging.getLogger(__name__) - logger.error("Renaming of document " + str(instance.id) + " failed " + - "as file " + instance.filename + " was no longer present") - return - - # Delete empty directory - old_dir = os.path.dirname(instance.filename) - old_path = instance.filename_to_path(old_dir) - Document.try_delete_empty_directories(old_path) - - instance.filename = new_filename - - # Save instance - # This will not cause a cascade of post_save signals, as next time - # nothing needs to be renamed - instance.save() - - -@receiver(models.signals.post_delete, sender=Document) -def delete_files(sender, instance, **kwargs): - if instance.filename is None: - return - - # Remove the document - old_file = instance.filename_to_path(instance.filename) - - try: - os.remove(old_file) - except FileNotFoundError: - logger = logging.getLogger(__name__) - logger.warning("Deleted document " + str(instance.id) + " but file " + - old_file + " was no longer present") - - # And remove the directory (if applicable) - old_dir = os.path.dirname(instance.filename) - old_path = instance.filename_to_path(old_dir) - Document.try_delete_empty_directories(old_path) - class Log(models.Model): diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index cee1e042b..671cdb104 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -6,9 +6,13 @@ from django.conf import settings from django.contrib.admin.models import ADDITION, LogEntry from django.contrib.auth.models import User from django.contrib.contenttypes.models import ContentType +from django.db import models, DatabaseError +from django.dispatch import receiver from django.utils import timezone from .. import index, matching +from ..file_handling import delete_empty_directories, generate_filename, \ + create_source_path_directory from ..models import Document, Tag @@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs): )).wait() +@receiver(models.signals.post_delete, sender=Document) def cleanup_document_deletion(sender, instance, using, **kwargs): - - if not isinstance(instance, Document): - return - for f in (instance.source_path, instance.thumbnail_path): try: os.unlink(f) except FileNotFoundError: pass # The file's already gone, so we're cool with it. + delete_empty_directories(os.path.dirname(instance.source_path)) + + +@receiver(models.signals.m2m_changed, sender=Document.tags.through) +@receiver(models.signals.post_save, sender=Document) +def update_filename_and_move_files(sender, instance, **kwargs): + + if not instance.filename: + # Can't update the filename if there is not filename to begin with + # This happens after the consumer creates a new document. + # The PK needs to be set first by saving the document once. When this + # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be + # renamed anyway. In all other cases, instance.filename will be set. + return + + old_filename = instance.filename + old_path = instance.source_path + new_filename = generate_filename(instance) + + if new_filename == instance.filename: + # Don't do anything if its the same. + return + + new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) + + if not os.path.isfile(old_path): + # Can't do anything if the old file does not exist anymore. + logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path)) + return + + if os.path.isfile(new_path): + # Can't do anything if the new file already exists. Skip updating file. + logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path)) + return + + create_source_path_directory(new_path) + + try: + os.rename(old_path, new_path) + instance.filename = new_filename + instance.save() + + except OSError as e: + instance.filename = old_filename + except DatabaseError as e: + os.rename(new_path, old_path) + instance.filename = old_filename + + if not os.path.isfile(old_path): + delete_empty_directories(os.path.dirname(old_path)) + def set_log_entry(sender, document=None, logging_group=None, **kwargs): diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index 3b7c757d4..e228acabb 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -10,6 +10,8 @@ from dateutil import tz from django.test import TestCase, override_settings from django.utils.text import slugify + +from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories from ..models import Tag, Document, Correspondent from django.conf import settings @@ -31,18 +33,6 @@ class TestDate(TestCase): for dirname in self.deletion_list: shutil.rmtree(dirname, ignore_errors=True) - @override_settings(PAPERLESS_FILENAME_FORMAT="") - def test_source_filename(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - self.assertEqual(document.source_filename, "0000001.pdf") - - document.filename = "test.pdf" - self.assertEqual(document.source_filename, "test.pdf") - @override_settings(PAPERLESS_FILENAME_FORMAT="") def test_generate_source_filename(self): document = Document() @@ -50,40 +40,40 @@ class TestDate(TestCase): document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - self.assertEqual(document.generate_source_filename(), "0000001.pdf") + self.assertEqual(generate_filename(document), "{:07d}.pdf".format(document.pk)) document.storage_type = Document.STORAGE_TYPE_GPG - self.assertEqual(document.generate_source_filename(), - "0000001.pdf.gpg") + self.assertEqual(generate_filename(document), + "{:07d}.pdf.gpg".format(document.pk)) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming(self): document = Document() document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + # Test default source_path + self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk)) - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") + document.filename = generate_filename(document) + + # Ensure that filename is properly generated + self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) # Enable encryption and check again document.storage_type = Document.STORAGE_TYPE_GPG - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf.gpg") + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf.gpg".format(document.pk)) + document.save() - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), True) + # test that creating dirs for the source_path creates the correct directory + create_source_path_directory(document.source_path) + Path(document.source_path).touch() + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), True) # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -91,14 +81,12 @@ class TestDate(TestCase): document.save() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf.gpg"), True) - self.assertEqual(document.generate_source_filename(), - "test/test-0000001.pdf.gpg") + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/test"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -109,18 +97,18 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") + self.assertEqual(document.source_path, settings.ORIGINALS_DIR + + "/none/none-{:07d}.pdf".format(document.pk)) # Make the folder read- and execute-only (no writing and no renaming) - os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555) + os.chmod(settings.ORIGINALS_DIR + "/none", 0o555) # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -129,11 +117,12 @@ class TestDate(TestCase): # Check proper handling of files self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), True) - self.assertEqual(document.source_filename, - "none/none-0000001.pdf") + "originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) - os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -144,18 +133,20 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Ensure file deletion after delete + pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/none/none-{:07d}.pdf".format(pk)), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -176,12 +167,15 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + create_source_path_directory(document.source_path) + Path(document.source_path).touch() - Path(document.source_path + "test").touch() + important_file = document.source_path + "test" + Path(important_file).touch() # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -193,11 +187,8 @@ class TestDate(TestCase): "/documents/originals/test"), True) self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) + self.assertTrue(os.path.isfile(important_file)) - # Cleanup - os.remove(settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdftest") - os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_underscore(self): @@ -212,13 +203,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_dash(self): @@ -233,13 +219,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_malformed(self): @@ -254,13 +235,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "none-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") def test_tags_all(self): @@ -274,61 +250,24 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") - def test_tags_out_of_bounds_0(self): + @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}") + def test_tags_out_of_bounds(self): document = Document() document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}") - def test_tags_out_of_bounds_10000000(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED + # Add tag to document + document.tags.create(name="demo") document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + self.assertEqual(generate_filename(document), + "none-{:07d}.pdf".format(document.pk)) - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}") - def test_tags_out_of_bounds_99(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}/{correspondent}") @@ -339,153 +278,40 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none/none"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none/none"), True) + pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + - "/documents/originals/none/none/none-0000001.pdf"), + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/none/none/none-{:07d}.pdf".format(pk)), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none/none"), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True) @override_settings(PAPERLESS_FILENAME_FORMAT=None) def test_format_none(self): document = Document() + document.pk = 1 document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - self.assertEqual(document.generate_source_filename(), "0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_renamed(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") - - # Rename the document "illegaly" - os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test") - os.rename(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf", - settings.MEDIA_ROOT + "/documents/originals/" + - "test/test-0000001.pdf") - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf"), True) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), False) - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/foo/foo-0000001.pdf"), True) - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/foo"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), False) - self.assertEqual(document.generate_source_filename(), - "foo/foo-0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_renamed_encrypted(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_GPG - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf.gpg") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf.gpg") - - # Rename the document "illegaly" - os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test") - os.rename(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf.gpg", - settings.MEDIA_ROOT + "/documents/originals/" + - "test/test-0000001.pdf.gpg") - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf.gpg"), True) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), False) - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/foo/foo-0000001.pdf.gpg"), True) - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/foo"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), False) - self.assertEqual(document.generate_source_filename(), - "foo/foo-0000001.pdf.gpg") - - def test_delete_all_empty_subdirectories(self): - # Create our working directory - tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) - os.makedirs(tmp) - self.add_to_deletion_list(tmp) - - os.makedirs(os.path.join(tmp, "empty")) - os.makedirs(os.path.join(tmp, "empty", "subdirectory")) - - os.makedirs(os.path.join(tmp, "notempty")) - Path(os.path.join(tmp, "notempty", "file")).touch() - - Document.delete_all_empty_subdirectories(tmp) - - self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) - self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False) - self.assertEqual(os.path.isfile( - os.path.join(tmp, "notempty", "file")), True) + self.assertEqual(generate_filename(document), "0000001.pdf") def test_try_delete_empty_directories(self): # Create our working directory - tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) + tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") os.makedirs(tmp) self.add_to_deletion_list(tmp) @@ -493,67 +319,10 @@ class TestDate(TestCase): Path(os.path.join(tmp, "notempty", "file")).touch() os.makedirs(os.path.join(tmp, "notempty", "empty")) - Document.try_delete_empty_directories( + delete_empty_directories( os.path.join(tmp, "notempty", "empty")) self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) self.assertEqual(os.path.isfile( os.path.join(tmp, "notempty", "file")), True) self.assertEqual(os.path.isdir( os.path.join(tmp, "notempty", "empty")), False) - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_accidentally_deleted(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") - - # Delete the document "illegaly" - os.remove(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf") - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), True) - self.assertEqual(document.source_filename, - "none/none-0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_set_filename(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Set existing filename - document.set_filename(tmp) - self.assertEqual(document.source_filename, "none/none-0000001.pdf") - - # Set non-existing filename - document.set_filename("doesnotexist") - self.assertEqual(document.source_filename, "none/none-0000001.pdf") From 312b0034bd21cafbfbb61d5c45453751d0478b33 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:38:41 +0100 Subject: [PATCH 008/101] test database errors. --- src/documents/tests/test_file_handling.py | 50 +++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index e228acabb..18fd327b1 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -1,20 +1,16 @@ -import datetime import os import shutil -from unittest import mock from uuid import uuid4 from pathlib import Path -from shutil import rmtree -from dateutil import tz from django.test import TestCase, override_settings -from django.utils.text import slugify - from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories -from ..models import Tag, Document, Correspondent +from ..models import Document, Correspondent from django.conf import settings +from ..signals.handlers import update_filename_and_move_files + class TestDate(TestCase): deletion_list = [] @@ -123,6 +119,46 @@ class TestDate(TestCase): os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + + "{correspondent}") + def test_file_renaming_database_error(self): + + document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") + + document = Document() + document.file_type = "pdf" + document.checksum = "BBBBB" + document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED + document.save() + + # Ensure that filename is properly generated + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) + Path(document.source_path).touch() + + # Test source_path + self.assertTrue(os.path.isfile(document.source_path)) + + # Set a correspondent and save the document + document.correspondent = Correspondent.objects.get_or_create( + name="test")[0] + + # This will cause save() to fail. + document.checksum = document1.checksum + + # Assume saving the document initially works, this gets called. + # After renaming, an error occurs, and filename is not saved: + # document should still be available at document.filename. + update_filename_and_move_files(None, document) + + # Check proper handling of files + self.assertTrue(os.path.isfile(document.source_path)) + self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + + "originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") From ee6942989843ad71311ec968de7be68b3ee2aea4 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 15:58:29 +0100 Subject: [PATCH 009/101] show the filename in the admin. --- src/documents/admin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 6ac949a45..0f63253ce 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -32,7 +32,7 @@ class TagAdmin(admin.ModelAdmin): list_filter = ("colour", "matching_algorithm") list_editable = ("colour", "match", "matching_algorithm") - readonly_fields = ("slug",) + readonly_fields = ("slug", ) class DocumentTypeAdmin(admin.ModelAdmin): @@ -51,9 +51,9 @@ class DocumentTypeAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin): search_fields = ("correspondent__name", "title", "content", "tags__name") - readonly_fields = ("added", "file_type", "storage_type",) + readonly_fields = ("added", "file_type", "storage_type", "filename") list_display = ("title", "created", "added", "correspondent", - "tags_", "archive_serial_number", "document_type") + "tags_", "archive_serial_number", "document_type", "filename") list_filter = ( "document_type", "tags", From 2436ff143f2b90d3af05e116642ca67e60753593 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:19:57 +0100 Subject: [PATCH 010/101] Frontend: CSRF support --- src-ui/package-lock.json | 8 +++++ src-ui/package.json | 1 + src-ui/src/app/app.module.ts | 9 +++++- .../app/interceptors/csrf.interceptor.spec.ts | 16 ++++++++++ .../src/app/interceptors/csrf.interceptor.ts | 30 +++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 src-ui/src/app/interceptors/csrf.interceptor.spec.ts create mode 100644 src-ui/src/app/interceptors/csrf.interceptor.ts diff --git a/src-ui/package-lock.json b/src-ui/package-lock.json index 45b1d2d6d..b6b66e1c6 100644 --- a/src-ui/package-lock.json +++ b/src-ui/package-lock.json @@ -8260,6 +8260,14 @@ "moment": "2.18.1" } }, + "ngx-cookie-service": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/ngx-cookie-service/-/ngx-cookie-service-10.1.1.tgz", + "integrity": "sha512-HvBrYHdxMN1NvFJGEIF/8EuAg2fjxj8QwqTv9h6qZGqNLU+lUba8Pb2zRPw1YA+gqKkJawOy5dYNeH0kyPyipw==", + "requires": { + "tslib": "^2.0.0" + } + }, "ngx-file-drop": { "version": "10.0.0", "resolved": "https://registry.npmjs.org/ngx-file-drop/-/ngx-file-drop-10.0.0.tgz", diff --git a/src-ui/package.json b/src-ui/package.json index a9e909155..af3334db9 100644 --- a/src-ui/package.json +++ b/src-ui/package.json @@ -23,6 +23,7 @@ "@ng-bootstrap/ng-bootstrap": "^8.0.0", "bootstrap": "^4.5.0", "ng-bootstrap": "^1.6.3", + "ngx-cookie-service": "^10.1.1", "ngx-file-drop": "^10.0.0", "ngx-infinite-scroll": "^9.1.0", "rxjs": "~6.6.0", diff --git a/src-ui/src/app/app.module.ts b/src-ui/src/app/app.module.ts index dad57280d..014279cc5 100644 --- a/src-ui/src/app/app.module.ts +++ b/src-ui/src/app/app.module.ts @@ -39,6 +39,8 @@ import { InfiniteScrollModule } from 'ngx-infinite-scroll'; import { DateTimeComponent } from './components/common/input/date-time/date-time.component'; import { TagsComponent } from './components/common/input/tags/tags.component'; import { SortableDirective } from './directives/sortable.directive'; +import { CookieService } from 'ngx-cookie-service'; +import { CsrfInterceptor } from './interceptors/csrf.interceptor'; @NgModule({ declarations: [ @@ -85,7 +87,12 @@ import { SortableDirective } from './directives/sortable.directive'; InfiniteScrollModule ], providers: [ - DatePipe + DatePipe, + CookieService, { + provide: HTTP_INTERCEPTORS, + useClass: CsrfInterceptor, + multi: true + } ], bootstrap: [AppComponent] }) diff --git a/src-ui/src/app/interceptors/csrf.interceptor.spec.ts b/src-ui/src/app/interceptors/csrf.interceptor.spec.ts new file mode 100644 index 000000000..64e20c110 --- /dev/null +++ b/src-ui/src/app/interceptors/csrf.interceptor.spec.ts @@ -0,0 +1,16 @@ +import { TestBed } from '@angular/core/testing'; + +import { CsrfInterceptor } from './csrf.interceptor'; + +describe('CsrfInterceptor', () => { + beforeEach(() => TestBed.configureTestingModule({ + providers: [ + CsrfInterceptor + ] + })); + + it('should be created', () => { + const interceptor: CsrfInterceptor = TestBed.inject(CsrfInterceptor); + expect(interceptor).toBeTruthy(); + }); +}); diff --git a/src-ui/src/app/interceptors/csrf.interceptor.ts b/src-ui/src/app/interceptors/csrf.interceptor.ts new file mode 100644 index 000000000..32f3e99dc --- /dev/null +++ b/src-ui/src/app/interceptors/csrf.interceptor.ts @@ -0,0 +1,30 @@ +import { Injectable } from '@angular/core'; +import { + HttpRequest, + HttpHandler, + HttpEvent, + HttpInterceptor +} from '@angular/common/http'; +import { Observable } from 'rxjs'; +import { CookieService } from 'ngx-cookie-service'; + +@Injectable() +export class CsrfInterceptor implements HttpInterceptor { + + constructor(private cookieService: CookieService) { + + } + + intercept(request: HttpRequest, next: HttpHandler): Observable> { + let csrfToken = this.cookieService.get('csrftoken') + if (csrfToken) { + request = request.clone({ + setHeaders: { + 'X-CSRFToken': csrfToken + } + }) + } + + return next.handle(request); + } +} From 024fcde9de241d881bab20358ec2cf0015f1c470 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:23:49 +0100 Subject: [PATCH 011/101] a handy script that brings up postgres and redis --- scripts/start_services.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 scripts/start_services.sh diff --git a/scripts/start_services.sh b/scripts/start_services.sh new file mode 100755 index 000000000..e566f59b3 --- /dev/null +++ b/scripts/start_services.sh @@ -0,0 +1,2 @@ +docker run -p 5432:5432 -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13 +docker run -d -p 6379:6379 redis:latest From bc4f32fd98f50353838f32d35f0d5dc8ac361816 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:24:04 +0100 Subject: [PATCH 012/101] enable Group and User management. --- src/documents/admin.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 0f63253ce..51096d860 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -120,8 +120,3 @@ admin.site.register(Tag, TagAdmin) admin.site.register(DocumentType, DocumentTypeAdmin) admin.site.register(Document, DocumentAdmin) admin.site.register(Log, LogAdmin) - - -# Unless we implement multi-user, these default registrations don't make sense. -admin.site.unregister(Group) -admin.site.unregister(User) From b14fd52b8b4b9c7a18ed21fee25805512ac3e9dd Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 09:23:57 +0100 Subject: [PATCH 013/101] fixes bug with the + button --- src-ui/src/app/components/common/input/tags/tags.component.ts | 2 +- .../components/document-detail/document-detail.component.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src-ui/src/app/components/common/input/tags/tags.component.ts b/src-ui/src/app/components/common/input/tags/tags.component.ts index 7b5a36e90..dd57d8e50 100644 --- a/src-ui/src/app/components/common/input/tags/tags.component.ts +++ b/src-ui/src/app/components/common/input/tags/tags.component.ts @@ -86,7 +86,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor { var modal = this.modalService.open(TagEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newTag => { - this.tagService.list().subscribe(tags => { + this.tagService.listAll().subscribe(tags => { this.tags = tags.results this.addTag(newTag.id) }) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index 802a3b212..634b28613 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -89,7 +89,7 @@ export class DocumentDetailComponent implements OnInit { var modal = this.modalService.open(DocumentTypeEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newDocumentType => { - this.documentTypeService.list().subscribe(documentTypes => { + this.documentTypeService.listAll().subscribe(documentTypes => { this.documentTypes = documentTypes.results this.documentForm.get('document_type_id').setValue(newDocumentType.id) }) @@ -100,7 +100,7 @@ export class DocumentDetailComponent implements OnInit { var modal = this.modalService.open(CorrespondentEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newCorrespondent => { - this.correspondentService.list().subscribe(correspondents => { + this.correspondentService.listAll().subscribe(correspondents => { this.correspondents = correspondents.results this.documentForm.get('correspondent_id').setValue(newCorrespondent.id) }) From af254fb9704f2558cd26194cd447eb1c3727f061 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:47:48 +0100 Subject: [PATCH 014/101] adjusted a couple things in the docker compose file. --- docker-compose.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index f9b4d6c33..54f293f90 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,12 +1,12 @@ version: "3.4" services: broker: - image: redis:latest - #restart: always + image: redis:6.0 + restart: always db: image: postgres:13 - #restart: always + restart: always volumes: - pgdata:/var/lib/postgresql/data environment: @@ -16,9 +16,10 @@ services: webserver: image: paperless-ng:latest - #restart: always + restart: always depends_on: - db + - broker ports: - 8000:8000 healthcheck: From 917ee62f81847cac6d46f124dfc61dce966b7da9 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 09:30:04 +0100 Subject: [PATCH 015/101] fixes #30 --- .../management/commands/document_consumer.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 93ad6947c..a90fd53ed 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -20,8 +20,21 @@ class Handler(FileSystemEventHandler): def __init__(self, consumer): self.consumer = consumer + def _consume(self, file): + if os.path.isfile(file): + try: + self.consumer.try_consume_file(file) + except Exception as e: + logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) + def on_created(self, event): - self.consumer.try_consume_file(event.src_path) + self._consume(event.src_path) + + def on_modified(self, event): + self._consume(event.src_path) + + def on_moved(self, event): + self._consume(event.src_path) class Command(BaseCommand): From 1eb76a1827f7421062050dbb37816d75e638c30f Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:01:22 +0100 Subject: [PATCH 016/101] fixes #35 --- src/paperless/settings.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 06dfdcd84..38721c00f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -13,6 +13,17 @@ elif os.path.exists("/etc/paperless.conf"): elif os.path.exists("/usr/local/etc/paperless.conf"): load_dotenv("/usr/local/etc/paperless.conf") +# There are multiple levels of concurrency in paperless: +# - Multiple consumers may be run in parallel. +# - Each consumer may process multiple pages in parallel. +# - Each Tesseract OCR run may spawn multiple threads to process a single page +# slightly faster. +# The performance gains from having tesseract use multiple threads are minimal. +# However, when multiple pages are processed in parallel, the total number of +# OCR threads may exceed the number of available cpu cores, which will +# dramatically slow down the consumption process. This settings limits each +# Tesseract process to one thread. +os.environ['OMP_THREAD_LIMIT'] = "1" def __get_boolean(key, default="NO"): """ From fe6568e3b3422fb58230f7e329b48a18e0d65b2d Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:04:01 +0100 Subject: [PATCH 017/101] fixes #31 --- src/documents/classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 6c90536b0..4ba538162 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -120,8 +120,8 @@ class DocumentClassifier(object): num_tags = len(labels_tags_unique) # substract 1 since -1 (null) is also part of the classes. - num_correspondents = len(labels_correspondent) - 1 - num_document_types = len(labels_document_type) - 1 + num_correspondents = len(set(labels_correspondent)) - 1 + num_document_types = len(set(labels_document_type)) - 1 logging.getLogger(__name__).debug( "{} documents, {} tag(s), {} correspondent(s), " From b2019ff262217fab4895b99513c254ac6f64f656 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:05:19 +0100 Subject: [PATCH 018/101] fixes log on windows --- src-ui/src/app/components/manage/logs/logs.component.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src-ui/src/app/components/manage/logs/logs.component.ts b/src-ui/src/app/components/manage/logs/logs.component.ts index da507cbe5..f80aada21 100644 --- a/src-ui/src/app/components/manage/logs/logs.component.ts +++ b/src-ui/src/app/components/manage/logs/logs.component.ts @@ -30,7 +30,7 @@ export class LogsComponent implements OnInit { onScroll() { let lastCreated = null if (this.logs.length > 0) { - lastCreated = this.logs[this.logs.length-1].created + lastCreated = new Date(this.logs[this.logs.length-1].created).toISOString() } this.logService.list(1, 25, 'created', 'des', {'created__lt': lastCreated, 'level__gte': this.level}).subscribe(result => { this.logs.push(...result.results) From 09651e001197e00d67eb97a84be3c48665ebd574 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:41:47 +0100 Subject: [PATCH 019/101] on_modified not needed for the consumer. --- src/documents/management/commands/document_consumer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index a90fd53ed..ea6e033ba 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -25,14 +25,12 @@ class Handler(FileSystemEventHandler): try: self.consumer.try_consume_file(file) except Exception as e: + # Catch all so that the consumer won't crash. logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) def on_created(self, event): self._consume(event.src_path) - def on_modified(self, event): - self._consume(event.src_path) - def on_moved(self, event): self._consume(event.src_path) From adc217e6fd562a949811270709eb9ec2b8f4cbe8 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:42:18 +0100 Subject: [PATCH 020/101] This is how the original filenames were generated. Keep it this way for compatibility. --- src/documents/file_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index cac317d4c..22d5a052d 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -72,7 +72,7 @@ def generate_filename(document): path = settings.PAPERLESS_FILENAME_FORMAT.format( correspondent=slugify(document.correspondent), title=slugify(document.title), - created=document.created.date(), + created=slugify(document.created), added=slugify(document.added), tags=tags, ) From 6377a3758d4ea60a351a915c881495cde019d451 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 11:11:57 +0100 Subject: [PATCH 021/101] fixes #12 --- .../app-frame/app-frame.component.html | 8 ++++++++ .../components/app-frame/app-frame.component.ts | 17 ++++++++++++++++- .../src/app/services/open-documents.service.ts | 13 +++++++++++-- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src-ui/src/app/components/app-frame/app-frame.component.html b/src-ui/src/app/components/app-frame/app-frame.component.html index 0b18777ef..7879e2dcb 100644 --- a/src-ui/src/app/components/app-frame/app-frame.component.html +++ b/src-ui/src/app/components/app-frame/app-frame.component.html @@ -69,6 +69,14 @@ {{d.title}} +