From c3756d2237fb33a70670d78a4c5eb039c260b9b8 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Thu, 29 Oct 2020 22:36:24 +0100 Subject: [PATCH 001/101] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ef90456a8..b5755d80f 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents. This project extends on the project and modernizes many things. +This project is still under development. There also is no automatic way yet to migrate your current paperless setup to this version. I'm working on that. + # How it Works Paperless does not control your scanner, it only helps you deal with what your scanner produces. From 9c619d7ce60481412cbb46dd772e237dc3d9ec25 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Wed, 4 Nov 2020 19:41:24 +0100 Subject: [PATCH 002/101] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a4edbb87b..ce4277642 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ +# Paperless-ng + [Paperless](https://github.com/the-paperless-project/paperless) is an application by Daniel Quinn and others that indexes your scanned documents and allows you to easily search for documents and store metadata alongside your documents. Paperless-ng is a fork of the original project, adding a new interface and many other changes under the hood. For a detailed list of changes, see below. This project is still in development and some things may not work as expected. -This project is still under development. There also is no automatic way yet to migrate your current paperless setup to this version. I'm working on that. - # How it Works Paperless does not control your scanner, it only helps you deal with what your scanner produces. From 1ddbf416d4bd89ce9658ae42d5e0451f1675ca60 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Tue, 10 Nov 2020 01:47:35 +0100 Subject: [PATCH 003/101] fixed an issue with the searcher. --- src/documents/index.py | 9 +++++++-- src/documents/views.py | 13 ++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index a099f670c..82a35a63e 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,4 +1,5 @@ import logging +from contextlib import contextmanager from django.db import models from django.dispatch import receiver @@ -99,15 +100,19 @@ def remove_document_from_index(document): remove_document(writer, document) +@contextmanager def query_page(ix, query, page): - with ix.searcher() as searcher: + searcher = ix.searcher() + try: query_parser = MultifieldParser(["content", "title", "correspondent"], ix.schema).parse(query) result_page = searcher.search_page(query_parser, page) result_page.results.fragmenter = highlight.ContextFragmenter( surround=50) result_page.results.formatter = JsonFormatter() - return result_page + yield result_page + finally: + searcher.close() def autocomplete(ix, term, limit=10): diff --git a/src/documents/views.py b/src/documents/views.py index b3d6012f1..8cc330141 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -191,13 +191,12 @@ class SearchView(APIView): except (ValueError, TypeError): page = 1 - result_page = index.query_page(self.ix, query, page) - - return Response( - {'count': len(result_page), - 'page': result_page.pagenum, - 'page_count': result_page.pagecount, - 'results': list(map(self.add_infos_to_hit, result_page))}) + with index.query_page(self.ix, query, page) as result_page: + return Response( + {'count': len(result_page), + 'page': result_page.pagenum, + 'page_count': result_page.pagecount, + 'results': list(map(self.add_infos_to_hit, result_page))}) else: return Response({ From 3048342de79635adab980f8e435f81b77b60d750 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Tue, 10 Nov 2020 01:47:58 +0100 Subject: [PATCH 004/101] added a setting: delete duplicate documents --- paperless.conf.example | 4 ++++ src/documents/consumer.py | 2 ++ src/paperless/settings.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/paperless.conf.example b/paperless.conf.example index 48df40ab2..1c62256ab 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -143,6 +143,10 @@ PAPERLESS_EMAIL_SECRET="" #### Software Tweaks #### ############################################################################### +# When the consumer detects a duplicate document, it will not touch the +# original document. This default behavior can be changed here. +#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false" + # After a document is consumed, Paperless can trigger an arbitrary script if # you like. This script will be passed a number of arguments for you to work # with. The default is blank, which means nothing will be executed. For more diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f61d11136..75e6f6120 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -84,6 +84,8 @@ class Consumer: "warning", "Skipping {} as it appears to be a duplicate".format(doc) ) + if settings.CONSUMER_DELETE_DUPLICATES: + self._cleanup_doc(doc) return False self.log("info", "Consuming {}".format(doc)) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 2c96350dc..06dfdcd84 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -258,6 +258,8 @@ Q_CLUSTER = { # Paperless Specific Settings # ############################################################################### +CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") + # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") From e8d4696c71989d12801e38da3e11a74d0ac36906 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:13:54 +0100 Subject: [PATCH 005/101] remove only automatically created schedules (almost) --- src/documents/migrations/1001_auto_20201109_1636.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/migrations/1001_auto_20201109_1636.py b/src/documents/migrations/1001_auto_20201109_1636.py index 8d6a0f584..138de6f91 100644 --- a/src/documents/migrations/1001_auto_20201109_1636.py +++ b/src/documents/migrations/1001_auto_20201109_1636.py @@ -13,7 +13,9 @@ def add_schedules(apps, schema_editor): def remove_schedules(apps, schema_editor): - Schedule.objects.all().delete() + Schedule.objects.filter(func='documents.tasks.train_classifier').delete() + Schedule.objects.filter(func='documents.tasks.index_optimize').delete() + Schedule.objects.filter(func='documents.tasks.consume_mail').delete() class Migration(migrations.Migration): From a91e46364a63a474d393c74445813aa6e367783a Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:14:21 +0100 Subject: [PATCH 006/101] small consumer fixes --- src/documents/consumer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 75e6f6120..96aad7d49 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -60,7 +60,6 @@ class Consumer: raise ConsumerError( "Consumption directory {} does not exist".format(self.consume)) - def log(self, level, message): getattr(self.logger, level)(message, extra={ "group": self.logging_group @@ -98,7 +97,6 @@ class Consumer: else: self.log("info", "Parser: {}".format(parser_class.__name__)) - document_consumption_started.send( sender=self.__class__, filename=doc, @@ -110,9 +108,10 @@ class Consumer: try: self.log("info", "Generating thumbnail for {}...".format(doc)) thumbnail = document_parser.get_optimised_thumbnail() + text = document_parser.get_text() date = document_parser.get_date() document = self._store( - document_parser.get_text(), + text, doc, thumbnail, date From 8b8a2af053ef013386690b833d90eeec28163bf5 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:21:33 +0100 Subject: [PATCH 007/101] fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. --- src/documents/apps.py | 3 - src/documents/consumer.py | 8 +- src/documents/file_handling.py | 92 ++++ .../management/commands/document_importer.py | 27 +- .../migrations/1002_auto_20201111_1105.py | 18 + src/documents/models.py | 245 +---------- src/documents/signals/handlers.py | 60 ++- src/documents/tests/test_file_handling.py | 411 ++++-------------- 8 files changed, 287 insertions(+), 577 deletions(-) create mode 100644 src/documents/file_handling.py create mode 100644 src/documents/migrations/1002_auto_20201111_1105.py diff --git a/src/documents/apps.py b/src/documents/apps.py index 83e671d07..6cf815122 100644 --- a/src/documents/apps.py +++ b/src/documents/apps.py @@ -14,7 +14,6 @@ class DocumentsConfig(AppConfig): add_inbox_tags, run_pre_consume_script, run_post_consume_script, - cleanup_document_deletion, set_log_entry, set_correspondent, set_document_type, @@ -33,6 +32,4 @@ class DocumentsConfig(AppConfig): document_consumption_finished.connect(add_to_index) document_consumption_finished.connect(run_post_consume_script) - post_delete.connect(cleanup_document_deletion) - AppConfig.ready(self) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 96aad7d49..2e8c5493f 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -11,6 +11,7 @@ from django.utils import timezone from paperless.db import GnuPG from .classifier import DocumentClassifier, IncompatibleClassifierVersionError +from .file_handling import generate_filename, create_source_path_directory from .models import Document, FileInfo from .parsers import ParseError, get_parser_class from .signals import ( @@ -174,10 +175,15 @@ class Consumer: self.log("debug", "Tagging with {}".format(tag_names)) document.tags.add(*relevant_tags) + document.filename = generate_filename(document) + + create_source_path_directory(document.source_path) + self._write(document, doc, document.source_path) self._write(document, thumbnail, document.thumbnail_path) - #TODO: why do we need to save the document again? + # We need to save the document twice, since we need the PK of the + # document in order to create its filename above. document.save() return document diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py new file mode 100644 index 000000000..cac317d4c --- /dev/null +++ b/src/documents/file_handling.py @@ -0,0 +1,92 @@ +import os +from collections import defaultdict + +from django.conf import settings +from django.template.defaultfilters import slugify + + +def create_source_path_directory(source_path): + os.makedirs(os.path.dirname(source_path), exist_ok=True) + + +def delete_empty_directories(directory): + # Go up in the directory hierarchy and try to delete all directories + directory = os.path.normpath(directory) + root = os.path.normpath(settings.ORIGINALS_DIR) + + if not directory.startswith(root + os.path.sep): + # don't do anything outside our originals folder. + + # append os.path.set so that we avoid these cases: + # directory = /home/originals2/test + # root = /home/originals ("/" gets appended and startswith fails) + return + + while directory != root: + if not os.listdir(directory): + # it's empty + try: + os.rmdir(directory) + except OSError: + # whatever. empty directories aren't that bad anyway. + return + else: + # it's not empty. + return + + # go one level up + directory = os.path.normpath(os.path.dirname(directory)) + + +def many_to_dictionary(field): + # Converts ManyToManyField to dictionary by assuming, that field + # entries contain an _ or - which will be used as a delimiter + mydictionary = dict() + + for index, t in enumerate(field.all()): + # Populate tag names by index + mydictionary[index] = slugify(t.name) + + # Find delimiter + delimiter = t.name.find('_') + + if delimiter == -1: + delimiter = t.name.find('-') + + if delimiter == -1: + continue + + key = t.name[:delimiter] + value = t.name[delimiter + 1:] + + mydictionary[slugify(key)] = slugify(value) + + return mydictionary + + +def generate_filename(document): + # Create filename based on configured format + if settings.PAPERLESS_FILENAME_FORMAT is not None: + tags = defaultdict(lambda: slugify(None), + many_to_dictionary(document.tags)) + path = settings.PAPERLESS_FILENAME_FORMAT.format( + correspondent=slugify(document.correspondent), + title=slugify(document.title), + created=document.created.date(), + added=slugify(document.added), + tags=tags, + ) + else: + path = "" + + # Always append the primary key to guarantee uniqueness of filename + if len(path) > 0: + filename = "%s-%07i.%s" % (path, document.pk, document.file_type) + else: + filename = "%07i.%s" % (document.pk, document.file_type) + + # Append .gpg for encrypted files + if document.storage_type == document.STORAGE_TYPE_GPG: + filename += ".gpg" + + return filename diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index ae5c1853f..ef3eaafc0 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -8,6 +8,7 @@ from django.core.management import call_command from documents.models import Document from paperless.db import GnuPG +from ...file_handling import generate_filename, create_source_path_directory from ...mixins import Renderable @@ -82,6 +83,10 @@ class Command(Renderable, BaseCommand): def _import_files_from_manifest(self): + storage_type = Document.STORAGE_TYPE_UNENCRYPTED + if settings.PASSPHRASE: + storage_type = Document.STORAGE_TYPE_GPG + for record in self.manifest: if not record["model"] == "documents.document": @@ -94,6 +99,14 @@ class Command(Renderable, BaseCommand): document_path = os.path.join(self.source, doc_file) thumbnail_path = os.path.join(self.source, thumb_file) + document.storage_type = storage_type + document.filename = generate_filename(document) + + if os.path.isfile(document.source_path): + raise FileExistsError(document.source_path) + + create_source_path_directory(document.source_path) + if settings.PASSPHRASE: with open(document_path, "rb") as unencrypted: @@ -109,18 +122,8 @@ class Command(Renderable, BaseCommand): encrypted.write(GnuPG.encrypted(unencrypted)) else: - + print("Moving {} to {}".format(document_path, document.source_path)) shutil.copy(document_path, document.source_path) shutil.copy(thumbnail_path, document.thumbnail_path) - # Reset the storage type to whatever we've used while importing - - storage_type = Document.STORAGE_TYPE_UNENCRYPTED - if settings.PASSPHRASE: - storage_type = Document.STORAGE_TYPE_GPG - - Document.objects.filter( - pk__in=[r["pk"] for r in self.manifest] - ).update( - storage_type=storage_type - ) + document.save() diff --git a/src/documents/migrations/1002_auto_20201111_1105.py b/src/documents/migrations/1002_auto_20201111_1105.py new file mode 100644 index 000000000..7f6bae50b --- /dev/null +++ b/src/documents/migrations/1002_auto_20201111_1105.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2020-11-11 11:05 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1001_auto_20201109_1636'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='filename', + field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 88598b5f6..ab3262fb5 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -3,18 +3,15 @@ import logging import os import re -from collections import OrderedDict, defaultdict +from collections import OrderedDict import dateutil.parser from django.conf import settings from django.db import models -from django.dispatch import receiver -from django.template.defaultfilters import slugify from django.utils import timezone from django.utils.text import slugify - class MatchingModel(models.Model): MATCH_ANY = 1 @@ -192,7 +189,7 @@ class Document(models.Model): default=timezone.now, editable=False, db_index=True) filename = models.FilePathField( - max_length=256, + max_length=1024, editable=False, default=None, null=True, @@ -220,123 +217,18 @@ class Document(models.Model): return "{}: {}".format(created, self.correspondent or self.title) return str(created) - def find_renamed_document(self, subdirectory=""): - suffix = "%07i.%s" % (self.pk, self.file_type) - - # Append .gpg for encrypted files - if self.storage_type == self.STORAGE_TYPE_GPG: - suffix += ".gpg" - - # Go up in the directory hierarchy and try to delete all directories - root = os.path.normpath(Document.filename_to_path(subdirectory)) - - for filename in os.listdir(root): - if filename.endswith(suffix): - return os.path.join(subdirectory, filename) - - fullname = os.path.join(subdirectory, filename) - if os.path.isdir(Document.filename_to_path(fullname)): - return self.find_renamed_document(fullname) - - return None - - @property - def source_filename(self): - # Initial filename generation (for new documents) - if self.filename is None: - self.filename = self.generate_source_filename() - - # Check if document is still available under filename - elif not os.path.isfile(Document.filename_to_path(self.filename)): - recovered_filename = self.find_renamed_document() - - # If we have found the file so update the filename - if recovered_filename is not None: - logger = logging.getLogger(__name__) - logger.warning("Filename of document " + str(self.id) + - " has changed and was successfully updated") - self.filename = recovered_filename - - # Remove all empty subdirectories from MEDIA_ROOT - Document.delete_all_empty_subdirectories( - Document.filename_to_path("")) - else: - logger = logging.getLogger(__name__) - logger.error("File of document " + str(self.id) + " has " + - "gone and could not be recovered") - - return self.filename - - @staticmethod - def many_to_dictionary(field): - # Converts ManyToManyField to dictionary by assuming, that field - # entries contain an _ or - which will be used as a delimiter - mydictionary = dict() - - for index, t in enumerate(field.all()): - # Populate tag names by index - mydictionary[index] = slugify(t.name) - - # Find delimiter - delimiter = t.name.find('_') - - if delimiter == -1: - delimiter = t.name.find('-') - - if delimiter == -1: - continue - - key = t.name[:delimiter] - value = t.name[delimiter+1:] - - mydictionary[slugify(key)] = slugify(value) - - return mydictionary - - def generate_source_filename(self): - # Create filename based on configured format - if settings.PAPERLESS_FILENAME_FORMAT is not None: - tags = defaultdict(lambda: slugify(None), - self.many_to_dictionary(self.tags)) - path = settings.PAPERLESS_FILENAME_FORMAT.format( - correspondent=slugify(self.correspondent), - title=slugify(self.title), - created=slugify(self.created), - added=slugify(self.added), - tags=tags) - else: - path = "" - - # Always append the primary key to guarantee uniqueness of filename - if len(path) > 0: - filename = "%s-%07i.%s" % (path, self.pk, self.file_type) - else: - filename = "%07i.%s" % (self.pk, self.file_type) - - # Append .gpg for encrypted files - if self.storage_type == self.STORAGE_TYPE_GPG: - filename += ".gpg" - - return filename - - def create_source_directory(self): - new_filename = self.generate_source_filename() - - # Determine the full "target" path - dir_new = Document.filename_to_path(os.path.dirname(new_filename)) - - # Create new path - os.makedirs(dir_new, exist_ok=True) - @property def source_path(self): - return Document.filename_to_path(self.source_filename) + if self.filename: + fname = str(self.filename) + else: + fname = "{:07}.{}".format(self.pk, self.file_type) + if self.storage_type == self.STORAGE_TYPE_GPG: + fname += ".gpg" - @staticmethod - def filename_to_path(filename): return os.path.join( settings.ORIGINALS_DIR, - filename + fname ) @property @@ -362,125 +254,6 @@ class Document(models.Model): def thumbnail_file(self): return open(self.thumbnail_path, "rb") - def set_filename(self, filename): - if os.path.isfile(Document.filename_to_path(filename)): - self.filename = filename - - @staticmethod - def try_delete_empty_directories(directory): - # Go up in the directory hierarchy and try to delete all directories - directory = os.path.normpath(directory) - root = os.path.normpath(Document.filename_to_path("")) - - while directory != root: - # Try to delete the current directory - try: - os.rmdir(directory) - except os.error: - # Directory not empty, no need to go further up - return - - # Cut off actual directory and go one level up - directory, _ = os.path.split(directory) - directory = os.path.normpath(directory) - - @staticmethod - def delete_all_empty_subdirectories(directory): - # Go through all folders and try to delete all directories - root = os.path.normpath(Document.filename_to_path(directory)) - - for filename in os.listdir(root): - fullname = os.path.join(directory, filename) - - if not os.path.isdir(Document.filename_to_path(fullname)): - continue - - # Go into subdirectory to see, if there is more to delete - Document.delete_all_empty_subdirectories( - os.path.join(directory, filename)) - - # Try to delete the directory - try: - os.rmdir(Document.filename_to_path(fullname)) - continue - except os.error: - # Directory not empty, no need to go further up - continue - - -@receiver(models.signals.m2m_changed, sender=Document.tags.through) -@receiver(models.signals.post_save, sender=Document) -def update_filename(sender, instance, **kwargs): - # Skip if document has not been saved yet - if instance.filename is None: - return - - # Check is file exists and update filename otherwise - if not os.path.isfile(Document.filename_to_path(instance.filename)): - instance.filename = instance.source_filename - - # Build the new filename - new_filename = instance.generate_source_filename() - - # If the filename is the same, then nothing needs to be done - if instance.filename == new_filename: - return - - # Determine the full "target" path - path_new = instance.filename_to_path(new_filename) - dir_new = instance.filename_to_path(os.path.dirname(new_filename)) - - # Create new path - instance.create_source_directory() - - # Determine the full "current" path - path_current = instance.filename_to_path(instance.source_filename) - - # Move file - try: - os.rename(path_current, path_new) - except PermissionError: - # Do not update filename in object - return - except FileNotFoundError: - logger = logging.getLogger(__name__) - logger.error("Renaming of document " + str(instance.id) + " failed " + - "as file " + instance.filename + " was no longer present") - return - - # Delete empty directory - old_dir = os.path.dirname(instance.filename) - old_path = instance.filename_to_path(old_dir) - Document.try_delete_empty_directories(old_path) - - instance.filename = new_filename - - # Save instance - # This will not cause a cascade of post_save signals, as next time - # nothing needs to be renamed - instance.save() - - -@receiver(models.signals.post_delete, sender=Document) -def delete_files(sender, instance, **kwargs): - if instance.filename is None: - return - - # Remove the document - old_file = instance.filename_to_path(instance.filename) - - try: - os.remove(old_file) - except FileNotFoundError: - logger = logging.getLogger(__name__) - logger.warning("Deleted document " + str(instance.id) + " but file " + - old_file + " was no longer present") - - # And remove the directory (if applicable) - old_dir = os.path.dirname(instance.filename) - old_path = instance.filename_to_path(old_dir) - Document.try_delete_empty_directories(old_path) - class Log(models.Model): diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index cee1e042b..671cdb104 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -6,9 +6,13 @@ from django.conf import settings from django.contrib.admin.models import ADDITION, LogEntry from django.contrib.auth.models import User from django.contrib.contenttypes.models import ContentType +from django.db import models, DatabaseError +from django.dispatch import receiver from django.utils import timezone from .. import index, matching +from ..file_handling import delete_empty_directories, generate_filename, \ + create_source_path_directory from ..models import Document, Tag @@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs): )).wait() +@receiver(models.signals.post_delete, sender=Document) def cleanup_document_deletion(sender, instance, using, **kwargs): - - if not isinstance(instance, Document): - return - for f in (instance.source_path, instance.thumbnail_path): try: os.unlink(f) except FileNotFoundError: pass # The file's already gone, so we're cool with it. + delete_empty_directories(os.path.dirname(instance.source_path)) + + +@receiver(models.signals.m2m_changed, sender=Document.tags.through) +@receiver(models.signals.post_save, sender=Document) +def update_filename_and_move_files(sender, instance, **kwargs): + + if not instance.filename: + # Can't update the filename if there is not filename to begin with + # This happens after the consumer creates a new document. + # The PK needs to be set first by saving the document once. When this + # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be + # renamed anyway. In all other cases, instance.filename will be set. + return + + old_filename = instance.filename + old_path = instance.source_path + new_filename = generate_filename(instance) + + if new_filename == instance.filename: + # Don't do anything if its the same. + return + + new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) + + if not os.path.isfile(old_path): + # Can't do anything if the old file does not exist anymore. + logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path)) + return + + if os.path.isfile(new_path): + # Can't do anything if the new file already exists. Skip updating file. + logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path)) + return + + create_source_path_directory(new_path) + + try: + os.rename(old_path, new_path) + instance.filename = new_filename + instance.save() + + except OSError as e: + instance.filename = old_filename + except DatabaseError as e: + os.rename(new_path, old_path) + instance.filename = old_filename + + if not os.path.isfile(old_path): + delete_empty_directories(os.path.dirname(old_path)) + def set_log_entry(sender, document=None, logging_group=None, **kwargs): diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index 3b7c757d4..e228acabb 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -10,6 +10,8 @@ from dateutil import tz from django.test import TestCase, override_settings from django.utils.text import slugify + +from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories from ..models import Tag, Document, Correspondent from django.conf import settings @@ -31,18 +33,6 @@ class TestDate(TestCase): for dirname in self.deletion_list: shutil.rmtree(dirname, ignore_errors=True) - @override_settings(PAPERLESS_FILENAME_FORMAT="") - def test_source_filename(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - self.assertEqual(document.source_filename, "0000001.pdf") - - document.filename = "test.pdf" - self.assertEqual(document.source_filename, "test.pdf") - @override_settings(PAPERLESS_FILENAME_FORMAT="") def test_generate_source_filename(self): document = Document() @@ -50,40 +40,40 @@ class TestDate(TestCase): document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - self.assertEqual(document.generate_source_filename(), "0000001.pdf") + self.assertEqual(generate_filename(document), "{:07d}.pdf".format(document.pk)) document.storage_type = Document.STORAGE_TYPE_GPG - self.assertEqual(document.generate_source_filename(), - "0000001.pdf.gpg") + self.assertEqual(generate_filename(document), + "{:07d}.pdf.gpg".format(document.pk)) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming(self): document = Document() document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + # Test default source_path + self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk)) - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") + document.filename = generate_filename(document) + + # Ensure that filename is properly generated + self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) # Enable encryption and check again document.storage_type = Document.STORAGE_TYPE_GPG - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf.gpg") + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf.gpg".format(document.pk)) + document.save() - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), True) + # test that creating dirs for the source_path creates the correct directory + create_source_path_directory(document.source_path) + Path(document.source_path).touch() + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), True) # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -91,14 +81,12 @@ class TestDate(TestCase): document.save() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf.gpg"), True) - self.assertEqual(document.generate_source_filename(), - "test/test-0000001.pdf.gpg") + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/test"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -109,18 +97,18 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") + self.assertEqual(document.source_path, settings.ORIGINALS_DIR + + "/none/none-{:07d}.pdf".format(document.pk)) # Make the folder read- and execute-only (no writing and no renaming) - os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555) + os.chmod(settings.ORIGINALS_DIR + "/none", 0o555) # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -129,11 +117,12 @@ class TestDate(TestCase): # Check proper handling of files self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), True) - self.assertEqual(document.source_filename, - "none/none-0000001.pdf") + "originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) - os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -144,18 +133,20 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Ensure file deletion after delete + pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/none/none-{:07d}.pdf".format(pk)), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") @@ -176,12 +167,15 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + + create_source_path_directory(document.source_path) + Path(document.source_path).touch() - Path(document.source_path + "test").touch() + important_file = document.source_path + "test" + Path(important_file).touch() # Set a correspondent and save the document document.correspondent = Correspondent.objects.get_or_create( @@ -193,11 +187,8 @@ class TestDate(TestCase): "/documents/originals/test"), True) self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) + self.assertTrue(os.path.isfile(important_file)) - # Cleanup - os.remove(settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdftest") - os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_underscore(self): @@ -212,13 +203,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_dash(self): @@ -233,13 +219,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_malformed(self): @@ -254,13 +235,8 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() + self.assertEqual(generate_filename(document), + "none-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") def test_tags_all(self): @@ -274,61 +250,24 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "demo-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + self.assertEqual(generate_filename(document), + "demo-{:07d}.pdf".format(document.pk)) - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") - def test_tags_out_of_bounds_0(self): + @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}") + def test_tags_out_of_bounds(self): document = Document() document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}") - def test_tags_out_of_bounds_10000000(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED + # Add tag to document + document.tags.create(name="demo") document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() + self.assertEqual(generate_filename(document), + "none-{:07d}.pdf".format(document.pk)) - document.delete() - - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}") - def test_tags_out_of_bounds_99(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - document.delete() @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}/{correspondent}") @@ -339,153 +278,40 @@ class TestDate(TestCase): document.save() # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none/none-0000001.pdf") - document.create_source_directory() + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) Path(document.source_path).touch() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none/none"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none/none"), True) + pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + - "/documents/originals/none/none/none-0000001.pdf"), + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + + "/none/none/none-{:07d}.pdf".format(pk)), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none/none"), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + + "/none"), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True) @override_settings(PAPERLESS_FILENAME_FORMAT=None) def test_format_none(self): document = Document() + document.pk = 1 document.file_type = "pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - self.assertEqual(document.generate_source_filename(), "0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_renamed(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") - - # Rename the document "illegaly" - os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test") - os.rename(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf", - settings.MEDIA_ROOT + "/documents/originals/" + - "test/test-0000001.pdf") - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf"), True) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), False) - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/foo/foo-0000001.pdf"), True) - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/foo"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), False) - self.assertEqual(document.generate_source_filename(), - "foo/foo-0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_renamed_encrypted(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_GPG - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf.gpg") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf.gpg") - - # Rename the document "illegaly" - os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test") - os.rename(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf.gpg", - settings.MEDIA_ROOT + "/documents/originals/" + - "test/test-0000001.pdf.gpg") - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/test/test-0000001.pdf.gpg"), True) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-0000001.pdf"), False) - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/foo/foo-0000001.pdf.gpg"), True) - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/foo"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), False) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), False) - self.assertEqual(document.generate_source_filename(), - "foo/foo-0000001.pdf.gpg") - - def test_delete_all_empty_subdirectories(self): - # Create our working directory - tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) - os.makedirs(tmp) - self.add_to_deletion_list(tmp) - - os.makedirs(os.path.join(tmp, "empty")) - os.makedirs(os.path.join(tmp, "empty", "subdirectory")) - - os.makedirs(os.path.join(tmp, "notempty")) - Path(os.path.join(tmp, "notempty", "file")).touch() - - Document.delete_all_empty_subdirectories(tmp) - - self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) - self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False) - self.assertEqual(os.path.isfile( - os.path.join(tmp, "notempty", "file")), True) + self.assertEqual(generate_filename(document), "0000001.pdf") def test_try_delete_empty_directories(self): # Create our working directory - tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) + tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") os.makedirs(tmp) self.add_to_deletion_list(tmp) @@ -493,67 +319,10 @@ class TestDate(TestCase): Path(os.path.join(tmp, "notempty", "file")).touch() os.makedirs(os.path.join(tmp, "notempty", "empty")) - Document.try_delete_empty_directories( + delete_empty_directories( os.path.join(tmp, "notempty", "empty")) self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) self.assertEqual(os.path.isfile( os.path.join(tmp, "notempty", "file")), True) self.assertEqual(os.path.isdir( os.path.join(tmp, "notempty", "empty")), False) - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_document_accidentally_deleted(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Test source_path - self.assertEqual(document.source_path, settings.MEDIA_ROOT + - "/documents/originals/none/none-0000001.pdf") - - # Delete the document "illegaly" - os.remove(settings.MEDIA_ROOT + "/documents/originals/" + - "none/none-0000001.pdf") - - # Set new correspondent and expect document to be saved properly - document.correspondent = Correspondent.objects.get_or_create( - name="foo")[0] - document.save() - - # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), True) - self.assertEqual(document.source_filename, - "none/none-0000001.pdf") - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") - def test_set_filename(self): - document = Document() - document.file_type = "pdf" - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.save() - - # Ensure that filename is properly generated - tmp = document.source_filename - self.assertEqual(document.generate_source_filename(), - "none/none-0000001.pdf") - document.create_source_directory() - Path(document.source_path).touch() - - # Set existing filename - document.set_filename(tmp) - self.assertEqual(document.source_filename, "none/none-0000001.pdf") - - # Set non-existing filename - document.set_filename("doesnotexist") - self.assertEqual(document.source_filename, "none/none-0000001.pdf") From ce7bb51df285b7bdd757d5eb57584eeed493a7d6 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 14:38:41 +0100 Subject: [PATCH 008/101] test database errors. --- src/documents/tests/test_file_handling.py | 50 +++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index e228acabb..18fd327b1 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -1,20 +1,16 @@ -import datetime import os import shutil -from unittest import mock from uuid import uuid4 from pathlib import Path -from shutil import rmtree -from dateutil import tz from django.test import TestCase, override_settings -from django.utils.text import slugify - from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories -from ..models import Tag, Document, Correspondent +from ..models import Document, Correspondent from django.conf import settings +from ..signals.handlers import update_filename_and_move_files + class TestDate(TestCase): deletion_list = [] @@ -123,6 +119,46 @@ class TestDate(TestCase): os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + + "{correspondent}") + def test_file_renaming_database_error(self): + + document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") + + document = Document() + document.file_type = "pdf" + document.checksum = "BBBBB" + document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED + document.save() + + # Ensure that filename is properly generated + document.filename = generate_filename(document) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) + create_source_path_directory(document.source_path) + Path(document.source_path).touch() + + # Test source_path + self.assertTrue(os.path.isfile(document.source_path)) + + # Set a correspondent and save the document + document.correspondent = Correspondent.objects.get_or_create( + name="test")[0] + + # This will cause save() to fail. + document.checksum = document1.checksum + + # Assume saving the document initially works, this gets called. + # After renaming, an error occurs, and filename is not saved: + # document should still be available at document.filename. + update_filename_and_move_files(None, document) + + # Check proper handling of files + self.assertTrue(os.path.isfile(document.source_path)) + self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + + "originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, + "none/none-{:07d}.pdf".format(document.pk)) @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + "{correspondent}") From 5a658b7ad66408dd78c4bc186ae794b23720cbe6 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 15:58:29 +0100 Subject: [PATCH 009/101] show the filename in the admin. --- src/documents/admin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 6ac949a45..0f63253ce 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -32,7 +32,7 @@ class TagAdmin(admin.ModelAdmin): list_filter = ("colour", "matching_algorithm") list_editable = ("colour", "match", "matching_algorithm") - readonly_fields = ("slug",) + readonly_fields = ("slug", ) class DocumentTypeAdmin(admin.ModelAdmin): @@ -51,9 +51,9 @@ class DocumentTypeAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin): search_fields = ("correspondent__name", "title", "content", "tags__name") - readonly_fields = ("added", "file_type", "storage_type",) + readonly_fields = ("added", "file_type", "storage_type", "filename") list_display = ("title", "created", "added", "correspondent", - "tags_", "archive_serial_number", "document_type") + "tags_", "archive_serial_number", "document_type", "filename") list_filter = ( "document_type", "tags", From 5d0434fd03aaa1d048210c30859f4769ceaa0dce Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:19:57 +0100 Subject: [PATCH 010/101] Frontend: CSRF support --- src-ui/package-lock.json | 8 +++++ src-ui/package.json | 1 + src-ui/src/app/app.module.ts | 9 +++++- .../app/interceptors/csrf.interceptor.spec.ts | 16 ++++++++++ .../src/app/interceptors/csrf.interceptor.ts | 30 +++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 src-ui/src/app/interceptors/csrf.interceptor.spec.ts create mode 100644 src-ui/src/app/interceptors/csrf.interceptor.ts diff --git a/src-ui/package-lock.json b/src-ui/package-lock.json index 45b1d2d6d..b6b66e1c6 100644 --- a/src-ui/package-lock.json +++ b/src-ui/package-lock.json @@ -8260,6 +8260,14 @@ "moment": "2.18.1" } }, + "ngx-cookie-service": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/ngx-cookie-service/-/ngx-cookie-service-10.1.1.tgz", + "integrity": "sha512-HvBrYHdxMN1NvFJGEIF/8EuAg2fjxj8QwqTv9h6qZGqNLU+lUba8Pb2zRPw1YA+gqKkJawOy5dYNeH0kyPyipw==", + "requires": { + "tslib": "^2.0.0" + } + }, "ngx-file-drop": { "version": "10.0.0", "resolved": "https://registry.npmjs.org/ngx-file-drop/-/ngx-file-drop-10.0.0.tgz", diff --git a/src-ui/package.json b/src-ui/package.json index a9e909155..af3334db9 100644 --- a/src-ui/package.json +++ b/src-ui/package.json @@ -23,6 +23,7 @@ "@ng-bootstrap/ng-bootstrap": "^8.0.0", "bootstrap": "^4.5.0", "ng-bootstrap": "^1.6.3", + "ngx-cookie-service": "^10.1.1", "ngx-file-drop": "^10.0.0", "ngx-infinite-scroll": "^9.1.0", "rxjs": "~6.6.0", diff --git a/src-ui/src/app/app.module.ts b/src-ui/src/app/app.module.ts index dad57280d..014279cc5 100644 --- a/src-ui/src/app/app.module.ts +++ b/src-ui/src/app/app.module.ts @@ -39,6 +39,8 @@ import { InfiniteScrollModule } from 'ngx-infinite-scroll'; import { DateTimeComponent } from './components/common/input/date-time/date-time.component'; import { TagsComponent } from './components/common/input/tags/tags.component'; import { SortableDirective } from './directives/sortable.directive'; +import { CookieService } from 'ngx-cookie-service'; +import { CsrfInterceptor } from './interceptors/csrf.interceptor'; @NgModule({ declarations: [ @@ -85,7 +87,12 @@ import { SortableDirective } from './directives/sortable.directive'; InfiniteScrollModule ], providers: [ - DatePipe + DatePipe, + CookieService, { + provide: HTTP_INTERCEPTORS, + useClass: CsrfInterceptor, + multi: true + } ], bootstrap: [AppComponent] }) diff --git a/src-ui/src/app/interceptors/csrf.interceptor.spec.ts b/src-ui/src/app/interceptors/csrf.interceptor.spec.ts new file mode 100644 index 000000000..64e20c110 --- /dev/null +++ b/src-ui/src/app/interceptors/csrf.interceptor.spec.ts @@ -0,0 +1,16 @@ +import { TestBed } from '@angular/core/testing'; + +import { CsrfInterceptor } from './csrf.interceptor'; + +describe('CsrfInterceptor', () => { + beforeEach(() => TestBed.configureTestingModule({ + providers: [ + CsrfInterceptor + ] + })); + + it('should be created', () => { + const interceptor: CsrfInterceptor = TestBed.inject(CsrfInterceptor); + expect(interceptor).toBeTruthy(); + }); +}); diff --git a/src-ui/src/app/interceptors/csrf.interceptor.ts b/src-ui/src/app/interceptors/csrf.interceptor.ts new file mode 100644 index 000000000..32f3e99dc --- /dev/null +++ b/src-ui/src/app/interceptors/csrf.interceptor.ts @@ -0,0 +1,30 @@ +import { Injectable } from '@angular/core'; +import { + HttpRequest, + HttpHandler, + HttpEvent, + HttpInterceptor +} from '@angular/common/http'; +import { Observable } from 'rxjs'; +import { CookieService } from 'ngx-cookie-service'; + +@Injectable() +export class CsrfInterceptor implements HttpInterceptor { + + constructor(private cookieService: CookieService) { + + } + + intercept(request: HttpRequest, next: HttpHandler): Observable> { + let csrfToken = this.cookieService.get('csrftoken') + if (csrfToken) { + request = request.clone({ + setHeaders: { + 'X-CSRFToken': csrfToken + } + }) + } + + return next.handle(request); + } +} From 4c5400e28accbcfe985224dbcc4e885525d39dab Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:23:49 +0100 Subject: [PATCH 011/101] a handy script that brings up postgres and redis --- scripts/start_services.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 scripts/start_services.sh diff --git a/scripts/start_services.sh b/scripts/start_services.sh new file mode 100755 index 000000000..e566f59b3 --- /dev/null +++ b/scripts/start_services.sh @@ -0,0 +1,2 @@ +docker run -p 5432:5432 -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13 +docker run -d -p 6379:6379 redis:latest From 1e3e80930cb72775e7f75db02217b18f4547b3c4 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:24:04 +0100 Subject: [PATCH 012/101] enable Group and User management. --- src/documents/admin.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 0f63253ce..51096d860 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -120,8 +120,3 @@ admin.site.register(Tag, TagAdmin) admin.site.register(DocumentType, DocumentTypeAdmin) admin.site.register(Document, DocumentAdmin) admin.site.register(Log, LogAdmin) - - -# Unless we implement multi-user, these default registrations don't make sense. -admin.site.unregister(Group) -admin.site.unregister(User) From 1581d707ac39f6b8e140b68f3cff1047dc8fd1dd Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 09:23:57 +0100 Subject: [PATCH 013/101] fixes bug with the + button --- src-ui/src/app/components/common/input/tags/tags.component.ts | 2 +- .../components/document-detail/document-detail.component.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src-ui/src/app/components/common/input/tags/tags.component.ts b/src-ui/src/app/components/common/input/tags/tags.component.ts index 7b5a36e90..dd57d8e50 100644 --- a/src-ui/src/app/components/common/input/tags/tags.component.ts +++ b/src-ui/src/app/components/common/input/tags/tags.component.ts @@ -86,7 +86,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor { var modal = this.modalService.open(TagEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newTag => { - this.tagService.list().subscribe(tags => { + this.tagService.listAll().subscribe(tags => { this.tags = tags.results this.addTag(newTag.id) }) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index 802a3b212..634b28613 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -89,7 +89,7 @@ export class DocumentDetailComponent implements OnInit { var modal = this.modalService.open(DocumentTypeEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newDocumentType => { - this.documentTypeService.list().subscribe(documentTypes => { + this.documentTypeService.listAll().subscribe(documentTypes => { this.documentTypes = documentTypes.results this.documentForm.get('document_type_id').setValue(newDocumentType.id) }) @@ -100,7 +100,7 @@ export class DocumentDetailComponent implements OnInit { var modal = this.modalService.open(CorrespondentEditDialogComponent, {backdrop: 'static'}) modal.componentInstance.dialogMode = 'create' modal.componentInstance.success.subscribe(newCorrespondent => { - this.correspondentService.list().subscribe(correspondents => { + this.correspondentService.listAll().subscribe(correspondents => { this.correspondents = correspondents.results this.documentForm.get('correspondent_id').setValue(newCorrespondent.id) }) From 4904e2dc073075e320f2651964b04698dd4e6420 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 11 Nov 2020 20:47:48 +0100 Subject: [PATCH 014/101] adjusted a couple things in the docker compose file. --- docker-compose.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index f9b4d6c33..54f293f90 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,12 +1,12 @@ version: "3.4" services: broker: - image: redis:latest - #restart: always + image: redis:6.0 + restart: always db: image: postgres:13 - #restart: always + restart: always volumes: - pgdata:/var/lib/postgresql/data environment: @@ -16,9 +16,10 @@ services: webserver: image: paperless-ng:latest - #restart: always + restart: always depends_on: - db + - broker ports: - 8000:8000 healthcheck: From f53a958bc5068d8c1edfdb85584478863cd527a2 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 09:30:04 +0100 Subject: [PATCH 015/101] fixes #30 --- .../management/commands/document_consumer.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 93ad6947c..a90fd53ed 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -20,8 +20,21 @@ class Handler(FileSystemEventHandler): def __init__(self, consumer): self.consumer = consumer + def _consume(self, file): + if os.path.isfile(file): + try: + self.consumer.try_consume_file(file) + except Exception as e: + logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) + def on_created(self, event): - self.consumer.try_consume_file(event.src_path) + self._consume(event.src_path) + + def on_modified(self, event): + self._consume(event.src_path) + + def on_moved(self, event): + self._consume(event.src_path) class Command(BaseCommand): From c5f0da388b2e047ea74a416390c241b0a740e9ff Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:01:22 +0100 Subject: [PATCH 016/101] fixes #35 --- src/paperless/settings.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 06dfdcd84..38721c00f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -13,6 +13,17 @@ elif os.path.exists("/etc/paperless.conf"): elif os.path.exists("/usr/local/etc/paperless.conf"): load_dotenv("/usr/local/etc/paperless.conf") +# There are multiple levels of concurrency in paperless: +# - Multiple consumers may be run in parallel. +# - Each consumer may process multiple pages in parallel. +# - Each Tesseract OCR run may spawn multiple threads to process a single page +# slightly faster. +# The performance gains from having tesseract use multiple threads are minimal. +# However, when multiple pages are processed in parallel, the total number of +# OCR threads may exceed the number of available cpu cores, which will +# dramatically slow down the consumption process. This settings limits each +# Tesseract process to one thread. +os.environ['OMP_THREAD_LIMIT'] = "1" def __get_boolean(key, default="NO"): """ From 1c50b7693d6d14d47043e769b935ed56a2c5293e Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:04:01 +0100 Subject: [PATCH 017/101] fixes #31 --- src/documents/classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 6c90536b0..4ba538162 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -120,8 +120,8 @@ class DocumentClassifier(object): num_tags = len(labels_tags_unique) # substract 1 since -1 (null) is also part of the classes. - num_correspondents = len(labels_correspondent) - 1 - num_document_types = len(labels_document_type) - 1 + num_correspondents = len(set(labels_correspondent)) - 1 + num_document_types = len(set(labels_document_type)) - 1 logging.getLogger(__name__).debug( "{} documents, {} tag(s), {} correspondent(s), " From fbac5e20f410af93327cfb6ef32c0c685b8e67b5 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:05:19 +0100 Subject: [PATCH 018/101] fixes log on windows --- src-ui/src/app/components/manage/logs/logs.component.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src-ui/src/app/components/manage/logs/logs.component.ts b/src-ui/src/app/components/manage/logs/logs.component.ts index da507cbe5..f80aada21 100644 --- a/src-ui/src/app/components/manage/logs/logs.component.ts +++ b/src-ui/src/app/components/manage/logs/logs.component.ts @@ -30,7 +30,7 @@ export class LogsComponent implements OnInit { onScroll() { let lastCreated = null if (this.logs.length > 0) { - lastCreated = this.logs[this.logs.length-1].created + lastCreated = new Date(this.logs[this.logs.length-1].created).toISOString() } this.logService.list(1, 25, 'created', 'des', {'created__lt': lastCreated, 'level__gte': this.level}).subscribe(result => { this.logs.push(...result.results) From 1fa2c54932beac1ab68a45f1d0af9f39d1c8e2fa Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:41:47 +0100 Subject: [PATCH 019/101] on_modified not needed for the consumer. --- src/documents/management/commands/document_consumer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index a90fd53ed..ea6e033ba 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -25,14 +25,12 @@ class Handler(FileSystemEventHandler): try: self.consumer.try_consume_file(file) except Exception as e: + # Catch all so that the consumer won't crash. logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) def on_created(self, event): self._consume(event.src_path) - def on_modified(self, event): - self._consume(event.src_path) - def on_moved(self, event): self._consume(event.src_path) From 23eae9a3f1717f227f59e13351996d97db1aeb85 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 10:42:18 +0100 Subject: [PATCH 020/101] This is how the original filenames were generated. Keep it this way for compatibility. --- src/documents/file_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index cac317d4c..22d5a052d 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -72,7 +72,7 @@ def generate_filename(document): path = settings.PAPERLESS_FILENAME_FORMAT.format( correspondent=slugify(document.correspondent), title=slugify(document.title), - created=document.created.date(), + created=slugify(document.created), added=slugify(document.added), tags=tags, ) From 6f6f4fb1631b4c1d5cef6fee682909bb86463f28 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 11:11:57 +0100 Subject: [PATCH 021/101] fixes #12 --- .../app-frame/app-frame.component.html | 8 ++++++++ .../components/app-frame/app-frame.component.ts | 17 ++++++++++++++++- .../src/app/services/open-documents.service.ts | 13 +++++++++++-- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src-ui/src/app/components/app-frame/app-frame.component.html b/src-ui/src/app/components/app-frame/app-frame.component.html index 0b18777ef..7879e2dcb 100644 --- a/src-ui/src/app/components/app-frame/app-frame.component.html +++ b/src-ui/src/app/components/app-frame/app-frame.component.html @@ -69,6 +69,14 @@ {{d.title}} +