mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'dev' into feature-bulk-edit
This commit is contained in:
		| @@ -17,8 +17,6 @@ class CorrespondentAdmin(admin.ModelAdmin): | ||||
|     list_filter = ("matching_algorithm",) | ||||
|     list_editable = ("match", "matching_algorithm") | ||||
|  | ||||
|     readonly_fields = ("slug",) | ||||
|  | ||||
|  | ||||
| class TagAdmin(admin.ModelAdmin): | ||||
|  | ||||
| @@ -31,8 +29,6 @@ class TagAdmin(admin.ModelAdmin): | ||||
|     list_filter = ("colour", "matching_algorithm") | ||||
|     list_editable = ("colour", "match", "matching_algorithm") | ||||
|  | ||||
|     readonly_fields = ("slug", ) | ||||
|  | ||||
|  | ||||
| class DocumentTypeAdmin(admin.ModelAdmin): | ||||
|  | ||||
| @@ -44,13 +40,16 @@ class DocumentTypeAdmin(admin.ModelAdmin): | ||||
|     list_filter = ("matching_algorithm",) | ||||
|     list_editable = ("match", "matching_algorithm") | ||||
|  | ||||
|     readonly_fields = ("slug",) | ||||
|  | ||||
|  | ||||
| class DocumentAdmin(admin.ModelAdmin): | ||||
|  | ||||
|     search_fields = ("correspondent__name", "title", "content", "tags__name") | ||||
|     readonly_fields = ("added", "mime_type", "storage_type", "filename") | ||||
|     readonly_fields = ( | ||||
|         "added", | ||||
|         "modified", | ||||
|         "mime_type", | ||||
|         "storage_type", | ||||
|         "filename") | ||||
|  | ||||
|     list_display_links = ("title",) | ||||
|  | ||||
| @@ -101,7 +100,7 @@ class DocumentAdmin(admin.ModelAdmin): | ||||
|         for tag in obj.tags.all(): | ||||
|             r += self._html_tag( | ||||
|                 "span", | ||||
|                 tag.slug + ", " | ||||
|                 tag.name + ", " | ||||
|             ) | ||||
|         return r | ||||
|  | ||||
|   | ||||
| @@ -8,13 +8,14 @@ from django.conf import settings | ||||
| from django.db import transaction | ||||
| from django.db.models import Q | ||||
| from django.utils import timezone | ||||
| from filelock import FileLock | ||||
|  | ||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from .file_handling import create_source_path_directory | ||||
| from .file_handling import create_source_path_directory, \ | ||||
|     generate_unique_filename | ||||
| from .loggers import LoggingMixin | ||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||
|     get_supported_file_extensions, parse_date | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type, parse_date | ||||
| from .signals import ( | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| @@ -38,6 +39,10 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|     def pre_check_file_exists(self): | ||||
|         if not os.path.isfile(self.path): | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 "Cannot consume {}: It is not a file.".format(self.path) | ||||
|             ) | ||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||
|                 self.path)) | ||||
|  | ||||
| @@ -47,6 +52,10 @@ class Consumer(LoggingMixin): | ||||
|         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(self.path) | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 "Not consuming {}: It is a duplicate.".format(self.filename) | ||||
|             ) | ||||
|             raise ConsumerError( | ||||
|                 "Not consuming {}: It is a duplicate.".format(self.filename) | ||||
|             ) | ||||
| @@ -148,8 +157,9 @@ class Consumer(LoggingMixin): | ||||
|             classifier = DocumentClassifier() | ||||
|             classifier.reload() | ||||
|         except (FileNotFoundError, IncompatibleClassifierVersionError) as e: | ||||
|             logging.getLogger(__name__).warning( | ||||
|                 "Cannot classify documents: {}.".format(e)) | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 f"Cannot classify documents: {e}.") | ||||
|             classifier = None | ||||
|  | ||||
|         # now that everything is done, we can start to store the document | ||||
| @@ -176,31 +186,28 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|                 # After everything is in the database, copy the files into | ||||
|                 # place. If this fails, we'll also rollback the transaction. | ||||
|                 with FileLock(settings.MEDIA_LOCK): | ||||
|                     document.filename = generate_unique_filename( | ||||
|                         document, settings.ORIGINALS_DIR) | ||||
|                     create_source_path_directory(document.source_path) | ||||
|  | ||||
|                 # TODO: not required, since this is done by the file handling | ||||
|                 #  logic | ||||
|                 create_source_path_directory(document.source_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             self.path, document.source_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 if archive_path and os.path.isfile(archive_path): | ||||
|                     self._write(document.storage_type, | ||||
|                                 archive_path, document.archive_path) | ||||
|                                 self.path, document.source_path) | ||||
|  | ||||
|                     with open(archive_path, 'rb') as f: | ||||
|                         document.archive_checksum = hashlib.md5( | ||||
|                             f.read()).hexdigest() | ||||
|                         document.save() | ||||
|                     self._write(document.storage_type, | ||||
|                                 thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 # After performing all database operations and moving files | ||||
|                 # into place, tell paperless where the file is. | ||||
|                 document.filename = os.path.basename(document.source_path) | ||||
|                 # Saving the document now will trigger the filename handling | ||||
|                 # logic. | ||||
|                     if archive_path and os.path.isfile(archive_path): | ||||
|                         create_source_path_directory(document.archive_path) | ||||
|                         self._write(document.storage_type, | ||||
|                                     archive_path, document.archive_path) | ||||
|  | ||||
|                         with open(archive_path, 'rb') as f: | ||||
|                             document.archive_checksum = hashlib.md5( | ||||
|                                 f.read()).hexdigest() | ||||
|  | ||||
|                 # Don't save with the lock active. Saving will cause the file | ||||
|                 # renaming logic to acquire the lock as well. | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
| @@ -241,7 +248,7 @@ class Consumer(LoggingMixin): | ||||
|         with open(self.path, "rb") as f: | ||||
|             document = Document.objects.create( | ||||
|                 correspondent=file_info.correspondent, | ||||
|                 title=file_info.title, | ||||
|                 title=(self.override_title or file_info.title)[:127], | ||||
|                 content=text, | ||||
|                 mime_type=mime_type, | ||||
|                 checksum=hashlib.md5(f.read()).hexdigest(), | ||||
| @@ -252,18 +259,17 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         relevant_tags = set(file_info.tags) | ||||
|         if relevant_tags: | ||||
|             tag_names = ", ".join([t.slug for t in relevant_tags]) | ||||
|             tag_names = ", ".join([t.name for t in relevant_tags]) | ||||
|             self.log("debug", "Tagging with {}".format(tag_names)) | ||||
|             document.tags.add(*relevant_tags) | ||||
|  | ||||
|         self.apply_overrides(document) | ||||
|  | ||||
|         document.save() | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def apply_overrides(self, document): | ||||
|         if self.override_title: | ||||
|             document.title = self.override_title | ||||
|  | ||||
|         if self.override_correspondent_id: | ||||
|             document.correspondent = Correspondent.objects.get( | ||||
|                 pk=self.override_correspondent_id) | ||||
|   | ||||
| @@ -1,7 +1,9 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| from collections import defaultdict | ||||
|  | ||||
| import pathvalidate | ||||
| from django.conf import settings | ||||
| from django.template.defaultfilters import slugify | ||||
|  | ||||
| @@ -68,21 +70,53 @@ def many_to_dictionary(field): | ||||
|     return mydictionary | ||||
|  | ||||
|  | ||||
| def generate_filename(doc): | ||||
| def generate_unique_filename(doc, root): | ||||
|     counter = 0 | ||||
|  | ||||
|     while True: | ||||
|         new_filename = generate_filename(doc, counter) | ||||
|         if new_filename == doc.filename: | ||||
|             # still the same as before. | ||||
|             return new_filename | ||||
|  | ||||
|         if os.path.exists(os.path.join(root, new_filename)): | ||||
|             counter += 1 | ||||
|         else: | ||||
|             return new_filename | ||||
|  | ||||
|  | ||||
| def generate_filename(doc, counter=0): | ||||
|     path = "" | ||||
|  | ||||
|     try: | ||||
|         if settings.PAPERLESS_FILENAME_FORMAT is not None: | ||||
|             tags = defaultdict(lambda: slugify(None), | ||||
|                                many_to_dictionary(doc.tags)) | ||||
|  | ||||
|             if doc.correspondent: | ||||
|                 correspondent = pathvalidate.sanitize_filename( | ||||
|                     doc.correspondent.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 correspondent = "none" | ||||
|  | ||||
|             if doc.document_type: | ||||
|                 document_type = pathvalidate.sanitize_filename( | ||||
|                     doc.document_type.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 document_type = "none" | ||||
|  | ||||
|             path = settings.PAPERLESS_FILENAME_FORMAT.format( | ||||
|                 correspondent=slugify(doc.correspondent), | ||||
|                 title=slugify(doc.title), | ||||
|                 created=slugify(doc.created), | ||||
|                 title=pathvalidate.sanitize_filename( | ||||
|                     doc.title, replacement_text="-"), | ||||
|                 correspondent=correspondent, | ||||
|                 document_type=document_type, | ||||
|                 created=datetime.date.isoformat(doc.created), | ||||
|                 created_year=doc.created.year if doc.created else "none", | ||||
|                 created_month=doc.created.month if doc.created else "none", | ||||
|                 created_day=doc.created.day if doc.created else "none", | ||||
|                 added=slugify(doc.added), | ||||
|                 added=datetime.date.isoformat(doc.added), | ||||
|                 added_year=doc.added.year if doc.added else "none", | ||||
|                 added_month=doc.added.month if doc.added else "none", | ||||
|                 added_day=doc.added.day if doc.added else "none", | ||||
| @@ -93,11 +127,11 @@ def generate_filename(doc): | ||||
|             f"Invalid PAPERLESS_FILENAME_FORMAT: " | ||||
|             f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default") | ||||
|  | ||||
|     # Always append the primary key to guarantee uniqueness of filename | ||||
|     counter_str = f"_{counter:02}" if counter else "" | ||||
|     if len(path) > 0: | ||||
|         filename = "%s-%07i%s" % (path, doc.pk, doc.file_type) | ||||
|         filename = f"{path}{counter_str}{doc.file_type}" | ||||
|     else: | ||||
|         filename = "%07i%s" % (doc.pk, doc.file_type) | ||||
|         filename = f"{doc.pk:07}{counter_str}{doc.file_type}" | ||||
|  | ||||
|     # Append .gpg for encrypted files | ||||
|     if doc.storage_type == doc.STORAGE_TYPE_GPG: | ||||
|   | ||||
| @@ -37,6 +37,10 @@ class DocumentTypeFilterSet(FilterSet): | ||||
|  | ||||
| class TagsFilter(Filter): | ||||
|  | ||||
|     def __init__(self, exclude=False): | ||||
|         super(TagsFilter, self).__init__() | ||||
|         self.exclude = exclude | ||||
|  | ||||
|     def filter(self, qs, value): | ||||
|         if not value: | ||||
|             return qs | ||||
| @@ -47,7 +51,10 @@ class TagsFilter(Filter): | ||||
|             return qs | ||||
|  | ||||
|         for tag_id in tag_ids: | ||||
|             qs = qs.filter(tags__id=tag_id) | ||||
|             if self.exclude: | ||||
|                 qs = qs.exclude(tags__id=tag_id) | ||||
|             else: | ||||
|                 qs = qs.filter(tags__id=tag_id) | ||||
|  | ||||
|         return qs | ||||
|  | ||||
| @@ -74,6 +81,8 @@ class DocumentFilterSet(FilterSet): | ||||
|  | ||||
|     tags__id__all = TagsFilter() | ||||
|  | ||||
|     tags__id__none = TagsFilter(exclude=True) | ||||
|  | ||||
|     is_in_inbox = InboxFilter() | ||||
|  | ||||
|     class Meta: | ||||
|   | ||||
| @@ -82,7 +82,8 @@ class Command(BaseCommand): | ||||
|             with open(document.thumbnail_path, "wb") as f: | ||||
|                 f.write(raw_thumb) | ||||
|  | ||||
|             document.save(update_fields=("storage_type", "filename")) | ||||
|             Document.objects.filter(id=document.id).update( | ||||
|                 storage_type=document.storage_type, filename=document.filename) | ||||
|  | ||||
|             for path in old_paths: | ||||
|                 os.unlink(path) | ||||
|   | ||||
| @@ -29,10 +29,9 @@ def _tags_from_path(filepath): | ||||
|     path_parts = Path(filepath).relative_to( | ||||
|                 settings.CONSUMPTION_DIR).parent.parts | ||||
|     for part in path_parts: | ||||
|         tag_ids.add(Tag.objects.get_or_create( | ||||
|             slug=slugify(part), | ||||
|             defaults={"name": part}, | ||||
|         )[0].pk) | ||||
|         tag_ids.add(Tag.objects.get_or_create(name__iexact=part, defaults={ | ||||
|             "name": part | ||||
|         })[0].pk) | ||||
|  | ||||
|     return tag_ids | ||||
|  | ||||
|   | ||||
| @@ -38,6 +38,9 @@ class Command(Renderable, BaseCommand): | ||||
|         if not os.access(self.target, os.W_OK): | ||||
|             raise CommandError("That path doesn't appear to be writable") | ||||
|  | ||||
|         if os.listdir(self.target): | ||||
|             raise CommandError("That directory is not empty.") | ||||
|  | ||||
|         self.dump() | ||||
|  | ||||
|     def dump(self): | ||||
| @@ -54,31 +57,39 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             document = document_map[document_dict["pk"]] | ||||
|  | ||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||
|             file_target = os.path.join(self.target, unique_filename) | ||||
|             print(f"Exporting: {document}") | ||||
|  | ||||
|             thumbnail_name = unique_filename + "-thumbnail.png" | ||||
|             filename_counter = 0 | ||||
|             while True: | ||||
|                 original_name = document.get_public_filename( | ||||
|                     counter=filename_counter) | ||||
|                 original_target = os.path.join(self.target, original_name) | ||||
|  | ||||
|                 if not os.path.exists(original_target): | ||||
|                     break | ||||
|                 else: | ||||
|                     filename_counter += 1 | ||||
|  | ||||
|             thumbnail_name = original_name + "-thumbnail.png" | ||||
|             thumbnail_target = os.path.join(self.target, thumbnail_name) | ||||
|  | ||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||
|             document_dict[EXPORTER_FILE_NAME] = original_name | ||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||
|  | ||||
|             if os.path.exists(document.archive_path): | ||||
|                 archive_name = \ | ||||
|                     f"{document.pk:07}_archive_{document.archive_file_name}" | ||||
|                 archive_name = document.get_public_filename( | ||||
|                     archive=True, counter=filename_counter, suffix="_archive") | ||||
|                 archive_target = os.path.join(self.target, archive_name) | ||||
|                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||
|             else: | ||||
|                 archive_target = None | ||||
|  | ||||
|             print(f"Exporting: {file_target}") | ||||
|  | ||||
|             t = int(time.mktime(document.created.timetuple())) | ||||
|             if document.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|  | ||||
|                 with open(file_target, "wb") as f: | ||||
|                 with open(original_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.source_file)) | ||||
|                     os.utime(file_target, times=(t, t)) | ||||
|                     os.utime(original_target, times=(t, t)) | ||||
|  | ||||
|                 with open(thumbnail_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||
| @@ -90,7 +101,7 @@ class Command(Renderable, BaseCommand): | ||||
|                         os.utime(archive_target, times=(t, t)) | ||||
|             else: | ||||
|  | ||||
|                 shutil.copy(document.source_path, file_target) | ||||
|                 shutil.copy(document.source_path, original_target) | ||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||
|  | ||||
|                 if archive_target: | ||||
|   | ||||
| @@ -5,11 +5,13 @@ import shutil | ||||
| from django.conf import settings | ||||
| from django.core.management import call_command | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| from filelock import FileLock | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from ...file_handling import generate_filename, create_source_path_directory | ||||
| from ...file_handling import create_source_path_directory, \ | ||||
|     generate_unique_filename | ||||
| from ...mixins import Renderable | ||||
|  | ||||
|  | ||||
| @@ -114,17 +116,20 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             document.filename = generate_filename(document) | ||||
|             with FileLock(settings.MEDIA_LOCK): | ||||
|                 document.filename = generate_unique_filename( | ||||
|                     document, settings.ORIGINALS_DIR) | ||||
|  | ||||
|             if os.path.isfile(document.source_path): | ||||
|                 raise FileExistsError(document.source_path) | ||||
|                 if os.path.isfile(document.source_path): | ||||
|                     raise FileExistsError(document.source_path) | ||||
|  | ||||
|             create_source_path_directory(document.source_path) | ||||
|                 create_source_path_directory(document.source_path) | ||||
|  | ||||
|             print(f"Moving {document_path} to {document.source_path}") | ||||
|             shutil.copy(document_path, document.source_path) | ||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             if archive_path: | ||||
|                 shutil.copy(archive_path, document.archive_path) | ||||
|                 print(f"Moving {document_path} to {document.source_path}") | ||||
|                 shutil.copy(document_path, document.source_path) | ||||
|                 shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|                 if archive_path: | ||||
|                     create_source_path_directory(document.archive_path) | ||||
|                     shutil.copy(archive_path, document.archive_path) | ||||
|  | ||||
|             document.save() | ||||
|   | ||||
| @@ -1,3 +1,6 @@ | ||||
| import logging | ||||
|  | ||||
| import tqdm | ||||
| from django.core.management.base import BaseCommand | ||||
|  | ||||
| from documents.models import Document | ||||
| @@ -18,6 +21,8 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|  | ||||
|         for document in Document.objects.all(): | ||||
|         logging.getLogger().handlers[0].level = logging.ERROR | ||||
|  | ||||
|         for document in tqdm.tqdm(Document.objects.all()): | ||||
|             # Saving the document again will generate a new filename and rename | ||||
|             document.save() | ||||
|   | ||||
							
								
								
									
										25
									
								
								src/documents/migrations/1006_auto_20201208_2209.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								src/documents/migrations/1006_auto_20201208_2209.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| # Generated by Django 3.1.4 on 2020-12-08 22:09 | ||||
|  | ||||
| from django.db import migrations | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1005_checksums'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.RemoveField( | ||||
|             model_name='correspondent', | ||||
|             name='slug', | ||||
|         ), | ||||
|         migrations.RemoveField( | ||||
|             model_name='documenttype', | ||||
|             name='slug', | ||||
|         ), | ||||
|         migrations.RemoveField( | ||||
|             model_name='tag', | ||||
|             name='slug', | ||||
|         ), | ||||
|     ] | ||||
| @@ -1,10 +1,12 @@ | ||||
| # coding=utf-8 | ||||
|  | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| from collections import OrderedDict | ||||
|  | ||||
| import pathvalidate | ||||
|  | ||||
| import dateutil.parser | ||||
| from django.conf import settings | ||||
| from django.db import models | ||||
| @@ -34,7 +36,6 @@ class MatchingModel(models.Model): | ||||
|     ) | ||||
|  | ||||
|     name = models.CharField(max_length=128, unique=True) | ||||
|     slug = models.SlugField(blank=True, editable=False) | ||||
|  | ||||
|     match = models.CharField(max_length=256, blank=True) | ||||
|     matching_algorithm = models.PositiveIntegerField( | ||||
| @@ -67,7 +68,6 @@ class MatchingModel(models.Model): | ||||
|     def save(self, *args, **kwargs): | ||||
|  | ||||
|         self.match = self.match.lower() | ||||
|         self.slug = slugify(self.name) | ||||
|  | ||||
|         models.Model.save(self, *args, **kwargs) | ||||
|  | ||||
| @@ -172,6 +172,7 @@ class Document(models.Model): | ||||
|  | ||||
|     created = models.DateTimeField( | ||||
|         default=timezone.now, db_index=True) | ||||
|  | ||||
|     modified = models.DateTimeField( | ||||
|         auto_now=True, editable=False, db_index=True) | ||||
|  | ||||
| @@ -206,13 +207,11 @@ class Document(models.Model): | ||||
|         ordering = ("correspondent", "title") | ||||
|  | ||||
|     def __str__(self): | ||||
|         created = self.created.strftime("%Y%m%d") | ||||
|         created = datetime.date.isoformat(self.created) | ||||
|         if self.correspondent and self.title: | ||||
|             return "{}: {} - {}".format( | ||||
|                 created, self.correspondent, self.title) | ||||
|         if self.correspondent or self.title: | ||||
|             return "{}: {}".format(created, self.correspondent or self.title) | ||||
|         return str(created) | ||||
|             return f"{created} {self.correspondent} {self.title}" | ||||
|         else: | ||||
|             return f"{created} {self.title}" | ||||
|  | ||||
|     @property | ||||
|     def source_path(self): | ||||
| @@ -248,13 +247,21 @@ class Document(models.Model): | ||||
|     def archive_file(self): | ||||
|         return open(self.archive_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def file_name(self): | ||||
|         return slugify(str(self)) + self.file_type | ||||
|     def get_public_filename(self, archive=False, counter=0, suffix=None): | ||||
|         result = str(self) | ||||
|  | ||||
|     @property | ||||
|     def archive_file_name(self): | ||||
|         return slugify(str(self)) + ".pdf" | ||||
|         if counter: | ||||
|             result += f"_{counter:02}" | ||||
|  | ||||
|         if suffix: | ||||
|             result += suffix | ||||
|  | ||||
|         if archive: | ||||
|             result += ".pdf" | ||||
|         else: | ||||
|             result += self.file_type | ||||
|  | ||||
|         return pathvalidate.sanitize_filename(result, replacement_text="-") | ||||
|  | ||||
|     @property | ||||
|     def file_type(self): | ||||
| @@ -375,9 +382,7 @@ class FileInfo: | ||||
|     def _get_correspondent(cls, name): | ||||
|         if not name: | ||||
|             return None | ||||
|         return Correspondent.objects.get_or_create(name=name, defaults={ | ||||
|             "slug": slugify(name) | ||||
|         })[0] | ||||
|         return Correspondent.objects.get_or_create(name=name)[0] | ||||
|  | ||||
|     @classmethod | ||||
|     def _get_title(cls, title): | ||||
| @@ -387,10 +392,7 @@ class FileInfo: | ||||
|     def _get_tags(cls, tags): | ||||
|         r = [] | ||||
|         for t in tags.split(","): | ||||
|             r.append(Tag.objects.get_or_create( | ||||
|                 slug=slugify(t), | ||||
|                 defaults={"name": t} | ||||
|             )[0]) | ||||
|             r.append(Tag.objects.get_or_create(name=t)[0]) | ||||
|         return tuple(r) | ||||
|  | ||||
|     @classmethod | ||||
|   | ||||
| @@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin): | ||||
|     def __init__(self, logging_group): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
| @@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin): | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|         return [] | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|   | ||||
| @@ -46,6 +46,10 @@ def check_sanity(): | ||||
|         for f in files: | ||||
|             present_files.append(os.path.normpath(os.path.join(root, f))) | ||||
|  | ||||
|     lockfile = os.path.normpath(settings.MEDIA_LOCK) | ||||
|     if lockfile in present_files: | ||||
|         present_files.remove(lockfile) | ||||
|  | ||||
|     for doc in Document.objects.all(): | ||||
|         # Check sanity of the thumbnail | ||||
|         if not os.path.isfile(doc.thumbnail_path): | ||||
|   | ||||
| @@ -1,17 +1,23 @@ | ||||
| import magic | ||||
| from django.utils.text import slugify | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
| from rest_framework import serializers | ||||
| from rest_framework.fields import SerializerMethodField | ||||
|  | ||||
| from .models import Correspondent, Tag, Document, Log, DocumentType | ||||
| from .parsers import is_mime_type_supported | ||||
|  | ||||
|  | ||||
| class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): | ||||
| class CorrespondentSerializer(serializers.ModelSerializer): | ||||
|  | ||||
|     document_count = serializers.IntegerField(read_only=True) | ||||
|  | ||||
|     last_correspondence = serializers.DateTimeField(read_only=True) | ||||
|  | ||||
|     def get_slug(self, obj): | ||||
|         return slugify(obj.name) | ||||
|     slug = SerializerMethodField() | ||||
|  | ||||
|     class Meta: | ||||
|         model = Correspondent | ||||
|         fields = ( | ||||
| @@ -26,10 +32,14 @@ class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer): | ||||
| class DocumentTypeSerializer(serializers.ModelSerializer): | ||||
|  | ||||
|     document_count = serializers.IntegerField(read_only=True) | ||||
|  | ||||
|     def get_slug(self, obj): | ||||
|         return slugify(obj.name) | ||||
|     slug = SerializerMethodField() | ||||
|  | ||||
|     class Meta: | ||||
|         model = DocumentType | ||||
|         fields = ( | ||||
| @@ -43,10 +53,14 @@ class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer): | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class TagSerializer(serializers.HyperlinkedModelSerializer): | ||||
| class TagSerializer(serializers.ModelSerializer): | ||||
|  | ||||
|     document_count = serializers.IntegerField(read_only=True) | ||||
|  | ||||
|     def get_slug(self, obj): | ||||
|         return slugify(obj.name) | ||||
|     slug = SerializerMethodField() | ||||
|  | ||||
|     class Meta: | ||||
|         model = Tag | ||||
|         fields = ( | ||||
| @@ -83,6 +97,18 @@ class DocumentSerializer(serializers.ModelSerializer): | ||||
|     tags = TagsField(many=True) | ||||
|     document_type = DocumentTypeField(allow_null=True) | ||||
|  | ||||
|     original_file_name = SerializerMethodField() | ||||
|     archived_file_name = SerializerMethodField() | ||||
|  | ||||
|     def get_original_file_name(self, obj): | ||||
|         return obj.get_public_filename() | ||||
|  | ||||
|     def get_archived_file_name(self, obj): | ||||
|         if obj.archive_checksum: | ||||
|             return obj.get_public_filename(archive=True) | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     class Meta: | ||||
|         model = Document | ||||
|         depth = 1 | ||||
| @@ -96,7 +122,9 @@ class DocumentSerializer(serializers.ModelSerializer): | ||||
|             "created", | ||||
|             "modified", | ||||
|             "added", | ||||
|             "archive_serial_number" | ||||
|             "archive_serial_number", | ||||
|             "original_file_name", | ||||
|             "archived_file_name", | ||||
|         ) | ||||
|  | ||||
|  | ||||
| @@ -178,8 +206,7 @@ class PostDocumentSerializer(serializers.Serializer): | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     def validate(self, attrs): | ||||
|         document = attrs.get('document') | ||||
|     def validate_document(self, document): | ||||
|  | ||||
|         try: | ||||
|             validate_filename(document.name) | ||||
| @@ -191,32 +218,31 @@ class PostDocumentSerializer(serializers.Serializer): | ||||
|  | ||||
|         if not is_mime_type_supported(mime_type): | ||||
|             raise serializers.ValidationError( | ||||
|                 "This mime type is not supported.") | ||||
|                 "This file type is not supported.") | ||||
|  | ||||
|         attrs['document_data'] = document_data | ||||
|         return document.name, document_data | ||||
|  | ||||
|         title = attrs.get('title') | ||||
|     def validate_title(self, title): | ||||
|         if title: | ||||
|             return title | ||||
|         else: | ||||
|             # do not return empty strings. | ||||
|             return None | ||||
|  | ||||
|         if not title: | ||||
|             attrs['title'] = None | ||||
|  | ||||
|         correspondent = attrs.get('correspondent') | ||||
|     def validate_correspondent(self, correspondent): | ||||
|         if correspondent: | ||||
|             attrs['correspondent_id'] = correspondent.id | ||||
|             return correspondent.id | ||||
|         else: | ||||
|             attrs['correspondent_id'] = None | ||||
|             return None | ||||
|  | ||||
|         document_type = attrs.get('document_type') | ||||
|     def validate_document_type(self, document_type): | ||||
|         if document_type: | ||||
|             attrs['document_type_id'] = document_type.id | ||||
|             return document_type.id | ||||
|         else: | ||||
|             attrs['document_type_id'] = None | ||||
|             return None | ||||
|  | ||||
|         tags = attrs.get('tags') | ||||
|     def validate_tags(self, tags): | ||||
|         if tags: | ||||
|             tag_ids = [tag.id for tag in tags] | ||||
|             attrs['tag_ids'] = tag_ids | ||||
|             return [tag.id for tag in tags] | ||||
|         else: | ||||
|             attrs['tag_ids'] = None | ||||
|  | ||||
|         return attrs | ||||
|             return None | ||||
|   | ||||
| @@ -9,11 +9,13 @@ from django.contrib.contenttypes.models import ContentType | ||||
| from django.db import models, DatabaseError | ||||
| from django.dispatch import receiver | ||||
| from django.utils import timezone | ||||
| from filelock import FileLock | ||||
| from rest_framework.reverse import reverse | ||||
|  | ||||
| from .. import index, matching | ||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | ||||
|     create_source_path_directory, archive_name_from_filename | ||||
| from ..file_handling import delete_empty_directories, \ | ||||
|     create_source_path_directory, archive_name_from_filename, \ | ||||
|     generate_unique_filename | ||||
| from ..models import Document, Tag | ||||
|  | ||||
|  | ||||
| @@ -134,7 +136,7 @@ def set_tags(sender, | ||||
|  | ||||
|     message = 'Tagging "{}" with "{}"' | ||||
|     logger( | ||||
|         message.format(document, ", ".join([t.slug for t in relevant_tags])), | ||||
|         message.format(document, ", ".join([t.name for t in relevant_tags])), | ||||
|         logging_group | ||||
|     ) | ||||
|  | ||||
| @@ -157,41 +159,42 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|     Popen(( | ||||
|         settings.POST_CONSUME_SCRIPT, | ||||
|         str(document.pk), | ||||
|         document.file_name, | ||||
|         document.get_public_filename(), | ||||
|         os.path.normpath(document.source_path), | ||||
|         os.path.normpath(document.thumbnail_path), | ||||
|         reverse("document-download", kwargs={"pk": document.pk}), | ||||
|         reverse("document-thumb", kwargs={"pk": document.pk}), | ||||
|         str(document.correspondent), | ||||
|         str(",".join(document.tags.all().values_list("slug", flat=True))) | ||||
|         str(",".join(document.tags.all().values_list("name", flat=True))) | ||||
|     )).wait() | ||||
|  | ||||
|  | ||||
| @receiver(models.signals.post_delete, sender=Document) | ||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
|     for f in (instance.source_path, | ||||
|               instance.archive_path, | ||||
|               instance.thumbnail_path): | ||||
|         if os.path.isfile(f): | ||||
|             try: | ||||
|                 os.unlink(f) | ||||
|                 logging.getLogger(__name__).debug( | ||||
|                     f"Deleted file {f}.") | ||||
|             except OSError as e: | ||||
|                 logging.getLogger(__name__).warning( | ||||
|                     f"While deleting document {instance.file_name}, the file " | ||||
|                     f"{f} could not be deleted: {e}" | ||||
|                 ) | ||||
|     with FileLock(settings.MEDIA_LOCK): | ||||
|         for f in (instance.source_path, | ||||
|                   instance.archive_path, | ||||
|                   instance.thumbnail_path): | ||||
|             if os.path.isfile(f): | ||||
|                 try: | ||||
|                     os.unlink(f) | ||||
|                     logging.getLogger(__name__).debug( | ||||
|                         f"Deleted file {f}.") | ||||
|                 except OSError as e: | ||||
|                     logging.getLogger(__name__).warning( | ||||
|                         f"While deleting document {str(instance)}, the file " | ||||
|                         f"{f} could not be deleted: {e}" | ||||
|                     ) | ||||
|  | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.source_path), | ||||
|         root=settings.ORIGINALS_DIR | ||||
|     ) | ||||
|         delete_empty_directories( | ||||
|             os.path.dirname(instance.source_path), | ||||
|             root=settings.ORIGINALS_DIR | ||||
|         ) | ||||
|  | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.archive_path), | ||||
|         root=settings.ARCHIVE_DIR | ||||
|     ) | ||||
|         delete_empty_directories( | ||||
|             os.path.dirname(instance.archive_path), | ||||
|             root=settings.ARCHIVE_DIR | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def validate_move(instance, old_path, new_path): | ||||
| @@ -226,81 +229,94 @@ def update_filename_and_move_files(sender, instance, **kwargs): | ||||
|         # This will in turn cause this logic to move the file where it belongs. | ||||
|         return | ||||
|  | ||||
|     old_filename = instance.filename | ||||
|     new_filename = generate_filename(instance) | ||||
|     with FileLock(settings.MEDIA_LOCK): | ||||
|         old_filename = instance.filename | ||||
|         new_filename = generate_unique_filename( | ||||
|             instance, settings.ORIGINALS_DIR) | ||||
|  | ||||
|     if new_filename == instance.filename: | ||||
|         # Don't do anything if it's the same. | ||||
|         return | ||||
|  | ||||
|     old_source_path = instance.source_path | ||||
|     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|  | ||||
|     if not validate_move(instance, old_source_path, new_source_path): | ||||
|         return | ||||
|  | ||||
|     # archive files are optional, archive checksum tells us if we have one, | ||||
|     # since this is None for documents without archived files. | ||||
|     if instance.archive_checksum: | ||||
|         new_archive_filename = archive_name_from_filename(new_filename) | ||||
|         old_archive_path = instance.archive_path | ||||
|         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||
|                                         new_archive_filename) | ||||
|  | ||||
|         if not validate_move(instance, old_archive_path, new_archive_path): | ||||
|         if new_filename == instance.filename: | ||||
|             # Don't do anything if it's the same. | ||||
|             return | ||||
|  | ||||
|         create_source_path_directory(new_archive_path) | ||||
|     else: | ||||
|         old_archive_path = None | ||||
|         new_archive_path = None | ||||
|         old_source_path = instance.source_path | ||||
|         new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|  | ||||
|     create_source_path_directory(new_source_path) | ||||
|         if not validate_move(instance, old_source_path, new_source_path): | ||||
|             return | ||||
|  | ||||
|     try: | ||||
|         os.rename(old_source_path, new_source_path) | ||||
|         # archive files are optional, archive checksum tells us if we have one, | ||||
|         # since this is None for documents without archived files. | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(old_archive_path, new_archive_path) | ||||
|         instance.filename = new_filename | ||||
|         # Don't save here to prevent infinite recursion. | ||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||
|             new_archive_filename = archive_name_from_filename(new_filename) | ||||
|             old_archive_path = instance.archive_path | ||||
|             new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||
|                                             new_archive_filename) | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             f"Moved file {old_source_path} to {new_source_path}.") | ||||
|             if not validate_move(instance, old_archive_path, new_archive_path): | ||||
|                 return | ||||
|  | ||||
|         if instance.archive_checksum: | ||||
|             logging.getLogger(__name__).debug( | ||||
|                 f"Moved file {old_archive_path} to {new_archive_path}.") | ||||
|             create_source_path_directory(new_archive_path) | ||||
|         else: | ||||
|             old_archive_path = None | ||||
|             new_archive_path = None | ||||
|  | ||||
|         create_source_path_directory(new_source_path) | ||||
|  | ||||
|     except OSError as e: | ||||
|         instance.filename = old_filename | ||||
|         # this happens when we can't move a file. If that's the case for the | ||||
|         # archive file, we try our best to revert the changes. | ||||
|         try: | ||||
|             os.rename(old_source_path, new_source_path) | ||||
|             if instance.archive_checksum: | ||||
|                 os.rename(old_archive_path, new_archive_path) | ||||
|             instance.filename = new_filename | ||||
|  | ||||
|             # Don't save() here to prevent infinite recursion. | ||||
|             Document.objects.filter(pk=instance.pk).update( | ||||
|                 filename=new_filename) | ||||
|  | ||||
|             logging.getLogger(__name__).debug( | ||||
|                 f"Moved file {old_source_path} to {new_source_path}.") | ||||
|  | ||||
|             if instance.archive_checksum: | ||||
|                 logging.getLogger(__name__).debug( | ||||
|                     f"Moved file {old_archive_path} to {new_archive_path}.") | ||||
|  | ||||
|         except OSError as e: | ||||
|             instance.filename = old_filename | ||||
|             # this happens when we can't move a file. If that's the case for | ||||
|             # the archive file, we try our best to revert the changes. | ||||
|             # no need to save the instance, the update() has not happened yet. | ||||
|             try: | ||||
|                 os.rename(new_source_path, old_source_path) | ||||
|                 os.rename(new_archive_path, old_archive_path) | ||||
|             except Exception as e: | ||||
|                 # This is fine, since: | ||||
|                 # A: if we managed to move source from A to B, we will also | ||||
|                 #  manage to move it from B to A. If not, we have a serious | ||||
|                 #  issue that's going to get caught by the sanity checker. | ||||
|                 #  All files remain in place and will never be overwritten, | ||||
|                 #  so this is not the end of the world. | ||||
|                 # B: if moving the original file failed, nothing has changed | ||||
|                 #  anyway. | ||||
|                 pass | ||||
|         except DatabaseError as e: | ||||
|             # this happens after moving files, so move them back into place. | ||||
|             # since moving them once succeeded, it's very likely going to | ||||
|             # succeed again. | ||||
|             os.rename(new_source_path, old_source_path) | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         except Exception as e: | ||||
|             # This is fine, since: | ||||
|             # A: if we managed to move source from A to B, we will also manage | ||||
|             #  to move it from B to A. If not, we have a serious issue | ||||
|             #  that's going to get caught by the sanity checker. | ||||
|             #  all files remain in place and will never be overwritten, | ||||
|             #  so this is not the end of the world. | ||||
|             # B: if moving the original file failed, nothing has changed anyway. | ||||
|             pass | ||||
|     except DatabaseError as e: | ||||
|         os.rename(new_source_path, old_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         instance.filename = old_filename | ||||
|             if instance.archive_checksum: | ||||
|                 os.rename(new_archive_path, old_archive_path) | ||||
|             instance.filename = old_filename | ||||
|             # again, no need to save the instance, since the actual update() | ||||
|             # operation failed. | ||||
|  | ||||
|     if not os.path.isfile(old_source_path): | ||||
|         delete_empty_directories(os.path.dirname(old_source_path), | ||||
|                                  root=settings.ORIGINALS_DIR) | ||||
|         # finally, remove any empty sub folders. This will do nothing if | ||||
|         # something has failed above. | ||||
|         if not os.path.isfile(old_source_path): | ||||
|             delete_empty_directories(os.path.dirname(old_source_path), | ||||
|                                      root=settings.ORIGINALS_DIR) | ||||
|  | ||||
|     if old_archive_path and not os.path.isfile(old_archive_path): | ||||
|         delete_empty_directories(os.path.dirname(old_archive_path), | ||||
|                                  root=settings.ARCHIVE_DIR) | ||||
|         if old_archive_path and not os.path.isfile(old_archive_path): | ||||
|             delete_empty_directories(os.path.dirname(old_archive_path), | ||||
|                                      root=settings.ARCHIVE_DIR) | ||||
|  | ||||
|  | ||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| import logging | ||||
|  | ||||
| import tqdm | ||||
| from django.conf import settings | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| @@ -23,7 +24,7 @@ def index_reindex(): | ||||
|     ix = index.open_index(recreate=True) | ||||
|  | ||||
|     with AsyncWriter(ix) as writer: | ||||
|         for document in documents: | ||||
|         for document in tqdm.tqdm(documents): | ||||
|             index.update_document(writer, document) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
|  | ||||
| @@ -195,6 +196,24 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 3) | ||||
|  | ||||
|         response = self.client.get("/api/documents/?tags__id__none={}".format(tag_3.id)) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 2) | ||||
|         self.assertEqual(results[0]['id'], doc1.id) | ||||
|         self.assertEqual(results[1]['id'], doc2.id) | ||||
|  | ||||
|         response = self.client.get("/api/documents/?tags__id__none={},{}".format(tag_3.id, tag_2.id)) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 1) | ||||
|         self.assertEqual(results[0]['id'], doc1.id) | ||||
|  | ||||
|         response = self.client.get("/api/documents/?tags__id__none={},{}".format(tag_2.id, tag_inbox.id)) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 0) | ||||
|  | ||||
|     def test_search_no_query(self): | ||||
|         response = self.client.get("/api/search/") | ||||
|         results = response.data['results'] | ||||
| @@ -475,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     def test_get_metadata(self): | ||||
|         doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A") | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), doc.source_path) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.archive_path) | ||||
|  | ||||
|         response = self.client.get(f"/api/documents/{doc.pk}/metadata/") | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         meta = response.data | ||||
|  | ||||
|         self.assertEqual(meta['original_mime_type'], "image/png") | ||||
|         self.assertTrue(meta['has_archive_version']) | ||||
|         self.assertEqual(len(meta['original_metadata']), 0) | ||||
|         self.assertGreater(len(meta['archive_metadata']), 0) | ||||
|  | ||||
|     def test_get_metadata_no_archive(self): | ||||
|         doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf") | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.source_path) | ||||
|  | ||||
|         response = self.client.get(f"/api/documents/{doc.pk}/metadata/") | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         meta = response.data | ||||
|  | ||||
|         self.assertEqual(meta['original_mime_type'], "application/pdf") | ||||
|         self.assertFalse(meta['has_archive_version']) | ||||
|         self.assertGreater(len(meta['original_metadata']), 0) | ||||
|         self.assertIsNone(meta['archive_metadata']) | ||||
|   | ||||
| @@ -27,7 +27,7 @@ class TestAttributes(TestCase): | ||||
|  | ||||
|         self.assertEqual(file_info.title, title, filename) | ||||
|  | ||||
|         self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename) | ||||
|         self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename) | ||||
|  | ||||
|     def test_guess_attributes_from_name0(self): | ||||
|         self._test_guess_attributes_from_name( | ||||
| @@ -188,7 +188,7 @@ class TestFieldPermutations(TestCase): | ||||
|             self.assertEqual(info.tags, (), filename) | ||||
|         else: | ||||
|             self.assertEqual( | ||||
|                 [t.slug for t in info.tags], tags.split(','), | ||||
|                 [t.name for t in info.tags], tags.split(','), | ||||
|                 filename | ||||
|             ) | ||||
|  | ||||
| @@ -342,8 +342,8 @@ class TestFieldPermutations(TestCase): | ||||
|             info = FileInfo.from_filename(filename) | ||||
|             self.assertEqual(info.title, "0001") | ||||
|             self.assertEqual(len(info.tags), 2) | ||||
|             self.assertEqual(info.tags[0].slug, "tag1") | ||||
|             self.assertEqual(info.tags[1].slug, "tag2") | ||||
|             self.assertEqual(info.tags[0].name, "tag1") | ||||
|             self.assertEqual(info.tags[1].name, "tag2") | ||||
|             self.assertIsNone(info.created) | ||||
|  | ||||
|         # Complex transformation with date in replacement string | ||||
| @@ -356,8 +356,8 @@ class TestFieldPermutations(TestCase): | ||||
|             info = FileInfo.from_filename(filename) | ||||
|             self.assertEqual(info.title, "0001") | ||||
|             self.assertEqual(len(info.tags), 2) | ||||
|             self.assertEqual(info.tags[0].slug, "tag1") | ||||
|             self.assertEqual(info.tags[1].slug, "tag2") | ||||
|             self.assertEqual(info.tags[0].name, "tag1") | ||||
|             self.assertEqual(info.tags[1].name, "tag2") | ||||
|             self.assertEqual(info.created.year, 2019) | ||||
|             self.assertEqual(info.created.month, 9) | ||||
|             self.assertEqual(info.created.day, 8) | ||||
| @@ -598,10 +598,10 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "bank/new-docs-0000001.pdf") | ||||
|         self.assertEqual(document.filename, "Bank/new docs.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.generate_filename") | ||||
|     @mock.patch("documents.signals.handlers.generate_unique_filename") | ||||
|     def testFilenameHandlingUnstableFormat(self, m): | ||||
|  | ||||
|         filenames = ["this", "that", "now this", "i cant decide"] | ||||
| @@ -611,7 +611,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|             filenames.insert(0, f) | ||||
|             return f | ||||
|  | ||||
|         m.side_effect = lambda f: get_filename() | ||||
|         m.side_effect = lambda f, root: get_filename() | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
|  | ||||
|   | ||||
| @@ -48,19 +48,19 @@ class TestDocument(TestCase): | ||||
|     def test_file_name(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.file_name, "20201225-test.pdf") | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf") | ||||
|  | ||||
|     def test_file_name_jpg(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.file_name, "20201225-test.jpg") | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg") | ||||
|  | ||||
|     def test_file_name_unknown(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.file_name, "20201225-test.zip") | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip") | ||||
|  | ||||
|     def test_file_name_invalid(self): | ||||
|     def test_file_name_invalid_type(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.file_name, "20201225-test") | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test") | ||||
|   | ||||
| @@ -1,5 +1,8 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import os | ||||
| import shutil | ||||
| import random | ||||
| import uuid | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| @@ -8,7 +11,8 @@ from django.db import DatabaseError | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \ | ||||
|     generate_unique_filename | ||||
| from ..models import Document, Correspondent | ||||
|  | ||||
|  | ||||
| @@ -40,13 +44,13 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         document.filename = generate_filename(document) | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|         self.assertEqual(document.filename, "none/none.pdf") | ||||
|  | ||||
|         # Enable encryption and check again | ||||
|         document.storage_type = Document.STORAGE_TYPE_GPG | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, | ||||
|                          "none/none-{:07d}.pdf.gpg".format(document.pk)) | ||||
|                          "none/none.pdf.gpg") | ||||
|  | ||||
|         document.save() | ||||
|  | ||||
| @@ -62,7 +66,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test.pdf.gpg"), True) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
|     def test_file_renaming_missing_permissions(self): | ||||
| @@ -74,12 +78,12 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Ensure that filename is properly generated | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, | ||||
|                          "none/none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none/none.pdf") | ||||
|         create_source_path_directory(document.source_path) | ||||
|         Path(document.source_path).touch() | ||||
|  | ||||
|         # Test source_path | ||||
|         self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)) | ||||
|         self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none.pdf") | ||||
|  | ||||
|         # Make the folder read- and execute-only (no writing and no renaming) | ||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o555) | ||||
| @@ -89,8 +93,8 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True) | ||||
|         self.assertEqual(document.filename, "none/none.pdf") | ||||
|  | ||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||
|  | ||||
| @@ -108,7 +112,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Ensure that filename is properly generated | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, | ||||
|                          "none/none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none/none.pdf") | ||||
|         create_source_path_directory(document.source_path) | ||||
|         Path(document.source_path).touch() | ||||
|  | ||||
| @@ -125,8 +129,8 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|             # Check proper handling of files | ||||
|             self.assertTrue(os.path.isfile(document.source_path)) | ||||
|             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True) | ||||
|             self.assertEqual(document.filename, "none/none.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
|     def test_document_delete(self): | ||||
| @@ -138,7 +142,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Ensure that filename is properly generated | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, | ||||
|                          "none/none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none/none.pdf") | ||||
|  | ||||
|         create_source_path_directory(document.source_path) | ||||
|         Path(document.source_path).touch() | ||||
| @@ -146,7 +150,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Ensure file deletion after delete | ||||
|         pk = document.pk | ||||
|         document.delete() | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
| @@ -168,7 +172,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         # Ensure that filename is properly generated | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, | ||||
|                          "none/none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none/none.pdf") | ||||
|  | ||||
|         create_source_path_directory(document.source_path) | ||||
|  | ||||
| @@ -199,7 +203,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(generate_filename(document), | ||||
|                          "demo-{:07d}.pdf".format(document.pk)) | ||||
|                          "demo.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||
|     def test_tags_with_dash(self): | ||||
| @@ -215,7 +219,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(generate_filename(document), | ||||
|                          "demo-{:07d}.pdf".format(document.pk)) | ||||
|                          "demo.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||
|     def test_tags_malformed(self): | ||||
| @@ -231,7 +235,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(generate_filename(document), | ||||
|                          "none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") | ||||
|     def test_tags_all(self): | ||||
| @@ -246,7 +250,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(generate_filename(document), | ||||
|                          "demo-{:07d}.pdf".format(document.pk)) | ||||
|                          "demo.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}") | ||||
|     def test_tags_out_of_bounds(self): | ||||
| @@ -261,7 +265,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         self.assertEqual(generate_filename(document), | ||||
|                          "none-{:07d}.pdf".format(document.pk)) | ||||
|                          "none.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}") | ||||
|     def test_nested_directory_cleanup(self): | ||||
| @@ -272,7 +276,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         # Ensure that filename is properly generated | ||||
|         document.filename = generate_filename(document) | ||||
|         self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk)) | ||||
|         self.assertEqual(document.filename, "none/none/none.pdf") | ||||
|         create_source_path_directory(document.source_path) | ||||
|         Path(document.source_path).touch() | ||||
|  | ||||
| @@ -282,7 +286,7 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|         pk = document.pk | ||||
|         document.delete() | ||||
|  | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none.pdf"), False) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True) | ||||
| @@ -330,6 +334,48 @@ class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{title}") | ||||
|     def test_duplicates(self): | ||||
|         """ | ||||
|         Check file naming for two documents that share the same title. | ||||
|  | ||||
|         With the format "{title}", the first saved document is expected to be | ||||
|         named "qwe.pdf" and the second one "qwe_01.pdf". Saving either of | ||||
|         them again must keep those names. After the first document is | ||||
|         deleted, saving the second one is expected to drop the "_01" suffix. | ||||
|         """ | ||||
|         document = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="A", pk=1) | ||||
|         document2 = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="B", pk=2) | ||||
|         # Create empty files at the paths both documents currently resolve to. | ||||
|         Path(document.source_path).touch() | ||||
|         Path(document2.source_path).touch() | ||||
|         # Assign the initial file name; the assertions below expect save() to | ||||
|         # replace it with a name derived from the title. | ||||
|         document.filename = "0000001.pdf" | ||||
|         document.save() | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(document.filename, "qwe.pdf") | ||||
|  | ||||
|         document2.filename = "0000002.pdf" | ||||
|         document2.save() | ||||
|  | ||||
|         # The first document's file must still be present after the second | ||||
|         # document with the same title has been saved. | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(document2.filename, "qwe_01.pdf") | ||||
|  | ||||
|         # saving should not change the file names. | ||||
|  | ||||
|         document.save() | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(document.filename, "qwe.pdf") | ||||
|  | ||||
|         document2.save() | ||||
|  | ||||
|         # NOTE(review): this checks document.source_path although document2 | ||||
|         # was just saved - confirm whether document2.source_path was intended. | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(document2.filename, "qwe_01.pdf") | ||||
|  | ||||
|         document.delete() | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(document.source_path)) | ||||
|  | ||||
|         # filename free, should remove _01 suffix | ||||
|  | ||||
|         document2.save() | ||||
|  | ||||
|         # NOTE(review): document was deleted above, yet its source_path is | ||||
|         # asserted to exist again - presumably because document2 now occupies | ||||
|         # that path; confirm whether document2.source_path was intended. | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(document2.filename, "qwe.pdf") | ||||
|  | ||||
|  | ||||
|  | ||||
| class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|  | ||||
| @@ -358,15 +404,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||
|         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||
|         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc.pdf")) | ||||
|         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         #Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
| @@ -381,7 +426,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||
|         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||
|         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
| @@ -485,3 +530,44 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
| class TestFilenameGeneration(TestCase): | ||||
|     """Tests for the file names produced by generate_filename().""" | ||||
|  | ||||
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{title}" | ||||
|     ) | ||||
|     def test_invalid_characters(self): | ||||
|         """ | ||||
|         Titles are turned into file names; the second case expects the | ||||
|         backslash, slashes and colon of the title to appear as dashes. | ||||
|         """ | ||||
|  | ||||
|         # Dots and spaces in the title are expected to be kept as they are. | ||||
|         doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1") | ||||
|         self.assertEqual(generate_filename(doc), "This. is the title.pdf") | ||||
|  | ||||
|         # Path separators and ":" must not survive in the generated name. | ||||
|         doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay.pdf") | ||||
|  | ||||
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{created}" | ||||
|     ) | ||||
|     def test_date(self): | ||||
|         """ | ||||
|         The "{created}" placeholder is expected to render as the date part | ||||
|         only (YYYY-MM-DD), without the time of day. | ||||
|         """ | ||||
|         doc = Document.objects.create(title="does not matter", created=datetime.datetime(2020,5,21, 7,36,51, 153), mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "2020-05-21.pdf") | ||||
|  | ||||
|  | ||||
| def run(): | ||||
|     """ | ||||
|     Create a document with random title and content, then save it repeatedly. | ||||
|  | ||||
|     Random text is written to the document's source and archive paths and the | ||||
|     MD5 digests of both files are stored on the document before it is saved. | ||||
|     Afterwards the title is set to a random number from 1 to 4 and the | ||||
|     document is saved again, 30 times in total. | ||||
|  | ||||
|     NOTE(review): nothing in this view calls this function - presumably a | ||||
|     helper for exercising file renaming by hand or from several workers; | ||||
|     confirm against its callers. | ||||
|     """ | ||||
|     doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow") | ||||
|     # The file name is assigned before any of the *_path attributes are used. | ||||
|     doc.filename = generate_unique_filename(doc, settings.ORIGINALS_DIR) | ||||
|     Path(doc.thumbnail_path).touch() | ||||
|     # Write random content to the original, then read it back to obtain the | ||||
|     # checksum of exactly what is on disk. | ||||
|     with open(doc.source_path, "w") as f: | ||||
|         f.write(str(uuid.uuid4())) | ||||
|     with open(doc.source_path, "rb") as f: | ||||
|         doc.checksum = hashlib.md5(f.read()).hexdigest() | ||||
|  | ||||
|     # Same procedure for the archive version. | ||||
|     with open(doc.archive_path, "w") as f: | ||||
|         f.write(str(uuid.uuid4())) | ||||
|     with open(doc.archive_path, "rb") as f: | ||||
|         doc.archive_checksum = hashlib.md5(f.read()).hexdigest() | ||||
|  | ||||
|     doc.save() | ||||
|  | ||||
|     # randrange(1, 5) yields 1..4, so titles repeat often across iterations. | ||||
|     for i in range(30): | ||||
|         doc.title = str(random.randrange(1, 5)) | ||||
|         doc.save() | ||||
|   | ||||
| @@ -16,25 +16,23 @@ sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
| class TestArchiver(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||
|         #self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||
|         #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||
|         return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf") | ||||
|  | ||||
|     def test_archiver(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|         doc = self.make_models() | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf")) | ||||
|  | ||||
|         call_command('document_archiver') | ||||
|  | ||||
|     def test_handle_document(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|         doc = self.make_models() | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf")) | ||||
|  | ||||
|         handle_document(self.d1.pk) | ||||
|         handle_document(doc.pk) | ||||
|  | ||||
|         doc = Document.objects.get(id=self.d1.id) | ||||
|         doc = Document.objects.get(id=doc.id) | ||||
|  | ||||
|         self.assertIsNotNone(doc.checksum) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|   | ||||
| @@ -230,7 +230,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase): | ||||
|  | ||||
|         tag_names = ("existingTag", "Space Tag") | ||||
|         # Create a Tag prior to consuming a file using it in path | ||||
|         tag_ids = [Tag.objects.create(name=tag_names[0]).pk,] | ||||
|         tag_ids = [Tag.objects.create(name="existingtag").pk,] | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|   | ||||
| @@ -35,20 +35,20 @@ class TestDecryptDocuments(TestCase): | ||||
|             PASSPHRASE="test" | ||||
|         ).enable() | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg")) | ||||
|         doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg",  mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|  | ||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg")) | ||||
|  | ||||
|         call_command('decrypt_documents') | ||||
|  | ||||
|         doc = Document.objects.get(id=2) | ||||
|         doc.refresh_from_db() | ||||
|  | ||||
|         self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED) | ||||
|         self.assertEqual(doc.filename, "0000002.pdf") | ||||
|         self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf"))) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png"))) | ||||
|         self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png"))) | ||||
|         self.assertTrue(os.path.isfile(doc.thumbnail_path)) | ||||
|  | ||||
|         with doc.source_file as f: | ||||
|   | ||||
| @@ -24,13 +24,14 @@ class TestExportImport(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||
|  | ||||
|         Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||
|         Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", mime_type="application/pdf") | ||||
|         Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         Tag.objects.create(name="t") | ||||
|         DocumentType.objects.create(name="dt") | ||||
|         Correspondent.objects.create(name="c") | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|         self.addCleanup(shutil.rmtree, target) | ||||
|  | ||||
|         call_command('document_exporter', target) | ||||
|  | ||||
| @@ -66,6 +67,6 @@ class TestExportImport(DirectoriesMixin, TestCase): | ||||
|     def test_export_missing_files(self): | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|         call_command('document_exporter', target) | ||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||
|         self.addCleanup(shutil.rmtree, target) | ||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", mime_type="application/pdf") | ||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||
|   | ||||
| @@ -40,6 +40,7 @@ from .filters import ( | ||||
|     LogFilterSet | ||||
| ) | ||||
| from .models import Correspondent, Document, Log, Tag, DocumentType | ||||
| from .parsers import get_parser_class_for_mime_type | ||||
| from .serialisers import ( | ||||
|     CorrespondentSerializer, | ||||
|     DocumentSerializer, | ||||
| @@ -151,11 +152,11 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         doc = Document.objects.get(id=pk) | ||||
|         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||
|             file_handle = doc.archive_file | ||||
|             filename = doc.archive_file_name | ||||
|             filename = doc.get_public_filename(archive=True) | ||||
|             mime_type = 'application/pdf' | ||||
|         else: | ||||
|             file_handle = doc.source_file | ||||
|             filename = doc.file_name | ||||
|             filename = doc.get_public_filename() | ||||
|             mime_type = doc.mime_type | ||||
|  | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||
| @@ -166,17 +167,43 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|             disposition, filename) | ||||
|         return response | ||||
|  | ||||
|     def get_metadata(self, file, mime_type): | ||||
|         if not os.path.isfile(file): | ||||
|             return None | ||||
|  | ||||
|         parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|         if parser_class: | ||||
|             parser = parser_class(logging_group=None) | ||||
|             return parser.extract_metadata(file, mime_type) | ||||
|         else: | ||||
|             return [] | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def metadata(self, request, pk=None): | ||||
|         try: | ||||
|             doc = Document.objects.get(pk=pk) | ||||
|             return Response({ | ||||
|                 "paperless__checksum": doc.checksum, | ||||
|                 "paperless__mime_type": doc.mime_type, | ||||
|                 "paperless__filename": doc.filename, | ||||
|                 "paperless__has_archive_version": | ||||
|                     os.path.isfile(doc.archive_path) | ||||
|             }) | ||||
|  | ||||
|             meta = { | ||||
|                 "original_checksum": doc.checksum, | ||||
|                 "original_size": os.stat(doc.source_path).st_size, | ||||
|                 "original_mime_type": doc.mime_type, | ||||
|                 "media_filename": doc.filename, | ||||
|                 "has_archive_version": os.path.isfile(doc.archive_path), | ||||
|                 "original_metadata": self.get_metadata( | ||||
|                     doc.source_path, doc.mime_type) | ||||
|             } | ||||
|  | ||||
|             if doc.archive_checksum and os.path.isfile(doc.archive_path): | ||||
|                 meta['archive_checksum'] = doc.archive_checksum | ||||
|                 meta['archive_size'] = os.stat(doc.archive_path).st_size, | ||||
|                 meta['archive_metadata'] = self.get_metadata( | ||||
|                     doc.archive_path, "application/pdf") | ||||
|             else: | ||||
|                 meta['archive_checksum'] = None | ||||
|                 meta['archive_size'] = None | ||||
|                 meta['archive_metadata'] = None | ||||
|  | ||||
|             return Response(meta) | ||||
|         except Document.DoesNotExist: | ||||
|             raise Http404() | ||||
|  | ||||
| @@ -263,12 +290,11 @@ class PostDocumentView(APIView): | ||||
|         serializer = self.get_serializer(data=request.data) | ||||
|         serializer.is_valid(raise_exception=True) | ||||
|  | ||||
|         document = serializer.validated_data['document'] | ||||
|         document_data = serializer.validated_data['document_data'] | ||||
|         correspondent_id = serializer.validated_data['correspondent_id'] | ||||
|         document_type_id = serializer.validated_data['document_type_id'] | ||||
|         tag_ids = serializer.validated_data['tag_ids'] | ||||
|         title = serializer.validated_data['title'] | ||||
|         doc_name, doc_data = serializer.validated_data.get('document') | ||||
|         correspondent_id = serializer.validated_data.get('correspondent') | ||||
|         document_type_id = serializer.validated_data.get('document_type') | ||||
|         tag_ids = serializer.validated_data.get('tags') | ||||
|         title = serializer.validated_data.get('title') | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
| @@ -277,17 +303,17 @@ class PostDocumentView(APIView): | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", | ||||
|                                          dir=settings.SCRATCH_DIR, | ||||
|                                          delete=False) as f: | ||||
|             f.write(document_data) | ||||
|             f.write(doc_data) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", | ||||
|                        f.name, | ||||
|                        override_filename=document.name, | ||||
|                        override_filename=doc_name, | ||||
|                        override_title=title, | ||||
|                        override_correspondent_id=correspondent_id, | ||||
|                        override_document_type_id=document_type_id, | ||||
|                        override_tag_ids=tag_ids, | ||||
|                        task_name=os.path.basename(document.name)[:100]) | ||||
|                        task_name=os.path.basename(doc_name)[:100]) | ||||
|         return Response("OK") | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -53,6 +53,10 @@ ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||
|  | ||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||
|  | ||||
| # Lock file for synchronizing changes to the MEDIA directory across multiple | ||||
| # threads. | ||||
| MEDIA_LOCK = os.path.join(MEDIA_ROOT, "media.lock") | ||||
| INDEX_DIR = os.path.join(DATA_DIR, "index") | ||||
| MODEL_FILE = os.path.join(DATA_DIR, "classification_model.pickle") | ||||
|  | ||||
|   | ||||
| @@ -1 +1 @@ | ||||
| __version__ = (0, 9, 5) | ||||
| __version__ = (0, 9, 6) | ||||
|   | ||||
| @@ -103,10 +103,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|     def _correspondent_from_name(self, name): | ||||
|         try: | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=name, defaults={ | ||||
|                     "slug": slugify(name) | ||||
|                 })[0] | ||||
|             return Correspondent.objects.get_or_create(name=name)[0] | ||||
|         except DatabaseError as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|   | ||||
| @@ -5,6 +5,7 @@ import subprocess | ||||
|  | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| import pikepdf | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from ocrmypdf import InputFileError, EncryptedPdfError | ||||
| @@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|         namespace_pattern = re.compile(r"\{(.*)\}(.*)") | ||||
|  | ||||
|         result = [] | ||||
|         if mime_type == 'application/pdf': | ||||
|             pdf = pikepdf.open(document_path) | ||||
|             meta = pdf.open_metadata() | ||||
|             for key, value in meta.items(): | ||||
|                 if isinstance(value, list): | ||||
|                     value = " ".join([str(e) for e in value]) | ||||
|                 value = str(value) | ||||
|                 try: | ||||
|                     m = namespace_pattern.match(key) | ||||
|                     result.append({ | ||||
|                         "namespace": m.group(1), | ||||
|                         "prefix": meta.REVERSE_NS[m.group(1)], | ||||
|                         "key": m.group(2), | ||||
|                         "value": value | ||||
|                     }) | ||||
|                 except Exception as e: | ||||
|                     self.log( | ||||
|                         "warning", | ||||
|                         f"Error while reading metadata {key}: {value}. Error: " | ||||
|                         f"{e}" | ||||
|                     ) | ||||
|         return result | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler