Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-07-28 18:24:38 -05:00)
Merge branch 'dev' into feature-bulk-edit
@@ -17,8 +17,6 @@ class CorrespondentAdmin(admin.ModelAdmin):
 
     list_filter = ("matching_algorithm",)
     list_editable = ("match", "matching_algorithm")
 
-    readonly_fields = ("slug",)
-
 
 class TagAdmin(admin.ModelAdmin):
 
@@ -31,8 +29,6 @@ class TagAdmin(admin.ModelAdmin):
 
     list_filter = ("colour", "matching_algorithm")
     list_editable = ("colour", "match", "matching_algorithm")
 
-    readonly_fields = ("slug", )
-
 
 class DocumentTypeAdmin(admin.ModelAdmin):
 
@@ -44,13 +40,16 @@ class DocumentTypeAdmin(admin.ModelAdmin):
 
     list_filter = ("matching_algorithm",)
     list_editable = ("match", "matching_algorithm")
 
-    readonly_fields = ("slug",)
-
 
 class DocumentAdmin(admin.ModelAdmin):
 
     search_fields = ("correspondent__name", "title", "content", "tags__name")
-    readonly_fields = ("added", "mime_type", "storage_type", "filename")
+    readonly_fields = (
+        "added",
+        "modified",
+        "mime_type",
+        "storage_type",
+        "filename")
 
     list_display_links = ("title",)
 
@@ -101,7 +100,7 @@ class DocumentAdmin(admin.ModelAdmin):
 
         for tag in obj.tags.all():
             r += self._html_tag(
                 "span",
-                tag.slug + ", "
+                tag.name + ", "
             )
         return r
@@ -8,13 +8,14 @@ from django.conf import settings
 from django.db import transaction
 from django.db.models import Q
 from django.utils import timezone
+from filelock import FileLock
 
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
-from .file_handling import create_source_path_directory
+from .file_handling import create_source_path_directory, \
+    generate_unique_filename
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class_for_mime_type, \
-    get_supported_file_extensions, parse_date
+from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
 from .signals import (
     document_consumption_finished,
     document_consumption_started

@@ -38,6 +39,10 @@ class Consumer(LoggingMixin):
 
     def pre_check_file_exists(self):
         if not os.path.isfile(self.path):
+            self.log(
+                "error",
+                "Cannot consume {}: It is not a file.".format(self.path)
+            )
             raise ConsumerError("Cannot consume {}: It is not a file".format(
                 self.path))

@@ -47,6 +52,10 @@ class Consumer(LoggingMixin):
         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
             if settings.CONSUMER_DELETE_DUPLICATES:
                 os.unlink(self.path)
+            self.log(
+                "error",
+                "Not consuming {}: It is a duplicate.".format(self.filename)
+            )
             raise ConsumerError(
                 "Not consuming {}: It is a duplicate.".format(self.filename)
             )

@@ -148,8 +157,9 @@ class Consumer(LoggingMixin):
             classifier = DocumentClassifier()
             classifier.reload()
         except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning(
-                "Cannot classify documents: {}.".format(e))
+            self.log(
+                "warning",
+                f"Cannot classify documents: {e}.")
             classifier = None
 
         # now that everything is done, we can start to store the document

@@ -176,31 +186,28 @@ class Consumer(LoggingMixin):
 
             # After everything is in the database, copy the files into
            # place. If this fails, we'll also rollback the transaction.
-
-            # TODO: not required, since this is done by the file handling
-            #  logic
-            create_source_path_directory(document.source_path)
-
-            self._write(document.storage_type,
-                        self.path, document.source_path)
-
-            self._write(document.storage_type,
-                        thumbnail, document.thumbnail_path)
-
-            if archive_path and os.path.isfile(archive_path):
-                self._write(document.storage_type,
-                            archive_path, document.archive_path)
-
-                with open(archive_path, 'rb') as f:
-                    document.archive_checksum = hashlib.md5(
-                        f.read()).hexdigest()
-                    document.save()
-
-            # After performing all database operations and moving files
-            # into place, tell paperless where the file is.
-            document.filename = os.path.basename(document.source_path)
-            # Saving the document now will trigger the filename handling
-            # logic.
+            with FileLock(settings.MEDIA_LOCK):
+                document.filename = generate_unique_filename(
+                    document, settings.ORIGINALS_DIR)
+                create_source_path_directory(document.source_path)
+
+                self._write(document.storage_type,
+                            self.path, document.source_path)
+
+                self._write(document.storage_type,
+                            thumbnail, document.thumbnail_path)
+
+                if archive_path and os.path.isfile(archive_path):
+                    create_source_path_directory(document.archive_path)
+                    self._write(document.storage_type,
+                                archive_path, document.archive_path)
+
+                    with open(archive_path, 'rb') as f:
+                        document.archive_checksum = hashlib.md5(
+                            f.read()).hexdigest()
+
+            # Don't save with the lock active. Saving will cause the file
+            # renaming logic to acquire the lock as well.
             document.save()
 
             # Delete the file only if it was successfully consumed

@@ -241,7 +248,7 @@ class Consumer(LoggingMixin):
         with open(self.path, "rb") as f:
             document = Document.objects.create(
                 correspondent=file_info.correspondent,
-                title=file_info.title,
+                title=(self.override_title or file_info.title)[:127],
                 content=text,
                 mime_type=mime_type,
                 checksum=hashlib.md5(f.read()).hexdigest(),

@@ -252,18 +259,17 @@ class Consumer(LoggingMixin):
 
         relevant_tags = set(file_info.tags)
         if relevant_tags:
-            tag_names = ", ".join([t.slug for t in relevant_tags])
+            tag_names = ", ".join([t.name for t in relevant_tags])
             self.log("debug", "Tagging with {}".format(tag_names))
             document.tags.add(*relevant_tags)
 
         self.apply_overrides(document)
 
         document.save()
 
         return document
 
     def apply_overrides(self, document):
-        if self.override_title:
-            document.title = self.override_title
-
         if self.override_correspondent_id:
             document.correspondent = Correspondent.objects.get(
                 pk=self.override_correspondent_id)
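The duplicate check in the hunk above reduces to a single query: a file is a duplicate if its MD5 digest matches either the checksum of an original or of an archived version. A condensed sketch of that check, assuming the Document model from this codebase is importable:

    import hashlib

    from django.db.models import Q

    from documents.models import Document


    def is_duplicate(path):
        # Hash the incoming file and look for the digest in both
        # checksum columns (original and archived version).
        with open(path, "rb") as f:
            digest = hashlib.md5(f.read()).hexdigest()
        return Document.objects.filter(
            Q(checksum=digest) | Q(archive_checksum=digest)).exists()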
@@ -1,7 +1,9 @@
+import datetime
 import logging
 import os
 from collections import defaultdict
 
+import pathvalidate
 from django.conf import settings
 from django.template.defaultfilters import slugify

@@ -68,21 +70,53 @@ def many_to_dictionary(field):
     return mydictionary
 
 
-def generate_filename(doc):
+def generate_unique_filename(doc, root):
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(doc, counter)
+        if new_filename == doc.filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0):
     path = ""
 
     try:
         if settings.PAPERLESS_FILENAME_FORMAT is not None:
             tags = defaultdict(lambda: slugify(None),
                                many_to_dictionary(doc.tags))
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
             path = settings.PAPERLESS_FILENAME_FORMAT.format(
-                correspondent=slugify(doc.correspondent),
-                title=slugify(doc.title),
-                created=slugify(doc.created),
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
                 created_year=doc.created.year if doc.created else "none",
                 created_month=doc.created.month if doc.created else "none",
                 created_day=doc.created.day if doc.created else "none",
-                added=slugify(doc.added),
+                added=datetime.date.isoformat(doc.added),
                 added_year=doc.added.year if doc.added else "none",
                 added_month=doc.added.month if doc.added else "none",
                 added_day=doc.added.day if doc.added else "none",

@@ -93,11 +127,11 @@ def generate_filename(doc):
             f"Invalid PAPERLESS_FILENAME_FORMAT: "
             f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
 
-    # Always append the primary key to guarantee uniqueness of filename
+    counter_str = f"_{counter:02}" if counter else ""
     if len(path) > 0:
-        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
+        filename = f"{path}{counter_str}{doc.file_type}"
     else:
-        filename = "%07i%s" % (doc.pk, doc.file_type)
+        filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
 
     # Append .gpg for encrypted files
     if doc.storage_type == doc.STORAGE_TYPE_GPG:
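The counter scheme introduced here is the core of the change: generate_unique_filename keeps the current name if it is still correct, and otherwise probes _01, _02, ... until the name is unused. A minimal standalone sketch of the same idea (plain Python, no Django; the names are illustrative, not part of the commit):

    import os


    def unique_name(base, ext, root, current=None):
        """Return base+ext, or base_01+ext, base_02+ext, ... until the name
        is unused inside root. A name that already matches `current` is
        kept, so unchanged documents are not renamed needlessly."""
        counter = 0
        while True:
            suffix = f"_{counter:02}" if counter else ""
            candidate = f"{base}{suffix}{ext}"
            if candidate == current:
                return candidate  # still the same as before
            if os.path.exists(os.path.join(root, candidate)):
                counter += 1
            else:
                return candidate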
@@ -37,6 +37,10 @@ class DocumentTypeFilterSet(FilterSet):
 
 class TagsFilter(Filter):
 
+    def __init__(self, exclude=False):
+        super(TagsFilter, self).__init__()
+        self.exclude = exclude
+
     def filter(self, qs, value):
         if not value:
             return qs

@@ -47,7 +51,10 @@ class TagsFilter(Filter):
             return qs
 
         for tag_id in tag_ids:
-            qs = qs.filter(tags__id=tag_id)
+            if self.exclude:
+                qs = qs.exclude(tags__id=tag_id)
+            else:
+                qs = qs.filter(tags__id=tag_id)
 
         return qs

@@ -74,6 +81,8 @@ class DocumentFilterSet(FilterSet):
 
     tags__id__all = TagsFilter()
 
+    tags__id__none = TagsFilter(exclude=True)
+
     is_in_inbox = InboxFilter()
 
     class Meta:
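The net effect of the TagsFilter change is a negated counterpart to the existing all-tags filter. Its usage is easiest to see from the query strings exercised in the API tests further down (a sketch; client is assumed to be a Django/DRF test client, tag ids are illustrative):

    # documents that carry none of the listed tags:
    response = client.get("/api/documents/?tags__id__none=3,4")
    # documents that carry all of the listed tags:
    response = client.get("/api/documents/?tags__id__all=1,2")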
@@ -82,7 +82,8 @@ class Command(BaseCommand):
             with open(document.thumbnail_path, "wb") as f:
                 f.write(raw_thumb)
 
-            document.save(update_fields=("storage_type", "filename"))
+            Document.objects.filter(id=document.id).update(
+                storage_type=document.storage_type, filename=document.filename)
 
             for path in old_paths:
                 os.unlink(path)
@@ -29,10 +29,9 @@ def _tags_from_path(filepath):
     path_parts = Path(filepath).relative_to(
         settings.CONSUMPTION_DIR).parent.parts
     for part in path_parts:
-        tag_ids.add(Tag.objects.get_or_create(
-            slug=slugify(part),
-            defaults={"name": part},
-        )[0].pk)
+        tag_ids.add(Tag.objects.get_or_create(name__iexact=part, defaults={
+            "name": part
+        })[0].pk)
 
     return tag_ids
@@ -38,6 +38,9 @@ class Command(Renderable, BaseCommand):
         if not os.access(self.target, os.W_OK):
             raise CommandError("That path doesn't appear to be writable")
 
+        if os.listdir(self.target):
+            raise CommandError("That directory is not empty.")
+
         self.dump()
 
     def dump(self):

@@ -54,31 +57,39 @@ class Command(Renderable, BaseCommand):
 
             document = document_map[document_dict["pk"]]
 
-            unique_filename = f"{document.pk:07}_{document.file_name}"
-            file_target = os.path.join(self.target, unique_filename)
+            print(f"Exporting: {document}")
 
-            thumbnail_name = unique_filename + "-thumbnail.png"
+            filename_counter = 0
+            while True:
+                original_name = document.get_public_filename(
+                    counter=filename_counter)
+                original_target = os.path.join(self.target, original_name)
+
+                if not os.path.exists(original_target):
+                    break
+                else:
+                    filename_counter += 1
+
+            thumbnail_name = original_name + "-thumbnail.png"
             thumbnail_target = os.path.join(self.target, thumbnail_name)
 
-            document_dict[EXPORTER_FILE_NAME] = unique_filename
+            document_dict[EXPORTER_FILE_NAME] = original_name
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
 
             if os.path.exists(document.archive_path):
-                archive_name = \
-                    f"{document.pk:07}_archive_{document.archive_file_name}"
+                archive_name = document.get_public_filename(
+                    archive=True, counter=filename_counter, suffix="_archive")
                 archive_target = os.path.join(self.target, archive_name)
                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
             else:
                 archive_target = None
 
-            print(f"Exporting: {file_target}")
-
             t = int(time.mktime(document.created.timetuple()))
             if document.storage_type == Document.STORAGE_TYPE_GPG:
 
-                with open(file_target, "wb") as f:
+                with open(original_target, "wb") as f:
                     f.write(GnuPG.decrypted(document.source_file))
-                os.utime(file_target, times=(t, t))
+                os.utime(original_target, times=(t, t))
 
                 with open(thumbnail_target, "wb") as f:
                     f.write(GnuPG.decrypted(document.thumbnail_file))

@@ -90,7 +101,7 @@ class Command(Renderable, BaseCommand):
                 os.utime(archive_target, times=(t, t))
             else:
 
-                shutil.copy(document.source_path, file_target)
+                shutil.copy(document.source_path, original_target)
                 shutil.copy(document.thumbnail_path, thumbnail_target)
 
                 if archive_target:
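The while loop above exists because public filenames are derived from date and title, so two exported documents can collide in the target directory; the loop probes counter values until the target path is free. The same probe, reduced to its essentials (a sketch; free_export_name is a hypothetical helper, not part of the commit):

    import os


    def free_export_name(doc, target):
        # Probe counter values until the generated public filename does
        # not collide with a file already present in the export directory.
        counter = 0
        while True:
            name = doc.get_public_filename(counter=counter)
            if not os.path.exists(os.path.join(target, name)):
                return name, counter
            counter += 1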
@@ -5,11 +5,13 @@ import shutil
 from django.conf import settings
 from django.core.management import call_command
 from django.core.management.base import BaseCommand, CommandError
+from filelock import FileLock
 
 from documents.models import Document
 from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
     EXPORTER_ARCHIVE_NAME
-from ...file_handling import generate_filename, create_source_path_directory
+from ...file_handling import create_source_path_directory, \
+    generate_unique_filename
 from ...mixins import Renderable

@@ -114,17 +116,20 @@ class Command(Renderable, BaseCommand):
 
         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 
-        document.filename = generate_filename(document)
+        with FileLock(settings.MEDIA_LOCK):
+            document.filename = generate_unique_filename(
+                document, settings.ORIGINALS_DIR)
 
-        if os.path.isfile(document.source_path):
-            raise FileExistsError(document.source_path)
+            if os.path.isfile(document.source_path):
+                raise FileExistsError(document.source_path)
 
-        create_source_path_directory(document.source_path)
+            create_source_path_directory(document.source_path)
 
-        print(f"Moving {document_path} to {document.source_path}")
-        shutil.copy(document_path, document.source_path)
-        shutil.copy(thumbnail_path, document.thumbnail_path)
-        if archive_path:
-            shutil.copy(archive_path, document.archive_path)
+            print(f"Moving {document_path} to {document.source_path}")
+            shutil.copy(document_path, document.source_path)
+            shutil.copy(thumbnail_path, document.thumbnail_path)
+            if archive_path:
+                create_source_path_directory(document.archive_path)
+                shutil.copy(archive_path, document.archive_path)
 
         document.save()
@@ -1,3 +1,6 @@
+import logging
+
+import tqdm
 from django.core.management.base import BaseCommand
 
 from documents.models import Document

@@ -18,6 +21,8 @@ class Command(Renderable, BaseCommand):
 
         self.verbosity = options["verbosity"]
 
-        for document in Document.objects.all():
+        logging.getLogger().handlers[0].level = logging.ERROR
+
+        for document in tqdm.tqdm(Document.objects.all()):
             # Saving the document again will generate a new filename and rename
             document.save()
src/documents/migrations/1006_auto_20201208_2209.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+# Generated by Django 3.1.4 on 2020-12-08 22:09
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1005_checksums'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='slug',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='slug',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='slug',
+        ),
+    ]
@@ -1,10 +1,12 @@
 # coding=utf-8
 
+import datetime
 import logging
 import os
 import re
 from collections import OrderedDict
 
+import pathvalidate
 
 import dateutil.parser
 from django.conf import settings
 from django.db import models

@@ -34,7 +36,6 @@ class MatchingModel(models.Model):
     )
 
     name = models.CharField(max_length=128, unique=True)
-    slug = models.SlugField(blank=True, editable=False)
 
     match = models.CharField(max_length=256, blank=True)
     matching_algorithm = models.PositiveIntegerField(

@@ -67,7 +68,6 @@ class MatchingModel(models.Model):
     def save(self, *args, **kwargs):
 
         self.match = self.match.lower()
-        self.slug = slugify(self.name)
 
         models.Model.save(self, *args, **kwargs)

@@ -172,6 +172,7 @@ class Document(models.Model):
 
     created = models.DateTimeField(
         default=timezone.now, db_index=True)
 
+    modified = models.DateTimeField(
+        auto_now=True, editable=False, db_index=True)

@@ -206,13 +207,11 @@ class Document(models.Model):
         ordering = ("correspondent", "title")
 
     def __str__(self):
-        created = self.created.strftime("%Y%m%d")
+        created = datetime.date.isoformat(self.created)
         if self.correspondent and self.title:
-            return "{}: {} - {}".format(
-                created, self.correspondent, self.title)
-        if self.correspondent or self.title:
-            return "{}: {}".format(created, self.correspondent or self.title)
-        return str(created)
+            return f"{created} {self.correspondent} {self.title}"
+        else:
+            return f"{created} {self.title}"
 
     @property
     def source_path(self):

@@ -248,13 +247,21 @@ class Document(models.Model):
     def archive_file(self):
         return open(self.archive_path, "rb")
 
-    @property
-    def file_name(self):
-        return slugify(str(self)) + self.file_type
+    def get_public_filename(self, archive=False, counter=0, suffix=None):
+        result = str(self)
 
-    @property
-    def archive_file_name(self):
-        return slugify(str(self)) + ".pdf"
+        if counter:
+            result += f"_{counter:02}"
+
+        if suffix:
+            result += suffix
+
+        if archive:
+            result += ".pdf"
+        else:
+            result += self.file_type
+
+        return pathvalidate.sanitize_filename(result, replacement_text="-")
 
     @property
     def file_type(self):

@@ -375,9 +382,7 @@ class FileInfo:
     def _get_correspondent(cls, name):
         if not name:
             return None
-        return Correspondent.objects.get_or_create(name=name, defaults={
-            "slug": slugify(name)
-        })[0]
+        return Correspondent.objects.get_or_create(name=name)[0]
 
     @classmethod
     def _get_title(cls, title):

@@ -387,10 +392,7 @@ class FileInfo:
     def _get_tags(cls, tags):
         r = []
         for t in tags.split(","):
-            r.append(Tag.objects.get_or_create(
-                slug=slugify(t),
-                defaults={"name": t}
-            )[0])
+            r.append(Tag.objects.get_or_create(name=t)[0])
         return tuple(r)
 
     @classmethod
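How the counter, suffix and archive arguments of get_public_filename combine is easiest to see with a few concrete calls (illustrative values; str(doc) yields the ISO date plus title per the new __str__ above, and the results match the model tests further down):

    # doc: created 2020-12-25, title "test", mime type application/pdf
    doc.get_public_filename()                                 # "2020-12-25 test.pdf"
    doc.get_public_filename(counter=1)                        # "2020-12-25 test_01.pdf"
    doc.get_public_filename(archive=True, suffix="_archive")  # "2020-12-25 test_archive.pdf"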
@@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
     def __init__(self, logging_group):
         super().__init__()
         self.logging_group = logging_group
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
         self.tempdir = tempfile.mkdtemp(
             prefix="paperless-", dir=settings.SCRATCH_DIR)

@@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
         self.text = None
         self.date = None
 
+    def extract_metadata(self, document_path, mime_type):
+        return []
+
     def parse(self, document_path, mime_type):
         raise NotImplementedError()
@@ -46,6 +46,10 @@ def check_sanity():
         for f in files:
             present_files.append(os.path.normpath(os.path.join(root, f)))
 
+    lockfile = os.path.normpath(settings.MEDIA_LOCK)
+    if lockfile in present_files:
+        present_files.remove(lockfile)
+
     for doc in Document.objects.all():
         # Check sanity of the thumbnail
         if not os.path.isfile(doc.thumbnail_path):
@@ -1,17 +1,23 @@
 import magic
+from django.utils.text import slugify
 from pathvalidate import validate_filename, ValidationError
 from rest_framework import serializers
+from rest_framework.fields import SerializerMethodField
 
 from .models import Correspondent, Tag, Document, Log, DocumentType
 from .parsers import is_mime_type_supported
 
 
-class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
+class CorrespondentSerializer(serializers.ModelSerializer):
 
     document_count = serializers.IntegerField(read_only=True)
 
     last_correspondence = serializers.DateTimeField(read_only=True)
 
+    def get_slug(self, obj):
+        return slugify(obj.name)
+    slug = SerializerMethodField()
+
     class Meta:
         model = Correspondent
         fields = (

@@ -26,10 +32,14 @@ class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
     )
 
 
-class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
+class DocumentTypeSerializer(serializers.ModelSerializer):
 
     document_count = serializers.IntegerField(read_only=True)
 
+    def get_slug(self, obj):
+        return slugify(obj.name)
+    slug = SerializerMethodField()
+
     class Meta:
         model = DocumentType
         fields = (

@@ -43,10 +53,14 @@ class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
     )
 
 
-class TagSerializer(serializers.HyperlinkedModelSerializer):
+class TagSerializer(serializers.ModelSerializer):
 
     document_count = serializers.IntegerField(read_only=True)
 
+    def get_slug(self, obj):
+        return slugify(obj.name)
+    slug = SerializerMethodField()
+
     class Meta:
         model = Tag
         fields = (

@@ -83,6 +97,18 @@ class DocumentSerializer(serializers.ModelSerializer):
     tags = TagsField(many=True)
     document_type = DocumentTypeField(allow_null=True)
 
+    original_file_name = SerializerMethodField()
+    archived_file_name = SerializerMethodField()
+
+    def get_original_file_name(self, obj):
+        return obj.get_public_filename()
+
+    def get_archived_file_name(self, obj):
+        if obj.archive_checksum:
+            return obj.get_public_filename(archive=True)
+        else:
+            return None
+
     class Meta:
         model = Document
         depth = 1

@@ -96,7 +122,9 @@ class DocumentSerializer(serializers.ModelSerializer):
             "created",
             "modified",
             "added",
-            "archive_serial_number"
+            "archive_serial_number",
+            "original_file_name",
+            "archived_file_name",
         )

@@ -178,8 +206,7 @@ class PostDocumentSerializer(serializers.Serializer):
         required=False,
     )
 
-    def validate(self, attrs):
-        document = attrs.get('document')
+    def validate_document(self, document):
 
         try:
             validate_filename(document.name)

@@ -191,32 +218,31 @@ class PostDocumentSerializer(serializers.Serializer):
 
         if not is_mime_type_supported(mime_type):
             raise serializers.ValidationError(
-                "This mime type is not supported.")
+                "This file type is not supported.")
 
-        attrs['document_data'] = document_data
+        return document.name, document_data
 
-        title = attrs.get('title')
+    def validate_title(self, title):
+        if title:
+            return title
+        else:
+            # do not return empty strings.
+            return None
 
-        if not title:
-            attrs['title'] = None
-
-        correspondent = attrs.get('correspondent')
+    def validate_correspondent(self, correspondent):
         if correspondent:
-            attrs['correspondent_id'] = correspondent.id
+            return correspondent.id
         else:
-            attrs['correspondent_id'] = None
+            return None
 
-        document_type = attrs.get('document_type')
+    def validate_document_type(self, document_type):
         if document_type:
-            attrs['document_type_id'] = document_type.id
+            return document_type.id
         else:
-            attrs['document_type_id'] = None
+            return None
 
-        tags = attrs.get('tags')
+    def validate_tags(self, tags):
         if tags:
-            tag_ids = [tag.id for tag in tags]
-            attrs['tag_ids'] = tag_ids
+            return [tag.id for tag in tags]
         else:
-            attrs['tag_ids'] = None
-
-        return attrs
+            return None
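The refactor above replaces one monolithic validate() with DRF's per-field hooks: a validate_<field> method receives that field's deserialized value, and whatever it returns replaces the value in validated_data. A minimal sketch of the mechanism (not the real serializer):

    from rest_framework import serializers


    class UploadSerializer(serializers.Serializer):
        # minimal sketch of DRF field-level validation
        title = serializers.CharField(required=False, allow_blank=True)

        def validate_title(self, title):
            # the return value of validate_<field> replaces the field's
            # value in serializer.validated_data
            return title or None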
@@ -9,11 +9,13 @@ from django.contrib.contenttypes.models import ContentType
 from django.db import models, DatabaseError
 from django.dispatch import receiver
 from django.utils import timezone
+from filelock import FileLock
 from rest_framework.reverse import reverse
 
 from .. import index, matching
-from ..file_handling import delete_empty_directories, generate_filename, \
-    create_source_path_directory, archive_name_from_filename
+from ..file_handling import delete_empty_directories, \
+    create_source_path_directory, archive_name_from_filename, \
+    generate_unique_filename
 from ..models import Document, Tag

@@ -134,7 +136,7 @@ def set_tags(sender,
 
     message = 'Tagging "{}" with "{}"'
     logger(
-        message.format(document, ", ".join([t.slug for t in relevant_tags])),
+        message.format(document, ", ".join([t.name for t in relevant_tags])),
         logging_group
     )

@@ -157,41 +159,42 @@ def run_post_consume_script(sender, document, **kwargs):
     Popen((
         settings.POST_CONSUME_SCRIPT,
         str(document.pk),
-        document.file_name,
+        document.get_public_filename(),
         os.path.normpath(document.source_path),
         os.path.normpath(document.thumbnail_path),
         reverse("document-download", kwargs={"pk": document.pk}),
         reverse("document-thumb", kwargs={"pk": document.pk}),
        str(document.correspondent),
-        str(",".join(document.tags.all().values_list("slug", flat=True)))
+        str(",".join(document.tags.all().values_list("name", flat=True)))
     )).wait()
 
 
 @receiver(models.signals.post_delete, sender=Document)
 def cleanup_document_deletion(sender, instance, using, **kwargs):
-    for f in (instance.source_path,
-              instance.archive_path,
-              instance.thumbnail_path):
-        if os.path.isfile(f):
-            try:
-                os.unlink(f)
-                logging.getLogger(__name__).debug(
-                    f"Deleted file {f}.")
-            except OSError as e:
-                logging.getLogger(__name__).warning(
-                    f"While deleting document {instance.file_name}, the file "
-                    f"{f} could not be deleted: {e}"
-                )
+    with FileLock(settings.MEDIA_LOCK):
+        for f in (instance.source_path,
+                  instance.archive_path,
+                  instance.thumbnail_path):
+            if os.path.isfile(f):
+                try:
+                    os.unlink(f)
+                    logging.getLogger(__name__).debug(
+                        f"Deleted file {f}.")
+                except OSError as e:
+                    logging.getLogger(__name__).warning(
+                        f"While deleting document {str(instance)}, the file "
+                        f"{f} could not be deleted: {e}"
+                    )
 
-    delete_empty_directories(
-        os.path.dirname(instance.source_path),
-        root=settings.ORIGINALS_DIR
-    )
+        delete_empty_directories(
+            os.path.dirname(instance.source_path),
+            root=settings.ORIGINALS_DIR
+        )
 
-    delete_empty_directories(
-        os.path.dirname(instance.archive_path),
-        root=settings.ARCHIVE_DIR
-    )
+        delete_empty_directories(
+            os.path.dirname(instance.archive_path),
+            root=settings.ARCHIVE_DIR
+        )
 
 
 def validate_move(instance, old_path, new_path):

@@ -226,81 +229,94 @@ def update_filename_and_move_files(sender, instance, **kwargs):
         # This will in turn cause this logic to move the file where it belongs.
         return
 
-    old_filename = instance.filename
-    new_filename = generate_filename(instance)
-
-    if new_filename == instance.filename:
-        # Don't do anything if it's the same.
-        return
-
-    old_source_path = instance.source_path
-    new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
-
-    if not validate_move(instance, old_source_path, new_source_path):
-        return
-
-    # archive files are optional, archive checksum tells us if we have one,
-    # since this is None for documents without archived files.
-    if instance.archive_checksum:
-        new_archive_filename = archive_name_from_filename(new_filename)
-        old_archive_path = instance.archive_path
-        new_archive_path = os.path.join(settings.ARCHIVE_DIR,
-                                        new_archive_filename)
-
-        if not validate_move(instance, old_archive_path, new_archive_path):
-            return
-
-        create_source_path_directory(new_archive_path)
-    else:
-        old_archive_path = None
-        new_archive_path = None
-
-    create_source_path_directory(new_source_path)
-
-    try:
-        os.rename(old_source_path, new_source_path)
-        if instance.archive_checksum:
-            os.rename(old_archive_path, new_archive_path)
-        instance.filename = new_filename
-        # Don't save here to prevent infinite recursion.
-        Document.objects.filter(pk=instance.pk).update(filename=new_filename)
-
-        logging.getLogger(__name__).debug(
-            f"Moved file {old_source_path} to {new_source_path}.")
-
-        if instance.archive_checksum:
-            logging.getLogger(__name__).debug(
-                f"Moved file {old_archive_path} to {new_archive_path}.")
-
-    except OSError as e:
-        instance.filename = old_filename
-        # this happens when we can't move a file. If that's the case for the
-        # archive file, we try our best to revert the changes.
-        try:
-            os.rename(new_source_path, old_source_path)
-            os.rename(new_archive_path, old_archive_path)
-        except Exception as e:
-            # This is fine, since:
-            # A: if we managed to move source from A to B, we will also manage
-            #    to move it from B to A. If not, we have a serious issue
-            #    that's going to get caught by the sanity checker.
-            #    all files remain in place and will never be overwritten,
-            #    so this is not the end of the world.
-            # B: if moving the original file failed, nothing has changed anyway.
-            pass
-    except DatabaseError as e:
-        os.rename(new_source_path, old_source_path)
-        if instance.archive_checksum:
-            os.rename(new_archive_path, old_archive_path)
-        instance.filename = old_filename
-
-    if not os.path.isfile(old_source_path):
-        delete_empty_directories(os.path.dirname(old_source_path),
-                                 root=settings.ORIGINALS_DIR)
-
-    if old_archive_path and not os.path.isfile(old_archive_path):
-        delete_empty_directories(os.path.dirname(old_archive_path),
-                                 root=settings.ARCHIVE_DIR)
+    with FileLock(settings.MEDIA_LOCK):
+        old_filename = instance.filename
+        new_filename = generate_unique_filename(
+            instance, settings.ORIGINALS_DIR)
+
+        if new_filename == instance.filename:
+            # Don't do anything if it's the same.
+            return
+
+        old_source_path = instance.source_path
+        new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
+
+        if not validate_move(instance, old_source_path, new_source_path):
+            return
+
+        # archive files are optional, archive checksum tells us if we have one,
+        # since this is None for documents without archived files.
+        if instance.archive_checksum:
+            new_archive_filename = archive_name_from_filename(new_filename)
+            old_archive_path = instance.archive_path
+            new_archive_path = os.path.join(settings.ARCHIVE_DIR,
+                                            new_archive_filename)
+
+            if not validate_move(instance, old_archive_path, new_archive_path):
+                return
+
+            create_source_path_directory(new_archive_path)
+        else:
+            old_archive_path = None
+            new_archive_path = None
+
+        create_source_path_directory(new_source_path)
+
+        try:
+            os.rename(old_source_path, new_source_path)
+            if instance.archive_checksum:
+                os.rename(old_archive_path, new_archive_path)
+            instance.filename = new_filename
+
+            # Don't save() here to prevent infinite recursion.
+            Document.objects.filter(pk=instance.pk).update(
+                filename=new_filename)
+
+            logging.getLogger(__name__).debug(
+                f"Moved file {old_source_path} to {new_source_path}.")
+
+            if instance.archive_checksum:
+                logging.getLogger(__name__).debug(
+                    f"Moved file {old_archive_path} to {new_archive_path}.")
+
+        except OSError as e:
+            instance.filename = old_filename
+            # this happens when we can't move a file. If that's the case for
+            # the archive file, we try our best to revert the changes.
+            # no need to save the instance, the update() has not happened yet.
+            try:
+                os.rename(new_source_path, old_source_path)
+                os.rename(new_archive_path, old_archive_path)
+            except Exception as e:
+                # This is fine, since:
+                # A: if we managed to move source from A to B, we will also
+                #    manage to move it from B to A. If not, we have a serious
+                #    issue that's going to get caught by the sanity checker.
+                #    All files remain in place and will never be overwritten,
+                #    so this is not the end of the world.
+                # B: if moving the original file failed, nothing has changed
+                #    anyway.
+                pass
+        except DatabaseError as e:
+            # this happens after moving files, so move them back into place.
+            # since moving them once succeeded, it's very likely going to
+            # succeed again.
+            os.rename(new_source_path, old_source_path)
+            if instance.archive_checksum:
+                os.rename(new_archive_path, old_archive_path)
+            instance.filename = old_filename
+            # again, no need to save the instance, since the actual update()
+            # operation failed.
+
+        # finally, remove any empty sub folders. This will do nothing if
+        # something has failed above.
+        if not os.path.isfile(old_source_path):
+            delete_empty_directories(os.path.dirname(old_source_path),
+                                     root=settings.ORIGINALS_DIR)
+
+        if old_archive_path and not os.path.isfile(old_archive_path):
+            delete_empty_directories(os.path.dirname(old_archive_path),
+                                     root=settings.ARCHIVE_DIR)
 
 
 def set_log_entry(sender, document=None, logging_group=None, **kwargs):
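Two details in the rename handler above are easy to miss: the MEDIA_LOCK file lock serializes every process that moves files inside the media directory, and the rename is persisted with a queryset update() rather than instance.save(), because save() would fire post_save again and recurse into this very handler. A stripped-down sketch of the pattern (hypothetical handler name; settings, Document and generate_unique_filename as in this codebase):

    from django.conf import settings
    from django.db.models.signals import post_save
    from django.dispatch import receiver
    from filelock import FileLock


    @receiver(post_save, sender=Document)
    def rename_on_save(sender, instance, **kwargs):
        with FileLock(settings.MEDIA_LOCK):  # one mover at a time, across processes
            new_name = generate_unique_filename(instance, settings.ORIGINALS_DIR)
            if new_name == instance.filename:
                return
            ...  # move the files on disk (omitted in this sketch)
            # update() writes the row without emitting post_save again,
            # which instance.save() would do, recursing into this handler.
            Document.objects.filter(pk=instance.pk).update(filename=new_name)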
@@ -1,5 +1,6 @@
 import logging
 
+import tqdm
 from django.conf import settings
 from whoosh.writing import AsyncWriter

@@ -23,7 +24,7 @@ def index_reindex():
     ix = index.open_index(recreate=True)
 
     with AsyncWriter(ix) as writer:
-        for document in documents:
+        for document in tqdm.tqdm(documents):
             index.update_document(writer, document)
@@ -1,4 +1,5 @@
 import os
+import shutil
 import tempfile
 from unittest import mock

@@ -195,6 +196,24 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         results = response.data['results']
         self.assertEqual(len(results), 3)
 
+        response = self.client.get("/api/documents/?tags__id__none={}".format(tag_3.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['id'], doc1.id)
+        self.assertEqual(results[1]['id'], doc2.id)
+
+        response = self.client.get("/api/documents/?tags__id__none={},{}".format(tag_3.id, tag_2.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['id'], doc1.id)
+
+        response = self.client.get("/api/documents/?tags__id__none={},{}".format(tag_2.id, tag_inbox.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 0)
+
     def test_search_no_query(self):
         response = self.client.get("/api/search/")
         results = response.data['results']

@@ -475,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(response.status_code, 400)
 
         async_task.assert_not_called()
+
+    def test_get_metadata(self):
+        doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A")
+
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), doc.source_path)
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.archive_path)
+
+        response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+        self.assertEqual(response.status_code, 200)
+
+        meta = response.data
+
+        self.assertEqual(meta['original_mime_type'], "image/png")
+        self.assertTrue(meta['has_archive_version'])
+        self.assertEqual(len(meta['original_metadata']), 0)
+        self.assertGreater(len(meta['archive_metadata']), 0)
+
+    def test_get_metadata_no_archive(self):
+        doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
+
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.source_path)
+
+        response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+        self.assertEqual(response.status_code, 200)
+
+        meta = response.data
+
+        self.assertEqual(meta['original_mime_type'], "application/pdf")
+        self.assertFalse(meta['has_archive_version'])
+        self.assertGreater(len(meta['original_metadata']), 0)
+        self.assertIsNone(meta['archive_metadata'])
@@ -27,7 +27,7 @@ class TestAttributes(TestCase):
 
         self.assertEqual(file_info.title, title, filename)
 
-        self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename)
+        self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)
 
     def test_guess_attributes_from_name0(self):
         self._test_guess_attributes_from_name(

@@ -188,7 +188,7 @@ class TestFieldPermutations(TestCase):
             self.assertEqual(info.tags, (), filename)
         else:
             self.assertEqual(
-                [t.slug for t in info.tags], tags.split(','),
+                [t.name for t in info.tags], tags.split(','),
                 filename
             )

@@ -342,8 +342,8 @@ class TestFieldPermutations(TestCase):
         info = FileInfo.from_filename(filename)
         self.assertEqual(info.title, "0001")
         self.assertEqual(len(info.tags), 2)
-        self.assertEqual(info.tags[0].slug, "tag1")
-        self.assertEqual(info.tags[1].slug, "tag2")
+        self.assertEqual(info.tags[0].name, "tag1")
+        self.assertEqual(info.tags[1].name, "tag2")
         self.assertIsNone(info.created)
 
         # Complex transformation with date in replacement string

@@ -356,8 +356,8 @@ class TestFieldPermutations(TestCase):
         info = FileInfo.from_filename(filename)
         self.assertEqual(info.title, "0001")
         self.assertEqual(len(info.tags), 2)
-        self.assertEqual(info.tags[0].slug, "tag1")
-        self.assertEqual(info.tags[1].slug, "tag2")
+        self.assertEqual(info.tags[0].name, "tag1")
+        self.assertEqual(info.tags[1].name, "tag2")
         self.assertEqual(info.created.year, 2019)
         self.assertEqual(info.created.month, 9)
         self.assertEqual(info.created.day, 8)

@@ -598,10 +598,10 @@ class TestConsumer(DirectoriesMixin, TestCase):
 
         self.assertEqual(document.title, "new docs")
         self.assertEqual(document.correspondent.name, "Bank")
-        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
+        self.assertEqual(document.filename, "Bank/new docs.pdf")
 
     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
-    @mock.patch("documents.signals.handlers.generate_filename")
+    @mock.patch("documents.signals.handlers.generate_unique_filename")
     def testFilenameHandlingUnstableFormat(self, m):
 
         filenames = ["this", "that", "now this", "i cant decide"]

@@ -611,7 +611,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
             filenames.insert(0, f)
             return f
 
-        m.side_effect = lambda f: get_filename()
+        m.side_effect = lambda f, root: get_filename()
 
         filename = self.get_test_file()
@@ -48,19 +48,19 @@ class TestDocument(TestCase):
     def test_file_name(self):
 
         doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25))
-        self.assertEqual(doc.file_name, "20201225-test.pdf")
+        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf")
 
     def test_file_name_jpg(self):
 
         doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25))
-        self.assertEqual(doc.file_name, "20201225-test.jpg")
+        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg")
 
     def test_file_name_unknown(self):
 
         doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25))
-        self.assertEqual(doc.file_name, "20201225-test.zip")
+        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip")
 
-    def test_file_name_invalid(self):
+    def test_file_name_invalid_type(self):
 
         doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25))
-        self.assertEqual(doc.file_name, "20201225-test")
+        self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
@@ -1,5 +1,8 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
@@ -8,7 +11,8 @@ from django.db import DatabaseError
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from .utils import DirectoriesMixin
|
||||
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
|
||||
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
|
||||
generate_unique_filename
|
||||
from ..models import Document, Correspondent
|
||||
|
||||
|
||||
@@ -40,13 +44,13 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
document.filename = generate_filename(document)
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(document.filename, "none/none.pdf")
|
||||
|
||||
# Enable encryption and check again
|
||||
document.storage_type = Document.STORAGE_TYPE_GPG
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf.gpg".format(document.pk))
|
||||
"none/none.pdf.gpg")
|
||||
|
||||
document.save()
|
||||
|
||||
@@ -62,7 +66,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test.pdf.gpg"), True)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_file_renaming_missing_permissions(self):
|
||||
@@ -74,12 +78,12 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
"none/none.pdf")
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none.pdf")
|
||||
|
||||
# Make the folder read- and execute-only (no writing and no renaming)
|
||||
os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)
|
||||
@@ -89,8 +93,8 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
|
||||
self.assertEqual(document.filename, "none/none.pdf")
|
||||
|
||||
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
|
||||
|
||||
@@ -108,7 +112,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
"none/none.pdf")
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
|
||||
@@ -125,8 +129,8 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
|
||||
self.assertEqual(document.filename, "none/none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_document_delete(self):
|
||||
@@ -138,7 +142,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
"none/none.pdf")
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
@@ -146,7 +150,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Ensure file deletion after delete
|
||||
pk = document.pk
|
||||
document.delete()
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
@@ -168,7 +172,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
"none/none.pdf")
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
@@ -199,7 +203,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_dash(self):
|
||||
@@ -215,7 +219,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_malformed(self):
|
||||
@@ -231,7 +235,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none-{:07d}.pdf".format(document.pk))
|
||||
"none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_all(self):
|
||||
@@ -246,7 +250,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
|
||||
def test_tags_out_of_bounds(self):
|
||||
@@ -261,7 +265,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none-{:07d}.pdf".format(document.pk))
|
||||
"none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
|
||||
def test_nested_directory_cleanup(self):
|
||||
@@ -272,7 +276,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(document.filename, "none/none/none.pdf")
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
|
||||
@@ -282,7 +286,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
pk = document.pk
|
||||
document.delete()
|
||||
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none.pdf"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)
|
||||
@@ -330,6 +334,48 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
def test_duplicates(self):
|
||||
document = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="A", pk=1)
|
||||
document2 = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="B", pk=2)
|
||||
Path(document.source_path).touch()
|
||||
Path(document2.source_path).touch()
|
||||
document.filename = "0000001.pdf"
|
||||
document.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document.filename, "qwe.pdf")
|
||||
|
||||
document2.filename = "0000002.pdf"
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe_01.pdf")
|
||||
|
||||
# saving should not change the file names.
|
||||
|
||||
document.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document.filename, "qwe.pdf")
|
||||
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe_01.pdf")
|
||||
|
||||
document.delete()
|
||||
|
||||
self.assertFalse(os.path.isfile(document.source_path))
|
||||
|
||||
# filename free, should remove _01 suffix
|
||||
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe.pdf")
|
||||
|
||||
|
||||
|
||||
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
|
||||
@@ -358,15 +404,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
|
||||
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc.pdf"))
|
||||
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf"))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_move_archive_gone(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
#Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
@@ -381,7 +426,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
|
||||
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
|
||||
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
@@ -485,3 +530,44 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
class TestFilenameGeneration(TestCase):
|
||||
|
||||
@override_settings(
|
||||
PAPERLESS_FILENAME_FORMAT="{title}"
|
||||
)
|
||||
def test_invalid_characters(self):
|
||||
|
||||
doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1")
|
||||
self.assertEqual(generate_filename(doc), "This. is the title.pdf")
|
||||
|
||||
doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2")
|
||||
self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay.pdf")
|
||||
|
||||
@override_settings(
|
||||
PAPERLESS_FILENAME_FORMAT="{created}"
|
||||
)
|
||||
def test_date(self):
|
||||
doc = Document.objects.create(title="does not matter", created=datetime.datetime(2020,5,21, 7,36,51, 153), mime_type="application/pdf", pk=2, checksum="2")
|
||||
self.assertEqual(generate_filename(doc), "2020-05-21.pdf")


def run():
    doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow")
    doc.filename = generate_unique_filename(doc, settings.ORIGINALS_DIR)
    Path(doc.thumbnail_path).touch()
    with open(doc.source_path, "w") as f:
        f.write(str(uuid.uuid4()))
    with open(doc.source_path, "rb") as f:
        doc.checksum = hashlib.md5(f.read()).hexdigest()

    with open(doc.archive_path, "w") as f:
        f.write(str(uuid.uuid4()))
    with open(doc.archive_path, "rb") as f:
        doc.archive_checksum = hashlib.md5(f.read()).hexdigest()

    doc.save()

    for i in range(30):
        doc.title = str(random.randrange(1, 5))
        doc.save()

@@ -16,25 +16,23 @@ sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
class TestArchiver(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
        # self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        # self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
        return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")

    def test_archiver(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()
        doc = self.make_models()
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))

        call_command('document_archiver')

    def test_handle_document(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()
        doc = self.make_models()
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))

        handle_document(self.d1.pk)
        handle_document(doc.pk)

        doc = Document.objects.get(id=self.d1.id)
        doc = Document.objects.get(id=doc.id)

        self.assertIsNotNone(doc.checksum)
        self.assertTrue(os.path.isfile(doc.archive_path))

@@ -230,7 +230,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        tag_names = ("existingTag", "Space Tag")
        # Create a Tag prior to consuming a file using it in path
        tag_ids = [Tag.objects.create(name=tag_names[0]).pk,]
        tag_ids = [Tag.objects.create(name="existingtag").pk,]

        self.t_start()

@@ -35,20 +35,20 @@ class TestDecryptDocuments(TestCase):
            PASSPHRASE="test"
        ).enable()

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg"))
        doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))

        call_command('decrypt_documents')

        doc = Document.objects.get(id=2)
        doc.refresh_from_db()

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000002.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:

@@ -24,13 +24,14 @@ class TestExportImport(DirectoriesMixin, TestCase):

        file = os.path.join(self.dirs.originals_dir, "0000001.pdf")

        Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", mime_type="application/pdf")
        Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
        Correspondent.objects.create(name="c")

        target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, target)

        call_command('document_exporter', target)

@@ -66,6 +67,6 @@ class TestExportImport(DirectoriesMixin, TestCase):
    def test_export_missing_files(self):

        target = tempfile.mkdtemp()
        call_command('document_exporter', target)
        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
        self.addCleanup(shutil.rmtree, target)
        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", mime_type="application/pdf")
        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)

@@ -40,6 +40,7 @@ from .filters import (
    LogFilterSet
)
from .models import Correspondent, Document, Log, Tag, DocumentType
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
    CorrespondentSerializer,
    DocumentSerializer,

@@ -151,11 +152,11 @@ class DocumentViewSet(RetrieveModelMixin,
        doc = Document.objects.get(id=pk)
        if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501
            file_handle = doc.archive_file
            filename = doc.archive_file_name
            filename = doc.get_public_filename(archive=True)
            mime_type = 'application/pdf'
        else:
            file_handle = doc.source_file
            filename = doc.file_name
            filename = doc.get_public_filename()
            mime_type = doc.mime_type

        if doc.storage_type == Document.STORAGE_TYPE_GPG:

@@ -166,17 +167,43 @@ class DocumentViewSet(RetrieveModelMixin,
            disposition, filename)
        return response

    def get_metadata(self, file, mime_type):
        if not os.path.isfile(file):
            return None

        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(logging_group=None)
            return parser.extract_metadata(file, mime_type)
        else:
            return []

    @action(methods=['get'], detail=True)
    def metadata(self, request, pk=None):
        try:
            doc = Document.objects.get(pk=pk)
            return Response({
                "paperless__checksum": doc.checksum,
                "paperless__mime_type": doc.mime_type,
                "paperless__filename": doc.filename,
                "paperless__has_archive_version":
                    os.path.isfile(doc.archive_path)
            })

            meta = {
                "original_checksum": doc.checksum,
                "original_size": os.stat(doc.source_path).st_size,
                "original_mime_type": doc.mime_type,
                "media_filename": doc.filename,
                "has_archive_version": os.path.isfile(doc.archive_path),
                "original_metadata": self.get_metadata(
                    doc.source_path, doc.mime_type)
            }

            if doc.archive_checksum and os.path.isfile(doc.archive_path):
                meta['archive_checksum'] = doc.archive_checksum
                meta['archive_size'] = os.stat(doc.archive_path).st_size
|
||||
meta['archive_metadata'] = self.get_metadata(
|
||||
doc.archive_path, "application/pdf")
|
||||
else:
|
||||
meta['archive_checksum'] = None
|
||||
meta['archive_size'] = None
|
||||
meta['archive_metadata'] = None
|
||||
|
||||
return Response(meta)
|
||||
except Document.DoesNotExist:
|
||||
raise Http404()
|
||||
|
||||
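
Editor's note: because the action is registered with @action(detail=True), the new metadata dictionary is served per document. A hedged usage sketch — host, credentials and document id are placeholders, and the route name is assumed to follow the action method name:

import requests

resp = requests.get(
    "http://localhost:8000/api/documents/1/metadata/",
    auth=("user", "password"),  # assumption: basic auth is enabled
)
meta = resp.json()
# Fields come straight from the dictionary built above.
print(meta["original_checksum"], meta["has_archive_version"])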

@@ -263,12 +290,11 @@ class PostDocumentView(APIView):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        document = serializer.validated_data['document']
        document_data = serializer.validated_data['document_data']
        correspondent_id = serializer.validated_data['correspondent_id']
        document_type_id = serializer.validated_data['document_type_id']
        tag_ids = serializer.validated_data['tag_ids']
        title = serializer.validated_data['title']
        doc_name, doc_data = serializer.validated_data.get('document')
        correspondent_id = serializer.validated_data.get('correspondent')
        document_type_id = serializer.validated_data.get('document_type')
        tag_ids = serializer.validated_data.get('tags')
        title = serializer.validated_data.get('title')

        t = int(mktime(datetime.now().timetuple()))

@@ -277,17 +303,17 @@ class PostDocumentView(APIView):
        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:
            f.write(document_data)
            f.write(doc_data)
            os.utime(f.name, times=(t, t))

        async_task("documents.tasks.consume_file",
                   f.name,
                   override_filename=document.name,
                   override_filename=doc_name,
                   override_title=title,
                   override_correspondent_id=correspondent_id,
                   override_document_type_id=document_type_id,
                   override_tag_ids=tag_ids,
                   task_name=os.path.basename(document.name)[:100])
                   task_name=os.path.basename(doc_name)[:100])
        return Response("OK")

@@ -53,6 +53,10 @@ ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))

# Lock file for synchronizing changes to the MEDIA directory across multiple
# threads.
MEDIA_LOCK = os.path.join(MEDIA_ROOT, "media.lock")
INDEX_DIR = os.path.join(DATA_DIR, "index")
MODEL_FILE = os.path.join(DATA_DIR, "classification_model.pickle")
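
Editor's note: MEDIA_LOCK pairs with the filelock import introduced in this change; writers take the lock before touching files under MEDIA_ROOT so renames from concurrent consumers do not race. A sketch of the intended usage pattern:

from django.conf import settings
from filelock import FileLock

with FileLock(settings.MEDIA_LOCK):
    # Move, rename or delete files under ORIGINALS_DIR / ARCHIVE_DIR here;
    # the lock is released automatically when the block exits.
    ...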

@@ -1 +1 @@
__version__ = (0, 9, 5)
__version__ = (0, 9, 6)

@@ -103,10 +103,7 @@ class MailAccountHandler(LoggingMixin):

    def _correspondent_from_name(self, name):
        try:
            return Correspondent.objects.get_or_create(
                name=name, defaults={
                    "slug": slugify(name)
                })[0]
            return Correspondent.objects.get_or_create(name=name)[0]
        except DatabaseError as e:
            self.log(
                "error",

@@ -5,6 +5,7 @@ import subprocess

import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError

@@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    def extract_metadata(self, document_path, mime_type):
        namespace_pattern = re.compile(r"\{(.*)\}(.*)")

        result = []
        if mime_type == 'application/pdf':
            pdf = pikepdf.open(document_path)
            meta = pdf.open_metadata()
            for key, value in meta.items():
                if isinstance(value, list):
                    value = " ".join([str(e) for e in value])
                value = str(value)
                try:
                    m = namespace_pattern.match(key)
                    result.append({
                        "namespace": m.group(1),
                        "prefix": meta.REVERSE_NS[m.group(1)],
                        "key": m.group(2),
                        "value": value
                    })
                except Exception as e:
                    self.log(
                        "warning",
                        f"Error while reading metadata {key}: {value}. Error: "
                        f"{e}"
                    )
        return result
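
Editor's note: the loop above walks pikepdf's XMP mapping, whose keys iterate in Clark notation ({namespace-uri}local-name) — hence the regex and the REVERSE_NS lookup to recover a prefix. A standalone sketch of the same walk ("sample.pdf" is a placeholder path):

import pikepdf

with pikepdf.open("sample.pdf") as pdf:
    meta = pdf.open_metadata()
    for key, value in meta.items():
        # key looks like "{http://ns.adobe.com/xap/1.0/}CreateDate"
        print(key, "->", value)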

    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.