mirror of https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-30 18:27:45 -05:00

Merge branch 'dev' into feature-autocolor
src/documents/admin.py

@@ -1,10 +1,7 @@
from django.contrib import admin
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter

from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag
from .models import Correspondent, Document, DocumentType, Tag, \
    SavedView, SavedViewFilterRule


class CorrespondentAdmin(admin.ModelAdmin):

@@ -17,8 +14,6 @@ class CorrespondentAdmin(admin.ModelAdmin):
    list_filter = ("matching_algorithm",)
    list_editable = ("match", "matching_algorithm")

    readonly_fields = ("slug",)


class TagAdmin(admin.ModelAdmin):

@@ -31,8 +26,6 @@ class TagAdmin(admin.ModelAdmin):
    list_filter = ("colour", "matching_algorithm")
    list_editable = ("colour", "match", "matching_algorithm")

    readonly_fields = ("slug", )


class DocumentTypeAdmin(admin.ModelAdmin):

@@ -44,32 +37,40 @@ class DocumentTypeAdmin(admin.ModelAdmin):
    list_filter = ("matching_algorithm",)
    list_editable = ("match", "matching_algorithm")

    readonly_fields = ("slug",)


class DocumentAdmin(admin.ModelAdmin):

    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = ("added", "mime_type", "storage_type", "filename")
    readonly_fields = (
        "added",
        "modified",
        "mime_type",
        "storage_type",
        "filename",
        "checksum",
        "archive_filename",
        "archive_checksum"
    )

    list_display_links = ("title",)

    list_display = (
        "correspondent",
        "id",
        "title",
        "tags_",
        "created",
        "mime_type",
        "filename",
        "archive_filename"
    )

    list_filter = (
        "document_type",
        "tags",
        "correspondent"
        ("mime_type"),
        ("archive_serial_number", admin.EmptyFieldListFilter),
        ("archive_filename", admin.EmptyFieldListFilter),
    )

    filter_horizontal = ("tags",)

    ordering = ["-created", "correspondent"]
    ordering = ["-id"]

    date_hierarchy = "created"

@@ -81,59 +82,40 @@ class DocumentAdmin(admin.ModelAdmin):
    created_.short_description = "Created"

    def delete_queryset(self, request, queryset):
        ix = index.open_index()
        with AsyncWriter(ix) as writer:
        from documents import index

        with index.open_index_writer() as writer:
            for o in queryset:
                index.remove_document(writer, o)

        super(DocumentAdmin, self).delete_queryset(request, queryset)

    def delete_model(self, request, obj):
        from documents import index
        index.remove_document_from_index(obj)
        super(DocumentAdmin, self).delete_model(request, obj)

    def save_model(self, request, obj, form, change):
        from documents import index
        index.add_or_update_document(obj)
        super(DocumentAdmin, self).save_model(request, obj, form, change)

    @mark_safe
    def tags_(self, obj):
        r = ""
        for tag in obj.tags.all():
            r += self._html_tag(
                "span",
                tag.slug + ", "
            )
        return r

    @staticmethod
    def _html_tag(kind, inside=None, **kwargs):
        attributes = format_html_join(' ', '{}="{}"', kwargs.items())

        if inside is not None:
            return format_html("<{kind} {attributes}>{inside}</{kind}>",
                               kind=kind, attributes=attributes, inside=inside)

        return format_html("<{} {}/>", kind, attributes)


class RuleInline(admin.TabularInline):
    model = SavedViewFilterRule


class LogAdmin(admin.ModelAdmin):
class SavedViewAdmin(admin.ModelAdmin):

    def has_add_permission(self, request):
        return False
    list_display = ("name", "user")

    def has_change_permission(self, request, obj=None):
        return False

    list_display = ("created", "message", "level",)
    list_filter = ("level", "created",)

    ordering = ('-created',)

    list_display_links = ("created", "message")
    inlines = [
        RuleInline
    ]


admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
admin.site.register(SavedView, SavedViewAdmin)
src/documents/apps.py

@@ -1,34 +1,30 @@
from django.apps import AppConfig

from django.utils.translation import gettext_lazy as _


class DocumentsConfig(AppConfig):

    name = "documents"

    def ready(self):
    verbose_name = _("Documents")

        from .signals import document_consumption_started
    def ready(self):
        from .signals import document_consumption_finished
        from .signals.handlers import (
            add_inbox_tags,
            run_pre_consume_script,
            run_post_consume_script,
            set_log_entry,
            set_correspondent,
            set_document_type,
            set_tags,
            add_to_index

        )

        document_consumption_started.connect(run_pre_consume_script)

        document_consumption_finished.connect(add_inbox_tags)
        document_consumption_finished.connect(set_correspondent)
        document_consumption_finished.connect(set_document_type)
        document_consumption_finished.connect(set_tags)
        document_consumption_finished.connect(set_log_entry)
        document_consumption_finished.connect(add_to_index)
        document_consumption_finished.connect(run_post_consume_script)

        AppConfig.ready(self)
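
The handlers connected here are ordinary functions; the same two signals are available to any other app. A hypothetical extra handler, shown only to illustrate the connection pattern (the keyword arguments mirror what the consumer appears to send, which is an assumption):

from documents.signals import document_consumption_finished

def announce(sender, document=None, logging_group=None, **kwargs):
    # Hypothetical handler: runs after each successful consumption.
    print(f"consumed document {document.pk}: {document.title}")

document_consumption_finished.connect(announce)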
src/documents/bulk_download.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from zipfile import ZipFile

from documents.models import Document


class BulkArchiveStrategy:

    def __init__(self, zipf: ZipFile):
        self.zipf = zipf

    def make_unique_filename(self,
                             doc: Document,
                             archive: bool = False,
                             folder: str = ""):
        counter = 0
        while True:
            filename = folder + doc.get_public_filename(archive, counter)
            if filename in self.zipf.namelist():
                counter += 1
            else:
                return filename

    def add_document(self, doc: Document):
        raise NotImplementedError()  # pragma: no cover


class OriginalsOnlyStrategy(BulkArchiveStrategy):

    def add_document(self, doc: Document):
        self.zipf.write(doc.source_path, self.make_unique_filename(doc))


class ArchiveOnlyStrategy(BulkArchiveStrategy):

    def __init__(self, zipf):
        super(ArchiveOnlyStrategy, self).__init__(zipf)

    def add_document(self, doc: Document):
        if doc.has_archive_version:
            self.zipf.write(doc.archive_path,
                            self.make_unique_filename(doc, archive=True))
        else:
            self.zipf.write(doc.source_path,
                            self.make_unique_filename(doc))


class OriginalAndArchiveStrategy(BulkArchiveStrategy):

    def add_document(self, doc: Document):
        if doc.has_archive_version:
            self.zipf.write(
                doc.archive_path, self.make_unique_filename(
                    doc, archive=True, folder="archive/"
                )
            )

        self.zipf.write(
            doc.source_path,
            self.make_unique_filename(doc, folder="originals/")
        )
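
The three strategies share one contract: the caller opens the ZipFile and feeds documents, and make_unique_filename() de-duplicates names inside the archive. A minimal usage sketch (the target path and the document IDs are invented):

from zipfile import ZipFile

from documents.bulk_download import OriginalAndArchiveStrategy
from documents.models import Document

with ZipFile("/tmp/documents.zip", "w") as zipf:
    strategy = OriginalAndArchiveStrategy(zipf)
    for doc in Document.objects.filter(id__in=[1, 2, 3]):
        # Originals land under originals/, archived PDFs under archive/,
        # with _01, _02, ... appended on name collisions.
        strategy.add_document(doc)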
src/documents/bulk_edit.py (new file, 106 lines)
@@ -0,0 +1,106 @@
import itertools

from django.db.models import Q
from django_q.tasks import async_task

from documents.models import Document, Correspondent, DocumentType


def set_correspondent(doc_ids, correspondent):
    if correspondent:
        correspondent = Correspondent.objects.get(id=correspondent)

    qs = Document.objects.filter(
        Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
    affected_docs = [doc.id for doc in qs]
    qs.update(correspondent=correspondent)

    async_task(
        "documents.tasks.bulk_update_documents", document_ids=affected_docs)

    return "OK"


def set_document_type(doc_ids, document_type):
    if document_type:
        document_type = DocumentType.objects.get(id=document_type)

    qs = Document.objects.filter(
        Q(id__in=doc_ids) & ~Q(document_type=document_type))
    affected_docs = [doc.id for doc in qs]
    qs.update(document_type=document_type)

    async_task(
        "documents.tasks.bulk_update_documents", document_ids=affected_docs)

    return "OK"


def add_tag(doc_ids, tag):

    qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(tags__id=tag))
    affected_docs = [doc.id for doc in qs]

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.bulk_create([
        DocumentTagRelationship(
            document_id=doc, tag_id=tag) for doc in affected_docs
    ])

    async_task(
        "documents.tasks.bulk_update_documents", document_ids=affected_docs)

    return "OK"


def remove_tag(doc_ids, tag):

    qs = Document.objects.filter(Q(id__in=doc_ids) & Q(tags__id=tag))
    affected_docs = [doc.id for doc in qs]

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.filter(
        Q(document_id__in=affected_docs) &
        Q(tag_id=tag)
    ).delete()

    async_task(
        "documents.tasks.bulk_update_documents", document_ids=affected_docs)

    return "OK"


def modify_tags(doc_ids, add_tags, remove_tags):
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs = [doc.id for doc in qs]

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.filter(
        document_id__in=affected_docs,
        tag_id__in=remove_tags,
    ).delete()

    DocumentTagRelationship.objects.bulk_create([DocumentTagRelationship(
        document_id=doc, tag_id=tag) for (doc, tag) in itertools.product(
            affected_docs, add_tags)
    ], ignore_conflicts=True)

    async_task(
        "documents.tasks.bulk_update_documents", document_ids=affected_docs)

    return "OK"


def delete(doc_ids):
    Document.objects.filter(id__in=doc_ids).delete()

    from documents import index

    with index.open_index_writer() as writer:
        for id in doc_ids:
            index.remove_document_by_id(writer, id)

    return "OK"
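
All helpers follow the same shape: compute the affected IDs first, mutate through the ORM in bulk, then queue a single documents.tasks.bulk_update_documents task so the search index catches up. A minimal call sketch (the IDs are invented):

from documents import bulk_edit

bulk_edit.modify_tags([10, 11], add_tags=[3], remove_tags=[7])
bulk_edit.set_correspondent([10, 11], correspondent=2)
bulk_edit.delete([12])  # also removes the documents from the search index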
src/documents/checks.py

@@ -2,6 +2,7 @@ import textwrap

from django.conf import settings
from django.core.checks import Error, register
from django.core.exceptions import FieldError
from django.db.utils import OperationalError, ProgrammingError

from documents.signals import document_consumer_declaration

@@ -16,7 +17,7 @@ def changed_password_check(app_configs, **kwargs):
    try:
        encrypted_doc = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_GPG).first()
    except (OperationalError, ProgrammingError):
    except (OperationalError, ProgrammingError, FieldError):
        return []  # No documents table yet

    if encrypted_doc:

@@ -50,6 +51,6 @@ def parser_check(app_configs, **kwargs):

    if len(parsers) == 0:
        return [Error("No parsers found. This is a bug. The consumer won't be "
                      "able to onsume any documents without parsers.")]
                      "able to consume any documents without parsers.")]
    else:
        return []
src/documents/classifier.py

@@ -5,10 +5,6 @@ import pickle
import re

from django.conf import settings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.utils.multiclass import type_of_target

from documents.models import Document, MatchingModel

@@ -17,7 +13,7 @@ class IncompatibleClassifierVersionError(Exception):
    pass


logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.classifier")


def preprocess_content(content):

@@ -26,15 +22,40 @@ def preprocess_content(content):
    return content


def load_classifier():
    if not os.path.isfile(settings.MODEL_FILE):
        logger.debug(
            f"Document classification model does not exist (yet), not "
            f"performing automatic matching."
        )
        return None

    classifier = DocumentClassifier()
    try:
        classifier.load()

    except (EOFError, IncompatibleClassifierVersionError) as e:
        # there's something wrong with the model file.
        logger.exception(
            f"Unrecoverable error while loading document "
            f"classification model, deleting model file."
        )
        os.unlink(settings.MODEL_FILE)
        classifier = None
    except OSError as e:
        logger.error(
            f"Error while loading document classification model: {str(e)}"
        )
        classifier = None

    return classifier


class DocumentClassifier(object):

    FORMAT_VERSION = 6

    def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when
        # nothing has changed.
        self.classifier_version = 0

        # hash of the training data. used to prevent re-training when the
        # training data has not changed.
        self.data_hash = None

@@ -45,30 +66,23 @@ class DocumentClassifier(object):
        self.correspondent_classifier = None
        self.document_type_classifier = None

    def reload(self):
        if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
            with open(settings.MODEL_FILE, "rb") as f:
                schema_version = pickle.load(f)
    def load(self):
        with open(settings.MODEL_FILE, "rb") as f:
            schema_version = pickle.load(f)

                if schema_version != self.FORMAT_VERSION:
                    raise IncompatibleClassifierVersionError(
                        "Cannor load classifier, incompatible versions.")
                else:
                    if self.classifier_version > 0:
                        # Don't be confused by this check. It's simply here
                        # so that we wont log anything on initial reload.
                        logger.info("Classifier updated on disk, "
                                    "reloading classifier models")
                    self.data_hash = pickle.load(f)
                    self.data_vectorizer = pickle.load(f)
                    self.tags_binarizer = pickle.load(f)
            if schema_version != self.FORMAT_VERSION:
                raise IncompatibleClassifierVersionError(
                    "Cannor load classifier, incompatible versions.")
            else:
                self.data_hash = pickle.load(f)
                self.data_vectorizer = pickle.load(f)
                self.tags_binarizer = pickle.load(f)

                    self.tags_classifier = pickle.load(f)
                    self.correspondent_classifier = pickle.load(f)
                    self.document_type_classifier = pickle.load(f)
            self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
                self.tags_classifier = pickle.load(f)
                self.correspondent_classifier = pickle.load(f)
                self.document_type_classifier = pickle.load(f)

    def save_classifier(self):
    def save(self):
        with open(settings.MODEL_FILE, "wb") as f:
            pickle.dump(self.FORMAT_VERSION, f)
            pickle.dump(self.data_hash, f)

@@ -81,13 +95,14 @@ class DocumentClassifier(object):
            pickle.dump(self.document_type_classifier, f)

    def train(self):

        data = list()
        labels_tags = list()
        labels_correspondent = list()
        labels_document_type = list()

        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).debug("Gathering data from database...")
        logger.debug("Gathering data from database...")
        m = hashlib.sha1()
        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
            preprocessed_content = preprocess_content(doc.content)

@@ -134,7 +149,7 @@ class DocumentClassifier(object):
        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
        num_document_types = len(set(labels_document_type) | {-1}) - 1

        logging.getLogger(__name__).debug(
        logger.debug(
            "{} documents, {} tag(s), {} correspondent(s), "
            "{} document type(s).".format(
                len(data),

@@ -144,8 +159,12 @@ class DocumentClassifier(object):
            )
        )

        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.neural_network import MLPClassifier
        from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

        # Step 2: vectorize data
        logging.getLogger(__name__).debug("Vectorizing data...")
        logger.debug("Vectorizing data...")
        self.data_vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2),

@@ -155,7 +174,7 @@ class DocumentClassifier(object):

        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")
            logger.debug("Training tags classifier...")

            if num_tags == 1:
                # Special case where only one tag has auto:

@@ -174,12 +193,12 @@ class DocumentClassifier(object):
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            self.tags_classifier = None
            logging.getLogger(__name__).debug(
            logger.debug(
                "There are no tags. Not training tags classifier."
            )

        if num_correspondents > 0:
            logging.getLogger(__name__).debug(
            logger.debug(
                "Training correspondent classifier..."
            )
            self.correspondent_classifier = MLPClassifier(tol=0.01)

@@ -189,13 +208,13 @@ class DocumentClassifier(object):
            )
        else:
            self.correspondent_classifier = None
            logging.getLogger(__name__).debug(
            logger.debug(
                "There are no correspondents. Not training correspondent "
                "classifier."
            )

        if num_document_types > 0:
            logging.getLogger(__name__).debug(
            logger.debug(
                "Training document type classifier..."
            )
            self.document_type_classifier = MLPClassifier(tol=0.01)

@@ -205,7 +224,7 @@ class DocumentClassifier(object):
            )
        else:
            self.document_type_classifier = None
            logging.getLogger(__name__).debug(
            logger.debug(
                "There are no document types. Not training document type "
                "classifier."
            )

@@ -237,6 +256,8 @@ class DocumentClassifier(object):
        return None

    def predict_tags(self, content):
        from sklearn.utils.multiclass import type_of_target

        if self.tags_classifier:
            X = self.data_vectorizer.transform([preprocess_content(content)])
            y = self.tags_classifier.predict(X)
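
load_classifier() replaces the old construct-then-reload() dance and centralizes the error handling: callers get either a ready classifier or None. A minimal caller-side sketch (the document variable is an assumption):

from documents.classifier import load_classifier

classifier = load_classifier()  # None if there is no usable model file
if classifier:
    # predict_tags() takes raw document text, as shown in the hunk above.
    tag_ids = classifier.predict_tags(document.content)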
src/documents/consumer.py

@@ -1,20 +1,25 @@
import datetime
import hashlib
import logging
import os
import uuid
from subprocess import Popen

import magic
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse

from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import create_source_path_directory
from .classifier import load_classifier
from .file_handling import create_source_path_directory, \
    generate_unique_filename
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type, \
    get_supported_file_extensions, parse_date
from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
from .signals import (
    document_consumption_finished,
    document_consumption_started

@@ -25,8 +30,45 @@ class ConsumerError(Exception):
    pass


MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"


class Consumer(LoggingMixin):

    logging_name = "paperless.consumer"

    def _send_progress(self, current_progress, max_progress, status,
                       message=None, document_id=None):
        payload = {
            'filename': os.path.basename(self.filename) if self.filename else None,  # NOQA: E501
            'task_id': self.task_id,
            'current_progress': current_progress,
            'max_progress': max_progress,
            'status': status,
            'message': message,
            'document_id': document_id
        }
        async_to_sync(self.channel_layer.group_send)("status_updates",
                                                     {'type': 'status_update',
                                                      'data': payload})

    def _fail(self, message, log_message=None):
        self._send_progress(100, 100, 'FAILED', message)
        self.log("error", log_message or message)
        raise ConsumerError(f"{self.filename}: {log_message or message}")

    def __init__(self):
        super().__init__()
        self.path = None

@@ -35,11 +77,16 @@ class Consumer(LoggingMixin):
        self.override_correspondent_id = None
        self.override_tag_ids = None
        self.override_document_type_id = None
        self.task_id = None

        self.channel_layer = get_channel_layer()

    def pre_check_file_exists(self):
        if not os.path.isfile(self.path):
            raise ConsumerError("Cannot consume {}: It is not a file".format(
                self.path))
            self._fail(
                MESSAGE_FILE_NOT_FOUND,
                f"Cannot consume {self.path}: File not found."
            )

    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:

@@ -47,8 +94,9 @@ class Consumer(LoggingMixin):
        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            raise ConsumerError(
                "Not consuming {}: It is a duplicate.".format(self.filename)
            self._fail(
                MESSAGE_DOCUMENT_ALREADY_EXISTS,
                f"Not consuming {self.filename}: It is a duplicate."
            )

    def pre_check_directories(self):

@@ -57,13 +105,62 @@ class Consumer(LoggingMixin):
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

    def run_pre_consume_script(self):
        if not settings.PRE_CONSUME_SCRIPT:
            return

        if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
            self._fail(
                MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured pre-consume script "
                f"{settings.PRE_CONSUME_SCRIPT} does not exist.")

        try:
            Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
        except Exception as e:
            self._fail(
                MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
                f"Error while executing pre-consume script: {e}"
            )

    def run_post_consume_script(self, document):
        if not settings.POST_CONSUME_SCRIPT:
            return

        if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
            self._fail(
                MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured post-consume script "
                f"{settings.POST_CONSUME_SCRIPT} does not exist."
            )

        try:
            Popen((
                settings.POST_CONSUME_SCRIPT,
                str(document.pk),
                document.get_public_filename(),
                os.path.normpath(document.source_path),
                os.path.normpath(document.thumbnail_path),
                reverse("document-download", kwargs={"pk": document.pk}),
                reverse("document-thumb", kwargs={"pk": document.pk}),
                str(document.correspondent),
                str(",".join(document.tags.all().values_list(
                    "name", flat=True)))
            )).wait()
        except Exception as e:
            self._fail(
                MESSAGE_POST_CONSUME_SCRIPT_ERROR,
                f"Error while executing post-consume script: {e}"
            )

    def try_consume_file(self,
                         path,
                         override_filename=None,
                         override_title=None,
                         override_correspondent_id=None,
                         override_document_type_id=None,
                         override_tag_ids=None):
                         override_tag_ids=None,
                         task_id=None):
        """
        Return the document object if it was successfully created.
        """

@@ -74,6 +171,9 @@ class Consumer(LoggingMixin):
        self.override_correspondent_id = override_correspondent_id
        self.override_document_type_id = override_document_type_id
        self.override_tag_ids = override_tag_ids
        self.task_id = task_id or str(uuid.uuid4())

        self._send_progress(0, 100, 'STARTING', MESSAGE_NEW_FILE)

        # this is for grouping logging entries for this particular file
        # together.

@@ -86,19 +186,20 @@ class Consumer(LoggingMixin):
        self.pre_check_directories()
        self.pre_check_duplicate()

        self.log("info", "Consuming {}".format(self.filename))
        self.log("info", f"Consuming {self.filename}")

        # Determine the parser class.

        mime_type = magic.from_file(self.path, mime=True)

        self.log("debug", f"Detected mime type: {mime_type}")

        parser_class = get_parser_class_for_mime_type(mime_type)
        if not parser_class:
            raise ConsumerError(f"No parsers abvailable for {self.filename}")
        else:
            self.log("debug",
                     f"Parser: {parser_class.__name__} "
                     f"based on mime type {mime_type}")
            self._fail(
                MESSAGE_UNSUPPORTED_TYPE,
                f"Unsupported mime type {mime_type}"
            )

        # Notify all listeners that we're going to do some work.

@@ -108,35 +209,54 @@ class Consumer(LoggingMixin):
            logging_group=self.logging_group
        )

        self.run_pre_consume_script()

        def progress_callback(current_progress, max_progress):
            # recalculate progress to be within 20 and 80
            p = int((current_progress / max_progress) * 50 + 20)
            self._send_progress(p, 100, "WORKING")

        # This doesn't parse the document yet, but gives us a parser.

        document_parser = parser_class(self.logging_group)
        document_parser = parser_class(self.logging_group, progress_callback)

        self.log("debug", f"Parser: {type(document_parser).__name__}")

        # However, this already created working directories which we have to
        # clean up.

        # Parse the document. This may take some time.

        text = None
        date = None
        thumbnail = None
        archive_path = None

        try:
            self._send_progress(20, 100, 'WORKING', MESSAGE_PARSING_DOCUMENT)
            self.log("debug", "Parsing {}...".format(self.filename))
            document_parser.parse(self.path, mime_type)
            document_parser.parse(self.path, mime_type, self.filename)

            self.log("debug", f"Generating thumbnail for {self.filename}...")
            self._send_progress(70, 100, 'WORKING',
                                MESSAGE_GENERATING_THUMBNAIL)
            thumbnail = document_parser.get_optimised_thumbnail(
                self.path, mime_type)
                self.path, mime_type, self.filename)

            text = document_parser.get_text()
            date = document_parser.get_date()
            if not date:
                self._send_progress(90, 100, 'WORKING',
                                    MESSAGE_PARSE_DATE)
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()

        except ParseError as e:
            document_parser.cleanup()
            self.log(
                "error",
                f"Error while consuming document {self.filename}: {e}")
            raise ConsumerError(e)
            self._fail(
                str(e),
                f"Error while consuming document {self.filename}: {e}"
            )

        # Prepare the document classifier.

@@ -144,14 +264,9 @@ class Consumer(LoggingMixin):
        # reloading the classifier multiple times, since there are multiple
        # post-consume hooks that all require the classifier.

        try:
            classifier = DocumentClassifier()
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
            logging.getLogger(__name__).warning(
                "Cannot classify documents: {}.".format(e))
            classifier = None
        classifier = load_classifier()

        self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
        # now that everything is done, we can start to store the document
        # in the system. This will be a transaction and reasonably fast.
        try:

@@ -176,51 +291,55 @@ class Consumer(LoggingMixin):

            # After everything is in the database, copy the files into
            # place. If this fails, we'll also rollback the transaction.
            with FileLock(settings.MEDIA_LOCK):
                document.filename = generate_unique_filename(document)
                create_source_path_directory(document.source_path)

            # TODO: not required, since this is done by the file handling
            # logic
            create_source_path_directory(document.source_path)

                self._write(document.storage_type,
                            self.path, document.source_path)

            self._write(document.storage_type,
                        thumbnail, document.thumbnail_path)

            if archive_path and os.path.isfile(archive_path):
                self._write(document.storage_type,
                            archive_path, document.archive_path)
                            self.path, document.source_path)

                with open(archive_path, 'rb') as f:
                    document.archive_checksum = hashlib.md5(
                        f.read()).hexdigest()
                    document.save()
                self._write(document.storage_type,
                            thumbnail, document.thumbnail_path)

            # Afte performing all database operations and moving files
            # into place, tell paperless where the file is.
            document.filename = os.path.basename(document.source_path)
            # Saving the document now will trigger the filename handling
            # logic.
                if archive_path and os.path.isfile(archive_path):
                    document.archive_filename = generate_unique_filename(
                        document,
                        archive_filename=True
                    )
                    create_source_path_directory(document.archive_path)
                    self._write(document.storage_type,
                                archive_path, document.archive_path)

                    with open(archive_path, 'rb') as f:
                        document.archive_checksum = hashlib.md5(
                            f.read()).hexdigest()

            # Don't save with the lock active. Saving will cause the file
            # renaming logic to aquire the lock as well.
            document.save()

            # Delete the file only if it was successfully consumed
            self.log("debug", "Deleting file {}".format(self.path))
            os.unlink(self.path)

        except Exception as e:
            self.log(
                "error",
            self._fail(
                str(e),
                f"The following error occured while consuming "
                f"{self.filename}: {e}"
            )
            raise ConsumerError(e)
        finally:
            document_parser.cleanup()

        self.run_post_consume_script(document)

        self.log(
            "info",
            "Document {} consumption finished".format(document)
        )

        self._send_progress(100, 100, 'SUCCESS', MESSAGE_FINISHED, document.id)

        return document

    def _store(self, text, date, mime_type):

@@ -240,8 +359,7 @@ class Consumer(LoggingMixin):

        with open(self.path, "rb") as f:
            document = Document.objects.create(
                correspondent=file_info.correspondent,
                title=file_info.title,
                title=(self.override_title or file_info.title)[:127],
                content=text,
                mime_type=mime_type,
                checksum=hashlib.md5(f.read()).hexdigest(),

@@ -250,20 +368,13 @@ class Consumer(LoggingMixin):
                storage_type=storage_type
            )

        relevant_tags = set(file_info.tags)
        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self.log("debug", "Tagging with {}".format(tag_names))
            document.tags.add(*relevant_tags)

        self.apply_overrides(document)

        document.save()

        return document

    def apply_overrides(self, document):
        if self.override_title:
            document.title = self.override_title

        if self.override_correspondent_id:
            document.correspondent = Correspondent.objects.get(
                pk=self.override_correspondent_id)
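
try_consume_file() now reports progress over the status_updates channels group, keyed by task_id; a caller that knows the frontend's task ID threads it through, otherwise a random UUID is generated. A minimal call sketch (the path, title and task ID are invented):

from documents.consumer import Consumer, ConsumerError

consumer = Consumer()
try:
    document = consumer.try_consume_file(
        "/usr/src/paperless/consume/scan.pdf",
        override_title="Utility bill",
        task_id="0b9f0f7e-made-up",
    )
except ConsumerError as e:
    # _fail() has already pushed a FAILED progress update at this point.
    print(e)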
src/documents/file_handling.py

@@ -1,11 +1,22 @@
import datetime
import logging
import os
from collections import defaultdict

import pathvalidate
from django.conf import settings
from django.template.defaultfilters import slugify


logger = logging.getLogger("paperless.filehandling")


class defaultdictNoStr(defaultdict):

    def __str__(self):
        raise ValueError("Don't use {tags} directly.")


def create_source_path_directory(source_path):
    os.makedirs(os.path.dirname(source_path), exist_ok=True)

@@ -68,44 +79,119 @@ def many_to_dictionary(field):
    return mydictionary


def generate_filename(doc):
def generate_unique_filename(doc,
                             archive_filename=False):
    """
    Generates a unique filename for doc in settings.ORIGINALS_DIR.

    The returned filename is guaranteed to be either the current filename
    of the document if unchanged, or a new filename that does not correspondent
    to any existing files. The function will append _01, _02, etc to the
    filename before the extension to avoid conflicts.

    If archive_filename is True, return a unique archive filename instead.
    """
    if archive_filename:
        old_filename = doc.archive_filename
        root = settings.ARCHIVE_DIR
    else:
        old_filename = doc.filename
        root = settings.ORIGINALS_DIR

    # If generating archive filenames, try to make a name that is similar to
    # the original filename first.

    if archive_filename and doc.filename:
        new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
        if new_filename == old_filename or not os.path.exists(os.path.join(root, new_filename)):  # NOQA: E501
            return new_filename

    counter = 0

    while True:
        new_filename = generate_filename(
            doc, counter, archive_filename=archive_filename)
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if os.path.exists(os.path.join(root, new_filename)):
            counter += 1
        else:
            return new_filename


def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdict(lambda: slugify(None),
                               many_to_dictionary(doc.tags))
            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                correspondent=slugify(doc.correspondent),
                title=slugify(doc.title),
                created=slugify(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=doc.created.month if doc.created else "none",
                created_day=doc.created.day if doc.created else "none",
                added=slugify(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=doc.added.month if doc.added else "none",
                added_day=doc.added.day if doc.added else "none",
                tags=tags,
            tags = defaultdictNoStr(lambda: slugify(None),
                                    many_to_dictionary(doc.tags))

            tag_list = pathvalidate.sanitize_filename(
                ",".join(sorted(
                    [tag.name for tag in doc.tags.all()]
                )),
                replacement_text="-"
            )

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name, replacement_text="-"
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name, replacement_text="-"
                )
            else:
                document_type = "none"

            if doc.archive_serial_number:
                asn = str(doc.archive_serial_number)
            else:
                asn = "none"

            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(
                    doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=datetime.date.isoformat(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
                created_day=f"{doc.created.day:02}" if doc.created else "none",
                added=datetime.date.isoformat(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=f"{doc.added.month:02}" if doc.added else "none",
                added_day=f"{doc.added.day:02}" if doc.added else "none",
                asn=asn,
                tags=tags,
                tag_list=tag_list
            ).strip()

        path = path.strip(os.sep)

    except (ValueError, KeyError, IndexError):
        logging.getLogger(__name__).warning(
        logger.warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    # Always append the primary key to guarantee uniqueness of filename
    counter_str = f"_{counter:02}" if counter else ""

    filetype_str = ".pdf" if archive_filename else doc.file_type

    if len(path) > 0:
        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
        filename = f"{path}{counter_str}{filetype_str}"
    else:
        filename = "%07i%s" % (doc.pk, doc.file_type)
        filename = f"{doc.pk:07}{counter_str}{filetype_str}"

    # Append .gpg for encrypted files
    if doc.storage_type == doc.STORAGE_TYPE_GPG:
    if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename


def archive_name_from_filename(filename):
    return os.path.splitext(filename)[0] + ".pdf"
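
The rewritten generate_filename() sanitizes each component with pathvalidate instead of slugify, zero-pads {created_month} and {created_day}, and gains {document_type}, {asn} and {tag_list}; generate_unique_filename() layers the _01, _02 counter on top. A worked example with invented values:

# Assuming PAPERLESS_FILENAME_FORMAT = "{created_year}/{correspondent}/{title}"
# and a PDF titled "Invoice" from "ACME Inc", created 2021-02-03:
#
#   generate_filename(doc)             -> 2021/ACME Inc/Invoice.pdf
#   generate_filename(doc, counter=1)  -> 2021/ACME Inc/Invoice_01.pdf
#   generate_unique_filename(doc)      -> the first name in that sequence
#                                         that does not collide on disk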
src/documents/filters.py

@@ -4,7 +4,7 @@ from .models import Correspondent, Document, Tag, DocumentType, Log

CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
INT_KWARGS = ["exact", "gt", "gte", "lt", "lte"]
INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"]
DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"]


@@ -37,6 +37,10 @@ class DocumentTypeFilterSet(FilterSet):

class TagsFilter(Filter):

    def __init__(self, exclude=False):
        super(TagsFilter, self).__init__()
        self.exclude = exclude

    def filter(self, qs, value):
        if not value:
            return qs

@@ -47,7 +51,10 @@ class TagsFilter(Filter):
            return qs

        for tag_id in tag_ids:
            qs = qs.filter(tags__id=tag_id)
            if self.exclude:
                qs = qs.exclude(tags__id=tag_id)
            else:
                qs = qs.filter(tags__id=tag_id)

        return qs

@@ -74,6 +81,8 @@ class DocumentFilterSet(FilterSet):

    tags__id__all = TagsFilter()

    tags__id__none = TagsFilter(exclude=True)

    is_in_inbox = InboxFilter()

    class Meta:

@@ -89,12 +98,14 @@ class DocumentFilterSet(FilterSet):
            "added": DATE_KWARGS,
            "modified": DATE_KWARGS,

            "correspondent": ["isnull"],
            "correspondent__id": ID_KWARGS,
            "correspondent__name": CHAR_KWARGS,

            "tags__id": ID_KWARGS,
            "tags__name": CHAR_KWARGS,

            "document_type": ["isnull"],
            "document_type__id": ID_KWARGS,
            "document_type__name": CHAR_KWARGS,
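
tags__id__none mirrors tags__id__all with exclude() instead of filter(): every listed tag must be absent. Hypothetical queries against the documents endpoint (IDs invented; the last example assumes the new isnull lookup is exposed for an integer field through the Meta mapping):

GET /api/documents/?tags__id__all=3,4     # tagged with both 3 and 4
GET /api/documents/?tags__id__none=3,4    # tagged with neither 3 nor 4
GET /api/documents/?correspondent__isnull=true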
src/documents/forms.py (deleted)

@@ -1,59 +0,0 @@
import os
import tempfile
from datetime import datetime
from time import mktime

import magic
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError

from documents.parsers import is_mime_type_supported


class UploadForm(forms.Form):

    document = forms.FileField()

    def clean_document(self):
        document_name = self.cleaned_data.get("document").name

        try:
            validate_filename(document_name)
        except ValidationError:
            raise forms.ValidationError("That filename is suspicious.")

        document_data = self.cleaned_data.get("document").read()

        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
            raise forms.ValidationError("This mime type is not supported.")

        return document_name, document_data

    def save(self):
        """
        Since the consumer already does a lot of work, it's easier just to save
        to-be-consumed files to the consumption directory rather than have the
        form do that as well. Think of it as a poor-man's queue server.
        """

        original_filename, data = self.cleaned_data.get("document")

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:

            f.write(data)
            os.utime(f.name, times=(t, t))

            async_task("documents.tasks.consume_file",
                       f.name,
                       override_filename=original_filename,
                       task_name=os.path.basename(original_filename)[:100])
src/documents/index.py

@@ -3,7 +3,7 @@ import os
from contextlib import contextmanager

from django.conf import settings
from whoosh import highlight
from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir

@@ -12,7 +12,7 @@ from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter


logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.index")


class JsonFormatter(Formatter):

@@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
        self.seen = {}

    def format_token(self, text, token, replace=False):
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen)
            seen[ttext] = termnum

        return {'text': ttext, 'term': termnum}
        return {'text': ttext, 'highlight': 'true'}

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        amend_token = None
        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append({'text': text[index:t.startchar]})
            output.append(self.format_token(text, t, replace))
                text_inbetween = text[index:t.startchar]
                if amend_token and t.startchar - index < 10:
                    amend_token['text'] += text_inbetween
                else:
                    output.append({'text': text_inbetween,
                                   'highlight': False})
                    amend_token = None
            token = self.format_token(text, t, replace)
            if amend_token:
                amend_token['text'] += token['text']
            else:
                output.append(token)
                amend_token = token
            index = t.endchar
        if index < fragment.endchar:
            output.append({'text': text[index:fragment.endchar]})
            output.append({'text': text[index:fragment.endchar],
                           'highlight': False})
        return output

    def format(self, fragments, replace=False):

@@ -73,16 +78,31 @@ def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception as e:
        logger.error(f"Error while opening the index: {e}, recreating.")
    except Exception:
        logger.exception(f"Error while opening the index, recreating.")

    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)
    return create_in(settings.INDEX_DIR, get_schema())


@contextmanager
def open_index_writer(ix=None, optimize=False):
    if ix:
        writer = AsyncWriter(ix)
    else:
        writer = AsyncWriter(open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)


def update_document(writer, doc):
    logger.debug("Indexing {}...".format(doc))
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,

@@ -98,39 +118,60 @@ def update_document(writer, doc):


def remove_document(writer, doc):
    logger.debug("Removing {} from index...".format(doc))
    writer.delete_by_term('id', doc.pk)
    remove_document_by_id(writer, doc.pk)


def remove_document_by_id(writer, doc_id):
    writer.delete_by_term('id', doc_id)


def add_or_update_document(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
    with open_index_writer() as writer:
        update_document(writer, document)


def remove_document_from_index(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
    with open_index_writer() as writer:
        remove_document(writer, document)


@contextmanager
def query_page(ix, querystring, page):
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        qp.add_plugin(DateParserPlugin())
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            docnum = searcher.document_number(id=more_like_doc_id)
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content, numterms=20,
                model=classify.Bo1Model, normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight)
                 for word, weight in kts])
            result_page = searcher.search_page(
                more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required."
            )

        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None
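
open_index_writer() wraps AsyncWriter so that every caller gets the same cancel-on-error, commit-on-exit behaviour. A minimal reindexing sketch built only from functions in this file (the queryset is illustrative):

from documents import index
from documents.models import Document

# The context manager commits on exit and cancels pending changes on error.
with index.open_index_writer(optimize=True) as writer:
    for doc in Document.objects.all():
        index.update_document(writer, doc)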
src/documents/loggers.py

@@ -4,34 +4,25 @@ import uuid
from django.conf import settings


class PaperlessHandler(logging.Handler):
    def emit(self, record):
        if settings.DISABLE_DBHANDLER:
            return

        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log

        kwargs = {"message": record.msg, "level": record.levelno}

        if hasattr(record, "group"):
            kwargs["group"] = record.group

        Log.objects.create(**kwargs)


class LoggingMixin:

    logging_group = None

    logging_name = None

    def renew_logging_group(self):
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        target = ".".join([self.__class__.__module__, self.__class__.__name__])
        logger = logging.getLogger(target)
    def log(self, level, message, **kwargs):
        if self.logging_name:
            logger = logging.getLogger(self.logging_name)
        else:
            name = ".".join([
                self.__class__.__module__,
                self.__class__.__name__
            ])
            logger = logging.getLogger(name)

        getattr(logger, level)(message, extra={
            "group": self.logging_group
        })
        }, **kwargs)
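
With logging_name set, LoggingMixin.log() routes to a fixed logger instead of deriving one from the module and class path, and still attaches the logging_group so related records can be grouped. A hypothetical subclass for illustration:

from documents.loggers import LoggingMixin

class MyWorker(LoggingMixin):
    logging_name = "paperless.myworker"  # hypothetical logger name

w = MyWorker()
w.renew_logging_group()   # new UUID tying this run's messages together
w.log("info", "starting")  # -> logging.getLogger("paperless.myworker").info(...)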
src/documents/management/commands/decrypt_documents.py

@@ -2,7 +2,6 @@ import os

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from termcolor import colored as coloured

from documents.models import Document
from paperless.db import GnuPG

@@ -26,16 +25,14 @@ class Command(BaseCommand):
    def handle(self, *args, **options):

        try:
            print(coloured(
            print(
                "\n\nWARNING: This script is going to work directly on your "
                "document originals, so\nWARNING: you probably shouldn't run "
                "this unless you've got a recent backup\nWARNING: handy. It "
                "*should* work without a hitch, but be safe and backup your\n"
                "WARNING: stuff first.\n\nHit Ctrl+C to exit now, or Enter to "
                "continue.\n\n",
                "yellow",
                attrs=("bold",)
            ))
                "continue.\n\n"
            )
            __ = input()
        except KeyboardInterrupt:
            return

@@ -57,8 +54,8 @@ class Command(BaseCommand):

        for document in encrypted_files:

            print(coloured("Decrypting {}".format(
                document).encode('utf-8'), "green"))
            print("Decrypting {}".format(
                document).encode('utf-8'))

            old_paths = [document.source_path, document.thumbnail_path]

@@ -82,7 +79,8 @@ class Command(BaseCommand):
            with open(document.thumbnail_path, "wb") as f:
                f.write(raw_thumb)

            document.save(update_fields=("storage_type", "filename"))
            Document.objects.filter(id=document.id).update(
                storage_type=document.storage_type, filename=document.filename)

            for path in old_paths:
                os.unlink(path)
@@ -5,59 +5,83 @@ import logging
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from time import sleep
|
||||
|
||||
import tqdm
|
||||
from django import db
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import transaction
|
||||
from filelock import FileLock
|
||||
from whoosh.writing import AsyncWriter

from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...file_handling import create_source_path_directory, \
    generate_unique_filename
from ...parsers import get_parser_class_for_mime_type


logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.archiver")


def handle_document(document):
def handle_document(document_id):
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    if not parser_class:
        logger.error(f"No parser found for mime type {mime_type}, cannot "
                     f"archive document {document} (ID: {document_id})")
        return

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)
        parser.parse(
            document.source_path,
            mime_type,
            document.get_public_filename())

        thumbnail = parser.get_optimised_thumbnail(
            document.source_path,
            mime_type,
            document.get_public_filename()
        )

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # i'm going to save first so that in case the file move
                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # we also don't use save() since that triggers the filehandling
                # We also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                document.archive_filename = generate_unique_filename(
                    document, archive_filename=True)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text()
                    content=parser.get_text(),
                    archive_filename=document.archive_filename
                )
                create_source_path_directory(document.archive_path)
                shutil.move(parser.get_archive_path(), document.archive_path)
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(),
                                document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
        logger.exception(f"Error while parsing document {document} "
                         f"(ID: {document_id})")
    finally:
        parser.cleanup()


class Command(Renderable, BaseCommand):
class Command(BaseCommand):

    help = """
        Using the current classification model, assigns correspondents, tags
@@ -66,10 +90,6 @@ class Command(Renderable, BaseCommand):
        modified) after their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-f", "--overwrite",
@@ -98,17 +118,29 @@ class Command(Renderable, BaseCommand):
        else:
            documents = Document.objects.all()

        documents_to_process = list(filter(
            lambda d: overwrite or not d.archive_checksum,
            documents
        document_ids = list(map(
            lambda doc: doc.id,
            filter(
                lambda d: overwrite or not d.has_archive_version,
                documents
            )
        ))

        logging.getLogger().handlers[0].level = logging.ERROR
        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
            list(tqdm.tqdm(
                pool.imap_unordered(
                    handle_document,
                    documents_to_process
                ),
                total=len(documents_to_process)
            ))
        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:
            logging.getLogger().handlers[0].level = logging.ERROR
            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                list(tqdm.tqdm(
                    pool.imap_unordered(
                        handle_document,
                        document_ids
                    ),
                    total=len(document_ids)
                ))
        except KeyboardInterrupt:
            print("Aborting...")
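A minimal standalone sketch (not part of the diff) of the pool pattern the archiver now uses: pass primary keys instead of model instances to the workers, and close inherited database connections before forking; `work` and `run_all` are hypothetical names.

    import multiprocessing

    from django import db


    def work(item_id):
        # each forked worker lazily opens its own database connection
        ...


    def run_all(ids, workers=4):
        # close inherited connections so forked workers don't share one socket
        db.connections.close_all()
        with multiprocessing.Pool(processes=workers) as pool:
            for _ in pool.imap_unordered(work, ids):
                pass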
@@ -1,11 +1,11 @@
import logging
import os
from pathlib import Path
from threading import Thread
from time import sleep

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.utils.text import slugify
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
@@ -18,21 +18,20 @@ try:
except ImportError:
    INotify = flags = None

logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.consumer")


def _tags_from_path(filepath):
    """Walk up the directory tree from filepath to CONSUMPTION_DIr
    """Walk up the directory tree from filepath to CONSUMPTION_DIR
    and get or create Tag IDs for every directory.
    """
    tag_ids = set()
    path_parts = Path(filepath).relative_to(
        settings.CONSUMPTION_DIR).parent.parts
    for part in path_parts:
        tag_ids.add(Tag.objects.get_or_create(
            slug=slugify(part),
            defaults={"name": part},
        )[0].pk)
        tag_ids.add(Tag.objects.get_or_create(name__iexact=part, defaults={
            "name": part
        })[0].pk)

    return tag_ids
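For illustration (hypothetical paths, not from the diff): a file dropped at `<CONSUMPTION_DIR>/taxes/2020/scan.pdf` yields one tag per intermediate directory.

    from pathlib import Path

    # the same path arithmetic as _tags_from_path, on a made-up example
    parts = Path("/consume/taxes/2020/scan.pdf").relative_to("/consume").parent.parts
    print(parts)  # ('taxes', '2020') -> one get_or_create per part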
@@ -47,7 +46,7 @@ def _consume(filepath):
        return

    if not is_file_ext_supported(os.path.splitext(filepath)[1]):
        logger.debug(
        logger.warning(
            f"Not consuming file {filepath}: Unknown file extension.")
        return

@@ -56,10 +55,10 @@ def _consume(filepath):
        if settings.CONSUMER_SUBDIRS_AS_TAGS:
            tag_ids = _tags_from_path(filepath)
    except Exception as e:
        logger.error(
            "Error creating tags from path: {}".format(e))
        logger.exception("Error creating tags from path")

    try:
        logger.info(f"Adding {filepath} to the task queue.")
        async_task("documents.tasks.consume_file",
                   filepath,
                   override_tag_ids=tag_ids if tag_ids else None,
@@ -68,14 +67,14 @@ def _consume(filepath):
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.error(
            "Error while consuming document: {}".format(e))
        logger.exception("Error while consuming document")


def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
def _consume_wait_unmodified(file):
    logger.debug(f"Waiting for file {file} to remain unmodified")
    mtime = -1
    current_try = 0
    while current_try < num_tries:
    while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
        try:
            new_mtime = os.stat(file).st_mtime
        except FileNotFoundError:
@@ -86,7 +85,7 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
            _consume(file)
            return
        mtime = new_mtime
        sleep(wait_time)
        sleep(settings.CONSUMER_POLLING_DELAY)
        current_try += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
@@ -95,10 +94,14 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
class Handler(FileSystemEventHandler):

    def on_created(self, event):
        _consume_wait_unmodified(event.src_path)
        Thread(
            target=_consume_wait_unmodified, args=(event.src_path,)
        ).start()

    def on_moved(self, event):
        _consume_wait_unmodified(event.dest_path)
        Thread(
            target=_consume_wait_unmodified, args=(event.dest_path,)
        ).start()


class Command(BaseCommand):
@@ -110,12 +113,7 @@ class Command(BaseCommand):
    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = False

    def __init__(self, *args, **kwargs):

        self.logger = logging.getLogger(__name__)

        BaseCommand.__init__(self, *args, **kwargs)
        self.observer = None
    observer = None

    def add_arguments(self, parser):
        parser.add_argument(
@@ -163,7 +161,7 @@ class Command(BaseCommand):
        logger.debug("Consumer exiting.")

    def handle_polling(self, directory, recursive):
        logging.getLogger(__name__).info(
        logger.info(
            f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=recursive)
@@ -178,7 +176,7 @@ class Command(BaseCommand):
        self.observer.join()

    def handle_inotify(self, directory, recursive):
        logging.getLogger(__name__).info(
        logger.info(
            f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
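A standalone sketch of the stability check above (not from the diff; retry count and delay are hypothetical defaults standing in for the settings): the file is considered stable once its mtime stops changing between polls.

    import os
    from time import sleep


    def wait_until_stable(path, retries=20, delay=1.0):
        """Return True once the file's mtime stops changing, False on timeout."""
        mtime = -1
        for _ in range(retries):
            try:
                new_mtime = os.stat(path).st_mtime
            except FileNotFoundError:
                return False  # file vanished, e.g. moved away mid-write
            if new_mtime == mtime:
                return True   # unchanged since last poll: safe to consume
            mtime = new_mtime
            sleep(delay)
        return False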
@@ -1,10 +1,9 @@
from django.core.management.base import BaseCommand

from ...mixins import Renderable
from ...tasks import train_classifier


class Command(Renderable, BaseCommand):
class Command(BaseCommand):

    help = """
        Trains the classifier on your data and saves the resulting models to a
@@ -1,19 +1,24 @@
import hashlib
import json
import os
import shutil
import time

import tqdm
from django.conf import settings
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from filelock import FileLock

from documents.models import Document, Correspondent, Tag, DocumentType
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from ...mixins import Renderable
from ...file_handling import generate_filename, delete_empty_directories


class Command(Renderable, BaseCommand):
class Command(BaseCommand):

    help = """
        Decrypt and rename all files in our collection into a given target
@@ -24,13 +29,47 @@ class Command(Renderable, BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument("target")

        parser.add_argument(
            "-c", "--compare-checksums",
            default=False,
            action="store_true",
            help="Compare file checksums when determining whether to export "
                 "a file or not. If not specified, file size and time "
                 "modified is used instead."
        )

        parser.add_argument(
            "-f", "--use-filename-format",
            default=False,
            action="store_true",
            help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
                 "export directory, if configured."
        )

        parser.add_argument(
            "-d", "--delete",
            default=False,
            action="store_true",
            help="After exporting, delete files in the export directory that "
                 "do not belong to the current export, such as files from "
                 "deleted documents."
        )

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
        self.target = None
        self.files_in_export_dir = []
        self.exported_files = []
        self.compare_checksums = False
        self.use_filename_format = False
        self.delete = False

    def handle(self, *args, **options):

        self.target = options["target"]
        self.compare_checksums = options['compare_checksums']
        self.use_filename_format = options['use_filename_format']
        self.delete = options['delete']

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")
@@ -38,72 +77,148 @@ class Command(Renderable, BaseCommand):
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        self.dump()
        with FileLock(settings.MEDIA_LOCK):
            self.dump()

    def dump(self):
        # 1. Take a snapshot of what files exist in the current export folder
        for root, dirs, files in os.walk(self.target):
            self.files_in_export_dir.extend(
                map(lambda f: os.path.abspath(os.path.join(root, f)), files)
            )

        documents = Document.objects.all()
        document_map = {d.pk: d for d in documents}
        manifest = json.loads(serializers.serialize("json", documents))
        # 2. Create manifest, containing all correspondents, types, tags and
        # documents
        with transaction.atomic():
            manifest = json.loads(
                serializers.serialize("json", Correspondent.objects.all()))

        for index, document_dict in enumerate(manifest):
            manifest += json.loads(serializers.serialize(
                "json", Tag.objects.all()))

            # Force output to unencrypted as that will be the current state.
            # The importer will make the decision to encrypt or not.
            manifest[index]["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED  # NOQA: E501
            manifest += json.loads(serializers.serialize(
                "json", DocumentType.objects.all()))

            documents = Document.objects.order_by("id")
            document_map = {d.pk: d for d in documents}
            document_manifest = json.loads(
                serializers.serialize("json", documents))
            manifest += document_manifest

        # 3. Export files from each document
        for index, document_dict in tqdm.tqdm(enumerate(document_manifest),
                                              total=len(document_manifest)):
            # 3.1. store files unencrypted
            document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED  # NOQA: E501

            document = document_map[document_dict["pk"]]

            unique_filename = f"{document.pk:07}_{document.file_name}"
            file_target = os.path.join(self.target, unique_filename)
            # 3.2. generate a unique filename
            filename_counter = 0
            while True:
                if self.use_filename_format:
                    base_name = generate_filename(
                        document, counter=filename_counter,
                        append_gpg=False)
                else:
                    base_name = document.get_public_filename(
                        counter=filename_counter)

            thumbnail_name = unique_filename + "-thumbnail.png"
                if base_name not in self.exported_files:
                    self.exported_files.append(base_name)
                    break
                else:
                    filename_counter += 1

            # 3.3. write filenames into manifest
            original_name = base_name
            original_target = os.path.join(self.target, original_name)
            document_dict[EXPORTER_FILE_NAME] = original_name

            thumbnail_name = base_name + "-thumbnail.png"
            thumbnail_target = os.path.join(self.target, thumbnail_name)

            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            if os.path.exists(document.archive_path):
                archive_name = \
                    f"{document.pk:07}_archive_{document.archive_file_name}"
            if document.has_archive_version:
                archive_name = base_name + "-archive.pdf"
                archive_target = os.path.join(self.target, archive_name)
                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
            else:
                archive_target = None

            print(f"Exporting: {file_target}")

            # 3.4. write files to target folder
            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:

                with open(file_target, "wb") as f:
                os.makedirs(os.path.dirname(original_target), exist_ok=True)
                with open(original_target, "wb") as f:
                    f.write(GnuPG.decrypted(document.source_file))
                os.utime(file_target, times=(t, t))
                os.utime(original_target, times=(t, t))

                os.makedirs(os.path.dirname(thumbnail_target), exist_ok=True)
                with open(thumbnail_target, "wb") as f:
                    f.write(GnuPG.decrypted(document.thumbnail_file))
                os.utime(thumbnail_target, times=(t, t))

                if archive_target:
                    os.makedirs(os.path.dirname(archive_target), exist_ok=True)
                    with open(archive_target, "wb") as f:
                        f.write(GnuPG.decrypted(document.archive_path))
                    os.utime(archive_target, times=(t, t))
            else:
                self.check_and_copy(document.source_path,
                                    document.checksum,
                                    original_target)

                shutil.copy(document.source_path, file_target)
                shutil.copy(document.thumbnail_path, thumbnail_target)
                self.check_and_copy(document.thumbnail_path,
                                    None,
                                    thumbnail_target)

                if archive_target:
                    shutil.copy(document.archive_path, archive_target)
                    self.check_and_copy(document.archive_path,
                                        document.archive_checksum,
                                        archive_target)

        manifest += json.loads(
            serializers.serialize("json", Correspondent.objects.all()))
        # 4. write manifest to target folder
        manifest_path = os.path.abspath(
            os.path.join(self.target, "manifest.json"))

        manifest += json.loads(serializers.serialize(
            "json", Tag.objects.all()))

        manifest += json.loads(serializers.serialize(
            "json", DocumentType.objects.all()))

        with open(os.path.join(self.target, "manifest.json"), "w") as f:
        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2)

        if self.delete:
            # 5. Remove files which we did not explicitly export in this run

            if manifest_path in self.files_in_export_dir:
                self.files_in_export_dir.remove(manifest_path)

            for f in self.files_in_export_dir:
                os.remove(f)

                delete_empty_directories(os.path.abspath(os.path.dirname(f)),
                                         os.path.abspath(self.target))

    def check_and_copy(self, source, source_checksum, target):
        if os.path.abspath(target) in self.files_in_export_dir:
            self.files_in_export_dir.remove(os.path.abspath(target))

        perform_copy = False

        if os.path.exists(target):
            source_stat = os.stat(source)
            target_stat = os.stat(target)
            if self.compare_checksums and source_checksum:
                with open(target, "rb") as f:
                    target_checksum = hashlib.md5(f.read()).hexdigest()
                perform_copy = target_checksum != source_checksum
            elif source_stat.st_mtime != target_stat.st_mtime:
                perform_copy = True
            elif source_stat.st_size != target_stat.st_size:
                perform_copy = True
        else:
            # Copy if it does not exist
            perform_copy = True

        if perform_copy:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy2(source, target)
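A condensed, hypothetical version of the decision inside check_and_copy() above (not from the diff): copy when the target is missing, when an MD5 checksum disagrees, or, failing that, when mtime or size differ.

    import hashlib
    import os


    def needs_copy(source, target, source_checksum=None):
        if not os.path.exists(target):
            return True  # copy if it does not exist yet
        if source_checksum:
            # checksum comparison, as with --compare-checksums
            with open(target, "rb") as f:
                return hashlib.md5(f.read()).hexdigest() != source_checksum
        # default: size and modification time
        s, t = os.stat(source), os.stat(target)
        return s.st_mtime != t.st_mtime or s.st_size != t.st_size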
@@ -1,19 +1,33 @@
import json
import logging
import os
import shutil
from contextlib import contextmanager

import tqdm
from django.conf import settings
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from django.db.models.signals import post_save, m2m_changed
from filelock import FileLock

from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable
from ...file_handling import create_source_path_directory
from ...signals.handlers import update_filename_and_move_files


class Command(Renderable, BaseCommand):
@contextmanager
def disable_signal(sig, receiver, sender):
    try:
        sig.disconnect(receiver=receiver, sender=sender)
        yield
    finally:
        sig.connect(receiver=receiver, sender=sender)


class Command(BaseCommand):

    help = """
        Using a manifest.json file, load the data from there, and import the
@@ -30,6 +44,8 @@ class Command(Renderable, BaseCommand):

    def handle(self, *args, **options):

        logging.getLogger().handlers[0].level = logging.ERROR

        self.source = options["source"]

        if not os.path.exists(self.source):
@@ -45,11 +61,19 @@ class Command(Renderable, BaseCommand):
            self.manifest = json.load(f)

        self._check_manifest()
        with disable_signal(post_save,
                            receiver=update_filename_and_move_files,
                            sender=Document):
            with disable_signal(m2m_changed,
                                receiver=update_filename_and_move_files,
                                sender=Document.tags.through):
                # Fill up the database with whatever is in the manifest
                call_command("loaddata", manifest_path)

        # Fill up the database with whatever is in the manifest
        call_command("loaddata", manifest_path)
                self._import_files_from_manifest()

        self._import_files_from_manifest()
        print("Updating search index...")
        call_command('document_index', 'reindex')

    @staticmethod
    def _check_manifest_exists(path):
@@ -93,10 +117,13 @@ class Command(Renderable, BaseCommand):
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

        for record in self.manifest:
        print("Copy files into paperless...")

            if not record["model"] == "documents.document":
                continue
        manifest_documents = list(filter(
            lambda r: r["model"] == "documents.document",
            self.manifest))

        for record in tqdm.tqdm(manifest_documents):

            document = Document.objects.get(pk=record["pk"])

@@ -114,17 +141,19 @@ class Command(Renderable, BaseCommand):

        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        document.filename = generate_filename(document)
        with FileLock(settings.MEDIA_LOCK):
            if os.path.isfile(document.source_path):
                raise FileExistsError(document.source_path)

        if os.path.isfile(document.source_path):
            raise FileExistsError(document.source_path)
            create_source_path_directory(document.source_path)

        create_source_path_directory(document.source_path)

        print(f"Moving {document_path} to {document.source_path}")
        shutil.copy(document_path, document.source_path)
        shutil.copy(thumbnail_path, document.thumbnail_path)
        if archive_path:
            shutil.copy(archive_path, document.archive_path)
            shutil.copy2(document_path, document.source_path)
            shutil.copy2(thumbnail_path, document.thumbnail_path)
            if archive_path:
                create_source_path_directory(document.archive_path)
                # TODO: this assumes that the export is valid and
                # archive_filename is present on all documents with
                # archived files
                shutil.copy2(archive_path, document.archive_path)

        document.save()
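The disable_signal() context manager introduced above is what lets loaddata run without the file-moving handler firing on every row; usage mirrors the import code (a sketch, not additional diff content):

    from django.db.models.signals import post_save

    from documents.models import Document
    from documents.signals.handlers import update_filename_and_move_files

    # bulk-create Document rows; files are copied into place afterwards
    with disable_signal(post_save,
                        receiver=update_filename_and_move_files,
                        sender=Document):
        ...  # e.g. call_command("loaddata", ...)
    # on exit the receiver is reconnected, even if an exception was raised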
@@ -1,25 +1,19 @@
from django.core.management import BaseCommand
from django.db import transaction

from documents.mixins import Renderable
from documents.tasks import index_reindex, index_optimize


class Command(Renderable, BaseCommand):
class Command(BaseCommand):

    help = "Manages the document index."

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument("command", choices=['reindex', 'optimize'])

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        if options['command'] == 'reindex':
            index_reindex()
        elif options['command'] == 'optimize':
            index_optimize()
        with transaction.atomic():
            if options['command'] == 'reindex':
                index_reindex()
            elif options['command'] == 'optimize':
                index_optimize()
@@ -1,12 +0,0 @@
from django.core.management.base import BaseCommand

from documents.models import Log


class Command(BaseCommand):

    help = "A quick & dirty way to see what's in the logs"

    def handle(self, *args, **options):
        for log in Log.objects.order_by("pk"):
            print(log)
@@ -1,23 +1,21 @@
import logging

import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save

from documents.models import Document
from ...mixins import Renderable


class Command(Renderable, BaseCommand):
class Command(BaseCommand):

    help = """
        This will rename all documents to match the latest filename format.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]
        logging.getLogger().handlers[0].level = logging.ERROR

        for document in Document.objects.all():
            # Saving the document again will generate a new filename and rename
            document.save()
        for document in tqdm.tqdm(Document.objects.all()):
            post_save.send(Document, instance=document)
@@ -2,14 +2,15 @@ import logging

from django.core.management.base import BaseCommand

from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.classifier import load_classifier
from documents.models import Document
from ...mixins import Renderable
from ...signals.handlers import set_correspondent, set_document_type, set_tags


class Command(Renderable, BaseCommand):
logger = logging.getLogger("paperless.management.retagger")


class Command(BaseCommand):

    help = """
        Using the current classification model, assigns correspondents, tags
@@ -18,10 +19,6 @@ class Command(Renderable, BaseCommand):
        modified) after their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-c", "--correspondent",
@@ -62,24 +59,16 @@ class Command(Renderable, BaseCommand):

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        if options["inbox_only"]:
            queryset = Document.objects.filter(tags__is_inbox_tag=True)
        else:
            queryset = Document.objects.all()
        documents = queryset.distinct()

        classifier = DocumentClassifier()
        try:
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
            logging.getLogger(__name__).warning(
                f"Cannot classify documents: {e}.")
            classifier = None
        classifier = load_classifier()

        for document in documents:
            logging.getLogger(__name__).info(
            logger.info(
                f"Processing document {document.title}")

            if options['correspondent']:
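The new load_classifier() helper presumably wraps the inline logic it replaces above; a sketch under that assumption (the function name in the sketch is hypothetical, the exceptions are the ones shown in the old code):

    from documents.classifier import (DocumentClassifier,
                                      IncompatibleClassifierVersionError)


    def load_classifier_sketch():
        # return None when no usable model exists, so callers can fall
        # back to rule-based matching only
        classifier = DocumentClassifier()
        try:
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError):
            return None
        return classifier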
src/documents/management/commands/document_sanity_checker.py (new file)
@@ -0,0 +1,15 @@
from django.core.management.base import BaseCommand
from documents.sanity_checker import check_sanity


class Command(BaseCommand):

    help = """
        This command checks your document archive for issues.
    """.replace("    ", "")

    def handle(self, *args, **options):

        messages = check_sanity(progress=True)

        messages.log_messages()
src/documents/management/commands/document_thumbnails.py (new file)
@@ -0,0 +1,69 @@
import logging
import multiprocessing
import shutil

import tqdm
from django import db
from django.core.management.base import BaseCommand

from documents.models import Document
from ...parsers import get_parser_class_for_mime_type


def _process_document(doc_in):
    document = Document.objects.get(id=doc_in)
    parser_class = get_parser_class_for_mime_type(document.mime_type)

    if parser_class:
        parser = parser_class(logging_group=None)
    else:
        print(f"{document} No parser for mime type {document.mime_type}")
        return

    try:
        thumb = parser.get_optimised_thumbnail(
            document.source_path,
            document.mime_type,
            document.get_public_filename()
        )

        shutil.move(thumb, document.thumbnail_path)
    finally:
        parser.cleanup()


class Command(BaseCommand):

    help = """
        This will regenerate the thumbnails for all documents.
    """.replace("    ", "")

    def add_arguments(self, parser):
        parser.add_argument(
            "-d", "--document",
            default=None,
            type=int,
            required=False,
            help="Specify the ID of a document, and this command will only "
                 "run on this specific document."
        )

    def handle(self, *args, **options):
        logging.getLogger().handlers[0].level = logging.ERROR

        if options['document']:
            documents = Document.objects.filter(pk=options['document'])
        else:
            documents = Document.objects.all()

        ids = [doc.id for doc in documents]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        with multiprocessing.Pool() as pool:
            list(tqdm.tqdm(
                pool.imap_unordered(_process_document, ids), total=len(ids)
            ))
@@ -1,53 +1,63 @@
import logging
import re

from fuzzywuzzy import fuzz

from documents.models import MatchingModel, Correspondent, DocumentType, Tag


def match_correspondents(document_content, classifier):
logger = logging.getLogger("paperless.matching")


def log_reason(matching_model, document, reason):
    class_name = type(matching_model).__name__
    logger.debug(
        f"{class_name} {matching_model.name} matched on document "
        f"{document} because {reason}")


def match_correspondents(document, classifier):
    if classifier:
        pred_id = classifier.predict_correspondent(document_content)
        pred_id = classifier.predict_correspondent(document.content)
    else:
        pred_id = None

    correspondents = Correspondent.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk == pred_id,
        lambda o: matches(o, document) or o.pk == pred_id,
        correspondents))


def match_document_types(document_content, classifier):
def match_document_types(document, classifier):
    if classifier:
        pred_id = classifier.predict_document_type(document_content)
        pred_id = classifier.predict_document_type(document.content)
    else:
        pred_id = None

    document_types = DocumentType.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk == pred_id,
        lambda o: matches(o, document) or o.pk == pred_id,
        document_types))


def match_tags(document_content, classifier):
def match_tags(document, classifier):
    if classifier:
        predicted_tag_ids = classifier.predict_tags(document_content)
        predicted_tag_ids = classifier.predict_tags(document.content)
    else:
        predicted_tag_ids = []

    tags = Tag.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
        lambda o: matches(o, document) or o.pk in predicted_tag_ids,
        tags))


def matches(matching_model, document_content):
def matches(matching_model, document):
    search_kwargs = {}

    document_content = document_content.lower()
    document_content = document.content.lower()

    # Check that match is not empty
    if matching_model.match.strip() == "":
@@ -62,35 +72,73 @@ def matches(matching_model, document_content):
                rf"\b{word}\b", document_content, **search_kwargs)
            if not search_result:
                return False
        log_reason(
            matching_model, document,
            f"it contains all of these words: {matching_model.match}"
        )
        return True

    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                log_reason(
                    matching_model, document,
                    f"it contains this word: {word}"
                )
                return True
        return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
        return bool(re.search(
        result = bool(re.search(
            rf"\b{matching_model.match}\b",
            document_content,
            **search_kwargs
        ))
        if result:
            log_reason(
                matching_model, document,
                f"it contains this string: \"{matching_model.match}\""
            )
        return result

    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
        return bool(re.search(
            re.compile(matching_model.match, **search_kwargs),
            document_content
        ))
        try:
            match = re.search(
                re.compile(matching_model.match, **search_kwargs),
                document_content
            )
        except re.error:
            logger.error(
                f"Error while processing regular expression "
                f"{matching_model.match}"
            )
            return False
        if match:
            log_reason(
                matching_model, document,
                f"the string {match.group()} matches the regular expression "
                f"{matching_model.match}"
            )
        return bool(match)

    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
        from fuzzywuzzy import fuzz

        match = re.sub(r'[^\w\s]', '', matching_model.match)
        text = re.sub(r'[^\w\s]', '', document_content)
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()

        return fuzz.partial_ratio(match, text) >= 90
        if fuzz.partial_ratio(match, text) >= 90:
            # TODO: make this better
            log_reason(
                matching_model, document,
                f"parts of the document content somehow match the string "
                f"{matching_model.match}"
            )
            return True
        else:
            return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
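A standalone sketch of the whole-word search at the heart of the MATCH_ANY branch above (not part of the diff; function and argument names are hypothetical). It also returns which word matched, the detail log_reason() now reports:

    import re


    def match_any(words, content, insensitive=True):
        # \b anchors make "tax" match "tax return" but not "taxi"
        flags = {"flags": re.IGNORECASE} if insensitive else {}
        for word in words:
            if re.search(rf"\b{word}\b", content, **flags):
                return word
        return None


    print(match_any(["invoice", "receipt"], "Your invoice is attached."))
    # -> "invoice"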
@@ -6,13 +6,18 @@ import magic
from django.conf import settings
from django.db import migrations, models

from paperless.db import GnuPG

STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"


def source_path(self):
    if self.filename:
        fname = str(self.filename)
    else:
        fname = "{:07}.{}".format(self.pk, self.file_type)
        if self.storage_type == self.STORAGE_TYPE_GPG:
        if self.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"

    return os.path.join(
@@ -26,9 +31,18 @@ def add_mime_types(apps, schema_editor):
    documents = Document.objects.all()

    for d in documents:
        d.mime_type = magic.from_file(source_path(d), mime=True)
        f = open(source_path(d), "rb")
        if d.storage_type == STORAGE_TYPE_GPG:

            data = GnuPG.decrypted(f)
        else:
            data = f.read(1024)

        d.mime_type = magic.from_buffer(data, mime=True)
        d.save()

        f.close()


def add_file_extensions(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
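The switch from magic.from_file() to magic.from_buffer() above is what allows GPG-encrypted files to be decrypted first; for plain files, the first kilobyte is enough for libmagic. A minimal sketch with a hypothetical file path:

    import magic

    with open("example.pdf", "rb") as f:
        print(magic.from_buffer(f.read(1024), mime=True))  # e.g. "application/pdf"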
src/documents/migrations/1006_auto_20201208_2209.py (new file)
@@ -0,0 +1,25 @@
# Generated by Django 3.1.4 on 2020-12-08 22:09

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1005_checksums'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='correspondent',
            name='slug',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='slug',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='slug',
        ),
    ]
@@ -0,0 +1,37 @@
# Generated by Django 3.1.4 on 2020-12-12 14:41

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('documents', '1006_auto_20201208_2209'),
    ]

    operations = [
        migrations.CreateModel(
            name='SavedView',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=128)),
                ('show_on_dashboard', models.BooleanField()),
                ('show_in_sidebar', models.BooleanField()),
                ('sort_field', models.CharField(max_length=128)),
                ('sort_reverse', models.BooleanField(default=False)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
        migrations.CreateModel(
            name='SavedViewFilterRule',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('rule_type', models.PositiveIntegerField(choices=[(0, 'Title contains'), (1, 'Content contains'), (2, 'ASN is'), (3, 'Correspondent is'), (4, 'Document type is'), (5, 'Is in inbox'), (6, 'Has tag'), (7, 'Has any tag'), (8, 'Created before'), (9, 'Created after'), (10, 'Created year is'), (11, 'Created month is'), (12, 'Created day is'), (13, 'Added before'), (14, 'Added after'), (15, 'Modified before'), (16, 'Modified after'), (17, 'Does not have tag')])),
                ('value', models.CharField(max_length=128)),
                ('saved_view', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview')),
            ],
        ),
    ]
src/documents/migrations/1008_auto_20201216_1736.py (new file)
@@ -0,0 +1,34 @@
# Generated by Django 3.1.4 on 2020-12-16 17:36

from django.db import migrations
import django.db.models.functions.text


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1007_savedview_savedviewfilterrule'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='correspondent',
            options={'ordering': (django.db.models.functions.text.Lower('name'),)},
        ),
        migrations.AlterModelOptions(
            name='document',
            options={'ordering': ('-created',)},
        ),
        migrations.AlterModelOptions(
            name='documenttype',
            options={'ordering': (django.db.models.functions.text.Lower('name'),)},
        ),
        migrations.AlterModelOptions(
            name='savedview',
            options={'ordering': (django.db.models.functions.text.Lower('name'),)},
        ),
        migrations.AlterModelOptions(
            name='tag',
            options={'ordering': (django.db.models.functions.text.Lower('name'),)},
        ),
    ]
src/documents/migrations/1009_auto_20201216_2005.py (new file)
@@ -0,0 +1,29 @@
# Generated by Django 3.1.4 on 2020-12-16 20:05

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1008_auto_20201216_1736'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='correspondent',
            options={'ordering': ('name',)},
        ),
        migrations.AlterModelOptions(
            name='documenttype',
            options={'ordering': ('name',)},
        ),
        migrations.AlterModelOptions(
            name='savedview',
            options={'ordering': ('name',)},
        ),
        migrations.AlterModelOptions(
            name='tag',
            options={'ordering': ('name',)},
        ),
    ]
|
18
src/documents/migrations/1010_auto_20210101_2159.py
Normal file
18
src/documents/migrations/1010_auto_20210101_2159.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 3.1.4 on 2021-01-01 21:59
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1009_auto_20201216_2005'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='savedviewfilterrule',
|
||||
name='value',
|
||||
field=models.CharField(blank=True, max_length=128, null=True),
|
||||
),
|
||||
]
|
src/documents/migrations/1011_auto_20210101_2340.py (new file)
@@ -0,0 +1,250 @@
# Generated by Django 3.1.4 on 2021-01-01 23:40

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone


class Migration(migrations.Migration):

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('documents', '1010_auto_20210101_2159'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='correspondent',
            options={'ordering': ('name',), 'verbose_name': 'correspondent', 'verbose_name_plural': 'correspondents'},
        ),
        migrations.AlterModelOptions(
            name='document',
            options={'ordering': ('-created',), 'verbose_name': 'document', 'verbose_name_plural': 'documents'},
        ),
        migrations.AlterModelOptions(
            name='documenttype',
            options={'verbose_name': 'document type', 'verbose_name_plural': 'document types'},
        ),
        migrations.AlterModelOptions(
            name='log',
            options={'ordering': ('-created',), 'verbose_name': 'log', 'verbose_name_plural': 'logs'},
        ),
        migrations.AlterModelOptions(
            name='savedview',
            options={'ordering': ('name',), 'verbose_name': 'saved view', 'verbose_name_plural': 'saved views'},
        ),
        migrations.AlterModelOptions(
            name='savedviewfilterrule',
            options={'verbose_name': 'filter rule', 'verbose_name_plural': 'filter rules'},
        ),
        migrations.AlterModelOptions(
            name='tag',
            options={'verbose_name': 'tag', 'verbose_name_plural': 'tags'},
        ),
        migrations.AlterField(
            model_name='correspondent',
            name='is_insensitive',
            field=models.BooleanField(default=True, verbose_name='is insensitive'),
        ),
        migrations.AlterField(
            model_name='correspondent',
            name='match',
            field=models.CharField(blank=True, max_length=256, verbose_name='match'),
        ),
        migrations.AlterField(
            model_name='correspondent',
            name='matching_algorithm',
            field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
        ),
        migrations.AlterField(
            model_name='correspondent',
            name='name',
            field=models.CharField(max_length=128, unique=True, verbose_name='name'),
        ),
        migrations.AlterField(
            model_name='document',
            name='added',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False, verbose_name='added'),
        ),
        migrations.AlterField(
            model_name='document',
            name='archive_checksum',
            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True, verbose_name='archive checksum'),
        ),
        migrations.AlterField(
            model_name='document',
            name='archive_serial_number',
            field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True, verbose_name='archive serial number'),
        ),
        migrations.AlterField(
            model_name='document',
            name='checksum',
            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True, verbose_name='checksum'),
        ),
        migrations.AlterField(
            model_name='document',
            name='content',
            field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.', verbose_name='content'),
        ),
        migrations.AlterField(
            model_name='document',
            name='correspondent',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.correspondent', verbose_name='correspondent'),
        ),
        migrations.AlterField(
            model_name='document',
            name='created',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, verbose_name='created'),
        ),
        migrations.AlterField(
            model_name='document',
            name='document_type',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype', verbose_name='document type'),
        ),
        migrations.AlterField(
            model_name='document',
            name='filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, verbose_name='filename'),
        ),
        migrations.AlterField(
            model_name='document',
            name='mime_type',
            field=models.CharField(editable=False, max_length=256, verbose_name='mime type'),
        ),
        migrations.AlterField(
            model_name='document',
            name='modified',
            field=models.DateTimeField(auto_now=True, db_index=True, verbose_name='modified'),
        ),
        migrations.AlterField(
            model_name='document',
            name='storage_type',
            field=models.CharField(choices=[('unencrypted', 'Unencrypted'), ('gpg', 'Encrypted with GNU Privacy Guard')], default='unencrypted', editable=False, max_length=11, verbose_name='storage type'),
        ),
        migrations.AlterField(
            model_name='document',
            name='tags',
            field=models.ManyToManyField(blank=True, related_name='documents', to='documents.Tag', verbose_name='tags'),
        ),
        migrations.AlterField(
            model_name='document',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=128, verbose_name='title'),
        ),
        migrations.AlterField(
            model_name='documenttype',
            name='is_insensitive',
            field=models.BooleanField(default=True, verbose_name='is insensitive'),
        ),
        migrations.AlterField(
            model_name='documenttype',
            name='match',
            field=models.CharField(blank=True, max_length=256, verbose_name='match'),
        ),
        migrations.AlterField(
            model_name='documenttype',
            name='matching_algorithm',
            field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
        ),
        migrations.AlterField(
            model_name='documenttype',
            name='name',
            field=models.CharField(max_length=128, unique=True, verbose_name='name'),
        ),
        migrations.AlterField(
            model_name='log',
            name='created',
            field=models.DateTimeField(auto_now_add=True, verbose_name='created'),
        ),
        migrations.AlterField(
            model_name='log',
            name='group',
            field=models.UUIDField(blank=True, null=True, verbose_name='group'),
        ),
        migrations.AlterField(
            model_name='log',
            name='level',
            field=models.PositiveIntegerField(choices=[(10, 'debug'), (20, 'information'), (30, 'warning'), (40, 'error'), (50, 'critical')], default=20, verbose_name='level'),
        ),
        migrations.AlterField(
            model_name='log',
            name='message',
            field=models.TextField(verbose_name='message'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='name',
            field=models.CharField(max_length=128, verbose_name='name'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='show_in_sidebar',
            field=models.BooleanField(verbose_name='show in sidebar'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='show_on_dashboard',
            field=models.BooleanField(verbose_name='show on dashboard'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='sort_field',
            field=models.CharField(max_length=128, verbose_name='sort field'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='sort_reverse',
            field=models.BooleanField(default=False, verbose_name='sort reverse'),
        ),
        migrations.AlterField(
            model_name='savedview',
            name='user',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL, verbose_name='user'),
        ),
        migrations.AlterField(
            model_name='savedviewfilterrule',
            name='rule_type',
            field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag')], verbose_name='rule type'),
        ),
        migrations.AlterField(
            model_name='savedviewfilterrule',
            name='saved_view',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview', verbose_name='saved view'),
        ),
        migrations.AlterField(
            model_name='savedviewfilterrule',
            name='value',
            field=models.CharField(blank=True, max_length=128, null=True, verbose_name='value'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='colour',
            field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1, verbose_name='color'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='is_inbox_tag',
            field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.', verbose_name='is inbox tag'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='is_insensitive',
            field=models.BooleanField(default=True, verbose_name='is insensitive'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='match',
            field=models.CharField(blank=True, max_length=256, verbose_name='match'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='matching_algorithm',
            field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='name',
            field=models.CharField(max_length=128, unique=True, verbose_name='name'),
        ),
    ]
src/documents/migrations/1012_fix_archive_files.py (new file)
@@ -0,0 +1,330 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep

import pathvalidate
from django.conf import settings
from django.db import migrations, models
from django.template.defaultfilters import slugify

from documents.file_handling import defaultdictNoStr, many_to_dictionary


logger = logging.getLogger("paperless.migrations")

###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################

def archive_name_from_filename(filename):
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
    if doc.filename:
        fname = archive_name_from_filename(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )


STORAGE_TYPE_GPG = "gpg"


def archive_path_new(doc):
    if doc.archive_filename is not None:
        return os.path.join(
            settings.ARCHIVE_DIR,
            str(doc.archive_filename)
        )
    else:
        return None


def source_path(doc):
    if doc.filename:
        fname = str(doc.filename)
    else:
        fname = "{:07}{}".format(doc.pk, doc.file_type)
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )


def generate_unique_filename(doc, archive_filename=False):
    if archive_filename:
        old_filename = doc.archive_filename
        root = settings.ARCHIVE_DIR
    else:
        old_filename = doc.filename
        root = settings.ORIGINALS_DIR

    counter = 0

    while True:
        new_filename = generate_filename(
            doc, counter, archive_filename=archive_filename)
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if os.path.exists(os.path.join(root, new_filename)):
            counter += 1
        else:
            return new_filename


def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdictNoStr(lambda: slugify(None),
                                    many_to_dictionary(doc.tags))

            tag_list = pathvalidate.sanitize_filename(
                ",".join(sorted(
                    [tag.name for tag in doc.tags.all()]
                )),
                replacement_text="-"
            )

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name, replacement_text="-"
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name, replacement_text="-"
                )
            else:
                document_type = "none"

            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(
                    doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=datetime.date.isoformat(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
                created_day=f"{doc.created.day:02}" if doc.created else "none",
                added=datetime.date.isoformat(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=f"{doc.added.month:02}" if doc.added else "none",
                added_day=f"{doc.added.day:02}" if doc.added else "none",
                tags=tags,
                tag_list=tag_list
            ).strip()

            path = path.strip(os.sep)

    except (ValueError, KeyError, IndexError):
        logger.warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    counter_str = f"_{counter:02}" if counter else ""

    filetype_str = ".pdf" if archive_filename else doc.file_type

    if len(path) > 0:
        filename = f"{path}{counter_str}{filetype_str}"
    else:
        filename = f"{doc.pk:07}{counter_str}{filetype_str}"

    # Append .gpg for encrypted files
    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename
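The counter scheme in generate_unique_filename() above, reduced to an illustrative helper (hypothetical, not part of the migration): the first collision-free candidate wins, so three documents rendering to the same base get name.pdf, name_01.pdf, name_02.pdf.

    import os


    def unique_name(base, ext, root):
        counter = 0
        while True:
            suffix = f"_{counter:02}" if counter else ""
            candidate = f"{base}{suffix}{ext}"
            if not os.path.exists(os.path.join(root, candidate)):
                return candidate
            counter += 1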
||||
|
||||
###############################################################################
|
||||
# This code performs bidirection archive file transformation.
|
||||
###############################################################################
|
||||
|
||||
|
||||
def parse_wrapper(parser, path, mime_type, file_name):
|
||||
# this is here so that I can mock this out for testing.
|
||||
parser.parse(path, mime_type, file_name)


def create_archive_version(doc, retry_count=3):
    from documents.parsers import get_parser_class_for_mime_type, \
        DocumentParser, \
        ParseError

    logger.info(
        f"Regenerating archive document for document ID:{doc.id}"
    )
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    for try_num in range(retry_count):
        parser: DocumentParser = parser_class(None, None)
        try:
            parse_wrapper(parser, source_path(doc), doc.mime_type,
                          os.path.basename(doc.filename))
            doc.content = parser.get_text()

            if parser.get_archive_path() and os.path.isfile(
                    parser.get_archive_path()):
                doc.archive_filename = generate_unique_filename(
                    doc, archive_filename=True)
                with open(parser.get_archive_path(), "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                os.makedirs(os.path.dirname(archive_path_new(doc)),
                            exist_ok=True)
                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
            else:
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID:{doc.id}. Removing archive document."
                )
            doc.save()
            return
        except ParseError:
            if try_num + 1 == retry_count:
                logger.exception(
                    f"Unable to regenerate archive document for ID:{doc.id}. You "
                    f"need to invoke the document_archiver management command "
                    f"manually for that document."
                )
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            parser.cleanup()


def move_old_to_new_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that archive files of all unaffected documents are in place
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        if doc.id not in affected_document_ids and not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document ID:{doc.id} does not exist at: "
                f"{old_path}")

    # check that we can regenerate affected archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise ValueError(
                f"Document ID:{doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    for doc in Document.objects.filter(archive_checksum__isnull=False):

        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if os.path.isfile(old_path):
                logger.debug(
                    f"Removing {old_path}"
                )
                os.unlink(old_path)
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(doc.filename)
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename
            )

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)


def move_new_to_old_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)
        if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1011_auto_20210101_2340'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='archive_filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
        ),
        migrations.AlterField(
            model_name='document',
            name='filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
        ),
        migrations.RunPython(
            move_old_to_new_locations,
            move_new_to_old_locations
        ),
    ]
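
Since migrations.RunPython receives both the forward and the reverse callable, this migration can be unapplied as well as applied. A hypothetical invocation from a Django shell (the reverse target is taken from the dependency above; the forward name depends on this file's own numbering):

    from django.core.management import call_command

    call_command("migrate", "documents")  # forward: runs move_old_to_new_locations
    call_command("migrate", "documents", "1011_auto_20210101_2340")  # reverse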
@@ -1,9 +0,0 @@
class Renderable:
    """
    A handy mixin to make it easier/cleaner to print output based on a
    verbosity value.
    """

    def _render(self, text, verbosity):
        if self.verbosity >= verbosity:
            print(text)
@@ -1,18 +1,22 @@
# coding=utf-8

import datetime
import logging
import os
import re
from collections import OrderedDict

import pathvalidate

import dateutil.parser
from colorhash import ColorHash
from django.conf import settings
from django.contrib.auth.models import User
from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from django.utils.timezone import is_aware

from django.utils.translation import gettext_lazy as _

from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension

@@ -26,37 +30,31 @@ class MatchingModel(models.Model):
    MATCH_AUTO = 6

    MATCHING_ALGORITHMS = (
        (MATCH_ANY, "Any"),
        (MATCH_ALL, "All"),
        (MATCH_LITERAL, "Literal"),
        (MATCH_REGEX, "Regular Expression"),
        (MATCH_FUZZY, "Fuzzy Match"),
        (MATCH_AUTO, "Automatic Classification"),
        (MATCH_ANY, _("Any word")),
        (MATCH_ALL, _("All words")),
        (MATCH_LITERAL, _("Exact match")),
        (MATCH_REGEX, _("Regular expression")),
        (MATCH_FUZZY, _("Fuzzy word")),
        (MATCH_AUTO, _("Automatic")),
    )

    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True, editable=False)
    name = models.CharField(
        _("name"),
        max_length=128, unique=True)

    match = models.CharField(
        _("match"),
        max_length=256, blank=True)

    match = models.CharField(max_length=256, blank=True)
    matching_algorithm = models.PositiveIntegerField(
        _("matching algorithm"),
        choices=MATCHING_ALGORITHMS,
        default=MATCH_ANY,
        help_text=(
            "Which algorithm you want to use when matching text to the OCR'd "
            "PDF. Here, \"any\" looks for any occurrence of any word "
            "provided in the PDF, while \"all\" requires that every word "
            "provided appear in the PDF, albeit not in the order provided. A "
            "\"literal\" match means that the text you enter must appear in "
            "the PDF exactly as you've entered it, and \"regular expression\" "
            "uses a regex to match the PDF. (If you don't know what a regex "
            "is, you probably don't want this option.) Finally, a \"fuzzy "
            "match\" looks for words or phrases that are mostly—but not "
            "exactly—the same, which can be useful for matching against "
            "documents containing imperfections that foil accurate OCR."
        )
        default=MATCH_ANY
    )

    is_insensitive = models.BooleanField(default=True)
    is_insensitive = models.BooleanField(
        _("is insensitive"),
        default=True)

    class Meta:
        abstract = True
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
|
||||
self.match = self.match.lower()
|
||||
self.slug = slugify(self.name)
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
|
||||
class Correspondent(MatchingModel):
|
||||
|
||||
@@ -81,18 +72,27 @@ class Correspondent(MatchingModel):

    class Meta:
        ordering = ("name",)
        verbose_name = _("correspondent")
        verbose_name_plural = _("correspondents")


class Tag(MatchingModel):

    colour = models.CharField(blank=True, max_length=7)
    colour = models.CharField(
        _("color"),
        blank=True, max_length=7)

    is_inbox_tag = models.BooleanField(
        _("is inbox tag"),
        default=False,
        help_text="Marks this tag as an inbox tag: All newly consumed "
                  "documents will be tagged with inbox tags."
        help_text=_("Marks this tag as an inbox tag: All newly consumed "
                    "documents will be tagged with inbox tags.")
    )

    class Meta:
        verbose_name = _("tag")
        verbose_name_plural = _("tags")

    def save(self, *args, **kwargs):
        if self.colour == "":
            self.colour = ColorHash(
@@ -105,7 +105,9 @@ class Tag(MatchingModel):

class DocumentType(MatchingModel):

    pass
    class Meta:
        verbose_name = _("document type")
        verbose_name_plural = _("document types")


class Document(models.Model):
@@ -113,8 +115,8 @@ class Document(models.Model):
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
        (STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
        (STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard")
        (STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")),
        (STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard"))
    )

    correspondent = models.ForeignKey(
@@ -122,54 +124,68 @@ class Document(models.Model):
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
        on_delete=models.SET_NULL,
        verbose_name=_("correspondent")
    )

    title = models.CharField(max_length=128, blank=True, db_index=True)
    title = models.CharField(
        _("title"),
        max_length=128, blank=True, db_index=True)

    document_type = models.ForeignKey(
        DocumentType,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
        on_delete=models.SET_NULL,
        verbose_name=_("document type")
    )

    content = models.TextField(
        _("content"),
        blank=True,
        help_text="The raw, text-only data of the document. This field is "
                  "primarily used for searching."
        help_text=_("The raw, text-only data of the document. This field is "
                    "primarily used for searching.")
    )

    mime_type = models.CharField(
        _("mime type"),
        max_length=256,
        editable=False
    )

    tags = models.ManyToManyField(
        Tag, related_name="documents", blank=True)
        Tag, related_name="documents", blank=True,
        verbose_name=_("tags")
    )

    checksum = models.CharField(
        _("checksum"),
        max_length=32,
        editable=False,
        unique=True,
        help_text="The checksum of the original document."
        help_text=_("The checksum of the original document.")
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
        max_length=32,
        editable=False,
        blank=True,
        null=True,
        help_text="The checksum of the archived document."
        help_text=_("The checksum of the archived document.")
    )

    created = models.DateTimeField(
        _("created"),
        default=timezone.now, db_index=True)

    modified = models.DateTimeField(
        _("modified"),
        auto_now=True, editable=False, db_index=True)

    storage_type = models.CharField(
        _("storage type"),
        max_length=11,
        choices=STORAGE_TYPES,
        default=STORAGE_TYPE_UNENCRYPTED,
@@ -177,36 +193,53 @@ class Document(models.Model):
    )

    added = models.DateTimeField(
        _("added"),
        default=timezone.now, editable=False, db_index=True)

    filename = models.FilePathField(
        _("filename"),
        max_length=1024,
        editable=False,
        default=None,
        unique=True,
        null=True,
        help_text="Current filename in storage"
        help_text=_("Current filename in storage")
    )

    archive_filename = models.FilePathField(
        _("archive filename"),
        max_length=1024,
        editable=False,
        default=None,
        unique=True,
        null=True,
        help_text=_("Current archive filename in storage")
    )

    archive_serial_number = models.IntegerField(
        _("archive serial number"),
        blank=True,
        null=True,
        unique=True,
        db_index=True,
        help_text="The position of this document in your physical document "
                  "archive."
        help_text=_("The position of this document in your physical document "
                    "archive.")
    )

    class Meta:
        ordering = ("correspondent", "title")
        ordering = ("-created",)
        verbose_name = _("document")
        verbose_name_plural = _("documents")

    def __str__(self):
        created = self.created.strftime("%Y%m%d")
        if is_aware(self.created):
            created = timezone.localdate(self.created).isoformat()
        else:
            created = datetime.date.isoformat(self.created)
        if self.correspondent and self.title:
            return "{}: {} - {}".format(
                created, self.correspondent, self.title)
        if self.correspondent or self.title:
            return "{}: {}".format(created, self.correspondent or self.title)
        return str(created)
            return f"{created} {self.correspondent} {self.title}"
        else:
            return f"{created} {self.title}"

    @property
    def source_path(self):
@@ -215,7 +248,7 @@ class Document(models.Model):
        else:
            fname = "{:07}{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"
                fname += ".gpg"  # pragma: no cover

        return os.path.join(
            settings.ORIGINALS_DIR,
@@ -227,28 +260,38 @@ class Document(models.Model):
        return open(self.source_path, "rb")

    @property
    def archive_path(self):
        if self.filename:
            fname = archive_name_from_filename(self.filename)
        else:
            fname = "{:07}.pdf".format(self.pk)
    def has_archive_version(self):
        return self.archive_filename is not None

        return os.path.join(
            settings.ARCHIVE_DIR,
            fname
        )
    @property
    def archive_path(self):
        if self.has_archive_version:
            return os.path.join(
                settings.ARCHIVE_DIR,
                str(self.archive_filename)
            )
        else:
            return None

    @property
    def archive_file(self):
        return open(self.archive_path, "rb")

    @property
    def file_name(self):
        return slugify(str(self)) + self.file_type
    def get_public_filename(self, archive=False, counter=0, suffix=None):
        result = str(self)

    @property
    def archive_file_name(self):
        return slugify(str(self)) + ".pdf"
        if counter:
            result += f"_{counter:02}"

        if suffix:
            result += suffix

        if archive:
            result += ".pdf"
        else:
            result += self.file_type

        return pathvalidate.sanitize_filename(result, replacement_text="-")

    @property
    def file_type(self):
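
A brief usage sketch for the new get_public_filename helper (values are illustrative; the stem comes from __str__ above):

    doc.get_public_filename()               # e.g. "2021-01-05 ACME Invoice.pdf"
    doc.get_public_filename(archive=True)   # same stem, extension forced to ".pdf"
    doc.get_public_filename(counter=2)      # e.g. "2021-01-05 ACME Invoice_02.pdf"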
@@ -273,76 +316,116 @@ class Document(models.Model):

class Log(models.Model):

    LEVELS = (
        (logging.DEBUG, "Debugging"),
        (logging.INFO, "Informational"),
        (logging.WARNING, "Warning"),
        (logging.ERROR, "Error"),
        (logging.CRITICAL, "Critical"),
        (logging.DEBUG, _("debug")),
        (logging.INFO, _("information")),
        (logging.WARNING, _("warning")),
        (logging.ERROR, _("error")),
        (logging.CRITICAL, _("critical")),
    )

    group = models.UUIDField(blank=True, null=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
    created = models.DateTimeField(auto_now_add=True)
    group = models.UUIDField(
        _("group"),
        blank=True, null=True)

    message = models.TextField(_("message"))

    level = models.PositiveIntegerField(
        _("level"),
        choices=LEVELS, default=logging.INFO)

    created = models.DateTimeField(_("created"), auto_now_add=True)

    class Meta:
        ordering = ("-created",)
        verbose_name = _("log")
        verbose_name_plural = _("logs")

    def __str__(self):
        return self.message

class SavedView(models.Model):

    class Meta:

        ordering = ("name",)
        verbose_name = _("saved view")
        verbose_name_plural = _("saved views")

    user = models.ForeignKey(User, on_delete=models.CASCADE,
                             verbose_name=_("user"))
    name = models.CharField(
        _("name"),
        max_length=128)

    show_on_dashboard = models.BooleanField(
        _("show on dashboard"),
    )
    show_in_sidebar = models.BooleanField(
        _("show in sidebar"),
    )

    sort_field = models.CharField(
        _("sort field"),
        max_length=128)
    sort_reverse = models.BooleanField(
        _("sort reverse"),
        default=False)


class SavedViewFilterRule(models.Model):
    RULE_TYPES = [
        (0, _("title contains")),
        (1, _("content contains")),
        (2, _("ASN is")),
        (3, _("correspondent is")),
        (4, _("document type is")),
        (5, _("is in inbox")),
        (6, _("has tag")),
        (7, _("has any tag")),
        (8, _("created before")),
        (9, _("created after")),
        (10, _("created year is")),
        (11, _("created month is")),
        (12, _("created day is")),
        (13, _("added before")),
        (14, _("added after")),
        (15, _("modified before")),
        (16, _("modified after")),
        (17, _("does not have tag")),
    ]

    saved_view = models.ForeignKey(
        SavedView,
        on_delete=models.CASCADE,
        related_name="filter_rules",
        verbose_name=_("saved view")
    )

    rule_type = models.PositiveIntegerField(
        _("rule type"),
        choices=RULE_TYPES)

    value = models.CharField(
        _("value"),
        max_length=128,
        blank=True,
        null=True)

    class Meta:
        verbose_name = _("filter rule")
        verbose_name_plural = _("filter rules")


# TODO: why is this in the models file?
class FileInfo:

    # This epic regex *almost* worked for our needs, so I'm keeping it here
    # for posterity, in the hopes that we might find a way to make it work
    # one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
@@ -365,28 +448,10 @@ class FileInfo:
        except ValueError:
            return None

    @classmethod
    def _get_correspondent(cls, name):
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        return title

    @classmethod
    def _get_tags(cls, tags):
        r = []
        for t in tags.split(","):
            r.append(Tag.objects.get_or_create(
                slug=slugify(t),
                defaults={"name": t}
            )[0])
        return tuple(r)

    @classmethod
    def _mangle_property(cls, properties, name):
        if name in properties:
@@ -396,15 +461,6 @@ class FileInfo:

    @classmethod
    def from_filename(cls, filename):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<date> - <correspondent> - <title> - <tags>"
          "<correspondent> - <title> - <tags>"
          "<correspondent> - <title>"
          "<title>"
        """

        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -435,7 +491,5 @@ class FileInfo:
        if m:
            properties = m.groupdict()
            cls._mangle_property(properties, "created")
            cls._mangle_property(properties, "correspondent")
            cls._mangle_property(properties, "title")
            cls._mangle_property(properties, "tags")
            return cls(**properties)
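
A hypothetical parse against the convention described in the docstring (objects are created on demand by the _get_* helpers shown above):

    info = FileInfo.from_filename(
        "20210101000000Z - ACME - Invoice - tax,2021.pdf")
    # info.created -> parsed timestamp, info.correspondent -> Correspondent "ACME",
    # info.title -> "Invoice", info.tags -> tuple of Tag objects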
@@ -6,7 +6,6 @@ import shutil
import subprocess
import tempfile

import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@@ -36,7 +35,7 @@ DATE_REGEX = re.compile(
)


logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.parsing")


def is_mime_type_supported(mime_type):
@@ -117,6 +116,7 @@ def run_convert(input_file,
                trim=False,
                type=None,
                depth=None,
                auto_orient=False,
                extra=None,
                logging_group=None):

@@ -134,6 +134,7 @@ def run_convert(input_file,
    args += ['-trim'] if trim else []
    args += ['-type', str(type)] if type else []
    args += ['-depth', str(depth)] if depth else []
    args += ['-auto-orient'] if auto_orient else []
    args += [input_file, output_file]

    logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
@@ -142,6 +143,53 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))


def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.
    """
    out_path = os.path.join(temp_dir, "convert.png")

    # Run convert to get a decent thumbnail
    try:
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file="{}[0]".format(in_path),
                    output_file=out_path,
                    logging_group=logging_group)
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        logger.warning(
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            extra={'group': logging_group}
        )
        gs_out_path = os.path.join(temp_dir, "gs_out.png")
        cmd = [settings.GS_BINARY,
               "-q",
               "-sDEVICE=pngalpha",
               "-o", gs_out_path,
               in_path]
        if not subprocess.Popen(cmd).wait() == 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
        # then run convert on the output from gs
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file=gs_out_path,
                    output_file=out_path,
                    logging_group=logging_group)

    return out_path


def parse_date(filename, text):
    """
    Returns the date of the document.
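
A minimal usage sketch for the new thumbnail helper (the input path is illustrative):

    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        thumb = make_thumbnail_from_pdf("/tmp/scan.pdf", tmp)
        # thumb points at <tmp>/convert.png; Ghostscript is only used if the
        # ImageMagick convert call raises ParseError.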
@@ -151,6 +199,8 @@ def parse_date(filename, text):
    """
    Call dateparser.parse with a particular date ordering
    """
    import dateparser

    return dateparser.parse(
        ds,
        settings={
@@ -161,9 +211,14 @@ def parse_date(filename, text):
        }
    )

    date = None
    def __filter(date):
        if date and date.year > 1900 and \
                date <= timezone.now() and \
                date.date() not in settings.IGNORE_DATES:
            return date
        return None

    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
    date = None

    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
@@ -176,7 +231,8 @@ def parse_date(filename, text):
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None and next_year > date.year > 1900:
        date = __filter(date)
        if date is not None:
            return date

    # Iterate through all regex matches in text and try to parse the date
@@ -189,10 +245,9 @@ def parse_date(filename, text):
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None and next_year > date.year > 1900:
        date = __filter(date)
        if date is not None:
            break
        else:
            date = None

    return date
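
A usage sketch for the reworked date extraction (inputs are illustrative; results depend on the FILENAME_DATE_ORDER, DATE_ORDER, and IGNORE_DATES settings):

    date = parse_date("scan_2020-03-01.pdf", "Invoice dated March 1, 2020")
    # The filename is consulted first when FILENAME_DATE_ORDER is set, then the
    # text; __filter rejects dates before 1900, in the future, or in IGNORE_DATES.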
@@ -207,30 +262,44 @@ class DocumentParser(LoggingMixin):
    `paperless_tesseract.parsers` for inspiration.
    """

    def __init__(self, logging_group):
    logging_name = "paperless.parsing"

    def __init__(self, logging_group, progress_callback=None):
        super().__init__()
        self.logging_group = logging_group
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)

        self.archive_path = None
        self.text = None
        self.date = None
        self.progress_callback = progress_callback

    def parse(self, document_path, mime_type):
    def progress(self, current_progress, max_progress):
        if self.progress_callback:
            self.progress_callback(current_progress, max_progress)

    def extract_metadata(self, document_path, mime_type):
        return []

    def parse(self, document_path, mime_type, file_name=None):
        raise NotImplementedError()

    def get_archive_path(self):
        return self.archive_path

    def get_thumbnail(self, document_path, mime_type):
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def get_optimised_thumbnail(self, document_path, mime_type):
        thumbnail = self.get_thumbnail(document_path, mime_type)
    def get_optimised_thumbnail(self,
                                document_path,
                                mime_type,
                                file_name=None):
        thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "thumb_optipng.png")

@@ -253,5 +322,5 @@ class DocumentParser(LoggingMixin):
        return self.date

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        self.log("debug", f"Deleting directory {self.tempdir}")
        shutil.rmtree(self.tempdir)
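
For orientation, a minimal sketch of a parser written against the extended interface (the class is illustrative, not part of this change):

    class MyParser(DocumentParser):

        def parse(self, document_path, mime_type, file_name=None):
            self.progress(0, 100)       # forwarded to progress_callback, if set
            self.text = "extracted text goes here"
            self.archive_path = None    # or a path to a produced archive PDF
            self.progress(100, 100)

        def get_thumbnail(self, document_path, mime_type, file_name=None):
            return make_thumbnail_from_pdf(document_path, self.tempdir)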
@@ -1,117 +1,145 @@
import hashlib
import logging
import os

from django.conf import settings
from tqdm import tqdm

from documents.models import Document


class SanityMessage:
    message = None
class SanityCheckMessages:

    def __init__(self):
        self._messages = []

    def error(self, message):
        self._messages.append({"level": logging.ERROR, "message": message})

    def warning(self, message):
        self._messages.append({"level": logging.WARNING, "message": message})

    def info(self, message):
        self._messages.append({"level": logging.INFO, "message": message})

    def log_messages(self):
        logger = logging.getLogger("paperless.sanity_checker")

        if len(self._messages) == 0:
            logger.info("Sanity checker detected no issues.")
        else:
            for msg in self._messages:
                logger.log(msg['level'], msg['message'])

    def __len__(self):
        return len(self._messages)

    def __getitem__(self, item):
        return self._messages[item]

    def has_error(self):
        return any([msg['level'] == logging.ERROR for msg in self._messages])

    def has_warning(self):
        return any([msg['level'] == logging.WARNING for msg in self._messages])

class SanityWarning(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"Warning: {self.message}"
class SanityCheckFailedException(Exception):
    pass


class SanityError(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"ERROR: {self.message}"


class SanityFailedError(Exception):

    def __init__(self, messages):
        self.messages = messages

    def __str__(self):
        message_string = "\n".join([str(m) for m in self.messages])
        return (
            f"The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")


def check_sanity():
    messages = []
def check_sanity(progress=False):
    messages = SanityCheckMessages()

    present_files = []
    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
        for f in files:
            present_files.append(os.path.normpath(os.path.join(root, f)))

    for doc in Document.objects.all():
    lockfile = os.path.normpath(settings.MEDIA_LOCK)
    if lockfile in present_files:
        present_files.remove(lockfile)

    if progress:
        docs = tqdm(Document.objects.all())
    else:
        docs = Document.objects.all()

    for doc in docs:
        # Check sanity of the thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
            messages.error(f"Thumbnail of document {doc.pk} does not exist.")
        else:
            present_files.remove(os.path.normpath(doc.thumbnail_path))
            if os.path.normpath(doc.thumbnail_path) in present_files:
                present_files.remove(os.path.normpath(doc.thumbnail_path))
            try:
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                messages.error(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))
                )

        # Check sanity of the original file
        # TODO: extract method
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
            messages.error(f"Original of document {doc.pk} does not exist.")
        else:
            present_files.remove(os.path.normpath(doc.source_path))
            if os.path.normpath(doc.source_path) in present_files:
                present_files.remove(os.path.normpath(doc.source_path))
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
                messages.error(
                    f"Cannot read original file of document {doc.pk}: {e}")
            else:
                if not checksum == doc.checksum:
                    messages.append(SanityError(
                    messages.error(
                        f"Checksum mismatch of document {doc.pk}. "
                        f"Stored: {doc.checksum}, actual: {checksum}."
                    ))
                    )

        # Check sanity of the archive file.
        if doc.archive_checksum:
        if doc.archive_checksum and not doc.archive_filename:
            messages.error(
                f"Document {doc.pk} has an archive file checksum, but no "
                f"archive filename."
            )
        elif not doc.archive_checksum and doc.archive_filename:
            messages.error(
                f"Document {doc.pk} has an archive file, but its checksum is "
                f"missing."
            )
        elif doc.has_archive_version:
            if not os.path.isfile(doc.archive_path):
                messages.append(SanityError(
                messages.error(
                    f"Archived version of document {doc.pk} does not exist."
                ))
                )
            else:
                present_files.remove(os.path.normpath(doc.archive_path))
                if os.path.normpath(doc.archive_path) in present_files:
                    present_files.remove(os.path.normpath(doc.archive_path))
                try:
                    with doc.archive_file as f:
                        checksum = hashlib.md5(f.read()).hexdigest()
                except OSError as e:
                    messages.append(SanityError(
                    messages.error(
                        f"Cannot read archive file of document {doc.pk}: {e}"
                    ))
                    )
                else:
                    if not checksum == doc.archive_checksum:
                        messages.append(SanityError(
                            f"Checksum mismatch of archive {doc.pk}. "
                            f"Stored: {doc.checksum}, actual: {checksum}."
                        ))
                        messages.error(
                            f"Checksum mismatch of archived document "
                            f"{doc.pk}. "
                            f"Stored: {doc.archive_checksum}, "
                            f"actual: {checksum}."
                        )

        # other document checks
        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))
            messages.info(f"Document {doc.pk} has no content.")

    for extra_file in present_files:
        messages.append(SanityWarning(
            f"Orphaned file in media dir: {extra_file}"
        ))
        messages.warning(f"Orphaned file in media dir: {extra_file}")

    return messages
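
A sketch of how a caller is expected to consume the new message container (the raise is an assumption about caller policy, not part of this hunk):

    messages = check_sanity(progress=True)   # tqdm progress bar over documents
    messages.log_messages()
    if messages.has_error():
        raise SanityCheckFailedException("Sanity check failed with errors.")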
@@ -1,12 +1,62 @@
import re

import magic
from django.utils.text import slugify
from rest_framework import serializers
from rest_framework.fields import SerializerMethodField

from .models import Correspondent, Tag, Document, Log, DocumentType
from . import bulk_edit
from .models import Correspondent, Tag, Document, DocumentType, \
    SavedView, SavedViewFilterRule, MatchingModel
from .parsers import is_mime_type_supported

from django.utils.translation import gettext as _


class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
# https://www.django-rest-framework.org/api-guide/serializers/#example
class DynamicFieldsModelSerializer(serializers.ModelSerializer):
    """
    A ModelSerializer that takes an additional `fields` argument that
    controls which fields should be displayed.
    """

    def __init__(self, *args, **kwargs):
        # Don't pass the 'fields' arg up to the superclass
        fields = kwargs.pop('fields', None)

        # Instantiate the superclass normally
        super(DynamicFieldsModelSerializer, self).__init__(*args, **kwargs)

        if fields is not None:
            # Drop any fields that are not specified in the `fields` argument.
            allowed = set(fields)
            existing = set(self.fields)
            for field_name in existing - allowed:
                self.fields.pop(field_name)

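
A usage sketch for the dynamic-fields pattern; DocumentSerializer below inherits from it, and the fields value would typically come from a query parameter:

    serializer = DocumentSerializer(doc, fields=("id", "title"))
    serializer.data   # contains only "id" and "title"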
class MatchingModelSerializer(serializers.ModelSerializer):

    document_count = serializers.IntegerField(read_only=True)

    def get_slug(self, obj):
        return slugify(obj.name)
    slug = SerializerMethodField()

    def validate_match(self, match):
        if 'matching_algorithm' in self.initial_data and self.initial_data['matching_algorithm'] == MatchingModel.MATCH_REGEX:  # NOQA: E501
            try:
                re.compile(match)
            except Exception as e:
                raise serializers.ValidationError(
                    _("Invalid regular expression: %(error)s") %
                    {'error': str(e)}
                )
        return match


class CorrespondentSerializer(MatchingModelSerializer):

    last_correspondence = serializers.DateTimeField(read_only=True)

    class Meta:
@@ -23,9 +73,7 @@ class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
    )


class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):

    document_count = serializers.IntegerField(read_only=True)
class DocumentTypeSerializer(MatchingModelSerializer):

    class Meta:
        model = DocumentType
@@ -40,9 +88,7 @@ class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
    )


class TagSerializer(serializers.HyperlinkedModelSerializer):

    document_count = serializers.IntegerField(read_only=True)
class TagSerializer(MatchingModelSerializer):

    class Meta:
        model = Tag
@@ -74,13 +120,23 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
        return DocumentType.objects.all()


class DocumentSerializer(serializers.ModelSerializer):
class DocumentSerializer(DynamicFieldsModelSerializer):

    correspondent_id = CorrespondentField(
        allow_null=True, source='correspondent')
    tags_id = TagsField(many=True, source='tags')
    document_type_id = DocumentTypeField(
        allow_null=True, source='document_type')
    correspondent = CorrespondentField(allow_null=True)
    tags = TagsField(many=True)
    document_type = DocumentTypeField(allow_null=True)

    original_file_name = SerializerMethodField()
    archived_file_name = SerializerMethodField()

    def get_original_file_name(self, obj):
        return obj.get_public_filename()

    def get_archived_file_name(self, obj):
        if obj.has_archive_version:
            return obj.get_public_filename(archive=True)
        else:
            return None

    class Meta:
        model = Document
@@ -88,28 +144,280 @@ class DocumentSerializer(serializers.ModelSerializer):
        fields = (
            "id",
            "correspondent",
            "correspondent_id",
            "document_type",
            "document_type_id",
            "title",
            "content",
            "tags",
            "tags_id",
            "created",
            "modified",
            "added",
            "archive_serial_number"
            "archive_serial_number",
            "original_file_name",
            "archived_file_name",
        )


class LogSerializer(serializers.ModelSerializer):
class SavedViewFilterRuleSerializer(serializers.ModelSerializer):

    class Meta:
        model = Log
        fields = (
            "id",
            "created",
            "message",
            "group",
            "level"
        )
        model = SavedViewFilterRule
        fields = ["rule_type", "value"]


class SavedViewSerializer(serializers.ModelSerializer):

    filter_rules = SavedViewFilterRuleSerializer(many=True)

    class Meta:
        model = SavedView
        depth = 1
        fields = ["id", "name", "show_on_dashboard", "show_in_sidebar",
                  "sort_field", "sort_reverse", "filter_rules"]

    def update(self, instance, validated_data):
        if 'filter_rules' in validated_data:
            rules_data = validated_data.pop('filter_rules')
        else:
            rules_data = None
        super(SavedViewSerializer, self).update(instance, validated_data)
        if rules_data is not None:
            SavedViewFilterRule.objects.filter(saved_view=instance).delete()
            for rule_data in rules_data:
                SavedViewFilterRule.objects.create(
                    saved_view=instance, **rule_data)
        return instance

    def create(self, validated_data):
        rules_data = validated_data.pop('filter_rules')
        saved_view = SavedView.objects.create(**validated_data)
        for rule_data in rules_data:
            SavedViewFilterRule.objects.create(
                saved_view=saved_view, **rule_data)
        return saved_view

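A sketch of the nested write these create/update overrides enable (values are illustrative; rule_type 5 is "is in inbox" per the model above):

    serializer = SavedViewSerializer(data={
        "name": "Inbox",
        "show_on_dashboard": True,
        "show_in_sidebar": True,
        "sort_field": "created",
        "filter_rules": [{"rule_type": 5, "value": None}],
    })
    serializer.is_valid(raise_exception=True)
    view = serializer.save()   # creates the SavedView and its filter rules
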
class DocumentListSerializer(serializers.Serializer):

    documents = serializers.ListField(
        required=True,
        label="Documents",
        write_only=True,
        child=serializers.IntegerField()
    )

    def _validate_document_id_list(self, documents, name="documents"):
        if not type(documents) == list:
            raise serializers.ValidationError(f"{name} must be a list")
        if not all([type(i) == int for i in documents]):
            raise serializers.ValidationError(
                f"{name} must be a list of integers")
        count = Document.objects.filter(id__in=documents).count()
        if not count == len(documents):
            raise serializers.ValidationError(
                f"Some documents in {name} don't exist or were "
                f"specified twice.")

    def validate_documents(self, documents):
        self._validate_document_id_list(documents)
        return documents


class BulkEditSerializer(DocumentListSerializer):

    method = serializers.ChoiceField(
        choices=[
            "set_correspondent",
            "set_document_type",
            "add_tag",
            "remove_tag",
            "modify_tags",
            "delete"
        ],
        label="Method",
        write_only=True,
    )

    parameters = serializers.DictField(allow_empty=True)

    def _validate_tag_id_list(self, tags, name="tags"):
        if not type(tags) == list:
            raise serializers.ValidationError(f"{name} must be a list")
        if not all([type(i) == int for i in tags]):
            raise serializers.ValidationError(
                f"{name} must be a list of integers")
        count = Tag.objects.filter(id__in=tags).count()
        if not count == len(tags):
            raise serializers.ValidationError(
                f"Some tags in {name} don't exist or were specified twice.")

    def validate_method(self, method):
        if method == "set_correspondent":
            return bulk_edit.set_correspondent
        elif method == "set_document_type":
            return bulk_edit.set_document_type
        elif method == "add_tag":
            return bulk_edit.add_tag
        elif method == "remove_tag":
            return bulk_edit.remove_tag
        elif method == "modify_tags":
            return bulk_edit.modify_tags
        elif method == "delete":
            return bulk_edit.delete
        else:
            raise serializers.ValidationError("Unsupported method.")

    def _validate_parameters_tags(self, parameters):
        if 'tag' in parameters:
            tag_id = parameters['tag']
            try:
                Tag.objects.get(id=tag_id)
            except Tag.DoesNotExist:
                raise serializers.ValidationError("Tag does not exist")
        else:
            raise serializers.ValidationError("tag not specified")

    def _validate_parameters_document_type(self, parameters):
        if 'document_type' in parameters:
            document_type_id = parameters['document_type']
            if document_type_id is None:
                # None is ok
                return
            try:
                DocumentType.objects.get(id=document_type_id)
            except DocumentType.DoesNotExist:
                raise serializers.ValidationError(
                    "Document type does not exist")
        else:
            raise serializers.ValidationError("document_type not specified")

    def _validate_parameters_correspondent(self, parameters):
        if 'correspondent' in parameters:
            correspondent_id = parameters['correspondent']
            if correspondent_id is None:
                return
            try:
                Correspondent.objects.get(id=correspondent_id)
            except Correspondent.DoesNotExist:
                raise serializers.ValidationError(
                    "Correspondent does not exist")
        else:
            raise serializers.ValidationError("correspondent not specified")

    def _validate_parameters_modify_tags(self, parameters):
        if "add_tags" in parameters:
            self._validate_tag_id_list(parameters['add_tags'], "add_tags")
        else:
            raise serializers.ValidationError("add_tags not specified")

        if "remove_tags" in parameters:
            self._validate_tag_id_list(parameters['remove_tags'],
                                       "remove_tags")
        else:
            raise serializers.ValidationError("remove_tags not specified")

    def validate(self, attrs):

        method = attrs['method']
        parameters = attrs['parameters']

        if method == bulk_edit.set_correspondent:
            self._validate_parameters_correspondent(parameters)
        elif method == bulk_edit.set_document_type:
            self._validate_parameters_document_type(parameters)
        elif method == bulk_edit.add_tag or method == bulk_edit.remove_tag:
            self._validate_parameters_tags(parameters)
        elif method == bulk_edit.modify_tags:
            self._validate_parameters_modify_tags(parameters)

        return attrs


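A sketch of a payload this serializer accepts (IDs are illustrative):

    serializer = BulkEditSerializer(data={
        "documents": [11, 12],
        "method": "modify_tags",
        "parameters": {"add_tags": [3], "remove_tags": [7]},
    })
    serializer.is_valid(raise_exception=True)
    # validated_data["method"] is now the bulk_edit.modify_tags callable
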
class PostDocumentSerializer(serializers.Serializer):

    document = serializers.FileField(
        label="Document",
        write_only=True,
    )

    title = serializers.CharField(
        label="Title",
        write_only=True,
        required=False,
    )

    correspondent = serializers.PrimaryKeyRelatedField(
        queryset=Correspondent.objects.all(),
        label="Correspondent",
        allow_null=True,
        write_only=True,
        required=False,
    )

    document_type = serializers.PrimaryKeyRelatedField(
        queryset=DocumentType.objects.all(),
        label="Document type",
        allow_null=True,
        write_only=True,
        required=False,
    )

    tags = serializers.PrimaryKeyRelatedField(
        many=True,
        queryset=Tag.objects.all(),
        label="Tags",
        write_only=True,
        required=False,
    )

    def validate_document(self, document):
        document_data = document.file.read()
        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
            raise serializers.ValidationError(
                _("File type %(type)s not supported") %
                {'type': mime_type}
            )

        return document.name, document_data

    def validate_correspondent(self, correspondent):
        if correspondent:
            return correspondent.id
        else:
            return None

    def validate_document_type(self, document_type):
        if document_type:
            return document_type.id
        else:
            return None

    def validate_tags(self, tags):
        if tags:
            return [tag.id for tag in tags]
        else:
            return None


class BulkDownloadSerializer(DocumentListSerializer):

    content = serializers.ChoiceField(
        choices=["archive", "originals", "both"],
        default="archive"
    )

    compression = serializers.ChoiceField(
        choices=["none", "deflated", "bzip2", "lzma"],
        default="none"
    )

    def validate_compression(self, compression):
        import zipfile

        return {
            "none": zipfile.ZIP_STORED,
            "deflated": zipfile.ZIP_DEFLATED,
            "bzip2": zipfile.ZIP_BZIP2,
            "lzma": zipfile.ZIP_LZMA
        }[compression]
@@ -1,24 +1,24 @@
import logging
import os
from subprocess import Popen

from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from rest_framework.reverse import reverse
from filelock import FileLock

from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
    create_source_path_directory, archive_name_from_filename
from .. import matching
from ..file_handling import delete_empty_directories, \
    create_source_path_directory, \
    generate_unique_filename
from ..models import Document, Tag


def logger(message, group):
    logging.getLogger(__name__).debug(message, extra={"group": group})
logger = logging.getLogger("paperless.handlers")


def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
@@ -36,7 +36,7 @@ def set_correspondent(sender,
    if document.correspondent and not replace:
        return

    potential_correspondents = matching.match_correspondents(document.content,
    potential_correspondents = matching.match_correspondents(document,
                                                             classifier)

    potential_count = len(potential_correspondents)
@@ -46,23 +46,23 @@ def set_correspondent(sender,
    selected = None
    if potential_count > 1:
        if use_first:
            logger(
            logger.info(
                f"Detected {potential_count} potential correspondents, "
                f"so we've opted for {selected}",
                logging_group
                extra={'group': logging_group}
            )
        else:
            logger(
            logger.info(
                f"Detected {potential_count} potential correspondents, "
                f"not assigning any correspondent",
                logging_group
                extra={'group': logging_group}
            )
            return

    if selected or replace:
        logger(
        logger.info(
            f"Assigning correspondent {selected} to {document}",
            logging_group
            extra={'group': logging_group}
        )

        document.correspondent = selected
@@ -79,7 +79,7 @@ def set_document_type(sender,
    if document.document_type and not replace:
        return

    potential_document_type = matching.match_document_types(document.content,
    potential_document_type = matching.match_document_types(document,
                                                            classifier)

    potential_count = len(potential_document_type)
@@ -90,23 +90,23 @@ def set_document_type(sender,

    if potential_count > 1:
        if use_first:
            logger(
            logger.info(
                f"Detected {potential_count} potential document types, "
                f"so we've opted for {selected}",
                logging_group
                extra={'group': logging_group}
            )
        else:
            logger(
            logger.info(
                f"Detected {potential_count} potential document types, "
                f"not assigning any document type",
                logging_group
                extra={'group': logging_group}
            )
            return

    if selected or replace:
        logger(
        logger.info(
            f"Assigning document type {selected} to {document}",
            logging_group
            extra={'group': logging_group}
        )

        document.document_type = selected
@@ -119,13 +119,16 @@ def set_tags(sender,
             classifier=None,
             replace=False,
             **kwargs):
    if replace:
        document.tags.clear()
        current_tags = set([])
    else:
        current_tags = set(document.tags.all())

    matched_tags = matching.match_tags(document.content, classifier)
    if replace:
        Document.tags.through.objects.filter(document=document).exclude(
            Q(tag__is_inbox_tag=True)).exclude(
            Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO)
        ).delete()

    current_tags = set(document.tags.all())

    matched_tags = matching.match_tags(document, classifier)

    relevant_tags = set(matched_tags) - current_tags

@@ -133,82 +136,60 @@ def set_tags(sender,
        return

    message = 'Tagging "{}" with "{}"'
    logger(
        message.format(document, ", ".join([t.slug for t in relevant_tags])),
        logging_group
    logger.info(
        message.format(document, ", ".join([t.name for t in relevant_tags])),
        extra={'group': logging_group}
    )

    document.tags.add(*relevant_tags)


def run_pre_consume_script(sender, filename, **kwargs):

    if not settings.PRE_CONSUME_SCRIPT:
        return

    Popen((settings.PRE_CONSUME_SCRIPT, filename)).wait()


def run_post_consume_script(sender, document, **kwargs):

    if not settings.POST_CONSUME_SCRIPT:
        return

    Popen((
        settings.POST_CONSUME_SCRIPT,
        str(document.pk),
        document.file_name,
        os.path.normpath(document.source_path),
        os.path.normpath(document.thumbnail_path),
        reverse("document-download", kwargs={"pk": document.pk}),
        reverse("document-thumb", kwargs={"pk": document.pk}),
        str(document.correspondent),
        str(",".join(document.tags.all().values_list("slug", flat=True)))
    )).wait()


@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
    for f in (instance.source_path,
              instance.archive_path,
              instance.thumbnail_path):
        if os.path.isfile(f):
            try:
                os.unlink(f)
                logging.getLogger(__name__).debug(
                    f"Deleted file {f}.")
            except OSError as e:
                logging.getLogger(__name__).warning(
                    f"While deleting document {instance.file_name}, the file "
                    f"{f} could not be deleted: {e}"
                )
    with FileLock(settings.MEDIA_LOCK):
        for filename in (instance.source_path,
                         instance.archive_path,
                         instance.thumbnail_path):
            if filename and os.path.isfile(filename):
                try:
                    os.unlink(filename)
                    logger.debug(
                        f"Deleted file {filename}.")
                except OSError as e:
                    logger.warning(
                        f"While deleting document {str(instance)}, the file "
                        f"{filename} could not be deleted: {e}"
                    )

    delete_empty_directories(
        os.path.dirname(instance.source_path),
        root=settings.ORIGINALS_DIR
    )
    delete_empty_directories(
        os.path.dirname(instance.source_path),
        root=settings.ORIGINALS_DIR
    )

    delete_empty_directories(
        os.path.dirname(instance.archive_path),
        root=settings.ARCHIVE_DIR
    )
    if instance.has_archive_version:
        delete_empty_directories(
            os.path.dirname(instance.archive_path),
            root=settings.ARCHIVE_DIR
        )


class CannotMoveFilesException(Exception):
    pass


def validate_move(instance, old_path, new_path):
|
||||
if not os.path.isfile(old_path):
|
||||
# Can't do anything if the old file does not exist anymore.
|
||||
logging.getLogger(__name__).fatal(
|
||||
logger.fatal(
|
||||
f"Document {str(instance)}: File {old_path} has gone.")
|
||||
return False
|
||||
raise CannotMoveFilesException()
|
||||
|
||||
if os.path.isfile(new_path):
|
||||
# Can't do anything if the new file already exists. Skip updating file.
|
||||
logging.getLogger(__name__).warning(
|
||||
logger.warning(
|
||||
f"Document {str(instance)}: Cannot rename file "
|
||||
f"since target path {new_path} already exists.")
|
||||
return False
|
||||
|
||||
return True
|
||||
raise CannotMoveFilesException()
|
||||
|
||||
|
||||
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
|
||||
@@ -226,81 +207,86 @@ def update_filename_and_move_files(sender, instance, **kwargs):
|
||||
# This will in turn cause this logic to move the file where it belongs.
|
||||
return
|
||||
|
||||
old_filename = instance.filename
|
||||
new_filename = generate_filename(instance)
|
||||
|
||||
if new_filename == instance.filename:
|
||||
# Don't do anything if its the same.
|
||||
return
|
||||
|
||||
old_source_path = instance.source_path
|
||||
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
|
||||
|
||||
if not validate_move(instance, old_source_path, new_source_path):
|
||||
return
|
||||
|
||||
# archive files are optional, archive checksum tells us if we have one,
|
||||
# since this is None for documents without archived files.
|
||||
if instance.archive_checksum:
|
||||
new_archive_filename = archive_name_from_filename(new_filename)
|
||||
old_archive_path = instance.archive_path
|
||||
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
|
||||
new_archive_filename)
|
||||
|
||||
if not validate_move(instance, old_archive_path, new_archive_path):
|
||||
return
|
||||
|
||||
create_source_path_directory(new_archive_path)
|
||||
else:
|
||||
old_archive_path = None
|
||||
new_archive_path = None
|
||||
|
||||
create_source_path_directory(new_source_path)
|
||||
|
||||
try:
|
||||
os.rename(old_source_path, new_source_path)
|
||||
if instance.archive_checksum:
|
||||
os.rename(old_archive_path, new_archive_path)
|
||||
instance.filename = new_filename
|
||||
# Don't save here to prevent infinite recursion.
|
||||
Document.objects.filter(pk=instance.pk).update(filename=new_filename)
|
||||
|
||||
logging.getLogger(__name__).debug(
|
||||
f"Moved file {old_source_path} to {new_source_path}.")
|
||||
|
||||
if instance.archive_checksum:
|
||||
logging.getLogger(__name__).debug(
|
||||
f"Moved file {old_archive_path} to {new_archive_path}.")
|
||||
|
||||
except OSError as e:
|
||||
instance.filename = old_filename
|
||||
# this happens when we can't move a file. If that's the case for the
|
||||
# archive file, we try our best to revert the changes.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
try:
|
||||
os.rename(new_source_path, old_source_path)
|
||||
os.rename(new_archive_path, old_archive_path)
|
||||
except Exception as e:
|
||||
# This is fine, since:
|
||||
# A: if we managed to move source from A to B, we will also manage
|
||||
# to move it from B to A. If not, we have a serious issue
|
||||
# that's going to get caught by the santiy checker.
|
||||
# all files remain in place and will never be overwritten,
|
||||
# so this is not the end of the world.
|
||||
# B: if moving the orignal file failed, nothing has changed anyway.
|
||||
pass
|
||||
except DatabaseError as e:
|
||||
os.rename(new_source_path, old_source_path)
|
||||
if instance.archive_checksum:
|
||||
os.rename(new_archive_path, old_archive_path)
|
||||
instance.filename = old_filename
|
||||
old_filename = instance.filename
|
||||
old_source_path = instance.source_path
|
||||
|
||||
if not os.path.isfile(old_source_path):
|
||||
delete_empty_directories(os.path.dirname(old_source_path),
|
||||
root=settings.ORIGINALS_DIR)
|
||||
instance.filename = generate_unique_filename(instance)
|
||||
move_original = old_filename != instance.filename
|
||||
|
||||
if old_archive_path and not os.path.isfile(old_archive_path):
|
||||
delete_empty_directories(os.path.dirname(old_archive_path),
|
||||
root=settings.ARCHIVE_DIR)
|
||||
old_archive_filename = instance.archive_filename
|
||||
old_archive_path = instance.archive_path
|
||||
|
||||
if instance.has_archive_version:
|
||||
|
||||
instance.archive_filename = generate_unique_filename(
|
||||
instance, archive_filename=True
|
||||
)
|
||||
|
||||
move_archive = old_archive_filename != instance.archive_filename # NOQA: E501
|
||||
else:
|
||||
move_archive = False
|
||||
|
||||
if not move_original and not move_archive:
|
||||
# Don't do anything if filenames did not change.
|
||||
return
|
||||
|
||||
if move_original:
|
||||
validate_move(instance, old_source_path, instance.source_path)
|
||||
create_source_path_directory(instance.source_path)
|
||||
os.rename(old_source_path, instance.source_path)
|
||||
|
||||
if move_archive:
|
||||
validate_move(
|
||||
instance, old_archive_path, instance.archive_path)
|
||||
create_source_path_directory(instance.archive_path)
|
||||
os.rename(old_archive_path, instance.archive_path)
|
||||
|
||||
# Don't save() here to prevent infinite recursion.
|
||||
Document.objects.filter(pk=instance.pk).update(
|
||||
filename=instance.filename,
|
||||
archive_filename=instance.archive_filename,
|
||||
)
|
||||
|
||||
except (OSError, DatabaseError, CannotMoveFilesException):
|
||||
# This happens when either:
|
||||
# - moving the files failed due to file system errors
|
||||
# - saving to the database failed due to database errors
|
||||
# In both cases, we need to revert to the original state.
|
||||
|
||||
# Try to move files to their original location.
|
||||
try:
|
||||
if move_original and os.path.isfile(instance.source_path):
|
||||
os.rename(instance.source_path, old_source_path)
|
||||
|
||||
if move_archive and os.path.isfile(instance.archive_path):
|
||||
os.rename(instance.archive_path, old_archive_path)
|
||||
|
||||
except Exception as e:
|
||||
# This is fine, since:
|
||||
# A: if we managed to move source from A to B, we will also
|
||||
# manage to move it from B to A. If not, we have a serious
|
||||
# issue that's going to get caught by the santiy checker.
|
||||
# All files remain in place and will never be overwritten,
|
||||
# so this is not the end of the world.
|
||||
# B: if moving the orignal file failed, nothing has changed
|
||||
# anyway.
|
||||
pass
|
||||
|
||||
# restore old values on the instance
|
||||
instance.filename = old_filename
|
||||
instance.archive_filename = old_archive_filename
|
||||
|
||||
# finally, remove any empty sub folders. This will do nothing if
|
||||
# something has failed above.
|
||||
if not os.path.isfile(old_source_path):
|
||||
delete_empty_directories(os.path.dirname(old_source_path),
|
||||
root=settings.ORIGINALS_DIR)
|
||||
|
||||
if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
|
||||
delete_empty_directories(os.path.dirname(old_archive_path),
|
||||
root=settings.ARCHIVE_DIR)
|
||||
|
||||
|
||||
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
@@ -319,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
|
||||
|
||||
def add_to_index(sender, document, **kwargs):
|
||||
from documents import index
|
||||
|
||||
index.add_or_update_document(document)
|
||||
|
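For orientation, the positional arguments handed to the post-consume script by the Popen call above are: script path, document id, file name, source path, thumbnail path, download URL, thumbnail URL, correspondent, and a comma-separated tag list. A minimal sketch of a consumer-side script under that assumption (illustrative only, not part of this commit):

#!/usr/bin/env python3
# Minimal post-consume script sketch, assuming the argument order of the
# Popen call above. Everything here is illustrative, not shipped code.
import sys

def main(argv):
    doc_pk = argv[1]           # document primary key
    file_name = argv[2]        # generated file name
    source_path = argv[3]      # absolute path to the original
    thumbnail_path = argv[4]   # absolute path to the thumbnail
    download_url = argv[5]     # e.g. /api/documents/<pk>/download/
    thumb_url = argv[6]        # e.g. /api/documents/<pk>/thumb/
    correspondent = argv[7]    # correspondent name as a string
    tags = argv[8].split(",")  # tag names
    print(f"consumed document {doc_pk} ({file_name}) from {correspondent}")

if __name__ == "__main__":
    main(sys.argv)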
@@ -1,14 +1,17 @@
import logging

import tqdm
from django.conf import settings
from django.db.models.signals import post_save
from whoosh.writing import AsyncWriter

from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, load_classifier
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import SanityCheckFailedException

logger = logging.getLogger("paperless.tasks")


def index_optimize():
@@ -23,34 +26,39 @@ def index_reindex():
    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in documents:
        for document in tqdm.tqdm(documents):
            index.update_document(writer, document)


def train_classifier():
    classifier = DocumentClassifier()
    if (not Tag.objects.filter(
            matching_algorithm=Tag.MATCH_AUTO).exists() and
            not DocumentType.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists() and
            not Correspondent.objects.filter(
                matching_algorithm=Tag.MATCH_AUTO).exists()):

    try:
        # load the classifier, since we might not have to train it again.
        classifier.reload()
    except (FileNotFoundError, IncompatibleClassifierVersionError):
        # This is what we're going to fix here.
        pass
        return

    classifier = load_classifier()

    if not classifier:
        classifier = DocumentClassifier()

    try:
        if classifier.train():
            logging.getLogger(__name__).info(
            logger.info(
                "Saving updated classifier model to {}...".format(
                    settings.MODEL_FILE)
            )
            classifier.save_classifier()
            classifier.save()
        else:
            logging.getLogger(__name__).debug(
            logger.debug(
                "Training data unchanged."
            )

    except Exception as e:
        logging.getLogger(__name__).error(
        logger.warning(
            "Classifier error: " + str(e)
        )

@@ -60,7 +68,8 @@ def consume_file(path,
                 override_title=None,
                 override_correspondent_id=None,
                 override_document_type_id=None,
                 override_tag_ids=None):
                 override_tag_ids=None,
                 task_id=None):

    document = Consumer().try_consume_file(
        path,
@@ -68,7 +77,9 @@ def consume_file(path,
        override_title=override_title,
        override_correspondent_id=override_correspondent_id,
        override_document_type_id=override_document_type_id,
        override_tag_ids=override_tag_ids)
        override_tag_ids=override_tag_ids,
        task_id=task_id
    )

    if document:
        return "Success. New document id {} created".format(
@@ -82,7 +93,27 @@ def consume_file(path,
def sanity_check():
    messages = sanity_checker.check_sanity()

    if len(messages) > 0:
        raise SanityFailedError(messages)
    messages.log_messages()

    if messages.has_error():
        raise SanityCheckFailedException(
            "Sanity check failed with errors. See log.")
    elif messages.has_warning():
        return "Sanity check exited with warnings. See log."
    elif len(messages) > 0:
        return "Sanity check exited with infos. See log."
    else:
        return "No issues detected."


def bulk_update_documents(document_ids):
    documents = Document.objects.filter(id__in=document_ids)

    ix = index.open_index()

    for doc in documents:
        post_save.send(Document, instance=doc, created=False)

    with AsyncWriter(ix) as writer:
        for doc in documents:
            index.update_document(writer, doc)
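A sketch of how a caller might consume the reworked sanity_check() contract above (hypothetical caller code, not part of this commit; notify_admin is a made-up helper): errors raise SanityCheckFailedException, while warnings and infos come back as a result string.

from documents.tasks import sanity_check
from documents.sanity_checker import SanityCheckFailedException

def run_scheduled_sanity_check():
    try:
        result = sanity_check()
    except SanityCheckFailedException as e:
        # Errors were found; details were already written to the log.
        notify_admin(str(e))  # hypothetical notification helper
    else:
        # "No issues detected." or a warning/info summary string.
        print(result)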
@@ -1,19 +1,26 @@
<!doctype html>

{% load static %}
{% load i18n %}

<html lang="en">
<head>
  <meta charset="utf-8">
  <title>PaperlessUi</title>
  <title>Paperless-ng</title>
  <base href="/">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="username" content="{{username}}">
  <meta name="full_name" content="{{full_name}}">
  <meta name="cookie_prefix" content="{{cookie_prefix}}">
  <link rel="icon" type="image/x-icon" href="favicon.ico">
  <link rel="stylesheet" href="{% static 'frontend/styles.css' %}"></head>
  <link rel="manifest" href="{% static webmanifest %}">
  <link rel="stylesheet" href="{% static styles_css %}">
  <link rel="apple-touch-icon" href="apple-touch-icon.png">
</head>
<body>
  <app-root>Loading...</app-root>
  <script src="{% static 'frontend/runtime.js' %}" defer></script>
  <script src="{% static 'frontend/polyfills.js' %}" defer></script>
  <script src="{% static 'frontend/main.js' %}" defer></script>
  <app-root>{% translate "Paperless-ng is loading..." %}</app-root>
  <script src="{% static runtime_js %}" defer></script>
  <script src="{% static polyfills_js %}" defer></script>
  <script src="{% static main_js %}" defer></script>
</body>
</html>
@@ -1,6 +1,7 @@
<!doctype html>

{% load static %}
{% load i18n %}

<html lang="en">
<head>
@@ -9,7 +10,7 @@
  <meta name="description" content="">
  <meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
  <meta name="generator" content="Jekyll v4.1.1">
  <title>Paperless Sign In</title>
  <title>{% translate "Paperless-ng signed out" %}</title>

  <!-- Bootstrap core CSS -->
  <link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
@@ -36,9 +37,9 @@

<body class="text-center">
  <div class="form-signin">
    <img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
    <p>You have been successfully logged out. Bye!</p>
    <a href="/">Sign in again</a>
    <img class="mb-4" src="{% static 'frontend/en-US/assets/logo.svg' %}" alt="" width="300">
    <p>{% translate "You have been successfully logged out. Bye!" %}</p>
    <a href="/">{% translate "Sign in again" %}</a>
  </div>
</body>
</html>
@@ -1,6 +1,7 @@
<!doctype html>

{% load static %}
{% load i18n %}

<html lang="en">
<head>
@@ -9,7 +10,7 @@
  <meta name="description" content="">
  <meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
  <meta name="generator" content="Jekyll v4.1.1">
  <title>Paperless Sign In</title>
  <title>{% translate "Paperless-ng sign in" %}</title>

  <!-- Bootstrap core CSS -->
  <link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
@@ -37,18 +38,20 @@
<body class="text-center">
  <form class="form-signin" method="post">
    {% csrf_token %}
    <img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
    <p>Please sign in.</p>
    <img class="mb-4" src="{% static 'frontend/en-US/assets/logo.svg' %}" alt="" width="300">
    <p>{% translate "Please sign in." %}</p>
    {% if form.errors %}
      <div class="alert alert-danger" role="alert">
        Your username and password didn't match. Please try again.
        {% translate "Your username and password didn't match. Please try again." %}
      </div>
    {% endif %}
    <label for="inputUsername" class="sr-only">Username</label>
    <input type="text" name="username" id="inputUsername" class="form-control" placeholder="Username" required autofocus>
    <label for="inputPassword" class="sr-only">Password</label>
    <input type="password" name="password" id="inputPassword" class="form-control" placeholder="Password" required>
    <button class="btn btn-lg btn-primary btn-block" type="submit">Sign in</button>
    {% translate "Username" as i18n_username %}
    {% translate "Password" as i18n_password %}
    <label for="inputUsername" class="sr-only">{{ i18n_username }}</label>
    <input type="text" name="username" id="inputUsername" class="form-control" placeholder="{{ i18n_username }}" required autofocus>
    <label for="inputPassword" class="sr-only">{{ i18n_password }}</label>
    <input type="password" name="password" id="inputPassword" class="form-control" placeholder="{{ i18n_password }}" required>
    <button class="btn btn-lg btn-primary btn-block" type="submit">{% translate "Sign in" %}</button>
  </form>
</body>
</html>
BIN src/documents/tests/data/model.pickle (new file; binary file not shown)
BIN src/documents/tests/samples/documents/originals/0000002.pdf (new file; binary file not shown)
BIN src/documents/tests/samples/documents/originals/0000003.pdf (new file; binary file not shown)
BIN src/documents/tests/samples/documents/originals/0000004.pdf.gpg (new file; binary file not shown)
BIN src/documents/tests/samples/documents/thumbnails/0000002.png (new file; 7.7 KiB)
BIN src/documents/tests/samples/documents/thumbnails/0000003.png (new file; 7.7 KiB)
BIN src/documents/tests/samples/simple-noalpha.png (new file; 6.3 KiB)
BIN src/documents/tests/samples/simple.jpg (new file; 17 KiB)
BIN src/documents/tests/samples/simple.png (new file; 7.7 KiB)

src/documents/tests/samples/simple.txt (new file; 1 line)
@@ -0,0 +1 @@
This is a test file.

BIN src/documents/tests/samples/test_with_bom.pdf (new file; binary file not shown)

src/documents/tests/test_admin.py (new file; 63 lines)
@@ -0,0 +1,63 @@
from unittest import mock

from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone

from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


class TestDocumentAdmin(DirectoriesMixin, TestCase):

    def get_document_from_index(self, doc):
        ix = index.open_index()
        with ix.searcher() as searcher:
            return searcher.document(id=doc.id)

    def setUp(self) -> None:
        super(TestDocumentAdmin, self).setUp()
        self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())

    def test_save_model(self):
        doc = Document.objects.create(title="test")

        doc.title = "new title"
        self.doc_admin.save_model(None, doc, None, None)
        self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
        self.assertEqual(self.get_document_from_index(doc)['title'], "new title")

    def test_delete_model(self):
        doc = Document.objects.create(title="test")
        index.add_or_update_document(doc)
        self.assertIsNotNone(self.get_document_from_index(doc))

        self.doc_admin.delete_model(None, doc)

        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
        self.assertIsNone(self.get_document_from_index(doc))

    def test_delete_queryset(self):
        docs = []
        for i in range(42):
            doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
            docs.append(doc)
            index.add_or_update_document(doc)

        self.assertEqual(Document.objects.count(), 42)

        for doc in docs:
            self.assertIsNotNone(self.get_document_from_index(doc))

        self.doc_admin.delete_queryset(None, Document.objects.all())

        self.assertEqual(Document.objects.count(), 0)

        for doc in docs:
            self.assertIsNone(self.get_document_from_index(doc))

    def test_created(self):
        doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
        self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

File diff suppressed because it is too large (diff not shown)
@@ -1,9 +1,12 @@
import unittest
from unittest import mock

from django.core.checks import Error
from django.test import TestCase

from .factories import DocumentFactory
from ..checks import changed_password_check
from .. import document_consumer_declaration
from ..checks import changed_password_check, parser_check
from ..models import Document


@@ -15,3 +18,13 @@ class ChecksTestCase(TestCase):
    def test_changed_password_check_no_encryption(self):
        DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(changed_password_check(None), [])

    def test_parser_check(self):

        self.assertEqual(parser_check(None), [])

        with mock.patch('documents.checks.document_consumer_declaration.send') as m:
            m.return_value = []

            self.assertEqual(parser_check(None), [Error("No parsers found. This is a bug. The consumer won't be "
                                                        "able to consume any documents without parsers.")])
@@ -1,10 +1,13 @@
import os
import tempfile
from time import sleep
from pathlib import Path
from unittest import mock

import pytest
from django.conf import settings
from django.test import TestCase, override_settings

from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin

@@ -82,37 +85,19 @@ class TestClassifier(DirectoriesMixin, TestCase):
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

        self.classifier.save_classifier()
        self.classifier.save()

        classifier2 = DocumentClassifier()

        current_ver = DocumentClassifier.FORMAT_VERSION
        with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
            # assure that we won't load old classifiers.
            self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)
            self.assertRaises(IncompatibleClassifierVersionError, classifier2.load)

            self.classifier.save_classifier()
            self.classifier.save()

            # assure that we can load the classifier after saving it.
            classifier2.reload()

    def testReload(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()
        classifier2.reload()
        v1 = classifier2.classifier_version

        # change the classifier after some time.
        sleep(1)
        self.classifier.save_classifier()

        classifier2.reload()
        v2 = classifier2.classifier_version
        self.assertNotEqual(v1, v2)
            classifier2.load()

    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):
@@ -121,12 +106,21 @@ class TestClassifier(DirectoriesMixin, TestCase):

        self.classifier.train()

        self.classifier.save_classifier()
        self.classifier.save()

        new_classifier = DocumentClassifier()
        new_classifier.reload()
        new_classifier.load()
        self.assertFalse(new_classifier.train())

    @override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
    def test_load_and_classify(self):
        self.generate_test_data()

        new_classifier = DocumentClassifier()
        new_classifier.load()

        self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])

    def test_one_correspondent_predict(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
@@ -235,3 +229,42 @@ class TestClassifier(DirectoriesMixin, TestCase):
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [])

    def test_load_classifier_not_exists(self):
        self.assertFalse(os.path.exists(settings.MODEL_FILE))
        self.assertIsNone(load_classifier())

    @mock.patch("documents.classifier.DocumentClassifier.load")
    def test_load_classifier(self, load):
        Path(settings.MODEL_FILE).touch()
        self.assertIsNotNone(load_classifier())
        load.assert_called_once()

    @override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
    @override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
    @pytest.mark.skip(reason="Disabled caching due to high memory usage - need to investigate.")
    def test_load_classifier_cached(self):
        classifier = load_classifier()
        self.assertIsNotNone(classifier)

        with mock.patch("documents.classifier.DocumentClassifier.load") as load:
            classifier2 = load_classifier()
            load.assert_not_called()

    @mock.patch("documents.classifier.DocumentClassifier.load")
    def test_load_classifier_incompatible_version(self, load):
        Path(settings.MODEL_FILE).touch()
        self.assertTrue(os.path.exists(settings.MODEL_FILE))

        load.side_effect = IncompatibleClassifierVersionError()
        self.assertIsNone(load_classifier())
        self.assertFalse(os.path.exists(settings.MODEL_FILE))

    @mock.patch("documents.classifier.DocumentClassifier.load")
    def test_load_classifier_os_error(self, load):
        Path(settings.MODEL_FILE).touch()
        self.assertTrue(os.path.exists(settings.MODEL_FILE))

        load.side_effect = OSError()
        self.assertIsNone(load_classifier())
        self.assertTrue(os.path.exists(settings.MODEL_FILE))
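Taken together, these tests pin down the load_classifier() contract; a condensed sketch of that behaviour (an illustration of what the tests assert, not the shipped implementation):

import os
from django.conf import settings
from documents.classifier import (
    DocumentClassifier, IncompatibleClassifierVersionError)

def load_classifier_sketch():
    # No model file on disk: nothing to load.
    if not os.path.isfile(settings.MODEL_FILE):
        return None
    classifier = DocumentClassifier()
    try:
        classifier.load()
    except IncompatibleClassifierVersionError:
        # Stale format: drop the file so the next training run recreates it.
        os.unlink(settings.MODEL_FILE)
        return None
    except OSError:
        # Possibly transient; keep the file and just skip classification.
        return None
    return classifier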
@@ -5,12 +5,14 @@ import tempfile
from unittest import mock
from unittest.mock import MagicMock

from django.conf import settings
from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
from ..tasks import sanity_check


class TestAttributes(TestCase):
@@ -27,83 +29,8 @@ class TestAttributes(TestCase):

        self.assertEqual(file_info.title, title, filename)

        self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename)
        self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
            "Sender - Title.pdf", "Sender", "Title", ())

    def test_guess_attributes_from_name1(self):
        self._test_guess_attributes_from_name(
            "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())

    def test_guess_attributes_from_name2(self):
        self._test_guess_attributes_from_name(
            "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())

    def test_guess_attributes_from_name3(self):
        self._test_guess_attributes_from_name(
            "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())

    def test_guess_attributes_from_name4(self):
        self._test_guess_attributes_from_name(
            "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())

    def test_guess_attributes_from_name5(self):
        self._test_guess_attributes_from_name(
            "Sender - Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name6(self):
        self._test_guess_attributes_from_name(
            "Spaced Sender - Title - tag1,tag2,tag3.pdf",
            "Spaced Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name7(self):
        self._test_guess_attributes_from_name(
            "Sender - Spaced Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Spaced Title",
            self.TAGS
        )

    def test_guess_attributes_from_name8(self):
        self._test_guess_attributes_from_name(
            "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
            "Dashed-Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name9(self):
        self._test_guess_attributes_from_name(
            "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Dashed-Title",
            self.TAGS
        )

    def test_guess_attributes_from_name10(self):
        self._test_guess_attributes_from_name(
            "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
            "Σενδερ",
            "Τιτλε",
            self.TAGS
        )

    def test_guess_attributes_from_name_when_correspondent_empty(self):
        self._test_guess_attributes_from_name(
            ' - weird empty correspondent but should not break.pdf',
            None,
            'weird empty correspondent but should not break',
            ()
        )

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
@@ -121,28 +48,6 @@ class TestAttributes(TestCase):
            ()
        )

    def test_guess_attributes_from_name_when_title_is_empty(self):
        self._test_guess_attributes_from_name(
            'weird correspondent but should not break - .pdf',
            'weird correspondent but should not break',
            '',
            ()
        )

    def test_case_insensitive_tag_creation(self):
        """
        Tags should be detected and created as lower case.
        :return:
        """

        filename = "Title - Correspondent - tAg1,TAG2.pdf"
        self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)

        path = "Title - Correspondent - tag1,tag2.pdf"
        self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)

        self.assertEqual(Tag.objects.all().count(), 2)


class TestFieldPermutations(TestCase):

@@ -188,7 +93,7 @@ class TestFieldPermutations(TestCase):
            self.assertEqual(info.tags, (), filename)
        else:
            self.assertEqual(
                [t.slug for t in info.tags], tags.split(','),
                [t.name for t in info.tags], tags.split(','),
                filename
            )

@@ -199,69 +104,7 @@ class TestFieldPermutations(TestCase):
                filename = template.format(**spec)
                self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent(self):
        template = '{correspondent} - {title}.pdf'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                spec = dict(correspondent=correspondent, title=title)
                filename = template.format(**spec)
                self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent_and_tags(self):
        template = '{correspondent} - {title} - {tags}.pdf'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    spec = dict(correspondent=correspondent, title=title,
                                tags=tags)
                    filename = template.format(**spec)
                    self._test_guessed_attributes(filename, **spec)

    def test_created_and_correspondent_and_title_and_tags(self):

        template = (
            "{created} - "
            "{correspondent} - "
            "{title} - "
            "{tags}.pdf"
        )

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    for tags in self.valid_tags:
                        spec = {
                            "created": created,
                            "correspondent": correspondent,
                            "title": title,
                            "tags": tags,
                        }
                        self._test_guessed_attributes(
                            template.format(**spec), **spec)

    def test_created_and_correspondent_and_title(self):

        template = "{created} - {correspondent} - {title}.pdf"

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:

                    # Skip cases where title looks like a tag as we can't
                    # accommodate such cases.
                    if title.lower() == title:
                        continue

                    spec = {
                        "created": created,
                        "correspondent": correspondent,
                        "title": title
                    }
                    self._test_guessed_attributes(
                        template.format(**spec), **spec)

    def test_created_and_title(self):

        template = "{created} - {title}.pdf"

        for created in self.valid_dates:
@@ -273,21 +116,6 @@ class TestFieldPermutations(TestCase):
                self._test_guessed_attributes(
                    template.format(**spec), **spec)

    def test_created_and_title_and_tags(self):

        template = "{created} - {title} - {tags}.pdf"

        for created in self.valid_dates:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    spec = {
                        "created": created,
                        "title": title,
                        "tags": tags
                    }
                    self._test_guessed_attributes(
                        template.format(**spec), **spec)

    def test_invalid_date_format(self):
        info = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
@@ -336,54 +164,46 @@ class TestFieldPermutations(TestCase):
        info = FileInfo.from_filename(filename)
        self.assertEqual(info.title, "anotherall")

        # Complex transformation without date in replacement string
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "0001")
            self.assertEqual(len(info.tags), 2)
            self.assertEqual(info.tags[0].slug, "tag1")
            self.assertEqual(info.tags[1].slug, "tag2")
            self.assertIsNone(info.created)

        # Complex transformation with date in replacement string
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[
                    (none_patt, "none.gif"),
                    (exact_patt, repl2),  # <-- matches
                    (exact_patt, repl1),
                    (all_patt, "all.gif")]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "0001")
            self.assertEqual(len(info.tags), 2)
            self.assertEqual(info.tags[0].slug, "tag1")
            self.assertEqual(info.tags[1].slug, "tag2")
            self.assertEqual(info.created.year, 2019)
            self.assertEqual(info.created.month, 9)
            self.assertEqual(info.created.day, 8)


class DummyParser(DocumentParser):

    def get_thumbnail(self, document_path, mime_type):
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, logging_group, scratch_dir, archive_path):
        super(DummyParser, self).__init__(logging_group)
        super(DummyParser, self).__init__(logging_group, None)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        self.archive_path = archive_path

    def get_optimised_thumbnail(self, document_path, mime_type):
    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
        return self.fake_thumb

    def parse(self, document_path, mime_type):
    def parse(self, document_path, mime_type, file_name=None):
        self.text = "The Text"


class CopyParser(DocumentParser):

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        return self.fake_thumb

    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
        return self.fake_thumb

    def __init__(self, logging_group, progress_callback=None):
        super(CopyParser, self).__init__(logging_group, progress_callback)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=self.tempdir)

    def parse(self, document_path, mime_type, file_name=None):
        self.text = "The text"
        self.archive_path = os.path.join(self.tempdir, "archive.pdf")
        shutil.copy(document_path, self.archive_path)


class FaultyParser(DocumentParser):

    def get_thumbnail(self, document_path, mime_type):
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        # not important during tests
        raise NotImplementedError()

@@ -391,10 +211,10 @@ class FaultyParser(DocumentParser):
        super(FaultyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

    def get_optimised_thumbnail(self, document_path, mime_type):
    def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
        return self.fake_thumb

    def parse(self, document_path, mime_type):
    def parse(self, document_path, mime_type, file_name=None):
        raise ParseError("Does not compute.")


@@ -403,6 +223,8 @@ def fake_magic_from_file(file, mime=False):
    if mime:
        if os.path.splitext(file)[1] == ".pdf":
            return "application/pdf"
        elif os.path.splitext(file)[1] == ".png":
            return "image/png"
        else:
            return "unknown"
    else:
@@ -412,10 +234,24 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):

    def make_dummy_parser(self, logging_group):
    def _assert_first_last_send_progress(self, first_status="STARTING", last_status="SUCCESS", first_progress=0, first_progress_max=100, last_progress=100, last_progress_max=100):

        self._send_progress.assert_called()

        args, kwargs = self._send_progress.call_args_list[0]
        self.assertEqual(args[0], first_progress)
        self.assertEqual(args[1], first_progress_max)
        self.assertEqual(args[2], first_status)

        args, kwargs = self._send_progress.call_args_list[len(self._send_progress.call_args_list) - 1]
        self.assertEqual(args[0], last_progress)
        self.assertEqual(args[1], last_progress_max)
        self.assertEqual(args[2], last_status)

    def make_dummy_parser(self, logging_group, progress_callback=None):
        return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())

    def make_faulty_parser(self, logging_group):
    def make_faulty_parser(self, logging_group, progress_callback=None):
        return FaultyParser(logging_group, self.dirs.scratch_dir)

    def setUp(self):
@@ -428,7 +264,11 @@ class TestConsumer(DirectoriesMixin, TestCase):
            "mime_types": {"application/pdf": ".pdf"},
            "weight": 0
        })]
        self.addCleanup(patcher.stop)

        # this prevents websocket message reports during testing.
        patcher = mock.patch("documents.consumer.Consumer._send_progress")
        self._send_progress = patcher.start()
        self.addCleanup(patcher.stop)

        self.consumer = Consumer()
@@ -456,6 +296,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")
        self.assertEqual(document.archive_filename, "0000001.pdf")

        self.assertTrue(os.path.isfile(
            document.source_path
@@ -474,31 +315,36 @@ class TestConsumer(DirectoriesMixin, TestCase):

        self.assertFalse(os.path.isfile(filename))

        self._assert_first_last_send_progress()

    def testOverrideFilename(self):
        filename = self.get_test_file()
        override_filename = "My Bank - Statement for November.pdf"
        override_filename = "Statement for November.pdf"

        document = self.consumer.try_consume_file(filename, override_filename=override_filename)

        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):
        self._assert_first_last_send_progress()

    def testOverrideTitle(self):
        document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")
        self._assert_first_last_send_progress()

    def testOverrideCorrespondent(self):
        c = Correspondent.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
        self.assertEqual(document.correspondent.id, c.id)
        self._assert_first_last_send_progress()

    def testOverrideDocumentType(self):
        dt = DocumentType.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
        self.assertEqual(document.document_type.id, dt.id)
        self._assert_first_last_send_progress()

    def testOverrideTags(self):
        t1 = Tag.objects.create(name="t1")
@@ -509,37 +355,42 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
        self.assertIn(t3, document.tags.all())
        self._assert_first_last_send_progress()

    def testNotAFile(self):
        try:
            self.consumer.try_consume_file("non-existing-file")
        except ConsumerError as e:
            self.assertTrue(str(e).endswith('It is not a file'))
            return

        self.fail("Should throw exception")
        self.assertRaisesMessage(
            ConsumerError,
            "File not found",
            self.consumer.try_consume_file,
            "non-existing-file"
        )

        self._assert_first_last_send_progress(last_status="FAILED")

    def testDuplicates1(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return
        self.assertRaisesMessage(
            ConsumerError,
            "It is a duplicate",
            self.consumer.try_consume_file,
            self.get_test_file()
        )

        self.fail("Should throw exception")
        self._assert_first_last_send_progress(last_status="FAILED")

    def testDuplicates2(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_archive_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return
        self.assertRaisesMessage(
            ConsumerError,
            "It is a duplicate",
            self.consumer.try_consume_file,
            self.get_test_archive_file()
        )

        self.fail("Should throw exception")
        self._assert_first_last_send_progress(last_status="FAILED")

    def testDuplicates3(self):
        self.consumer.try_consume_file(self.get_test_archive_file())
@@ -549,13 +400,15 @@ class TestConsumer(DirectoriesMixin, TestCase):
    def testNoParsers(self, m):
        m.return_value = []

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue("No parsers abvailable for" in str(e))
            return
        self.assertRaisesMessage(
            ConsumerError,
            "sample.pdf: Unsupported mime type application/pdf",
            self.consumer.try_consume_file,
            self.get_test_file()
        )

        self._assert_first_last_send_progress(last_status="FAILED")

        self.fail("Should throw exception")

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testFaultyParser(self, m):
@@ -565,24 +418,28 @@ class TestConsumer(DirectoriesMixin, TestCase):
            "weight": 0
        })]

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Does not compute.")
            return
        self.assertRaisesMessage(
            ConsumerError,
            "sample.pdf: Error while consuming document sample.pdf: Does not compute.",
            self.consumer.try_consume_file,
            self.get_test_file()
        )

        self.fail("Should throw exception.")
        self._assert_first_last_send_progress(last_status="FAILED")

    @mock.patch("documents.consumer.Consumer._write")
    def testPostSaveError(self, m):
        filename = self.get_test_file()
        m.side_effect = OSError("NO.")
        try:
            self.consumer.try_consume_file(filename)
        except ConsumerError as e:
            self.assertEqual(str(e), "NO.")
        else:
            self.fail("Should raise exception")

        self.assertRaisesMessage(
            ConsumerError,
            "sample.pdf: The following error occured while consuming sample.pdf: NO.",
            self.consumer.try_consume_file,
            filename
        )

        self._assert_first_last_send_progress(last_status="FAILED")

        # file not deleted
        self.assertTrue(os.path.isfile(filename))
@@ -594,14 +451,16 @@ class TestConsumer(DirectoriesMixin, TestCase):
    def testFilenameHandling(self):
        filename = self.get_test_file()

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
        document = self.consumer.try_consume_file(filename, override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
        self.assertEqual(document.filename, "none/new docs.pdf")
        self.assertEqual(document.archive_filename, "none/new docs.pdf")

        self._assert_first_last_send_progress()

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.generate_filename")
    @mock.patch("documents.signals.handlers.generate_unique_filename")
    def testFilenameHandlingUnstableFormat(self, m):

        filenames = ["this", "that", "now this", "i cant decide"]
@@ -611,20 +470,22 @@ class TestConsumer(DirectoriesMixin, TestCase):
            filenames.insert(0, f)
            return f

        m.side_effect = lambda f: get_filename()
        m.side_effect = lambda f, archive_filename = False: get_filename()

        filename = self.get_test_file()

        Tag.objects.create(name="test", is_inbox_tag=True)

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
        document = self.consumer.try_consume_file(filename, override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertIsNotNone(os.path.isfile(document.title))
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertTrue(os.path.isfile(document.archive_path))

    @mock.patch("documents.consumer.DocumentClassifier")
        self._assert_first_last_send_progress()

    @mock.patch("documents.consumer.load_classifier")
    def testClassifyDocument(self, m):
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
@@ -642,3 +503,161 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())

        self._assert_first_last_send_progress()

    @override_settings(CONSUMER_DELETE_DUPLICATES=True)
    def test_delete_duplicate(self):
        dst = self.get_test_file()
        self.assertTrue(os.path.isfile(dst))
        doc = self.consumer.try_consume_file(dst)

        self._assert_first_last_send_progress()

        self.assertFalse(os.path.isfile(dst))
        self.assertIsNotNone(doc)

        self._send_progress.reset_mock()

        dst = self.get_test_file()
        self.assertTrue(os.path.isfile(dst))
        self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
        self.assertFalse(os.path.isfile(dst))
        self._assert_first_last_send_progress(last_status="FAILED")

    @override_settings(CONSUMER_DELETE_DUPLICATES=False)
    def test_no_delete_duplicate(self):
        dst = self.get_test_file()
        self.assertTrue(os.path.isfile(dst))
        doc = self.consumer.try_consume_file(dst)

        self.assertFalse(os.path.isfile(dst))
        self.assertIsNotNone(doc)

        dst = self.get_test_file()
        self.assertTrue(os.path.isfile(dst))
        self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
        self.assertTrue(os.path.isfile(dst))

        self._assert_first_last_send_progress(last_status="FAILED")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_similar_filenames(self, m):
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
        m.return_value = [(None, {
            "parser": CopyParser,
            "mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
            "weight": 0
        })]
        doc1 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
        doc2 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
        doc3 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))

        self.assertEqual(doc1.filename, "simple.png")
        self.assertEqual(doc1.archive_filename, "simple.pdf")
        self.assertEqual(doc2.filename, "simple.pdf")
        self.assertEqual(doc2.archive_filename, "simple_01.pdf")
        self.assertEqual(doc3.filename, "simple.png.pdf")
        self.assertEqual(doc3.archive_filename, "simple.png.pdf")

        sanity_check()


class PreConsumeTestCase(TestCase):

    @mock.patch("documents.consumer.Popen")
    @override_settings(PRE_CONSUME_SCRIPT=None)
    def test_no_pre_consume_script(self, m):
        c = Consumer()
        c.path = "path-to-file"
        c.run_pre_consume_script()
        m.assert_not_called()

    @mock.patch("documents.consumer.Popen")
    @mock.patch("documents.consumer.Consumer._send_progress")
    @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
    def test_pre_consume_script_not_found(self, m, m2):
        c = Consumer()
        c.filename = "somefile.pdf"
        c.path = "path-to-file"
        self.assertRaises(ConsumerError, c.run_pre_consume_script)

    @mock.patch("documents.consumer.Popen")
    def test_pre_consume_script(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                c = Consumer()
                c.path = "path-to-file"
                c.run_pre_consume_script()

                m.assert_called_once()

                args, kwargs = m.call_args

                command = args[0]

                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], "path-to-file")


class PostConsumeTestCase(TestCase):

    @mock.patch("documents.consumer.Popen")
    @override_settings(POST_CONSUME_SCRIPT=None)
    def test_no_post_consume_script(self, m):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")
        tag1 = Tag.objects.create(name="a")
        tag2 = Tag.objects.create(name="b")
        doc.tags.add(tag1)
        doc.tags.add(tag2)

        Consumer().run_post_consume_script(doc)

        m.assert_not_called()

    @override_settings(POST_CONSUME_SCRIPT="does-not-exist")
    @mock.patch("documents.consumer.Consumer._send_progress")
    def test_post_consume_script_not_found(self, m):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")
        c = Consumer()
        c.filename = "somefile.pdf"
        self.assertRaises(ConsumerError, c.run_post_consume_script, doc)

    @mock.patch("documents.consumer.Popen")
    def test_post_consume_script_simple(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(POST_CONSUME_SCRIPT=script.name):
                doc = Document.objects.create(title="Test", mime_type="application/pdf")

                Consumer().run_post_consume_script(doc)

                m.assert_called_once()

    @mock.patch("documents.consumer.Popen")
    def test_post_consume_script_with_correspondent(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(POST_CONSUME_SCRIPT=script.name):
                c = Correspondent.objects.create(name="my_bank")
                doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
                tag1 = Tag.objects.create(name="a")
                tag2 = Tag.objects.create(name="b")
                doc.tags.add(tag1)
                doc.tags.add(tag2)

                Consumer().run_post_consume_script(doc)

                m.assert_called_once()

                args, kwargs = m.call_args

                command = args[0]

                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], str(doc.pk))
                self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
                self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
                self.assertEqual(command[7], "my_bank")
                self.assertCountEqual(command[8].split(","), ["a", "b"])
@@ -1,7 +1,6 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4

from dateutil import tz
@@ -9,7 +8,6 @@ from django.conf import settings
from django.test import TestCase, override_settings

from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser


class TestDate(TestCase):
@@ -138,3 +136,18 @@ class TestDate(TestCase):
    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))

    @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
    def test_ignored_dates(self, *args):
        text = (
            "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
            "ipsum"
        )
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )
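        # The two IGNORE_DATES entries above correspond to 110319 and 20200117
        # in the sample text, so the parser is expected to skip both and
        # settle on the remaining candidate, 13.02.2018, localized to the
        # configured TIME_ZONE.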
@@ -1,10 +1,10 @@
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from unittest import mock

from django.test import TestCase, override_settings
from django.utils import timezone

from ..models import Document, Correspondent

@@ -47,20 +47,20 @@ class TestDocument(TestCase):

    def test_file_name(self):

        doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.file_name, "20201225-test.pdf")
        doc = Document(mime_type="application/pdf", title="test", created=timezone.datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf")

    def test_file_name_jpg(self):

        doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.file_name, "20201225-test.jpg")
        doc = Document(mime_type="image/jpeg", title="test", created=timezone.datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg")

    def test_file_name_unknown(self):

        doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.file_name, "20201225-test.zip")
        doc = Document(mime_type="application/zip", title="test", created=timezone.datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip")

    def test_file_name_invalid(self):
    def test_file_name_invalid_type(self):

        doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.file_name, "20201225-test")
        doc = Document(mime_type="image/jpegasd", title="test", created=timezone.datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
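
# Taken together, these tests document the API change in this hunk: the old
# file_name property ("20201225-test.pdf") gives way to get_public_filename()
# ("2020-12-25 test.pdf"), with the extension still derived from the mime type
# and omitted entirely for unknown types.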
@@ -1,15 +1,20 @@
import datetime
import hashlib
import os
import shutil
import random
import uuid
from pathlib import Path
from unittest import mock

from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings
from django.utils import timezone

from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
    generate_unique_filename
from ..models import Document, Correspondent, Tag, DocumentType


class TestFileHandling(DirectoriesMixin, TestCase):
@@ -40,13 +45,13 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        document.filename = generate_filename(document)

        # Ensure that the filename is properly generated
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
        self.assertEqual(document.filename, "none/none.pdf")

        # Enable encryption and check again
        document.storage_type = Document.STORAGE_TYPE_GPG
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf.gpg".format(document.pk))
                         "none/none.pdf.gpg")

        document.save()

@@ -62,7 +67,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test.pdf.gpg"), True)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_missing_permissions(self):
@@ -74,12 +79,12 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Ensure that the filename is properly generated
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
                         "none/none.pdf")
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()

        # Test source_path
        self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk))
        self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none.pdf")

        # Make the folder read- and execute-only (no writing and no renaming)
        os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)
@@ -89,8 +94,8 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
        self.assertEqual(document.filename, "none/none.pdf")

        os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)

@@ -108,7 +113,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Ensure that the filename is properly generated
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
                         "none/none.pdf")
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()

@@ -125,8 +130,8 @@ class TestFileHandling(DirectoriesMixin, TestCase):

        # Check proper handling of files
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
        self.assertEqual(document.filename, "none/none.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete(self):
@@ -138,7 +143,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Ensure that the filename is properly generated
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
                         "none/none.pdf")

        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()
@@ -146,7 +151,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Ensure file deletion after delete
        pk = document.pk
        document.delete()
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@@ -168,7 +173,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        # Ensure that the filename is properly generated
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
                         "none/none.pdf")

        create_source_path_directory(document.source_path)

@@ -185,6 +190,24 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
        self.assertTrue(os.path.isfile(important_file))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{document_type} - {title}")
    def test_document_type(self):
        dt = DocumentType.objects.create(name="my_doc_type")
        d = Document.objects.create(title="the_doc", mime_type="application/pdf")

        self.assertEqual(generate_filename(d), "none - the_doc.pdf")

        d.document_type = dt

        self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{asn} - {title}")
    def test_asn(self):
        d1 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=652, checksum="A")
        d2 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=None, checksum="B")
        self.assertEqual(generate_filename(d1), "652 - the_doc.pdf")
        self.assertEqual(generate_filename(d2), "none - the_doc.pdf")
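
    # As test_document_type and test_asn show, any placeholder without a
    # value ({document_type}, {asn}) renders as the literal "none" rather
    # than failing filename generation.
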
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_underscore(self):
|
||||
document = Document()
|
||||
@@ -199,7 +222,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_dash(self):
|
||||
@@ -215,7 +238,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_malformed(self):
|
||||
@@ -231,7 +254,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none-{:07d}.pdf".format(document.pk))
|
||||
"none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_all(self):
|
||||
@@ -246,7 +269,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
"demo.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
|
||||
def test_tags_out_of_bounds(self):
|
||||
@@ -261,7 +284,58 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none-{:07d}.pdf".format(document.pk))
|
||||
"none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags}")
|
||||
def test_tags_without_args(self):
|
||||
document = Document()
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
self.assertEqual(generate_filename(document), f"{document.pk:07}.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title} {tag_list}")
|
||||
def test_tag_list(self):
|
||||
doc = Document.objects.create(title="doc1", mime_type="application/pdf")
|
||||
doc.tags.create(name="tag2")
|
||||
doc.tags.create(name="tag1")
|
||||
|
||||
self.assertEqual(generate_filename(doc), "doc1 tag1,tag2.pdf")
|
||||
|
||||
doc = Document.objects.create(title="doc2", checksum="B", mime_type="application/pdf")
|
||||
|
||||
self.assertEqual(generate_filename(doc), "doc2.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="//etc/something/{title}")
|
||||
def test_filename_relative(self):
|
||||
doc = Document.objects.create(title="doc1", mime_type="application/pdf")
|
||||
doc.filename = generate_filename(doc)
|
||||
doc.save()
|
||||
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "etc", "something", "doc1.pdf"))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}-{created_month}-{created_day}")
|
||||
def test_created_year_month_day(self):
|
||||
d1 = timezone.make_aware(datetime.datetime(2020, 3, 6, 1, 1, 1))
|
||||
doc1 = Document.objects.create(title="doc1", mime_type="application/pdf", created=d1)
|
||||
|
||||
self.assertEqual(generate_filename(doc1), "2020-03-06.pdf")
|
||||
|
||||
doc1.created = timezone.make_aware(datetime.datetime(2020, 11, 16, 1, 1, 1))
|
||||
|
||||
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{added_year}-{added_month}-{added_day}")
|
||||
def test_added_year_month_day(self):
|
||||
d1 = timezone.make_aware(datetime.datetime(232, 1, 9, 1, 1, 1))
|
||||
doc1 = Document.objects.create(title="doc1", mime_type="application/pdf", added=d1)
|
||||
|
||||
self.assertEqual(generate_filename(doc1), "232-01-09.pdf")
|
||||
|
||||
doc1.added = timezone.make_aware(datetime.datetime(2020, 11, 16, 1, 1, 1))
|
||||
|
||||
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
|
||||
def test_nested_directory_cleanup(self):
|
||||
@@ -272,7 +346,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk))
|
||||
self.assertEqual(document.filename, "none/none/none.pdf")
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
|
||||
@@ -282,7 +356,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
pk = document.pk
|
||||
document.delete()
|
||||
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none.pdf"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)
|
||||
@@ -330,6 +404,60 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
def test_duplicates(self):
|
||||
document = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="A", pk=1)
|
||||
document2 = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="B", pk=2)
|
||||
Path(document.source_path).touch()
|
||||
Path(document2.source_path).touch()
|
||||
document.filename = "0000001.pdf"
|
||||
document.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document.filename, "qwe.pdf")
|
||||
|
||||
document2.filename = "0000002.pdf"
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe_01.pdf")
|
||||
|
||||
# saving should not change the file names.
|
||||
|
||||
document.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document.filename, "qwe.pdf")
|
||||
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe_01.pdf")
|
||||
|
||||
document.delete()
|
||||
|
||||
self.assertFalse(os.path.isfile(document.source_path))
|
||||
|
||||
# filename free, should remove _01 suffix
|
||||
|
||||
document2.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(document2.filename, "qwe.pdf")
|
||||
|
||||
|
||||
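
    # test_duplicates documents the collision handling that replaces the old
    # "-{pk:07d}" suffix scheme: a second document with the same generated
    # name gets a "_01" counter, and a later save after the first file is
    # gone reclaims the unsuffixed name.
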
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.signals.handlers.Document.objects.filter")
|
||||
def test_no_update_without_change(self, m):
|
||||
doc = Document.objects.create(title="document", filename="document.pdf", archive_filename="document.pdf", checksum="A", archive_checksum="B", mime_type="application/pdf")
|
||||
Path(doc.source_path).touch()
|
||||
Path(doc.archive_path).touch()
|
||||
|
||||
doc.save()
|
||||
|
||||
m.assert_not_called()
|
||||
|
||||
|
||||
|
||||
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
|
||||
@@ -339,7 +467,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
@@ -352,22 +480,21 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
|
||||
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc.pdf"))
|
||||
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf"))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_move_archive_gone(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
#Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
@@ -378,14 +505,49 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
def test_move_archive_exists(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
existing_archive_file = os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
|
||||
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
Path(existing_archive_file).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
self.assertTrue(os.path.isfile(existing_archive_file))
|
||||
self.assertEqual(doc.archive_filename, "none/my_doc_01.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
def test_move_original_only(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "document_01.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "document.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document_01.pdf", checksum="A",
|
||||
archive_checksum="B", archive_filename="document.pdf")
|
||||
|
||||
self.assertEqual(doc.filename, "document.pdf")
|
||||
self.assertEqual(doc.archive_filename, "document.pdf")
|
||||
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
def test_move_archive_only(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "document.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "document_01.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document.pdf", checksum="A",
|
||||
archive_checksum="B", archive_filename="document_01.pdf")
|
||||
|
||||
self.assertEqual(doc.filename, "document.pdf")
|
||||
self.assertEqual(doc.archive_filename, "document.pdf")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@@ -406,8 +568,9 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
|
||||
m.assert_called()
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
@@ -419,7 +582,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
#Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
@@ -443,19 +606,21 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
m.assert_called()
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="")
|
||||
def test_archive_deleted(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
@@ -469,6 +634,28 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
self.assertFalse(os.path.isfile(doc.source_path))
|
||||
self.assertFalse(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
|
||||
def test_archive_deleted2(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "document.png")
|
||||
original2 = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(original2).touch()
|
||||
Path(archive).touch()
|
||||
|
||||
doc1 = Document.objects.create(mime_type="image/png", title="document", filename="document.png", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
|
||||
doc2 = Document.objects.create(mime_type="application/pdf", title="0000001", filename="0000001.pdf", checksum="C")
|
||||
|
||||
self.assertTrue(os.path.isfile(doc1.source_path))
|
||||
self.assertTrue(os.path.isfile(doc1.archive_path))
|
||||
self.assertTrue(os.path.isfile(doc2.source_path))
|
||||
|
||||
doc2.delete()
|
||||
|
||||
self.assertTrue(os.path.isfile(doc1.source_path))
|
||||
self.assertTrue(os.path.isfile(doc1.archive_path))
|
||||
self.assertFalse(os.path.isfile(doc2.source_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_database_error(self):
|
||||
|
||||
@@ -476,7 +663,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
|
||||
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
|
||||
m.side_effect = DatabaseError()
|
||||
doc.save()
|
||||
@@ -485,3 +672,45 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
|
||||
class TestFilenameGeneration(TestCase):
|
||||
|
||||
@override_settings(
|
||||
PAPERLESS_FILENAME_FORMAT="{title}"
|
||||
)
|
||||
def test_invalid_characters(self):
|
||||
|
||||
doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1")
|
||||
self.assertEqual(generate_filename(doc), "This. is the title.pdf")
|
||||
|
||||
doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2")
|
||||
self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay.pdf")
|
||||
|
||||
@override_settings(
|
||||
PAPERLESS_FILENAME_FORMAT="{created}"
|
||||
)
|
||||
def test_date(self):
|
||||
doc = Document.objects.create(title="does not matter", created=timezone.make_aware(datetime.datetime(2020,5,21, 7,36,51, 153)), mime_type="application/pdf", pk=2, checksum="2")
|
||||
self.assertEqual(generate_filename(doc), "2020-05-21.pdf")
|
||||
|
||||
|
||||
def run():
|
||||
doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow")
|
||||
doc.filename = generate_unique_filename(doc)
|
||||
Path(doc.thumbnail_path).touch()
|
||||
with open(doc.source_path, "w") as f:
|
||||
f.write(str(uuid.uuid4()))
|
||||
with open(doc.source_path, "rb") as f:
|
||||
doc.checksum = hashlib.md5(f.read()).hexdigest()
|
||||
|
||||
with open(doc.archive_path, "w") as f:
|
||||
f.write(str(uuid.uuid4()))
|
||||
with open(doc.archive_path, "rb") as f:
|
||||
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
|
||||
|
||||
doc.save()
|
||||
|
||||
for i in range(30):
|
||||
doc.title = str(random.randrange(1, 5))
|
||||
doc.save()
|
||||
|
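
# run() is not a test: it is a small manual stress helper. It gives one
# document real content and checksums, then flips its title among a handful
# of values thirty times, so the rename-on-save logic can be watched for
# filename collisions.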
@@ -1,6 +1,9 @@
from django.test import TestCase

from documents import index
from documents.index import JsonFormatter
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


class JsonFormatterTest(TestCase):
@@ -12,3 +15,21 @@ class JsonFormatterTest(TestCase):
        self.assertListEqual(self.formatter.format([]), [])


class TestAutoComplete(DirectoriesMixin, TestCase):

    def test_auto_complete(self):

        doc1 = Document.objects.create(title="doc1", checksum="A", content="test test2 test3")
        doc2 = Document.objects.create(title="doc2", checksum="B", content="test test2")
        doc3 = Document.objects.create(title="doc3", checksum="C", content="test2")

        index.add_or_update_document(doc1)
        index.add_or_update_document(doc2)
        index.add_or_update_document(doc3)

        ix = index.open_index()

        self.assertListEqual(index.autocomplete(ix, "tes"), [b"test3", b"test", b"test2"])
        self.assertListEqual(index.autocomplete(ix, "tes", limit=3), [b"test3", b"test", b"test2"])
        self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
        self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
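        # The three contents above give the terms different document
        # frequencies (test3 in one document, test in two, test2 in all
        # three); the fixed expected ordering of the suggestions appears to
        # follow from those frequencies, and limit caps or empties the list.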
@@ -1,66 +0,0 @@
import logging
import uuid
from unittest import mock

from django.test import TestCase, override_settings

from ..models import Log


class TestPaperlessLog(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    @override_settings(DISABLE_DBHANDLER=False)
    def test_that_it_saves_at_all(self):

        kw = {"group": uuid.uuid4()}

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 1)

            self.logger.info("This is an informational message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 2)

            self.logger.warning("This is a warning message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 3)

            self.logger.error("This is an error message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 4)

            self.logger.critical("This is a critical message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 5)

    @override_settings(DISABLE_DBHANDLER=False)
    def test_groups(self):

        kw1 = {"group": uuid.uuid4()}
        kw2 = {"group": uuid.uuid4()}

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            self.logger.info("This is an informational message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 1)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)

            self.logger.warning("This is a warning message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 2)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)

            self.logger.error("This is an error message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 3)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)

            self.logger.critical("This is a critical message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 4)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)
src/documents/tests/test_management.py (new file, 193 lines)
@@ -0,0 +1,193 @@
import hashlib
import tempfile
import filecmp
import os
import shutil
from pathlib import Path
from unittest import mock

from django.test import TestCase, override_settings

from django.core.management import call_command

from documents.file_handling import generate_filename
from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")


@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestArchiver(DirectoriesMixin, TestCase):

    def make_models(self):
        return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")

    def test_archiver(self):

        doc = self.make_models()
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))

        call_command('document_archiver')

    def test_handle_document(self):

        doc = self.make_models()
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))

        handle_document(doc.pk)

        doc = Document.objects.get(id=doc.id)

        self.assertIsNotNone(doc.checksum)
        self.assertIsNotNone(doc.archive_checksum)
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
        self.assertEqual(doc.archive_filename, "none/A.pdf")

    def test_unknown_mime_type(self):
        doc = self.make_models()
        doc.mime_type = "sdgfh"
        doc.save()
        shutil.copy(sample_file, doc.source_path)

        handle_document(doc.pk)

        doc = Document.objects.get(id=doc.id)

        self.assertIsNotNone(doc.checksum)
        self.assertIsNone(doc.archive_checksum)
        self.assertIsNone(doc.archive_filename)
        self.assertTrue(os.path.isfile(doc.source_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
    def test_naming_priorities(self):
        doc1 = Document.objects.create(checksum="A", title="document", content="first document", mime_type="application/pdf", filename="document.pdf")
        doc2 = Document.objects.create(checksum="B", title="document", content="second document", mime_type="application/pdf", filename="document_01.pdf")
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "document.pdf"))
        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "document_01.pdf"))

        handle_document(doc2.pk)
        handle_document(doc1.pk)

        doc1 = Document.objects.get(id=doc1.id)
        doc2 = Document.objects.get(id=doc2.id)

        self.assertEqual(doc1.archive_filename, "document.pdf")
        self.assertEqual(doc2.archive_filename, "document_01.pdf")
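
    # test_naming_priorities runs the archiver against doc2 first on purpose:
    # even out of order, each document's archive file keeps the stem of its
    # existing filename ("document_01" stays "document_01"), so originals and
    # archives stay paired.

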
class TestDecryptDocuments(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test",
        PAPERLESS_FILENAME_FORMAT=None
    )
    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):

        media_dir = tempfile.mkdtemp()
        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ).enable()

        doc = Document.objects.create(checksum="82186aaa94f0b98697d704b90fd1c072", title="wow", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000004.pdf.gpg"), os.path.join(originals_dir, "0000004.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000004.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))

        call_command('decrypt_documents')

        doc.refresh_from_db()

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000004.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000004.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:
            checksum = hashlib.md5(f.read()).hexdigest()
            self.assertEqual(checksum, doc.checksum)


class TestMakeIndex(TestCase):

    @mock.patch("documents.management.commands.document_index.index_reindex")
    def test_reindex(self, m):
        call_command("document_index", "reindex")
        m.assert_called_once()

    @mock.patch("documents.management.commands.document_index.index_optimize")
    def test_optimize(self, m):
        call_command("document_index", "optimize")
        m.assert_called_once()


class TestRenamer(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_rename(self):
        doc = Document.objects.create(title="test", mime_type="image/jpeg")
        doc.filename = generate_filename(doc)
        doc.archive_filename = generate_filename(doc, archive_filename=True)
        doc.save()

        Path(doc.source_path).touch()
        Path(doc.archive_path).touch()

        with override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}"):
            call_command("document_renamer")

        doc2 = Document.objects.get(id=doc.id)

        self.assertEqual(doc2.filename, "none/test.jpg")
        self.assertEqual(doc2.archive_filename, "none/test.pdf")
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))
        self.assertTrue(os.path.isfile(doc2.source_path))
        self.assertTrue(os.path.isfile(doc2.archive_path))


class TestCreateClassifier(TestCase):

    @mock.patch("documents.management.commands.document_create_classifier.train_classifier")
    def test_create_classifier(self, m):
        call_command("document_create_classifier")

        m.assert_called_once()


class TestSanityChecker(DirectoriesMixin, TestCase):

    def test_no_issues(self):
        with self.assertLogs() as capture:
            call_command("document_sanity_checker")

        self.assertEqual(len(capture.output), 1)
        self.assertIn("Sanity checker detected no issues.", capture.output[0])

    def test_errors(self):
        doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc")
        Path(doc.source_path).touch()
        Path(doc.thumbnail_path).touch()

        with self.assertLogs() as capture:
            call_command("document_sanity_checker")

        self.assertEqual(len(capture.output), 1)
        self.assertIn("Checksum mismatch of document", capture.output[0])
@@ -1,42 +0,0 @@
import filecmp
import os
import shutil

from django.core.management import call_command
from django.test import TestCase

from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")


class TestArchiver(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
        # self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        # self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

    def test_archiver(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        call_command('document_archiver')

    def test_handle_document(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        handle_document(self.d1)

        doc = Document.objects.get(id=self.d1.id)

        self.assertIsNotNone(doc.checksum)
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
@@ -203,7 +203,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')


@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerPolling(TestConsumer):
    # just do all the tests with polling
    pass
@@ -215,8 +215,7 @@ class TestConsumerRecursive(TestConsumer):
    pass


@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerRecursivePolling(TestConsumer):
    # just do all the tests with polling and recursive
    pass
@@ -230,7 +229,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        tag_names = ("existingTag", "Space Tag")
        # Create a Tag prior to consuming a file that uses it in its path
        tag_ids = [Tag.objects.create(name=tag_names[0]).pk,]
        tag_ids = [Tag.objects.create(name="existingtag").pk,]

        self.t_start()

@@ -257,6 +256,6 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
        # their order.
        self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)

    @override_settings(CONSUMER_POLLING=1)
    @override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
    def test_consume_file_with_path_tags_polling(self):
        self.test_consume_file_with_path_tags()
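
# The polling variants now pass CONSUMER_POLLING_DELAY=1 and
# CONSUMER_POLLING_RETRY_COUNT=20 alongside CONSUMER_POLLING=1, which looks
# intended to keep the polled test runs fast while still giving slow file
# operations enough retries.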
@@ -1,57 +0,0 @@
import hashlib
import json
import os
import shutil
import tempfile
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestDecryptDocuments(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test",
        PAPERLESS_FILENAME_FORMAT=None
    )
    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):

        media_dir = tempfile.mkdtemp()
        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ).enable()

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg"))

        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        call_command('decrypt_documents')

        doc = Document.objects.get(id=2)

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000002.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:
            checksum = hashlib.md5(f.read()).hexdigest()
            self.assertEqual(checksum, doc.checksum)
@@ -3,59 +3,224 @@ import json
import os
import shutil
import tempfile
from pathlib import Path
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.tests.utils import DirectoriesMixin
from documents.sanity_checker import check_sanity
from documents.settings import EXPORTER_FILE_NAME
from documents.tests.utils import DirectoriesMixin, paperless_environment


class TestExporter(DirectoriesMixin, TestCase):
class TestExportImport(DirectoriesMixin, TestCase):

    def setUp(self) -> None:
        self.target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.target)

        self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
        self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
        self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")
        self.d4 = Document.objects.create(content="Content", checksum="82186aaa94f0b98697d704b90fd1c072", title="wow_dec", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        self.t1 = Tag.objects.create(name="t")
        self.dt1 = DocumentType.objects.create(name="dt")
        self.c1 = Correspondent.objects.create(name="c")

        self.d1.tags.add(self.t1)
        self.d1.correspondent = self.c1
        self.d1.document_type = self.dt1
        self.d1.save()
        super(TestExportImport, self).setUp()

    def _get_document_from_manifest(self, manifest, id):
        f = list(filter(lambda d: d['model'] == "documents.document" and d['pk'] == id, manifest))
        if len(f) == 1:
            return f[0]
        else:
            raise ValueError(f"document with id {id} does not exist in manifest")

    @override_settings(
        PASSPHRASE="test"
    )
    def test_exporter(self):
    def _do_export(self, use_filename_format=False, compare_checksums=False, delete=False):
        args = ['document_exporter', self.target]
        if use_filename_format:
            args += ["--use-filename-format"]
        if compare_checksums:
            args += ["--compare-checksums"]
        if delete:
            args += ["--delete"]

        call_command(*args)

        with open(os.path.join(self.target, "manifest.json")) as f:
            manifest = json.load(f)

        return manifest
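
    # _do_export maps its keyword arguments onto the exporter's command line
    # flags one to one. A sketch of the equivalent manual invocation (the
    # target path here is only an example):
    #
    #   python3 manage.py document_exporter /path/to/export \
    #       --use-filename-format --compare-checksums --delete
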
def test_exporter(self, use_filename_format=False):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
file = os.path.join(self.dirs.originals_dir, "0000001.pdf")
|
||||
manifest = self._do_export(use_filename_format=use_filename_format)
|
||||
|
||||
Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
|
||||
Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
|
||||
Tag.objects.create(name="t")
|
||||
DocumentType.objects.create(name="dt")
|
||||
Correspondent.objects.create(name="c")
|
||||
self.assertEqual(len(manifest), 7)
|
||||
self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 4)
|
||||
|
||||
target = tempfile.mkdtemp()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
call_command('document_exporter', target)
|
||||
|
||||
with open(os.path.join(target, "manifest.json")) as f:
|
||||
manifest = json.load(f)
|
||||
|
||||
self.assertEqual(len(manifest), 5)
|
||||
self.assertEqual(self._get_document_from_manifest(manifest, self.d1.id)['fields']['title'], "wow1")
|
||||
self.assertEqual(self._get_document_from_manifest(manifest, self.d2.id)['fields']['title'], "wow2")
|
||||
self.assertEqual(self._get_document_from_manifest(manifest, self.d3.id)['fields']['title'], "wow2")
|
||||
self.assertEqual(self._get_document_from_manifest(manifest, self.d4.id)['fields']['title'], "wow_dec")
|
||||
|
||||
for element in manifest:
|
||||
if element['model'] == 'documents.document':
|
||||
fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
|
||||
fname = os.path.join(self.target, element[document_exporter.EXPORTER_FILE_NAME])
|
||||
self.assertTrue(os.path.exists(fname))
|
||||
self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))
|
||||
|
||||
with open(fname, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element['fields']['checksum'])
|
||||
|
||||
self.assertEqual(element['fields']['storage_type'], Document.STORAGE_TYPE_UNENCRYPTED)
|
||||
|
||||
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
|
||||
fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
|
||||
fname = os.path.join(self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
|
||||
self.assertTrue(os.path.exists(fname))
|
||||
|
||||
with open(fname, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element['fields']['archive_checksum'])
|
||||
|
||||
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
with paperless_environment() as dirs:
self.assertEqual(Document.objects.count(), 4)
Document.objects.all().delete()
Correspondent.objects.all().delete()
DocumentType.objects.all().delete()
Tag.objects.all().delete()
self.assertEqual(Document.objects.count(), 0)

call_command('document_importer', self.target)
self.assertEqual(Document.objects.count(), 4)
self.assertEqual(Tag.objects.count(), 1)
self.assertEqual(Correspondent.objects.count(), 1)
self.assertEqual(DocumentType.objects.count(), 1)
self.assertEqual(Document.objects.get(id=self.d1.id).title, "wow1")
self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0, str([str(m) for m in messages]))

def test_exporter_with_filename_format(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

with override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"):
self.test_exporter(use_filename_format=True)

def test_update_export_changed_time(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

self._do_export()
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

st_mtime_1 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime

with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
self._do_export()
m.assert_not_called()

self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
st_mtime_2 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime

Path(self.d1.source_path).touch()

with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
self._do_export()
self.assertEqual(m.call_count, 1)

st_mtime_3 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

self.assertNotEqual(st_mtime_1, st_mtime_2)
self.assertNotEqual(st_mtime_2, st_mtime_3)

def test_update_export_changed_checksum(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

self._do_export()

self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
self._do_export()
m.assert_not_called()

self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

self.d2.checksum = "asdfasdgf3"
self.d2.save()

with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
self._do_export(compare_checksums=True)
self.assertEqual(m.call_count, 1)

self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

def test_update_export_deleted_document(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

manifest = self._do_export()

self.assertEqual(len(manifest), 7)
doc_from_manifest = self._get_document_from_manifest(manifest, self.d3.id)
self.assertTrue(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))
self.d3.delete()

manifest = self._do_export()
self.assertRaises(ValueError, self._get_document_from_manifest, manifest, self.d3.id)
self.assertTrue(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))

manifest = self._do_export(delete=True)
self.assertFalse(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))

self.assertEqual(len(manifest), 6)

@override_settings(PAPERLESS_FILENAME_FORMAT="{title}/{correspondent}")
|
||||
def test_update_export_changed_location(self):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
m = self._do_export(use_filename_format=True)
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
self.d1.title = "new_title"
|
||||
self.d1.save()
|
||||
self._do_export(use_filename_format=True, delete=True)
|
||||
self.assertFalse(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
|
||||
self.assertFalse(os.path.isdir(os.path.join(self.target, "wow1")))
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "new_title", "c.pdf")))
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow2", "none.pdf")))
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow2", "none_01.pdf")))
|
||||
|
||||
def test_export_missing_files(self):
|
||||
|
||||
target = tempfile.mkdtemp()
|
||||
self.addCleanup(shutil.rmtree, target)
|
||||
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", mime_type="application/pdf")
|
||||
self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
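
Taken together, these tests pin down the export/import round trip. As a usage sketch (the target path here is illustrative):

    from django.core.management import call_command

    # Export everything into a directory: writes manifest.json plus the
    # original and archived files next to it.
    call_command('document_exporter', '/path/to/export')

    # Later, e.g. on a fresh install, restore from that same directory.
    call_command('document_importer', '/path/to/export')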

@@ -14,6 +14,12 @@ class TestRetagger(DirectoriesMixin, TestCase):

self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")

self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)


self.correspondent_first = Correspondent.objects.create(
name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
@@ -38,7 +44,7 @@ class TestRetagger(DirectoriesMixin, TestCase):

self.assertEqual(d_first.tags.count(), 1)
self.assertEqual(d_second.tags.count(), 1)
self.assertEqual(d_unrelated.tags.count(), 0)
self.assertEqual(d_unrelated.tags.count(), 2)

self.assertEqual(d_first.tags.first(), self.tag_first)
self.assertEqual(d_second.tags.first(), self.tag_second)
@@ -56,3 +62,17 @@ class TestRetagger(DirectoriesMixin, TestCase):

self.assertEqual(d_first.correspondent, self.correspondent_first)
self.assertEqual(d_second.correspondent, self.correspondent_second)

def test_overwrite_preserve_inbox(self):
self.d1.tags.add(self.tag_second)

call_command('document_retagger', '--tags', '--overwrite')

d_first, d_second, d_unrelated = self.get_updated_docs()

self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))

self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])

src/documents/tests/test_management_thumbnails.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import os
import shutil
from unittest import mock

from django.core.management import call_command
from django.test import TestCase

from documents.management.commands.document_thumbnails import _process_document
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestMakeThumbnails(DirectoriesMixin, TestCase):

def make_models(self):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf", filename="test.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.d1.source_path)

self.d2 = Document.objects.create(checksum="Ass", title="A", content="first document", mime_type="application/pdf", filename="test2.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.d2.source_path)

def setUp(self) -> None:
super(TestMakeThumbnails, self).setUp()
self.make_models()

def test_process_document(self):
self.assertFalse(os.path.isfile(self.d1.thumbnail_path))
_process_document(self.d1.id)
self.assertTrue(os.path.isfile(self.d1.thumbnail_path))

@mock.patch("documents.management.commands.document_thumbnails.shutil.move")
def test_process_document_invalid_mime_type(self, m):
self.d1.mime_type = "asdasdasd"
self.d1.save()

_process_document(self.d1.id)

m.assert_not_called()

def test_command(self):
self.assertFalse(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))
call_command('document_thumbnails')
self.assertTrue(os.path.isfile(self.d1.thumbnail_path))
self.assertTrue(os.path.isfile(self.d2.thumbnail_path))

def test_command_documentid(self):
self.assertFalse(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))
call_command('document_thumbnails', '-d', f"{self.d1.id}")
self.assertTrue(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))
@@ -21,13 +21,15 @@ class TestMatching(TestCase):
matching_algorithm=getattr(klass, algorithm)
)
for string in true:
doc = Document(content=string)
self.assertTrue(
matching.matches(instance, string),
matching.matches(instance, doc),
'"%s" should match "%s" but it does not' % (text, string)
)
for string in false:
doc = Document(content=string)
self.assertFalse(
matching.matches(instance, string),
matching.matches(instance, doc),
'"%s" should not match "%s" but it does' % (text, string)
)

@@ -169,7 +171,7 @@ class TestMatching(TestCase):
def test_match_regex(self):

self._test_matching(
r"alpha\w+gamma",
"alpha\w+gamma",
"MATCH_REGEX",
(
"I have alpha_and_gamma in me",
@@ -187,6 +189,16 @@ class TestMatching(TestCase):
)
)

def test_match_invalid_regex(self):
self._test_matching(
"[[",
"MATCH_REGEX",
[],
[
"Don't match this"
]
)

def test_match_fuzzy(self):

self._test_matching(
src/documents/tests/test_migration_archive_files.py (new file, 325 lines)
@@ -0,0 +1,325 @@
import hashlib
import os
import shutil
from pathlib import Path
from unittest import mock

from django.conf import settings
from django.test import override_settings

from documents.parsers import ParseError
from documents.tests.utils import DirectoriesMixin, TestMigrations


STORAGE_TYPE_GPG = "gpg"


def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)

return os.path.join(
settings.ARCHIVE_DIR,
fname
)


def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None


def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"  # pragma: no cover

return os.path.join(
settings.ORIGINALS_DIR,
fname
)


def thumbnail_path(doc):
file_name = "{:07}.png".format(doc.pk)
if doc.storage_type == STORAGE_TYPE_GPG:
file_name += ".gpg"

return os.path.join(
settings.THUMBNAIL_DIR,
file_name
)


def make_test_document(document_class, title: str, mime_type: str, original: str, original_filename: str, archive: str = None, archive_filename: str = None):
doc = document_class()
doc.filename = original_filename
doc.title = title
doc.mime_type = mime_type
doc.content = "the content, does not matter for this test"
doc.save()

shutil.copy2(original, source_path(doc))
with open(original, "rb") as f:
doc.checksum = hashlib.md5(f.read()).hexdigest()

if archive:
if archive_filename:
doc.archive_filename = archive_filename
shutil.copy2(archive, archive_path_new(doc))
else:
shutil.copy2(archive, archive_path_old(doc))

with open(archive, "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()

doc.save()

Path(thumbnail_path(doc)).touch()

return doc


simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf")
simple_pdf3 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000003.pdf")
simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt")
simple_png = os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png")
simple_png2 = os.path.join(os.path.dirname(__file__), "examples", "no-text.png")


@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):

migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'

def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")

self.unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf3, "unrelated.pdf", simple_pdf)
self.no_text = make_test_document(Document, "no-text", "image/png", simple_png2, "no-text.png", simple_pdf)
self.doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
self.clash1 = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
self.clash2 = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf)
self.clash3 = make_test_document(Document, "clash", "image/png", simple_png, "clash.png", simple_pdf)
self.clash4 = make_test_document(Document, "clash.png", "application/pdf", simple_pdf2, "clash.png.pdf", simple_pdf2)

self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash2))
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash3))
self.assertNotEqual(archive_path_old(self.clash1), archive_path_old(self.clash4))

def testArchiveFilesMigrated(self):
Document = self.apps.get_model('documents', 'Document')

for doc in Document.objects.all():
if doc.archive_checksum:
self.assertIsNotNone(doc.archive_filename)
self.assertTrue(os.path.isfile(archive_path_new(doc)))
else:
self.assertIsNone(doc.archive_filename)

with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)

if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_new(doc)))
with open(archive_path_new(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)

self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 6)

def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, f"{self.clash1.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, f"{self.clash2.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, f"{self.clash3.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")


@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):

def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, "none/clash.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, "none/clash_01.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, "none/clash_02.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")


def fake_parse_wrapper(parser, path, mime_type, file_name):
parser.archive_path = None
parser.text = "the text"


@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):

migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
auto_migrate = False

def test_archive_missing(self):

Document = self.apps.get_model("documents", "Document")

doc = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
os.unlink(archive_path_old(doc))

self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)

def test_parser_missing(self):
Document = self.apps.get_model("documents", "Document")

doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)

self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)

@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_error(self, m):
m.side_effect = ParseError()
Document = self.apps.get_model("documents", "Document")

doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)

self.assertIsNotNone(doc1.archive_checksum)
self.assertIsNotNone(doc2.archive_checksum)

with self.assertLogs() as capture:
self.performMigration()

self.assertEqual(m.call_count, 6)

self.assertEqual(
len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
4)

self.assertEqual(
len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
2)

Document = self.apps.get_model("documents", "Document")

doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)

self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)

@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_no_archive(self, m):
m.side_effect = fake_parse_wrapper

Document = self.apps.get_model("documents", "Document")

doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)

with self.assertLogs() as capture:
self.performMigration()

self.assertEqual(
len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
2)

Document = self.apps.get_model("documents", "Document")

doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)

self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)


@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):

migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'

def setUpBeforeMigration(self, apps):

Document = apps.get_model("documents", "Document")

doc_unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf2, "unrelated.txt", simple_pdf2, "unrelated.pdf")
doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_02.pdf")

def testArchiveFilesReverted(self):
Document = self.apps.get_model('documents', 'Document')

for doc in Document.objects.all():
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)

if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(archive_path_old(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)

self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 2)


@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesBackwardsWithFilenameFormat(TestMigrateArchiveFilesBackwards):
pass


@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwardsErrors(DirectoriesMixin, TestMigrations):

migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
auto_migrate = False

def test_filename_clash(self):

Document = self.apps.get_model("documents", "Document")

self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash_02.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")

self.assertRaisesMessage(ValueError, "would clash with another archive filename", self.performMigration)

def test_filename_exists(self):

Document = self.apps.get_model("documents", "Document")

self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")

self.assertRaisesMessage(ValueError, "file already exists.", self.performMigration)
src/documents/tests/test_migration_mime_type.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import os
import shutil

from django.conf import settings
from django.test import override_settings

from documents.parsers import get_default_file_extension
from documents.tests.utils import DirectoriesMixin, TestMigrations

STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"


def source_path_before(self):
if self.filename:
fname = str(self.filename)
else:
fname = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"

return os.path.join(
settings.ORIGINALS_DIR,
fname
)


def file_type_after(self):
return get_default_file_extension(self.mime_type)


def source_path_after(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, file_type_after(doc))
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"  # pragma: no cover

return os.path.join(
settings.ORIGINALS_DIR,
fname
)


@override_settings(PASSPHRASE="test")
class TestMigrateMimeType(DirectoriesMixin, TestMigrations):

migrate_from = '1002_auto_20201111_1105'
migrate_to = '1003_mime_types'

def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc = Document.objects.create(title="test", file_type="pdf", filename="file1.pdf")
self.doc_id = doc.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_before(doc))

doc2 = Document.objects.create(checksum="B", file_type="pdf", storage_type=STORAGE_TYPE_GPG)
self.doc2_id = doc2.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000004.pdf.gpg"), source_path_before(doc2))

def testMimeTypesMigrated(self):
Document = self.apps.get_model('documents', 'Document')

doc = Document.objects.get(id=self.doc_id)
self.assertEqual(doc.mime_type, "application/pdf")

doc2 = Document.objects.get(id=self.doc2_id)
self.assertEqual(doc2.mime_type, "application/pdf")


@override_settings(PASSPHRASE="test")
class TestMigrateMimeTypeBackwards(DirectoriesMixin, TestMigrations):

migrate_from = '1003_mime_types'
migrate_to = '1002_auto_20201111_1105'

def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc = Document.objects.create(title="test", mime_type="application/pdf", filename="file1.pdf")
self.doc_id = doc.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_after(doc))

def testMimeTypesReverted(self):
Document = self.apps.get_model('documents', 'Document')

doc = Document.objects.get(id=self.doc_id)
self.assertEqual(doc.file_type, "pdf")
@@ -68,7 +68,7 @@ class TestParserDiscovery(TestCase):
)


def fake_get_thumbnail(self, path, mimetype):
def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")


@@ -89,15 +89,15 @@ class TestBaseParser(TestCase):
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)

parser.get_optimised_thumbnail("any", "not important")
parser.get_optimised_thumbnail("any", "not important", "document.pdf")

@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)

path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
self.assertEqual(path, fake_get_thumbnail(None, None, None, None))


class TestParserAvailability(TestCase):
@@ -114,9 +114,10 @@ class TestParserAvailability(TestCase):
self.assertEqual(get_default_file_extension('application/zip'), ".zip")
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")

self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('application/pdf')(logging_group=None), RasterisedDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('text/plain')(logging_group=None), TextDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)

self.assertTrue(is_file_ext_supported('.pdf'))
self.assertFalse(is_file_ext_supported('.hsdfh'))
self.assertFalse(is_file_ext_supported(''))
@@ -1,57 +0,0 @@
from unittest import mock

from django.test import TestCase, override_settings

from documents.models import Document, Tag, Correspondent
from documents.signals.handlers import run_post_consume_script


class PostConsumeTestCase(TestCase):

@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)

run_post_consume_script(None, doc)

m.assert_not_called()

@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")

run_post_consume_script(None, doc)

m.assert_called_once()

@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)

run_post_consume_script(None, doc)

m.assert_called_once()

args, kwargs = m.call_args

command = args[0]

self.assertEqual(command[0], "script")
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
# TODO: tags are unordered by default.
self.assertEqual(command[8], "a,b")
@@ -1,23 +1,82 @@
import logging
import os
import shutil
from pathlib import Path

import filelock
from django.conf import settings
from django.test import TestCase

from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError
from documents.sanity_checker import check_sanity, SanityCheckMessages
from documents.tests.utils import DirectoriesMixin


class TestSanityCheckMessages(TestCase):

def test_no_messages(self):
messages = SanityCheckMessages()
self.assertEqual(len(messages), 0)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Sanity checker detected no issues.")

def test_info(self):
messages = SanityCheckMessages()
messages.info("Something might be wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Something might be wrong")

def test_warning(self):
messages = SanityCheckMessages()
messages.warning("Something is wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.WARNING)
self.assertEqual(capture.records[0].message, "Something is wrong")

def test_error(self):
messages = SanityCheckMessages()
messages.error("Something is seriously wrong")
self.assertEqual(len(messages), 1)
self.assertTrue(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.ERROR)
self.assertEqual(capture.records[0].message, "Something is seriously wrong")
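
Taken together, the container API these tests pin down can be summarised in a short sketch (only calls exercised above are used):

    messages = SanityCheckMessages()
    messages.info("something worth noting")       # neither error nor warning
    messages.warning("something looks off")       # has_warning() -> True
    messages.error("something is broken")         # has_error() -> True

    if messages.has_error() or messages.has_warning():
        messages.log_messages()  # one log record per message, at its own level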

class TestSanityCheck(DirectoriesMixin, TestCase):

def make_test_data(self):

shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))
with filelock.FileLock(settings.MEDIA_LOCK):
# just make sure that the lockfile is present.
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))

return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")

def assertSanityError(self, messageRegex):
messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertRegex(messages[0]['message'], messageRegex)

def test_no_docs(self):
self.assertEqual(len(check_sanity()), 0)
@@ -29,59 +88,75 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
def test_no_thumbnail(self):
doc = self.make_test_data()
os.remove(doc.thumbnail_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Thumbnail of document .* does not exist")

def test_thumbnail_no_access(self):
doc = self.make_test_data()
os.chmod(doc.thumbnail_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read thumbnail file of document")
os.chmod(doc.thumbnail_path, 0o777)

def test_no_original(self):
doc = self.make_test_data()
os.remove(doc.source_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Original of document .* does not exist.")

def test_original_no_access(self):
doc = self.make_test_data()
os.chmod(doc.source_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read original file of document")
os.chmod(doc.source_path, 0o777)

def test_original_checksum_mismatch(self):
doc = self.make_test_data()
doc.checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of document")

def test_no_archive(self):
doc = self.make_test_data()
os.remove(doc.archive_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Archived version of document .* does not exist.")

def test_archive_no_access(self):
doc = self.make_test_data()
os.chmod(doc.archive_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read archive file of document")
os.chmod(doc.archive_path, 0o777)

def test_archive_checksum_mismatch(self):
doc = self.make_test_data()
doc.archive_checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of archived document")

def test_empty_content(self):
doc = self.make_test_data()
doc.content = ""
doc.save()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Document .* has no content.")

def test_orphaned_file(self):
doc = self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Orphaned file in media dir")

def test_all(self):
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
string = str(SanityFailedError(check_sanity()))
def test_archive_filename_no_checksum(self):
doc = self.make_test_data()
doc.archive_checksum = None
doc.save()
self.assertSanityError("has an archive file, but its checksum is missing.")

def test_archive_checksum_no_filename(self):
doc = self.make_test_data()
doc.archive_filename = None
doc.save()
self.assertSanityError("has an archive file checksum, but no archive filename.")

src/documents/tests/test_settings.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import logging
from unittest import mock

from django.test import TestCase

from paperless.settings import default_task_workers, default_threads_per_worker


class TestSettings(TestCase):

@mock.patch("paperless.settings.multiprocessing.cpu_count")
def test_single_core(self, cpu_count):
cpu_count.return_value = 1

default_workers = default_task_workers()

default_threads = default_threads_per_worker(default_workers)

self.assertEqual(default_workers, 1)
self.assertEqual(default_threads, 1)

def test_workers_threads(self):
for i in range(1, 64):
with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
cpu_count.return_value = i

default_workers = default_task_workers()

default_threads = default_threads_per_worker(default_workers)

self.assertTrue(default_workers >= 1)
self.assertTrue(default_threads >= 1)

self.assertTrue(default_workers * default_threads <= i, f"{i}")
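
These tests assert only the invariants: both defaults are at least one, and workers times threads never exceeds the CPU count. A hypothetical split satisfying those invariants (not the actual implementation in paperless.settings):

    def hypothetical_defaults(cpu_count):
        # Illustrative only: any split with workers >= 1, threads >= 1 and
        # workers * threads <= cpu_count would pass the assertions above.
        workers = max(1, round(cpu_count ** 0.5))
        threads = max(1, cpu_count // workers)
        return workers, threads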

@@ -1,10 +1,13 @@
from datetime import datetime
import os
from unittest import mock

from django.conf import settings
from django.test import TestCase
from django.utils import timezone

from documents import tasks
from documents.models import Document
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.sanity_checker import SanityCheckMessages, SanityCheckFailedException
from documents.tests.utils import DirectoriesMixin


@@ -20,5 +23,88 @@ class TestTasks(DirectoriesMixin, TestCase):

tasks.index_optimize()

def test_train_classifier(self):
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_no_auto_matching(self, load_classifier):
tasks.train_classifier()
load_classifier.assert_not_called()

@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_tag(self, load_classifier):
load_classifier.return_value = None
Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))

@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_type(self, load_classifier):
load_classifier.return_value = None
DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))

@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_correspondent(self, load_classifier):
load_classifier.return_value = None
Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))

def test_train_classifier(self):
c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertFalse(os.path.isfile(settings.MODEL_FILE))

tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime

tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)

doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)

@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()
self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once()

@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error(self, m):
messages = SanityCheckMessages()
messages.error("Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()

@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()
messages.warning("Some warning")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with warnings. See log.")
m.assert_called_once()

@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m):
messages = SanityCheckMessages()
messages.info("Some info")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with infos. See log.")
m.assert_called_once()

def test_bulk_update_documents(self):
doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
created=timezone.now(), modified=timezone.now())

tasks.bulk_update_documents([doc1.pk])
src/documents/tests/test_views.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from django.conf import settings
from django.contrib.auth.models import User
from django.test import TestCase


class TestViews(TestCase):

def setUp(self) -> None:
self.user = User.objects.create_user("testuser")

def test_login_redirect(self):
response = self.client.get('/')
self.assertEqual(response.status_code, 302)
self.assertEqual(response.url, "/accounts/login/?next=/")

def test_index(self):
self.client.force_login(self.user)
for (language_given, language_actual) in [("", "en-US"), ("en-US", "en-US"), ("de", "de"), ("en", "en-US"), ("en-us", "en-US"), ("fr", "fr"), ("jp", "en-US")]:
if language_given:
self.client.cookies.load({settings.LANGUAGE_COOKIE_NAME: language_given})
elif settings.LANGUAGE_COOKIE_NAME in self.client.cookies.keys():
self.client.cookies.pop(settings.LANGUAGE_COOKIE_NAME)

response = self.client.get('/')
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context_data['webmanifest'], f"frontend/{language_actual}/manifest.webmanifest")
self.assertEqual(response.context_data['styles_css'], f"frontend/{language_actual}/styles.css")
self.assertEqual(response.context_data['runtime_js'], f"frontend/{language_actual}/runtime.js")
self.assertEqual(response.context_data['polyfills_js'], f"frontend/{language_actual}/polyfills.js")
self.assertEqual(response.context_data['main_js'], f"frontend/{language_actual}/main.js")
@@ -2,8 +2,12 @@ import os
import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager

from django.test import override_settings
from django.apps import apps
from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import override_settings, TransactionTestCase


def setup_directories():
@@ -18,13 +22,16 @@ def setup_directories():
dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")
dirs.logging_dir = os.path.join(dirs.data_dir, "log")

os.makedirs(dirs.index_dir, exist_ok=True)
os.makedirs(dirs.originals_dir, exist_ok=True)
os.makedirs(dirs.thumbnail_dir, exist_ok=True)
os.makedirs(dirs.archive_dir, exist_ok=True)

override_settings(
os.makedirs(dirs.logging_dir, exist_ok=True)

dirs.settings_override = override_settings(
DATA_DIR=dirs.data_dir,
SCRATCH_DIR=dirs.scratch_dir,
MEDIA_ROOT=dirs.media_dir,
@@ -32,10 +39,13 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle"),
MEDIA_LOCK=os.path.join(dirs.media_dir, "media.lock")

).enable()
)
dirs.settings_override.enable()

return dirs

@@ -45,6 +55,18 @@ def remove_dirs(dirs):
shutil.rmtree(dirs.data_dir, ignore_errors=True)
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
dirs.settings_override.disable()


@contextmanager
def paperless_environment():
dirs = None
try:
dirs = setup_directories()
yield dirs
finally:
if dirs:
remove_dirs(dirs)
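
A minimal usage sketch of this context manager (the same pattern the importer test near the top of this changeset uses; the body is a placeholder):

    with paperless_environment() as dirs:
        # dirs.media_dir, dirs.data_dir etc. point at throwaway directories,
        # and the matching settings overrides are active inside this block;
        # everything is torn down afterwards, even if the body raises.
        do_something_with(dirs.media_dir)  # placeholder for test code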


class DirectoriesMixin:
@@ -60,3 +82,45 @@ class DirectoriesMixin:
def tearDown(self) -> None:
super(DirectoriesMixin, self).tearDown()
remove_dirs(self.dirs)


class TestMigrations(TransactionTestCase):

@property
def app(self):
return apps.get_containing_app_config(type(self).__module__).name

migrate_from = None
migrate_to = None
auto_migrate = True

def setUp(self):
super(TestMigrations, self).setUp()

assert self.migrate_from and self.migrate_to, \
"TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
self.migrate_from = [(self.app, self.migrate_from)]
self.migrate_to = [(self.app, self.migrate_to)]
executor = MigrationExecutor(connection)
old_apps = executor.loader.project_state(self.migrate_from).apps

# Reverse to the original migration
executor.migrate(self.migrate_from)

self.setUpBeforeMigration(old_apps)

self.apps = old_apps

if self.auto_migrate:
self.performMigration()

def performMigration(self):
# Run the migration to test
executor = MigrationExecutor(connection)
executor.loader.build_graph()  # reload.
executor.migrate(self.migrate_to)

self.apps = executor.loader.project_state(self.migrate_to).apps

def setUpBeforeMigration(self, apps):
pass
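
For reference, a skeleton of how this harness is meant to be subclassed; the migration names are taken from the mime-type tests above and the test bodies are elided:

    class TestExampleMigration(DirectoriesMixin, TestMigrations):

        migrate_from = '1002_auto_20201111_1105'
        migrate_to = '1003_mime_types'

        def setUpBeforeMigration(self, apps):
            # Runs while the database is still at migrate_from; use the
            # historical model classes from `apps`, not direct imports.
            Document = apps.get_model("documents", "Document")
            ...

        def test_state_after_migration(self):
            # With auto_migrate = True the migration has already run here,
            # and self.apps reflects the migrate_to state.
            Document = self.apps.get_model("documents", "Document")
            ...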

@@ -1,10 +1,21 @@
import logging
import os
import tempfile
import uuid
import zipfile
from datetime import datetime
from time import mktime

from django.db.models import Count, Max
from django.conf import settings
from django.db.models import Count, Max, Case, When, IntegerField
from django.db.models.functions import Lower
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
@@ -19,40 +30,74 @@ from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ReadOnlyModelViewSet
ViewSet
)

import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .bulk_download import OriginalAndArchiveStrategy, OriginalsOnlyStrategy, \
ArchiveOnlyStrategy
from .classifier import load_classifier
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
TagFilterSet,
DocumentTypeFilterSet,
LogFilterSet
DocumentTypeFilterSet
)
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .matching import match_correspondents, match_tags, match_document_types
from .models import Correspondent, Document, Tag, DocumentType, SavedView
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
LogSerializer,
TagSerializer,
DocumentTypeSerializer
DocumentTypeSerializer,
PostDocumentSerializer,
SavedViewSerializer,
BulkEditSerializer,
DocumentListSerializer,
BulkDownloadSerializer
)


logger = logging.getLogger("paperless.api")


class IndexView(TemplateView):
template_name = "index.html"

def get_language(self):
# This is here for the following reason:
# Django identifies languages in the form "en-us"
# However, angular generates locales as "en-US".
# this translates between these two forms.
lang = get_language()
if "-" in lang:
first = lang[:lang.index("-")]
second = lang[lang.index("-")+1:]
return f"{first}-{second.upper()}"
else:
return lang
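
The effect of this helper, spelled out (the pairs match the expectations in test_views.py above):

    # "en-us" -> "en-US"   (region part upper-cased for the Angular bundles)
    # "de"    -> "de"      (no "-", returned unchanged)
    # so frontend assets resolve to e.g. "frontend/en-US/main.js" or
    # "frontend/de/main.js".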

def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['cookie_prefix'] = settings.COOKIE_PREFIX
context['username'] = self.request.user.username
context['full_name'] = self.request.user.get_full_name()
context['styles_css'] = f"frontend/{self.get_language()}/styles.css"
context['runtime_js'] = f"frontend/{self.get_language()}/runtime.js"
context['polyfills_js'] = f"frontend/{self.get_language()}/polyfills.js"  # NOQA: E501
context['main_js'] = f"frontend/{self.get_language()}/main.js"
context['webmanifest'] = f"frontend/{self.get_language()}/manifest.webmanifest"  # NOQA: E501
return context


class CorrespondentViewSet(ModelViewSet):
model = Correspondent

queryset = Correspondent.objects.annotate(
document_count=Count('documents'),
last_correspondence=Max('documents__created')).order_by('name')
last_correspondence=Max('documents__created')).order_by(Lower('name'))

serializer_class = CorrespondentSerializer
pagination_class = StandardPagination
@@ -71,7 +116,7 @@ class TagViewSet(ModelViewSet):
model = Tag

queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by('name')
document_count=Count('documents')).order_by(Lower('name'))

serializer_class = TagSerializer
pagination_class = StandardPagination
@@ -85,7 +130,7 @@ class DocumentTypeViewSet(ModelViewSet):
model = DocumentType

queryset = DocumentType.objects.annotate(
document_count=Count('documents')).order_by('name')
document_count=Count('documents')).order_by(Lower('name'))

serializer_class = DocumentTypeSerializer
pagination_class = StandardPagination
@@ -118,13 +163,29 @@ class DocumentViewSet(RetrieveModelMixin,
"added",
"archive_serial_number")

def get_queryset(self):
return Document.objects.distinct()

def get_serializer(self, *args, **kwargs):
fields_param = self.request.query_params.get('fields', None)
if fields_param:
fields = fields_param.split(",")
else:
fields = None
serializer_class = self.get_serializer_class()
kwargs.setdefault('context', self.get_serializer_context())
kwargs.setdefault('fields', fields)
return serializer_class(*args, **kwargs)
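
A usage sketch of the new fields parameter (the /api/documents/ path follows the routes referenced elsewhere in this changeset; the field names are examples):

    # Request only a subset of serializer fields; the comma-separated list
    # is split by get_serializer() above and passed to the serializer as
    # its `fields` kwarg.
    response = self.client.get('/api/documents/?fields=id,title')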
|
||||
|
||||
def update(self, request, *args, **kwargs):
|
||||
response = super(DocumentViewSet, self).update(
|
||||
request, *args, **kwargs)
|
||||
from documents import index
|
||||
index.add_or_update_document(self.get_object())
|
||||
return response
|
||||
|
||||
def destroy(self, request, *args, **kwargs):
|
||||
from documents import index
|
||||
index.remove_document_from_index(self.get_object())
|
||||
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
|
||||
|
||||
@@ -137,13 +198,13 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
|
||||
def file_response(self, pk, request, disposition):
|
||||
doc = Document.objects.get(id=pk)
|
||||
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
|
||||
if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
|
||||
file_handle = doc.archive_file
|
||||
filename = doc.archive_file_name
|
||||
filename = doc.get_public_filename(archive=True)
|
||||
mime_type = 'application/pdf'
|
||||
else:
|
||||
file_handle = doc.source_file
|
||||
filename = doc.file_name
|
||||
filename = doc.get_public_filename()
|
||||
mime_type = doc.mime_type
|
||||
|
||||
if doc.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
@@ -154,30 +215,76 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
disposition, filename)
|
||||
return response
|
||||
|
||||
    @action(methods=['post'], detail=False)
    def post_document(self, request, pk=None):
        # TODO: is this a good implementation?
        form = UploadForm(data=request.POST, files=request.FILES)
        if form.is_valid():
            form.save()
            return Response("OK")
    def get_metadata(self, file, mime_type):
        if not os.path.isfile(file):
            return None

        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)

            try:
                return parser.extract_metadata(file, mime_type)
            except Exception as e:
                # TODO: cover GPG errors, remove later.
                return []
        else:
            return HttpResponseBadRequest(str(form.errors))
            return []

    def get_filesize(self, filename):
        if os.path.isfile(filename):
            return os.stat(filename).st_size
        else:
            return None
    @action(methods=['get'], detail=True)
    def metadata(self, request, pk=None):
        try:
            doc = Document.objects.get(pk=pk)
            return Response({
                "paperless__checksum": doc.checksum,
                "paperless__mime_type": doc.mime_type,
                "paperless__filename": doc.filename,
                "paperless__has_archive_version":
                    os.path.isfile(doc.archive_path)
            })
        except Document.DoesNotExist:
            raise Http404()

        meta = {
            "original_checksum": doc.checksum,
            "original_size": self.get_filesize(doc.source_path),
            "original_mime_type": doc.mime_type,
            "media_filename": doc.filename,
            "has_archive_version": doc.has_archive_version,
            "original_metadata": self.get_metadata(
                doc.source_path, doc.mime_type),
            "archive_checksum": doc.archive_checksum,
            "archive_media_filename": doc.archive_filename
        }

        if doc.has_archive_version:
            meta['archive_size'] = self.get_filesize(doc.archive_path)
            meta['archive_metadata'] = self.get_metadata(
                doc.archive_path, "application/pdf")
        else:
            meta['archive_size'] = None
            meta['archive_metadata'] = None

        return Response(meta)
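# --- Illustrative sketch (not part of this diff): shape of a response from
# the reworked metadata endpoint, keys taken from the `meta` dict above,
# values invented for illustration.
example_metadata_response = {
    "original_checksum": "b0d9185e427f...",
    "original_size": 153428,
    "original_mime_type": "application/pdf",
    "media_filename": "0000123.pdf",
    "has_archive_version": True,
    "original_metadata": [],
    "archive_checksum": "77aa0bbf93e0...",
    "archive_media_filename": "0000123.pdf",
    "archive_size": 98213,
    "archive_metadata": [],
}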
    @action(methods=['get'], detail=True)
    def suggestions(self, request, pk=None):
        try:
            doc = Document.objects.get(pk=pk)
        except Document.DoesNotExist:
            raise Http404()

        classifier = load_classifier()

        return Response({
            "correspondents": [
                c.id for c in match_correspondents(doc, classifier)
            ],
            "tags": [t.id for t in match_tags(doc, classifier)],
            "document_types": [
                dt.id for dt in match_document_types(doc, classifier)
            ]
        })
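# --- Illustrative sketch (not part of this diff): the suggestions endpoint
# runs the trained classifier and the matching rules over one document and
# returns plain ID lists, e.g.
# GET /api/documents/123/suggestions/
# -> {"correspondents": [2], "tags": [7, 31], "document_types": [1]}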
    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
@@ -191,7 +298,14 @@ class DocumentViewSet(RetrieveModelMixin,
    @cache_control(public=False, max_age=315360000)
    def thumb(self, request, pk=None):
        try:
            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
            doc = Document.objects.get(id=pk)
            if doc.storage_type == Document.STORAGE_TYPE_GPG:
                handle = GnuPG.decrypted(doc.thumbnail_file)
            else:
                handle = doc.thumbnail_file
            # TODO: Send ETag information and use that to send new thumbnails
            # if available
            return HttpResponse(handle,
                                content_type='image/png')
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()
@@ -205,45 +319,236 @@ class DocumentViewSet(RetrieveModelMixin,
            raise Http404()
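# --- Illustrative sketch (not part of this diff): one way the ETag TODO in
# thumb() could be resolved, keying the tag on the stored checksum. This
# assumes the checksum changes whenever the thumbnail would.
from django.http import HttpResponse, HttpResponseNotModified

def thumbnail_response(request, doc, handle):
    etag = f'"{doc.checksum}"'
    if request.headers.get("If-None-Match") == etag:
        return HttpResponseNotModified()  # client's cached copy is current
    response = HttpResponse(handle, content_type='image/png')
    response["ETag"] = etag
    return response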

class LogViewSet(ReadOnlyModelViewSet):
    model = Log
class LogViewSet(ViewSet):

    queryset = Log.objects.all()
    serializer_class = LogSerializer
    permission_classes = (IsAuthenticated,)

    log_files = ["paperless", "mail"]

    def retrieve(self, request, pk=None, *args, **kwargs):
        if pk not in self.log_files:
            raise Http404()

        filename = os.path.join(settings.LOGGING_DIR, f"{pk}.log")

        if not os.path.isfile(filename):
            raise Http404()

        with open(filename, "r") as f:
            lines = [line.rstrip() for line in f.readlines()]

        return Response(lines)

    def list(self, request, *args, **kwargs):
        return Response(self.log_files)
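# --- Illustrative sketch (not part of this diff): the rewritten LogViewSet
# serves the on-disk log files instead of a database-backed Log model.
# GET /api/logs/            -> ["paperless", "mail"]
# GET /api/logs/paperless/  -> the lines of LOGGING_DIR/paperless.log
# GET /api/logs/other/      -> 404; only the whitelisted names are served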

class SavedViewViewSet(ModelViewSet):
    model = SavedView

    queryset = SavedView.objects.all()
    serializer_class = SavedViewSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filterset_class = LogFilterSet
    ordering_fields = ("created",)

    def get_queryset(self):
        user = self.request.user
        return SavedView.objects.filter(user=user)

    def perform_create(self, serializer):
        serializer.save(user=self.request.user)


class BulkEditView(APIView):

    permission_classes = (IsAuthenticated,)
    serializer_class = BulkEditSerializer
    parser_classes = (parsers.JSONParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        method = serializer.validated_data.get("method")
        parameters = serializer.validated_data.get("parameters")
        documents = serializer.validated_data.get("documents")

        try:
            # TODO: parameter validation
            result = method(documents, **parameters)
            return Response({"result": result})
        except Exception as e:
            return HttpResponseBadRequest(str(e))
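# --- Illustrative sketch (not part of this diff): BulkEditSerializer resolves
# "method" to a callable, which post() then applies to the documents. A
# request could look like this; the method name and parameter keys are an
# assumption about what the serializer accepts.
# POST /api/documents/bulk_edit/
# {"documents": [11, 12], "method": "modify_tags",
#  "parameters": {"add_tags": [3], "remove_tags": [4]}}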

class PostDocumentView(APIView):

    permission_classes = (IsAuthenticated,)
    serializer_class = PostDocumentSerializer
    parser_classes = (parsers.MultiPartParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, *args, **kwargs):

        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        doc_name, doc_data = serializer.validated_data.get('document')
        correspondent_id = serializer.validated_data.get('correspondent')
        document_type_id = serializer.validated_data.get('document_type')
        tag_ids = serializer.validated_data.get('tags')
        title = serializer.validated_data.get('title')

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:
            f.write(doc_data)
            os.utime(f.name, times=(t, t))
            temp_filename = f.name

        task_id = str(uuid.uuid4())

        async_task("documents.tasks.consume_file",
                   temp_filename,
                   override_filename=doc_name,
                   override_title=title,
                   override_correspondent_id=correspondent_id,
                   override_document_type_id=document_type_id,
                   override_tag_ids=tag_ids,
                   task_id=task_id,
                   task_name=os.path.basename(doc_name)[:100])

        return Response("OK")
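# --- Illustrative sketch (not part of this diff): the upload is parked in
# SCRATCH_DIR and consumed later by the queued "documents.tasks.consume_file"
# task, so this POST returns before consumption finishes. Client side, with
# placeholder URL and token:
import requests
with open("invoice.pdf", "rb") as fh:
    requests.post("http://localhost:8000/api/documents/post_document/",
                  headers={"Authorization": "Token <token>"},
                  files={"document": fh},
                  data={"title": "Invoice 42"})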

class SelectionDataView(APIView):

    permission_classes = (IsAuthenticated,)
    serializer_class = DocumentListSerializer
    parser_classes = (parsers.MultiPartParser, parsers.JSONParser)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, format=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ids = serializer.validated_data.get('documents')

        correspondents = Correspondent.objects.annotate(
            document_count=Count(Case(
                When(documents__id__in=ids, then=1),
                output_field=IntegerField()
            )))

        tags = Tag.objects.annotate(document_count=Count(Case(
            When(documents__id__in=ids, then=1),
            output_field=IntegerField()
        )))

        types = DocumentType.objects.annotate(document_count=Count(Case(
            When(documents__id__in=ids, then=1),
            output_field=IntegerField()
        )))

        r = Response({
            "selected_correspondents": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in correspondents],
            "selected_tags": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in tags],
            "selected_document_types": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in types]
        })

        return r
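# --- Illustrative sketch (not part of this diff): Count(Case(When(...)))
# counts only documents in the selection. On Django 2.0+ the same aggregate
# can be expressed with a filtered Count, which reads more directly:
from django.db.models import Count, Q
ids = [11, 12]  # the posted selection
tags = Tag.objects.annotate(
    document_count=Count('documents', filter=Q(documents__id__in=ids)))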

class SearchView(APIView):

    permission_classes = (IsAuthenticated,)

    def __init__(self, *args, **kwargs):
        super(SearchView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def add_infos_to_hit(self, r):
        doc = Document.objects.get(id=r['id'])
        try:
            doc = Document.objects.get(id=r['id'])
        except Document.DoesNotExist:
            logger.warning(
                f"Search index returned a non-existing document: "
                f"id: {r['id']}, title: {r['title']}. "
                f"Search index needs reindex."
            )
            doc = None

        return {'id': r['id'],
                'highlights': r.highlights("content", text=doc.content),
                'highlights': r.highlights("content", text=doc.content) if doc else None,  # NOQA: E501
                'score': r.score,
                'rank': r.rank,
                'document': DocumentSerializer(doc).data,
                'document': DocumentSerializer(doc).data if doc else None,
                'title': r['title']
                }

    def get(self, request, format=None):
        if 'query' not in request.query_params:
        from documents import index

        if 'query' in request.query_params:
            query = request.query_params['query']
        else:
            query = None

        if 'more_like' in request.query_params:
            more_like_id = request.query_params['more_like']
            more_like_content = Document.objects.get(id=more_like_id).content
        else:
            more_like_id = None
            more_like_content = None

        if not query and not more_like_id:
            return Response({
                'count': 0,
                'page': 0,
                'page_count': 0,
                'corrected_query': None,
                'results': []})

        query = request.query_params['query']
        try:
            page = int(request.query_params.get('page', 1))
        except (ValueError, TypeError):
@@ -252,9 +557,10 @@ class SearchView(APIView):
        if page < 1:
            page = 1

        ix = index.open_index()

        try:
            with index.query_page(self.ix, query, page) as (result_page,
                                                            corrected_query):
            with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
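# --- Illustrative sketch (not part of this diff): the handler now accepts
# either a full-text query or a "more like this" document id, plus a page:
# GET /api/search/?query=invoice&page=2
# GET /api/search/?more_like=123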
@@ -269,10 +575,6 @@ class SearchAutoCompleteView(APIView):

    permission_classes = (IsAuthenticated,)

    def __init__(self, *args, **kwargs):
        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def get(self, request, format=None):
        if 'term' in request.query_params:
            term = request.query_params['term']
@@ -286,7 +588,11 @@ class SearchAutoCompleteView(APIView):
        else:
            limit = 10

        return Response(index.autocomplete(self.ix, term, limit))
        from documents import index

        ix = index.open_index()

        return Response(index.autocomplete(ix, term, limit))


class StatisticsView(APIView):
@@ -294,8 +600,66 @@ class StatisticsView(APIView):
    permission_classes = (IsAuthenticated,)

    def get(self, request, format=None):
        return Response({
            'documents_total': Document.objects.all().count(),
            'documents_inbox': Document.objects.filter(
        documents_total = Document.objects.all().count()
        if Tag.objects.filter(is_inbox_tag=True).exists():
            documents_inbox = Document.objects.filter(
                tags__is_inbox_tag=True).distinct().count()
        else:
            documents_inbox = None

        return Response({
            'documents_total': documents_total,
            'documents_inbox': documents_inbox,
        })
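# --- Illustrative sketch (not part of this diff): the .distinct() in the
# inbox count matters because filtering across the tags many-to-many yields
# one joined row per matching tag.
inbox = Document.objects.filter(tags__is_inbox_tag=True)
inbox.count()             # a document with two inbox tags is counted twice
inbox.distinct().count()  # each document is counted once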


class BulkDownloadView(APIView):

    permission_classes = (IsAuthenticated,)
    serializer_class = BulkDownloadSerializer
    parser_classes = (parsers.JSONParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, format=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ids = serializer.validated_data.get('documents')
        compression = serializer.validated_data.get('compression')
        content = serializer.validated_data.get('content')

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        temp = tempfile.NamedTemporaryFile(
            dir=settings.SCRATCH_DIR,
            suffix="-compressed-archive",
            delete=False)

        if content == 'both':
            strategy_class = OriginalAndArchiveStrategy
        elif content == 'originals':
            strategy_class = OriginalsOnlyStrategy
        else:
            strategy_class = ArchiveOnlyStrategy

        with zipfile.ZipFile(temp.name, "w", compression) as zipf:
            strategy = strategy_class(zipf)
            for id in ids:
                doc = Document.objects.get(id=id)
                strategy.add_document(doc)

        with open(temp.name, "rb") as f:
            response = HttpResponse(f, content_type="application/zip")
            response["Content-Disposition"] = '{}; filename="{}"'.format(
                "attachment", "documents.zip")

            return response
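# --- Illustrative sketch (not part of this diff): a request against the new
# bulk download endpoint. The URL and the accepted "content"/"compression"
# values are assumptions beyond the 'both'/'originals' branches shown above.
# POST /api/documents/bulk_download/
# {"documents": [11, 12, 13], "content": "both", "compression": "none"}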