Merge branch 'dev' into feature/any-all-filtering

Committed by Michael Shamoon on 2022-02-14 22:23:31 -08:00
323 changed files with 85068 additions and 12189 deletions

src/documents/admin.py (78 lines changed) Executable file → Normal file

@@ -1,10 +1,6 @@
from django.contrib import admin
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag, \
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule
@@ -23,12 +19,12 @@ class TagAdmin(admin.ModelAdmin):
list_display = (
"name",
"colour",
"color",
"match",
"matching_algorithm"
)
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
list_filter = ("color", "matching_algorithm")
list_editable = ("color", "match", "matching_algorithm")
class DocumentTypeAdmin(admin.ModelAdmin):
@@ -50,26 +46,31 @@ class DocumentAdmin(admin.ModelAdmin):
"modified",
"mime_type",
"storage_type",
"filename")
"filename",
"checksum",
"archive_filename",
"archive_checksum"
)
list_display_links = ("title",)
list_display = (
"correspondent",
"id",
"title",
"tags_",
"created",
"mime_type",
"filename",
"archive_filename"
)
list_filter = (
"document_type",
"tags",
"correspondent"
("mime_type"),
("archive_serial_number", admin.EmptyFieldListFilter),
("archive_filename", admin.EmptyFieldListFilter),
)
filter_horizontal = ("tags",)
ordering = ["-created"]
ordering = ["-id"]
date_hierarchy = "created"
@@ -81,56 +82,24 @@ class DocumentAdmin(admin.ModelAdmin):
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)
@mark_safe
def tags_(self, obj):
r = ""
for tag in obj.tags.all():
r += self._html_tag(
"span",
tag.name + ", "
)
return r
@staticmethod
def _html_tag(kind, inside=None, **kwargs):
attributes = format_html_join(' ', '{}="{}"', kwargs.items())
if inside is not None:
return format_html("<{kind} {attributes}>{inside}</{kind}>",
kind=kind, attributes=attributes, inside=inside)
return format_html("<{} {}/>", kind, attributes)
class LogAdmin(admin.ModelAdmin):
def has_add_permission(self, request):
return False
def has_change_permission(self, request, obj=None):
return False
list_display = ("created", "message", "level",)
list_filter = ("level", "created",)
ordering = ('-created',)
list_display_links = ("created", "message")
class RuleInline(admin.TabularInline):
model = SavedViewFilterRule
@@ -149,5 +118,4 @@ admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
admin.site.register(SavedView, SavedViewAdmin)
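The reworked DocumentAdmin above filters on empty archive fields via Django's built-in admin.EmptyFieldListFilter (available since Django 3.1). A minimal sketch of that pattern, with a hypothetical Book model standing in for Document:

from django.contrib import admin
from myapp.models import Book  # hypothetical model, for illustration only

@admin.register(Book)
class BookAdmin(admin.ModelAdmin):
    list_filter = (
        "author",                                 # plain field filter
        ("summary", admin.EmptyFieldListFilter),  # adds "Empty"/"Not empty" choices
    )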


@@ -0,0 +1,60 @@
from zipfile import ZipFile
from documents.models import Document
class BulkArchiveStrategy:
def __init__(self, zipf: ZipFile):
self.zipf = zipf
def make_unique_filename(self,
doc: Document,
archive: bool = False,
folder: str = ""):
counter = 0
while True:
filename = folder + doc.get_public_filename(archive, counter)
if filename in self.zipf.namelist():
counter += 1
else:
return filename
def add_document(self, doc: Document):
raise NotImplementedError() # pragma: no cover
class OriginalsOnlyStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class ArchiveOnlyStrategy(BulkArchiveStrategy):
def __init__(self, zipf):
super(ArchiveOnlyStrategy, self).__init__(zipf)
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(doc.archive_path,
self.make_unique_filename(doc, archive=True))
else:
self.zipf.write(doc.source_path,
self.make_unique_filename(doc))
class OriginalAndArchiveStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(
doc.archive_path, self.make_unique_filename(
doc, archive=True, folder="archive/"
)
)
self.zipf.write(
doc.source_path,
self.make_unique_filename(doc, folder="originals/")
)
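The strategy classes above decide how each document ends up in a bulk-download zip; make_unique_filename appends _01, _02, ... on name collisions. A hedged usage sketch follows; the module path of the new file is not shown in this hunk, so the import is an assumption:

from zipfile import ZipFile
from documents.models import Document
from documents.bulk_download import OriginalAndArchiveStrategy  # module path assumed

docs = Document.objects.filter(id__in=[1, 2, 3])  # example ids
with ZipFile("/tmp/download.zip", "w") as zipf:
    strategy = OriginalAndArchiveStrategy(zipf)
    for doc in docs:
        # originals land in "originals/", archived PDFs in "archive/"
        strategy.add_document(doc)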


@@ -2,9 +2,7 @@ import itertools
from django.db.models import Q
from django_q.tasks import async_task
from whoosh.writing import AsyncWriter
from documents import index
from documents.models import Document, Correspondent, DocumentType
@@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete()
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)

src/documents/classifier.py (117 lines changed) Executable file → Normal file

@@ -3,12 +3,9 @@ import logging
import os
import pickle
import re
import shutil
from django.conf import settings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.utils.multiclass import type_of_target
from documents.models import Document, MatchingModel
@@ -17,7 +14,11 @@ class IncompatibleClassifierVersionError(Exception):
pass
logger = logging.getLogger(__name__)
class ClassifierModelCorruptError(Exception):
pass
logger = logging.getLogger("paperless.classifier")
def preprocess_content(content):
@@ -26,15 +27,46 @@ def preprocess_content(content):
return content
def load_classifier():
if not os.path.isfile(settings.MODEL_FILE):
logger.debug(
f"Document classification model does not exist (yet), not "
f"performing automatic matching."
)
return None
classifier = DocumentClassifier()
try:
classifier.load()
except (ClassifierModelCorruptError,
IncompatibleClassifierVersionError):
# there's something wrong with the model file.
logger.exception(
f"Unrecoverable error while loading document "
f"classification model, deleting model file."
)
os.unlink(settings.MODEL_FILE)
classifier = None
except OSError:
logger.exception(
f"IO error while loading document classification model"
)
classifier = None
except Exception:
logger.exception(
f"Unknown error while loading document classification model"
)
classifier = None
return classifier
class DocumentClassifier(object):
FORMAT_VERSION = 6
def __init__(self):
# mtime of the model file on disk. used to prevent reloading when
# nothing has changed.
self.classifier_version = 0
# hash of the training data. used to prevent re-training when the
# training data has not changed.
self.data_hash = None
@@ -45,20 +77,15 @@ class DocumentClassifier(object):
self.correspondent_classifier = None
self.document_type_classifier = None
def reload(self):
if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
def load(self):
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.")
else:
if self.classifier_version > 0:
# Don't be confused by this check. It's simply here
# so that we wont log anything on initial reload.
logger.info("Classifier updated on disk, "
"reloading classifier models")
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.")
else:
try:
self.data_hash = pickle.load(f)
self.data_vectorizer = pickle.load(f)
self.tags_binarizer = pickle.load(f)
@@ -66,10 +93,14 @@ class DocumentClassifier(object):
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
except Exception:
raise ClassifierModelCorruptError()
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:
def save(self):
target_file = settings.MODEL_FILE
target_file_temp = settings.MODEL_FILE + ".part"
with open(target_file_temp, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.data_hash, f)
pickle.dump(self.data_vectorizer, f)
@@ -80,14 +111,19 @@ class DocumentClassifier(object):
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
if os.path.isfile(target_file):
os.unlink(target_file)
shutil.move(target_file_temp, target_file)
def train(self):
data = list()
labels_tags = list()
labels_correspondent = list()
labels_document_type = list()
# Step 1: Extract and preprocess training data from the database.
logging.getLogger(__name__).debug("Gathering data from database...")
logger.debug("Gathering data from database...")
m = hashlib.sha1()
for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): # NOQA: E501
preprocessed_content = preprocess_content(doc.content)
@@ -108,10 +144,11 @@ class DocumentClassifier(object):
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
tags = [tag.pk for tag in doc.tags.filter(
tags = sorted([tag.pk for tag in doc.tags.filter(
matching_algorithm=MatchingModel.MATCH_AUTO
)]
m.update(bytearray(tags))
)])
for tag in tags:
m.update(tag.to_bytes(4, 'little', signed=True))
labels_tags.append(tags)
if not data:
@@ -134,7 +171,7 @@ class DocumentClassifier(object):
num_correspondents = len(set(labels_correspondent) | {-1}) - 1
num_document_types = len(set(labels_document_type) | {-1}) - 1
logging.getLogger(__name__).debug(
logger.debug(
"{} documents, {} tag(s), {} correspondent(s), "
"{} document type(s).".format(
len(data),
@@ -144,8 +181,12 @@ class DocumentClassifier(object):
)
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
# Step 2: vectorize data
logging.getLogger(__name__).debug("Vectorizing data...")
logger.debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
@@ -155,7 +196,7 @@ class DocumentClassifier(object):
# Step 3: train the classifiers
if num_tags > 0:
logging.getLogger(__name__).debug("Training tags classifier...")
logger.debug("Training tags classifier...")
if num_tags == 1:
# Special case where only one tag has auto:
@@ -174,12 +215,12 @@ class DocumentClassifier(object):
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no tags. Not training tags classifier."
)
if num_correspondents > 0:
logging.getLogger(__name__).debug(
logger.debug(
"Training correspondent classifier..."
)
self.correspondent_classifier = MLPClassifier(tol=0.01)
@@ -189,13 +230,13 @@ class DocumentClassifier(object):
)
else:
self.correspondent_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no correspondents. Not training correspondent "
"classifier."
)
if num_document_types > 0:
logging.getLogger(__name__).debug(
logger.debug(
"Training document type classifier..."
)
self.document_type_classifier = MLPClassifier(tol=0.01)
@@ -205,7 +246,7 @@ class DocumentClassifier(object):
)
else:
self.document_type_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no document types. Not training document type "
"classifier."
)
@@ -237,6 +278,8 @@ class DocumentClassifier(object):
return None
def predict_tags(self, content):
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
y = self.tags_classifier.predict(X)
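load_classifier() now wraps model loading: it returns None when no model file exists, deletes a corrupt or version-incompatible model file, and logs instead of raising. A hedged usage sketch, assuming doc is an existing Document:

from documents.classifier import load_classifier

classifier = load_classifier()   # None if the model is missing or unusable
if classifier is not None:
    tag_ids = classifier.predict_tags(doc.content)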

src/documents/consumer.py (172 lines changed) Executable file → Normal file

@@ -1,9 +1,12 @@
import datetime
import hashlib
import os
import uuid
from subprocess import Popen
import magic
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models import Q
@@ -11,7 +14,7 @@ from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .classifier import load_classifier
from .file_handling import create_source_path_directory, \
generate_unique_filename
from .loggers import LoggingMixin
@@ -27,8 +30,45 @@ class ConsumerError(Exception):
pass
MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"
class Consumer(LoggingMixin):
logging_name = "paperless.consumer"
def _send_progress(self, current_progress, max_progress, status,
message=None, document_id=None):
payload = {
'filename': os.path.basename(self.filename) if self.filename else None, # NOQA: E501
'task_id': self.task_id,
'current_progress': current_progress,
'max_progress': max_progress,
'status': status,
'message': message,
'document_id': document_id
}
async_to_sync(self.channel_layer.group_send)("status_updates",
{'type': 'status_update',
'data': payload})
def _fail(self, message, log_message=None, exc_info=None):
self._send_progress(100, 100, 'FAILED', message)
self.log("error", log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}")
def __init__(self):
super().__init__()
self.path = None
@@ -37,15 +77,16 @@ class Consumer(LoggingMixin):
self.override_correspondent_id = None
self.override_tag_ids = None
self.override_document_type_id = None
self.task_id = None
self.channel_layer = get_channel_layer()
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
self.log(
"error",
"Cannot consume {}: It is not a file.".format(self.path)
self._fail(
MESSAGE_FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found."
)
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
@@ -53,12 +94,9 @@ class Consumer(LoggingMixin):
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self.log(
"error",
"Not consuming {}: It is a duplicate.".format(self.filename)
)
raise ConsumerError(
"Not consuming {}: It is a duplicate.".format(self.filename)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate."
)
def pre_check_directories(self):
@@ -72,15 +110,21 @@ class Consumer(LoggingMixin):
return
if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
raise ConsumerError(
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.")
self.log("info",
f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True
)
def run_post_consume_script(self, document):
@@ -88,9 +132,16 @@ class Consumer(LoggingMixin):
return
if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
raise ConsumerError(
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist.")
f"{settings.POST_CONSUME_SCRIPT} does not exist."
)
self.log(
"info",
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}"
)
try:
Popen((
@@ -106,8 +157,10 @@ class Consumer(LoggingMixin):
"name", flat=True)))
)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True
)
def try_consume_file(self,
@@ -116,7 +169,8 @@ class Consumer(LoggingMixin):
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
override_tag_ids=None,
task_id=None):
"""
Return the document object if it was successfully created.
"""
@@ -127,6 +181,9 @@ class Consumer(LoggingMixin):
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
self.override_tag_ids = override_tag_ids
self.task_id = task_id or str(uuid.uuid4())
self._send_progress(0, 100, 'STARTING', MESSAGE_NEW_FILE)
# this is for grouping logging entries for this particular file
# together.
@@ -149,11 +206,10 @@ class Consumer(LoggingMixin):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError(
f"Unsupported mime type {mime_type} of file {self.filename}")
else:
self.log("debug",
f"Parser: {parser_class.__name__}")
self._fail(
MESSAGE_UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}"
)
# Notify all listeners that we're going to do some work.
@@ -165,35 +221,53 @@ class Consumer(LoggingMixin):
self.run_pre_consume_script()
def progress_callback(current_progress, max_progress):
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, "WORKING")
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group)
document_parser = parser_class(self.logging_group, progress_callback)
self.log("debug", f"Parser: {type(document_parser).__name__}")
# However, this already created working directories which we have to
# clean up.
# Parse the document. This may take some time.
text = None
date = None
thumbnail = None
archive_path = None
try:
self._send_progress(20, 100, 'WORKING', MESSAGE_PARSING_DOCUMENT)
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, 'WORKING',
MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_optimised_thumbnail(
self.path, mime_type)
self.path, mime_type, self.filename)
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
self._send_progress(90, 100, 'WORKING',
MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
except ParseError as e:
document_parser.cleanup()
self.log(
"error",
f"Error while consuming document {self.filename}: {e}")
raise ConsumerError(e)
self._fail(
str(e),
f"Error while consuming document {self.filename}: {e}",
exc_info=True
)
# Prepare the document classifier.
@@ -201,15 +275,9 @@ class Consumer(LoggingMixin):
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
try:
classifier = DocumentClassifier()
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
self.log(
"warning",
f"Cannot classify documents: {e}.")
classifier = None
classifier = load_classifier()
self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
@@ -235,8 +303,7 @@ class Consumer(LoggingMixin):
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
document.filename = generate_unique_filename(
document, settings.ORIGINALS_DIR)
document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(document.storage_type,
@@ -246,6 +313,10 @@ class Consumer(LoggingMixin):
thumbnail, document.thumbnail_path)
if archive_path and os.path.isfile(archive_path):
document.archive_filename = generate_unique_filename(
document,
archive_filename=True
)
create_source_path_directory(document.archive_path)
self._write(document.storage_type,
archive_path, document.archive_path)
@@ -262,13 +333,22 @@ class Consumer(LoggingMixin):
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
os.path.dirname(self.path),
"._" + os.path.basename(self.path))
if os.path.isfile(shadow_file):
self.log("debug", "Deleting file {}".format(shadow_file))
os.unlink(shadow_file)
except Exception as e:
self.log(
"error",
self._fail(
str(e),
f"The following error occurred while consuming "
f"{self.filename}: {e}"
f"{self.filename}: {e}",
exc_info=True
)
raise ConsumerError(e)
finally:
document_parser.cleanup()
@@ -279,6 +359,8 @@ class Consumer(LoggingMixin):
"Document {} consumption finished".format(document)
)
self._send_progress(100, 100, 'SUCCESS', MESSAGE_FINISHED, document.id)
return document
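The consumer now reports progress over the channel layer and routes failures through _fail(), which pushes a FAILED status before raising ConsumerError. A hedged sketch of driving it; the leading path argument to try_consume_file is assumed (it is cut off in the hunk above), and the path, title and task id are examples:

from documents.consumer import Consumer, ConsumerError

consumer = Consumer()
try:
    document = consumer.try_consume_file(
        "/usr/src/paperless/consume/invoice.pdf",
        override_title="Invoice",
        task_id="some-task-uuid",   # groups the websocket status updates
    )
except ConsumerError:
    # _fail() has already sent a FAILED status update and logged the error
    raise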
def _store(self, text, date, mime_type):


@@ -8,6 +8,9 @@ from django.conf import settings
from django.template.defaultfilters import slugify
logger = logging.getLogger("paperless.filehandling")
class defaultdictNoStr(defaultdict):
def __str__(self):
@@ -76,12 +79,40 @@ def many_to_dictionary(field):
return mydictionary
def generate_unique_filename(doc, root):
def generate_unique_filename(doc,
archive_filename=False):
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
The returned filename is guaranteed to be either the current filename
of the document if unchanged, or a new filename that does not correspond
to any existing files. The function will append _01, _02, etc to the
filename before the extension to avoid conflicts.
If archive_filename is True, return a unique archive filename instead.
"""
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
# the original filename first.
if archive_filename and doc.filename:
new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
if new_filename == old_filename or not os.path.exists(os.path.join(root, new_filename)): # NOQA: E501
return new_filename
counter = 0
while True:
new_filename = generate_filename(doc, counter)
if new_filename == doc.filename:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
@@ -91,7 +122,7 @@ def generate_unique_filename(doc, root):
return new_filename
def generate_filename(doc, counter=0, append_gpg=True):
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
@@ -120,6 +151,11 @@ def generate_filename(doc, counter=0, append_gpg=True):
else:
document_type = "none"
if doc.archive_serial_number:
asn = str(doc.archive_serial_number)
else:
asn = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
@@ -133,6 +169,7 @@ def generate_filename(doc, counter=0, append_gpg=True):
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
asn=asn,
tags=tags,
tag_list=tag_list
).strip()
@@ -140,23 +177,21 @@ def generate_filename(doc, counter=0, append_gpg=True):
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning(
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{doc.file_type}"
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
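generate_unique_filename() now also produces archive filenames, preferring a name with the same stem as the original before falling back to the counter scheme. A hedged illustration, assuming doc is an existing Document whose original file is "2021/invoice-0001.png":

from documents.file_handling import generate_unique_filename

name = generate_unique_filename(doc, archive_filename=True)
# Tries "2021/invoice-0001.pdf" first (same stem as the original); if that name
# is already taken in ARCHIVE_DIR, it falls back to the PAPERLESS_FILENAME_FORMAT
# path with _01, _02, ... counters appended before the extension.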

src/documents/filters.py (13 lines changed) Executable file → Normal file

@@ -1,3 +1,4 @@
from django.db.models import Q
from django_filters.rest_framework import BooleanFilter, FilterSet, Filter
from .models import Correspondent, Document, Tag, DocumentType, Log
@@ -74,6 +75,16 @@ class InboxFilter(Filter):
return qs
class TitleContentFilter(Filter):
def filter(self, qs, value):
if value:
return qs.filter(Q(title__icontains=value) |
Q(content__icontains=value))
else:
return qs
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
@@ -91,6 +102,8 @@ class DocumentFilterSet(FilterSet):
is_in_inbox = InboxFilter()
title_content = TitleContentFilter()
class Meta:
model = Document
fields = {
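The new title_content filter OR-combines title and content matches, e.g. for requests like /api/documents/?title_content=invoice. A hedged sketch of the equivalent filterset call:

from documents.filters import DocumentFilterSet
from documents.models import Document

fs = DocumentFilterSet(
    {"title_content": "invoice"},
    queryset=Document.objects.all(),
)
matching = fs.qs   # title__icontains="invoice" OR content__icontains="invoice"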


@@ -2,75 +2,70 @@ import logging
import os
from contextlib import contextmanager
import math
from dateutil.parser import isoparse
from django.conf import settings
from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME, BOOLEAN
from whoosh.highlight import HtmlFormatter
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.searching import ResultsPage, Searcher
from whoosh.writing import AsyncWriter
from documents.models import Document
logger = logging.getLogger(__name__)
class JsonFormatter(Formatter):
def __init__(self):
self.seen = {}
def format_token(self, text, token, replace=False):
ttext = self._text(get_text(text, token, replace))
return {'text': ttext, 'highlight': 'true'}
def format_fragment(self, fragment, replace=False):
output = []
index = fragment.startchar
text = fragment.text
amend_token = None
for t in fragment.matches:
if t.startchar is None:
continue
if t.startchar < index:
continue
if t.startchar > index:
text_inbetween = text[index:t.startchar]
if amend_token and t.startchar - index < 10:
amend_token['text'] += text_inbetween
else:
output.append({'text': text_inbetween,
'highlight': False})
amend_token = None
token = self.format_token(text, t, replace)
if amend_token:
amend_token['text'] += token['text']
else:
output.append(token)
amend_token = token
index = t.endchar
if index < fragment.endchar:
output.append({'text': text[index:fragment.endchar],
'highlight': False})
return output
def format(self, fragments, replace=False):
output = []
for fragment in fragments:
output.append(self.format_fragment(fragment, replace=replace))
return output
logger = logging.getLogger("paperless.index")
def get_schema():
return Schema(
id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True),
id=NUMERIC(
stored=True,
unique=True
),
title=TEXT(
sortable=True
),
content=TEXT(),
correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
type=TEXT(stored=True),
created=DATETIME(stored=True, sortable=True),
modified=DATETIME(stored=True, sortable=True),
added=DATETIME(stored=True, sortable=True),
asn=NUMERIC(
sortable=True
),
correspondent=TEXT(
sortable=True
),
correspondent_id=NUMERIC(),
has_correspondent=BOOLEAN(),
tag=KEYWORD(
commas=True,
scorable=True,
lowercase=True
),
tag_id=KEYWORD(
commas=True,
scorable=True
),
has_tag=BOOLEAN(),
type=TEXT(
sortable=True
),
type_id=NUMERIC(),
has_type=BOOLEAN(),
created=DATETIME(
sortable=True
),
modified=DATETIME(
sortable=True
),
added=DATETIME(
sortable=True
),
)
@@ -78,25 +73,56 @@ def open_index(recreate=False):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.")
except Exception:
logger.exception(f"Error while opening the index, recreating.")
if not os.path.isdir(settings.INDEX_DIR):
os.makedirs(settings.INDEX_DIR, exist_ok=True)
return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(optimize=False):
writer = AsyncWriter(open_index())
try:
yield writer
except Exception as e:
logger.exception(str(e))
writer.cancel()
finally:
writer.commit(optimize=optimize)
@contextmanager
def open_index_searcher():
searcher = open_index().searcher()
try:
yield searcher
finally:
searcher.close()
def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None,
correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None,
tag=tags if tags else None,
tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None,
type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
modified=doc.modified,
)
@@ -110,61 +136,162 @@ def remove_document_by_id(writer, doc_id):
def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
update_document(writer, document)
def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
remove_document(writer, document)
@contextmanager
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
searcher = ix.searcher()
try:
if querystring:
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
ix.schema)
qp.add_plugin(DateParserPlugin())
str_q = qp.parse(querystring)
corrected = searcher.correct_query(str_q, querystring)
else:
str_q = None
corrected = None
class DelayedQuery:
if more_like_doc_id:
docnum = searcher.document_number(id=more_like_doc_id)
kts = searcher.key_terms_from_text(
'content', more_like_doc_content, numterms=20,
model=classify.Bo1Model, normalize=False)
more_like_q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
result_page = searcher.search_page(
more_like_q, page, filter=str_q, mask={docnum})
elif str_q:
result_page = searcher.search_page(str_q, page)
else:
raise ValueError(
"Either querystring or more_like_doc_id is required."
)
def _get_query(self):
raise NotImplementedError()
result_page.results.fragmenter = highlight.ContextFragmenter(
def _get_query_filter(self):
criterias = []
for k, v in self.query_params.items():
if k == 'correspondent__id':
criterias.append(query.Term('correspondent_id', v))
elif k == 'tags__id__all':
for tag_id in v.split(","):
criterias.append(query.Term('tag_id', tag_id))
elif k == 'document_type__id':
criterias.append(query.Term('type_id', v))
elif k == 'correspondent__isnull':
criterias.append(query.Term("has_correspondent", v == "false"))
elif k == 'is_tagged':
criterias.append(query.Term("has_tag", v == "true"))
elif k == 'document_type__isnull':
criterias.append(query.Term("has_type", v == "false"))
elif k == 'created__date__lt':
criterias.append(
query.DateRange("created", start=None, end=isoparse(v)))
elif k == 'created__date__gt':
criterias.append(
query.DateRange("created", start=isoparse(v), end=None))
elif k == 'added__date__gt':
criterias.append(
query.DateRange("added", start=isoparse(v), end=None))
elif k == 'added__date__lt':
criterias.append(
query.DateRange("added", start=None, end=isoparse(v)))
if len(criterias) > 0:
return query.And(criterias)
else:
return None
def _get_query_sortedby(self):
if 'ordering' not in self.query_params:
return None, False
field: str = self.query_params['ordering']
sort_fields_map = {
"created": "created",
"modified": "modified",
"added": "added",
"title": "title",
"correspondent__name": "correspondent",
"document_type__name": "type",
"archive_serial_number": "asn"
}
if field.startswith('-'):
field = field[1:]
reverse = True
else:
reverse = False
if field not in sort_fields_map:
return None, False
else:
return sort_fields_map[field], reverse
def __init__(self, searcher: Searcher, query_params, page_size):
self.searcher = searcher
self.query_params = query_params
self.page_size = page_size
self.saved_results = dict()
self.first_score = None
def __len__(self):
page = self[0:1]
return len(page)
def __getitem__(self, item):
if item.start in self.saved_results:
return self.saved_results[item.start]
q, mask = self._get_query()
sortedby, reverse = self._get_query_sortedby()
page: ResultsPage = self.searcher.search_page(
q,
mask=mask,
filter=self._get_query_filter(),
pagenum=math.floor(item.start / self.page_size) + 1,
pagelen=self.page_size,
sortedby=sortedby,
reverse=reverse
)
page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
if corrected and corrected.query != str_q:
if (not self.first_score and
len(page.results) > 0 and
sortedby is None):
self.first_score = page.results[0].score
page.results.top_n = list(map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1]
),
page.results.top_n
))
self.saved_results[item.start] = page
return page
class DelayedFullTextQuery(DelayedQuery):
def _get_query(self):
q_str = self.query_params['query']
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
self.searcher.ixreader.schema)
qp.add_plugin(DateParserPlugin())
q = qp.parse(q_str)
corrected = self.searcher.correct_query(q, q_str)
if corrected.query != q:
corrected_query = corrected.string
else:
corrected_query = None
yield result_page, corrected_query
finally:
searcher.close()
return q, None
class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self):
more_like_doc_id = int(self.query_params['more_like_id'])
content = Document.objects.get(id=more_like_doc_id).content
docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text(
'content', content, numterms=20,
model=classify.Bo1Model, normalize=False)
q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
mask = {docnum}
return q, mask
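DelayedQuery translates REST query parameters into Whoosh filters and sort orders, and only hits the index when a page is actually sliced. A hedged sketch of how the view layer presumably consumes it; the parameters and page size are examples:

from documents import index

with index.open_index_searcher() as searcher:
    q = index.DelayedFullTextQuery(
        searcher,
        {"query": "invoice 2021", "ordering": "-created"},
        page_size=25,
    )
    page = q[0:25]   # runs searcher.search_page(...) lazily and caches the result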
def autocomplete(ix, term, limit=10):


@@ -4,33 +4,24 @@ import uuid
from django.conf import settings
class PaperlessHandler(logging.Handler):
def emit(self, record):
if settings.DISABLE_DBHANDLER:
return
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log
kwargs = {"message": record.msg, "level": record.levelno}
if hasattr(record, "group"):
kwargs["group"] = record.group
Log.objects.create(**kwargs)
class LoggingMixin:
logging_group = None
logging_name = None
def renew_logging_group(self):
self.logging_group = uuid.uuid4()
def log(self, level, message, **kwargs):
target = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(target)
if self.logging_name:
logger = logging.getLogger(self.logging_name)
else:
name = ".".join([
self.__class__.__module__,
self.__class__.__name__
])
logger = logging.getLogger(name)
getattr(logger, level)(message, extra={
"group": self.logging_group

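With the database Log handler gone, components now name their logger explicitly through logging_name. A hedged sketch; the class and logger name are illustrative:

from documents.loggers import LoggingMixin

class ExampleWorker(LoggingMixin):
    logging_name = "paperless.example"

    def run(self):
        self.renew_logging_group()
        self.log("info", "starting work")   # logs via logging.getLogger("paperless.example")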

@@ -16,12 +16,12 @@ from whoosh.writing import AsyncWriter
from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...file_handling import create_source_path_directory, \
generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
@@ -31,38 +31,57 @@ def handle_document(document_id):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})")
return
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type)
parser.parse(
document.source_path,
mime_type,
document.get_public_filename())
thumbnail = parser.get_optimised_thumbnail(
document.source_path,
mime_type,
document.get_public_filename()
)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
checksum = hashlib.md5(f.read()).hexdigest()
# i'm going to save first so that in case the file move
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# we also don't use save() since that triggers the filehandling
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document, archive_filename=True)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text()
content=parser.get_text(),
archive_filename=document.archive_filename
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(),
document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, document)
with index.open_index_writer() as writer:
index.update_document(writer, document)
except Exception as e:
logger.error(f"Error while parsing document {document}: {str(e)}")
logger.exception(f"Error while parsing document {document} "
f"(ID: {document_id})")
finally:
parser.cleanup()
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
@@ -71,10 +90,6 @@ class Command(Renderable, BaseCommand):
modified) after their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-f", "--overwrite",
@@ -91,6 +106,12 @@ class Command(Renderable, BaseCommand):
help="Specify the ID of a document, and this command will only "
"run on this specific document."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
@@ -106,7 +127,7 @@ class Command(Renderable, BaseCommand):
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.archive_checksum,
lambda d: overwrite or not d.has_archive_version,
documents
)
))
@@ -125,7 +146,8 @@ class Command(Renderable, BaseCommand):
handle_document,
document_ids
),
total=len(document_ids)
total=len(document_ids),
disable=options['no_progress_bar']
))
except KeyboardInterrupt:
print("Aborting...")


@@ -1,6 +1,7 @@
import logging
import os
from pathlib import Path
from pathlib import Path, PurePath
from threading import Thread
from time import sleep
from django.conf import settings
@@ -17,11 +18,11 @@ try:
except ImportError:
INotify = flags = None
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIr
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
"""
tag_ids = set()
@@ -35,8 +36,15 @@ def _tags_from_path(filepath):
return tag_ids
def _is_ignored(filepath: str) -> bool:
filepath_relative = PurePath(filepath).relative_to(
settings.CONSUMPTION_DIR)
return any(
filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
def _consume(filepath):
if os.path.isdir(filepath):
if os.path.isdir(filepath) or _is_ignored(filepath):
return
if not os.path.isfile(filepath):
@@ -54,10 +62,10 @@ def _consume(filepath):
if settings.CONSUMER_SUBDIRS_AS_TAGS:
tag_ids = _tags_from_path(filepath)
except Exception as e:
logger.error(
"Error creating tags from path: {}".format(e))
logger.exception("Error creating tags from path")
try:
logger.info(f"Adding {filepath} to the task queue.")
async_task("documents.tasks.consume_file",
filepath,
override_tag_ids=tag_ids if tag_ids else None,
@@ -66,14 +74,17 @@ def _consume(filepath):
# Catch all so that the consumer won't crash.
# This is also what the test case is listening for to check for
# errors.
logger.error(
"Error while consuming document: {}".format(e))
logger.exception("Error while consuming document")
def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
def _consume_wait_unmodified(file):
if _is_ignored(file):
return
logger.debug(f"Waiting for file {file} to remain unmodified")
mtime = -1
current_try = 0
while current_try < num_tries:
while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
try:
new_mtime = os.stat(file).st_mtime
except FileNotFoundError:
@@ -84,7 +95,7 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
_consume(file)
return
mtime = new_mtime
sleep(wait_time)
sleep(settings.CONSUMER_POLLING_DELAY)
current_try += 1
logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
@@ -93,10 +104,14 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
class Handler(FileSystemEventHandler):
def on_created(self, event):
_consume_wait_unmodified(event.src_path)
Thread(
target=_consume_wait_unmodified, args=(event.src_path,)
).start()
def on_moved(self, event):
_consume_wait_unmodified(event.dest_path)
Thread(
target=_consume_wait_unmodified, args=(event.dest_path,)
).start()
class Command(BaseCommand):
@@ -108,12 +123,7 @@ class Command(BaseCommand):
# This is here primarily for the tests and is irrelevant in production.
stop_flag = False
def __init__(self, *args, **kwargs):
self.logger = logging.getLogger(__name__)
BaseCommand.__init__(self, *args, **kwargs)
self.observer = None
observer = None
def add_arguments(self, parser):
parser.add_argument(
@@ -153,7 +163,7 @@ class Command(BaseCommand):
if options["oneshot"]:
return
if settings.CONSUMER_POLLING == 0:
if settings.CONSUMER_POLLING == 0 and INotify:
self.handle_inotify(directory, recursive)
else:
self.handle_polling(directory, recursive)
@@ -161,7 +171,7 @@ class Command(BaseCommand):
logger.debug("Consumer exiting.")
def handle_polling(self, directory, recursive):
logging.getLogger(__name__).info(
logger.info(
f"Polling directory for changes: {directory}")
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
self.observer.schedule(Handler(), directory, recursive=recursive)
@@ -176,7 +186,7 @@ class Command(BaseCommand):
self.observer.join()
def handle_inotify(self, directory, recursive):
logging.getLogger(__name__).info(
logger.info(
f"Using inotify to watch directory for changes: {directory}")
inotify = INotify()
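Both the polling and inotify paths now skip files matching CONSUMER_IGNORE_PATTERNS, evaluated relative to the consumption directory. A hedged illustration; the directory and patterns are examples:

# With CONSUMPTION_DIR = "/consume" and CONSUMER_IGNORE_PATTERNS = ["._*", ".DS_STORE/*"]:
#   _is_ignored("/consume/scans/._invoice.pdf")  -> True, the macOS shadow file is skipped
#   _is_ignored("/consume/scans/invoice.pdf")    -> False, the file is queued as usual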


@@ -1,10 +1,9 @@
from django.core.management.base import BaseCommand
from ...mixins import Renderable
from ...tasks import train_classifier
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Trains the classifier on your data and saves the resulting models to a


@@ -6,20 +6,22 @@ import time
import tqdm
from django.conf import settings
from django.contrib.auth.models import User
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from filelock import FileLock
from documents.models import Document, Correspondent, Tag, DocumentType
from documents.models import Document, Correspondent, Tag, DocumentType, \
SavedView, SavedViewFilterRule
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from paperless_mail.models import MailAccount, MailRule
from ...file_handling import generate_filename, delete_empty_directories
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Decrypt and rename all files in our collection into a given target
@@ -55,6 +57,12 @@ class Command(Renderable, BaseCommand):
"do not belong to the current export, such as files from "
"deleted documents."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
@@ -79,9 +87,9 @@ class Command(Renderable, BaseCommand):
raise CommandError("That path doesn't appear to be writable")
with FileLock(settings.MEDIA_LOCK):
self.dump()
self.dump(options['no_progress_bar'])
def dump(self):
def dump(self, progress_bar_disable=False):
# 1. Take a snapshot of what files exist in the current export folder
for root, dirs, files in os.walk(self.target):
self.files_in_export_dir.extend(
@@ -106,9 +114,27 @@ class Command(Renderable, BaseCommand):
serializers.serialize("json", documents))
manifest += document_manifest
manifest += json.loads(serializers.serialize(
"json", MailAccount.objects.all()))
manifest += json.loads(serializers.serialize(
"json", MailRule.objects.all()))
manifest += json.loads(serializers.serialize(
"json", SavedView.objects.all()))
manifest += json.loads(serializers.serialize(
"json", SavedViewFilterRule.objects.all()))
manifest += json.loads(serializers.serialize(
"json", User.objects.all()))
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(enumerate(document_manifest),
total=len(document_manifest)):
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=progress_bar_disable
):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
@@ -140,7 +166,7 @@ class Command(Renderable, BaseCommand):
thumbnail_target = os.path.join(self.target, thumbnail_name)
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
if os.path.exists(document.archive_path):
if document.has_archive_version:
archive_name = base_name + "-archive.pdf"
archive_target = os.path.join(self.target, archive_name)
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name


@@ -15,7 +15,6 @@ from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...signals.handlers import update_filename_and_move_files
@@ -28,7 +27,7 @@ def disable_signal(sig, receiver, sender):
sig.connect(receiver=receiver, sender=sender)
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Using a manifest.json file, load the data from there, and import the
@@ -37,6 +36,12 @@ class Command(Renderable, BaseCommand):
def add_arguments(self, parser):
parser.add_argument("source")
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
@@ -71,7 +76,7 @@ class Command(Renderable, BaseCommand):
# Fill up the database with whatever is in the manifest
call_command("loaddata", manifest_path)
self._import_files_from_manifest()
self._import_files_from_manifest(options['no_progress_bar'])
print("Updating search index...")
call_command('document_index', 'reindex')
@@ -112,7 +117,7 @@ class Command(Renderable, BaseCommand):
f"does not appear to be in the source directory."
)
def _import_files_from_manifest(self):
def _import_files_from_manifest(self, progress_bar_disable):
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
@@ -124,7 +129,10 @@ class Command(Renderable, BaseCommand):
lambda r: r["model"] == "documents.document",
self.manifest))
for record in tqdm.tqdm(manifest_documents):
for record in tqdm.tqdm(
manifest_documents,
disable=progress_bar_disable
):
document = Document.objects.get(pk=record["pk"])
@@ -152,6 +160,9 @@ class Command(Renderable, BaseCommand):
shutil.copy2(thumbnail_path, document.thumbnail_path)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
shutil.copy2(archive_path, document.archive_path)
document.save()


@@ -1,26 +1,25 @@
from django.core.management import BaseCommand
from django.db import transaction
from documents.mixins import Renderable
from documents.tasks import index_reindex, index_optimize
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = "Manages the document index."
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument("command", choices=['reindex', 'optimize'])
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
with transaction.atomic():
if options['command'] == 'reindex':
index_reindex()
index_reindex(progress_bar_disable=options['no_progress_bar'])
elif options['command'] == 'optimize':
index_optimize()
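A hedged sketch of invoking the index command programmatically; the command name "document_index" is taken from the importer's call_command above:

from django.core.management import call_command

call_command("document_index", "reindex", "--no-progress-bar")
call_command("document_index", "optimize")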


@@ -1,12 +0,0 @@
from django.core.management.base import BaseCommand
from documents.models import Log
class Command(BaseCommand):
help = "A quick & dirty way to see what's in the logs"
def handle(self, *args, **options):
for log in Log.objects.order_by("pk"):
print(log)


@@ -5,24 +5,28 @@ from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.models import Document
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(Document.objects.all()):
for document in tqdm.tqdm(
Document.objects.all(),
disable=options['no_progress_bar']
):
post_save.send(Document, instance=document)

src/documents/management/commands/document_retagger.py (64 lines changed) Executable file → Normal file

@@ -1,15 +1,17 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.classifier import load_classifier
from documents.models import Document
from ...mixins import Renderable
from ...signals.handlers import set_correspondent, set_document_type, set_tags
class Command(Renderable, BaseCommand):
logger = logging.getLogger("paperless.management.retagger")
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
@@ -18,10 +20,6 @@ class Command(Renderable, BaseCommand):
modified) after their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-c", "--correspondent",
@@ -59,10 +57,26 @@ class Command(Renderable, BaseCommand):
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
parser.add_argument(
"--suggest",
default=False,
action="store_true",
help="Return the suggestion, don't change anything."
)
parser.add_argument(
"--base-url",
help="The base URL to use to build the link to the documents."
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
# Detect if we support color
color = self.style.ERROR("test") != "test"
if options["inbox_only"]:
queryset = Document.objects.filter(tags__is_inbox_tag=True)
@@ -70,17 +84,12 @@ class Command(Renderable, BaseCommand):
queryset = Document.objects.all()
documents = queryset.distinct()
classifier = DocumentClassifier()
try:
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning(
f"Cannot classify documents: {e}.")
classifier = None
classifier = load_classifier()
for document in documents:
logging.getLogger(__name__).info(
f"Processing document {document.title}")
for document in tqdm.tqdm(
documents,
disable=options['no_progress_bar']
):
if options['correspondent']:
set_correspondent(
@@ -88,18 +97,27 @@ class Command(Renderable, BaseCommand):
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
if options['document_type']:
set_document_type(sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
if options['tags']:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'])
replace=options['overwrite'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
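The retagger gains a dry-run mode: with --suggest it only prints proposed assignments, optionally as document links built from --base-url, instead of changing anything. A hedged invocation sketch; the URL is an example:

from django.core.management import call_command

call_command(
    "document_retagger",
    "--correspondent",
    "--suggest",
    "--base-url", "https://paperless.example.com",
    "--no-progress-bar",
)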


@@ -0,0 +1,23 @@
from django.core.management.base import BaseCommand
from documents.sanity_checker import check_sanity
class Command(BaseCommand):
help = """
This command checks your document archive for issues.
""".replace(" ", "")
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
messages = check_sanity(progress=not options['no_progress_bar'])
messages.log_messages()


@@ -7,7 +7,6 @@ from django import db
from django.core.management.base import BaseCommand
from documents.models import Document
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type
@@ -23,23 +22,22 @@ def _process_document(doc_in):
try:
thumb = parser.get_optimised_thumbnail(
document.source_path, document.mime_type)
document.source_path,
document.mime_type,
document.get_public_filename()
)
shutil.move(thumb, document.thumbnail_path)
finally:
parser.cleanup()
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
This will regenerate the thumbnails for all documents.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-d", "--document",
@@ -49,11 +47,14 @@ class Command(Renderable, BaseCommand):
help="Specify the ID of a document, and this command will only "
"run on this specific document."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
logging.getLogger().handlers[0].level = logging.ERROR
if options['document']:
@@ -70,5 +71,7 @@ class Command(Renderable, BaseCommand):
with multiprocessing.Pool() as pool:
list(tqdm.tqdm(
pool.imap_unordered(_process_document, ids), total=len(ids)
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=options['no_progress_bar']
))


@@ -0,0 +1,42 @@
import logging
import os
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand, CommandError
logger = logging.getLogger("paperless.management.superuser")
class Command(BaseCommand):
help = """
Creates a Django superuser based on env variables.
""".replace(" ", "")
def handle(self, *args, **options):
username = os.getenv('PAPERLESS_ADMIN_USER')
if not username:
return
mail = os.getenv('PAPERLESS_ADMIN_MAIL', 'root@localhost')
password = os.getenv('PAPERLESS_ADMIN_PASSWORD')
# Check if user exists already, leave as is if it does
if User.objects.filter(username=username).exists():
user: User = User.objects.get_by_natural_key(username)
user.set_password(password)
user.save()
self.stdout.write(f"Changed password of user {username}.")
elif password:
# Create superuser based on env variables
User.objects.create_superuser(username, mail, password)
self.stdout.write(
f'Created superuser "{username}" with provided password.')
else:
self.stdout.write(
f'Did not create superuser "{username}".')
self.stdout.write(
'Make sure you specified "PAPERLESS_ADMIN_PASSWORD" in your '
'"docker-compose.env" file.')

View File

@@ -1,18 +1,17 @@
import logging
import re
from fuzzywuzzy import fuzz
from documents.models import MatchingModel, Correspondent, DocumentType, Tag
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.matching")
def log_reason(matching_model, document, reason):
class_name = type(matching_model).__name__
logger.debug(
f"Assigning {class_name} {matching_model.name} to document "
f"{class_name} {matching_model.name} matched on document "
f"{document} because {reason}")
@@ -91,7 +90,7 @@ def matches(matching_model, document):
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
result = bool(re.search(
rf"\b{matching_model.match}\b",
rf"\b{re.escape(matching_model.match)}\b",
document_content,
**search_kwargs
))
@@ -123,6 +122,8 @@ def matches(matching_model, document):
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from fuzzywuzzy import fuzz
match = re.sub(r'[^\w\s]', '', matching_model.match)
text = re.sub(r'[^\w\s]', '', document_content)
if matching_model.is_insensitive:
@@ -160,6 +161,9 @@ def _split_match(matching_model):
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
re.escape(
normspace(" ", (t[0] or t[1]).strip())
).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]

View File

@@ -0,0 +1,330 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep
import pathvalidate
from django.conf import settings
from django.db import migrations, models
from django.template.defaultfilters import slugify
from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(doc):
if doc.filename:
fname = archive_name_from_filename(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
def generate_unique_filename(doc, archive_filename=False):
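# Try candidate filenames with an increasing counter suffix until one is free
# on disk, or until the candidate equals the name the document already has.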
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name, replacement_text="-"
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name, replacement_text="-"
)
else:
document_type = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
###############################################################################
# This code performs bidirectional archive file transformation.
###############################################################################
def parse_wrapper(parser, path, mime_type, file_name):
# this is here so that I can mock this out for testing.
parser.parse(path, mime_type, file_name)
def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \
ParseError
logger.info(
f"Regenerating archive document for document ID:{doc.id}"
)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
parse_wrapper(parser, source_path(doc), doc.mime_type,
os.path.basename(doc.filename))
doc.content = parser.get_text()
if parser.get_archive_path() and os.path.isfile(
parser.get_archive_path()):
doc.archive_filename = generate_unique_filename(
doc, archive_filename=True)
with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
os.makedirs(os.path.dirname(archive_path_new(doc)),
exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
logger.error(
f"Parser did not return an archive document for document "
f"ID:{doc.id}. Removing archive document."
)
doc.save()
return
except ParseError:
if try_num + 1 == retry_count:
logger.exception(
f"Unable to regenerate archive document for ID:{doc.id}. You "
f"need to invoke the document_archiver management command "
f"manually for that document."
)
doc.archive_checksum = None
doc.save()
return
else:
# This is mostly here for the tika parser in docker
# environments. The servers for parsing need to come up first,
# and the docker setup doesn't ensure that tika is running
# before attempting migrations.
logger.error("Parse error, will try again in 5 seconds...")
sleep(5)
finally:
parser.cleanup()
def move_old_to_new_locations(apps, schema_editor):
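# Forward migration: archive files whose old "<filename>.pdf" naming collides
# across documents are deleted and regenerated under a unique archive_filename;
# unaffected documents simply record their existing archive name.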
Document = apps.get_model("documents", "Document")
affected_document_ids = set()
old_archive_path_to_id = {}
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
else:
old_archive_path_to_id[old_path] = doc.id
# check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class:
raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
for doc in Document.objects.filter(archive_checksum__isnull=False):
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
logger.debug(
f"Removing {old_path}"
)
os.unlink(old_path)
else:
# Set archive path for unaffected files
doc.archive_filename = archive_name_from_filename(doc.filename)
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename
)
# regenerate archive documents
for doc_id in affected_document_ids:
doc = Document.objects.get(id=doc_id)
create_archive_version(doc)
def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
old_archive_paths = set()
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if old_archive_path in old_archive_paths:
raise ValueError(
f"Cannot migrate: Archive file name {old_archive_path} of "
f"document {doc.filename} would clash with another archive "
f"filename.")
old_archive_paths.add(old_archive_path)
if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
raise ValueError(
f"Cannot migrate: Cannot move {new_archive_path} to "
f"{old_archive_path}: file already exists."
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if new_archive_path != old_archive_path:
logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
shutil.move(new_archive_path, old_archive_path)
class Migration(migrations.Migration):
dependencies = [
('documents', '1011_auto_20210101_2340'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_filename',
field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
),
]

View File

@@ -0,0 +1,70 @@
# Generated by Django 3.1.4 on 2020-12-02 21:43
from django.db import migrations, models
COLOURS_OLD = {
1: "#a6cee3",
2: "#1f78b4",
3: "#b2df8a",
4: "#33a02c",
5: "#fb9a99",
6: "#e31a1c",
7: "#fdbf6f",
8: "#ff7f00",
9: "#cab2d6",
10: "#6a3d9a",
11: "#b15928",
12: "#000000",
13: "#cccccc",
}
def forward(apps, schema_editor):
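# Copy each tag's legacy integer colour choice into the new hex color field.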
Tag = apps.get_model('documents', 'Tag')
for tag in Tag.objects.all():
colour_old_id = tag.colour_old
rgb = COLOURS_OLD[colour_old_id]
tag.color = rgb
tag.save()
def reverse(apps, schema_editor):
Tag = apps.get_model('documents', 'Tag')
def _get_colour_id(rdb):
for idx, rdbx in COLOURS_OLD.items():
if rdbx == rdb:
return idx
# Return colour 1 if we can't match anything
return 1
for tag in Tag.objects.all():
colour_id = _get_colour_id(tag.color)
tag.colour_old = colour_id
tag.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1012_fix_archive_files'),
]
operations = [
migrations.RenameField(
model_name='tag',
old_name='colour',
new_name='colour_old',
),
migrations.AddField(
model_name='tag',
name='color',
field=models.CharField(default='#a6cee3', max_length=7, verbose_name='color'),
),
migrations.RunPython(forward, reverse),
migrations.RemoveField(
model_name='tag',
name='colour_old',
)
]

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.7 on 2021-02-28 15:14
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1013_migrate_tag_colour'),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains')], verbose_name='rule type'),
),
]

View File

@@ -0,0 +1,29 @@
# Generated by Django 3.1.7 on 2021-04-04 18:28
import logging
from django.db import migrations
logger = logging.getLogger("paperless.migrations")
def remove_null_characters(apps, schema_editor):
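# Replace any NUL characters in stored document content with spaces.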
Document = apps.get_model('documents', 'Document')
for doc in Document.objects.all():
content: str = doc.content
if '\0' in content:
logger.info(f"Removing null characters from document {doc}...")
doc.content = content.replace('\0', ' ')
doc.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1014_auto_20210228_1614'),
]
operations = [
migrations.RunPython(remove_null_characters, migrations.RunPython.noop)
]

View File

@@ -0,0 +1,23 @@
# Generated by Django 3.1.7 on 2021-03-17 12:51
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1015_remove_null_characters'),
]
operations = [
migrations.AlterField(
model_name='savedview',
name='sort_field',
field=models.CharField(blank=True, max_length=128, null=True, verbose_name='sort field'),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains'), (20, 'fulltext query'), (21, 'more like this')], verbose_name='rule type'),
),
]

View File

@@ -1,9 +0,0 @@
class Renderable:
"""
A handy mixin to make it easier/cleaner to print output based on a
verbosity value.
"""
def _render(self, text, verbosity):
if self.verbosity >= verbosity:
print(text)

69
src/documents/models.py Executable file → Normal file
View File

@@ -16,7 +16,6 @@ from django.utils.timezone import is_aware
from django.utils.translation import gettext_lazy as _
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
@@ -66,10 +65,6 @@ class MatchingModel(models.Model):
class Correspondent(MatchingModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")
class Meta:
ordering = ("name",)
verbose_name = _("correspondent")
@@ -78,25 +73,11 @@ class Correspondent(MatchingModel):
class Tag(MatchingModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
colour = models.PositiveIntegerField(
color = models.CharField(
_("color"),
choices=COLOURS, default=1)
max_length=7,
default="#a6cee3"
)
is_inbox_tag = models.BooleanField(
_("is inbox tag"),
@@ -208,10 +189,21 @@ class Document(models.Model):
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current filename in storage")
)
archive_filename = models.FilePathField(
_("archive filename"),
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current archive filename in storage")
)
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
@@ -256,16 +248,18 @@ class Document(models.Model):
return open(self.source_path, "rb")
@property
def archive_path(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
def has_archive_version(self):
return self.archive_filename is not None
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
@property
def archive_path(self):
if self.has_archive_version:
return os.path.join(
settings.ARCHIVE_DIR,
str(self.archive_filename)
)
else:
return None
@property
def archive_file(self):
@@ -361,7 +355,10 @@ class SavedView(models.Model):
sort_field = models.CharField(
_("sort field"),
max_length=128)
max_length=128,
null=True,
blank=True
)
sort_reverse = models.BooleanField(
_("sort reverse"),
default=False)
@@ -387,7 +384,11 @@ class SavedViewFilterRule(models.Model):
(15, _("modified before")),
(16, _("modified after")),
(17, _("does not have tag")),
(19, _("has tags in")),
(18, _("does not have ASN")),
(19, _("title or content contains")),
(20, _("fulltext query")),
(21, _("more like this")),
(22, _("has tags in"))
]
saved_view = models.ForeignKey(

View File

@@ -6,7 +6,6 @@ import shutil
import subprocess
import tempfile
import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@@ -36,7 +35,7 @@ DATE_REGEX = re.compile(
)
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type):
@@ -144,6 +143,46 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def get_default_thumbnail():
return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
out_path = os.path.join(temp_dir, "convert_gs.png")
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
try:
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path
except ParseError:
return get_default_thumbnail()
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
@@ -162,31 +201,8 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
output_file=out_path,
logging_group=logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
out_path = make_thumbnail_from_pdf_gs_fallback(
in_path, temp_dir, logging_group)
return out_path
@@ -200,6 +216,8 @@ def parse_date(filename, text):
"""
Call dateparser.parse with a particular date ordering
"""
import dateparser
return dateparser.parse(
ds,
settings={
@@ -261,7 +279,9 @@ class DocumentParser(LoggingMixin):
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, logging_group):
logging_name = "paperless.parsing"
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
@@ -271,6 +291,11 @@ class DocumentParser(LoggingMixin):
self.archive_path = None
self.text = None
self.date = None
self.progress_callback = progress_callback
def progress(self, current_progress, max_progress):
if self.progress_callback:
self.progress_callback(current_progress, max_progress)
def extract_metadata(self, document_path, mime_type):
return []
@@ -281,14 +306,17 @@ class DocumentParser(LoggingMixin):
def get_archive_path(self):
return self.archive_path
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
"""
Returns the path to a file we can use as a thumbnail for this document.
"""
raise NotImplementedError()
def get_optimised_thumbnail(self, document_path, mime_type):
thumbnail = self.get_thumbnail(document_path, mime_type)
def get_optimised_thumbnail(self,
document_path,
mime_type,
file_name=None):
thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
@@ -311,5 +339,5 @@ class DocumentParser(LoggingMixin):
return self.date
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
self.log("debug", f"Deleting directory {self.tempdir}")
shutil.rmtree(self.tempdir)

Binary file not shown (new image, 11 KiB).

View File

@@ -1,45 +1,55 @@
import hashlib
import logging
import os
from django.conf import settings
from tqdm import tqdm
from documents.models import Document
class SanityMessage:
message = None
class SanityCheckMessages:
def __init__(self):
self._messages = []
def error(self, message):
self._messages.append({"level": logging.ERROR, "message": message})
def warning(self, message):
self._messages.append({"level": logging.WARNING, "message": message})
def info(self, message):
self._messages.append({"level": logging.INFO, "message": message})
def log_messages(self):
logger = logging.getLogger("paperless.sanity_checker")
if len(self._messages) == 0:
logger.info("Sanity checker detected no issues.")
else:
for msg in self._messages:
logger.log(msg['level'], msg['message'])
def __len__(self):
return len(self._messages)
def __getitem__(self, item):
return self._messages[item]
def has_error(self):
return any([msg['level'] == logging.ERROR for msg in self._messages])
def has_warning(self):
return any([msg['level'] == logging.WARNING for msg in self._messages])
class SanityWarning(SanityMessage):
def __init__(self, message):
self.message = message
def __str__(self):
return f"Warning: {self.message}"
class SanityCheckFailedException(Exception):
pass
class SanityError(SanityMessage):
def __init__(self, message):
self.message = message
def __str__(self):
return f"ERROR: {self.message}"
class SanityFailedError(Exception):
def __init__(self, messages):
self.messages = messages
def __str__(self):
message_string = "\n".join([str(m) for m in self.messages])
return (
f"The following issuse were found by the sanity checker:\n"
f"{message_string}\n\n===============\n\n")
def check_sanity():
messages = []
def check_sanity(progress=False):
messages = SanityCheckMessages()
present_files = []
for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
@@ -50,72 +60,81 @@ def check_sanity():
if lockfile in present_files:
present_files.remove(lockfile)
for doc in Document.objects.all():
for doc in tqdm(Document.objects.all(), disable=not progress):
# Check sanity of the thumbnail
if not os.path.isfile(doc.thumbnail_path):
messages.append(SanityError(
f"Thumbnail of document {doc.pk} does not exist."))
messages.error(f"Thumbnail of document {doc.pk} does not exist.")
else:
present_files.remove(os.path.normpath(doc.thumbnail_path))
if os.path.normpath(doc.thumbnail_path) in present_files:
present_files.remove(os.path.normpath(doc.thumbnail_path))
try:
with doc.thumbnail_file as f:
f.read()
except OSError as e:
messages.append(SanityError(
messages.error(
f"Cannot read thumbnail file of document {doc.pk}: {e}"
))
)
# Check sanity of the original file
# TODO: extract method
if not os.path.isfile(doc.source_path):
messages.append(SanityError(
f"Original of document {doc.pk} does not exist."))
messages.error(f"Original of document {doc.pk} does not exist.")
else:
present_files.remove(os.path.normpath(doc.source_path))
if os.path.normpath(doc.source_path) in present_files:
present_files.remove(os.path.normpath(doc.source_path))
try:
with doc.source_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
f"Cannot read original file of document {doc.pk}: {e}"))
messages.error(
f"Cannot read original file of document {doc.pk}: {e}")
else:
if not checksum == doc.checksum:
messages.append(SanityError(
messages.error(
f"Checksum mismatch of document {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
)
# Check sanity of the archive file.
if doc.archive_checksum:
if doc.archive_checksum and not doc.archive_filename:
messages.error(
f"Document {doc.pk} has an archive file checksum, but no "
f"archive filename."
)
elif not doc.archive_checksum and doc.archive_filename:
messages.error(
f"Document {doc.pk} has an archive file, but its checksum is "
f"missing."
)
elif doc.has_archive_version:
if not os.path.isfile(doc.archive_path):
messages.append(SanityError(
messages.error(
f"Archived version of document {doc.pk} does not exist."
))
)
else:
present_files.remove(os.path.normpath(doc.archive_path))
if os.path.normpath(doc.archive_path) in present_files:
present_files.remove(os.path.normpath(doc.archive_path))
try:
with doc.archive_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
messages.error(
f"Cannot read archive file of document {doc.pk}: {e}"
))
)
else:
if not checksum == doc.archive_checksum:
messages.append(SanityError(
f"Checksum mismatch of archive {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
messages.error(
f"Checksum mismatch of archived document "
f"{doc.pk}. "
f"Stored: {doc.archive_checksum}, "
f"actual: {checksum}."
)
# other document checks
if not doc.content:
messages.append(SanityWarning(
f"Document {doc.pk} has no content."
))
messages.info(f"Document {doc.pk} has no content.")
for extra_file in present_files:
messages.append(SanityWarning(
f"Orphaned file in media dir: {extra_file}"
))
messages.warning(f"Orphaned file in media dir: {extra_file}")
return messages

View File

@@ -1,13 +1,18 @@
import re
import magic
import math
from django.utils.text import slugify
from rest_framework import serializers
from rest_framework.fields import SerializerMethodField
from . import bulk_edit
from .models import Correspondent, Tag, Document, Log, DocumentType, \
SavedView, SavedViewFilterRule
from .models import Correspondent, Tag, Document, DocumentType, \
SavedView, SavedViewFilterRule, MatchingModel
from .parsers import is_mime_type_supported
from django.utils.translation import gettext as _
# https://www.django-rest-framework.org/api-guide/serializers/#example
class DynamicFieldsModelSerializer(serializers.ModelSerializer):
@@ -31,16 +36,30 @@ class DynamicFieldsModelSerializer(serializers.ModelSerializer):
self.fields.pop(field_name)
class CorrespondentSerializer(serializers.ModelSerializer):
class MatchingModelSerializer(serializers.ModelSerializer):
document_count = serializers.IntegerField(read_only=True)
last_correspondence = serializers.DateTimeField(read_only=True)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
def validate_match(self, match):
if 'matching_algorithm' in self.initial_data and self.initial_data['matching_algorithm'] == MatchingModel.MATCH_REGEX: # NOQA: E501
try:
re.compile(match)
except Exception as e:
raise serializers.ValidationError(
_("Invalid regular expression: %(error)s") %
{'error': str(e)}
)
return match
class CorrespondentSerializer(MatchingModelSerializer):
last_correspondence = serializers.DateTimeField(read_only=True)
class Meta:
model = Correspondent
fields = (
@@ -55,13 +74,7 @@ class CorrespondentSerializer(serializers.ModelSerializer):
)
class DocumentTypeSerializer(serializers.ModelSerializer):
document_count = serializers.IntegerField(read_only=True)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
class DocumentTypeSerializer(MatchingModelSerializer):
class Meta:
model = DocumentType
@@ -76,13 +89,40 @@ class DocumentTypeSerializer(serializers.ModelSerializer):
)
class TagSerializer(serializers.ModelSerializer):
class ColorField(serializers.Field):
document_count = serializers.IntegerField(read_only=True)
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
def to_internal_value(self, data):
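# Legacy colour field for API v1: accepts an integer colour ID and stores the
# matching hex value; to_representation maps hex back to an ID, falling back
# to 1 when there is no legacy equivalent.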
for id, color in self.COLOURS:
if id == data:
return color
raise serializers.ValidationError()
def to_representation(self, value):
for id, color in self.COLOURS:
if color == value:
return id
return 1
class TagSerializerVersion1(MatchingModelSerializer):
colour = ColorField(source='color', default="#a6cee3")
class Meta:
model = Tag
@@ -99,6 +139,45 @@ class TagSerializer(serializers.ModelSerializer):
)
class TagSerializer(MatchingModelSerializer):
def get_text_color(self, obj):
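# Estimate the perceived luminance of the tag colour (Rec. 601 weights) and
# return white text for dark backgrounds, black for light ones.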
try:
h = obj.color.lstrip('#')
rgb = tuple(int(h[i:i + 2], 16)/256 for i in (0, 2, 4))
luminance = math.sqrt(
0.299 * math.pow(rgb[0], 2) +
0.587 * math.pow(rgb[1], 2) +
0.114 * math.pow(rgb[2], 2)
)
return "#ffffff" if luminance < 0.53 else "#000000"
except ValueError:
return "#000000"
text_color = serializers.SerializerMethodField()
class Meta:
model = Tag
fields = (
"id",
"slug",
"name",
"color",
"text_color",
"match",
"matching_algorithm",
"is_insensitive",
"is_inbox_tag",
"document_count"
)
def validate_color(self, color):
regex = r"#[0-9a-fA-F]{6}"
if not re.match(regex, color):
raise serializers.ValidationError(_("Invalid color."))
return color
class CorrespondentField(serializers.PrimaryKeyRelatedField):
def get_queryset(self):
return Correspondent.objects.all()
@@ -127,7 +206,7 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
return obj.get_public_filename()
def get_archived_file_name(self, obj):
if obj.archive_checksum:
if obj.has_archive_version:
return obj.get_public_filename(archive=True)
else:
return None
@@ -151,19 +230,6 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
)
class LogSerializer(serializers.ModelSerializer):
class Meta:
model = Log
fields = (
"id",
"created",
"message",
"group",
"level"
)
class SavedViewFilterRuleSerializer(serializers.ModelSerializer):
class Meta:
@@ -203,14 +269,34 @@ class SavedViewSerializer(serializers.ModelSerializer):
return saved_view
class BulkEditSerializer(serializers.Serializer):
class DocumentListSerializer(serializers.Serializer):
documents = serializers.ListField(
child=serializers.IntegerField(),
required=True,
label="Documents",
write_only=True
write_only=True,
child=serializers.IntegerField()
)
def _validate_document_id_list(self, documents, name="documents"):
if not type(documents) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in documents]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
count = Document.objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
f"Some documents in {name} don't exist or were "
f"specified twice.")
def validate_documents(self, documents):
self._validate_document_id_list(documents)
return documents
class BulkEditSerializer(DocumentListSerializer):
method = serializers.ChoiceField(
choices=[
"set_correspondent",
@@ -226,18 +312,6 @@ class BulkEditSerializer(serializers.Serializer):
parameters = serializers.DictField(allow_empty=True)
def _validate_document_id_list(self, documents, name="documents"):
if not type(documents) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in documents]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
count = Document.objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
f"Some documents in {name} don't exist or were "
f"specified twice.")
def _validate_tag_id_list(self, tags, name="tags"):
if not type(tags) == list:
raise serializers.ValidationError(f"{name} must be a list")
@@ -249,10 +323,6 @@ class BulkEditSerializer(serializers.Serializer):
raise serializers.ValidationError(
f"Some tags in {name} don't exist or were specified twice.")
def validate_documents(self, documents):
self._validate_document_id_list(documents)
return documents
def validate_method(self, method):
if method == "set_correspondent":
return bulk_edit.set_correspondent
@@ -378,7 +448,9 @@ class PostDocumentSerializer(serializers.Serializer):
if not is_mime_type_supported(mime_type):
raise serializers.ValidationError(
"This file type is not supported.")
_("File type %(type)s not supported") %
{'type': mime_type}
)
return document.name, document_data
@@ -401,9 +473,24 @@ class PostDocumentSerializer(serializers.Serializer):
return None
class SelectionDataSerializer(serializers.Serializer):
class BulkDownloadSerializer(DocumentListSerializer):
documents = serializers.ListField(
required=True,
child=serializers.IntegerField()
content = serializers.ChoiceField(
choices=["archive", "originals", "both"],
default="archive"
)
compression = serializers.ChoiceField(
choices=["none", "deflated", "bzip2", "lzma"],
default="none"
)
def validate_compression(self, compression):
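# Map the API choice onto the corresponding zipfile compression constant.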
import zipfile
return {
"none": zipfile.ZIP_STORED,
"deflated": zipfile.ZIP_DEFLATED,
"bzip2": zipfile.ZIP_BZIP2,
"lzma": zipfile.ZIP_LZMA
}[compression]

284
src/documents/signals/handlers.py Executable file → Normal file
View File

@@ -1,7 +1,7 @@
import logging
import os
from subprocess import Popen
from django.utils import termcolors
from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
@@ -9,18 +9,17 @@ from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from django.utils import termcolors, timezone
from filelock import FileLock
from .. import index, matching
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, archive_name_from_filename, \
create_source_path_directory, \
generate_unique_filename
from ..models import Document, Tag
from ..models import Document, Tag, MatchingModel
def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
logger = logging.getLogger("paperless.handlers")
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
@@ -34,6 +33,9 @@ def set_correspondent(sender,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
if document.correspondent and not replace:
return
@@ -48,27 +50,45 @@ def set_correspondent(sender,
selected = None
if potential_count > 1:
if use_first:
logger(
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"so we've opted for {selected}",
logging_group
extra={'group': logging_group}
)
else:
logger(
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"not assigning any correspondent",
logging_group
extra={'group': logging_group}
)
return
if selected or replace:
logger(
f"Assigning correspondent {selected} to {document}",
logging_group
)
if suggest:
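# Suggest mode: print the proposed correspondent (and a link to the document
# when base_url is set) instead of assigning and saving it.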
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
print(f"Suggest correspondent {selected}")
else:
logger.info(
f"Assigning correspondent {selected} to {document}",
extra={'group': logging_group}
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
document.correspondent = selected
document.save(update_fields=("correspondent",))
def set_document_type(sender,
@@ -77,6 +97,9 @@ def set_document_type(sender,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
if document.document_type and not replace:
return
@@ -92,27 +115,45 @@ def set_document_type(sender,
if potential_count > 1:
if use_first:
logger(
logger.info(
f"Detected {potential_count} potential document types, "
f"so we've opted for {selected}",
logging_group
extra={'group': logging_group}
)
else:
logger(
logger.info(
f"Detected {potential_count} potential document types, "
f"not assigning any document type",
logging_group
extra={'group': logging_group}
)
return
if selected or replace:
logger(
f"Assigning document type {selected} to {document}",
logging_group
)
if suggest:
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
print(f"Sugest document type {selected}")
else:
logger.info(
f"Assigning document type {selected} to {document}",
extra={'group': logging_group}
)
document.document_type = selected
document.save(update_fields=("document_type",))
document.document_type = selected
document.save(update_fields=("document_type",))
def set_tags(sender,
@@ -120,6 +161,9 @@ def set_tags(sender,
logging_group=None,
classifier=None,
replace=False,
suggest=False,
base_url=None,
color=False,
**kwargs):
if replace:
@@ -134,33 +178,65 @@ def set_tags(sender,
relevant_tags = set(matched_tags) - current_tags
if not relevant_tags:
return
if suggest:
extra_tags = current_tags - set(matched_tags)
extra_tags = [
t for t in extra_tags
if t.matching_algorithm == MatchingModel.MATCH_AUTO
]
if not relevant_tags and not extra_tags:
return
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
if relevant_tags:
print(
"Suggest tags: " + ", ".join([t.name for t in relevant_tags])
)
if extra_tags:
print("Extra tags: " + ", ".join([t.name for t in extra_tags]))
else:
if not relevant_tags:
return
message = 'Tagging "{}" with "{}"'
logger(
message.format(document, ", ".join([t.name for t in relevant_tags])),
logging_group
)
message = 'Tagging "{}" with "{}"'
logger.info(
message.format(
document, ", ".join([t.name for t in relevant_tags])
),
extra={'group': logging_group}
)
document.tags.add(*relevant_tags)
document.tags.add(*relevant_tags)
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
with FileLock(settings.MEDIA_LOCK):
for f in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if os.path.isfile(f):
for filename in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if filename and os.path.isfile(filename):
try:
os.unlink(f)
logging.getLogger(__name__).debug(
f"Deleted file {f}.")
os.unlink(filename)
logger.debug(
f"Deleted file {filename}.")
except OSError as e:
logging.getLogger(__name__).warning(
logger.warning(
f"While deleting document {str(instance)}, the file "
f"{f} could not be deleted: {e}"
f"{filename} could not be deleted: {e}"
)
delete_empty_directories(
@@ -168,27 +244,30 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
root=settings.ORIGINALS_DIR
)
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
if instance.has_archive_version:
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
class CannotMoveFilesException(Exception):
pass
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
logger.fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return False
raise CannotMoveFilesException()
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning(
logger.warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return False
return True
raise CannotMoveFilesException()
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@@ -207,56 +286,61 @@ def update_filename_and_move_files(sender, instance, **kwargs):
return
with FileLock(settings.MEDIA_LOCK):
old_filename = instance.filename
new_filename = generate_unique_filename(
instance, settings.ORIGINALS_DIR)
try:
old_filename = instance.filename
old_source_path = instance.source_path
if new_filename == instance.filename:
# Don't do anything if it's the same.
return
instance.filename = generate_unique_filename(instance)
move_original = old_filename != instance.filename
old_source_path = instance.source_path
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
if not validate_move(instance, old_source_path, new_source_path):
return
# archive files are optional, archive checksum tells us if we have one,
# since this is None for documents without archived files.
if instance.archive_checksum:
new_archive_filename = archive_name_from_filename(new_filename)
old_archive_filename = instance.archive_filename
old_archive_path = instance.archive_path
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
if not validate_move(instance, old_archive_path, new_archive_path):
if instance.has_archive_version:
instance.archive_filename = generate_unique_filename(
instance, archive_filename=True
)
move_archive = old_archive_filename != instance.archive_filename # NOQA: E501
else:
move_archive = False
if not move_original and not move_archive:
# Don't do anything if filenames did not change.
return
create_source_path_directory(new_archive_path)
else:
old_archive_path = None
new_archive_path = None
if move_original:
validate_move(instance, old_source_path, instance.source_path)
create_source_path_directory(instance.source_path)
os.rename(old_source_path, instance.source_path)
create_source_path_directory(new_source_path)
try:
os.rename(old_source_path, new_source_path)
if instance.archive_checksum:
os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename
if move_archive:
validate_move(
instance, old_archive_path, instance.archive_path)
create_source_path_directory(instance.archive_path)
os.rename(old_archive_path, instance.archive_path)
# Don't save() here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(
filename=new_filename)
filename=instance.filename,
archive_filename=instance.archive_filename,
)
except OSError as e:
instance.filename = old_filename
# this happens when we can't move a file. If that's the case for
# the archive file, we try our best to revert the changes.
# no need to save the instance, the update() has not happened yet.
except (OSError, DatabaseError, CannotMoveFilesException):
# This happens when either:
# - moving the files failed due to file system errors
# - saving to the database failed due to database errors
# In both cases, we need to revert to the original state.
# Try to move files to their original location.
try:
os.rename(new_source_path, old_source_path)
os.rename(new_archive_path, old_archive_path)
if move_original and os.path.isfile(instance.source_path):
os.rename(instance.source_path, old_source_path)
if move_archive and os.path.isfile(instance.archive_path):
os.rename(instance.archive_path, old_archive_path)
except Exception as e:
# This is fine, since:
# A: if we managed to move source from A to B, we will also
@@ -267,16 +351,10 @@ def update_filename_and_move_files(sender, instance, **kwargs):
# B: if moving the original file failed, nothing has changed
# anyway.
pass
except DatabaseError as e:
# this happens after moving files, so move them back into place.
# since moving them once succeeded, it's very likely going to
# succeed again.
os.rename(new_source_path, old_source_path)
if instance.archive_checksum:
os.rename(new_archive_path, old_archive_path)
# restore old values on the instance
instance.filename = old_filename
# again, no need to save the instance, since the actual update()
# operation failed.
instance.archive_filename = old_archive_filename
# finally, remove any empty sub folders. This will do nothing if
# something has failed above.
@@ -284,7 +362,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
if old_archive_path and not os.path.isfile(old_archive_path):
if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
@@ -305,4 +383,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def add_to_index(sender, document, **kwargs):
from documents import index
index.add_or_update_document(document)

File diff suppressed because one or more lines are too long

View File

@@ -42,3 +42,58 @@ body {
border-top-left-radius: 0;
border-top-right-radius: 0;
}
@media (prefers-color-scheme: dark) {
/*
From theme_dark.scss
$primary-dark-mode: #45973a;
$danger-dark-mode: #b71631;
$bg-dark-mode: #161618;
$bg-dark-mode-accent: #21262d;
$bg-light-dark-mode: #1c1c1f;
$text-color-dark-mode: #abb2bf;
$border-color-dark-mode: #47494f;
*/
body {
background-color: #161618 !important;
color: #abb2bf;
}
svg.logo .text {
fill: #abb2bf!important;
}
.form-control:not(.is-invalid):not(.btn) {
border-color: #47494f;
}
.form-control:not(.btn) {
background-color: #161618;
color: #abb2bf;
}
.form-control:not(.btn)::placeholder {
color: #abb2bf;
}
.form-control:not(.btn):focus {
background-color: #1c1c1f !important;
color: #8e97a9 !important;
}
.btn-primary {
color: #fff;
background-color: #17541f;
border-color: #17541f;
}
.btn-primary:hover, .btn-primary:focus {
background-color: #0f3614;
border-color: #0c2c10;
}
.btn-primary:not(:disabled):not(.disabled):active {
background-color: #0c2c10;
border-color: #09220d;
}
}

View File

@@ -6,11 +6,12 @@ from django.db.models.signals import post_save
from whoosh.writing import AsyncWriter
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, load_classifier
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import SanityCheckFailedException
logger = logging.getLogger("paperless.tasks")
def index_optimize():
@@ -19,40 +20,45 @@ def index_optimize():
writer.commit(optimize=True)
def index_reindex():
def index_reindex(progress_bar_disable=False):
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in tqdm.tqdm(documents):
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
index.update_document(writer, document)
def train_classifier():
classifier = DocumentClassifier()
if (not Tag.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not DocumentType.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not Correspondent.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists()):
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
return
classifier = load_classifier()
if not classifier:
classifier = DocumentClassifier()
try:
if classifier.train():
logging.getLogger(__name__).info(
logger.info(
"Saving updated classifier model to {}...".format(
settings.MODEL_FILE)
)
classifier.save_classifier()
classifier.save()
else:
logging.getLogger(__name__).debug(
logger.debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
logger.warning(
"Classifier error: " + str(e)
)
@@ -62,7 +68,8 @@ def consume_file(path,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
override_tag_ids=None,
task_id=None):
document = Consumer().try_consume_file(
path,
@@ -70,7 +77,9 @@ def consume_file(path,
override_title=override_title,
override_correspondent_id=override_correspondent_id,
override_document_type_id=override_document_type_id,
override_tag_ids=override_tag_ids)
override_tag_ids=override_tag_ids,
task_id=task_id
)
if document:
return "Success. New document id {} created".format(
@@ -84,8 +93,15 @@ def consume_file(path,
def sanity_check():
messages = sanity_checker.check_sanity()
if len(messages) > 0:
raise SanityFailedError(messages)
messages.log_messages()
if messages.has_error():
raise SanityCheckFailedException(
"Sanity check failed with errors. See log.")
elif messages.has_warning():
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:
return "Sanity check exited with infos. See log."
else:
return "No issues detected."

View File

@@ -7,14 +7,16 @@
<head>
<meta charset="utf-8">
<title>Paperless-ng</title>
<base href="/">
<base href="{% url 'base' %}">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="username" content="{{username}}">
<meta name="full_name" content="{{full_name}}">
<meta name="cookie_prefix" content="{{cookie_prefix}}">
<meta name="robots" content="noindex,nofollow">
<link rel="icon" type="image/x-icon" href="favicon.ico">
<link rel="manifest" href="{% static webmanifest %}">
<link rel="stylesheet" href="{% static styles_css %}">
<link rel="apple-touch-icon" href="{% static apple_touch_icon %}">
</head>
<body>
<app-root>{% translate "Paperless-ng is loading..." %}</app-root>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown (new image, 6.3 KiB).

Binary file not shown (new image, 17 KiB).

Binary file not shown (new image, 7.7 KiB).

View File

@@ -0,0 +1 @@
This is a test file.

View File

@@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone
from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
@@ -11,49 +12,52 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
return searcher.document(id=doc.id)
def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
@mock.patch("documents.admin.index.add_or_update_document")
def test_save_model(self, m):
def test_save_model(self):
doc = Document.objects.create(title="test")
doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once()
self.assertEqual(self.get_document_from_index(doc)['id'], doc.id)
def test_tags(self):
def test_delete_model(self):
doc = Document.objects.create(title="test")
doc.tags.create(name="t1")
doc.tags.create(name="t2")
index.add_or_update_document(doc)
self.assertIsNotNone(self.get_document_from_index(doc))
self.assertEqual(self.doc_admin.tags_(doc), "<span >t1, </span><span >t2, </span>")
def test_tags_empty(self):
doc = Document.objects.create(title="test")
self.assertEqual(self.doc_admin.tags_(doc), "")
@mock.patch("documents.admin.index.remove_document")
def test_delete_model(self, m):
doc = Document.objects.create(title="test")
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
m.assert_called_once()
@mock.patch("documents.admin.index.remove_document")
def test_delete_queryset(self, m):
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
self.assertIsNone(self.get_document_from_index(doc))
def test_delete_queryset(self):
docs = []
for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)
self.assertEqual(Document.objects.count(), 42)
for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))
self.doc_admin.delete_queryset(None, Document.objects.all())
self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0)
for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

View File

@@ -1,15 +1,21 @@
import datetime
import io
import json
import os
import shutil
import tempfile
import zipfile
from unittest import mock
import pytest
from django.conf import settings
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase
from whoosh.writing import AsyncWriter
from documents import index, bulk_edit
from documents.models import Document, Correspondent, DocumentType, Tag, SavedView
from documents.models import Document, Correspondent, DocumentType, Tag, SavedView, MatchingModel
from documents.tests.utils import DirectoriesMixin
@@ -144,21 +150,19 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_thumbnail)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_download_with_archive(self):
_, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
content = b"This is a test"
content_archive = b"This is the same test but archived"
with open(filename, "wb") as f:
f.write(content)
filename = os.path.basename(filename)
doc = Document.objects.create(title="none", filename=filename,
doc = Document.objects.create(title="none", filename="my_document.pdf",
archive_filename="archived.pdf",
mime_type="application/pdf")
with open(doc.source_path, "wb") as f:
f.write(content)
with open(doc.archive_path, "wb") as f:
f.write(content_archive)
@@ -228,6 +232,12 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 2)
self.assertCountEqual([results[0]['id'], results[1]['id']], [doc1.id, doc3.id])
response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_2.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 2)
self.assertCountEqual([results[0]['id'], results[1]['id']], [doc2.id, doc3.id])
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
@@ -261,10 +271,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
results = response.data['results']
self.assertEqual(len(results), 0)
def test_search_no_query(self):
response = self.client.get("/api/search/")
results = response.data['results']
def test_documents_title_content_filter(self):
doc1 = Document.objects.create(title="title A", content="content A", checksum="A", mime_type="application/pdf")
doc2 = Document.objects.create(title="title B", content="content A", checksum="B", mime_type="application/pdf")
doc3 = Document.objects.create(title="title A", content="content B", checksum="C", mime_type="application/pdf")
doc4 = Document.objects.create(title="title B", content="content B", checksum="D", mime_type="application/pdf")
response = self.client.get("/api/documents/?title_content=A")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 3)
self.assertCountEqual([results[0]['id'], results[1]['id'], results[2]['id']], [doc1.id, doc2.id, doc3.id])
response = self.client.get("/api/documents/?title_content=B")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 3)
self.assertCountEqual([results[0]['id'], results[1]['id'], results[2]['id']], [doc2.id, doc3.id, doc4.id])
response = self.client.get("/api/documents/?title_content=X")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 0)
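The three filters exercised above (tags__id__in, tags__id__all and title_content) are plain query-string parameters on the documents list endpoint. A small, hypothetical helper for building them, shown only for illustration and not part of the paperless codebase:

    def build_document_filter(tag_ids=None, match_all_tags=False, title_content=None):
        """Build query parameters for GET /api/documents/ (illustration only)."""
        params = {}
        if tag_ids:
            key = "tags__id__all" if match_all_tags else "tags__id__in"
            params[key] = ",".join(str(pk) for pk in tag_ids)
        if title_content:
            params["title_content"] = title_content
        return params

    # build_document_filter(tag_ids=[2, 3], match_all_tags=True)
    # -> {"tags__id__all": "2,3"}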
def test_search(self):
@@ -278,32 +306,24 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/search/?query=bank")
response = self.client.get("/api/documents/?query=bank")
results = response.data['results']
self.assertEqual(response.data['count'], 3)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 3)
response = self.client.get("/api/search/?query=september")
response = self.client.get("/api/documents/?query=september")
results = response.data['results']
self.assertEqual(response.data['count'], 1)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 1)
response = self.client.get("/api/search/?query=statement")
response = self.client.get("/api/documents/?query=statement")
results = response.data['results']
self.assertEqual(response.data['count'], 2)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 2)
response = self.client.get("/api/search/?query=sfegdfg")
response = self.client.get("/api/documents/?query=sfegdfg")
results = response.data['results']
self.assertEqual(response.data['count'], 0)
self.assertEqual(response.data['page'], 0)
self.assertEqual(response.data['page_count'], 0)
self.assertEqual(len(results), 0)
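Outside the Django test client, the same full-text query now goes through the documents list endpoint instead of /api/search/. A hedged example using requests, where the host and token are placeholders and the response keys follow the assertions above:

    import requests

    response = requests.get(
        "http://localhost:8000/api/documents/",
        params={"query": "bank", "page": 1, "page_size": 10},
        headers={"Authorization": "Token <api-token>"},  # placeholder credentials
    )
    data = response.json()
    print(data["count"], [hit["id"] for hit in data["results"]])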
def test_search_multi_page(self):
@@ -316,53 +336,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
seen_ids = []
for i in range(1, 6):
response = self.client.get(f"/api/search/?query=content&page={i}")
response = self.client.get(f"/api/documents/?query=content&page={i}&page_size=10")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], i)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 10)
for result in results:
self.assertNotIn(result['id'], seen_ids)
seen_ids.append(result['id'])
response = self.client.get(f"/api/search/?query=content&page=6")
response = self.client.get(f"/api/documents/?query=content&page=6&page_size=10")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], 6)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 5)
for result in results:
self.assertNotIn(result['id'], seen_ids)
seen_ids.append(result['id'])
response = self.client.get(f"/api/search/?query=content&page=7")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], 6)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 5)
def test_search_invalid_page(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(15):
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
index.update_document(writer, doc)
first_page = self.client.get(f"/api/search/?query=content&page=1").data
second_page = self.client.get(f"/api/search/?query=content&page=2").data
should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data
should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data
should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data
should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data
self.assertDictEqual(first_page, should_be_first_page_1)
self.assertDictEqual(first_page, should_be_first_page_2)
self.assertDictEqual(first_page, should_be_first_page_3)
self.assertDictEqual(first_page, should_be_first_page_4)
self.assertNotEqual(len(first_page['results']), len(second_page['results']))
response = self.client.get(f"/api/documents/?query=content&page=0&page_size=10")
self.assertEqual(response.status_code, 404)
response = self.client.get(f"/api/documents/?query=content&page=3&page_size=10")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
@@ -386,6 +387,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
@pytest.mark.skip(reason="Not implemented yet")
def test_search_spelling_correction(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):
@@ -411,7 +413,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get(f"/api/search/?more_like={d2.id}")
response = self.client.get(f"/api/documents/?more_like_id={d2.id}")
self.assertEqual(response.status_code, 200)
@@ -421,6 +423,79 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(results[0]['id'], d3.id)
self.assertEqual(results[1]['id'], d1.id)
def test_search_filtering(self):
t = Tag.objects.create(name="tag")
t2 = Tag.objects.create(name="tag2")
c = Correspondent.objects.create(name="correspondent")
dt = DocumentType.objects.create(name="type")
d1 = Document.objects.create(checksum="1", correspondent=c, content="test")
d2 = Document.objects.create(checksum="2", document_type=dt, content="test")
d3 = Document.objects.create(checksum="3", content="test")
d3.tags.add(t)
d3.tags.add(t2)
d4 = Document.objects.create(checksum="4", created=datetime.datetime(2020, 7, 13), content="test")
d4.tags.add(t2)
d5 = Document.objects.create(checksum="5", added=datetime.datetime(2020, 7, 13), content="test")
d6 = Document.objects.create(checksum="6", content="test2")
with AsyncWriter(index.open_index()) as writer:
for doc in Document.objects.all():
index.update_document(writer, doc)
def search_query(q):
r = self.client.get("/api/documents/?query=test" + q)
self.assertEqual(r.status_code, 200)
return [hit['id'] for hit in r.data['results']]
self.assertCountEqual(search_query(""), [d1.id, d2.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&is_tagged=true"), [d3.id, d4.id])
self.assertCountEqual(search_query("&is_tagged=false"), [d1.id, d2.id, d5.id])
self.assertCountEqual(search_query("&correspondent__id=" + str(c.id)), [d1.id])
self.assertCountEqual(search_query("&document_type__id=" + str(dt.id)), [d2.id])
self.assertCountEqual(search_query("&correspondent__isnull"), [d2.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&document_type__isnull"), [d1.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t.id) + "," + str(t2.id)), [d3.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t.id)), [d3.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t2.id)), [d3.id, d4.id])
self.assertIn(d4.id, search_query("&created__date__lt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d4.id, search_query("&created__date__gt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d4.id, search_query("&created__date__lt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d4.id, search_query("&created__date__gt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d5.id, search_query("&added__date__lt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d5.id, search_query("&added__date__gt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d5.id, search_query("&added__date__lt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d5.id, search_query("&added__date__gt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
def test_search_sorting(self):
c1 = Correspondent.objects.create(name="corres Ax")
c2 = Correspondent.objects.create(name="corres Cx")
c3 = Correspondent.objects.create(name="corres Bx")
d1 = Document.objects.create(checksum="1", correspondent=c1, content="test", archive_serial_number=2, title="3")
d2 = Document.objects.create(checksum="2", correspondent=c2, content="test", archive_serial_number=3, title="2")
d3 = Document.objects.create(checksum="3", correspondent=c3, content="test", archive_serial_number=1, title="1")
with AsyncWriter(index.open_index()) as writer:
for doc in Document.objects.all():
index.update_document(writer, doc)
def search_query(q):
r = self.client.get("/api/documents/?query=test" + q)
self.assertEqual(r.status_code, 200)
return [hit['id'] for hit in r.data['results']]
self.assertListEqual(search_query("&ordering=archive_serial_number"), [d3.id, d1.id, d2.id])
self.assertListEqual(search_query("&ordering=-archive_serial_number"), [d2.id, d1.id, d3.id])
self.assertListEqual(search_query("&ordering=title"), [d3.id, d2.id, d1.id])
self.assertListEqual(search_query("&ordering=-title"), [d1.id, d2.id, d3.id])
self.assertListEqual(search_query("&ordering=correspondent__name"), [d1.id, d3.id, d2.id])
self.assertListEqual(search_query("&ordering=-correspondent__name"), [d2.id, d3.id, d1.id])
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
@@ -436,6 +511,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)
def test_statistics_no_inbox_tag(self):
Document.objects.create(title="none1", checksum="A")
response = self.client.get("/api/statistics/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['documents_inbox'], None)
@mock.patch("documents.views.async_task")
def test_upload(self, m):
@@ -569,10 +651,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
def test_get_metadata(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A")
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A", archive_filename="archive.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), doc.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.archive_path)
source_file = os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png")
archive_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
shutil.copy(source_file, doc.source_path)
shutil.copy(archive_file, doc.archive_path)
response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
self.assertEqual(response.status_code, 200)
@@ -583,6 +668,14 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertTrue(meta['has_archive_version'])
self.assertEqual(len(meta['original_metadata']), 0)
self.assertGreater(len(meta['archive_metadata']), 0)
self.assertEqual(meta['media_filename'], "file.pdf")
self.assertEqual(meta['archive_media_filename'], "archive.pdf")
self.assertEqual(meta['original_size'], os.stat(source_file).st_size)
self.assertEqual(meta['archive_size'], os.stat(archive_file).st_size)
def test_get_metadata_invalid_doc(self):
response = self.client.get(f"/api/documents/34576/metadata/")
self.assertEqual(response.status_code, 404)
def test_get_metadata_no_archive(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
@@ -598,6 +691,46 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertFalse(meta['has_archive_version'])
self.assertGreater(len(meta['original_metadata']), 0)
self.assertIsNone(meta['archive_metadata'])
self.assertIsNone(meta['archive_media_filename'])
def test_get_metadata_missing_files(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf", archive_filename="file.pdf", archive_checksum="B", checksum="A")
response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
self.assertEqual(response.status_code, 200)
meta = response.data
self.assertTrue(meta['has_archive_version'])
self.assertIsNone(meta['original_metadata'])
self.assertIsNone(meta['original_size'])
self.assertIsNone(meta['archive_metadata'])
self.assertIsNone(meta['archive_size'])
def test_get_empty_suggestions(self):
doc = Document.objects.create(title="test", mime_type="application/pdf")
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data, {'correspondents': [], 'tags': [], 'document_types': []})
def test_get_suggestions_invalid_doc(self):
response = self.client.get(f"/api/documents/34676/suggestions/")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.views.match_correspondents")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_document_types")
def test_get_suggestions(self, match_document_types, match_tags, match_correspondents):
doc = Document.objects.create(title="test", mime_type="application/pdf", content="this is an invoice!")
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.data, {'correspondents': [88,2], 'tags': [56,123], 'document_types': [23]})
def test_saved_views(self):
u1 = User.objects.create_user("user1")
@@ -683,6 +816,126 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
v1 = SavedView.objects.get(id=v1.id)
self.assertEqual(v1.filter_rules.count(), 0)
def test_get_logs(self):
response = self.client.get("/api/logs/")
self.assertEqual(response.status_code, 200)
self.assertCountEqual(response.data, ["mail", "paperless"])
def test_get_invalid_log(self):
response = self.client.get("/api/logs/bogus_log/")
self.assertEqual(response.status_code, 404)
@override_settings(LOGGING_DIR="bogus_dir")
def test_get_nonexistent_log(self):
response = self.client.get("/api/logs/paperless/")
self.assertEqual(response.status_code, 404)
def test_get_log(self):
log_data = "test\ntest2\n"
with open(os.path.join(settings.LOGGING_DIR, "paperless.log"), "w") as f:
f.write(log_data)
response = self.client.get("/api/logs/paperless/")
self.assertEqual(response.status_code, 200)
self.assertListEqual(response.data, ["test", "test2"])
def test_invalid_regex_other_algorithm(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_ANY,
"match": "["
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
def test_invalid_regex(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_REGEX,
"match": "["
}, format='json')
self.assertEqual(response.status_code, 400, endpoint)
def test_valid_regex(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_REGEX,
"match": "[0-9]"
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
def test_regex_no_algorithm(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"match": "[0-9]"
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
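Taken together, the four tests above pin down the validation rule: a match pattern is only compiled, and possibly rejected, when the matching algorithm is MATCH_REGEX. A sketch of that rule, not the actual serializer code:

    import re

    from documents.models import MatchingModel

    def validate_match(matching_algorithm, match):
        # only regex patterns are compiled; other algorithms accept any string
        if matching_algorithm == MatchingModel.MATCH_REGEX:
            try:
                re.compile(match)
            except re.error:
                raise ValueError(f"Invalid regular expression: {match!r}")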
def test_tag_color_default(self):
response = self.client.post("/api/tags/", {
"name": "tag"
}, format="json")
self.assertEqual(response.status_code, 201)
self.assertEqual(Tag.objects.get(id=response.data['id']).color, "#a6cee3")
self.assertEqual(self.client.get(f"/api/tags/{response.data['id']}/", format="json").data['colour'], 1)
def test_tag_color(self):
response = self.client.post("/api/tags/", {
"name": "tag",
"colour": 3
}, format="json")
self.assertEqual(response.status_code, 201)
self.assertEqual(Tag.objects.get(id=response.data['id']).color, "#b2df8a")
self.assertEqual(self.client.get(f"/api/tags/{response.data['id']}/", format="json").data['colour'], 3)
def test_tag_color_invalid(self):
response = self.client.post("/api/tags/", {
"name": "tag",
"colour": 34
}, format="json")
self.assertEqual(response.status_code, 400)
def test_tag_color_custom(self):
tag = Tag.objects.create(name="test", color="#abcdef")
self.assertEqual(self.client.get(f"/api/tags/{tag.id}/", format="json").data['colour'], 1)
class TestDocumentApiV2(DirectoriesMixin, APITestCase):
def setUp(self):
super(TestDocumentApiV2, self).setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=self.user)
self.client.defaults['HTTP_ACCEPT'] = 'application/json; version=2'
def test_tag_validate_color(self):
self.assertEqual(self.client.post("/api/tags/", {"name": "test", "color": "#12fFaA"}, format="json").status_code, 201)
self.assertEqual(self.client.post("/api/tags/", {"name": "test1", "color": "abcdef"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test2", "color": "#abcdfg"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test3", "color": "#asd"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test4", "color": "#12121212"}, format="json").status_code, 400)
def test_tag_text_color(self):
t = Tag.objects.create(name="tag1", color="#000000")
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#ffffff")
t.color = "#ffffff"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
t.color = "asdf"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
t.color = "123"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
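The expectations above (white text on a black tag, black text on a white tag, and black as the fallback for unparseable values) are consistent with a simple luminance check. One plausible implementation, which may differ from the real model property:

    import re

    def text_color(color):
        if not re.fullmatch(r"#[0-9a-fA-F]{6}", color or ""):
            return "#000000"  # fallback for values like "asdf" or "123"
        r, g, b = (int(color[i:i + 2], 16) for i in (1, 3, 5))
        luminance = 0.299 * r + 0.587 * g + 0.114 * b
        return "#ffffff" if luminance < 128 else "#000000"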
class TestBulkEdit(DirectoriesMixin, APITestCase):
@@ -1037,6 +1290,113 @@ class TestBulkEdit(DirectoriesMixin, APITestCase):
self.assertCountEqual(response.data['selected_document_types'], [{"id": self.c1.id, "document_count": 1}, {"id": self.c2.id, "document_count": 0}])
class TestBulkDownload(DirectoriesMixin, APITestCase):
def setUp(self):
super(TestBulkDownload, self).setUp()
user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=user)
self.doc1 = Document.objects.create(title="unrelated", checksum="A")
self.doc2 = Document.objects.create(title="document A", filename="docA.pdf", mime_type="application/pdf", checksum="B", created=datetime.datetime(2021, 1, 1))
self.doc2b = Document.objects.create(title="document A", filename="docA2.pdf", mime_type="application/pdf", checksum="D", created=datetime.datetime(2021, 1, 1))
self.doc3 = Document.objects.create(title="document B", filename="docB.jpg", mime_type="image/jpeg", checksum="C", created=datetime.datetime(2020, 3, 21), archive_filename="docB.pdf", archive_checksum="D")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.doc2.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), self.doc2b.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.jpg"), self.doc3.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "test_with_bom.pdf"), self.doc3.archive_path)
def test_download_originals(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id],
"content": "originals"
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2020-03-21 document B.jpg", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc3.source_file as f:
self.assertEqual(f.read(), zipf.read("2020-03-21 document B.jpg"))
def test_download_default(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id]
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2020-03-21 document B.pdf", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc3.archive_file as f:
self.assertEqual(f.read(), zipf.read("2020-03-21 document B.pdf"))
def test_download_both(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id],
"content": "both"
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 3)
self.assertIn("originals/2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("archive/2020-03-21 document B.pdf", zipf.namelist())
self.assertIn("originals/2020-03-21 document B.jpg", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("originals/2021-01-01 document A.pdf"))
with self.doc3.archive_file as f:
self.assertEqual(f.read(), zipf.read("archive/2020-03-21 document B.pdf"))
with self.doc3.source_file as f:
self.assertEqual(f.read(), zipf.read("originals/2020-03-21 document B.jpg"))
def test_filename_clashes(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc2b.id]
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2021-01-01 document A_01.pdf", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc2b.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A_01.pdf"))
def test_compression(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc2b.id],
"compression": "lzma"
}), content_type='application/json')
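For illustration, the bulk_download endpoint used by these tests can be driven from an ordinary HTTP client as well; the host, token and output path below are placeholders, and the parameter values are the ones exercised above:

    import requests

    response = requests.post(
        "http://localhost:8000/api/documents/bulk_download/",
        json={"documents": [2, 3], "content": "both", "compression": "lzma"},
        headers={"Authorization": "Token <api-token>"},  # placeholder credentials
    )
    with open("documents.zip", "wb") as f:
        f.write(response.content)  # a zip archive, per the Content-Type assertions above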
class TestApiAuth(APITestCase):
def test_auth_required(self):
@@ -1057,7 +1417,20 @@ class TestApiAuth(APITestCase):
self.assertEqual(self.client.get("/api/logs/").status_code, 401)
self.assertEqual(self.client.get("/api/saved_views/").status_code, 401)
self.assertEqual(self.client.get("/api/search/").status_code, 401)
self.assertEqual(self.client.get("/api/search/auto_complete/").status_code, 401)
self.assertEqual(self.client.get("/api/search/autocomplete/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/bulk_edit/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/bulk_download/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/selection_data/").status_code, 401)
def test_api_version_no_auth(self):
response = self.client.get("/api/")
self.assertNotIn("X-Api-Version", response)
self.assertNotIn("X-Version", response)
def test_api_version_with_auth(self):
user = User.objects.create_superuser(username="test")
self.client.force_login(user)
response = self.client.get("/api/")
self.assertIn("X-Api-Version", response)
self.assertIn("X-Version", response)
View File
@@ -1,10 +1,13 @@
import os
import tempfile
from time import sleep
from pathlib import Path
from unittest import mock
import pytest
from django.conf import settings
from django.test import TestCase, override_settings
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin
@@ -82,37 +85,19 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertTrue(self.classifier.train())
self.assertFalse(self.classifier.train())
self.classifier.save_classifier()
self.classifier.save()
classifier2 = DocumentClassifier()
current_ver = DocumentClassifier.FORMAT_VERSION
with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
# ensure that we won't load old classifiers.
self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)
self.assertRaises(IncompatibleClassifierVersionError, classifier2.load)
self.classifier.save_classifier()
self.classifier.save()
# ensure that we can load the classifier after saving it.
classifier2.reload()
def testReload(self):
self.generate_test_data()
self.assertTrue(self.classifier.train())
self.classifier.save_classifier()
classifier2 = DocumentClassifier()
classifier2.reload()
v1 = classifier2.classifier_version
# change the classifier after some time.
sleep(1)
self.classifier.save_classifier()
classifier2.reload()
v2 = classifier2.classifier_version
self.assertNotEqual(v1, v2)
classifier2.load()
@override_settings(DATA_DIR=tempfile.mkdtemp())
def testSaveClassifier(self):
@@ -121,12 +106,21 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.classifier.train()
self.classifier.save_classifier()
self.classifier.save()
new_classifier = DocumentClassifier()
new_classifier.reload()
new_classifier.load()
self.assertFalse(new_classifier.train())
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
def test_load_and_classify(self):
self.generate_test_data()
new_classifier = DocumentClassifier()
new_classifier.load()
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
def test_one_correspondent_predict(self):
c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
@@ -235,3 +229,42 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.classifier.train()
self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
def test_load_classifier_not_exists(self):
self.assertFalse(os.path.exists(settings.MODEL_FILE))
self.assertIsNone(load_classifier())
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier(self, load):
Path(settings.MODEL_FILE).touch()
self.assertIsNotNone(load_classifier())
load.assert_called_once()
@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
@pytest.mark.skip(reason="Disabled caching due to high memory usage - need to investigate.")
def test_load_classifier_cached(self):
classifier = load_classifier()
self.assertIsNotNone(classifier)
with mock.patch("documents.classifier.DocumentClassifier.load") as load:
classifier2 = load_classifier()
load.assert_not_called()
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier_incompatible_version(self, load):
Path(settings.MODEL_FILE).touch()
self.assertTrue(os.path.exists(settings.MODEL_FILE))
load.side_effect = IncompatibleClassifierVersionError()
self.assertIsNone(load_classifier())
self.assertFalse(os.path.exists(settings.MODEL_FILE))
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier_os_error(self, load):
Path(settings.MODEL_FILE).touch()
self.assertTrue(os.path.exists(settings.MODEL_FILE))
load.side_effect = OSError()
self.assertIsNone(load_classifier())
self.assertTrue(os.path.exists(settings.MODEL_FILE))
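Read together, the new tests describe load_classifier() as: return None when no model file exists, delete the model file and return None when the stored format version is incompatible, and keep the file but still return None on an OSError. A sketch of that behaviour, not the actual documents.classifier code:

    import os

    from django.conf import settings
    from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError

    def load_classifier():
        if not os.path.isfile(settings.MODEL_FILE):
            return None
        classifier = DocumentClassifier()
        try:
            classifier.load()
        except IncompatibleClassifierVersionError:
            # the stored model uses an old format: discard it so it gets retrained
            os.unlink(settings.MODEL_FILE)
            return None
        except OSError:
            # read error: keep the file and simply skip classification for now
            return None
        return classifier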
View File
@@ -5,12 +5,14 @@ import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.conf import settings
from django.test import TestCase, override_settings
from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
from ..tasks import sanity_check
class TestAttributes(TestCase):
@@ -165,25 +167,43 @@ class TestFieldPermutations(TestCase):
class DummyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
def __init__(self, logging_group, scratch_dir, archive_path):
super(DummyParser, self).__init__(logging_group)
super(DummyParser, self).__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
self.archive_path = archive_path
def get_optimised_thumbnail(self, document_path, mime_type):
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
self.text = "The Text"
class CopyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def __init__(self, logging_group, progress_callback=None):
super(CopyParser, self).__init__(logging_group, progress_callback)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=self.tempdir)
def parse(self, document_path, mime_type, file_name=None):
self.text = "The text"
self.archive_path = os.path.join(self.tempdir, "archive.pdf")
shutil.copy(document_path, self.archive_path)
class FaultyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
@@ -191,7 +211,7 @@ class FaultyParser(DocumentParser):
super(FaultyParser, self).__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self, document_path, mime_type):
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
@@ -203,6 +223,8 @@ def fake_magic_from_file(file, mime=False):
if mime:
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
elif os.path.splitext(file)[1] == ".png":
return "image/png"
else:
return "unknown"
else:
@@ -212,10 +234,24 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):
def make_dummy_parser(self, logging_group):
def _assert_first_last_send_progress(self, first_status="STARTING", last_status="SUCCESS", first_progress=0, first_progress_max=100, last_progress=100, last_progress_max=100):
self._send_progress.assert_called()
args, kwargs = self._send_progress.call_args_list[0]
self.assertEqual(args[0], first_progress)
self.assertEqual(args[1], first_progress_max)
self.assertEqual(args[2], first_status)
args, kwargs = self._send_progress.call_args_list[len(self._send_progress.call_args_list) - 1]
self.assertEqual(args[0], last_progress)
self.assertEqual(args[1], last_progress_max)
self.assertEqual(args[2], last_status)
def make_dummy_parser(self, logging_group, progress_callback=None):
return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())
def make_faulty_parser(self, logging_group):
def make_faulty_parser(self, logging_group, progress_callback=None):
return FaultyParser(logging_group, self.dirs.scratch_dir)
def setUp(self):
@@ -228,7 +264,11 @@ class TestConsumer(DirectoriesMixin, TestCase):
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
self.addCleanup(patcher.stop)
# this prevents websocket message reports during testing.
patcher = mock.patch("documents.consumer.Consumer._send_progress")
self._send_progress = patcher.start()
self.addCleanup(patcher.stop)
self.consumer = Consumer()
@@ -256,6 +296,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIsNone(document.correspondent)
self.assertIsNone(document.document_type)
self.assertEqual(document.filename, "0000001.pdf")
self.assertEqual(document.archive_filename, "0000001.pdf")
self.assertTrue(os.path.isfile(
document.source_path
@@ -274,6 +315,29 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertFalse(os.path.isfile(filename))
self._assert_first_last_send_progress()
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def testDeleteMacFiles(self):
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
filename = self.get_test_file()
shadow_file = os.path.join(self.dirs.scratch_dir, "._sample.pdf")
shutil.copy(filename, shadow_file)
self.assertTrue(os.path.isfile(shadow_file))
document = self.consumer.try_consume_file(filename)
self.assertTrue(os.path.isfile(
document.source_path
))
self.assertFalse(os.path.isfile(shadow_file))
self.assertFalse(os.path.isfile(filename))
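The test above expects macOS AppleDouble companions (here ._sample.pdf) to be removed along with the consumed file. A minimal sketch of such a cleanup step, assuming the consumer simply deletes the shadow file when one exists next to the original:

    import os

    def delete_mac_shadow_file(path):
        shadow = os.path.join(os.path.dirname(path), "._" + os.path.basename(path))
        if os.path.isfile(shadow):
            os.remove(shadow)  # e.g. "._sample.pdf" next to "sample.pdf"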
def testOverrideFilename(self):
filename = self.get_test_file()
override_filename = "Statement for November.pdf"
@@ -282,21 +346,26 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "Statement for November")
self._assert_first_last_send_progress()
def testOverrideTitle(self):
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress()
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
self.assertEqual(document.correspondent.id, c.id)
self._assert_first_last_send_progress()
def testOverrideDocumentType(self):
dt = DocumentType.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
self.assertEqual(document.document_type.id, dt.id)
self._assert_first_last_send_progress()
def testOverrideTags(self):
t1 = Tag.objects.create(name="t1")
@@ -307,37 +376,42 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
self.assertIn(t3, document.tags.all())
self._assert_first_last_send_progress()
def testNotAFile(self):
try:
self.consumer.try_consume_file("non-existing-file")
except ConsumerError as e:
self.assertTrue(str(e).endswith('It is not a file'))
return
self.fail("Should throw exception")
self.assertRaisesMessage(
ConsumerError,
"File not found",
self.consumer.try_consume_file,
"non-existing-file"
)
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates1(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.assertRaisesMessage(
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_file()
)
self.fail("Should throw exception")
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates2(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_archive_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.assertRaisesMessage(
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_archive_file()
)
self.fail("Should throw exception")
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates3(self):
self.consumer.try_consume_file(self.get_test_archive_file())
@@ -347,13 +421,15 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testNoParsers(self, m):
m.return_value = []
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual("Unsupported mime type application/pdf of file sample.pdf", str(e))
return
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: Unsupported mime type application/pdf",
self.consumer.try_consume_file,
self.get_test_file()
)
self._assert_first_last_send_progress(last_status="FAILED")
self.fail("Should throw exception")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
@@ -363,24 +439,28 @@ class TestConsumer(DirectoriesMixin, TestCase):
"weight": 0
})]
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual(str(e), "Does not compute.")
return
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: Error while consuming document sample.pdf: Does not compute.",
self.consumer.try_consume_file,
self.get_test_file()
)
self.fail("Should throw exception.")
self._assert_first_last_send_progress(last_status="FAILED")
@mock.patch("documents.consumer.Consumer._write")
def testPostSaveError(self, m):
filename = self.get_test_file()
m.side_effect = OSError("NO.")
try:
self.consumer.try_consume_file(filename)
except ConsumerError as e:
self.assertEqual(str(e), "NO.")
else:
self.fail("Should raise exception")
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: The following error occured while consuming sample.pdf: NO.",
self.consumer.try_consume_file,
filename
)
self._assert_first_last_send_progress(last_status="FAILED")
# file not deleted
self.assertTrue(os.path.isfile(filename))
@@ -396,6 +476,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "new docs")
self.assertEqual(document.filename, "none/new docs.pdf")
self.assertEqual(document.archive_filename, "none/new docs.pdf")
self._assert_first_last_send_progress()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.generate_unique_filename")
@@ -408,7 +491,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
filenames.insert(0, f)
return f
m.side_effect = lambda f, root: get_filename()
m.side_effect = lambda f, archive_filename = False: get_filename()
filename = self.get_test_file()
@@ -419,8 +502,11 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "new docs")
self.assertIsNotNone(os.path.isfile(document.title))
self.assertTrue(os.path.isfile(document.source_path))
self.assertTrue(os.path.isfile(document.archive_path))
@mock.patch("documents.consumer.DocumentClassifier")
self._assert_first_last_send_progress()
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m):
correspondent = Correspondent.objects.create(name="test")
dtype = DocumentType.objects.create(name="test")
@@ -439,19 +525,26 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
self._assert_first_last_send_progress()
@override_settings(CONSUMER_DELETE_DUPLICATES=True)
def test_delete_duplicate(self):
dst = self.get_test_file()
self.assertTrue(os.path.isfile(dst))
doc = self.consumer.try_consume_file(dst)
self._assert_first_last_send_progress()
self.assertFalse(os.path.isfile(dst))
self.assertIsNotNone(doc)
self._send_progress.reset_mock()
dst = self.get_test_file()
self.assertTrue(os.path.isfile(dst))
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertFalse(os.path.isfile(dst))
self._assert_first_last_send_progress(last_status="FAILED")
@override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self):
@@ -467,6 +560,32 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertTrue(os.path.isfile(dst))
self._assert_first_last_send_progress(last_status="FAILED")
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_similar_filenames(self, m):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
m.return_value = [(None, {
"parser": CopyParser,
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
"weight": 0
})]
doc1 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
doc2 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
doc3 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
self.assertEqual(doc1.filename, "simple.png")
self.assertEqual(doc1.archive_filename, "simple.pdf")
self.assertEqual(doc2.filename, "simple.pdf")
self.assertEqual(doc2.archive_filename, "simple_01.pdf")
self.assertEqual(doc3.filename, "simple.png.pdf")
self.assertEqual(doc3.archive_filename, "simple.png.pdf")
sanity_check()
class PreConsumeTestCase(TestCase):
@@ -479,9 +598,11 @@ class PreConsumeTestCase(TestCase):
m.assert_not_called()
@mock.patch("documents.consumer.Popen")
@mock.patch("documents.consumer.Consumer._send_progress")
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
def test_pre_consume_script_not_found(self, m):
def test_pre_consume_script_not_found(self, m, m2):
c = Consumer()
c.filename = "somefile.pdf"
c.path = "path-to-file"
self.assertRaises(ConsumerError, c.run_pre_consume_script)
@@ -503,7 +624,6 @@ class PreConsumeTestCase(TestCase):
self.assertEqual(command[1], "path-to-file")
class PostConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen")
@@ -519,12 +639,13 @@ class PostConsumeTestCase(TestCase):
m.assert_not_called()
@override_settings(POST_CONSUME_SCRIPT="does-not-exist")
def test_post_consume_script_not_found(self):
@mock.patch("documents.consumer.Consumer._send_progress")
def test_post_consume_script_not_found(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
self.assertRaises(ConsumerError, Consumer().run_post_consume_script, doc)
c = Consumer()
c.filename = "somefile.pdf"
self.assertRaises(ConsumerError, c.run_post_consume_script, doc)
@mock.patch("documents.consumer.Popen")
def test_post_consume_script_simple(self, m):
View File
@@ -1,7 +1,6 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from dateutil import tz
@@ -9,7 +8,6 @@ from django.conf import settings
from django.test import TestCase, override_settings
from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestDate(TestCase):
@@ -152,4 +150,4 @@ class TestDate(TestCase):
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
View File

@@ -201,6 +201,13 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{asn} - {title}")
def test_asn(self):
d1 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=652, checksum="A")
d2 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=None, checksum="B")
self.assertEqual(generate_filename(d1), "652 - the_doc.pdf")
self.assertEqual(generate_filename(d2), "none - the_doc.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
@@ -439,6 +446,18 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(document2.filename, "qwe.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.signals.handlers.Document.objects.filter")
def test_no_update_without_change(self, m):
doc = Document.objects.create(title="document", filename="document.pdf", archive_filename="document.pdf", checksum="A", archive_checksum="B", mime_type="application/pdf")
Path(doc.source_path).touch()
Path(doc.archive_path).touch()
doc.save()
m.assert_not_called()
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@@ -448,7 +467,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -461,7 +480,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@@ -475,7 +494,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertTrue(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@@ -486,14 +505,49 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
def test_move_archive_exists(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
existing_archive_file = os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")
Path(original).touch()
Path(archive).touch()
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
Path(existing_archive_file).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(existing_archive_file))
self.assertEqual(doc.archive_filename, "none/my_doc_01.pdf")
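The new assertion above (archive_filename == "none/my_doc_01.pdf") shows the collision handling: when the generated name is already taken, a two-digit counter is appended. A rough sketch of that uniquification, not the real generate_unique_filename() implementation:

    import os

    def unique_name(directory, name):
        root, ext = os.path.splitext(name)
        candidate, counter = name, 0
        while os.path.exists(os.path.join(directory, candidate)):
            counter += 1
            candidate = f"{root}_{counter:02}{ext}"
        return candidate

    # unique_name(ARCHIVE_DIR, "none/my_doc.pdf") -> "none/my_doc_01.pdf"
    # when the original name already exists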
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_move_original_only(self):
original = os.path.join(settings.ORIGINALS_DIR, "document_01.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "document.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document_01.pdf", checksum="A",
archive_checksum="B", archive_filename="document.pdf")
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_move_archive_only(self):
original = os.path.join(settings.ORIGINALS_DIR, "document.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "document_01.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document.pdf", checksum="A",
archive_checksum="B", archive_filename="document_01.pdf")
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@@ -514,8 +568,9 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
m.assert_called()
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
@@ -527,7 +582,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
#Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertFalse(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -551,19 +606,21 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
m.assert_called()
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_archive_deleted(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -577,6 +634,28 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
self.assertFalse(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_archive_deleted2(self):
original = os.path.join(settings.ORIGINALS_DIR, "document.png")
original2 = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(original2).touch()
Path(archive).touch()
doc1 = Document.objects.create(mime_type="image/png", title="document", filename="document.png", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc2 = Document.objects.create(mime_type="application/pdf", title="0000001", filename="0000001.pdf", checksum="C")
self.assertTrue(os.path.isfile(doc1.source_path))
self.assertTrue(os.path.isfile(doc1.archive_path))
self.assertTrue(os.path.isfile(doc2.source_path))
doc2.delete()
self.assertTrue(os.path.isfile(doc1.source_path))
self.assertTrue(os.path.isfile(doc1.archive_path))
self.assertFalse(os.path.isfile(doc2.source_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_database_error(self):
@@ -584,7 +663,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
m.side_effect = DatabaseError()
doc.save()
@@ -594,6 +673,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
class TestFilenameGeneration(TestCase):
@override_settings(
@@ -617,7 +697,7 @@ class TestFilenameGeneration(TestCase):
def run():
doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow")
doc.filename = generate_unique_filename(doc, settings.ORIGINALS_DIR)
doc.filename = generate_unique_filename(doc)
Path(doc.thumbnail_path).touch()
with open(doc.source_path, "w") as f:
f.write(str(uuid.uuid4()))
View File
@@ -1,20 +1,10 @@
from django.test import TestCase
from documents import index
from documents.index import JsonFormatter
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
class JsonFormatterTest(TestCase):
def setUp(self) -> None:
self.formatter = JsonFormatter()
def test_empty_fragments(self):
self.assertListEqual(self.formatter.format([]), [])
class TestAutoComplete(DirectoriesMixin, TestCase):
def test_auto_complete(self):
View File
@@ -1,66 +0,0 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase, override_settings
from ..models import Log
class TestPaperlessLog(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.logger = logging.getLogger(
"documents.management.commands.document_consumer")
@override_settings(DISABLE_DBHANDLER=False)
def test_that_it_saves_at_all(self):
kw = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw)
self.assertEqual(Log.objects.all().count(), 1)
self.logger.info("This is an informational message", extra=kw)
self.assertEqual(Log.objects.all().count(), 2)
self.logger.warning("This is an warning message", extra=kw)
self.assertEqual(Log.objects.all().count(), 3)
self.logger.error("This is an error message", extra=kw)
self.assertEqual(Log.objects.all().count(), 4)
self.logger.critical("This is a critical message", extra=kw)
self.assertEqual(Log.objects.all().count(), 5)
@override_settings(DISABLE_DBHANDLER=False)
def test_groups(self):
kw1 = {"group": uuid.uuid4()}
kw2 = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
self.logger.info("This is an informational message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 1)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)
self.logger.warning("This is an warning message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 2)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)
self.logger.error("This is an error message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 3)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)
self.logger.critical("This is a critical message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)


@@ -20,6 +20,7 @@ from documents.tests.utils import DirectoriesMixin
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestArchiver(DirectoriesMixin, TestCase):
def make_models(self):
@@ -42,9 +43,42 @@ class TestArchiver(DirectoriesMixin, TestCase):
doc = Document.objects.get(id=doc.id)
self.assertIsNotNone(doc.checksum)
self.assertIsNotNone(doc.archive_checksum)
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
self.assertEqual(doc.archive_filename, "none/A.pdf")
def test_unknown_mime_type(self):
doc = self.make_models()
doc.mime_type = "sdgfh"
doc.save()
shutil.copy(sample_file, doc.source_path)
handle_document(doc.pk)
doc = Document.objects.get(id=doc.id)
self.assertIsNotNone(doc.checksum)
self.assertIsNone(doc.archive_checksum)
self.assertIsNone(doc.archive_filename)
self.assertTrue(os.path.isfile(doc.source_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_naming_priorities(self):
doc1 = Document.objects.create(checksum="A", title="document", content="first document", mime_type="application/pdf", filename="document.pdf")
doc2 = Document.objects.create(checksum="B", title="document", content="second document", mime_type="application/pdf", filename="document_01.pdf")
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document.pdf"))
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document_01.pdf"))
handle_document(doc2.pk)
handle_document(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertEqual(doc1.archive_filename, "document.pdf")
self.assertEqual(doc2.archive_filename, "document_01.pdf")
class TestDecryptDocuments(TestCase):
@@ -106,24 +140,27 @@ class TestMakeIndex(TestCase):
class TestRenamer(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_rename(self):
doc = Document.objects.create(title="test", mime_type="application/pdf")
doc = Document.objects.create(title="test", mime_type="image/jpeg")
doc.filename = generate_filename(doc)
doc.archive_filename = generate_filename(doc, archive_filename=True)
doc.save()
Path(doc.source_path).touch()
Path(doc.archive_path).touch()
old_source_path = doc.source_path
with override_settings(PAPERLESS_FILENAME_FORMAT="{title}"):
with override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}"):
call_command("document_renamer")
doc2 = Document.objects.get(id=doc.id)
self.assertEqual(doc2.filename, "test.pdf")
self.assertFalse(os.path.isfile(old_source_path))
self.assertEqual(doc2.filename, "none/test.jpg")
self.assertEqual(doc2.archive_filename, "none/test.pdf")
self.assertFalse(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(doc2.source_path))
self.assertTrue(os.path.isfile(doc2.archive_path))
class TestCreateClassifier(TestCase):
@@ -133,3 +170,24 @@ class TestCreateClassifier(TestCase):
call_command("document_create_classifier")
m.assert_called_once()
class TestSanityChecker(DirectoriesMixin, TestCase):
def test_no_issues(self):
with self.assertLogs() as capture:
call_command("document_sanity_checker")
self.assertEqual(len(capture.output), 1)
self.assertIn("Sanity checker detected no issues.", capture.output[0])
def test_errors(self):
doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc")
Path(doc.source_path).touch()
Path(doc.thumbnail_path).touch()
with self.assertLogs() as capture:
call_command("document_sanity_checker")
self.assertEqual(len(capture.output), 1)
self.assertIn("Checksum mismatch of document", capture.output[0])


@@ -60,10 +60,10 @@ class ConsumerMixin:
super(ConsumerMixin, self).tearDown()
def wait_for_task_mock_call(self):
def wait_for_task_mock_call(self, expected_call_count=1):
n = 0
while n < 100:
if self.task_mock.call_count > 0:
if self.task_mock.call_count >= expected_call_count:
# give task_mock some time to finish and raise errors
sleep(1)
return
@@ -202,8 +202,44 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
def test_mac_write(self):
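# macOS drops metadata files (.DS_STORE and AppleDouble "._*" files) next to real
# documents; the consumer must skip them and only pick up the actual PDFs.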
self.task_mock.side_effect = self.bogus_task
@override_settings(CONSUMER_POLLING=1)
self.t_start()
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, ".DS_STORE"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_second_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_second_file.pdf"))
sleep(5)
self.wait_for_task_mock_call(expected_call_count=2)
self.assertEqual(2, self.task_mock.call_count)
fnames = [os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list]
self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
def test_is_ignored(self):
test_paths = [
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
(os.path.join(self.dirs.consumption_dir, "foo","bar.pdf"), False),
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"), True),
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
]
for file_path, expected_ignored in test_paths:
self.assertEqual(
expected_ignored,
document_consumer._is_ignored(file_path),
f'_is_ignored("{file_path}") != {expected_ignored}')
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerPolling(TestConsumer):
# just do all the tests with polling
pass
@@ -215,8 +251,7 @@ class TestConsumerRecursive(TestConsumer):
pass
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerRecursivePolling(TestConsumer):
# just do all the tests with polling and recursive
pass
@@ -257,6 +292,6 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
# their order.
self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
def test_consume_file_with_path_tags_polling(self):
self.test_consume_file_with_path_tags()


@@ -22,7 +22,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.target = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, self.target)
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf")
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")
self.d4 = Document.objects.create(content="Content", checksum="82186aaa94f0b98697d704b90fd1c072", title="wow_dec", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
@@ -69,7 +69,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 7)
self.assertEqual(len(manifest), 8)
self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 4)
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))


@@ -11,14 +11,17 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
self.d4 = Document.objects.create(checksum="D", title="D", content="auto document")
self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")
self.tag_auto = Tag.objects.create(name="tagauto", matching_algorithm=Tag.MATCH_AUTO)
self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)
self.d4.tags.add(self.tag_auto)
self.correspondent_first = Correspondent.objects.create(
@@ -32,7 +35,8 @@ class TestRetagger(DirectoriesMixin, TestCase):
name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)
def get_updated_docs(self):
return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C")
return Document.objects.get(title="A"), Document.objects.get(title="B"), \
Document.objects.get(title="C"), Document.objects.get(title="D")
def setUp(self) -> None:
super(TestRetagger, self).setUp()
@@ -40,25 +44,26 @@ class TestRetagger(DirectoriesMixin, TestCase):
def test_add_tags(self):
call_command('document_retagger', '--tags')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 1)
self.assertEqual(d_second.tags.count(), 1)
self.assertEqual(d_unrelated.tags.count(), 2)
self.assertEqual(d_auto.tags.count(), 1)
self.assertEqual(d_first.tags.first(), self.tag_first)
self.assertEqual(d_second.tags.first(), self.tag_second)
def test_add_type(self):
call_command('document_retagger', '--document_type')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, self.doctype_first)
self.assertEqual(d_second.document_type, self.doctype_second)
def test_add_correspondent(self):
call_command('document_retagger', '--correspondent')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, self.correspondent_first)
self.assertEqual(d_second.correspondent, self.correspondent_second)
@@ -68,11 +73,55 @@ class TestRetagger(DirectoriesMixin, TestCase):
call_command('document_retagger', '--tags', '--overwrite')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])
self.assertEqual(d_auto.tags.count(), 0)
def test_add_tags_suggest(self):
call_command('document_retagger', '--tags', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest(self):
call_command('document_retagger', '--document_type', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest(self):
call_command('document_retagger', '--correspondent', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)
def test_add_tags_suggest_url(self):
call_command('document_retagger', '--tags', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest_url(self):
call_command('document_retagger', '--document_type', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest_url(self):
call_command('document_retagger', '--correspondent', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)


@@ -0,0 +1,66 @@
import os
import shutil
from unittest import mock
from django.contrib.auth.models import User
from django.core.management import call_command
from django.test import TestCase
from documents.management.commands.document_thumbnails import _process_document
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin
class TestManageSuperUser(DirectoriesMixin, TestCase):
def reset_environment(self):
if "PAPERLESS_ADMIN_USER" in os.environ:
del os.environ["PAPERLESS_ADMIN_USER"]
if "PAPERLESS_ADMIN_PASSWORD" in os.environ:
del os.environ["PAPERLESS_ADMIN_PASSWORD"]
def setUp(self) -> None:
super().setUp()
self.reset_environment()
def tearDown(self) -> None:
super().tearDown()
self.reset_environment()
def test_no_user(self):
call_command("manage_superuser")
# just the consumer user.
self.assertEqual(User.objects.count(), 1)
self.assertTrue(User.objects.filter(username="consumer").exists())
def test_create(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "123456"
call_command("manage_superuser")
user: User = User.objects.get_by_natural_key("new_user")
self.assertTrue(user.check_password("123456"))
def test_update(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "123456"
call_command("manage_superuser")
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "more_secure_pwd_7645"
call_command("manage_superuser")
user: User = User.objects.get_by_natural_key("new_user")
self.assertTrue(user.check_password("more_secure_pwd_7645"))
def test_no_password(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
call_command("manage_superuser")
with self.assertRaises(User.DoesNotExist):
User.objects.get_by_natural_key("new_user")


@@ -0,0 +1,325 @@
import hashlib
import os
import shutil
from pathlib import Path
from unittest import mock
from django.conf import settings
from django.test import override_settings
from documents.parsers import ParseError
from documents.tests.utils import DirectoriesMixin, TestMigrations
STORAGE_TYPE_GPG = "gpg"
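# Note: the helpers below re-implement the pre- and post-migration path logic,
# because the historical model classes used in migration tests do not carry the
# Document model's custom path properties.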
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
def thumbnail_path(doc):
file_name = "{:07}.png".format(doc.pk)
if doc.storage_type == STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.THUMBNAIL_DIR,
file_name
)
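# Test fixture helper: creates a document through the historical model class,
# copies the given sample files into the originals/archive directories and
# records their MD5 checksums, roughly mimicking what the consumer would do.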
def make_test_document(document_class, title: str, mime_type: str, original: str, original_filename: str, archive: str = None, archive_filename: str = None):
doc = document_class()
doc.filename = original_filename
doc.title = title
doc.mime_type = mime_type
doc.content = "the content, does not matter for this test"
doc.save()
shutil.copy2(original, source_path(doc))
with open(original, "rb") as f:
doc.checksum = hashlib.md5(f.read()).hexdigest()
if archive:
if archive_filename:
doc.archive_filename = archive_filename
shutil.copy2(archive, archive_path_new(doc))
else:
shutil.copy2(archive, archive_path_old(doc))
with open(archive, "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
doc.save()
Path(thumbnail_path(doc)).touch()
return doc
simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf")
simple_pdf3 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000003.pdf")
simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt")
simple_png = os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png")
simple_png2 = os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf3, "unrelated.pdf", simple_pdf)
self.no_text = make_test_document(Document, "no-text", "image/png", simple_png2, "no-text.png", simple_pdf)
self.doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
self.clash1 = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
self.clash2 = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf)
self.clash3 = make_test_document(Document, "clash", "image/png", simple_png, "clash.png", simple_pdf)
self.clash4 = make_test_document(Document, "clash.png", "application/pdf", simple_pdf2, "clash.png.pdf", simple_pdf2)
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash2))
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash3))
self.assertNotEqual(archive_path_old(self.clash1), archive_path_old(self.clash4))
def testArchiveFilesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
for doc in Document.objects.all():
if doc.archive_checksum:
self.assertIsNotNone(doc.archive_filename)
self.assertTrue(os.path.isfile(archive_path_new(doc)))
else:
self.assertIsNone(doc.archive_filename)
with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_new(doc)))
with open(archive_path_new(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 6)
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, f"{self.clash1.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, f"{self.clash2.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, f"{self.clash3.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, "none/clash.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, "none/clash_01.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, "none/clash_02.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
def fake_parse_wrapper(parser, path, mime_type, file_name):
parser.archive_path = None
parser.text = "the text"
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
auto_migrate = False
def test_archive_missing(self):
Document = self.apps.get_model("documents", "Document")
doc = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
os.unlink(archive_path_old(doc))
self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
def test_parser_missing(self):
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_error(self, m):
m.side_effect = ParseError()
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
self.assertIsNotNone(doc1.archive_checksum)
self.assertIsNotNone(doc2.archive_checksum)
with self.assertLogs() as capture:
self.performMigration()
self.assertEqual(m.call_count, 6)
self.assertEqual(
len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
4)
self.assertEqual(
len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
2)
Document = self.apps.get_model("documents", "Document")
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_no_archive(self, m):
m.side_effect = fake_parse_wrapper
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
with self.assertLogs() as capture:
self.performMigration()
self.assertEqual(
len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
2)
Document = self.apps.get_model("documents", "Document")
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc_unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf2, "unrelated.txt", simple_pdf2, "unrelated.pdf")
doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_02.pdf")
def testArchiveFilesReverted(self):
Document = self.apps.get_model('documents', 'Document')
for doc in Document.objects.all():
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(archive_path_old(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 2)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesBackwardsWithFilenameFormat(TestMigrateArchiveFilesBackwards):
pass
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwardsErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
auto_migrate = False
def test_filename_clash(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash_02.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.assertRaisesMessage(ValueError, "would clash with another archive filename", self.performMigration)
def test_filename_exists(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.assertRaisesMessage(ValueError, "file already exists.", self.performMigration)


@@ -1,52 +1,11 @@
import os
import shutil
from pathlib import Path
from django.apps import apps
from django.conf import settings
from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import TestCase, TransactionTestCase, override_settings
from django.test import override_settings
from documents.models import Document
from documents.parsers import get_default_file_extension
from documents.tests.utils import DirectoriesMixin
class TestMigrations(TransactionTestCase):
@property
def app(self):
return apps.get_containing_app_config(type(self).__module__).name
migrate_from = None
migrate_to = None
def setUp(self):
super(TestMigrations, self).setUp()
assert self.migrate_from and self.migrate_to, \
"TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
self.migrate_from = [(self.app, self.migrate_from)]
self.migrate_to = [(self.app, self.migrate_to)]
executor = MigrationExecutor(connection)
old_apps = executor.loader.project_state(self.migrate_from).apps
# Reverse to the original migration
executor.migrate(self.migrate_from)
self.setUpBeforeMigration(old_apps)
# Run the migration to test
executor = MigrationExecutor(connection)
executor.loader.build_graph() # reload.
executor.migrate(self.migrate_to)
self.apps = executor.loader.project_state(self.migrate_to).apps
def setUpBeforeMigration(self, apps):
pass
from documents.tests.utils import DirectoriesMixin, TestMigrations
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"


@@ -0,0 +1,15 @@
from documents.tests.utils import DirectoriesMixin, TestMigrations
class TestMigrateNullCharacters(DirectoriesMixin, TestMigrations):
migrate_from = '1014_auto_20210228_1614'
migrate_to = '1015_remove_null_characters'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.doc = Document.objects.create(content="aaa\0bbb")
def testMimeTypesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
self.assertNotIn("\0", Document.objects.get(id=self.doc.id).content)


@@ -0,0 +1,37 @@
from documents.tests.utils import DirectoriesMixin, TestMigrations
class TestMigrateTagColor(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1013_migrate_tag_colour'
def setUpBeforeMigration(self, apps):
Tag = apps.get_model("documents", "Tag")
self.t1_id = Tag.objects.create(name="tag1").id
self.t2_id = Tag.objects.create(name="tag2", colour=1).id
self.t3_id = Tag.objects.create(name="tag3", colour=5).id
def testMimeTypesMigrated(self):
Tag = self.apps.get_model('documents', 'Tag')
self.assertEqual(Tag.objects.get(id=self.t1_id).color, "#a6cee3")
self.assertEqual(Tag.objects.get(id=self.t2_id).color, "#a6cee3")
self.assertEqual(Tag.objects.get(id=self.t3_id).color, "#fb9a99")
class TestMigrateTagColorBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1013_migrate_tag_colour'
migrate_to = '1012_fix_archive_files'
def setUpBeforeMigration(self, apps):
Tag = apps.get_model("documents", "Tag")
self.t1_id = Tag.objects.create(name="tag1").id
self.t2_id = Tag.objects.create(name="tag2", color="#cab2d6").id
self.t3_id = Tag.objects.create(name="tag3", color="#123456").id
def testMimeTypesReverted(self):
Tag = self.apps.get_model('documents', 'Tag')
self.assertEqual(Tag.objects.get(id=self.t1_id).colour, 1)
self.assertEqual(Tag.objects.get(id=self.t2_id).colour, 9)
self.assertEqual(Tag.objects.get(id=self.t3_id).colour, 1)


@@ -68,7 +68,7 @@ class TestParserDiscovery(TestCase):
)
def fake_get_thumbnail(self, path, mimetype):
def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@@ -89,15 +89,15 @@ class TestBaseParser(TestCase):
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)
parser.get_optimised_thumbnail("any", "not important")
parser.get_optimised_thumbnail("any", "not important", "document.pdf")
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)
path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
class TestParserAvailability(TestCase):
@@ -114,8 +114,8 @@ class TestParserAvailability(TestCase):
self.assertEqual(get_default_file_extension('application/zip'), ".zip")
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")
self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('application/pdf')(logging_group=None), RasterisedDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('text/plain')(logging_group=None), TextDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
self.assertTrue(is_file_ext_supported('.pdf'))


@@ -1,3 +1,4 @@
import logging
import os
import shutil
from pathlib import Path
@@ -7,10 +8,59 @@ from django.conf import settings
from django.test import TestCase
from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError
from documents.sanity_checker import check_sanity, SanityCheckMessages
from documents.tests.utils import DirectoriesMixin
class TestSanityCheckMessages(TestCase):
def test_no_messages(self):
messages = SanityCheckMessages()
self.assertEqual(len(messages), 0)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Sanity checker detected no issues.")
def test_info(self):
messages = SanityCheckMessages()
messages.info("Something might be wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Something might be wrong")
def test_warning(self):
messages = SanityCheckMessages()
messages.warning("Something is wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.WARNING)
self.assertEqual(capture.records[0].message, "Something is wrong")
def test_error(self):
messages = SanityCheckMessages()
messages.error("Something is seriously wrong")
self.assertEqual(len(messages), 1)
self.assertTrue(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.ERROR)
self.assertEqual(capture.records[0].message, "Something is seriously wrong")
class TestSanityCheck(DirectoriesMixin, TestCase):
def make_test_data(self):
@@ -21,7 +71,12 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
def assertSanityError(self, messageRegex):
messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertRegex(messages[0]['message'], messageRegex)
def test_no_docs(self):
self.assertEqual(len(check_sanity()), 0)
@@ -33,59 +88,75 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
def test_no_thumbnail(self):
doc = self.make_test_data()
os.remove(doc.thumbnail_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Thumbnail of document .* does not exist")
def test_thumbnail_no_access(self):
doc = self.make_test_data()
os.chmod(doc.thumbnail_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read thumbnail file of document")
os.chmod(doc.thumbnail_path, 0o777)
def test_no_original(self):
doc = self.make_test_data()
os.remove(doc.source_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Original of document .* does not exist.")
def test_original_no_access(self):
doc = self.make_test_data()
os.chmod(doc.source_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read original file of document")
os.chmod(doc.source_path, 0o777)
def test_original_checksum_mismatch(self):
doc = self.make_test_data()
doc.checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of document")
def test_no_archive(self):
doc = self.make_test_data()
os.remove(doc.archive_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Archived version of document .* does not exist.")
def test_archive_no_access(self):
doc = self.make_test_data()
os.chmod(doc.archive_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read archive file of document")
os.chmod(doc.archive_path, 0o777)
def test_archive_checksum_mismatch(self):
doc = self.make_test_data()
doc.archive_checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of archived document")
def test_empty_content(self):
doc = self.make_test_data()
doc.content = ""
doc.save()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Document .* has no content.")
def test_orphaned_file(self):
doc = self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Orphaned file in media dir")
def test_all(self):
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
string = str(SanityFailedError(check_sanity()))
def test_archive_filename_no_checksum(self):
doc = self.make_test_data()
doc.archive_checksum = None
doc.save()
self.assertSanityError("has an archive file, but its checksum is missing.")
def test_archive_checksum_no_filename(self):
doc = self.make_test_data()
doc.archive_filename = None
doc.save()
self.assertSanityError("has an archive file checksum, but no archive filename.")


@@ -20,7 +20,7 @@ class TestSettings(TestCase):
self.assertEqual(default_threads, 1)
def test_workers_threads(self):
for i in range(2, 64):
for i in range(1, 64):
with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
cpu_count.return_value = i
@@ -31,4 +31,4 @@ class TestSettings(TestCase):
self.assertTrue(default_workers >= 1)
self.assertTrue(default_threads >= 1)
self.assertTrue(default_workers * default_threads < i, f"{i}")
self.assertTrue(default_workers * default_threads <= i, f"{i}")


@@ -1,12 +1,13 @@
from datetime import datetime
import os
from unittest import mock
from django.conf import settings
from django.test import TestCase
from django.utils import timezone
from documents import tasks
from documents.models import Document
from documents.sanity_checker import SanityError, SanityFailedError
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.sanity_checker import SanityCheckMessages, SanityCheckFailedException
from documents.tests.utils import DirectoriesMixin
@@ -22,20 +23,87 @@ class TestTasks(DirectoriesMixin, TestCase):
tasks.index_optimize()
def test_train_classifier(self):
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_no_auto_matching(self, load_classifier):
tasks.train_classifier()
load_classifier.assert_not_called()
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_tag(self, load_classifier):
load_classifier.return_value = None
Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_type(self, load_classifier):
load_classifier.return_value = None
DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_correspondent(self, load_classifier):
load_classifier.return_value = None
Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
def test_train_classifier(self):
c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check(self, m):
m.return_value = []
tasks.sanity_check()
m.assert_called_once()
m.reset_mock()
m.return_value = [SanityError("")]
self.assertRaises(SanityFailedError, tasks.sanity_check)
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()
self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once()
def test_culk_update_documents(self):
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error(self, m):
messages = SanityCheckMessages()
messages.error("Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()
messages.warning("Some warning")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with warnings. See log.")
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m):
messages = SanityCheckMessages()
messages.info("Some info")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with infos. See log.")
m.assert_called_once()
def test_bulk_update_documents(self):
doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
created=timezone.now(), modified=timezone.now())


@@ -15,7 +15,7 @@ class TestViews(TestCase):
def test_index(self):
self.client.force_login(self.user)
for (language_given, language_actual) in [("", "en-US"), ("en-US", "en-US"), ("de", "de"), ("en", "en-US"), ("en-us", "en-US"), ("fr", "fr"), ("jp", "en-US")]:
for (language_given, language_actual) in [("", "en-US"), ("en-US", "en-US"), ("de", "de-DE"), ("en", "en-US"), ("en-us", "en-US"), ("fr", "fr-FR"), ("jp", "en-US")]:
if language_given:
self.client.cookies.load({settings.LANGUAGE_COOKIE_NAME: language_given})
elif settings.LANGUAGE_COOKIE_NAME in self.client.cookies.keys():


@@ -4,7 +4,10 @@ import tempfile
from collections import namedtuple
from contextlib import contextmanager
from django.test import override_settings
from django.apps import apps
from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import override_settings, TransactionTestCase
def setup_directories():
@@ -19,12 +22,15 @@ def setup_directories():
dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")
dirs.logging_dir = os.path.join(dirs.data_dir, "log")
os.makedirs(dirs.index_dir, exist_ok=True)
os.makedirs(dirs.originals_dir, exist_ok=True)
os.makedirs(dirs.thumbnail_dir, exist_ok=True)
os.makedirs(dirs.archive_dir, exist_ok=True)
os.makedirs(dirs.logging_dir, exist_ok=True)
dirs.settings_override = override_settings(
DATA_DIR=dirs.data_dir,
SCRATCH_DIR=dirs.scratch_dir,
@@ -33,6 +39,7 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle"),
MEDIA_LOCK=os.path.join(dirs.media_dir, "media.lock")
@@ -75,3 +82,45 @@ class DirectoriesMixin:
def tearDown(self) -> None:
super(DirectoriesMixin, self).tearDown()
remove_dirs(self.dirs)
class TestMigrations(TransactionTestCase):
@property
def app(self):
return apps.get_containing_app_config(type(self).__module__).name
migrate_from = None
migrate_to = None
auto_migrate = True
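# setUp() migrates the schema back to migrate_from, passes the historical app
# registry to setUpBeforeMigration() for creating fixture data, and then applies
# migrate_to unless auto_migrate is False (in which case tests call
# performMigration() themselves, e.g. to assert that it raises).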
def setUp(self):
super(TestMigrations, self).setUp()
assert self.migrate_from and self.migrate_to, \
"TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
self.migrate_from = [(self.app, self.migrate_from)]
self.migrate_to = [(self.app, self.migrate_to)]
executor = MigrationExecutor(connection)
old_apps = executor.loader.project_state(self.migrate_from).apps
# Reverse to the original migration
executor.migrate(self.migrate_from)
self.setUpBeforeMigration(old_apps)
self.apps = old_apps
if self.auto_migrate:
self.performMigration()
def performMigration(self):
# Run the migration to test
executor = MigrationExecutor(connection)
executor.loader.build_graph() # reload.
executor.migrate(self.migrate_to)
self.apps = executor.loader.project_state(self.migrate_to).apps
def setUpBeforeMigration(self, apps):
pass

403
src/documents/views.py Executable file → Normal file

@@ -1,6 +1,8 @@
import logging
import os
import tempfile
import uuid
import zipfile
from datetime import datetime
from time import mktime
@@ -15,7 +17,9 @@ from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.exceptions import NotFound
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.generics import GenericAPIView
from rest_framework.mixins import (
DestroyModelMixin,
ListModelMixin,
@@ -28,33 +32,40 @@ from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ReadOnlyModelViewSet
ViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .bulk_download import OriginalAndArchiveStrategy, OriginalsOnlyStrategy, \
ArchiveOnlyStrategy
from .classifier import load_classifier
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
TagFilterSet,
DocumentTypeFilterSet,
LogFilterSet
DocumentTypeFilterSet
)
from .models import Correspondent, Document, Log, Tag, DocumentType, SavedView
from .matching import match_correspondents, match_tags, match_document_types
from .models import Correspondent, Document, Tag, DocumentType, SavedView
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
LogSerializer,
TagSerializerVersion1,
TagSerializer,
DocumentTypeSerializer,
PostDocumentSerializer,
SavedViewSerializer,
BulkEditSerializer, SelectionDataSerializer
BulkEditSerializer,
DocumentListSerializer,
BulkDownloadSerializer
)
logger = logging.getLogger("paperless.api")
class IndexView(TemplateView):
template_name = "index.html"
@@ -81,6 +92,7 @@ class IndexView(TemplateView):
context['polyfills_js'] = f"frontend/{self.get_language()}/polyfills.js" # NOQA: E501
context['main_js'] = f"frontend/{self.get_language()}/main.js"
context['webmanifest'] = f"frontend/{self.get_language()}/manifest.webmanifest" # NOQA: E501
context['apple_touch_icon'] = f"frontend/{self.get_language()}/apple-touch-icon.png" # NOQA: E501
return context
@@ -110,7 +122,12 @@ class TagViewSet(ModelViewSet):
queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by(Lower('name'))
serializer_class = TagSerializer
def get_serializer_class(self):
if int(self.request.version) == 1:
return TagSerializerVersion1
else:
return TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
@@ -132,10 +149,6 @@ class DocumentTypeViewSet(ModelViewSet):
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class BulkEditForm(object):
pass
class DocumentViewSet(RetrieveModelMixin,
UpdateModelMixin,
DestroyModelMixin,
@@ -159,6 +172,9 @@ class DocumentViewSet(RetrieveModelMixin,
"added",
"archive_serial_number")
def get_queryset(self):
return Document.objects.distinct()
def get_serializer(self, *args, **kwargs):
fields_param = self.request.query_params.get('fields', None)
if fields_param:
@@ -173,10 +189,12 @@ class DocumentViewSet(RetrieveModelMixin,
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@@ -189,7 +207,7 @@ class DocumentViewSet(RetrieveModelMixin,
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
file_handle = doc.archive_file
filename = doc.get_public_filename(archive=True)
mime_type = 'application/pdf'
@@ -212,7 +230,7 @@ class DocumentViewSet(RetrieveModelMixin,
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(logging_group=None)
parser = parser_class(progress_callback=None, logging_group=None)
try:
return parser.extract_metadata(file, mime_type)
@@ -222,35 +240,60 @@ class DocumentViewSet(RetrieveModelMixin,
else:
return []
def get_filesize(self, filename):
if os.path.isfile(filename):
return os.stat(filename).st_size
else:
return None
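# metadata() reports archive details from the stored archive_filename and uses
# get_filesize() so that a missing file yields None instead of an exception.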
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
meta = {
"original_checksum": doc.checksum,
"original_size": os.stat(doc.source_path).st_size,
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": os.path.isfile(doc.archive_path),
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type)
}
if doc.archive_checksum and os.path.isfile(doc.archive_path):
meta['archive_checksum'] = doc.archive_checksum
meta['archive_size'] = os.stat(doc.archive_path).st_size,
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_checksum'] = None
meta['archive_size'] = None
meta['archive_metadata'] = None
return Response(meta)
except Document.DoesNotExist:
raise Http404()
meta = {
"original_checksum": doc.checksum,
"original_size": self.get_filesize(doc.source_path),
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": doc.has_archive_version,
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type),
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename
}
if doc.has_archive_version:
meta['archive_size'] = self.get_filesize(doc.archive_path)
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_size'] = None
meta['archive_metadata'] = None
return Response(meta)
@action(methods=['get'], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
classifier = load_classifier()
return Response({
"correspondents": [
c.id for c in match_correspondents(doc, classifier)
],
"tags": [t.id for t in match_tags(doc, classifier)],
"document_types": [
dt.id for dt in match_document_types(doc, classifier)
]
})
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
try:
@@ -269,6 +312,8 @@ class DocumentViewSet(RetrieveModelMixin,
handle = GnuPG.decrypted(doc.thumbnail_file)
else:
handle = doc.thumbnail_file
# TODO: Send ETag information and use that to send new thumbnails
# if available
return HttpResponse(handle,
content_type='image/png')
except (FileNotFoundError, Document.DoesNotExist):
@@ -283,16 +328,92 @@ class DocumentViewSet(RetrieveModelMixin,
raise Http404()
class LogViewSet(ReadOnlyModelViewSet):
model = Log
class SearchResultSerializer(DocumentSerializer):
def to_representation(self, instance):
doc = Document.objects.get(id=instance['id'])
r = super(SearchResultSerializer, self).to_representation(doc)
r['__search_hit__'] = {
"score": instance.score,
"highlights": instance.highlights("content",
text=doc.content) if doc else None, # NOQA: E501
"rank": instance.rank
}
return r
class UnifiedSearchViewSet(DocumentViewSet):
def __init__(self, *args, **kwargs):
super(UnifiedSearchViewSet, self).__init__(*args, **kwargs)
self.searcher = None
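# The index searcher is opened in list() and kept on the instance so that the
# delayed query objects returned by filter_queryset() can page through results
# while it is still open.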
def get_serializer_class(self):
if self._is_search_request():
return SearchResultSerializer
else:
return DocumentSerializer
def _is_search_request(self):
return ("query" in self.request.query_params or
"more_like_id" in self.request.query_params)
def filter_queryset(self, queryset):
if self._is_search_request():
from documents import index
if "query" in self.request.query_params:
query_class = index.DelayedFullTextQuery
elif "more_like_id" in self.request.query_params:
query_class = index.DelayedMoreLikeThisQuery
else:
raise ValueError()
return query_class(
self.searcher,
self.request.query_params,
self.paginator.get_page_size(self.request))
else:
return super(UnifiedSearchViewSet, self).filter_queryset(queryset)
def list(self, request, *args, **kwargs):
if self._is_search_request():
from documents import index
try:
with index.open_index_searcher() as s:
self.searcher = s
return super(UnifiedSearchViewSet, self).list(request)
except NotFound:
raise
except Exception as e:
return HttpResponseBadRequest(str(e))
else:
return super(UnifiedSearchViewSet, self).list(request)
class LogViewSet(ViewSet):
queryset = Log.objects.all()
serializer_class = LogSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = LogFilterSet
ordering_fields = ("created",)
log_files = ["paperless", "mail"]
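# Serves the on-disk log files from LOGGING_DIR instead of database-backed Log
# entries: list() returns the available file names, retrieve() returns the
# lines of "<pk>.log".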
def retrieve(self, request, pk=None, *args, **kwargs):
if pk not in self.log_files:
raise Http404()
filename = os.path.join(settings.LOGGING_DIR, f"{pk}.log")
if not os.path.isfile(filename):
raise Http404()
with open(filename, "r") as f:
lines = [line.rstrip() for line in f.readlines()]
return Response(lines)
def list(self, request, *args, **kwargs):
return Response(self.log_files)
class SavedViewViewSet(ModelViewSet):
@@ -311,23 +432,12 @@ class SavedViewViewSet(ModelViewSet):
serializer.save(user=self.request.user)
class BulkEditView(APIView):
class BulkEditView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkEditSerializer
parser_classes = (parsers.JSONParser,)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
@@ -344,23 +454,12 @@ class BulkEditView(APIView):
return HttpResponseBadRequest(str(e))
class PostDocumentView(APIView):
class PostDocumentView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = PostDocumentSerializer
parser_classes = (parsers.MultiPartParser,)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
@@ -381,35 +480,29 @@ class PostDocumentView(APIView):
delete=False) as f:
f.write(doc_data)
os.utime(f.name, times=(t, t))
temp_filename = f.name
task_id = str(uuid.uuid4())
async_task("documents.tasks.consume_file",
temp_filename,
override_filename=doc_name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_id=task_id,
task_name=os.path.basename(doc_name)[:100])
async_task("documents.tasks.consume_file",
f.name,
override_filename=doc_name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_name=os.path.basename(doc_name)[:100])
return Response("OK")
class SelectionDataView(APIView):
class SelectionDataView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = SelectionDataSerializer
serializer_class = DocumentListSerializer
parser_classes = (parsers.MultiPartParser, parsers.JSONParser)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
@@ -450,83 +543,10 @@ class SelectionDataView(APIView):
return r
class SearchView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def add_infos_to_hit(self, r):
try:
doc = Document.objects.get(id=r['id'])
except Document.DoesNotExist:
logging.getLogger(__name__).warning(
f"Search index returned a non-existing document: "
f"id: {r['id']}, title: {r['title']}. "
f"Search index needs reindex."
)
doc = None
return {'id': r['id'],
'highlights': r.highlights("content", text=doc.content) if doc else None, # NOQA: E501
'score': r.score,
'rank': r.rank,
'document': DocumentSerializer(doc).data if doc else None,
'title': r['title']
}
def get(self, request, format=None):
if 'query' in request.query_params:
query = request.query_params['query']
else:
query = None
if 'more_like' in request.query_params:
more_like_id = request.query_params['more_like']
more_like_content = Document.objects.get(id=more_like_id).content
else:
more_like_id = None
more_like_content = None
if not query and not more_like_id:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
'corrected_query': None,
'results': []})
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'corrected_query': corrected_query,
'results': list(map(self.add_infos_to_hit, result_page))})
except Exception as e:
return HttpResponseBadRequest(str(e))
class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
@@ -540,7 +560,11 @@ class SearchAutoCompleteView(APIView):
else:
limit = 10
return Response(index.autocomplete(self.ix, term, limit))
from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
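# Reports the total document count and, when an inbox tag is configured, the
# number of documents carrying it; without an inbox tag the inbox count is
# returned as null.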
class StatisticsView(APIView):
@@ -548,8 +572,55 @@ class StatisticsView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(
documents_total = Document.objects.all().count()
if Tag.objects.filter(is_inbox_tag=True).exists():
documents_inbox = Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
else:
documents_inbox = None
return Response({
'documents_total': documents_total,
'documents_inbox': documents_inbox,
})
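# Collects the requested documents into a zip file built under SCRATCH_DIR and
# returns it as an attachment; the "content" field selects originals, archive
# versions, or both via the corresponding strategy class.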
class BulkDownloadView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkDownloadSerializer
parser_classes = (parsers.JSONParser,)
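    # Illustrative request body (document IDs are hypothetical; the accepted
    # "compression" values are defined by BulkDownloadSerializer):
    #   {"documents": [1, 2, 3], "content": "originals", "compression": "none"}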
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
ids = serializer.validated_data.get('documents')
compression = serializer.validated_data.get('compression')
content = serializer.validated_data.get('content')
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
temp = tempfile.NamedTemporaryFile(
dir=settings.SCRATCH_DIR,
suffix="-compressed-archive",
delete=False)
if content == 'both':
strategy_class = OriginalAndArchiveStrategy
elif content == 'originals':
strategy_class = OriginalsOnlyStrategy
else:
strategy_class = ArchiveOnlyStrategy
with zipfile.ZipFile(temp.name, "w", compression) as zipf:
strategy = strategy_class(zipf)
            for document_id in ids:
                doc = Document.objects.get(id=document_id)
strategy.add_document(doc)
with open(temp.name, "rb") as f:
response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment", "documents.zip")
return response