Merge branch 'dev' into celery-tasks

Jonas Winkler
2020-11-10 00:16:59 +01:00
77 changed files with 1605 additions and 959 deletions

View File

@@ -2,7 +2,9 @@ from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag
@@ -71,6 +73,21 @@ class DocumentAdmin(admin.ModelAdmin):
return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
for o in queryset:
index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj):
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)
@mark_safe
def tags_(self, obj):
r = ""

View File

@@ -18,7 +18,8 @@ class DocumentsConfig(AppConfig):
set_log_entry,
set_correspondent,
set_document_type,
set_tags
set_tags,
add_to_index
)
@@ -29,6 +30,7 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_log_entry)
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_post_consume_script)
post_delete.connect(cleanup_document_deletion)

View File

@@ -1,4 +1,3 @@
import magic
import os
from datetime import datetime
@@ -6,77 +5,25 @@ from time import mktime
from django import forms
from django.conf import settings
from .models import Document, Correspondent
from pathvalidate import validate_filename, ValidationError
class UploadForm(forms.Form):
TYPE_LOOKUP = {
"application/pdf": Document.TYPE_PDF,
"image/png": Document.TYPE_PNG,
"image/jpeg": Document.TYPE_JPG,
"image/gif": Document.TYPE_GIF,
"image/tiff": Document.TYPE_TIF,
}
correspondent = forms.CharField(
max_length=Correspondent._meta.get_field("name").max_length,
required=False
)
title = forms.CharField(
max_length=Document._meta.get_field("title").max_length,
required=False
)
document = forms.FileField()
def __init__(self, *args, **kwargs):
forms.Form.__init__(self, *args, **kwargs)
self._file_type = None
def clean_correspondent(self):
"""
I suppose it might look cleaner to use .get_or_create() here, but that
would also allow someone to fill up the db with bogus correspondents
before all validation was met.
"""
corresp = self.cleaned_data.get("correspondent")
if not corresp:
return None
if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
raise forms.ValidationError(
"That correspondent name is suspicious.")
return corresp
def clean_title(self):
title = self.cleaned_data.get("title")
if not title:
return None
if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
raise forms.ValidationError("That title is suspicious.")
return title
def clean_document(self):
try:
validate_filename(self.cleaned_data.get("document").name)
except ValidationError:
raise forms.ValidationError("That filename is suspicious.")
return self.cleaned_data.get("document")
document = self.cleaned_data.get("document").read()
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
file_type = m.id_buffer(document)
if file_type not in self.TYPE_LOOKUP:
raise forms.ValidationError("The file type is invalid.")
self._file_type = self.TYPE_LOOKUP[file_type]
return document
def get_filename(self, i=None):
return os.path.join(
settings.CONSUMPTION_DIR,
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
)
def save(self):
"""
@@ -85,15 +32,15 @@ class UploadForm(forms.Form):
form do that as well. Think of it as a poor-man's queue server.
"""
correspondent = self.cleaned_data.get("correspondent")
title = self.cleaned_data.get("title")
document = self.cleaned_data.get("document")
document = self.cleaned_data.get("document").read()
t = int(mktime(datetime.now().timetuple()))
file_name = os.path.join(
settings.CONSUMPTION_DIR,
"{} - {}.{}".format(correspondent, title, self._file_type)
)
file_name = self.get_filename()
i = 0
while os.path.exists(file_name):
i += 1
file_name = self.get_filename(i)
with open(file_name, "wb") as f:
f.write(document)

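For reference, a minimal sketch of the naming scheme that get_filename() and the collision loop in save() produce; the consumption directory path used here is illustrative:

    import os

    def get_filename(consumption_dir, original_name, i=None):
        # Mirrors UploadForm.get_filename(): prefix a counter once the plain name is taken.
        name = "{}_{}".format(i, original_name) if i else original_name
        return os.path.join(consumption_dir, name)

    # get_filename("/consume", "scan.pdf")     -> "/consume/scan.pdf"
    # get_filename("/consume", "scan.pdf", 1)  -> "/consume/1_scan.pdf"
    # get_filename("/consume", "scan.pdf", 2)  -> "/consume/2_scan.pdf"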
View File

@@ -2,15 +2,20 @@ import logging
from django.db import models
from django.dispatch import receiver
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter
from documents.models import Document
from paperless import settings
logger = logging.getLogger(__name__)
class JsonFormatter(Formatter):
def __init__(self):
self.seen = {}
@@ -68,7 +73,7 @@ def open_index(recreate=False):
def update_document(writer, doc):
logging.getLogger(__name__).debug("Updating index with document{}".format(str(doc)))
logger.debug("Indexing {}...".format(doc))
writer.update_document(
id=doc.pk,
title=doc.title,
@@ -77,19 +82,32 @@ def update_document(writer, doc):
)
@receiver(models.signals.post_save, sender=Document)
def add_document_to_index(sender, instance, **kwargs):
ix = open_index()
with AsyncWriter(ix) as writer:
update_document(writer, instance)
def remove_document(writer, doc):
logger.debug("Removing {} from index...".format(doc))
writer.delete_by_term('id', doc.pk)
@receiver(models.signals.post_delete, sender=Document)
def remove_document_from_index(sender, instance, **kwargs):
logging.getLogger(__name__).debug("Removing document {} from index".format(str(instance)))
def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
writer.delete_by_term('id', instance.pk)
update_document(writer, document)
def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
remove_document(writer, document)
def query_page(ix, query, page):
with ix.searcher() as searcher:
query_parser = MultifieldParser(["content", "title", "correspondent"],
ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
return result_page
def autocomplete(ix, term, limit=10):

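A minimal usage sketch for the new query_page() helper, assuming the index has already been populated (for example by the reindex task added in documents/tasks.py below):

    from documents import index

    ix = index.open_index()
    # First page of results for "invoice"; content, title and correspondent
    # are searched, as configured in query_page() above.
    results = index.query_page(ix, "invoice", 1)
    for hit in results:
        # id and title are assumed to be stored fields in the schema.
        print(hit["id"], hit["title"])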
View File

@@ -1,10 +1,6 @@
import logging
from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from paperless import settings
from ...mixins import Renderable
from ...tasks import train_classifier
class Command(Renderable, BaseCommand):
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
classifier = DocumentClassifier()
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
pass
try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
train_classifier()

View File

@@ -1,9 +1,7 @@
from django.core.management import BaseCommand
from whoosh.writing import AsyncWriter
import documents.index as index
from documents.mixins import Renderable
from documents.models import Document
from documents.tasks import index_reindex, index_optimize
class Command(Renderable, BaseCommand):
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
self.verbosity = options["verbosity"]
if options['command'] == 'reindex':
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
index_reindex()
elif options['command'] == 'optimize':
index.open_index().optimize()
index_optimize()

View File

@@ -0,0 +1,24 @@
from django.core.management.base import BaseCommand
from documents.models import Document, Tag
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
for document in Document.objects.all():
# Saving the document again will generate a new filename and rename
document.save()

View File

@@ -1,60 +0,0 @@
import argparse
import threading
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from django.core.management.base import BaseCommand
from documents.consumer import Consumer
from documents.models import Log, Document
from documents.parsers import get_parser_class
def process_document(doc):
parser_class = get_parser_class(doc.file_name)
if not parser_class:
print("no parser available")
else:
print("Parser: {}".format(parser_class.__name__))
parser = parser_class(doc.source_path, None)
try:
text = parser.get_text()
doc.content = text
doc.save()
finally:
parser.cleanup()
def document_index(value):
ivalue = int(value)
if not (1 <= ivalue <= Document.objects.count()):
raise argparse.ArgumentTypeError(
"{} is not a valid document index (out of range)".format(value))
return ivalue
class Command(BaseCommand):
help = "Performs OCR on all documents again!"
def add_arguments(self, parser):
parser.add_argument(
"-s", "--start_index",
default=None,
type=document_index
)
def handle(self, *args, **options):
docs = Document.objects.all().order_by("added")
indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
for i in indices:
doc = docs[i]
print("==================================")
print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
print("==================================")
process_document(doc)

View File

@@ -1,73 +0,0 @@
# Generated by Django 3.1.2 on 2020-10-29 14:29
import os
from django.db import migrations
from django.conf import settings
def make_index(apps, schema_editor):
Document = apps.get_model("documents", "Document")
documents = Document.objects.all()
print()
try:
print(" --> Creating document index...")
from whoosh.writing import AsyncWriter
from documents import index
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
except ImportError:
# index may not be relevant anymore
print(" --> Cannot create document index.")
def restore_filenames(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
file_name = "{:07}.{}".format(doc.pk, doc.file_type)
if doc.storage_type == "gpg":
file_name += ".gpg"
if not doc.filename == file_name:
try:
print("file was renamed, restoring {} to {}".format(doc.filename, file_name))
os.rename(os.path.join(settings.ORIGINALS_DIR, doc.filename),
os.path.join(settings.ORIGINALS_DIR, file_name))
except PermissionError:
pass
except FileNotFoundError:
pass
def initialize_document_classifier(apps, schema_editor):
try:
print("Initalizing document classifier...")
from documents.classifier import DocumentClassifier
classifier = DocumentClassifier()
try:
classifier.train()
classifier.save_classifier()
except Exception as e:
print("Classifier error: {}".format(e))
except ImportError:
print("Document classifier not found, skipping")
class Migration(migrations.Migration):
dependencies = [
('documents', '0023_document_current_filename'),
]
operations = [
migrations.RunPython(make_index, migrations.RunPython.noop),
migrations.RunPython(restore_filenames),
migrations.RunPython(initialize_document_classifier, migrations.RunPython.noop),
migrations.RemoveField(
model_name='document',
name='filename',
),
]

View File

@@ -0,0 +1,95 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import os
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
def make_index(apps, schema_editor):
Document = apps.get_model("documents", "Document")
documents = Document.objects.all()
print()
try:
print(" --> Creating document index...")
from whoosh.writing import AsyncWriter
from documents import index
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
except ImportError:
# index may not be relevant anymore
print(" --> Cannot create document index.")
class Migration(migrations.Migration):
dependencies = [
('documents', '0023_document_current_filename'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_serial_number',
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
),
migrations.AddField(
model_name='tag',
name='is_inbox_tag',
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
),
migrations.CreateModel(
name='DocumentType',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField(blank=True, editable=False)),
('match', models.CharField(blank=True, max_length=256)),
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
('is_insensitive', models.BooleanField(default=True)),
],
options={
'abstract': False,
'ordering': ('name',),
},
),
migrations.AddField(
model_name='document',
name='document_type',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype'),
),
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
),
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
),
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-created',)},
),
migrations.RemoveField(
model_name='log',
name='modified',
),
migrations.AlterField(
model_name='log',
name='group',
field=models.UUIDField(blank=True, null=True),
),
migrations.RunPython(
code=make_index,
reverse_code=django.db.migrations.operations.special.RunPython.noop,
),
]

View File

@@ -0,0 +1,28 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
def remove_schedules(apps, schema_editor):
Schedule.objects.all().delete()
class Migration(migrations.Migration):
dependencies = [
('documents', '1000_update_paperless_all'),
('django_q', '0013_task_attempt_count'),
]
operations = [
RunPython(add_schedules, remove_schedules)
]

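Once this migration has run, the three jobs exist as ordinary Schedule rows and can be inspected like any other model, for example from manage.py shell:

    from django_q.models import Schedule

    for s in Schedule.objects.all():
        print(s.name, s.func, s.schedule_type)
    # Expected: "Train the classifier", "Optimize the index" and "Check E-Mail"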
View File

@@ -1,23 +0,0 @@
# Generated by Django 2.0.7 on 2018-07-12 09:52
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1000_update_paperless'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_serial_number',
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
),
migrations.AddField(
model_name='tag',
name='is_inbox_tag',
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
),
]

View File

@@ -1,33 +0,0 @@
# Generated by Django 2.0.7 on 2018-08-23 11:55
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('documents', '1001_workflow_improvements'),
]
operations = [
migrations.CreateModel(
name='DocumentType',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField(blank=True, editable=False)),
('match', models.CharField(blank=True, max_length=256)),
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
('is_insensitive', models.BooleanField(default=True)),
],
options={
'abstract': False,
},
),
migrations.AddField(
model_name='document',
name='document_type',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.DocumentType'),
),
]

View File

@@ -1,32 +0,0 @@
# Generated by Django 3.1.2 on 2020-10-28 17:51
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1002_auto_20180823_1155'),
]
operations = [
migrations.AlterModelOptions(
name='documenttype',
options={'ordering': ('name',)},
),
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
),
migrations.AlterField(
model_name='documenttype',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 3.1.2 on 2020-10-29 13:31
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1003_auto_20201028_1751'),
]
operations = [
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
),
]

View File

@@ -1,26 +0,0 @@
# Generated by Django 3.1.2 on 2020-11-02 00:07
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1004_auto_20201029_1331'),
]
operations = [
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-created',)},
),
migrations.RemoveField(
model_name='log',
name='modified',
),
migrations.AlterField(
model_name='log',
name='group',
field=models.UUIDField(blank=True, null=True),
),
]

View File

@@ -3,11 +3,12 @@
import logging
import os
import re
from collections import OrderedDict
from collections import OrderedDict, defaultdict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.dispatch import receiver
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
@@ -190,6 +191,14 @@ class Document(models.Model):
added = models.DateTimeField(
default=timezone.now, editable=False, db_index=True)
filename = models.FilePathField(
max_length=256,
editable=False,
default=None,
null=True,
help_text="Current filename in storage"
)
archive_serial_number = models.IntegerField(
blank=True,
null=True,
@@ -211,15 +220,123 @@ class Document(models.Model):
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
def find_renamed_document(self, subdirectory=""):
suffix = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
suffix += ".gpg"
# Search the storage directory (and its subdirectories) for a file with that suffix
root = os.path.normpath(Document.filename_to_path(subdirectory))
for filename in os.listdir(root):
if filename.endswith(suffix):
return os.path.join(subdirectory, filename)
fullname = os.path.join(subdirectory, filename)
if os.path.isdir(Document.filename_to_path(fullname)):
return self.find_renamed_document(fullname)
return None
@property
def source_filename(self):
# Initial filename generation (for new documents)
if self.filename is None:
self.filename = self.generate_source_filename()
# Check if document is still available under filename
elif not os.path.isfile(Document.filename_to_path(self.filename)):
recovered_filename = self.find_renamed_document()
# If we found the file, update the filename
if recovered_filename is not None:
logger = logging.getLogger(__name__)
logger.warning("Filename of document " + str(self.id) +
" has changed and was successfully updated")
self.filename = recovered_filename
# Remove all empty subdirectories from MEDIA_ROOT
Document.delete_all_empty_subdirectories(
Document.filename_to_path(""))
else:
logger = logging.getLogger(__name__)
logger.error("File of document " + str(self.id) + " has " +
"gone and could not be recovered")
return self.filename
@staticmethod
def many_to_dictionary(field):
# Converts ManyToManyField to dictionary by assuming that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
if delimiter == -1:
delimiter = t.name.find('-')
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter+1:]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_source_filename(self):
# Create filename based on configured format
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
self.many_to_dictionary(self.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
correspondent=slugify(self.correspondent),
title=slugify(self.title),
created=slugify(self.created),
added=slugify(self.added),
tags=tags)
else:
path = ""
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
else:
filename = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def create_source_directory(self):
new_filename = self.generate_source_filename()
# Determine the full "target" path
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
# Create new path
os.makedirs(dir_new, exist_ok=True)
@property
def source_path(self):
file_name = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg"
return Document.filename_to_path(self.source_filename)
@staticmethod
def filename_to_path(filename):
return os.path.join(
settings.ORIGINALS_DIR,
file_name
filename
)
@property
@@ -245,6 +362,125 @@ class Document(models.Model):
def thumbnail_file(self):
return open(self.thumbnail_path, "rb")
def set_filename(self, filename):
if os.path.isfile(Document.filename_to_path(filename)):
self.filename = filename
@staticmethod
def try_delete_empty_directories(directory):
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(Document.filename_to_path(""))
while directory != root:
# Try to delete the current directory
try:
os.rmdir(directory)
except os.error:
# Directory not empty, no need to go further up
return
# Cut off actual directory and go one level up
directory, _ = os.path.split(directory)
directory = os.path.normpath(directory)
@staticmethod
def delete_all_empty_subdirectories(directory):
# Go through all subdirectories and try to delete the empty ones
root = os.path.normpath(Document.filename_to_path(directory))
for filename in os.listdir(root):
fullname = os.path.join(directory, filename)
if not os.path.isdir(Document.filename_to_path(fullname)):
continue
# Go into subdirectory to see if there is more to delete
Document.delete_all_empty_subdirectories(
os.path.join(directory, filename))
# Try to delete the directory
try:
os.rmdir(Document.filename_to_path(fullname))
continue
except os.error:
# Directory not empty, no need to go further up
continue
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename(sender, instance, **kwargs):
# Skip if document has not been saved yet
if instance.filename is None:
return
# Check if the file exists and update the filename otherwise
if not os.path.isfile(Document.filename_to_path(instance.filename)):
instance.filename = instance.source_filename
# Build the new filename
new_filename = instance.generate_source_filename()
# If the filename is the same, then nothing needs to be done
if instance.filename == new_filename:
return
# Determine the full "target" path
path_new = instance.filename_to_path(new_filename)
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
# Create new path
instance.create_source_directory()
# Determine the full "current" path
path_current = instance.filename_to_path(instance.source_filename)
# Move file
try:
os.rename(path_current, path_new)
except PermissionError:
# Do not update filename in object
return
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.error("Renaming of document " + str(instance.id) + " failed " +
"as file " + instance.filename + " was no longer present")
return
# Delete empty directory
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
instance.filename = new_filename
# Save instance
# This will not cause a cascade of post_save signals, as next time
# nothing needs to be renamed
instance.save()
@receiver(models.signals.post_delete, sender=Document)
def delete_files(sender, instance, **kwargs):
if instance.filename is None:
return
# Remove the document
old_file = instance.filename_to_path(instance.filename)
try:
os.remove(old_file)
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.warning("Deleted document " + str(instance.id) + " but file " +
old_file + " was no longer present")
# And remove the directory (if applicable)
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
class Log(models.Model):

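generate_source_filename() always appends the zero-padded primary key (and .gpg for encrypted originals) to whatever PAPERLESS_FILENAME_FORMAT yields, so filenames stay unique even when the format collapses to the same value; worked examples, matching the new tests in test_file_handling.py below:

    # PAPERLESS_FILENAME_FORMAT = "{correspondent}/{correspondent}"
    #   pk=1, no correspondent, unencrypted     -> "none/none-0000001.pdf"
    #   pk=1, correspondent "test", encrypted   -> "test/test-0000001.pdf.gpg"
    # PAPERLESS_FILENAME_FORMAT = None
    #   pk=1, unencrypted                       -> "0000001.pdf"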
View File

@@ -166,3 +166,7 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
user=user,
object_repr=document.__str__(),
)
def add_to_index(sender, document, **kwargs):
index.add_or_update_document(document)

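Since apps.py connects add_to_index to document_consumption_finished, every consumed document is also pushed into the search index; a minimal sketch of firing the signal by hand (the signal's module path and the sender value are assumptions):

    from documents.models import Document
    from documents.signals import document_consumption_finished  # module path: assumption

    doc = Document.objects.first()
    document_consumption_finished.send(sender=None, document=doc)
    # -> add_to_index() runs and calls index.add_or_update_document(doc)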
File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,44 @@
html,
body {
height: 100%;
}
body {
display: -ms-flexbox;
display: flex;
-ms-flex-align: center;
align-items: center;
padding-top: 40px;
padding-bottom: 40px;
background-color: #f5f5f5;
}
.form-signin {
width: 100%;
max-width: 330px;
padding: 15px;
margin: auto;
}
.form-signin .checkbox {
font-weight: 400;
}
.form-signin .form-control {
position: relative;
box-sizing: border-box;
height: auto;
padding: 10px;
font-size: 16px;
}
.form-signin .form-control:focus {
z-index: 2;
}
.form-signin input[type="text"] {
margin-bottom: -1px;
border-bottom-right-radius: 0;
border-bottom-left-radius: 0;
}
.form-signin input[type="password"] {
margin-bottom: 10px;
border-top-left-radius: 0;
border-top-right-radius: 0;
}

src/documents/tasks.py (new file, 57 lines)
View File

@@ -0,0 +1,57 @@
import logging
from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter
from documents import index
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.models import Document
def consume_mail():
MailFetcher().pull()
def index_optimize():
index.open_index().optimize()
def index_reindex():
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)
def train_classifier():
classifier = DocumentClassifier()
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
pass
try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(
settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)

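These are plain callables, so besides the schedules registered by the migration above they can be queued ad hoc through django_q (async_task and result are already imported at the top of this file); a minimal sketch, requiring a running qcluster:

    from django_q.tasks import async_task, result

    task_id = async_task("documents.tasks.index_optimize")
    # ... later, once a worker has processed the task:
    print(result(task_id))  # None; index_optimize() has no return value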
View File

@@ -9,11 +9,11 @@
<base href="/">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="icon" type="image/x-icon" href="favicon.ico">
<link rel="stylesheet" href="{% static 'styles.css' %}"></head>
<link rel="stylesheet" href="{% static 'frontend/styles.css' %}"></head>
<body>
<app-root>Loading...</app-root>
<script src="{% static 'runtime.js' %}" defer></script>
<script src="{% static 'polyfills.js' %}" defer></script>
<script src="{% static 'main.js' %}" defer></script>
<script src="{% static 'frontend/runtime.js' %}" defer></script>
<script src="{% static 'frontend/polyfills.js' %}" defer></script>
<script src="{% static 'frontend/main.js' %}" defer></script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!doctype html>
{% load static %}
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
<meta name="generator" content="Jekyll v4.1.1">
<title>Paperless Sign In</title>
<!-- Bootstrap core CSS -->
<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
<style>
.bd-placeholder-img {
font-size: 1.125rem;
text-anchor: middle;
-webkit-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
}
@media (min-width: 768px) {
.bd-placeholder-img-lg {
font-size: 3.5rem;
}
}
</style>
<!-- Custom styles for this template -->
<link href="{% static 'signin.css' %}" rel="stylesheet">
</head>
<body class="text-center">
<div class="form-signin">
<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
<p>You have been successfully logged out. Bye!</p>
<a href="/">Sign in again</a>
</div>
</body>
</html>

View File

@@ -0,0 +1,54 @@
<!doctype html>
{% load static %}
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
<meta name="generator" content="Jekyll v4.1.1">
<title>Paperless Sign In</title>
<!-- Bootstrap core CSS -->
<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
<style>
.bd-placeholder-img {
font-size: 1.125rem;
text-anchor: middle;
-webkit-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
}
@media (min-width: 768px) {
.bd-placeholder-img-lg {
font-size: 3.5rem;
}
}
</style>
<!-- Custom styles for this template -->
<link href="{% static 'signin.css' %}" rel="stylesheet">
</head>
<body class="text-center">
<form class="form-signin" method="post">
{% csrf_token %}
<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
<p>Please sign in.</p>
{% if form.errors %}
<div class="alert alert-danger" role="alert">
Your username and password didn't match. Please try again.
</div>
{% endif %}
<label for="inputUsername" class="sr-only">Username</label>
<input type="text" name="username" id="inputUsername" class="form-control" placeholder="Username" required autofocus>
<label for="inputPassword" class="sr-only">Password</label>
<input type="password" name="password" id="inputPassword" class="form-control" placeholder="Password" required>
<button class="btn btn-lg btn-primary btn-block" type="submit">Sign in</button>
</form>
</body>
</html>

View File

@@ -1,66 +1,10 @@
import re
from django.test import TestCase
from unittest import mock
from tempfile import TemporaryDirectory
from ..consumer import Consumer
from ..models import FileInfo, Tag
class TestConsumer(TestCase):
class DummyParser(object):
pass
def test__get_parser_class_1_parser(self):
self.assertEqual(
self._get_consumer()._get_parser_class("doc.pdf"),
self.DummyParser
)
@mock.patch("documents.consumer.os.makedirs")
@mock.patch("documents.consumer.os.path.exists", return_value=True)
@mock.patch("documents.consumer.document_consumer_declaration.send")
def test__get_parser_class_n_parsers(self, m, *args):
class DummyParser1(object):
pass
class DummyParser2(object):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
)
with TemporaryDirectory() as tmpdir:
self.assertEqual(
Consumer(consume=tmpdir)._get_parser_class("doc.pdf"),
DummyParser2
)
@mock.patch("documents.consumer.os.makedirs")
@mock.patch("documents.consumer.os.path.exists", return_value=True)
@mock.patch("documents.consumer.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
m.return_value = ((None, lambda _: None),)
with TemporaryDirectory() as tmpdir:
self.assertIsNone(
Consumer(consume=tmpdir)._get_parser_class("doc.pdf")
)
@mock.patch("documents.consumer.os.makedirs")
@mock.patch("documents.consumer.os.path.exists", return_value=True)
@mock.patch("documents.consumer.document_consumer_declaration.send")
def _get_consumer(self, m, *args):
m.return_value = (
(None, lambda _: {"weight": 0, "parser": self.DummyParser}),
)
with TemporaryDirectory() as tmpdir:
return Consumer(consume=tmpdir)
class TestAttributes(TestCase):
TAGS = ("tag1", "tag2", "tag3")

View File

@@ -0,0 +1,559 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from pathlib import Path
from shutil import rmtree
from dateutil import tz
from django.test import TestCase, override_settings
from django.utils.text import slugify
from ..models import Tag, Document, Correspondent
from django.conf import settings
class TestDate(TestCase):
deletion_list = []
def add_to_deletion_list(self, dirname):
self.deletion_list.append(dirname)
def setUp(self):
folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(folder + "/documents/originals")
override_settings(MEDIA_ROOT=folder).enable()
override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
self.add_to_deletion_list(folder)
def tearDown(self):
for dirname in self.deletion_list:
shutil.rmtree(dirname, ignore_errors=True)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_source_filename(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.source_filename, "0000001.pdf")
document.filename = "test.pdf"
self.assertEqual(document.source_filename, "test.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
document.storage_type = Document.STORAGE_TYPE_GPG
self.assertEqual(document.generate_source_filename(),
"0000001.pdf.gpg")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_file_renaming(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Enable encryption and check again
document.storage_type = Document.STORAGE_TYPE_GPG
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf.gpg")
document.save()
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf.gpg"), True)
self.assertEqual(document.generate_source_filename(),
"test/test-0000001.pdf.gpg")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Make the folder read- and execute-only (no writing and no renaming)
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), True)
self.assertEqual(document.source_filename,
"none/none-0000001.pdf")
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_delete(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Ensure file deletion after delete
document.delete()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_delete_nofile(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_directory_not_empty(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
Path(document.source_path + "test").touch()
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
# Cleanup
os.remove(settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdftest")
os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Add tag to document
document.tags.create(name="type_demo")
document.tags.create(name="foo_bar")
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Add tag to document
document.tags.create(name="type-demo")
document.tags.create(name="foo-bar")
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Add tag to document
document.tags.create(name="type:demo")
document.tags.create(name="foo:bar")
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Add tag to document
document.tags.create(name="demo")
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_out_of_bounds_0(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}")
def test_tags_out_of_bounds_10000000(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}")
def test_tags_out_of_bounds_99(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none/none"), True)
document.delete()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
"/documents/originals/none/none/none-0000001.pdf"),
False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals"), True)
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def test_format_none(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_renamed(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Rename the document "illegally"
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf",
settings.MEDIA_ROOT + "/documents/originals/" +
"test/test-0000001.pdf")
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf"), True)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), False)
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/foo/foo-0000001.pdf"), True)
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/foo"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), False)
self.assertEqual(document.generate_source_filename(),
"foo/foo-0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_renamed_encrypted(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_GPG
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf.gpg")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf.gpg")
# Rename the document "illegally"
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf.gpg",
settings.MEDIA_ROOT + "/documents/originals/" +
"test/test-0000001.pdf.gpg")
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf.gpg"), True)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), False)
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/foo/foo-0000001.pdf.gpg"), True)
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/foo"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), False)
self.assertEqual(document.generate_source_filename(),
"foo/foo-0000001.pdf.gpg")
def test_delete_all_empty_subdirectories(self):
# Create our working directory
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(tmp)
self.add_to_deletion_list(tmp)
os.makedirs(os.path.join(tmp, "empty"))
os.makedirs(os.path.join(tmp, "empty", "subdirectory"))
os.makedirs(os.path.join(tmp, "notempty"))
Path(os.path.join(tmp, "notempty", "file")).touch()
Document.delete_all_empty_subdirectories(tmp)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
def test_try_delete_empty_directories(self):
# Create our working directory
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(tmp)
self.add_to_deletion_list(tmp)
os.makedirs(os.path.join(tmp, "notempty"))
Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty"))
Document.try_delete_empty_directories(
os.path.join(tmp, "notempty", "empty"))
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
self.assertEqual(os.path.isdir(
os.path.join(tmp, "notempty", "empty")), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_accidentally_deleted(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Delete the document "illegally"
os.remove(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf")
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
self.assertEqual(document.source_filename,
"none/none-0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_set_filename(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that the filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Set existing filename
document.set_filename(tmp)
self.assertEqual(document.source_filename, "none/none-0000001.pdf")
# Set non-existing filename
document.set_filename("doesnotexist")
self.assertEqual(document.source_filename, "none/none-0000001.pdf")

View File

@@ -0,0 +1,50 @@
from tempfile import TemporaryDirectory
from unittest import mock
from django.test import TestCase
from documents.parsers import get_parser_class
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_1_parser(self, m, *args):
class DummyParser(object):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser}),
)
self.assertEqual(
get_parser_class("doc.pdf"),
DummyParser
)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_n_parsers(self, m, *args):
class DummyParser1(object):
pass
class DummyParser2(object):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
)
self.assertEqual(
get_parser_class("doc.pdf"),
DummyParser2
)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
m.return_value = ((None, lambda _: None),)
with TemporaryDirectory() as tmpdir:
self.assertIsNone(
get_parser_class("doc.pdf")
)
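The shape these tests mock (a receiver returning a dict with a weight and a parser class) is what a parser app feeds into document_consumer_declaration. A hypothetical declaration, modelled on the mocked return values; the class and function names are made up for illustration:

# Hypothetical parser declaration, shaped after the mocked signal returns in
# the tests above; names are illustrative only.
from documents.parsers import document_consumer_declaration  # name as referenced by the mock target above


class MyDummyParser:
    """Placeholder; a real parser implements the document parser interface."""


def myparser_consumer_declaration(sender, **kwargs):
    # get_parser_class picks the declaration with the highest weight
    return {"weight": 0, "parser": MyDummyParser}


# typically connected at app startup, e.g. in an AppConfig.ready()
document_consumer_declaration.connect(myparser_consumer_declaration)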

View File

@@ -6,9 +6,6 @@ from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.views import APIView
from whoosh import highlight
from whoosh.qparser import QueryParser
from whoosh.query import terms
from paperless.db import GnuPG
from paperless.views import StandardPagination
@@ -97,7 +94,16 @@ class DocumentViewSet(RetrieveModelMixin,
filter_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "created", "modified", "added", "archive_serial_number")
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(request, *args, **kwargs)
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
#TODO: this should not be necessary here.
@@ -185,18 +191,13 @@ class SearchView(APIView):
except (ValueError, TypeError):
page = 1
with self.ix.searcher() as searcher:
query_parser = QueryParser("content", self.ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = index.JsonFormatter()
result_page = index.query_page(self.ix, query, page)
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
else:
return Response({
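The inline whoosh handling removed above is consolidated into documents/index.py as query_page. A rough sketch of what that helper presumably does, reconstructed from the removed lines; the actual implementation may parse additional fields or manage the searcher lifetime differently:

# Hypothetical sketch of index.query_page, reconstructed from the inline
# whoosh code removed above; not the actual helper.
from whoosh import highlight
from whoosh.qparser import QueryParser

def query_page(ix, querystring, page):
    searcher = ix.searcher()  # left open so the caller can still read the hits
    query = QueryParser("content", ix.schema).parse(querystring)
    result_page = searcher.search_page(query, page)
    result_page.results.fragmenter = highlight.ContextFragmenter(surround=50)
    # the real module defines a custom JsonFormatter (index.JsonFormatter above);
    # whoosh's stock HtmlFormatter stands in for it in this sketch
    result_page.results.formatter = highlight.HtmlFormatter()
    return result_page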

View File

@@ -1,11 +1,17 @@
from rest_framework.authentication import TokenAuthentication
from django.conf import settings
from django.contrib.auth.models import User
from rest_framework import authentication
class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
""" This class is here to provide authentication to the angular dev server
during development. This is disabled in production.
"""
# This authentication method is required to serve documents and thumbnails for the front end.
# https://stackoverflow.com/questions/29433416/token-in-query-string-with-django-rest-frameworks-tokenauthentication
class QueryTokenAuthentication(TokenAuthentication):
def authenticate(self, request):
# Check if 'auth_token' is in the request query params.
if 'auth_token' in request.query_params and 'HTTP_AUTHORIZATION' not in request.META:
return self.authenticate_credentials(request.query_params.get('auth_token'))
if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
user = User.objects.filter(is_staff=True).first()
print("Auto-Login with user {}".format(user))
return (user, None)
else:
return None
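For illustration, a client fetches a protected file by appending the token as a query parameter. The URL, document id and token below are placeholders; the download path is an assumption, not something shown in this diff:

# Hypothetical client-side use of QueryTokenAuthentication; URL, id and token
# are placeholders.
import requests

token = "0123456789abcdef0123456789abcdef01234567"  # placeholder DRF token
response = requests.get(
    "http://localhost:8000/api/documents/1/download/",
    params={"auth_token": token},  # picked up from the query string
)
response.raise_for_status()
with open("document.pdf", "wb") as f:
    f.write(response.content)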

View File

@@ -1,14 +0,0 @@
from django.utils.deprecation import MiddlewareMixin
from .models import User
class Middleware(MiddlewareMixin):
"""
This is a dummy authentication middleware class that creates what
is roughly an Anonymous authenticated user so we can disable login
and not interfere with existing user IDs. It's only used if
login is disabled in paperless.conf (default is to require login)
"""
def process_request(self, request):
request.user = User()

View File

@@ -1,31 +0,0 @@
from django.contrib.auth.models import User as DjangoUser
class User:
"""
This is a dummy django User used with our middleware to disable
login authentication if that is configured in paperless.conf
"""
is_superuser = True
is_active = True
is_staff = True
is_authenticated = True
@property
def id(self):
return DjangoUser.objects.order_by("pk").first().pk
@property
def pk(self):
return self.id
"""
NOTE: These are here as a hack instead of being in the User definition
NOTE: above due to the way pycodestyle handles lambdas.
NOTE: See https://github.com/PyCQA/pycodestyle/issues/379 for more.
"""
User.has_module_perms = lambda *_: True
User.has_perm = lambda *_: True

View File

@@ -21,6 +21,9 @@ def __get_boolean(key, default="NO"):
"""
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
###############################################################################
# Directories #
###############################################################################
@@ -66,9 +69,10 @@ INSTALLED_APPS = [
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
"django_filters",
"django_q",
"channels",
]
@@ -76,11 +80,15 @@ INSTALLED_APPS = [
REST_FRAMEWORK = {
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.BasicAuthentication',
'rest_framework.authentication.TokenAuthentication',
'paperless.auth.QueryTokenAuthentication'
'rest_framework.authentication.SessionAuthentication'
]
}
if DEBUG:
REST_FRAMEWORK['DEFAULT_AUTHENTICATION_CLASSES'].append(
'paperless.auth.AngularApiAuthenticationOverride'
)
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware',
@@ -95,8 +103,6 @@ MIDDLEWARE = [
ROOT_URLCONF = 'paperless.urls'
LOGIN_URL = "admin:login"
FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")
WSGI_APPLICATION = 'paperless.wsgi.application'
@@ -125,9 +131,6 @@ TEMPLATES = [
# Security #
###############################################################################
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
if DEBUG:
X_FRAME_OPTIONS = ''
# this should really be 'allow-from uri', but it's not supported in any major
@@ -142,11 +145,6 @@ if DEBUG:
# Allow access from the angular development server during debugging
CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
# If auth is disabled, we just use our "bypass" authentication middleware
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
_index = MIDDLEWARE.index("django.contrib.auth.middleware.AuthenticationMiddleware")
MIDDLEWARE[_index] = "paperless.middleware.Middleware"
# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
# public, you should change the key to something unique and verbose.
@@ -249,6 +247,16 @@ LOGGING = {
},
}
###############################################################################
# Task queue #
###############################################################################
Q_CLUSTER = {
'name': 'paperless',
'catch_up': False,
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
}
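With the cluster pointed at Redis, background work can then be queued through django_q's task API. A minimal sketch; the dotted task path is a placeholder, not one of the project's actual tasks:

# Minimal django_q usage sketch; the task path is a placeholder.
from django_q.tasks import async_task, result

task_id = async_task("myapp.tasks.do_work", 42)  # enqueue by dotted path
outcome = result(task_id, wait=500)              # poll up to 500 ms for the return value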
###############################################################################
# Paperless Specific Settings #
###############################################################################
@@ -303,6 +311,9 @@ FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
# Specify the filename format for output files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
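The placeholders available in this format string are the ones the filename tests above exercise; an illustrative value, matching the test expectations:

# Illustrative configuration (matches the behaviour asserted in the tests above):
#     PAPERLESS_FILENAME_FORMAT={correspondent}/{correspondent}
# stores a document whose correspondent is "foo" as
#     <MEDIA_ROOT>/documents/originals/foo/foo-0000001.pdf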
CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",

View File

@@ -1,9 +1,9 @@
from django.conf.urls import include, url
from django.contrib import admin
from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.authtoken import views
from rest_framework.routers import DefaultRouter
from paperless.consumers import StatusConsumer
@@ -35,7 +35,7 @@ urlpatterns = [
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
url(r"^api/token/", views.obtain_auth_token), url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
# Favicon
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
@@ -59,10 +59,12 @@ urlpatterns = [
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
# Frontend assets TODO: this is pretty bad.
path('assets/<path:path>', RedirectView.as_view(url='/static/assets/%(path)s')),
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
path('accounts/', include('django.contrib.auth.urls')),
# Root of the Frontend
url(r".*", IndexView.as_view()),
url(r".*", login_required(IndexView.as_view())),
]