mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'dev' into celery-tasks
This commit is contained in:
@@ -2,7 +2,9 @@ from django.contrib import admin
|
||||
from django.contrib.auth.models import Group, User
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.utils.safestring import mark_safe
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from . import index
|
||||
from .models import Correspondent, Document, DocumentType, Log, Tag
|
||||
|
||||
|
||||
@@ -71,6 +73,21 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
return obj.created.date().strftime("%Y-%m-%d")
|
||||
created_.short_description = "Created"
|
||||
|
||||
def delete_queryset(self, request, queryset):
|
||||
ix = index.open_index()
|
||||
with AsyncWriter(ix) as writer:
|
||||
for o in queryset:
|
||||
index.remove_document(writer, o)
|
||||
super(DocumentAdmin, self).delete_queryset(request, queryset)
|
||||
|
||||
def delete_model(self, request, obj):
|
||||
index.remove_document_from_index(obj)
|
||||
super(DocumentAdmin, self).delete_model(request, obj)
|
||||
|
||||
def save_model(self, request, obj, form, change):
|
||||
index.add_or_update_document(obj)
|
||||
super(DocumentAdmin, self).save_model(request, obj, form, change)
|
||||
|
||||
@mark_safe
|
||||
def tags_(self, obj):
|
||||
r = ""
|
||||
|
@@ -18,7 +18,8 @@ class DocumentsConfig(AppConfig):
|
||||
set_log_entry,
|
||||
set_correspondent,
|
||||
set_document_type,
|
||||
set_tags
|
||||
set_tags,
|
||||
add_to_index
|
||||
|
||||
)
|
||||
|
||||
@@ -29,6 +30,7 @@ class DocumentsConfig(AppConfig):
|
||||
document_consumption_finished.connect(set_document_type)
|
||||
document_consumption_finished.connect(set_tags)
|
||||
document_consumption_finished.connect(set_log_entry)
|
||||
document_consumption_finished.connect(add_to_index)
|
||||
document_consumption_finished.connect(run_post_consume_script)
|
||||
|
||||
post_delete.connect(cleanup_document_deletion)
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import magic
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
@@ -6,77 +5,25 @@ from time import mktime
|
||||
|
||||
from django import forms
|
||||
from django.conf import settings
|
||||
|
||||
from .models import Document, Correspondent
|
||||
from pathvalidate import validate_filename, ValidationError
|
||||
|
||||
|
||||
class UploadForm(forms.Form):
|
||||
|
||||
TYPE_LOOKUP = {
|
||||
"application/pdf": Document.TYPE_PDF,
|
||||
"image/png": Document.TYPE_PNG,
|
||||
"image/jpeg": Document.TYPE_JPG,
|
||||
"image/gif": Document.TYPE_GIF,
|
||||
"image/tiff": Document.TYPE_TIF,
|
||||
}
|
||||
|
||||
correspondent = forms.CharField(
|
||||
max_length=Correspondent._meta.get_field("name").max_length,
|
||||
required=False
|
||||
)
|
||||
title = forms.CharField(
|
||||
max_length=Document._meta.get_field("title").max_length,
|
||||
required=False
|
||||
)
|
||||
document = forms.FileField()
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
forms.Form.__init__(self, *args, **kwargs)
|
||||
self._file_type = None
|
||||
|
||||
def clean_correspondent(self):
|
||||
"""
|
||||
I suppose it might look cleaner to use .get_or_create() here, but that
|
||||
would also allow someone to fill up the db with bogus correspondents
|
||||
before all validation was met.
|
||||
"""
|
||||
|
||||
corresp = self.cleaned_data.get("correspondent")
|
||||
|
||||
if not corresp:
|
||||
return None
|
||||
|
||||
if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
|
||||
raise forms.ValidationError(
|
||||
"That correspondent name is suspicious.")
|
||||
|
||||
return corresp
|
||||
|
||||
def clean_title(self):
|
||||
|
||||
title = self.cleaned_data.get("title")
|
||||
|
||||
if not title:
|
||||
return None
|
||||
|
||||
if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
|
||||
raise forms.ValidationError("That title is suspicious.")
|
||||
|
||||
return title
|
||||
|
||||
def clean_document(self):
|
||||
try:
|
||||
validate_filename(self.cleaned_data.get("document").name)
|
||||
except ValidationError:
|
||||
raise forms.ValidationError("That filename is suspicious.")
|
||||
return self.cleaned_data.get("document")
|
||||
|
||||
document = self.cleaned_data.get("document").read()
|
||||
|
||||
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
|
||||
file_type = m.id_buffer(document)
|
||||
|
||||
if file_type not in self.TYPE_LOOKUP:
|
||||
raise forms.ValidationError("The file type is invalid.")
|
||||
|
||||
self._file_type = self.TYPE_LOOKUP[file_type]
|
||||
|
||||
return document
|
||||
def get_filename(self, i=None):
|
||||
return os.path.join(
|
||||
settings.CONSUMPTION_DIR,
|
||||
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
|
||||
)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
@@ -85,15 +32,15 @@ class UploadForm(forms.Form):
|
||||
form do that as well. Think of it as a poor-man's queue server.
|
||||
"""
|
||||
|
||||
correspondent = self.cleaned_data.get("correspondent")
|
||||
title = self.cleaned_data.get("title")
|
||||
document = self.cleaned_data.get("document")
|
||||
document = self.cleaned_data.get("document").read()
|
||||
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
file_name = os.path.join(
|
||||
settings.CONSUMPTION_DIR,
|
||||
"{} - {}.{}".format(correspondent, title, self._file_type)
|
||||
)
|
||||
|
||||
file_name = self.get_filename()
|
||||
i = 0
|
||||
while os.path.exists(file_name):
|
||||
i += 1
|
||||
file_name = self.get_filename(i)
|
||||
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(document)
|
||||
|
@@ -2,15 +2,20 @@ import logging
|
||||
|
||||
from django.db import models
|
||||
from django.dispatch import receiver
|
||||
from whoosh import highlight
|
||||
from whoosh.fields import Schema, TEXT, NUMERIC
|
||||
from whoosh.highlight import Formatter, get_text
|
||||
from whoosh.index import create_in, exists_in, open_dir
|
||||
from whoosh.qparser import MultifieldParser
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents.models import Document
|
||||
from paperless import settings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JsonFormatter(Formatter):
|
||||
def __init__(self):
|
||||
self.seen = {}
|
||||
@@ -68,7 +73,7 @@ def open_index(recreate=False):
|
||||
|
||||
|
||||
def update_document(writer, doc):
|
||||
logging.getLogger(__name__).debug("Updating index with document{}".format(str(doc)))
|
||||
logger.debug("Indexing {}...".format(doc))
|
||||
writer.update_document(
|
||||
id=doc.pk,
|
||||
title=doc.title,
|
||||
@@ -77,19 +82,32 @@ def update_document(writer, doc):
|
||||
)
|
||||
|
||||
|
||||
@receiver(models.signals.post_save, sender=Document)
|
||||
def add_document_to_index(sender, instance, **kwargs):
|
||||
ix = open_index()
|
||||
with AsyncWriter(ix) as writer:
|
||||
update_document(writer, instance)
|
||||
def remove_document(writer, doc):
|
||||
logger.debug("Removing {} from index...".format(doc))
|
||||
writer.delete_by_term('id', doc.pk)
|
||||
|
||||
|
||||
@receiver(models.signals.post_delete, sender=Document)
|
||||
def remove_document_from_index(sender, instance, **kwargs):
|
||||
logging.getLogger(__name__).debug("Removing document {} from index".format(str(instance)))
|
||||
def add_or_update_document(document):
|
||||
ix = open_index()
|
||||
with AsyncWriter(ix) as writer:
|
||||
writer.delete_by_term('id', instance.pk)
|
||||
update_document(writer, document)
|
||||
|
||||
|
||||
def remove_document_from_index(document):
|
||||
ix = open_index()
|
||||
with AsyncWriter(ix) as writer:
|
||||
remove_document(writer, document)
|
||||
|
||||
|
||||
def query_page(ix, query, page):
|
||||
with ix.searcher() as searcher:
|
||||
query_parser = MultifieldParser(["content", "title", "correspondent"],
|
||||
ix.schema).parse(query)
|
||||
result_page = searcher.search_page(query_parser, page)
|
||||
result_page.results.fragmenter = highlight.ContextFragmenter(
|
||||
surround=50)
|
||||
result_page.results.formatter = JsonFormatter()
|
||||
return result_page
|
||||
|
||||
|
||||
def autocomplete(ix, term, limit=10):
|
||||
|
@@ -1,10 +1,6 @@
|
||||
import logging
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
from documents.classifier import DocumentClassifier, \
|
||||
IncompatibleClassifierVersionError
|
||||
from paperless import settings
|
||||
from ...mixins import Renderable
|
||||
from ...tasks import train_classifier
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
classifier = DocumentClassifier()
|
||||
|
||||
try:
|
||||
# load the classifier, since we might not have to train it again.
|
||||
classifier.reload()
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError):
|
||||
# This is what we're going to fix here.
|
||||
pass
|
||||
|
||||
try:
|
||||
if classifier.train():
|
||||
logging.getLogger(__name__).info(
|
||||
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
|
||||
)
|
||||
classifier.save_classifier()
|
||||
else:
|
||||
logging.getLogger(__name__).debug(
|
||||
"Training data unchanged."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.getLogger(__name__).error(
|
||||
"Classifier error: " + str(e)
|
||||
)
|
||||
train_classifier()
|
||||
|
@@ -1,9 +1,7 @@
|
||||
from django.core.management import BaseCommand
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
import documents.index as index
|
||||
from documents.mixins import Renderable
|
||||
from documents.models import Document
|
||||
from documents.tasks import index_reindex, index_optimize
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
if options['command'] == 'reindex':
|
||||
documents = Document.objects.all()
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in documents:
|
||||
index.update_document(writer, document)
|
||||
|
||||
index_reindex()
|
||||
elif options['command'] == 'optimize':
|
||||
index.open_index().optimize()
|
||||
index_optimize()
|
||||
|
24
src/documents/management/commands/document_renamer.py
Normal file
24
src/documents/management/commands/document_renamer.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.models import Document, Tag
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
||||
help = """
|
||||
This will rename all documents to match the latest filename format.
|
||||
""".replace(" ", "")
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.all():
|
||||
# Saving the document again will generate a new filename and rename
|
||||
document.save()
|
@@ -1,60 +0,0 @@
|
||||
import argparse
|
||||
import threading
|
||||
from multiprocessing import Pool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.consumer import Consumer
|
||||
from documents.models import Log, Document
|
||||
from documents.parsers import get_parser_class
|
||||
|
||||
|
||||
def process_document(doc):
|
||||
parser_class = get_parser_class(doc.file_name)
|
||||
if not parser_class:
|
||||
print("no parser available")
|
||||
else:
|
||||
print("Parser: {}".format(parser_class.__name__))
|
||||
parser = parser_class(doc.source_path, None)
|
||||
try:
|
||||
text = parser.get_text()
|
||||
doc.content = text
|
||||
doc.save()
|
||||
finally:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
def document_index(value):
|
||||
ivalue = int(value)
|
||||
if not (1 <= ivalue <= Document.objects.count()):
|
||||
raise argparse.ArgumentTypeError(
|
||||
"{} is not a valid document index (out of range)".format(value))
|
||||
|
||||
return ivalue
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
||||
help = "Performs OCR on all documents again!"
|
||||
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
"-s", "--start_index",
|
||||
default=None,
|
||||
type=document_index
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
docs = Document.objects.all().order_by("added")
|
||||
|
||||
indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
|
||||
|
||||
for i in indices:
|
||||
doc = docs[i]
|
||||
print("==================================")
|
||||
print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
|
||||
print("==================================")
|
||||
process_document(doc)
|
@@ -1,73 +0,0 @@
|
||||
# Generated by Django 3.1.2 on 2020-10-29 14:29
|
||||
import os
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
def make_index(apps, schema_editor):
|
||||
Document = apps.get_model("documents", "Document")
|
||||
documents = Document.objects.all()
|
||||
print()
|
||||
try:
|
||||
print(" --> Creating document index...")
|
||||
from whoosh.writing import AsyncWriter
|
||||
from documents import index
|
||||
ix = index.open_index(recreate=True)
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in documents:
|
||||
index.update_document(writer, document)
|
||||
except ImportError:
|
||||
# index may not be relevant anymore
|
||||
print(" --> Cannot create document index.")
|
||||
|
||||
|
||||
def restore_filenames(apps, schema_editor):
|
||||
Document = apps.get_model("documents", "Document")
|
||||
for doc in Document.objects.all():
|
||||
file_name = "{:07}.{}".format(doc.pk, doc.file_type)
|
||||
if doc.storage_type == "gpg":
|
||||
file_name += ".gpg"
|
||||
|
||||
if not doc.filename == file_name:
|
||||
try:
|
||||
print("file was renamed, restoring {} to {}".format(doc.filename, file_name))
|
||||
os.rename(os.path.join(settings.ORIGINALS_DIR, doc.filename),
|
||||
os.path.join(settings.ORIGINALS_DIR, file_name))
|
||||
except PermissionError:
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
|
||||
def initialize_document_classifier(apps, schema_editor):
|
||||
try:
|
||||
print("Initalizing document classifier...")
|
||||
from documents.classifier import DocumentClassifier
|
||||
classifier = DocumentClassifier()
|
||||
try:
|
||||
classifier.train()
|
||||
classifier.save_classifier()
|
||||
except Exception as e:
|
||||
print("Classifier error: {}".format(e))
|
||||
except ImportError:
|
||||
print("Document classifier not found, skipping")
|
||||
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0023_document_current_filename'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(make_index, migrations.RunPython.noop),
|
||||
migrations.RunPython(restore_filenames),
|
||||
migrations.RunPython(initialize_document_classifier, migrations.RunPython.noop),
|
||||
migrations.RemoveField(
|
||||
model_name='document',
|
||||
name='filename',
|
||||
),
|
||||
]
|
95
src/documents/migrations/1000_update_paperless_all.py
Normal file
95
src/documents/migrations/1000_update_paperless_all.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-07 12:35
|
||||
import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
def make_index(apps, schema_editor):
|
||||
Document = apps.get_model("documents", "Document")
|
||||
documents = Document.objects.all()
|
||||
print()
|
||||
try:
|
||||
print(" --> Creating document index...")
|
||||
from whoosh.writing import AsyncWriter
|
||||
from documents import index
|
||||
ix = index.open_index(recreate=True)
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in documents:
|
||||
index.update_document(writer, document)
|
||||
except ImportError:
|
||||
# index may not be relevant anymore
|
||||
print(" --> Cannot create document index.")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0023_document_current_filename'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='archive_serial_number',
|
||||
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='is_inbox_tag',
|
||||
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='DocumentType',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=128, unique=True)),
|
||||
('slug', models.SlugField(blank=True, editable=False)),
|
||||
('match', models.CharField(blank=True, max_length=256)),
|
||||
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
|
||||
('is_insensitive', models.BooleanField(default=True)),
|
||||
],
|
||||
options={
|
||||
'abstract': False,
|
||||
'ordering': ('name',),
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='document_type',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='correspondent',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='content',
|
||||
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='log',
|
||||
options={'ordering': ('-created',)},
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='log',
|
||||
name='modified',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='log',
|
||||
name='group',
|
||||
field=models.UUIDField(blank=True, null=True),
|
||||
),
|
||||
migrations.RunPython(
|
||||
code=make_index,
|
||||
reverse_code=django.db.migrations.operations.special.RunPython.noop,
|
||||
),
|
||||
]
|
28
src/documents/migrations/1001_auto_20201109_1636.py
Normal file
28
src/documents/migrations/1001_auto_20201109_1636.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-09 16:36
|
||||
|
||||
from django.db import migrations
|
||||
from django.db.migrations import RunPython
|
||||
from django_q.models import Schedule
|
||||
from django_q.tasks import schedule
|
||||
|
||||
|
||||
def add_schedules(apps, schema_editor):
|
||||
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
|
||||
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
|
||||
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
|
||||
|
||||
|
||||
def remove_schedules(apps, schema_editor):
|
||||
Schedule.objects.all().delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1000_update_paperless_all'),
|
||||
('django_q', '0013_task_attempt_count'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
RunPython(add_schedules, remove_schedules)
|
||||
]
|
@@ -1,23 +0,0 @@
|
||||
# Generated by Django 2.0.7 on 2018-07-12 09:52
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1000_update_paperless'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='archive_serial_number',
|
||||
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='is_inbox_tag',
|
||||
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
|
||||
),
|
||||
]
|
@@ -1,33 +0,0 @@
|
||||
# Generated by Django 2.0.7 on 2018-08-23 11:55
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1001_workflow_improvements'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='DocumentType',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=128, unique=True)),
|
||||
('slug', models.SlugField(blank=True, editable=False)),
|
||||
('match', models.CharField(blank=True, max_length=256)),
|
||||
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
|
||||
('is_insensitive', models.BooleanField(default=True)),
|
||||
],
|
||||
options={
|
||||
'abstract': False,
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='document_type',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.DocumentType'),
|
||||
),
|
||||
]
|
@@ -1,32 +0,0 @@
|
||||
# Generated by Django 3.1.2 on 2020-10-28 17:51
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1002_auto_20180823_1155'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='documenttype',
|
||||
options={'ordering': ('name',)},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='correspondent',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='documenttype',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
]
|
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 3.1.2 on 2020-10-29 13:31
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1003_auto_20201028_1751'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='content',
|
||||
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
|
||||
),
|
||||
]
|
@@ -1,26 +0,0 @@
|
||||
# Generated by Django 3.1.2 on 2020-11-02 00:07
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1004_auto_20201029_1331'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='log',
|
||||
options={'ordering': ('-created',)},
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='log',
|
||||
name='modified',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='log',
|
||||
name='group',
|
||||
field=models.UUIDField(blank=True, null=True),
|
||||
),
|
||||
]
|
@@ -3,11 +3,12 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
import dateutil.parser
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.dispatch import receiver
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
@@ -190,6 +191,14 @@ class Document(models.Model):
|
||||
added = models.DateTimeField(
|
||||
default=timezone.now, editable=False, db_index=True)
|
||||
|
||||
filename = models.FilePathField(
|
||||
max_length=256,
|
||||
editable=False,
|
||||
default=None,
|
||||
null=True,
|
||||
help_text="Current filename in storage"
|
||||
)
|
||||
|
||||
archive_serial_number = models.IntegerField(
|
||||
blank=True,
|
||||
null=True,
|
||||
@@ -211,15 +220,123 @@ class Document(models.Model):
|
||||
return "{}: {}".format(created, self.correspondent or self.title)
|
||||
return str(created)
|
||||
|
||||
def find_renamed_document(self, subdirectory=""):
|
||||
suffix = "%07i.%s" % (self.pk, self.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
suffix += ".gpg"
|
||||
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
root = os.path.normpath(Document.filename_to_path(subdirectory))
|
||||
|
||||
for filename in os.listdir(root):
|
||||
if filename.endswith(suffix):
|
||||
return os.path.join(subdirectory, filename)
|
||||
|
||||
fullname = os.path.join(subdirectory, filename)
|
||||
if os.path.isdir(Document.filename_to_path(fullname)):
|
||||
return self.find_renamed_document(fullname)
|
||||
|
||||
return None
|
||||
|
||||
@property
|
||||
def source_filename(self):
|
||||
# Initial filename generation (for new documents)
|
||||
if self.filename is None:
|
||||
self.filename = self.generate_source_filename()
|
||||
|
||||
# Check if document is still available under filename
|
||||
elif not os.path.isfile(Document.filename_to_path(self.filename)):
|
||||
recovered_filename = self.find_renamed_document()
|
||||
|
||||
# If we have found the file so update the filename
|
||||
if recovered_filename is not None:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Filename of document " + str(self.id) +
|
||||
" has changed and was successfully updated")
|
||||
self.filename = recovered_filename
|
||||
|
||||
# Remove all empty subdirectories from MEDIA_ROOT
|
||||
Document.delete_all_empty_subdirectories(
|
||||
Document.filename_to_path(""))
|
||||
else:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.error("File of document " + str(self.id) + " has " +
|
||||
"gone and could not be recovered")
|
||||
|
||||
return self.filename
|
||||
|
||||
@staticmethod
|
||||
def many_to_dictionary(field):
|
||||
# Converts ManyToManyField to dictionary by assuming, that field
|
||||
# entries contain an _ or - which will be used as a delimiter
|
||||
mydictionary = dict()
|
||||
|
||||
for index, t in enumerate(field.all()):
|
||||
# Populate tag names by index
|
||||
mydictionary[index] = slugify(t.name)
|
||||
|
||||
# Find delimiter
|
||||
delimiter = t.name.find('_')
|
||||
|
||||
if delimiter == -1:
|
||||
delimiter = t.name.find('-')
|
||||
|
||||
if delimiter == -1:
|
||||
continue
|
||||
|
||||
key = t.name[:delimiter]
|
||||
value = t.name[delimiter+1:]
|
||||
|
||||
mydictionary[slugify(key)] = slugify(value)
|
||||
|
||||
return mydictionary
|
||||
|
||||
def generate_source_filename(self):
|
||||
# Create filename based on configured format
|
||||
if settings.PAPERLESS_FILENAME_FORMAT is not None:
|
||||
tags = defaultdict(lambda: slugify(None),
|
||||
self.many_to_dictionary(self.tags))
|
||||
path = settings.PAPERLESS_FILENAME_FORMAT.format(
|
||||
correspondent=slugify(self.correspondent),
|
||||
title=slugify(self.title),
|
||||
created=slugify(self.created),
|
||||
added=slugify(self.added),
|
||||
tags=tags)
|
||||
else:
|
||||
path = ""
|
||||
|
||||
# Always append the primary key to guarantee uniqueness of filename
|
||||
if len(path) > 0:
|
||||
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
|
||||
else:
|
||||
filename = "%07i.%s" % (self.pk, self.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
filename += ".gpg"
|
||||
|
||||
return filename
|
||||
|
||||
def create_source_directory(self):
|
||||
new_filename = self.generate_source_filename()
|
||||
|
||||
# Determine the full "target" path
|
||||
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
|
||||
|
||||
# Create new path
|
||||
os.makedirs(dir_new, exist_ok=True)
|
||||
|
||||
@property
|
||||
def source_path(self):
|
||||
file_name = "{:07}.{}".format(self.pk, self.file_type)
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
file_name += ".gpg"
|
||||
return Document.filename_to_path(self.source_filename)
|
||||
|
||||
@staticmethod
|
||||
def filename_to_path(filename):
|
||||
return os.path.join(
|
||||
settings.ORIGINALS_DIR,
|
||||
file_name
|
||||
filename
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -245,6 +362,125 @@ class Document(models.Model):
|
||||
def thumbnail_file(self):
|
||||
return open(self.thumbnail_path, "rb")
|
||||
|
||||
def set_filename(self, filename):
|
||||
if os.path.isfile(Document.filename_to_path(filename)):
|
||||
self.filename = filename
|
||||
|
||||
@staticmethod
|
||||
def try_delete_empty_directories(directory):
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
directory = os.path.normpath(directory)
|
||||
root = os.path.normpath(Document.filename_to_path(""))
|
||||
|
||||
while directory != root:
|
||||
# Try to delete the current directory
|
||||
try:
|
||||
os.rmdir(directory)
|
||||
except os.error:
|
||||
# Directory not empty, no need to go further up
|
||||
return
|
||||
|
||||
# Cut off actual directory and go one level up
|
||||
directory, _ = os.path.split(directory)
|
||||
directory = os.path.normpath(directory)
|
||||
|
||||
@staticmethod
|
||||
def delete_all_empty_subdirectories(directory):
|
||||
# Go through all folders and try to delete all directories
|
||||
root = os.path.normpath(Document.filename_to_path(directory))
|
||||
|
||||
for filename in os.listdir(root):
|
||||
fullname = os.path.join(directory, filename)
|
||||
|
||||
if not os.path.isdir(Document.filename_to_path(fullname)):
|
||||
continue
|
||||
|
||||
# Go into subdirectory to see, if there is more to delete
|
||||
Document.delete_all_empty_subdirectories(
|
||||
os.path.join(directory, filename))
|
||||
|
||||
# Try to delete the directory
|
||||
try:
|
||||
os.rmdir(Document.filename_to_path(fullname))
|
||||
continue
|
||||
except os.error:
|
||||
# Directory not empty, no need to go further up
|
||||
continue
|
||||
|
||||
|
||||
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
|
||||
@receiver(models.signals.post_save, sender=Document)
|
||||
def update_filename(sender, instance, **kwargs):
|
||||
# Skip if document has not been saved yet
|
||||
if instance.filename is None:
|
||||
return
|
||||
|
||||
# Check is file exists and update filename otherwise
|
||||
if not os.path.isfile(Document.filename_to_path(instance.filename)):
|
||||
instance.filename = instance.source_filename
|
||||
|
||||
# Build the new filename
|
||||
new_filename = instance.generate_source_filename()
|
||||
|
||||
# If the filename is the same, then nothing needs to be done
|
||||
if instance.filename == new_filename:
|
||||
return
|
||||
|
||||
# Determine the full "target" path
|
||||
path_new = instance.filename_to_path(new_filename)
|
||||
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
|
||||
|
||||
# Create new path
|
||||
instance.create_source_directory()
|
||||
|
||||
# Determine the full "current" path
|
||||
path_current = instance.filename_to_path(instance.source_filename)
|
||||
|
||||
# Move file
|
||||
try:
|
||||
os.rename(path_current, path_new)
|
||||
except PermissionError:
|
||||
# Do not update filename in object
|
||||
return
|
||||
except FileNotFoundError:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.error("Renaming of document " + str(instance.id) + " failed " +
|
||||
"as file " + instance.filename + " was no longer present")
|
||||
return
|
||||
|
||||
# Delete empty directory
|
||||
old_dir = os.path.dirname(instance.filename)
|
||||
old_path = instance.filename_to_path(old_dir)
|
||||
Document.try_delete_empty_directories(old_path)
|
||||
|
||||
instance.filename = new_filename
|
||||
|
||||
# Save instance
|
||||
# This will not cause a cascade of post_save signals, as next time
|
||||
# nothing needs to be renamed
|
||||
instance.save()
|
||||
|
||||
|
||||
@receiver(models.signals.post_delete, sender=Document)
|
||||
def delete_files(sender, instance, **kwargs):
|
||||
if instance.filename is None:
|
||||
return
|
||||
|
||||
# Remove the document
|
||||
old_file = instance.filename_to_path(instance.filename)
|
||||
|
||||
try:
|
||||
os.remove(old_file)
|
||||
except FileNotFoundError:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Deleted document " + str(instance.id) + " but file " +
|
||||
old_file + " was no longer present")
|
||||
|
||||
# And remove the directory (if applicable)
|
||||
old_dir = os.path.dirname(instance.filename)
|
||||
old_path = instance.filename_to_path(old_dir)
|
||||
Document.try_delete_empty_directories(old_path)
|
||||
|
||||
|
||||
class Log(models.Model):
|
||||
|
||||
|
@@ -166,3 +166,7 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
user=user,
|
||||
object_repr=document.__str__(),
|
||||
)
|
||||
|
||||
|
||||
def add_to_index(sender, document, **kwargs):
|
||||
index.add_or_update_document(document)
|
||||
|
7
src/documents/static/bootstrap.min.css
vendored
Normal file
7
src/documents/static/bootstrap.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
44
src/documents/static/signin.css
Normal file
44
src/documents/static/signin.css
Normal file
@@ -0,0 +1,44 @@
|
||||
html,
|
||||
body {
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
body {
|
||||
display: -ms-flexbox;
|
||||
display: flex;
|
||||
-ms-flex-align: center;
|
||||
align-items: center;
|
||||
padding-top: 40px;
|
||||
padding-bottom: 40px;
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
.form-signin {
|
||||
width: 100%;
|
||||
max-width: 330px;
|
||||
padding: 15px;
|
||||
margin: auto;
|
||||
}
|
||||
.form-signin .checkbox {
|
||||
font-weight: 400;
|
||||
}
|
||||
.form-signin .form-control {
|
||||
position: relative;
|
||||
box-sizing: border-box;
|
||||
height: auto;
|
||||
padding: 10px;
|
||||
font-size: 16px;
|
||||
}
|
||||
.form-signin .form-control:focus {
|
||||
z-index: 2;
|
||||
}
|
||||
.form-signin input[type="text"] {
|
||||
margin-bottom: -1px;
|
||||
border-bottom-right-radius: 0;
|
||||
border-bottom-left-radius: 0;
|
||||
}
|
||||
.form-signin input[type="password"] {
|
||||
margin-bottom: 10px;
|
||||
border-top-left-radius: 0;
|
||||
border-top-right-radius: 0;
|
||||
}
|
57
src/documents/tasks.py
Normal file
57
src/documents/tasks.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import logging
|
||||
|
||||
from django.conf import settings
|
||||
from django_q.tasks import async_task, result
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents import index
|
||||
from documents.classifier import DocumentClassifier, \
|
||||
IncompatibleClassifierVersionError
|
||||
from documents.mail import MailFetcher
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
def consume_mail():
|
||||
MailFetcher().pull()
|
||||
|
||||
|
||||
def index_optimize():
|
||||
index.open_index().optimize()
|
||||
|
||||
|
||||
def index_reindex():
|
||||
documents = Document.objects.all()
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in documents:
|
||||
index.update_document(writer, document)
|
||||
|
||||
|
||||
def train_classifier():
|
||||
classifier = DocumentClassifier()
|
||||
|
||||
try:
|
||||
# load the classifier, since we might not have to train it again.
|
||||
classifier.reload()
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError):
|
||||
# This is what we're going to fix here.
|
||||
pass
|
||||
|
||||
try:
|
||||
if classifier.train():
|
||||
logging.getLogger(__name__).info(
|
||||
"Saving updated classifier model to {}...".format(
|
||||
settings.MODEL_FILE)
|
||||
)
|
||||
classifier.save_classifier()
|
||||
else:
|
||||
logging.getLogger(__name__).debug(
|
||||
"Training data unchanged."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.getLogger(__name__).error(
|
||||
"Classifier error: " + str(e)
|
||||
)
|
@@ -9,11 +9,11 @@
|
||||
<base href="/">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<link rel="icon" type="image/x-icon" href="favicon.ico">
|
||||
<link rel="stylesheet" href="{% static 'styles.css' %}"></head>
|
||||
<link rel="stylesheet" href="{% static 'frontend/styles.css' %}"></head>
|
||||
<body>
|
||||
<app-root>Loading...</app-root>
|
||||
<script src="{% static 'runtime.js' %}" defer></script>
|
||||
<script src="{% static 'polyfills.js' %}" defer></script>
|
||||
<script src="{% static 'main.js' %}" defer></script>
|
||||
<script src="{% static 'frontend/runtime.js' %}" defer></script>
|
||||
<script src="{% static 'frontend/polyfills.js' %}" defer></script>
|
||||
<script src="{% static 'frontend/main.js' %}" defer></script>
|
||||
</body>
|
||||
</html>
|
||||
|
44
src/documents/templates/registration/logged_out.html
Normal file
44
src/documents/templates/registration/logged_out.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!doctype html>
|
||||
|
||||
{% load static %}
|
||||
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<meta name="description" content="">
|
||||
<meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
|
||||
<meta name="generator" content="Jekyll v4.1.1">
|
||||
<title>Paperless Sign In</title>
|
||||
|
||||
<!-- Bootstrap core CSS -->
|
||||
<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
|
||||
|
||||
<style>
|
||||
.bd-placeholder-img {
|
||||
font-size: 1.125rem;
|
||||
text-anchor: middle;
|
||||
-webkit-user-select: none;
|
||||
-moz-user-select: none;
|
||||
-ms-user-select: none;
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
@media (min-width: 768px) {
|
||||
.bd-placeholder-img-lg {
|
||||
font-size: 3.5rem;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<!-- Custom styles for this template -->
|
||||
<link href="{% static 'signin.css' %}" rel="stylesheet">
|
||||
</head>
|
||||
|
||||
<body class="text-center">
|
||||
<div class="form-signin">
|
||||
<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
|
||||
<p>You have been successfully logged out. Bye!</p>
|
||||
<a href="/">Sign in again</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
54
src/documents/templates/registration/login.html
Normal file
54
src/documents/templates/registration/login.html
Normal file
@@ -0,0 +1,54 @@
|
||||
<!doctype html>
|
||||
|
||||
{% load static %}
|
||||
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<meta name="description" content="">
|
||||
<meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
|
||||
<meta name="generator" content="Jekyll v4.1.1">
|
||||
<title>Paperless Sign In</title>
|
||||
|
||||
<!-- Bootstrap core CSS -->
|
||||
<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
|
||||
|
||||
<style>
|
||||
.bd-placeholder-img {
|
||||
font-size: 1.125rem;
|
||||
text-anchor: middle;
|
||||
-webkit-user-select: none;
|
||||
-moz-user-select: none;
|
||||
-ms-user-select: none;
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
@media (min-width: 768px) {
|
||||
.bd-placeholder-img-lg {
|
||||
font-size: 3.5rem;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<!-- Custom styles for this template -->
|
||||
<link href="{% static 'signin.css' %}" rel="stylesheet">
|
||||
</head>
|
||||
|
||||
<body class="text-center">
|
||||
<form class="form-signin" method="post">
|
||||
{% csrf_token %}
|
||||
<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
|
||||
<p>Please sign in.</p>
|
||||
{% if form.errors %}
|
||||
<div class="alert alert-danger" role="alert">
|
||||
Your username and password didn't match. Please try again.
|
||||
</div>
|
||||
{% endif %}
|
||||
<label for="inputUsername" class="sr-only">Username</label>
|
||||
<input type="text" name="username" id="inputUsername" class="form-control" placeholder="Username" required autofocus>
|
||||
<label for="inputPassword" class="sr-only">Password</label>
|
||||
<input type="password" name="password" id="inputPassword" class="form-control" placeholder="Password" required>
|
||||
<button class="btn btn-lg btn-primary btn-block" type="submit">Sign in</button>
|
||||
</form>
|
||||
</body>
|
||||
</html>
|
@@ -1,66 +1,10 @@
|
||||
import re
|
||||
|
||||
from django.test import TestCase
|
||||
from unittest import mock
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from ..consumer import Consumer
|
||||
from ..models import FileInfo, Tag
|
||||
|
||||
|
||||
class TestConsumer(TestCase):
|
||||
|
||||
class DummyParser(object):
|
||||
pass
|
||||
|
||||
def test__get_parser_class_1_parser(self):
|
||||
self.assertEqual(
|
||||
self._get_consumer()._get_parser_class("doc.pdf"),
|
||||
self.DummyParser
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def test__get_parser_class_n_parsers(self, m, *args):
|
||||
|
||||
class DummyParser1(object):
|
||||
pass
|
||||
|
||||
class DummyParser2(object):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
||||
)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertEqual(
|
||||
Consumer(consume=tmpdir)._get_parser_class("doc.pdf"),
|
||||
DummyParser2
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def test__get_parser_class_0_parsers(self, m, *args):
|
||||
m.return_value = ((None, lambda _: None),)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertIsNone(
|
||||
Consumer(consume=tmpdir)._get_parser_class("doc.pdf")
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def _get_consumer(self, m, *args):
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": self.DummyParser}),
|
||||
)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
return Consumer(consume=tmpdir)
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
|
||||
TAGS = ("tag1", "tag2", "tag3")
|
||||
|
559
src/documents/tests/test_file_handling.py
Normal file
559
src/documents/tests/test_file_handling.py
Normal file
@@ -0,0 +1,559 @@
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from uuid import uuid4
|
||||
from pathlib import Path
|
||||
from shutil import rmtree
|
||||
|
||||
from dateutil import tz
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from django.utils.text import slugify
|
||||
from ..models import Tag, Document, Correspondent
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
deletion_list = []
|
||||
|
||||
def add_to_deletion_list(self, dirname):
|
||||
self.deletion_list.append(dirname)
|
||||
|
||||
def setUp(self):
|
||||
folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
os.makedirs(folder + "/documents/originals")
|
||||
override_settings(MEDIA_ROOT=folder).enable()
|
||||
override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
|
||||
self.add_to_deletion_list(folder)
|
||||
|
||||
def tearDown(self):
|
||||
for dirname in self.deletion_list:
|
||||
shutil.rmtree(dirname, ignore_errors=True)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="")
|
||||
def test_source_filename(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
self.assertEqual(document.source_filename, "0000001.pdf")
|
||||
|
||||
document.filename = "test.pdf"
|
||||
self.assertEqual(document.source_filename, "test.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="")
|
||||
def test_generate_source_filename(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
|
||||
|
||||
document.storage_type = Document.STORAGE_TYPE_GPG
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"0000001.pdf.gpg")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_file_renaming(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf")
|
||||
|
||||
# Enable encryption and check again
|
||||
document.storage_type = Document.STORAGE_TYPE_GPG
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf.gpg")
|
||||
document.save()
|
||||
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), True)
|
||||
|
||||
# Set a correspondent and save the document
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="test")[0]
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/test/test-0000001.pdf.gpg"), True)
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"test/test-0000001.pdf.gpg")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_file_renaming_missing_permissions(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf")
|
||||
|
||||
# Make the folder read- and execute-only (no writing and no renaming)
|
||||
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555)
|
||||
|
||||
# Set a correspondent and save the document
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="test")[0]
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/none/none-0000001.pdf"), True)
|
||||
self.assertEqual(document.source_filename,
|
||||
"none/none-0000001.pdf")
|
||||
|
||||
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_document_delete(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Ensure file deletion after delete
|
||||
document.delete()
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_document_delete_nofile(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_directory_not_empty(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
Path(document.source_path + "test").touch()
|
||||
|
||||
# Set a correspondent and save the document
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="test")[0]
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), True)
|
||||
|
||||
# Cleanup
|
||||
os.remove(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdftest")
|
||||
os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_underscore(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Add tag to document
|
||||
document.tags.create(name="type_demo")
|
||||
document.tags.create(name="foo_bar")
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_dash(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Add tag to document
|
||||
document.tags.create(name="type-demo")
|
||||
document.tags.create(name="foo-bar")
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_malformed(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Add tag to document
|
||||
document.tags.create(name="type:demo")
|
||||
document.tags.create(name="foo:bar")
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_all(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Add tag to document
|
||||
document.tags.create(name="demo")
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_out_of_bounds_0(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}")
|
||||
def test_tags_out_of_bounds_10000000(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}")
|
||||
def test_tags_out_of_bounds_99(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}/{correspondent}")
|
||||
def test_nested_directory_cleanup(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none"), True)
|
||||
|
||||
document.delete()
|
||||
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none/none-0000001.pdf"),
|
||||
False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals"), True)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
|
||||
def test_format_none(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_document_renamed(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf")
|
||||
|
||||
# Rename the document "illegaly"
|
||||
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
|
||||
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
|
||||
"none/none-0000001.pdf",
|
||||
settings.MEDIA_ROOT + "/documents/originals/" +
|
||||
"test/test-0000001.pdf")
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/test/test-0000001.pdf"), True)
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/none/none-0000001.pdf"), False)
|
||||
|
||||
# Set new correspondent and expect document to be saved properly
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="foo")[0]
|
||||
document.save()
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/foo/foo-0000001.pdf"), True)
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/foo"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/test"), False)
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"foo/foo-0000001.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_document_renamed_encrypted(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_GPG
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf.gpg")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf.gpg")
|
||||
|
||||
# Rename the document "illegaly"
|
||||
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
|
||||
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
|
||||
"none/none-0000001.pdf.gpg",
|
||||
settings.MEDIA_ROOT + "/documents/originals/" +
|
||||
"test/test-0000001.pdf.gpg")
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/test/test-0000001.pdf.gpg"), True)
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/none/none-0000001.pdf"), False)
|
||||
|
||||
# Set new correspondent and expect document to be saved properly
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="foo")[0]
|
||||
document.save()
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
|
||||
"originals/foo/foo-0000001.pdf.gpg"), True)
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/foo"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/test"), False)
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"foo/foo-0000001.pdf.gpg")
|
||||
|
||||
def test_delete_all_empty_subdirectories(self):
|
||||
# Create our working directory
|
||||
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
os.makedirs(tmp)
|
||||
self.add_to_deletion_list(tmp)
|
||||
|
||||
os.makedirs(os.path.join(tmp, "empty"))
|
||||
os.makedirs(os.path.join(tmp, "empty", "subdirectory"))
|
||||
|
||||
os.makedirs(os.path.join(tmp, "notempty"))
|
||||
Path(os.path.join(tmp, "notempty", "file")).touch()
|
||||
|
||||
Document.delete_all_empty_subdirectories(tmp)
|
||||
|
||||
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
|
||||
self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False)
|
||||
self.assertEqual(os.path.isfile(
|
||||
os.path.join(tmp, "notempty", "file")), True)
|
||||
|
||||
def test_try_delete_empty_directories(self):
|
||||
# Create our working directory
|
||||
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
os.makedirs(tmp)
|
||||
self.add_to_deletion_list(tmp)
|
||||
|
||||
os.makedirs(os.path.join(tmp, "notempty"))
|
||||
Path(os.path.join(tmp, "notempty", "file")).touch()
|
||||
os.makedirs(os.path.join(tmp, "notempty", "empty"))
|
||||
|
||||
Document.try_delete_empty_directories(
|
||||
os.path.join(tmp, "notempty", "empty"))
|
||||
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
|
||||
self.assertEqual(os.path.isfile(
|
||||
os.path.join(tmp, "notempty", "file")), True)
|
||||
self.assertEqual(os.path.isdir(
|
||||
os.path.join(tmp, "notempty", "empty")), False)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_document_accidentally_deleted(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Test source_path
|
||||
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf")
|
||||
|
||||
# Delete the document "illegaly"
|
||||
os.remove(settings.MEDIA_ROOT + "/documents/originals/" +
|
||||
"none/none-0000001.pdf")
|
||||
|
||||
# Set new correspondent and expect document to be saved properly
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="foo")[0]
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), True)
|
||||
self.assertEqual(document.source_filename,
|
||||
"none/none-0000001.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
def test_set_filename(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Set existing filename
|
||||
document.set_filename(tmp)
|
||||
self.assertEqual(document.source_filename, "none/none-0000001.pdf")
|
||||
|
||||
# Set non-existing filename
|
||||
document.set_filename("doesnotexist")
|
||||
self.assertEqual(document.source_filename, "none/none-0000001.pdf")
|
50
src/documents/tests/test_parsers.py
Normal file
50
src/documents/tests/test_parsers.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.parsers import get_parser_class
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_1_parser(self, m, *args):
|
||||
class DummyParser(object):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class("doc.pdf"),
|
||||
DummyParser
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_n_parsers(self, m, *args):
|
||||
|
||||
class DummyParser1(object):
|
||||
pass
|
||||
|
||||
class DummyParser2(object):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class("doc.pdf"),
|
||||
DummyParser2
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_0_parsers(self, m, *args):
|
||||
m.return_value = ((None, lambda _: None),)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertIsNone(
|
||||
get_parser_class("doc.pdf")
|
||||
)
|
@@ -6,9 +6,6 @@ from django_filters.rest_framework import DjangoFilterBackend
|
||||
from rest_framework.decorators import action
|
||||
from rest_framework.response import Response
|
||||
from rest_framework.views import APIView
|
||||
from whoosh import highlight
|
||||
from whoosh.qparser import QueryParser
|
||||
from whoosh.query import terms
|
||||
|
||||
from paperless.db import GnuPG
|
||||
from paperless.views import StandardPagination
|
||||
@@ -97,7 +94,16 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
filter_class = DocumentFilterSet
|
||||
search_fields = ("title", "correspondent__name", "content")
|
||||
ordering_fields = (
|
||||
"id", "title", "correspondent__name", "created", "modified", "added", "archive_serial_number")
|
||||
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
|
||||
|
||||
def update(self, request, *args, **kwargs):
|
||||
response = super(DocumentViewSet, self).update(request, *args, **kwargs)
|
||||
index.add_or_update_document(self.get_object())
|
||||
return response
|
||||
|
||||
def destroy(self, request, *args, **kwargs):
|
||||
index.remove_document_from_index(self.get_object())
|
||||
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
|
||||
|
||||
def file_response(self, pk, disposition):
|
||||
#TODO: this should not be necessary here.
|
||||
@@ -185,18 +191,13 @@ class SearchView(APIView):
|
||||
except (ValueError, TypeError):
|
||||
page = 1
|
||||
|
||||
with self.ix.searcher() as searcher:
|
||||
query_parser = QueryParser("content", self.ix.schema).parse(query)
|
||||
result_page = searcher.search_page(query_parser, page)
|
||||
result_page.results.fragmenter = highlight.ContextFragmenter(
|
||||
surround=50)
|
||||
result_page.results.formatter = index.JsonFormatter()
|
||||
result_page = index.query_page(self.ix, query, page)
|
||||
|
||||
return Response(
|
||||
{'count': len(result_page),
|
||||
'page': result_page.pagenum,
|
||||
'page_count': result_page.pagecount,
|
||||
'results': list(map(self.add_infos_to_hit, result_page))})
|
||||
return Response(
|
||||
{'count': len(result_page),
|
||||
'page': result_page.pagenum,
|
||||
'page_count': result_page.pagecount,
|
||||
'results': list(map(self.add_infos_to_hit, result_page))})
|
||||
|
||||
else:
|
||||
return Response({
|
||||
|
Reference in New Issue
Block a user