Merge branch 'dev' into celery-tasks

This commit is contained in:
Jonas Winkler
2020-11-19 22:10:57 +01:00
145 changed files with 5228 additions and 11538 deletions

View File

@@ -1,5 +1,4 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
@@ -32,7 +31,7 @@ class TagAdmin(admin.ModelAdmin):
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
readonly_fields = ("slug",)
readonly_fields = ("slug", )
class DocumentTypeAdmin(admin.ModelAdmin):
@@ -51,9 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type",)
list_display = ("title", "created", "added", "correspondent",
"tags_", "archive_serial_number", "document_type")
readonly_fields = ("added", "file_type", "storage_type", "filename")
list_display = (
"title",
"created",
"added",
"correspondent",
"tags_",
"archive_serial_number",
"document_type",
"filename"
)
list_filter = (
"document_type",
"tags",
@@ -120,8 +127,3 @@ admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
# Unless we implement multi-user, these default registrations don't make sense.
admin.site.unregister(Group)
admin.site.unregister(User)

View File

@@ -1,5 +1,4 @@
from django.apps import AppConfig
from django.db.models.signals import post_delete
class DocumentsConfig(AppConfig):
@@ -14,7 +13,6 @@ class DocumentsConfig(AppConfig):
add_inbox_tags,
run_pre_consume_script,
run_post_consume_script,
cleanup_document_deletion,
set_log_entry,
set_correspondent,
set_document_type,
@@ -33,6 +31,4 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_post_consume_script)
post_delete.connect(cleanup_document_deletion)
AppConfig.ready(self)

View File

@@ -4,6 +4,8 @@ from django.conf import settings
from django.core.checks import Error, register
from django.db.utils import OperationalError, ProgrammingError
from documents.signals import document_consumer_declaration
@register()
def changed_password_check(app_configs, **kwargs):
@@ -37,3 +39,17 @@ def changed_password_check(app_configs, **kwargs):
"""))]
return []
@register()
def parser_check(app_configs, **kwargs):
    """
    System check that verifies at least one document parser is registered.

    The ``document_consumer_declaration`` signal is sent and the second
    element of every ``(receiver, response)`` pair it returns is collected.

    Returns a list containing a single ``Error`` when nothing answered the
    signal, and an empty list otherwise.
    """
    parsers = [
        response[1]
        for response in document_consumer_declaration.send(None)
    ]

    if not parsers:
        return [Error("No parsers found. This is a bug. The consumer won't be "
                      "able to consume any documents without parsers.")]

    return []

View File

@@ -3,7 +3,6 @@ import logging
import os
import pickle
import re
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
@@ -64,7 +63,7 @@ class DocumentClassifier(object):
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f) # Version
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.data_hash, f)
pickle.dump(self.data_vectorizer, f)
@@ -89,16 +88,14 @@ class DocumentClassifier(object):
data.append(preprocessed_content)
y = -1
if doc.document_type:
if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_document_type.append(y)
y = -1
if doc.correspondent:
if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
@@ -120,8 +117,8 @@ class DocumentClassifier(object):
num_tags = len(labels_tags_unique)
# subtract 1 since -1 (null) is also part of the classes.
num_correspondents = len(labels_correspondent) - 1
num_document_types = len(labels_document_type) - 1
num_correspondents = len(set(labels_correspondent)) - 1
num_document_types = len(set(labels_document_type)) - 1
logging.getLogger(__name__).debug(
"{} documents, {} tag(s), {} correspondent(s), "
@@ -137,7 +134,7 @@ class DocumentClassifier(object):
logging.getLogger(__name__).debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1,2),
ngram_range=(1, 2),
min_df=0.01
)
data_vectorized = self.data_vectorizer.fit_transform(data)

View File

@@ -3,7 +3,6 @@ import hashlib
import logging
import os
import re
import uuid
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
@@ -13,7 +12,9 @@ from django.utils import timezone
from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .models import Document, FileInfo
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .signals import (
document_consumption_finished,
@@ -25,17 +26,10 @@ class ConsumerError(Exception):
pass
class Consumer:
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale pnm
2. Use tesseract on the pnm
3. Store the document in the MEDIA_ROOT with optional encryption
4. Store the OCR'd text in the database
5. Delete the document and image(s)
"""
class Consumer(LoggingMixin):
def _send_progress(self, filename, current_progress, max_progress, status, message, document_id=None):
def _send_progress(self, filename, current_progress, max_progress, status,
message, document_id=None):
payload = {
'filename': os.path.basename(filename),
'current_progress': current_progress,
@@ -44,156 +38,226 @@ class Consumer:
'message': message,
'document_id': document_id
}
async_to_sync(self.channel_layer.group_send)("status_updates", {'type': 'status_update', 'data': payload})
async_to_sync(self.channel_layer.group_send)("status_updates",
{'type': 'status_update',
'data': payload})
def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):
self.logger = logging.getLogger(__name__)
self.logging_group = None
self.consume = consume
self.scratch = scratch
self.classifier = DocumentClassifier()
def __init__(self):
super().__init__()
self.path = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
self.override_tag_ids = None
self.override_document_type_id = None
self.channel_layer = get_channel_layer()
os.makedirs(self.scratch, exist_ok=True)
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
self.storage_type = Document.STORAGE_TYPE_GPG
if not self.consume:
def pre_check_consumption_dir(self):
if not settings.CONSUMPTION_DIR:
raise ConsumerError(
"The CONSUMPTION_DIR settings variable does not appear to be "
"set."
)
"set.")
if not os.path.exists(self.consume):
if not os.path.isdir(settings.CONSUMPTION_DIR):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.consume))
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
@transaction.atomic
def try_consume_file(self, file):
"""
Return True if file was consumed
"""
self.logging_group = uuid.uuid4()
if not re.match(FileInfo.REGEXES["title"], file):
return False
doc = file
if self._is_duplicate(doc):
self.log(
"warning",
"Skipping {} as it appears to be a duplicate".format(doc)
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(checksum=checksum).exists():
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
raise ConsumerError(
"Not consuming {}: It is a duplicate.".format(self.filename)
)
return False
self.log("info", "Consuming {}".format(doc))
def pre_check_directories(self):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
def try_consume_file(self,
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
"""
Return the document object if it was successfully created.
"""
parser_class = get_parser_class(doc)
self.path = path
self.filename = override_filename or os.path.basename(path)
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
self.override_tag_ids = override_tag_ids
# this is for grouping logging entries for this particular file
# together.
self.renew_logging_group()
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class.
parser_class = get_parser_class(self.filename)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
return False
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else:
self.log("info", "Parser: {}".format(parser_class.__name__))
self.log("debug", "Parser: {}".format(parser_class.__name__))
self._send_progress(file, 0, 100, 'WORKING', 'Consumption started')
# Notify all listeners that we're going to do some work.
self._send_progress(self.filename, 0, 100, 'WORKING', 'Consumption started')
document_consumption_started.send(
sender=self.__class__,
filename=doc,
filename=self.path,
logging_group=self.logging_group
)
def progress_callback(current_progress, max_progress, message):
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 60 + 20)
self._send_progress(file, p, 100, "WORKING", message)
self._send_progress(self.filename, p, 100, "WORKING", message)
document_parser = parser_class(doc, self.logging_group, progress_callback)
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.path, self.logging_group, progress_callback)
# However, this already created working directories which we have to
# clean up.
# Parse the document. This may take some time.
try:
self.log("info", "Generating thumbnail for {}...".format(doc))
self._send_progress(file, 10, 100, 'WORKING',
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
self._send_progress(self.filename, 10, 100, 'WORKING',
'Generating thumbnail...')
thumbnail = document_parser.get_optimised_thumbnail()
self._send_progress(file, 20, 100, 'WORKING',
self.log("debug", "Parsing {}...".format(self.filename))
self._send_progress(self.filename, 20, 100, 'WORKING',
'Getting text from document...')
text = document_parser.get_text()
self._send_progress(file, 80, 100, 'WORKING',
self._send_progress(self.filename, 80, 100, 'WORKING',
'Getting date from document...')
date = document_parser.get_date()
self._send_progress(file, 85, 100, 'WORKING',
'Storing the document...')
document = self._store(
text,
doc,
thumbnail,
date
)
except ParseError as e:
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
document_parser.cleanup()
self._send_progress(self.filename, 100, 100, 'FAILED',
"Failed: {}".format(e))
raise ConsumerError(e)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
try:
classifier = DocumentClassifier()
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning(
"Cannot classify documents: {}.".format(e))
classifier = None
self._send_progress(self.filename, 85, 100, 'WORKING',
'Storing the document...')
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
document = self._store(
text=text,
date=date
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
self._send_progress(self.filename, 90, 100, 'WORKING',
'Performing post-consumption tasks...')
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
create_source_path_directory(document.source_path)
self._write(document, self.path, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
except Exception as e:
raise ConsumerError(e)
self._send_progress(file, 100, 100, 'FAILED',
"Failed: {}".format(e))
finally:
document_parser.cleanup()
return False
else:
document_parser.cleanup()
self._cleanup_doc(doc)
self.log(
"info",
"Document {} consumption finished".format(document)
)
self.log(
"info",
"Document {} consumption finished".format(document)
)
classifier = None
self._send_progress(file, 100, 100, 'SUCCESS',
'Finished.', document.id)
try:
self.classifier.reload()
classifier = self.classifier
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
return document
self._send_progress(file, 90, 100, 'WORKING',
'Performing post-consumption tasks...')
def _store(self, text, date):
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
self._send_progress(file, 100, 100, 'SUCCESS',
'Finished.', document.id)
return True
# If someone gave us the original filename, use it instead of doc.
def _store(self, text, doc, thumbnail, date):
file_info = FileInfo.from_path(self.filename)
file_info = FileInfo.from_path(doc)
stats = os.stat(doc)
stats = os.stat(self.path)
self.log("debug", "Saving record to database")
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
else:
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with open(self.path, "rb") as f:
document = Document.objects.create(
correspondent=file_info.correspondent,
title=file_info.title,
@@ -202,7 +266,7 @@ class Consumer:
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
storage_type=self.storage_type
storage_type=storage_type
)
relevant_tags = set(file_info.tags)
@@ -211,14 +275,30 @@ class Consumer:
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
self.apply_overrides(document)
#TODO: why do we need to save the document again?
document.filename = generate_filename(document)
# We need to save the document twice, since we need the PK of the
# document in order to create its filename above.
document.save()
return document
def apply_overrides(self, document):
if self.override_title:
document.title = self.override_title
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:
document.tags.add(Tag.objects.get(pk=tag_id))
def _write(self, document, source, target):
with open(source, "rb") as read_file:
with open(target, "wb") as write_file:
@@ -227,13 +307,3 @@ class Consumer:
return
self.log("debug", "Encrypting")
write_file.write(GnuPG.encrypted(read_file))
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
@staticmethod
def _is_duplicate(doc):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()

View File

@@ -0,0 +1,102 @@
import logging
import os
from collections import defaultdict
from django.conf import settings
from django.template.defaultfilters import slugify
def create_source_path_directory(source_path):
    """
    Make sure the directory that will hold ``source_path`` exists.

    Missing parent directories are created as needed; an already existing
    directory is not an error.
    """
    directory = os.path.dirname(source_path)
    # A bare file name has no directory component, and os.makedirs("")
    # would raise FileNotFoundError. There is nothing to create in that case.
    if directory:
        os.makedirs(directory, exist_ok=True)
def delete_empty_directories(directory):
    """
    Remove ``directory`` and then its parents for as long as they are empty.

    The walk goes upwards and stops at ``settings.ORIGINALS_DIR``, which is
    never removed. Paths that are not strictly inside the originals
    directory are ignored.

    This is best effort: the first directory that cannot be removed (not
    empty, already gone, no permission, ...) silently ends the walk.
    """
    directory = os.path.normpath(directory)
    root = os.path.normpath(settings.ORIGINALS_DIR)

    if not directory.startswith(root + os.path.sep):
        # Don't do anything outside our originals folder.
        # os.path.sep is appended so that we avoid these cases:
        #   directory = /home/originals2/test
        #   root = /home/originals ("/" gets appended and startswith fails)
        return

    while directory != root:
        try:
            # os.rmdir refuses to remove a non-empty directory, so there is
            # no need to list it first. That also removes the window between
            # listing and removing, and a directory that has already
            # vanished no longer raises out of this function.
            os.rmdir(directory)
        except OSError:
            # Whatever. Empty directories aren't that bad anyway.
            return

        # Go one level up.
        directory = os.path.normpath(os.path.dirname(directory))
def many_to_dictionary(field):
    """
    Turn the entries of a many-to-many ``field`` into a lookup dictionary.

    Every entry is stored under its position (0, 1, 2, ...) with its
    slugified name as value. Entries whose name contains ``_`` (or, failing
    that, ``-``) are additionally split at the first such character: the
    slugified part before it becomes a key and the slugified part after it
    becomes the value.
    """
    result = {}

    for position, entry in enumerate(field.all()):
        # Positional access, e.g. result[0].
        result[position] = slugify(entry.name)

        # Prefer "_" as delimiter and fall back to "-".
        split_at = entry.name.find('_')
        if split_at == -1:
            split_at = entry.name.find('-')

        if split_at != -1:
            prefix = entry.name[:split_at]
            suffix = entry.name[split_at + 1:]
            result[slugify(prefix)] = slugify(suffix)

    return result
def generate_filename(document):
    """
    Build the storage file name for ``document``.

    When ``settings.PAPERLESS_FILENAME_FORMAT`` is set, it is expanded with
    ``str.format``. The available placeholders are ``correspondent``,
    ``title``, ``created``, ``created_year``, ``created_month``,
    ``created_day``, ``added``, ``added_year``, ``added_month``,
    ``added_day`` and ``tags``. The document's primary key and file type are
    always appended, and ``.gpg`` is added for GPG-stored documents.

    A format string that cannot be expanded is reported with a warning and
    the name falls back to the primary key only.
    """
    path = ""
    filename_format = settings.PAPERLESS_FILENAME_FORMAT

    if filename_format is not None:
        created = document.created
        added = document.added
        # Unknown tag keys resolve to slugify(None) instead of raising.
        tags = defaultdict(lambda: slugify(None),
                           many_to_dictionary(document.tags))
        placeholders = dict(
            correspondent=slugify(document.correspondent),
            title=slugify(document.title),
            created=slugify(created),
            created_year=created.year if created else "none",
            created_month=created.month if created else "none",
            created_day=created.day if created else "none",
            added=slugify(added),
            added_year=added.year if added else "none",
            added_month=added.month if added else "none",
            added_day=added.day if added else "none",
            tags=tags,
        )
        try:
            path = filename_format.format(**placeholders)
        except (ValueError, KeyError, IndexError, AttributeError):
            # AttributeError covers format strings such as "{title.foo}",
            # which str.format rejects just like an unknown placeholder.
            logging.getLogger(__name__).warning(
                "Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to "
                "default.".format(filename_format))

    # Always append the primary key to guarantee uniqueness of filename
    if path:
        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
    else:
        filename = "%07i.%s" % (document.pk, document.file_type)

    # Append .gpg for encrypted files
    if document.storage_type == document.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename

View File

@@ -1,10 +1,11 @@
import os
import tempfile
from datetime import datetime
from time import mktime
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError
@@ -19,12 +20,6 @@ class UploadForm(forms.Form):
raise forms.ValidationError("That filename is suspicious.")
return self.cleaned_data.get("document")
def get_filename(self, i=None):
return os.path.join(
settings.CONSUMPTION_DIR,
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
)
def save(self):
"""
Since the consumer already does a lot of work, it's easier just to save
@@ -33,15 +28,16 @@ class UploadForm(forms.Form):
"""
document = self.cleaned_data.get("document").read()
original_filename = self.cleaned_data.get("document").name
t = int(mktime(datetime.now().timetuple()))
file_name = self.get_filename()
i = 0
while os.path.exists(file_name):
i += 1
file_name = self.get_filename(i)
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
# TODO: don't just append pdf. This is here for that weird regex check at the start of the consumer.
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
with open(file_name, "wb") as f:
f.write(document)
os.utime(file_name, times=(t, t))
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))

View File

@@ -1,7 +1,6 @@
import logging
from contextlib import contextmanager
from django.db import models
from django.dispatch import receiver
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@@ -9,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter
from documents.models import Document
from paperless import settings
logger = logging.getLogger(__name__)
@@ -69,6 +66,9 @@ def open_index(recreate=False):
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR)
else:
# TODO: this is not thread safe. If 2 instances try to create the index
# at the same time, this fails. This currently prevents parallel
# tests.
return create_in(settings.INDEX_DIR, get_schema())
@@ -99,15 +99,19 @@ def remove_document_from_index(document):
remove_document(writer, document)
@contextmanager
def query_page(ix, query, page):
with ix.searcher() as searcher:
searcher = ix.searcher()
try:
query_parser = MultifieldParser(["content", "title", "correspondent"],
ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
return result_page
yield result_page
finally:
searcher.close()
def autocomplete(ix, term, limit=10):

View File

@@ -1,4 +1,5 @@
import logging
import uuid
class PaperlessHandler(logging.Handler):
@@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
kwargs["group"] = record.group
Log.objects.create(**kwargs)
class LoggingMixin:
    """
    Mixin that lets a class emit log records tagged with a logging group.

    Records are sent to the logger named ``<module>.<class name>`` of the
    concrete class and carry the current group as the ``group`` extra.
    """

    # Group attached to every record; None until renew_logging_group() runs.
    logging_group = None

    def renew_logging_group(self):
        """Start a new logging group identified by a fresh random UUID."""
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        """
        Emit ``message`` through the logger method named by ``level``
        (for example ``"debug"``, ``"info"`` or ``"error"``).
        """
        cls = self.__class__
        logger_name = ".".join([cls.__module__, cls.__name__])
        emit = getattr(logging.getLogger(logger_name), level)
        emit(message, extra={"group": self.logging_group})

View File

@@ -1,250 +0,0 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from dateutil import parser
from django.conf import settings
from .models import Correspondent
class MailFetcherError(Exception):
    """Base class for the errors raised by the mail fetching code."""
class InvalidMessageError(MailFetcherError):
    """Raised when a fetched message is rejected and must not be imported."""
class Loggable(object):
    """
    Base class that logs through this module's logger and tags every
    record with a logging group.
    """

    def __init__(self, group=None):
        """
        :param group: logging group to use; a random UUID is generated when
            it is omitted (or otherwise falsy).
        """
        self.logger = logging.getLogger(__name__)
        self.logging_group = group if group else uuid.uuid4()

    def log(self, level, message):
        """Emit ``message`` via the logger method named by ``level``."""
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})
class Message(Loggable):
    """
    A crude, but simple email message class.  We assume that there's a subject
    and n attachments, and that we don't care about the message body.
    """

    # Secret that has to appear in the body of every accepted message. It is
    # read from the environment once, when the class is created.
    SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")

    def __init__(self, data, group=None):
        """
        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        Parse the raw message bytes in ``data``, validate subject and body
        and extract the single attachment.

        :param data: raw bytes of the message.
        :param group: logging group passed on to ``Loggable``.
        :raises InvalidMessageError: if the subject is missing or unsafe, the
            secret is not in the body, or the message does not carry exactly
            one attachment.
        """

        Loggable.__init__(self, group=group)

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)

        # Keep a missing subject as None. Calling str() on it would produce
        # the string "None", which check_subject() could never reject as
        # missing.
        raw_subject = message["Subject"]
        if raw_subject is not None:
            self.subject = str(raw_subject).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        self.log("info", 'Importing email: "{}"'.format(self.subject))

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            dispositions = content_disposition.strip().split(";")
            if len(dispositions) < 2:
                continue

            # A part is skipped only if it is neither declared as an
            # attachment nor mentions a filename in its second token.
            if not dispositions[0].lower() == "attachment" and \
               "filename" not in dispositions[1].lower():
                continue

            # NOTE(review): the payload is assumed to be base64 encoded.
            file_data = part.get_payload()

            attachments.append(Attachment(
                b64decode(file_data), content_type=part.get_content_type()))

        if len(attachments) == 0:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]

    def __bool__(self):
        """A message is truthy once it holds an attachment."""
        return bool(self.attachment)

    def check_subject(self):
        """Raise InvalidMessageError if the subject is missing or unsafe."""
        if self.subject is None:
            raise InvalidMessageError("Message does not have a subject")
        if not Correspondent.SAFE_REGEX.match(self.subject):
            raise InvalidMessageError("Message subject is unsafe: {}".format(
                self.subject))

    def check_body(self):
        """Raise InvalidMessageError if the secret is not in the body."""
        if self.SECRET not in self.body:
            raise InvalidMessageError("The secret wasn't in the body")

    def _set_time(self, message):
        """
        Set ``self.time`` from the message's Date header, falling back to
        the current time when the header is absent or cannot be parsed.
        """
        self.time = datetime.datetime.now()
        message_time = message.get("Date")
        if message_time:
            try:
                self.time = parser.parse(message_time)
            except (ValueError, AttributeError):
                pass  # We assume that "now" is ok

    @property
    def file_name(self):
        """File name made of the subject and the attachment's suffix."""
        return "{}.{}".format(self.subject, self.attachment.suffix)
class Attachment(object):
    """
    A single mail attachment: its raw bytes, its content type and the file
    suffix derived from that content type.
    """

    # The whole alternation is wrapped in a non-capturing group so that "^"
    # and "$" apply to both alternatives. Without it, the first alternative
    # is not anchored at the end and "application/pdf<anything>" matches.
    # Capture group numbers are unchanged: 2 = pdf, 4 = image subtype.
    SAFE_SUFFIX_REGEX = re.compile(
        r"^(?:(application/(pdf))|(image/(png|jpeg|gif|tiff)))$")

    def __init__(self, data, content_type):
        """
        :param data: raw bytes of the attachment.
        :param content_type: MIME type, e.g. ``application/pdf``.
        :raises MailFetcherError: if the content type is not an accepted one.
        """

        self.content_type = content_type
        self.data = data
        self.suffix = None

        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
        if not m:
            raise MailFetcherError(
                "Not-awesome file type: {}".format(self.content_type))
        self.suffix = m.group(2) or m.group(4)

    def read(self):
        """Return the raw bytes of the attachment."""
        return self.data
class MailFetcher(Loggable):
    """
    Fetches mail from an IMAP inbox and drops the attachments into the
    consumption directory.

    The connection details come from the ``PAPERLESS_CONSUME_MAIL_*``
    environment variables. The fetcher is enabled only if a host is set.
    """

    def __init__(self, consume=settings.CONSUMPTION_DIR):
        """
        :param consume: directory the attachments are written to.
        :raises MailFetcherError: if fetching is enabled but no
            PAPERLESS_EMAIL_SECRET is defined.
        """

        Loggable.__init__(self)

        self._connection = None
        self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
        self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
        self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
        self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
        self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")

        self._enabled = bool(self._host)
        if self._enabled and Message.SECRET is None:
            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")

        self.last_checked = time.time()
        self.consume = consume

    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.
        """

        if self._enabled:

            # Reset the grouping id for each fetch
            self.logging_group = uuid.uuid4()

            self.log("debug", "Checking mail")

            for message in self._get_messages():

                self.log("info", 'Storing email: "{}"'.format(message.subject))

                # The stored file gets the message time as its access and
                # modification time.
                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(self.consume, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)
                    os.utime(file_name, times=(t, t))

        self.last_checked = time.time()

    def _get_messages(self):
        """
        Connect, log in and return the list of valid messages in the inbox.

        A MailFetcherError is logged rather than raised; the messages
        collected up to that point are returned.
        """

        r = []
        try:

            self._connect()

            try:
                self._login()

                for message in self._fetch():
                    if message:
                        r.append(message)

                self._connection.expunge()
                self._connection.close()
            finally:
                # Always log out once a connection exists, so that a failed
                # login, inbox selection or fetch doesn't leave it open.
                self._connection.logout()

        except MailFetcherError as e:
            self.log("error", str(e))

        return r

    def _connect(self):
        """Open the IMAP-over-SSL connection to the configured host."""
        try:
            self._connection = imaplib.IMAP4_SSL(self._host, self._port)
        except OSError as e:
            msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
            raise MailFetcherError(msg)

    def _login(self):
        """Authenticate and select the configured inbox."""

        login = self._connection.login(self._username, self._password)
        if not login[0] == "OK":
            raise MailFetcherError("Can't log into mail: {}".format(login[1]))

        inbox = self._connection.select(self._inbox)
        if not inbox[0] == "OK":
            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield a Message for every mail in the selected inbox that parses
        successfully. Those mails are flagged as deleted on the server;
        invalid ones are logged and left untouched.
        """

        for num in self._connection.search(None, "ALL")[1][0].split():

            __, data = self._connection.fetch(num, "(RFC822)")

            message = None
            try:
                message = Message(data[0][1], self.logging_group)
            except InvalidMessageError as e:
                self.log("error", str(e))
            else:
                self._connection.store(num, "+FLAGS", "\\Deleted")

            if message:
                yield message

View File

@@ -3,11 +3,10 @@ import os
from django.conf import settings
from django.core.management.base import BaseCommand
from watchdog.observers import Observer
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from documents.consumer import Consumer
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver
try:
from inotify_simple import INotify, flags
@@ -17,17 +16,25 @@ except ImportError:
class Handler(FileSystemEventHandler):
def __init__(self, consumer):
self.consumer = consumer
def _consume(self, file):
if os.path.isfile(file):
try:
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
except Exception as e:
# Catch all so that the consumer won't crash.
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
def on_created(self, event):
self.consumer.try_consume_file(event.src_path)
self._consume(event.src_path)
def on_moved(self, event):
self._consume(event.src_path)
class Command(BaseCommand):
"""
On every iteration of an infinite loop, consume what we can from the
consumption directory, and fetch any mail available.
consumption directory.
"""
def __init__(self, *args, **kwargs):
@@ -35,12 +42,6 @@ class Command(BaseCommand):
self.verbosity = 0
self.logger = logging.getLogger(__name__)
self.file_consumer = None
self.mail_fetcher = None
self.first_iteration = True
self.consumer = Consumer()
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
@@ -56,9 +57,6 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
directory = options["directory"]
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
os.makedirs(d, exist_ok=True)
logging.getLogger(__name__).info(
"Starting document consumer at {}".format(
directory
@@ -68,11 +66,16 @@ class Command(BaseCommand):
# Consume all files as this is not done initially by the watchdog
for entry in os.scandir(directory):
if entry.is_file():
self.consumer.try_consume_file(entry.path)
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
# Start the watchdog. Woof!
observer = Observer()
event_handler = Handler(self.consumer)
if settings.CONSUMER_POLLING > 0:
logging.getLogger(__name__).info('Using polling instead of file'
'system notifications.')
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
else:
observer = Observer()
event_handler = Handler()
observer.schedule(event_handler, directory, recursive=True)
observer.start()
try:

View File

@@ -1,4 +1,5 @@
from django.core.management.base import BaseCommand
from ...mixins import Renderable
from ...tasks import train_classifier

View File

@@ -1,16 +1,15 @@
import json
import os
import time
import shutil
import time
from django.core.management.base import BaseCommand, CommandError
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document, Correspondent, Tag, DocumentType
from paperless.db import GnuPG
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...mixins import Renderable
class Command(Renderable, BaseCommand):

View File

@@ -3,15 +3,14 @@ import os
import shutil
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from paperless.db import GnuPG
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
@@ -82,6 +81,10 @@ class Command(Renderable, BaseCommand):
def _import_files_from_manifest(self):
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
for record in self.manifest:
if not record["model"] == "documents.document":
@@ -94,6 +97,14 @@ class Command(Renderable, BaseCommand):
document_path = os.path.join(self.source, doc_file)
thumbnail_path = os.path.join(self.source, thumb_file)
document.storage_type = storage_type
document.filename = generate_filename(document)
if os.path.isfile(document.source_path):
raise FileExistsError(document.source_path)
create_source_path_directory(document.source_path)
if settings.PASSPHRASE:
with open(document_path, "rb") as unencrypted:
@@ -109,18 +120,8 @@ class Command(Renderable, BaseCommand):
encrypted.write(GnuPG.encrypted(unencrypted))
else:
print("Moving {} to {}".format(document_path, document.source_path))
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)
# Reset the storage type to whatever we've used while importing
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
Document.objects.filter(
pk__in=[r["pk"] for r in self.manifest]
).update(
storage_type=storage_type
)
document.save()

View File

@@ -8,5 +8,5 @@ class Command(BaseCommand):
help = "A quick & dirty way to see what's in the logs"
def handle(self, *args, **options):
    """Print every ``Log`` row once, ordered by primary key."""
    for log in Log.objects.order_by("pk"):
        print(log)

View File

@@ -1,7 +1,6 @@
from django.core.management.base import BaseCommand
from documents.models import Document, Tag
from documents.models import Document
from ...mixins import Renderable

View File

@@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier):
correspondents = Correspondent.objects.all()
predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
return matched_correspondents
return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
def match_document_types(document_content, classifier):
    """Return the document types that apply to ``document_content``.

    A document type is selected when ``matches`` accepts it for the
    content, or when its primary key equals the classifier's prediction.

    :param document_content: text content of the document.
    :param classifier: object offering ``predict_document_type``; a falsy
        value disables the prediction part.
    :return: list of matching document type objects.
    """
    document_types = DocumentType.objects.all()
    predicted_document_type_id = (
        classifier.predict_document_type(document_content)
        if classifier else None
    )
    return [
        o for o in document_types
        if matches(o, document_content)
        or o.pk == predicted_document_type_id
    ]
def match_tags(document_content, classifier):

View File

@@ -1,7 +1,4 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import os
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion

View File

@@ -9,11 +9,11 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
def remove_schedules(apps, schema_editor):
    """Reverse step: delete only the schedules this migration registers.

    Schedules are selected by their ``func`` path, so schedules created
    elsewhere survive a rollback. Filtering for a func that has no
    schedule deletes nothing.
    """
    Schedule.objects.filter(func__in=(
        'documents.tasks.train_classifier',
        'documents.tasks.index_optimize',
        'documents.tasks.consume_mail',
    )).delete()
class Migration(migrations.Migration):

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2020-11-11 11:05
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1001_auto_20201109_1636'),
]
operations = [
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True),
),
]

View File

@@ -3,18 +3,15 @@
import logging
import os
import re
from collections import OrderedDict, defaultdict
from collections import OrderedDict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.dispatch import receiver
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
class MatchingModel(models.Model):
MATCH_ANY = 1
@@ -116,6 +113,7 @@ class DocumentType(MatchingModel):
class Document(models.Model):
# TODO: why do we need an explicit list
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
@@ -192,7 +190,7 @@ class Document(models.Model):
default=timezone.now, editable=False, db_index=True)
filename = models.FilePathField(
max_length=256,
max_length=1024,
editable=False,
default=None,
null=True,
@@ -220,123 +218,18 @@ class Document(models.Model):
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
def find_renamed_document(self, subdirectory=""):
    """Search the storage tree for this document's file after a rename.

    A file is recognised by its name ending in the zero-padded primary
    key plus the file type (plus ``.gpg`` for GPG storage).

    :param subdirectory: directory to search, relative to the path that
        ``Document.filename_to_path("")`` resolves to.
    :return: the matching name relative to that root, or ``None`` when
        nothing below ``subdirectory`` matches.
    """
    suffix = "%07i.%s" % (self.pk, self.file_type)

    # Append .gpg for encrypted files
    if self.storage_type == self.STORAGE_TYPE_GPG:
        suffix += ".gpg"

    # Walk the directory tree depth-first looking for the suffix
    root = os.path.normpath(Document.filename_to_path(subdirectory))
    for filename in os.listdir(root):
        fullname = os.path.join(subdirectory, filename)
        if filename.endswith(suffix):
            return fullname

        if os.path.isdir(Document.filename_to_path(fullname)):
            found = self.find_renamed_document(fullname)
            # An unsuccessful subtree must not end the search: the file
            # may still live in one of the remaining entries.
            if found is not None:
                return found

    return None
@property
def source_filename(self):
# Initial filename generation (for new documents)
if self.filename is None:
self.filename = self.generate_source_filename()
# Check if document is still available under filename
elif not os.path.isfile(Document.filename_to_path(self.filename)):
recovered_filename = self.find_renamed_document()
# If we have found the file so update the filename
if recovered_filename is not None:
logger = logging.getLogger(__name__)
logger.warning("Filename of document " + str(self.id) +
" has changed and was successfully updated")
self.filename = recovered_filename
# Remove all empty subdirectories from MEDIA_ROOT
Document.delete_all_empty_subdirectories(
Document.filename_to_path(""))
else:
logger = logging.getLogger(__name__)
logger.error("File of document " + str(self.id) + " has " +
"gone and could not be recovered")
return self.filename
@staticmethod
def many_to_dictionary(field):
# Converts ManyToManyField to dictionary by assuming, that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
if delimiter == -1:
delimiter = t.name.find('-')
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter+1:]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_source_filename(self):
# Create filename based on configured format
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
self.many_to_dictionary(self.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
correspondent=slugify(self.correspondent),
title=slugify(self.title),
created=slugify(self.created),
added=slugify(self.added),
tags=tags)
else:
path = ""
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
else:
filename = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def create_source_directory(self):
new_filename = self.generate_source_filename()
# Determine the full "target" path
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
# Create new path
os.makedirs(dir_new, exist_ok=True)
@property
def source_path(self):
return Document.filename_to_path(self.source_filename)
if self.filename:
fname = str(self.filename)
else:
fname = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg"
@staticmethod
def filename_to_path(filename):
return os.path.join(
settings.ORIGINALS_DIR,
filename
fname
)
@property
@@ -362,125 +255,6 @@ class Document(models.Model):
def thumbnail_file(self):
return open(self.thumbnail_path, "rb")
def set_filename(self, filename):
if os.path.isfile(Document.filename_to_path(filename)):
self.filename = filename
@staticmethod
def try_delete_empty_directories(directory):
    """Remove ``directory`` and then its parents, one level at a time.

    The walk stops at the first directory that cannot be removed, or
    when it reaches the path ``Document.filename_to_path("")`` yields.
    """
    current = os.path.normpath(directory)
    stop_at = os.path.normpath(Document.filename_to_path(""))

    while current != stop_at:
        try:
            os.rmdir(current)
        except os.error:
            # Removal failed (e.g. not empty): parents cannot be empty
            return

        # Continue with the parent directory
        parent = os.path.split(current)[0]
        current = os.path.normpath(parent)
@staticmethod
def delete_all_empty_subdirectories(directory):
    """Recursively remove the empty directories below ``directory``.

    ``directory`` is relative to the storage root and is resolved with
    ``Document.filename_to_path``. Children are cleaned before their
    parent, so a directory that only held empty directories goes too.
    """
    base = os.path.normpath(Document.filename_to_path(directory))

    for entry in os.listdir(base):
        relative = os.path.join(directory, entry)
        absolute = Document.filename_to_path(relative)

        if os.path.isdir(absolute):
            # Clean the subtree first, then try the directory itself
            Document.delete_all_empty_subdirectories(relative)
            try:
                os.rmdir(absolute)
            except os.error:
                # Removal failed (e.g. still has content): leave it
                pass
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename(sender, instance, **kwargs):
# Skip if document has not been saved yet
if instance.filename is None:
return
# Check is file exists and update filename otherwise
if not os.path.isfile(Document.filename_to_path(instance.filename)):
instance.filename = instance.source_filename
# Build the new filename
new_filename = instance.generate_source_filename()
# If the filename is the same, then nothing needs to be done
if instance.filename == new_filename:
return
# Determine the full "target" path
path_new = instance.filename_to_path(new_filename)
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
# Create new path
instance.create_source_directory()
# Determine the full "current" path
path_current = instance.filename_to_path(instance.source_filename)
# Move file
try:
os.rename(path_current, path_new)
except PermissionError:
# Do not update filename in object
return
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.error("Renaming of document " + str(instance.id) + " failed " +
"as file " + instance.filename + " was no longer present")
return
# Delete empty directory
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
instance.filename = new_filename
# Save instance
# This will not cause a cascade of post_save signals, as next time
# nothing needs to be renamed
instance.save()
@receiver(models.signals.post_delete, sender=Document)
def delete_files(sender, instance, **kwargs):
    """Delete the stored file of a document that was just deleted.

    Does nothing when the document has no filename. A file that is
    already missing is only logged. Afterwards the directories that
    contained the file are removed as far as they are empty.
    """
    stored_name = instance.filename
    if stored_name is None:
        return

    old_file = instance.filename_to_path(stored_name)
    try:
        os.remove(old_file)
    except FileNotFoundError:
        logging.getLogger(__name__).warning(
            "Deleted document " + str(instance.id) + " but file " +
            old_file + " was no longer present")

    # Clean up the directory the file lived in (if applicable)
    containing_dir = instance.filename_to_path(os.path.dirname(stored_name))
    Document.try_delete_empty_directories(containing_dir)
class Log(models.Model):
@@ -518,7 +292,7 @@ class FileInfo:
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
# TODO: what is this used for
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(

View File

@@ -20,13 +20,16 @@ from django.utils import timezone
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
# TODO: isnt there a date parsing library for this?
DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
@@ -39,17 +42,16 @@ def get_parser_class(doc):
Determine the appropriate parser class based on the file
"""
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
#TODO: add a check that checks parser availability.
options = []
for parser in parsers:
result = parser(doc)
if result:
options.append(result)
# Sein letzter Befehl war: KOMMT! Und sie kamen. Alle. Sogar die Parser.
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
parser_test = parser_declaration["test"]
if parser_test(doc):
options.append(parser_declaration)
if not options:
return None
@@ -59,7 +61,7 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False
args += ['-trim'] if trim else []
args += ['-type', str(type)] if type else []
args += ['-depth', str(depth)] if depth else []
args += [input, output]
args += [input_file, output_file]
logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
@@ -100,17 +102,17 @@ class ParseError(Exception):
pass
class DocumentParser:
class DocumentParser(LoggingMixin):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group, progress_callback):
super().__init__()
self.logging_group = logging_group
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__)
self.logging_group = logging_group
self.progress_callback = progress_callback
def get_thumbnail(self):
@@ -121,16 +123,19 @@ class DocumentParser:
def optimise_thumbnail(self, in_path):
out_path = os.path.join(self.tempdir, "optipng.png")
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
self.log('debug', 'Execute: ' + " ".join(args))
self.log('debug', 'Execute: ' + " ".join(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
return out_path
return out_path
else:
return in_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
@@ -222,11 +227,6 @@ class DocumentParser:
return date
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir)

View File

@@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer):
class LogSerializer(serializers.ModelSerializer):
class Meta:
model = Log
fields = (

View File

@@ -1,5 +1,5 @@
from django.dispatch import Signal
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
document_consumer_declaration = Signal(providing_args=[])
document_consumption_started = Signal()
document_consumption_finished = Signal()
document_consumer_declaration = Signal()

View File

@@ -6,9 +6,13 @@ from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.dispatch import receiver
from django.utils import timezone
from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
create_source_path_directory
from ..models import Document, Tag
@@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs):
)).wait()
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
    """Remove the files belonging to a document that was just deleted.

    Unlinks the source file and the thumbnail (a missing file is not an
    error) and then passes the source file's directory to
    ``delete_empty_directories``. Instances that are not ``Document``
    objects are ignored.
    """
    if isinstance(instance, Document):
        doomed_paths = (instance.source_path, instance.thumbnail_path)
        for path in doomed_paths:
            try:
                os.unlink(path)
            except FileNotFoundError:
                # Already gone, which is the state we want anyway
                pass

        delete_empty_directories(os.path.dirname(instance.source_path))
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename_and_move_files(sender, instance, **kwargs):
    """Move a document's file to the name ``generate_filename`` yields.

    Runs after a document is saved or its tags change. Nothing happens
    when the document has no filename yet, when the generated name equals
    the current one, when the current file is missing, or when the target
    path is already taken. A failed rename or save is logged and the
    previous filename is restored on the instance.
    """
    logger = logging.getLogger(__name__)

    if not instance.filename:
        # Can't update the filename if there is no filename to begin with.
        # This happens after the consumer creates a new document: the PK
        # needs to be set first by saving the document once, and at that
        # point the file is not yet in ORIGINALS_DIR, so there is nothing
        # to rename. In all other cases instance.filename will be set.
        return

    old_filename = instance.filename
    old_path = instance.source_path
    new_filename = generate_filename(instance)

    if new_filename == instance.filename:
        # Don't do anything if it's the same.
        return

    new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logger.fatal('Document {}: File {} has gone.'.format(
            str(instance), old_path))
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating
        # the file.
        logger.warning(
            'Document {}: Cannot rename file since target path {} already '
            'exists.'.format(str(instance), new_path))
        return

    create_source_path_directory(new_path)

    try:
        os.rename(old_path, new_path)
        instance.filename = new_filename
        # Saving re-enters this handler, which returns early because the
        # generated name now equals instance.filename.
        instance.save()
    except OSError as e:
        # Rename failed: keep pointing at the file that still exists.
        instance.filename = old_filename
        logger.warning(
            'Document {}: Could not rename {} to {}: {}'.format(
                str(instance), old_path, new_path, e))
    except DatabaseError as e:
        # The new name could not be stored: move the file back so disk
        # and database stay in agreement.
        os.rename(new_path, old_path)
        instance.filename = old_filename
        logger.warning(
            'Document {}: Could not save new filename {}, file was moved '
            'back to {}: {}'.format(str(instance), new_filename, old_path, e))

    if not os.path.isfile(old_path):
        # The file really moved away: tidy up what it left behind.
        delete_empty_directories(os.path.dirname(old_path))
def set_log_entry(sender, document=None, logging_group=None, **kwargs):

View File

@@ -1,20 +1,15 @@
import logging
from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter
from documents import index
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
def consume_mail():
MailFetcher().pull()
def index_optimize():
index.open_index().optimize()
@@ -55,3 +50,27 @@ def train_classifier():
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
def consume_file(path,
                 override_filename=None,
                 override_title=None,
                 override_correspondent_id=None,
                 override_document_type_id=None,
                 override_tag_ids=None):
    """Consume the file at ``path`` with a fresh ``Consumer``.

    All ``override_*`` arguments are forwarded unchanged to
    ``Consumer.try_consume_file``.

    :return: a success message containing the new document's primary key.
    :raises ConsumerError: when the consumer returns a falsy document.
    """
    overrides = {
        "override_filename": override_filename,
        "override_title": override_title,
        "override_correspondent_id": override_correspondent_id,
        "override_document_type_id": override_document_type_id,
        "override_tag_ids": override_tag_ids,
    }
    document = Consumer().try_consume_file(path, **overrides)

    if not document:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")

    return "Success. New document id {} created".format(document.pk)

File diff suppressed because it is too large Load Diff

View File

@@ -1,208 +0,0 @@
Return-Path: <sender@example.com>
X-Original-To: sender@mailbox4.mailhost.com
Delivered-To: sender@mailbox4.mailhost.com
Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from localhost (localhost.localdomain [127.0.0.1])
by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
X-Spam-Flag: NO
X-Spam-Score: 0
X-Spam-Level:
X-Spam-Status: No, score=0 tagged_above=-999 required=3
tests=[RCVD_IN_DNSWL_NONE=-0.0001]
Received: from mx8.mailhost.com ([127.0.0.1])
by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
To: paperless@example.com
From: Daniel Quinn <sender@example.com>
Subject: Test 0
Message-ID: <56B3CA2A.6030806@example.com>
Date: Thu, 4 Feb 2016 22:01:14 +0000
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
Thunderbird/38.5.0
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="------------090701020702030809070008"
This is a multi-part message in MIME format.
--------------090701020702030809070008
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
The secret word is "paperless" :-)
--------------090701020702030809070008
Content-Type: application/pdf;
name="test0.pdf"
Content-Transfer-Encoding: base64
Content-Disposition: attachment;
filename="test0.pdf"
JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0
ZURlY29kZT4+CnN0cmVhbQp4nFWLQQvCMAyF7/kVOQutSdeuHZSA0+3gbVDwIN6c3gR38e/b
bF4kkPfyvReyjB94IyFVF7pgG0ze4TLDZYevLamzPKEvEFqbMEZfq+WO+5GRHZbHNROLy+So
UfFi6g7/RyusEpUl9VsQxQTlHR2oV3wUEzOdhOnXG1aw/o1yK2cYCkww4RdbUCevCmVuZHN0
cmVhbQplbmRvYmoKCjMgMCBvYmoKMTM5CmVuZG9iagoKNSAwIG9iago8PC9MZW5ndGggNiAw
IFIvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aDEgMTA4MjQ+PgpzdHJlYW0KeJzlOWt0G9WZ
95uRbNmWLckPWY4SaRTFedmybI8T4rw8sS3ZiZ1YfqWSCbFkS7YEtiQkJSE8GlNeOQ5pUmh5
Zkt2l+XQNl3GhLaBpcWw0D19UGALLRRS0gM9nD0lxVBK9wCx97tXI0UJAc727L8d+c587/u9
7p0rOZXYEyJaMkV4Io1OBuLOqmqBEPJLQqB0dG9K2NRTsQHhM4Rw/zkWH5+870e7PiRE9Rgh
+Y+NT+wf+/b3e4YI0YYJKX41HAoEfxj6vUjIIgltrA0jYef8/nzEr0F8WXgydY2bP7QO8WOI
SxOx0cDxxbUmxN9AfOlk4Jr4apWLI8SMKBGigcmQpYXrRBx9KtobjyVTQbJsgZDl91B+PBGK
d9838hzipwjhjyIN8EMvLYJ5FOd4lTovX1NQWKQtLtGR/3eX+jCpIJ3qTURH4ux+wcWfIFXk
XkIW3qXY+ft898LH/5deaNKPe8hD5DFymLxGrlAYbuIhEbIHKbnX0+QlpNLLQ4bId8n055g9
QU4hPy3nJ0doJJe8PORucpL8xwWzeMgkuQ59+QF5DRrIz7BVYuQD0JAbyXNo9QOkbb+UKa4E
b2MMHMuhvk7u5w6RbdzbiNxLOZyT05NnyTHYjZZTGOfhbMQbP2P0NnID3vtJmOxFmF3qTZ/+
jhQs/AWjuoFsI18jW8hEjsaT8ABfiPUbIA9gTp9mNGeGmd/JX8n9kOPO3YnIN8g4jgBg7Nxh
fsvnZOh/ffGDpBhW8dWk4FJcrono5j/mGhc+5JeRQjK4MJehLXQt/IUPzEdVw6rF6k2qX3zR
HHnfUE2iNln44/x180H1DvVDWK2HcePouHzI5x0c6O/r9fTs2N7dtW1rZ4fb1d7WukVq2bxp
44b1zesuW7umod5Z56hduWJ59TL7UpvVVG7Q60qKiwoLNPl5ahXPAakVZPC7ZL5aMLgDdpc9
0OmoFVymcLuj1mV3+2UhIMj4UC23d3Yykj0gC35BXo6PQA7ZL0soOXaRpJSWlLKSoBc2ko10
CrsgP99uF07BUK8X4cPtdp8gn2XwdgarljOkGBGbDTWYV9RbwSW794anXX70EWaKCtvsbaFC
Ry2ZKSxCsAgheaU9PgMrNwMDuJWu9TMc0RTTaTFSVyAoe3q9rnazzeZz1G6VS+ztjEXamEk5
r03OZyaFCHWdHBJmamenbz+lJyP+Gm3QHgzs8sp8AHWnedf09G2yoUZeZW+XV137tgkjD8m1
9naXXEOtdvVl5+k6PyXI6mq9XZj+K8Fw7GffvZASUCh51fq/EgrKXJsMfV4bvcxuzPX0tNsu
uKf904FTC1MjdkFvn57RaqfjLkw38XjRxKmFJw6ZZfftPlnvD8N6nxK6u69LLuu93Ctz1W4h
HEAK/rXYbevMNkNWxvN5bIJpweRghm02moZDpyQygog81etN4wIZMT9KJGeNT+b8lDOb4VQM
Us5UhpNV99uxtl393mlZVb01aHdhxg8F5KkR7K4raWHsernkI7PNPl1qEJqdPiYroFdbgxFB
Vi/HJKFWrgL2DVWZ1jOk5KP046wZJ1huKBWa7WiG2nHZXX7lb2/YhAYETHRnTboRBryy1I6A
FFAq5pqpd6JGwI8Fi7SzYspOe1wut7dmq0vdckX6vUxFUZPL22TiH1W0ZKeLrSvBNe1vT7tA
bdl7vY8TceHMTJNgPimSJuJrp8LGNuyy5a5pb3BMtvrNQVx3Y4LXbJMlH1bYZ/eGfLTtMEOr
zphZc/hYrwx4u/rtXb1D3nWKI2kGNaeqdl1kxu41p81gA8qaao3g5cy8DwX1SBDcCNhbN+Jd
zq/W4NBjwhmVNm7rRsELZpKRRjfkVYIr1K7IUfwCo2raTm2dGWt5FEU7bZ1mm8+Wvhy1HLIF
ZWLU0NCkdmZYuE0hQ4P92dbJSDSXJtr0gtcesvvsYUGWPF4aG00Py7KSDJZzpVYDF2A5ycI0
ERuyMwhNpuyuMecmV+5geBbtvIi9NcMWpjX2rv5patyuGCTo+VaZ0BaW1hnMbC+gC9qOe6+g
xyXNFvT0jCTRxRxeT43Ytwan7f3ejUwa95MbzNfSuUpJF3QNtDpqcWtrnbHDwd4ZCQ72D3kf
1+O58OCA91EOuDZ/q29mGfK8jwv40mBUjlIpkSICRailPkQ0TN78uETIFOOqGIHho6eAMJom
QwMyeopL0/TpiZaziSTCIUeV5kgZaRXSNGnaFKOxa4bQlEmFakkjFUharpgzzwAlPYqUJ/Ac
WwDkpBaKwTyDWn2MfAqmZgokc1piCiWktIcHB89PPTjkPanFt7OZ3XGiVnphu5jCWGx8rbiE
IG2U633hab+PLjZixNLgH8hg34xlsm9GR/K0cqE91CoX2VspvYXSW9L0PErPxxYFI6D6FNbe
IwPtgMu9NlySwqKfmaf1Z2mlfLipTOv/6MCMVeP3hqfxDFoOG6XTpVwRp+ErjFqigQJeoykw
8AW831fAl3KEG/aR0hYj6IxwxghPGeGIEQ4YYdgISBQY/ao5I7xghOOMFzdCjxGsjJGmy0Z4
gLFiTE0yQj0TIEZ4k3GnGL2eUTYssHnSakcYo4fx5hhdzsyRVhCYzhwzNMummWJcdM2ZmeOK
7HV15koo1+6L6J/hUB5pqTEQ0cTuBtHkHN59hWgohcpmg9hQb1tzmcG+VAd2g81gX1EHNWCo
rIANr4jnrjC3qY61my0/v6bhlTVm1d3lL8GG+edeyi/65CrzGnqgAlKOJ7c/4neCJeQJaT8p
L68qLikpqCqwWJcs8viWkHJEKqs8Pm1lRRnHqdWGPp9af9wKZ6wwawW9FYgVmhE5aoW4FfxW
8FhBskK9FQQrWBkbWVMZLrJeZJqyFY7n0HOTk0hckAAldoy6RaSAyNJQCs0Ye/rTUA/l+ZtB
bDRWYOA0G032pfkKuGKNDdz5nT9qufb6xPxVNzy0+6YD88F9t0Mj/1G4btXGr9927q4qh6OK
231iybkyCqk5kwMXTg2eT0vV3aQIvy39gzRGtNo8g6HSyBf0+wgPep6vkCpKPb4KndagM3h8
uorySlBVQvOHlXC0Erh4JfgrwVMJUiXMVoJcCccZKlSCvhJIJcwxCormSl7YIzQFwywL2fKT
RSb9r7D4LAEGUQk+z750+ZqmtZgA/nzQ10mOWkmqdUiF/zhfdfwWqFG9mcalT9bTOHmhiq7B
gYV3uV/zz5GVxCc12fLLFxVjS6xaXWzjKystHp+5Us8XeXz5vHFqNcRXg381eFaDsBoeWQ3D
q6FnNWT8JVgewmpUSrA26QKhg1kPV6wRK41i45omJ9RxzN3KCvuK5faleRXlxkoLz/165vvu
79Q7GrqueeZeX2hX43eOjt/vXL0m0Tu4fcedQy120Nx+dEnpOze1P3Rt0xJb+6j7+iPW5yed
nvbmHYsa69p20q8ZpHPhXf5q/mlixt1lUmoxaKqrVYJWW6Xi8di/tHBpr89UYTAsxooZrAZO
yxsMRFNozFdhjBWkwuMj+qkVMLwCpBWAwBVYBEw+MbEhljY708knzawn0yvQoESp9N8KDNbQ
tBlaYE3TcrYu16yF/BKoKBcb114GL933jT3z82WJmfe3Hr/ncMe2YP/Sdf8E5KZbh4+0jzby
T3/1a+duqXLsToBp93VbeNWdgV3OPc/b5y0q9e6obDWxNYs1c6huJEbSIa0oLCnJL+P5SpNK
W6T1+Aryi3S4pg29PmJ8wASyCVpM4DTRMiUybSSKivfNpc2NjbSH1NhABvuaFhArxAq7oRzr
dFlFCcAO//B1N4RafvvbDfXr++03lyfGuTsdK155ZeDcgS2t+i0mK8u5B3Puxh6qIIvJYWmo
CkC3SFOhq1hiqSKY6CprFSa6qkpbWmr0+Er1WnWvT2uctYBsgeMWOGqBKQvELeC3gMcCxAKb
8SFZoN4CggX0FphjciiU2R2yO+MVSnFoRUzOzMJINx5bGxXlFqBpx2CwBQ3YdYKhArDlbE3L
QbXpwPjab9bX/8vO13/xq6cgMn93OAZ37ILXSqfv9ZQWrbPWvQvqjz6YH+uDYw8/ePJeGus2
jPUd3C/LcMecknrKVUWkqkqv0lusZXqPrwz3A4yY5GOD5eurUIGr7PVxRtwGO3J3RsI2wSlG
SQN+RldWvxLk+Z0v04HnNz4WXnWeXTA0leJKWr4JcNHT9gNWPMNyu8D9+uq75w/87uWJWN63
oT01/9/z1qmbrx7yJeY/dQ/BH/4GUGm75UOT4+PHqxzw/E/+bQX3joHVcwfG+CjWsxA77Anp
RoO6iKhJpUlT4vFp9Fy5BwMSTEBMcMYEHhPUm0BvgjmGvmiCWdZ1x01w1ARTJoibwG8CyQRp
lQ0PMJKHkeoZVc8YufrHmWZaDe9XfO6bMbtdZpdpNkFYfL0tsy/mNyn7DPYC/+h858uvvvrG
b3732FdvvWnPvhtvnoLX5w3z7//507/95dVnnjjz1o+fTb8baR52YB6MxC9txCwY1UbMgg7f
hhq9sZwv7/XxRvR8c24kcyyGdABIf8QEw3TxZd3fnd3MxVxfq7E/BQPbFA10UxTSa5Df0XBi
aP6y/3rttuOX1fSn5j/85+/dMdG8bBW8/6dz1vmPH3LOh1/+gY36akZfT/Mn0NdvScOktFil
KigtqDSpy4xl2IpGnQqPpX2+Yr1RW4D+Vxxn2Z7NJL/5TE49CCtgtm5yJpw0RTBBbtpzX9NE
eUUrj5yXNH0H0K5UenQFXY1VtGOh+fj1E18Hcd/8nzUdT7TMXQMW0J6wcu9UOT69r8rRvaIZ
yrkxfFPRGPGdnFeF9WiAR6UFgzZv8WIbWbnS4bBpebGxoc7ja9CttC02aB01Do/PqqupqMrL
Kygo7/MV6FfgMYev7vPx+r0i7BRhrQjLRDCKkCfCRyK8LcLLIvxUhAdFuEuEERHAI0K7CPVM
rlwElQjhuYzgYyKkRJBEaGJs5H0owusizIogMxs3ixAUFRNpGX1G7EURnhXheyIcZWJXibBB
BCEzx7r0BMdF8IswkJmjnGm+zTS/KcIUTi/V5PDNTPdt5gAnM4E4mx5n1YmgUdbL8BcfMy88
heYcxM6r5wjlbE6Z45lyPsuc0CqzJzTWAOyEVknvVZA9ppVw+edPbcsvOrZ1PSy59izZ/kL7
3P75wduPL3K5WioMh+dbDw0Oem86PL9z3z4o4/0165uaa1rn/6Qc5LwnNIXFqrVbMmi/b8m5
quyBh/WRE5vhD9hHi8msdAMpKzMVabX5pvwllsV40l2sK0PEaPL4Co0VpbRt9LRtHrTA2xZ4
1gL4QlFZoBmRb1ogZYGgBQYs0G6BJgsss4CZsfHNxuW+1/Bt9qIFsq+8LD03o8N/18n3wnPv
RRls3/6v69Pn3t7BITz4Xnn11aDl/bXN2WOvt39YOfcq58HbFt6C/eQVPPeapCKSl6ct5gvu
v5wvIy3KmRP3qpwDJ+x3NTW53KLo3tXQ2dkgut3s/y30Pzblq28Z1m38K2dN/9b/yzuXdJ7/
JXfhrbwqNf0FXJMloV6+bd5FvpJLueDS5zXjN8a3SLWKkHKumdTwS8gAR397Pkw6ES/Hpwd5
23DsQHgHPs2oU4NPJ0eUX9KfgR3wDLcaP8e4t/kh/pcqj+ohtSlvY97P895VZtWTRhoDi0SP
/bILgX/nf0p4xrVANOvbzqyfgJI7FZgj+WRMgXk8i04qsAplDiqwmpSQexQ4j+jIQwqcT64l
P1BgDX43dipwASmBNgUuhCj0KnARWcw9lf0vVx33ugIXkzV8gQKXkEX8Zuq9iv46f4L3KjAQ
QaVSYI6UqJYpME/WqhoVWIUyYQVWk8WqgwqcRyyqBxU4n3yoekaBNWSl+ocKXEAWq3+vwIXc
G+qPFbiIrNP8RoG1ZFdBiQIXkysLrlTgEtJU8HJ7ZDySilwbCgrBQCogjMbi+xOR8XBKWDm6
Smisb6gXOmKx8YmQ0BZLxGOJQCoSi9YVtl0s1ij0oYnOQKpW2BodreuOjITSskJ/KBEZ6wuN
75kIJLYkR0PRYCghOISLJS7Gd4YSSYo01tXX1zWc514sHEkKASGVCARDk4HEVUJs7EJHhERo
PJJMhRJIjESFwbr+OsETSIWiKSEQDQoDWcWesbHIaIgRR0OJVACFY6kwunrlnkQkGYyM0tmS
ddkIctLRnwrtDQnbA6lUKBmLtgaSOBd6NhCJxpK1wr5wZDQs7AskhWAoGRmPInNkv3ChjoDc
AMYSjcb2osm9oVr0eywRSoYj0XEhSUNWtIVUOJCiQU+GUonIaGBiYj/WbDKOWiNYpH2RVBgn
ngwlhR2hfUJfbDIQ/W5d2hXMzRgmVYhMxhOxvcxHR3I0EQpFcbJAMDASmYik0Fo4kAiMYsYw
bZHRJMsIJkKIB6IO155ELB5CT7/S0X1eEB1MZzMZm9iLM1PpaCgUpDOi23tDE6iEE0/EYlfR
eMZiCXQ0mAo7cjwfi0VTqBoTAsEgBo7Zio3umaR1wjSnMs4FRhMx5MUnAim0MpmsC6dS8fVO
5759++oCSmlGsTJ1aNn5RbzU/nhIqUeCWpmc6MbyR2np9rD60iD6t3YLPXHMjxudExSBWiHT
mg11DcoUmMZIPJWsS0Ym6mKJcWePu5u0kwgZx5HCcS0JkSARcAQQDyA0SmIkTvaTBJMKI1Ug
K5G6Cp+NpJ404BBIB0rFkD+B+gJpQziBWvQeYHZjJErq8FtE25daa0SoT/Gik2nXIrQV9UfR
QjfqjSA3165A+hklgvss1Rwne9CPAFK2kCRqhVAmyCQE4sDxZTa+jL+TQckspxH9qsdPHXp/
Kd0vsxxBWwLLdYpxqK+TzP+rkBZDvS/KiIByIVa/JHJCDAsyq9T2IEr0MykP06S5SLHZokxq
4BIz9uCMY6g/ymqZkRxltmlPpC3HEA4rWb0SM55gHgSZXia2JM782Rpcujv6mXd72ZzbGZ3i
ScZrRTypxJXO2QDzIoZUmot96AmdN8zgAMtnkGnTLosqmiPYd8IXziMougGlLlE2x17FS6pT
q+R7jN2TbN4oziEw/9JVvnBugeUpwLKervQkclNMdhTpE/jZr6yzScxKeq4RZSXtY+syrEQ8
yewKZAc+97GuiLG6RW1LWY3PZyXdN2NKpwpMN45wjEWRyaOD1YZGEmKeUijA1v4IakywudO+
hVl3BFhtQ0qtUyyCTL6CSqTU6zijOIiL9QVd8SElp1/BnaL7khbTGcztTVqTCeZvMsd2lHkb
zMaYzjaVmlBmSkc8wXakq7L1GWP9ls5okFlzfE7Ox1huUsqsMeZRED/piqd7K4a6e1g90usp
3c2pz2QuwPIbU/TibF9KKb5MsvURZh0YJ+vxbOlE7+injvVh7qoZVdZMneKz8+/Wo37FWQZz
10ci68sk+titrP5odtXtyVm/mUr04x7UzfaLuNI/biVzwkUW6Kq5eNdsYPvlhVGkuzGCeIr5
k2S5rGMxjCO/B2foZufo9DcHG/p0iWumwLNlBEIEIAzjpIxYwU92wDAZhC1kE0j4lJDXis82
xOmzDjaRKZTbhPTNiG9E+gbcPK14b8HRg+MIDhWOtEQ9Sjjx6VRwB+K1qPEC3oENSm1BKn1u
Q7wTnx3K0410Fz5dCr4VcXwSP+TjQbyF3Z8ClXQSzpyDF86BcA4OfAKeT2Dqg6MfcO/PrbI+
MvfUHNfz3vB7j7zH178HuvdAQ87qz3rO+s/Gzx4/m1eoexe05E9geOvMOuubm04P/n7TG4Pk
NEZ2uv605/TUafm0+jTwg2/wRqt+Vpitn43PTs2+OHtmdm5WM/WToz/hfvyk06p70vokZz3Z
c/LASd7/MOgetj7Mee73388dPQa6Y9ZjzmP8fffWWe/tsFjvvmuF9cxdc3dxpxZmT95VbHA/
CT3QTTZhDnec5Besj2ypgO0Ylg7vVhxOHD04YjiO4MDvPShuxeGEbmkdP/wtKLrDfEfNHdfd
cegOdfzWqVuP3spP3XL0Fu6RvU/t5ZKeVdZYtMYa7VhtrRJNg/kiP5iH0+Ds0taR6pVu/7Bk
HUahy4fqrUMdq6xlYumgGgNWoaCOt/ItfA8f44/wT/H5mj6PxdqL44xnzsNJngKtW9dj7XH2
8KcWzkihLhta2xbfNrWN3+peZe3sWGfVdVg7nB0vdLzZ8V5H3nAHPIB/7kfcT7l5yb3K6Zbc
Fpt7cad50ChWDBpAN6gXdYMcYKFFMujULeg4nW5Yd0DH60gL4aaMoIZTcHRmoL+mputU/kJf
l6zxXC7DQbm6n96l3iE576BMBocu984AfN13y+HDpHVJl9zY75X9S3xdchABiQJTCOiXzBhJ
qy+ZTNWwC2pqEN6Dd1KzpwaJu5NpKsnySU0SkrhHJZkS1FCBNA54r6E8JFA9QO3dSUJvlFmT
VqLaScUcU07fGGDa/T/LhW2oCmVuZHN0cmVhbQplbmRvYmoKCjYgMCBvYmoKNjI5MQplbmRv
YmoKCjcgMCBvYmoKPDwvVHlwZS9Gb250RGVzY3JpcHRvci9Gb250TmFtZS9CQUFBQUErTGli
ZXJhdGlvblNlcmlmCi9GbGFncyA0Ci9Gb250QkJveFstNTQzIC0zMDMgMTI3NyA5ODFdL0l0
YWxpY0FuZ2xlIDAKL0FzY2VudCA4OTEKL0Rlc2NlbnQgLTIxNgovQ2FwSGVpZ2h0IDk4MQov
U3RlbVYgODAKL0ZvbnRGaWxlMiA1IDAgUgo+PgplbmRvYmoKCjggMCBvYmoKPDwvTGVuZ3Ro
IDI5Mi9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0KeJxdkctuwyAQRfd8Bct0EfmROA/J
spQmseRFH6rbD3BgnCLVGGGy8N+XmUlbqQvQmZl7BxiSY3NqrAnJqx9VC0H2xmoP03jzCuQF
rsaKLJfaqHCPaFdD50QSve08BRga249lKZK3WJuCn+XioMcLPIjkxWvwxl7l4uPYxri9OfcF
A9ggU1FVUkMf+zx17rkbICHXstGxbMK8jJY/wfvsQOYUZ3wVNWqYXKfAd/YKokzTSpZ1XQmw
+l8tK9hy6dVn56M0i9I0LdZV5Jx4s0NeMe+R18TbFXJBnKfIG9ZkyFvWUJ8d5wvkPTPlD8w1
8iMz9Tyyl/Qnzp+Qz8xn5JrPPdOj7rfH5+H8f8Ym1c37ODL6JJoVTslY+P1HNzp00foG7l+O
gwplbmRzdHJlYW0KZW5kb2JqCgo5IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1RydWVU
eXBlL0Jhc2VGb250L0JBQUFBQStMaWJlcmF0aW9uU2VyaWYKL0ZpcnN0Q2hhciAwCi9MYXN0
Q2hhciAxNQovV2lkdGhzWzc3NyA2MTAgNTAwIDI3NyAzODkgMjUwIDQ0MyAyNzcgNDQzIDUw
MCA1MDAgNDQzIDUwMCA3NzcgNTAwIDI1MApdCi9Gb250RGVzY3JpcHRvciA3IDAgUgovVG9V
bmljb2RlIDggMCBSCj4+CmVuZG9iagoKMTAgMCBvYmoKPDwvRjEgOSAwIFIKPj4KZW5kb2Jq
CgoxMSAwIG9iago8PC9Gb250IDEwIDAgUgovUHJvY1NldFsvUERGL1RleHRdCj4+CmVuZG9i
agoKMSAwIG9iago8PC9UeXBlL1BhZ2UvUGFyZW50IDQgMCBSL1Jlc291cmNlcyAxMSAwIFIv
TWVkaWFCb3hbMCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNl
UkdCL0kgdHJ1ZT4+L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUv
UGFnZXMKL1Jlc291cmNlcyAxMSAwIFIKL01lZGlhQm94WyAwIDAgNTk1IDg0MiBdCi9LaWRz
WyAxIDAgUiBdCi9Db3VudCAxPj4KZW5kb2JqCgoxMiAwIG9iago8PC9UeXBlL0NhdGFsb2cv
UGFnZXMgNCAwIFIKL09wZW5BY3Rpb25bMSAwIFIgL1hZWiBudWxsIG51bGwgMF0KL0xhbmco
ZW4tR0IpCj4+CmVuZG9iagoKMTMgMCBvYmoKPDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5
MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAwNEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAw
NjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzUwMDJFMDAzMD4KL0NyZWF0aW9uRGF0ZShEOjIw
MTYwMjA0MjIwMDAyWicpPj4KZW5kb2JqCgp4cmVmCjAgMTQKMDAwMDAwMDAwMCA2NTUzNSBm
IAowMDAwMDA3NTA5IDAwMDAwIG4gCjAwMDAwMDAwMTkgMDAwMDAgbiAKMDAwMDAwMDIyOSAw
MDAwMCBuIAowMDAwMDA3NjUyIDAwMDAwIG4gCjAwMDAwMDAyNDkgMDAwMDAgbiAKMDAwMDAw
NjYyNSAwMDAwMCBuIAowMDAwMDA2NjQ2IDAwMDAwIG4gCjAwMDAwMDY4NDEgMDAwMDAgbiAK
MDAwMDAwNzIwMiAwMDAwMCBuIAowMDAwMDA3NDIyIDAwMDAwIG4gCjAwMDAwMDc0NTQgMDAw
MDAgbiAKMDAwMDAwNzc1MSAwMDAwMCBuIAowMDAwMDA3ODQ4IDAwMDAwIG4gCnRyYWlsZXIK
PDwvU2l6ZSAxNC9Sb290IDEyIDAgUgovSW5mbyAxMyAwIFIKL0lEIFsgPDRFN0ZCMEZCMjA4
ODBCNURBQkIzQTNEOTQxNDlBRTQ3Pgo8NEU3RkIwRkIyMDg4MEI1REFCQjNBM0Q5NDE0OUFF
NDc+IF0KL0RvY0NoZWNrc3VtIC8yQTY0RDMzNzRFQTVEODMwNTRDNEI2RDFEMUY4QzU1RQo+
PgpzdGFydHhyZWYKODAxOAolJUVPRgo=
--------------090701020702030809070008--

View File

@@ -0,0 +1,217 @@
import os
import shutil
import tempfile
from unittest import mock
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase
from documents.models import Document, Correspondent, DocumentType, Tag
class DocumentApiTest(APITestCase):
    """API-level tests for the document, search and statistics endpoints.

    Every test runs against freshly created scratch/media directories and
    is logged in as a temporary superuser.
    """

    def setUp(self):
        """Create temporary directories, point settings at them and log in."""
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")

        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)

        overrides = override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        )
        overrides.enable()
        # Undo the override after each test; otherwise the settings keep
        # pointing at directories that tearDown() has already removed.
        self.addCleanup(overrides.disable)

        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)

    def tearDown(self):
        """Remove the temporary directories created in setUp()."""
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)

    def _assert_filtered_ids(self, query, expected_ids):
        """Assert that ``/api/documents/?<query>`` returns exactly
        ``expected_ids``, in that order, with status 200."""
        response = self.client.get("/api/documents/?" + query)
        self.assertEqual(response.status_code, 200)
        self.assertListEqual(
            [result['id'] for result in response.data['results']],
            expected_ids
        )

    def testDocuments(self):
        """List, update and delete a single document through the API."""
        response = self.client.get("/api/documents/").data
        self.assertEqual(response['count'], 0)

        c = Correspondent.objects.create(name="c", pk=41)
        dt = DocumentType.objects.create(name="dt", pk=63)
        tag = Tag.objects.create(name="t", pk=85)

        doc = Document.objects.create(
            title="WOW",
            content="the content",
            correspondent=c,
            document_type=dt,
            checksum="123"
        )
        doc.tags.add(tag)

        response = self.client.get("/api/documents/", format='json')
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['count'], 1)

        returned_doc = response.data['results'][0]
        self.assertEqual(returned_doc['id'], doc.id)
        self.assertEqual(returned_doc['title'], doc.title)
        self.assertEqual(returned_doc['correspondent']['name'], c.name)
        self.assertEqual(returned_doc['document_type']['name'], dt.name)
        self.assertEqual(returned_doc['correspondent']['id'], c.id)
        self.assertEqual(returned_doc['document_type']['id'], dt.id)
        self.assertEqual(returned_doc['correspondent']['id'],
                         returned_doc['correspondent_id'])
        self.assertEqual(returned_doc['document_type']['id'],
                         returned_doc['document_type_id'])
        self.assertEqual(len(returned_doc['tags']), 1)
        self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
        self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
        self.assertListEqual(returned_doc['tags_id'], [tag.id])

        # Send the returned representation back with two fields changed.
        c2 = Correspondent.objects.create(name="c2")
        returned_doc['correspondent_id'] = c2.pk
        returned_doc['title'] = "the new title"

        response = self.client.put(
            '/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
        self.assertEqual(response.status_code, 200)

        doc_after_save = Document.objects.get(id=doc.id)
        self.assertEqual(doc_after_save.correspondent, c2)
        self.assertEqual(doc_after_save.title, "the new title")

        self.client.delete("/api/documents/{}/".format(doc_after_save.pk))
        self.assertEqual(Document.objects.count(), 0)

    def test_document_actions(self):
        """download/preview return the original file, thumb the thumbnail."""
        content = b"This is a test"
        content_thumbnail = b"thumbnail content"

        # mkstemp() hands back an open descriptor; write through it so it
        # is closed again instead of leaking for the rest of the test run.
        fd, filename = tempfile.mkstemp(dir=self.originals_dir)
        with os.fdopen(fd, "wb") as f:
            f.write(content)

        doc = Document.objects.create(
            title="none",
            filename=os.path.basename(filename),
            file_type="pdf"
        )

        thumbnail_file = os.path.join(
            self.thumbnail_dir, "{:07d}.png".format(doc.pk))
        with open(thumbnail_file, "wb") as f:
            f.write(content_thumbnail)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_thumbnail)

    def test_document_actions_not_existing_file(self):
        """All file actions answer 404 when the files are missing on disk."""
        doc = Document.objects.create(
            title="none", filename=os.path.basename("asd"), file_type="pdf")

        for action in ("download", "preview", "thumb"):
            response = self.client.get(
                '/api/documents/{}/{}/'.format(doc.pk, action))
            self.assertEqual(response.status_code, 404)

    def test_document_filters(self):
        """Exercise the is_in_inbox, tags__id__in and tags__id__all filters."""
        doc1 = Document.objects.create(title="none1", checksum="A")
        doc2 = Document.objects.create(title="none2", checksum="B")
        doc3 = Document.objects.create(title="none3", checksum="C")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        tag_2 = Tag.objects.create(name="t2")
        tag_3 = Tag.objects.create(name="t3")

        doc1.tags.add(tag_inbox)
        doc2.tags.add(tag_2)
        doc3.tags.add(tag_2)
        doc3.tags.add(tag_3)

        self._assert_filtered_ids("is_in_inbox=true", [doc1.id])
        self._assert_filtered_ids("is_in_inbox=false", [doc2.id, doc3.id])

        self._assert_filtered_ids(
            "tags__id__in={},{}".format(tag_inbox.id, tag_3.id),
            [doc1.id, doc3.id])

        self._assert_filtered_ids(
            "tags__id__all={},{}".format(tag_2.id, tag_3.id),
            [doc3.id])

        # No document carries both the inbox tag and tag_3.
        self._assert_filtered_ids(
            "tags__id__all={},{}".format(tag_inbox.id, tag_3.id),
            [])

        # A malformed id list is expected to leave the result unfiltered.
        response = self.client.get(
            "/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 3)

    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        """Check default/explicit limits and rejection of invalid requests."""
        # The fake index returns exactly ``limit`` suggestions.
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]

        response = self.client.get("/api/search/autocomplete/?term=test")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)

        response = self.client.get("/api/search/autocomplete/?term=test&limit=20")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 20)

        response = self.client.get("/api/search/autocomplete/?term=test&limit=-1")
        self.assertEqual(response.status_code, 400)

        response = self.client.get("/api/search/autocomplete/")
        self.assertEqual(response.status_code, 400)

        response = self.client.get("/api/search/autocomplete/?term=")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)

    def test_statistics(self):
        """The statistics endpoint reports total and inbox document counts."""
        doc1 = Document.objects.create(title="none1", checksum="A")
        Document.objects.create(title="none2", checksum="B")
        Document.objects.create(title="none3", checksum="C")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        doc1.tags.add(tag_inbox)

        response = self.client.get("/api/statistics/")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['documents_total'], 3)
        self.assertEqual(response.data['documents_inbox'], 1)

View File

@@ -2,9 +2,9 @@ import unittest
from django.test import TestCase
from .factories import DocumentFactory
from ..checks import changed_password_check
from ..models import Document
from .factories import DocumentFactory
class ChecksTestCase(TestCase):

View File

@@ -0,0 +1,85 @@
import tempfile
from django.test import TestCase, override_settings
from documents.classifier import DocumentClassifier
from documents.models import Correspondent, Document, Tag, DocumentType
class TestClassifier(TestCase):
    """Tests for training, predicting with and persisting DocumentClassifier."""

    def setUp(self):
        """Give every test a fresh, untrained classifier."""
        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        """Create correspondents, tags, a document type and three documents.

        Only c1, t1, t3 and dt use automatic matching; t2 is an inbox tag
        that matches with MATCH_ANY.
        """
        self.c1 = Correspondent.objects.create(
            name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")

        self.t1 = Tag.objects.create(
            name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(
            name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34,
            is_inbox_tag=True)
        self.t3 = Tag.objects.create(
            name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)

        self.dt = DocumentType.objects.create(
            name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(
            title="doc1",
            content="this is a document from c1",
            correspondent=self.c1,
            checksum="A",
            document_type=self.dt
        )
        self.doc2 = Document.objects.create(
            title="doc1",
            content="this is another document, but from c2",
            correspondent=self.c2,
            checksum="B"
        )
        self.doc_inbox = Document.objects.create(
            title="doc235", content="aa", checksum="C")

        self.doc1.tags.add(self.t1)
        self.doc2.tags.add(self.t1)
        self.doc2.tags.add(self.t3)
        self.doc_inbox.tags.add(self.t2)

    def testNoTrainingData(self):
        """Training without any documents raises a ValueError."""
        with self.assertRaises(ValueError) as ctx:
            self.classifier.train()
        self.assertEqual(str(ctx.exception), "No training data available.")

    def testEmpty(self):
        """Training without any auto-matching objects yields no classifiers."""
        Document.objects.create(title="WOW", checksum="3457", content="ASD")
        self.classifier.train()

        self.assertIsNone(self.classifier.document_type_classifier)
        self.assertIsNone(self.classifier.tags_classifier)
        self.assertIsNone(self.classifier.correspondent_classifier)

        self.assertListEqual(self.classifier.predict_tags(""), [])
        self.assertIsNone(self.classifier.predict_document_type(""))
        self.assertIsNone(self.classifier.predict_correspondent(""))

    def testTrain(self):
        """Only auto-matching correspondents and tags become classes."""
        self.generate_test_data()
        self.classifier.train()

        self.assertListEqual(
            list(self.classifier.correspondent_classifier.classes_),
            [-1, self.c1.pk])
        self.assertListEqual(
            list(self.classifier.tags_binarizer.classes_),
            [self.t1.pk, self.t3.pk])

    def testPredict(self):
        """Predictions on the training documents reproduce their labels."""
        self.generate_test_data()
        self.classifier.train()

        self.assertEqual(
            self.classifier.predict_correspondent(self.doc1.content),
            self.c1.pk)
        self.assertIsNone(
            self.classifier.predict_correspondent(self.doc2.content))

        self.assertTupleEqual(
            self.classifier.predict_tags(self.doc1.content),
            (self.t1.pk,))
        self.assertTupleEqual(
            self.classifier.predict_tags(self.doc2.content),
            (self.t1.pk, self.t3.pk))

        self.assertEqual(
            self.classifier.predict_document_type(self.doc1.content),
            self.dt.pk)
        self.assertIsNone(
            self.classifier.predict_document_type(self.doc2.content))

    def testDatasetHashing(self):
        """train() returns True the first time and False on unchanged data."""
        self.generate_test_data()

        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

    def testSaveClassifier(self):
        """A saved classifier, once reloaded, sees no reason to retrain."""
        # Create the data directory per test run and remove it afterwards,
        # instead of creating one at class-definition time and leaking it.
        with tempfile.TemporaryDirectory() as data_dir, \
                override_settings(DATA_DIR=data_dir):
            self.generate_test_data()

            self.classifier.train()
            self.classifier.save_classifier()

            new_classifier = DocumentClassifier()
            new_classifier.reload()
            self.assertFalse(new_classifier.train())

View File

@@ -1,8 +1,15 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.test import TestCase
from django.test import TestCase, override_settings
from ..models import FileInfo, Tag
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
class TestAttributes(TestCase):
@@ -394,3 +401,254 @@ class TestFieldPermutations(TestCase):
self.assertEqual(info.created.year, 2019)
self.assertEqual(info.created.month, 9)
self.assertEqual(info.created.day, 8)
class DummyParser(DocumentParser):
    """Parser stub that returns fixed text and an empty placeholder thumbnail."""

    def __init__(self, path, logging_group, scratch_dir):
        """Create the placeholder thumbnail file inside ``scratch_dir``."""
        super().__init__(path, logging_group)
        # mkstemp() returns an open descriptor; only the path is needed, so
        # close it right away instead of leaking one descriptor per parser.
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(fd)

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self):
        """Return the path of the placeholder thumbnail."""
        return self.fake_thumb

    def get_text(self):
        """Return the fixed document text the consumer tests assert on."""
        return "The Text"
class FaultyParser(DocumentParser):
    """Parser stub whose text extraction always fails with a ParseError."""

    def __init__(self, path, logging_group, scratch_dir):
        """Create the placeholder thumbnail file inside ``scratch_dir``."""
        super().__init__(path, logging_group)
        # mkstemp() returns an open descriptor; only the path is needed, so
        # close it right away instead of leaking one descriptor per parser.
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(fd)

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self):
        """Return the path of the placeholder thumbnail."""
        return self.fake_thumb

    def get_text(self):
        """Always raise, to simulate a document that cannot be parsed."""
        raise ParseError("Does not compute.")
class TestConsumer(TestCase):
    """Tests for Consumer.try_consume_file() using stub parsers.

    The parser declaration signal is patched in setUp() so that every file
    is handled by DummyParser unless a test installs a different parser.
    """

    def make_dummy_parser(self, path, logging_group):
        """Parser factory handing out DummyParser instances."""
        return DummyParser(path, logging_group, self.scratch_dir)

    def make_faulty_parser(self, path, logging_group):
        """Parser factory handing out FaultyParser instances."""
        return FaultyParser(path, logging_group, self.scratch_dir)

    def setUp(self):
        """Create temporary directories, override settings, patch parsers."""
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        overrides = override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        )
        overrides.enable()
        # Undo the override after each test; otherwise the settings keep
        # pointing at directories that tearDown() has already removed.
        self.addCleanup(overrides.disable)

        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
            "test": lambda _: True,
            "weight": 0
        })]
        self.addCleanup(patcher.stop)

        self.consumer = Consumer()

    def tearDown(self):
        """Remove the temporary directories created in setUp()."""
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        """Create an empty ``.pdf`` file in the scratch dir and return its path."""
        fd, path = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        # Only the path is needed; close the descriptor mkstemp() opened.
        os.close(fd)
        return path

    def testNormalOperation(self):
        """A plain file is consumed, stored and removed from its source."""
        filename = self.get_test_file()
        document = self.consumer.try_consume_file(filename)

        self.assertEqual(document.content, "The Text")
        self.assertEqual(
            document.title,
            os.path.splitext(os.path.basename(filename))[0])
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")

        self.assertTrue(os.path.isfile(document.source_path))
        self.assertTrue(os.path.isfile(document.thumbnail_path))

        # The consumed source file must be gone afterwards.
        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
        """Correspondent and title are derived from the override filename."""
        filename = self.get_test_file()
        override_filename = "My Bank - Statement for November.pdf"

        document = self.consumer.try_consume_file(
            filename, override_filename=override_filename)

        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):
        """An explicit title override is used as the document title."""
        document = self.consumer.try_consume_file(
            self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")

    def testOverrideCorrespondent(self):
        """An explicit correspondent id is assigned to the document."""
        c = Correspondent.objects.create(name="test")

        document = self.consumer.try_consume_file(
            self.get_test_file(), override_correspondent_id=c.pk)
        self.assertEqual(document.correspondent.id, c.id)

    def testOverrideDocumentType(self):
        """An explicit document type id is assigned to the document."""
        dt = DocumentType.objects.create(name="test")

        document = self.consumer.try_consume_file(
            self.get_test_file(), override_document_type_id=dt.pk)
        self.assertEqual(document.document_type.id, dt.id)

    def testOverrideTags(self):
        """Exactly the tags given by id are attached to the document."""
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")
        t3 = Tag.objects.create(name="t3")

        document = self.consumer.try_consume_file(
            self.get_test_file(), override_tag_ids=[t1.id, t3.id])

        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
        self.assertIn(t3, document.tags.all())

    def testNotAFile(self):
        """Consuming a path that is not a file raises ConsumerError."""
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file("non-existing-file")
        self.assertTrue(str(ctx.exception).endswith('It is not a file'))

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        """An unset CONSUMPTION_DIR raises ConsumerError."""
        filename = self.get_test_file()
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        self.assertEqual(
            str(ctx.exception),
            "The CONSUMPTION_DIR settings variable does not appear to be set.")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        """A CONSUMPTION_DIR that does not exist raises ConsumerError."""
        filename = self.get_test_file()
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        self.assertEqual(
            str(ctx.exception), "Consumption directory asd does not exist")

    def testDuplicates(self):
        """Consuming a second file with identical content is rejected."""
        self.consumer.try_consume_file(self.get_test_file())

        # Both test files are empty, so their content is identical.
        duplicate = self.get_test_file()
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(duplicate)
        self.assertTrue(str(ctx.exception).endswith("It is a duplicate."))

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        """Without any declared parser the consumer raises ConsumerError."""
        m.return_value = []

        filename = self.get_test_file()
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        # NOTE(review): the spelling "abvailable" presumably mirrors the
        # consumer's own message - confirm there before correcting it here.
        self.assertTrue(str(ctx.exception).startswith("No parsers abvailable"))

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testFaultyParser(self, m):
        """A ParseError from the parser surfaces as ConsumerError."""
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
            "test": lambda _: True,
            "weight": 0
        })]

        filename = self.get_test_file()
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        self.assertEqual(str(ctx.exception), "Does not compute.")

    @mock.patch("documents.consumer.Consumer._write")
    def testPostSaveError(self, m):
        """A failing write leaves the source file and the database untouched."""
        filename = self.get_test_file()
        m.side_effect = OSError("NO.")

        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        self.assertEqual(str(ctx.exception), "NO.")

        # file not deleted
        self.assertTrue(os.path.isfile(filename))

        # Database empty
        self.assertEqual(Document.objects.count(), 0)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def testFilenameHandling(self):
        """The stored filename follows PAPERLESS_FILENAME_FORMAT."""
        filename = self.get_test_file()

        document = self.consumer.try_consume_file(
            filename,
            override_filename="Bank - Test.pdf",
            override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")

    @mock.patch("documents.consumer.DocumentClassifier")
    def testClassifyDocument(self, m):
        """Classifier predictions are applied to the consumed document."""
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")

        m.return_value = MagicMock()
        m.return_value.predict_correspondent.return_value = correspondent.pk
        m.return_value.predict_document_type.return_value = dtype.pk
        m.return_value.predict_tags.return_value = [t1.pk]

        document = self.consumer.try_consume_file(self.get_test_file())

        self.assertEqual(document.correspondent, correspondent)
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())

View File

@@ -1,17 +1,14 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from pathlib import Path
from shutil import rmtree
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from django.utils.text import slugify
from ..models import Tag, Document, Correspondent
from django.conf import settings
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
from ..signals.handlers import update_filename_and_move_files
class TestDate(TestCase):
@@ -31,18 +28,6 @@ class TestDate(TestCase):
for dirname in self.deletion_list:
shutil.rmtree(dirname, ignore_errors=True)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_source_filename(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.source_filename, "0000001.pdf")
document.filename = "test.pdf"
self.assertEqual(document.source_filename, "test.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
@@ -50,58 +35,50 @@ class TestDate(TestCase):
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
self.assertEqual(generate_filename(document), "{:07d}.pdf".format(document.pk))
document.storage_type = Document.STORAGE_TYPE_GPG
self.assertEqual(document.generate_source_filename(),
"0000001.pdf.gpg")
self.assertEqual(generate_filename(document),
"{:07d}.pdf.gpg".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test default source_path
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk))
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
document.filename = generate_filename(document)
# Ensure that filename is properly generated
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
# Enable encryption and check again
document.storage_type = Document.STORAGE_TYPE_GPG
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf.gpg")
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf.gpg".format(document.pk))
document.save()
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
# test that creating dirs for the source_path creates the correct directory
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf.gpg"), True)
self.assertEqual(document.generate_source_filename(),
"test/test-0000001.pdf.gpg")
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
document.file_type = "pdf"
@@ -109,34 +86,67 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk))
# Make the folder read- and execute-only (no writing and no renaming)
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555)
os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), True)
self.assertEqual(document.source_filename,
"none/none-0000001.pdf")
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777)
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):
document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document = Document()
document.file_type = "pdf"
document.checksum = "BBBBB"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Test source_path
self.assertTrue(os.path.isfile(document.source_path))
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
# This will cause save() to fail.
document.checksum = document1.checksum
# Assume saving the document initially works, this gets called.
# After renaming, an error occurs, and filename is not saved:
# document should still be available at document.filename.
update_filename_and_move_files(None, document)
# Check proper handling of files
self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self):
document = Document()
document.file_type = "pdf"
@@ -144,21 +154,20 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Ensure file deletion after delete
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
document = Document()
document.file_type = "pdf"
@@ -167,8 +176,7 @@ class TestDate(TestCase):
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
document = Document()
document.file_type = "pdf"
@@ -176,28 +184,24 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
Path(document.source_path + "test").touch()
important_file = document.source_path + "test"
Path(important_file).touch()
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
# Cleanup
os.remove(settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdftest")
os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none")
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
self.assertTrue(os.path.isfile(important_file))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
@@ -212,13 +216,8 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
self.assertEqual(generate_filename(document),
"demo-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
@@ -233,13 +232,8 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
self.assertEqual(generate_filename(document),
"demo-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
@@ -254,13 +248,8 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
self.assertEqual(generate_filename(document),
"none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
@@ -274,64 +263,25 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"demo-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
self.assertEqual(generate_filename(document),
"demo-{:07d}.pdf".format(document.pk))
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_out_of_bounds_0(self):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}")
def test_tags_out_of_bounds_10000000(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
# Add tag to document
document.tags.create(name="demo")
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
self.assertEqual(generate_filename(document),
"none-{:07d}.pdf".format(document.pk))
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}")
def test_tags_out_of_bounds_99(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}/{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
document.file_type = "pdf"
@@ -339,153 +289,34 @@ class TestDate(TestCase):
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none/none-0000001.pdf")
document.create_source_directory()
document.filename = generate_filename(document)
self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none/none"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), True)
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
"/documents/originals/none/none/none-0000001.pdf"),
False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals"), True)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def test_format_none(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(document.generate_source_filename(), "0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_renamed(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Rename the document "illegaly"
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf",
settings.MEDIA_ROOT + "/documents/originals/" +
"test/test-0000001.pdf")
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf"), True)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), False)
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/foo/foo-0000001.pdf"), True)
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/foo"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), False)
self.assertEqual(document.generate_source_filename(),
"foo/foo-0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_renamed_encrypted(self):
document = Document()
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_GPG
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf.gpg")
document.create_source_directory()
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf.gpg")
# Rename the document "illegaly"
os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf.gpg",
settings.MEDIA_ROOT + "/documents/originals/" +
"test/test-0000001.pdf.gpg")
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/test/test-0000001.pdf.gpg"), True)
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-0000001.pdf"), False)
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/foo/foo-0000001.pdf.gpg"), True)
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/foo"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), False)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), False)
self.assertEqual(document.generate_source_filename(),
"foo/foo-0000001.pdf.gpg")
def test_delete_all_empty_subdirectories(self):
# Create our working directory
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(tmp)
self.add_to_deletion_list(tmp)
os.makedirs(os.path.join(tmp, "empty"))
os.makedirs(os.path.join(tmp, "empty", "subdirectory"))
os.makedirs(os.path.join(tmp, "notempty"))
Path(os.path.join(tmp, "notempty", "file")).touch()
Document.delete_all_empty_subdirectories(tmp)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
self.assertEqual(generate_filename(document), "0000001.pdf")
def test_try_delete_empty_directories(self):
# Create our working directory
tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
os.makedirs(tmp)
self.add_to_deletion_list(tmp)
@@ -493,67 +324,27 @@ class TestDate(TestCase):
Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty"))
Document.try_delete_empty_directories(
os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
self.assertEqual(os.path.isdir(
os.path.join(tmp, "notempty", "empty")), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_document_accidentally_deleted(self):
@override_settings(PAPERLESS_FILENAME_FORMAT="{created/[title]")
def test_invalid_format(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
self.assertEqual(generate_filename(document), "0000001.pdf")
# Test source_path
self.assertEqual(document.source_path, settings.MEDIA_ROOT +
"/documents/originals/none/none-0000001.pdf")
# Delete the document "illegaly"
os.remove(settings.MEDIA_ROOT + "/documents/originals/" +
"none/none-0000001.pdf")
# Set new correspondent and expect document to be saved properly
document.correspondent = Correspondent.objects.get_or_create(
name="foo")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
self.assertEqual(document.source_filename,
"none/none-0000001.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
def test_set_filename(self):
@override_settings(PAPERLESS_FILENAME_FORMAT="{created__year}")
def test_invalid_format_key(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
# Ensure that filename is properly generated
tmp = document.source_filename
self.assertEqual(document.generate_source_filename(),
"none/none-0000001.pdf")
document.create_source_directory()
Path(document.source_path).touch()
# Set existing filename
document.set_filename(tmp)
self.assertEqual(document.source_filename, "none/none-0000001.pdf")
# Set non-existing filename
document.set_filename("doesnotexist")
self.assertEqual(document.source_filename, "none/none-0000001.pdf")
self.assertEqual(generate_filename(document), "0000001.pdf")

View File

@@ -1,9 +1,8 @@
from django.core.management.base import CommandError
from django.test import TestCase
from ..management.commands.document_importer import Command
from documents.settings import EXPORTER_FILE_NAME
from ..management.commands.document_importer import Command
class TestImporter(TestCase):

View File

@@ -1,6 +1,5 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase

View File

@@ -1,91 +0,0 @@
import base64
import os
import magic
from hashlib import md5
from unittest import mock
from django.conf import settings
from django.test import TestCase
from ..mail import Message, Attachment
class TestMessage(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.sample = os.path.join(
settings.BASE_DIR,
"documents",
"tests",
"samples",
"mail.txt"
)
def test_init(self):
with open(self.sample, "rb") as f:
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Test 0")
data = message.attachment.read()
self.assertEqual(
md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")
self.assertEqual(
message.attachment.content_type, "application/pdf")
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
self.assertEqual(m.id_buffer(data), "application/pdf")
class TestInlineMessage(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.sample = os.path.join(
settings.BASE_DIR,
"documents",
"tests",
"samples",
"inline_mail.txt"
)
def test_init(self):
with open(self.sample, "rb") as f:
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Paperless Inline Image")
data = message.attachment.read()
self.assertEqual(
md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")
self.assertEqual(
message.attachment.content_type, "image/png")
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
self.assertEqual(m.id_buffer(data), "image/png")
class TestAttachment(TestCase):
def test_init(self):
data = base64.encodebytes(b"0")
self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
self.assertEqual(Attachment(data, "image/png").suffix, "png")
self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
self.assertEqual(Attachment(data, "image/png").read(), data)

View File

@@ -1,7 +1,7 @@
from django.test import TestCase
from ..models import Document, Correspondent
from .factories import DocumentFactory, CorrespondentFactory
from ..models import Document, Correspondent
class CorrespondentTestCase(TestCase):

View File

@@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser}),
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
)
self.assertEqual(
@@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
)
self.assertEqual(
@@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
m.return_value = ((None, lambda _: None),)
m.return_value = []
with TemporaryDirectory() as tmpdir:
self.assertIsNone(
get_parser_class("doc.pdf")

View File

@@ -1,14 +1,9 @@
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.views import APIView
from paperless.db import GnuPG
from paperless.views import StandardPagination
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
DestroyModelMixin,
@@ -17,12 +12,17 @@ from rest_framework.mixins import (
UpdateModelMixin
)
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ReadOnlyModelViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
@@ -30,8 +30,6 @@ from .filters import (
DocumentTypeFilterSet,
LogFilterSet
)
import documents.index as index
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
@@ -54,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = CorrespondentFilterSet
filterset_class = CorrespondentFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
@@ -65,7 +63,7 @@ class TagViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = TagFilterSet
filterset_class = TagFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
@@ -76,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = DocumentTypeFilterSet
filterset_class = DocumentTypeFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
@@ -91,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
filter_class = DocumentFilterSet
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
@@ -106,7 +104,7 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
#TODO: this should not be necessary here.
# TODO: this should not be necessary here.
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
@@ -114,7 +112,7 @@ class DocumentViewSet(RetrieveModelMixin,
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
@@ -132,7 +130,7 @@ class DocumentViewSet(RetrieveModelMixin,
@action(methods=['post'], detail=False)
def post_document(self, request, pk=None):
#TODO: is this a good implementation?
# TODO: is this a good implementation?
form = UploadForm(data=request.POST, files=request.FILES)
if form.is_valid():
form.save()
@@ -142,17 +140,26 @@ class DocumentViewSet(RetrieveModelMixin,
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
response = self.file_response(pk, "inline")
return response
try:
response = self.file_response(pk, "inline")
return response
except FileNotFoundError:
raise Http404("Document source file does not exist")
@action(methods=['get'], detail=True)
@cache_control(public=False, max_age=315360000)
def thumb(self, request, pk=None):
return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
try:
return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
except FileNotFoundError:
raise Http404("Document thumbnail does not exist")
@action(methods=['get'], detail=True)
def download(self, request, pk=None):
return self.file_response(pk, "attachment")
try:
return self.file_response(pk, "attachment")
except FileNotFoundError:
raise Http404("Document source file does not exist")
class LogViewSet(ReadOnlyModelViewSet):
@@ -163,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = LogFilterSet
filterset_class = LogFilterSet
ordering_fields = ("created",)
@@ -191,13 +198,12 @@ class SearchView(APIView):
except (ValueError, TypeError):
page = 1
result_page = index.query_page(self.ix, query, page)
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
with index.query_page(self.ix, query, page) as result_page:
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
else:
return Response({
@@ -217,17 +223,16 @@ class SearchAutoCompleteView(APIView):
if 'term' in request.query_params:
term = request.query_params['term']
else:
term = None
return HttpResponseBadRequest("Term required")
if 'limit' in request.query_params:
limit = int(request.query_params['limit'])
if limit <= 0:
return HttpResponseBadRequest("Invalid limit")
else:
limit = 10
if term is not None:
return Response(index.autocomplete(self.ix, term, limit))
else:
return Response([])
return Response(index.autocomplete(self.ix, term, limit))
class StatisticsView(APIView):

View File

@@ -11,6 +11,8 @@ writeable_hint = (
"Set the permissions of {} to be writeable by the user running the "
"Paperless services"
)
def path_check(env_var):
messages = []
directory = os.getenv(env_var)
@@ -27,6 +29,7 @@ def path_check(env_var):
))
return messages
@register()
def paths_check(app_configs, **kwargs):
"""
@@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs):
"""
check_messages = path_check("PAPERLESS_DATA_DIR") + \
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
return check_messages
@@ -64,3 +67,16 @@ def binaries_check(app_configs, **kwargs):
check_messages.append(Warning(error.format(binary), hint))
return check_messages
@register()
def debug_mode_check(app_configs, **kwargs):
    """
    Registered check that warns when ``settings.DEBUG`` is enabled.

    Returns a list with a single ``Warning`` if debug mode is on, and an
    empty list otherwise. ``app_configs`` and ``kwargs`` are accepted but
    not used.
    """
    if not settings.DEBUG:
        return []

    message = (
        "DEBUG mode is enabled. Disable Debug mode. This is a serious "
        "security issue, since it puts security overides in place which "
        "are meant to be only used during development. This "
        "also means that paperless will tell anyone various "
        "debugging information when something goes wrong."
    )
    return [Warning(message)]

View File

@@ -1,4 +1,5 @@
import json
import math
import multiprocessing
import os
import re
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
elif os.path.exists("/usr/local/etc/paperless.conf"):
load_dotenv("/usr/local/etc/paperless.conf")
# There are multiple levels of concurrency in paperless:
# - Multiple consumers may be run in parallel.
# - Each consumer may process multiple pages in parallel.
# - Each Tesseract OCR run may spawn multiple threads to process a single page
# slightly faster.
# The performance gains from having tesseract use multiple threads are minimal.
# However, when multiple pages are processed in parallel, the total number of
# OCR threads may exceed the number of available cpu cores, which will
# dramatically slow down the consumption process. This settings limits each
# Tesseract process to one thread.
os.environ['OMP_THREAD_LIMIT'] = "1"
def __get_boolean(key, default="NO"):
"""
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
"""
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
###############################################################################
# Directories #
###############################################################################
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@@ -139,11 +155,11 @@ else:
X_FRAME_OPTIONS = 'SAMEORIGIN'
# We allow CORS from localhost:8080
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))
if DEBUG:
# Allow access from the angular development server during debugging
CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)
# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
@@ -195,11 +211,11 @@ DATABASES = {
}
}
# Always have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES['sqlite'] = DATABASES['default'].copy()
if os.getenv("PAPERLESS_DBHOST"):
# Have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES['sqlite'] = DATABASES['default'].copy()
DATABASES["default"] = {
"ENGINE": "django.db.backends.postgresql_psycopg2",
"HOST": os.getenv("PAPERLESS_DBHOST"),
@@ -244,6 +260,14 @@ LOGGING = {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_mail": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_tesseract": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
},
}
@@ -251,22 +275,60 @@ LOGGING = {
# Task queue #
###############################################################################
# Sensible defaults for multitasking:
# use a fair balance between worker processes and threads per worker so that
# both consuming many documents in parallel and consuming large documents is
# reasonably fast.
# Favors threads per worker on smaller systems and never exceeds cpu_count()
# in total.
def default_task_workers():
    """
    Compute the default number of task workers.

    The result is the floored square root of the CPU count, but never less
    than 1. If the CPU count cannot be determined, 1 is returned.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        # cpu_count() is not available on this platform.
        return 1

    return max(math.floor(math.sqrt(cpu_total)), 1)
TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
Q_CLUSTER = {
'name': 'paperless',
'catch_up': False,
'workers': TASK_WORKERS,
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
}
def default_threads_per_worker():
    """
    Compute the default number of threads each task worker may use.

    The CPU count is divided by ``TASK_WORKERS`` and floored, with a lower
    bound of 1. If the CPU count cannot be determined, 1 is returned.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        # cpu_count() is not available on this platform.
        return 1

    return max(math.floor(cpu_total / TASK_WORKERS), 1)
# os.getenv() returns a string when the variable is set, while the computed
# default is an integer. Convert explicitly (as done for TASK_WORKERS) so the
# setting always has the same type regardless of where the value comes from.
THREADS_PER_WORKER = int(os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker()))
###############################################################################
# Paperless Specific Settings #
###############################################################################
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
# TODO: this should not have a prefix.
# Specify the filename format for out files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

View File

@@ -1,4 +1,4 @@
from django.conf.urls import include, url
from django.conf.urls import include
from django.contrib import admin
from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
@@ -7,7 +7,6 @@ from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter
from paperless.consumers import StatusConsumer
from paperless.views import FaviconView
from documents.views import (
CorrespondentViewSet,
DocumentViewSet,
@@ -19,6 +18,7 @@ from documents.views import (
SearchAutoCompleteView,
StatisticsView
)
from paperless.views import FaviconView
api_router = DefaultRouter()
api_router.register(r"correspondents", CorrespondentViewSet)
@@ -31,32 +31,32 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [
# API
url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
re_path(r"^api/search/", SearchView.as_view(), name="search"),
re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
# Favicon
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
# The Django admin
url(r"admin/", admin.site.urls),
re_path(r"admin/", admin.site.urls),
# These redirects are here to support clients that use the old FetchView.
url(
re_path(
r"^fetch/doc/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
),
url(
re_path(
r"^fetch/thumb/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
),
url(
re_path(
r"^fetch/preview/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
),
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
# Frontend assets TODO: this is pretty bad.
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@@ -64,7 +64,7 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
# Root of the Frontend
url(r".*", login_required(IndexView.as_view())),
re_path(r".*", login_required(IndexView.as_view())),
]
@@ -74,8 +74,8 @@ websocket_urlpatterns = [
]
# Text in each page's <h1> (and above login form).
admin.site.site_header = 'Paperless'
admin.site.site_header = 'Paperless-ng'
# Text at the end of each page's <title>.
admin.site.site_title = 'Paperless'
admin.site.site_title = 'Paperless-ng'
# Text at the top of the admin index page.
admin.site.index_title = 'Paperless administration'
admin.site.index_title = 'Paperless-ng administration'

View File

@@ -1 +1 @@
__version__ = (1, 0, 0)
__version__ = (0, 9, 1)

View File

View File

@@ -0,0 +1,18 @@
from django.contrib import admin
from paperless_mail.models import MailAccount, MailRule
class MailAccountAdmin(admin.ModelAdmin):
    """Admin options used when registering ``MailAccount``."""

    # Columns shown in the admin change list.
    list_display = (
        "name",
        "imap_server",
        "username",
    )
class MailRuleAdmin(admin.ModelAdmin):
    """Admin options used when registering ``MailRule``."""

    # Sidebar filter: narrow the rule list by account.
    list_filter = (
        "account",
    )

    # Columns shown in the admin change list.
    list_display = (
        "name",
        "account",
        "folder",
        "action",
    )
admin.site.register(MailAccount, MailAccountAdmin)
admin.site.register(MailRule, MailRuleAdmin)

View File

@@ -0,0 +1,7 @@
from django.apps import AppConfig
class PaperlessMailConfig(AppConfig):
    """Application configuration for the ``paperless_mail`` app."""

    # Dotted module path of the application.
    name = 'paperless_mail'

    # Human readable application name.
    verbose_name = 'Paperless Mail'

279
src/paperless_mail/mail.py Normal file
View File

@@ -0,0 +1,279 @@
import os
import tempfile
from datetime import timedelta, date
from django.conf import settings
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
MailboxFolderSelectError
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule
class MailError(Exception):
    """Exception type for mail handling (raise sites are outside this excerpt)."""
class BaseMailAction:
    """
    Default mail action.

    Contributes no additional search criteria and performs no work in
    ``post_consume``. Subclasses override either method as required.
    """

    def get_criteria(self):
        """Return the extra search criteria of this action (none here)."""
        return dict()

    def post_consume(self, M, message_uids, parameter):
        """
        Do nothing with the given messages.

        ``M`` is presumably an imap_tools mailbox (subclasses call its
        ``delete``/``seen``/``move``/``flag`` methods) - confirm at the
        call site.
        """
        return None
class DeleteMailAction(BaseMailAction):
    """Mail action that deletes the given messages."""

    def post_consume(self, M, message_uids, parameter):
        """Call ``M.delete`` with the message uids; ``parameter`` is unused."""
        M.delete(message_uids)
class MarkReadMailAction(BaseMailAction):
    """Mail action that marks the given messages as seen."""

    def get_criteria(self):
        """Restrict the search to messages that are not seen yet."""
        return dict(seen=False)

    def post_consume(self, M, message_uids, parameter):
        """Call ``M.seen`` to set the seen state; ``parameter`` is unused."""
        M.seen(message_uids, True)
class MoveMailAction(BaseMailAction):
    """Mail action that moves the given messages elsewhere."""

    def post_consume(self, M, message_uids, parameter):
        """Call ``M.move`` with the uids, passing ``parameter`` as the destination argument."""
        M.move(message_uids, parameter)
class FlagMailAction(BaseMailAction):
    """Mail action that sets the FLAGGED flag on the given messages."""

    def get_criteria(self):
        """Restrict the search to messages that are not flagged yet."""
        return dict(flagged=False)

    def post_consume(self, M, message_uids, parameter):
        """Call ``M.flag`` to set the FLAGGED flag; ``parameter`` is unused."""
        flags_to_set = [MailMessageFlags.FLAGGED]
        M.flag(message_uids, flags_to_set, True)
def get_rule_action(rule):
    """
    Build the action object matching ``rule.action``.

    :param rule: object whose ``action`` attribute is compared against the
        ``MailRule.ACTION_*`` constants.
    :return: a new instance of the corresponding ``BaseMailAction``
        subclass.
    :raises ValueError: if ``rule.action`` matches none of the known
        actions.
    """
    selected = rule.action

    if selected == MailRule.ACTION_FLAG:
        return FlagMailAction()
    if selected == MailRule.ACTION_DELETE:
        return DeleteMailAction()
    if selected == MailRule.ACTION_MOVE:
        return MoveMailAction()
    if selected == MailRule.ACTION_MARK_READ:
        return MarkReadMailAction()

    raise ValueError("Unknown action.")
def make_criterias(rule):
    """Build the keyword criteria used to search a folder for ``rule``.

    The result always contains ``date_gte`` (today minus
    ``rule.maximum_age`` days), plus ``from_``/``subject``/``body`` for each
    non-empty filter of the rule, plus whatever the rule's action
    contributes through ``get_criteria()``. Criteria from the action
    override filter entries with the same key.
    """
    oldest_allowed = date.today() - timedelta(days=rule.maximum_age)
    search = {"date_gte": oldest_allowed}

    optional_filters = (
        ("from_", rule.filter_from),
        ("subject", rule.filter_subject),
        ("body", rule.filter_body),
    )
    for key, value in optional_filters:
        if value:
            search[key] = value

    search.update(get_rule_action(rule).get_criteria())
    return search
def get_title(message, att, rule):
    """Return the document title for attachment ``att`` of ``message``.

    Depending on ``rule.assign_title_from`` this is either the mail's
    subject or the attachment's base filename without its extension.

    Raises:
        ValueError: if the rule's title selector is not recognised.
    """
    selector = rule.assign_title_from

    if selector == MailRule.TITLE_FROM_SUBJECT:
        return message.subject

    if selector == MailRule.TITLE_FROM_FILENAME:
        stem, _extension = os.path.splitext(os.path.basename(att.filename))
        return stem

    raise ValueError("Unknown title selector.")
def get_correspondent(message, rule):
    """Return the correspondent to assign to documents from ``message``.

    The choice is driven by ``rule.assign_correspondent_from``:

    * ``CORRESPONDENT_FROM_NOTHING``: returns ``None``.
    * ``CORRESPONDENT_FROM_CUSTOM``: returns ``rule.assign_correspondent``.
    * ``CORRESPONDENT_FROM_EMAIL``: looks up (or creates) a correspondent
      named after ``message.from_``.
    * ``CORRESPONDENT_FROM_NAME``: like the above, but uses the non-empty
      ``name`` entry of ``message.from_values`` when there is one and falls
      back to ``message.from_`` otherwise.

    Raises:
        ValueError: if the selector is not recognised.
    """
    selector = rule.assign_correspondent_from

    if selector == MailRule.CORRESPONDENT_FROM_NOTHING:
        return None

    if selector == MailRule.CORRESPONDENT_FROM_CUSTOM:
        return rule.assign_correspondent

    if selector == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
    elif selector == MailRule.CORRESPONDENT_FROM_NAME:
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_
    else:
        raise ValueError("Unknown correspondent selector")

    # Both name-based selectors share a single lookup-or-create; the slug is
    # only used when a new correspondent has to be created.
    correspondent, _created = Correspondent.objects.get_or_create(
        name=correspondent_name,
        defaults={"slug": slugify(correspondent_name)})
    return correspondent
def get_mailbox(server, port, security):
    """Create the mailbox object matching the requested IMAP security mode.

    ``security`` is compared against the ``MailAccount.IMAP_SECURITY_*``
    constants: no encryption yields a ``MailBoxUnencrypted``, STARTTLS a
    ``MailBox`` created with ``starttls=True`` and SSL a plain ``MailBox``.

    Raises:
        ValueError: if ``security`` matches none of the known modes.
    """
    if security == MailAccount.IMAP_SECURITY_NONE:
        return MailBoxUnencrypted(server, port)

    if security == MailAccount.IMAP_SECURITY_STARTTLS:
        return MailBox(server, port, starttls=True)

    if security == MailAccount.IMAP_SECURITY_SSL:
        return MailBox(server, port)

    raise ValueError("Unknown IMAP security")
class MailAccountHandler(LoggingMixin):
    """Processes the rules of a mail account and queues attachments for
    consumption."""

    def handle_mail_account(self, account):
        """Process every rule of ``account``.

        For each rule the configured folder is selected, matching mails are
        fetched and handed to ``handle_message``; afterwards the rule's
        action is applied to those mails from which at least one attachment
        was queued.

        Returns:
            The total number of attachments queued for consumption.

        Raises:
            MailError: if logging in fails, the rule's folder cannot be
                selected, fetching fails, a message cannot be processed or
                the post-consume action fails. The underlying exception is
                attached as ``__cause__``.
        """
        self.renew_logging_group()

        self.log('debug', f"Processing mail account {account}")

        total_processed_files = 0

        with get_mailbox(account.imap_server,
                         account.imap_port,
                         account.imap_security) as M:

            try:
                M.login(account.username, account.password)
            except Exception as e:
                raise MailError(
                    f"Error while authenticating account {account.name}"
                ) from e

            self.log('debug', f"Account {account}: Processing "
                              f"{account.rules.count()} rule(s)")

            for rule in account.rules.all():
                self.log(
                    'debug',
                    f"Account {account}: Processing rule {rule.name}")

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Selecting folder {rule.folder}")

                try:
                    M.folder.set(rule.folder)
                except MailboxFolderSelectError as e:
                    raise MailError(
                        f"Rule {rule.name}: Folder {rule.folder} does not exist "
                        f"in account {account.name}") from e

                criterias = make_criterias(rule)

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Searching folder with criteria "
                    f"{str(AND(**criterias))}")

                try:
                    # mark_seen=False: fetching alone must not change the
                    # mails; only the rule's action may do that.
                    messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
                except Exception as e:
                    raise MailError(
                        f"Rule {rule.name}: Error while fetching folder "
                        f"{rule.folder} of account {account.name}") from e

                # uids of the mails the rule's action is applied to.
                post_consume_messages = []

                mails_processed = 0

                for message in messages:
                    try:
                        processed_files = self.handle_message(message, rule)
                    except Exception as e:
                        raise MailError(
                            f"Rule {rule.name}: Error while processing mail "
                            f"{message.uid} of account {account.name}") from e

                    # Mails that yielded no documents remain untouched.
                    if processed_files > 0:
                        post_consume_messages.append(message.uid)

                    total_processed_files += processed_files
                    mails_processed += 1

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Processed {mails_processed} "
                    f"matching mail(s)")

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Running mail actions on "
                    f"{len(post_consume_messages)} mails")

                try:
                    get_rule_action(rule).post_consume(
                        M,
                        post_consume_messages,
                        rule.action_parameter)
                except Exception as e:
                    raise MailError(
                        f"Rule {rule.name}: Error while processing post-consume "
                        f"actions for account {account.name}") from e

        return total_processed_files

    def handle_message(self, message, rule):
        """Queue the PDF attachments of ``message`` for consumption.

        Each attachment with content type ``application/pdf`` is written to
        a temporary file in ``settings.SCRATCH_DIR`` and a
        ``documents.tasks.consume_file`` task is queued for it, carrying the
        title, correspondent, document type and tag derived from ``rule``.

        Returns:
            The number of attachments queued; 0 if the mail has none.
        """
        if not message.attachments:
            return 0

        self.log(
            'debug',
            f"Rule {rule.account}.{rule}: "
            f"Processing mail {message.subject} from {message.from_} with "
            f"{len(message.attachments)} attachment(s)")

        correspondent = get_correspondent(message, rule)
        tag = rule.assign_tag
        doc_type = rule.assign_document_type

        processed_attachments = 0

        for att in message.attachments:

            title = get_title(message, att, rule)

            # TODO: check with parsers what files types are supported
            if att.content_type == 'application/pdf':

                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
                # mkstemp returns an already opened descriptor; wrap it so
                # that it is closed after writing instead of being leaked.
                temp_fd, temp_filename = tempfile.mkstemp(
                    prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
                with os.fdopen(temp_fd, 'wb') as f:
                    f.write(att.payload)

                self.log(
                    'info',
                    f"Rule {rule.account}.{rule}: "
                    f"Consuming attachment {att.filename} from mail "
                    f"{message.subject} from {message.from_}")

                async_task(
                    "documents.tasks.consume_file",
                    path=temp_filename,
                    override_filename=att.filename,
                    override_title=title,
                    override_correspondent_id=correspondent.id if correspondent else None,
                    override_document_type_id=doc_type.id if doc_type else None,
                    override_tag_ids=[tag.id] if tag else None,
                    task_name=f"Mail: {att.filename}"
                )

                processed_attachments += 1

        return processed_attachments

View File

@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand
from paperless_mail import tasks
class Command(BaseCommand):
    """Management command that runs ``tasks.process_mail_accounts()``."""

    # NOTE(review): the help text is effectively empty (a single newline);
    # consider describing what the command does.
    help = """
""".replace(" ", "")

    def handle(self, *args, **options):
        # Neither positional arguments nor options are used.
        tasks.process_mail_accounts()

View File

@@ -0,0 +1,48 @@
# Generated by Django 3.1.3 on 2020-11-15 22:54
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Initial migration: creates the ``MailAccount`` and ``MailRule`` models."""

    initial = True

    # MailRule has foreign keys to models of the documents app.
    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.CreateModel(
            name='MailAccount',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, unique=True)),
                ('imap_server', models.CharField(max_length=256)),
                ('imap_port', models.IntegerField(blank=True, null=True)),
                ('imap_security', models.PositiveIntegerField(choices=[(1, 'No encryption'), (2, 'Use SSL'), (3, 'Use STARTTLS')], default=2)),
                ('username', models.CharField(max_length=256)),
                ('password', models.CharField(max_length=256)),
            ],
        ),
        migrations.CreateModel(
            name='MailRule',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256)),
                ('folder', models.CharField(default='INBOX', max_length=256)),
                ('filter_from', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_subject', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_body', models.CharField(blank=True, max_length=256, null=True)),
                ('maximum_age', models.PositiveIntegerField(default=30)),
                ('action', models.PositiveIntegerField(choices=[(1, 'Delete'), (2, 'Move to specified folder'), (3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails")], default=3, help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
                ('action_parameter', models.CharField(blank=True, help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.', max_length=256, null=True)),
                ('assign_title_from', models.PositiveIntegerField(choices=[(1, 'Use subject as title'), (2, 'Use attachment filename as title')], default=1)),
                ('assign_correspondent_from', models.PositiveIntegerField(choices=[(1, 'Do not assign a correspondent'), (2, 'Use mail address'), (3, 'Use name (or mail address if not available)'), (4, 'Use correspondent selected below')], default=1)),
                ('account', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rules', to='paperless_mail.mailaccount')),
                ('assign_correspondent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.correspondent')),
                ('assign_document_type', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.documenttype')),
                ('assign_tag', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.tag')),
            ],
        ),
    ]

View File

@@ -0,0 +1,32 @@
# Generated by Django 3.1.3 on 2020-11-17 13:34
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
    """Forward step: register a django_q schedule that runs
    ``paperless_mail.tasks.process_mail_accounts`` every 10 minutes.

    ``apps`` and ``schema_editor`` are supplied by ``RunPython`` and unused.
    """
    schedule('paperless_mail.tasks.process_mail_accounts',
             name="Check all e-mail accounts",
             schedule_type=Schedule.MINUTES,
             minutes=10)
def remove_schedules(apps, schema_editor):
    """Reverse step: delete every schedule whose function is
    ``paperless_mail.tasks.process_mail_accounts``.

    ``apps`` and ``schema_editor`` are supplied by ``RunPython`` and unused.
    """
    Schedule.objects.filter(
        func='paperless_mail.tasks.process_mail_accounts').delete()
class Migration(migrations.Migration):
    """Data migration that adds (and, on reversal, removes) the mail schedule."""

    dependencies = [
        ('paperless_mail', '0001_initial'),
        # The schedule is stored in django_q's tables, so they must exist.
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]

View File

@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-18 19:40
from django.db import migrations, models
class Migration(migrations.Migration):
    """Adds a help text to ``MailAccount.imap_port`` and makes
    ``MailRule.name`` unique."""

    dependencies = [
        ('paperless_mail', '0002_auto_20201117_1334'),
    ]

    operations = [
        migrations.AlterField(
            model_name='mailaccount',
            name='imap_port',
            field=models.IntegerField(blank=True, help_text='This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections.', null=True),
        ),
        migrations.AlterField(
            model_name='mailrule',
            name='name',
            field=models.CharField(max_length=256, unique=True),
        ),
    ]

View File

@@ -0,0 +1,138 @@
from django.db import models
import documents.models as document_models
class MailAccount(models.Model):
    """Connection settings and credentials of an IMAP account."""

    # Values stored in ``imap_security``.
    IMAP_SECURITY_NONE = 1
    IMAP_SECURITY_SSL = 2
    IMAP_SECURITY_STARTTLS = 3

    IMAP_SECURITY_OPTIONS = (
        (IMAP_SECURITY_NONE, "No encryption"),
        (IMAP_SECURITY_SSL, "Use SSL"),
        (IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
    )

    # Unique display name; also returned by ``__str__``.
    name = models.CharField(max_length=256, unique=True)

    imap_server = models.CharField(max_length=256)

    # Optional; may be left empty (NULL).
    imap_port = models.IntegerField(
        blank=True,
        null=True,
        help_text="This is usually 143 for unencrypted and STARTTLS "
                  "connections, and 993 for SSL connections.")

    imap_security = models.PositiveIntegerField(
        choices=IMAP_SECURITY_OPTIONS,
        default=IMAP_SECURITY_SSL
    )

    username = models.CharField(max_length=256)

    # NOTE(review): kept in a plain CharField; nothing in this model encrypts
    # or hashes the value - confirm this is acceptable.
    password = models.CharField(max_length=256)

    def __str__(self):
        return self.name
class MailRule(models.Model):
    """A rule of a mail account: which folder and mails to look at, what to
    do with a mail afterwards, and which metadata to assign to documents."""

    # Values stored in ``action``.
    ACTION_DELETE = 1
    ACTION_MOVE = 2
    ACTION_MARK_READ = 3
    ACTION_FLAG = 4

    ACTIONS = (
        (ACTION_DELETE, "Delete"),
        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
        (ACTION_FLAG, "Flag the mail, don't process flagged mails")
    )

    # Values stored in ``assign_title_from``.
    TITLE_FROM_SUBJECT = 1
    TITLE_FROM_FILENAME = 2

    TITLE_SELECTOR = (
        (TITLE_FROM_SUBJECT, "Use subject as title"),
        (TITLE_FROM_FILENAME, "Use attachment filename as title")
    )

    # Values stored in ``assign_correspondent_from``.
    CORRESPONDENT_FROM_NOTHING = 1
    CORRESPONDENT_FROM_EMAIL = 2
    CORRESPONDENT_FROM_NAME = 3
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
    )

    # Unique display name; also returned by ``__str__``.
    name = models.CharField(max_length=256, unique=True)

    # Owning account; its rules are reachable as ``account.rules`` and are
    # deleted together with the account.
    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
        on_delete=models.CASCADE
    )

    folder = models.CharField(default='INBOX', max_length=256)

    # Optional filters; each may be left empty.
    filter_from = models.CharField(max_length=256, null=True, blank=True)
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

    # NOTE(review): presumably a number of days - confirm against the code
    # that builds the search criteria.
    maximum_age = models.PositiveIntegerField(default=30)

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
        help_text="The action applied to the mail. This action is only "
                  "performed when documents were consumed from the mail. "
                  "Mails without attachments will remain entirely "
                  "untouched."
    )

    action_parameter = models.CharField(
        max_length=256, blank=True, null=True,
        help_text="Additional parameter for the action selected above, i.e., "
                  "the target folder of the move to folder action."
    )

    assign_title_from = models.PositiveIntegerField(
        choices=TITLE_SELECTOR,
        default=TITLE_FROM_SUBJECT
    )

    # Optional; reset to NULL when the referenced tag is deleted.
    assign_tag = models.ForeignKey(
        document_models.Tag,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    # Optional; reset to NULL when the referenced document type is deleted.
    assign_document_type = models.ForeignKey(
        document_models.DocumentType,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_correspondent_from = models.PositiveIntegerField(
        choices=CORRESPONDENT_SELECTOR,
        default=CORRESPONDENT_FROM_NOTHING
    )

    # Optional; reset to NULL when the referenced correspondent is deleted.
    assign_correspondent = models.ForeignKey(
        document_models.Correspondent,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    def __str__(self):
        return self.name

View File

@@ -0,0 +1,23 @@
import logging
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
def process_mail_accounts():
    """Process all mail accounts and return a one-line summary.

    Every account is handled by a fresh ``MailAccountHandler``; the summary
    states how many documents were added in total.
    """
    per_account_counts = (
        MailAccountHandler().handle_mail_account(account)
        for account in MailAccount.objects.all()
    )
    total_new_documents = sum(per_account_counts)

    if total_new_documents <= 0:
        return "No new documents were added."
    return f"Added {total_new_documents} document(s)."
def process_mail_account(name):
    """Process the single mail account called ``name``.

    If no account with that name exists, an error is logged and nothing
    else happens.
    """
    # Django managers have no ``find()``; ``filter(...).first()`` returns the
    # matching account or None.
    account = MailAccount.objects.filter(name=name).first()
    if account:
        MailAccountHandler().handle_mail_account(account)
    else:
        logging.error("Unknown mail account: {}".format(name))

View File

View File

@@ -0,0 +1,360 @@
import uuid
from collections import namedtuple
from typing import ContextManager
from unittest import mock
from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError
from documents.models import Correspondent
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail.models import MailRule, MailAccount
class BogusFolderManager:
    """Test double for a mailbox folder manager that knows two folders."""

    current_folder = "INBOX"

    def set(self, new_folder):
        """Select ``new_folder``; anything but "INBOX" or "spam" is rejected
        with ``MailboxFolderSelectError``."""
        if new_folder in ["INBOX", "spam"]:
            self.current_folder = new_folder
            return
        raise MailboxFolderSelectError(None, "uhm")
class BogusMailBox(ContextManager):
    """In-memory test double for the mailbox object used by the mail handler.

    Messages live in ``messages``; messages moved to the "spam" folder are
    collected in ``messages_spam``.
    """

    def __init__(self):
        self.messages = []
        self.messages_spam = []
        # One folder manager per mailbox, so that a selected folder is not
        # shared between mailbox instances.
        self.folder = BogusFolderManager()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def login(self, username, password):
        """Accept only admin/secret; raise for any other credentials."""
        if not (username == 'admin' and password == 'secret'):
            raise Exception()

    def fetch(self, criteria, mark_seen):
        """Return the messages matching ``criteria``.

        ``criteria`` is converted to a string and split on spaces; the
        keywords UNSEEN, SUBJECT, BODY, FROM and UNFLAGGED are understood.
        Because of the split, a filter value is only its first word.
        ``mark_seen`` is ignored.
        """
        msg = self.messages

        criteria = str(criteria).strip('()').split(" ")

        if 'UNSEEN' in criteria:
            msg = [m for m in msg if not m.seen]

        if 'SUBJECT' in criteria:
            subject = criteria[criteria.index('SUBJECT') + 1].strip('"')
            msg = [m for m in msg if subject in m.subject]

        if 'BODY' in criteria:
            body = criteria[criteria.index('BODY') + 1].strip('"')
            msg = [m for m in msg if body in m.body]

        if 'FROM' in criteria:
            from_ = criteria[criteria.index('FROM') + 1].strip('"')
            msg = [m for m in msg if from_ in m.from_]

        if 'UNFLAGGED' in criteria:
            msg = [m for m in msg if not m.flagged]

        return list(msg)

    def seen(self, uid_list, seen_val):
        """Set the seen state of all messages whose uid is in ``uid_list``."""
        for message in self.messages:
            if message.uid in uid_list:
                message.seen = seen_val

    def delete(self, uid_list):
        """Drop all messages whose uid is in ``uid_list``."""
        self.messages = [m for m in self.messages if m.uid not in uid_list]

    def flag(self, uid_list, flag_set, value):
        """Apply the FLAGGED flag (other flags are ignored) to the messages
        whose uid is in ``uid_list``."""
        for message in self.messages:
            if message.uid in uid_list:
                for flag in flag_set:
                    if flag == MailMessageFlags.FLAGGED:
                        message.flagged = value

    def move(self, uid_list, folder):
        """Move the messages in ``uid_list`` to "spam"; any other target
        folder raises."""
        if folder != "spam":
            raise Exception()

        # Collect the moved messages themselves; appending a lazy ``filter``
        # object would add exactly one entry no matter how many matched.
        moved = [m for m in self.messages if m.uid in uid_list]
        self.messages_spam.extend(moved)
        self.messages = [m for m in self.messages if m.uid not in uid_list]
def create_message(num_attachments=1, body="", subject="the suject", from_="noone@mail.com", seen=False, flagged=False):
    """Build a fake mail message with ``num_attachments`` PDF attachments.

    The message and each attachment are fresh, field-less namedtuple classes
    used as plain attribute holders; the message gets a random uid.
    """
    def build_attachment():
        part = namedtuple('Attachment', [])
        part.filename = 'some_file.pdf'
        part.content_type = 'application/pdf'
        part.payload = b'content of the attachment'
        return part

    mail = namedtuple('MailMessage', [])

    mail.uid = uuid.uuid4()
    mail.subject = subject
    mail.from_ = from_
    mail.body = body
    mail.attachments = [build_attachment() for _ in range(num_attachments)]
    mail.seen = seen
    mail.flagged = flagged

    return mail
class TestMail(TestCase):
    """Tests for the mail handling code, run against a fake mailbox."""

    def setUp(self):
        """Replace ``MailBox`` and ``async_task`` in ``paperless_mail.mail``
        with test doubles and fill the fake mailbox."""
        patcher = mock.patch('paperless_mail.mail.MailBox')
        m = patcher.start()
        self.bogus_mailbox = BogusMailBox()
        # Every MailBox(...) call made by the code under test returns the
        # same fake mailbox.
        m.return_value = self.bogus_mailbox
        self.addCleanup(patcher.stop)

        patcher = mock.patch('paperless_mail.mail.async_task')
        self.async_task = patcher.start()
        self.addCleanup(patcher.stop)

        self.reset_bogus_mailbox()

        self.mail_account_handler = MailAccountHandler()

    def reset_bogus_mailbox(self):
        """Reset the fake mailbox to three mails: a seen invoice, an unseen
        but flagged invoice and an unseen "Claim" mail."""
        self.bogus_mailbox.messages = []
        self.bogus_mailbox.messages_spam = []
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
        self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M price now!", from_="amazon@amazon-some-indian-site.org", seen=False))

    def test_get_correspondent(self):
        """Each correspondent selector yields the expected correspondent."""
        message = namedtuple('MailMessage', [])
        message.from_ = "someone@somewhere.com"
        message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}

        # Sender without a name, matching an already existing correspondent.
        message2 = namedtuple('MailMessage', [])
        message2.from_ = "me@localhost.com"
        message2.from_values = {'name': "", 'email': "fake@localhost.com"}

        me_localhost = Correspondent.objects.create(name=message2.from_)
        someone_else = Correspondent.objects.create(name="someone else")

        rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
        self.assertIsNone(get_correspondent(message, rule))

        rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "someone@somewhere.com")
        # An existing correspondent is reused rather than duplicated.
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "me@localhost.com")
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "Someone!")
        # An empty name falls back to the mail address.
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
        c = get_correspondent(message, rule)
        self.assertEqual(c, someone_else)

    def test_get_title(self):
        """The title comes from the filename stem or from the subject."""
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        att = namedtuple('Attachment', [])
        att.filename = "this_is_the_file.pdf"
        rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
        self.assertEqual(get_title(message, att, rule), "this_is_the_file")
        rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
        self.assertEqual(get_title(message, att, rule), "the message title")

    def test_handle_message(self):
        """Only the two PDF attachments are queued, with title and filename
        overrides taken from the attachment."""
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        message.from_ = "Myself"

        att = namedtuple('Attachment', [])
        att.filename = "test1.pdf"
        att.content_type = 'application/pdf'
        att.payload = b"attachment contents"

        att2 = namedtuple('Attachment', [])
        att2.filename = "test2.pdf"
        att2.content_type = 'application/pdf'
        att2.payload = b"attachment contents"

        # Not a PDF content type, expected to be skipped.
        att3 = namedtuple('Attachment', [])
        att3.filename = "test3.pdf"
        att3.content_type = 'application/invalid'
        att3.payload = b"attachment contents"

        message.attachments = [att, att2, att3]

        account = MailAccount()
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)

        result = self.mail_account_handler.handle_message(message, rule)

        self.assertEqual(result, 2)

        self.assertEqual(len(self.async_task.call_args_list), 2)

        args1, kwargs1 = self.async_task.call_args_list[0]
        args2, kwargs2 = self.async_task.call_args_list[1]

        self.assertEqual(kwargs1['override_title'], "test1")
        self.assertEqual(kwargs1['override_filename'], "test1.pdf")

        self.assertEqual(kwargs2['override_title'], "test2")
        self.assertEqual(kwargs2['override_filename'], "test2.pdf")

    @mock.patch("paperless_mail.mail.async_task")
    def test_handle_empty_message(self, m):
        """A mail without attachments queues nothing and yields 0."""
        message = namedtuple('MailMessage', [])

        message.attachments = []
        rule = MailRule()

        result = self.mail_account_handler.handle_message(message, rule)

        self.assertFalse(m.called)
        self.assertEqual(result, 0)

    def test_handle_mail_account_mark_read(self):
        """Mark-read action: both unseen mails are consumed and end up seen;
        no mail is removed."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)

        self.mail_account_handler.handle_mail_account(account)

        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

    def test_handle_mail_account_delete(self):
        """Delete action: both "Invoice" mails are consumed and removed."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

        self.mail_account_handler.handle_mail_account(account)

        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)

    def test_handle_mail_account_flag(self):
        """Flag action: only the unflagged "Invoice" mail is consumed and it
        is flagged afterwards."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)

        self.mail_account_handler.handle_mail_account(account)

        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

    def test_handle_mail_account_move(self):
        """Move action: the "Claim" mail is consumed and moved to spam."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)

        self.mail_account_handler.handle_mail_account(account)

        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_errors(self):
        """Wrong credentials, an unknown folder and a failing post-consume
        action each surface as a MailError with a matching message."""
        # Wrong password.
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue(str(e).startswith("Error while authenticating account"))
        else:
            self.fail("Should raise exception")

        # Folder the fake mailbox does not know.
        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("uuuh does not exist" in str(e))
        else:
            self.fail("Should raise exception")

        # Move target the fake mailbox rejects.
        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("Error while processing post-consume actions" in str(e))
        else:
            self.fail("Should raise exception")

    def test_filters(self):
        """Subject, body and from filters, alone and combined, select the
        expected mails; ``async_task`` call counts accumulate across runs."""
        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule3", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)

        # Subject filter only.
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 1)

        # Body filter only.
        self.reset_bogus_mailbox()

        rule.filter_subject = None
        rule.filter_body = "electronic"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 2)

        # From filter only.
        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = None
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)
        self.assertEqual(self.async_task.call_count, 4)

        # All three filters combined.
        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = "cables"
        rule.filter_subject = "Invoice"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 5)

View File

@@ -1,5 +1,7 @@
from django.apps import AppConfig
from paperless_tesseract.signals import tesseract_consumer_declaration
class PaperlessTesseractConfig(AppConfig):
@@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
document_consumer_declaration.connect(tesseract_consumer_declaration)
AppConfig.ready(self)

View File

@@ -2,18 +2,17 @@ import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool
import langdetect
import pdftotext
import pyocr
from django.conf import settings
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
import pdftotext
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
run_convert
from .languages import ISO639
@@ -45,8 +44,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input="{}[0]".format(self.document_path),
output=out_path,
input_file="{}[0]".format(self.document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
@@ -66,8 +65,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input=gs_out_path,
output=out_path,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
@@ -87,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
return self._text
if not settings.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
self.log("debug", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path)
return self._text
@@ -100,7 +99,7 @@ class RasterisedDocumentParser(DocumentParser):
try:
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
self.progress_callback(0.4, 1, "Language Detection.")
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
@@ -111,7 +110,7 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log("info", "Detected language: {} (default language)".format(guessed_language))
self.log("debug", "Detected language: {} (default language)".format(guessed_language))
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
@@ -119,10 +118,10 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
else:
self.log("info", "Detected language: {}".format(guessed_language))
self.log("debug", "Detected language: {}".format(guessed_language))
ocr_pages = self._ocr(images, ISO639[guessed_language], report_progress=True)
self.log("info", "OCR completed.")
self.log("debug", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text
@@ -134,7 +133,7 @@ class RasterisedDocumentParser(DocumentParser):
Greyscale images are easier for Tesseract to OCR
"""
self.log("info", "Converting document {} into greyscale images...".format(self.document_path))
self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
# Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -142,8 +141,8 @@ class RasterisedDocumentParser(DocumentParser):
run_convert(density=settings.CONVERT_DENSITY,
depth="8",
type="grayscale",
input=self.document_path,
output=pnm,
input_file=self.document_path,
output_file=pnm,
logging_group=self.logging_group)
# Get a list of converted images
@@ -152,12 +151,12 @@ class RasterisedDocumentParser(DocumentParser):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
self.progress_callback(0.2,1, "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images
with Pool(processes=settings.OCR_THREADS) as pool:
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
pnms = pool.map(run_unpaper, pnms)
return sorted(filter(lambda __: os.path.isfile(__), pnms))
@@ -167,13 +166,13 @@ class RasterisedDocumentParser(DocumentParser):
guess = langdetect.detect(text)
return guess
except Exception as e:
self.log('debug', "Language detection failed with: {}".format(e))
self.log('warning', "Language detection failed with: {}".format(e))
return None
def _ocr(self, imgs, lang, report_progress=False):
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
r = []
with Pool(processes=settings.OCR_THREADS) as pool:
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
# r = pool.map(image_to_string, itertools.product(imgs, [lang]))
for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))):
if report_progress:
@@ -191,7 +190,7 @@ class RasterisedDocumentParser(DocumentParser):
images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
self.log('info', 'Continuing ocr with default language.')
self.log('debug', 'Continuing ocr with default language.')
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE, report_progress=True)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages

View File

@@ -3,21 +3,16 @@ import re
from .parsers import RasterisedDocumentParser
class ConsumerDeclaration:
def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"test": tesseract_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": RasterisedDocumentParser,
"weight": 0
}
return None
def tesseract_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.7 KiB

View File

@@ -5,10 +5,10 @@ from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from ..parsers import RasterisedDocumentParser
from django.conf import settings
class TestDate(TestCase):

View File

@@ -0,0 +1,221 @@
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock
from django.test import TestCase, override_settings
from pyocr.error import TesseractError
from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError
# Collects one (file name, language) tuple per FakeTesseract.image_to_string
# call so that tests can assert which languages OCR was requested with.
image_to_string_calls = []


class FakeTesseract(object):
    """Test double used in place of a pyocr OCR tool.

    Orientation detection always fails with a TesseractError, only the
    'eng' and 'deu' languages are reported as available, and "OCR" simply
    returns whatever the given file handle contains.
    """

    @staticmethod
    def get_available_languages():
        languages = ['eng', 'deu']
        return languages

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        # Unconditionally fail, whatever the input is.
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        # Record the call before reading, then hand back the file content
        # as the recognised text.
        call = (file_handle.name, lang)
        image_to_string_calls.append(call)
        return file_handle.read()
class FakePyOcr(object):
    """Test double for the pyocr module; exposes FakeTesseract as its only tool."""

    @staticmethod
    def get_available_tools():
        tools = [FakeTesseract]
        return tools
def fake_convert(input_file, output_file, **kwargs):
    """Fake page conversion: write each input line to its own output file.

    ``output_file`` is a %-style pattern that is formatted with the
    zero-based line index. Each line is stripped of surrounding whitespace
    before it is written. Extra keyword arguments are accepted and ignored.
    """
    with open(input_file) as source:
        pages = list(source)

    for index, raw_line in enumerate(pages):
        target_path = output_file % index
        with open(target_path, "w") as target:
            target.write(raw_line.strip())
def fake_unpaper(pnm):
    """Fake unpaper run: copy ``pnm`` to ``<pnm>.unpaper.pnm`` and return that path."""
    cleaned_path = pnm + ".unpaper.pnm"
    shutil.copy(pnm, cleaned_path)
    return cleaned_path
class FakeImageFile(ContextManager):
    """Context manager that yields the base name of the file it was given."""

    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        # Only the final path component is exposed to the ``with`` body.
        base_name = os.path.basename(self.fname)
        return base_name

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets exceptions propagate.
        return None


# Lower-case alias for FakeImageFile.
fake_image = FakeImageFile
@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
    # Exercises RasterisedDocumentParser.get_text() with pyocr, run_convert,
    # run_unpaper and Image.open replaced by the fakes in this module. Each
    # line of the generated input file becomes one "page", and the fake OCR
    # returns the page file's content as its text.

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        # Start every test with an empty OCR call log.
        global image_to_string_calls
        image_to_string_calls = []

        # Keep a handle on the override so it can be reverted; calling
        # enable() without a matching disable() would leak these settings
        # into every test that runs afterwards.
        overrides = override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch)
        overrides.enable()
        self.addCleanup(overrides.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        # Create a fake ".pdf" in the scratch directory that contains one
        # text line ("line <n>") per requested page and return its path.
        # mkstemp() hands back an already-open descriptor; wrap it so that
        # it is closed instead of being leaked.
        fd, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        with os.fdopen(fd, "w") as f:
            f.writelines([f"line {p}\n" for p in range(pages)])
        return fname

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        # Language detection yields nothing: every page is expected to be
        # OCRed with the configured default language.
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        # "it" is detected, but FakeTesseract only offers 'eng' and 'deu':
        # every page is expected to be OCRed with the default language.
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        # Expected call log: one "eng" call followed by one "deu" call per
        # page. NOTE(review): presumably the first call is a sample page
        # OCRed with the default language before detection - confirm
        # against the parser.
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        # An input file without any lines produces no pages; get_text() is
        # expected to raise ParseError with a fixed message.
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        try:
            parser.get_text()
        except ParseError as e:
            self.assertEqual("Empty document, nothing to do.", str(e))
        else:
            self.fail("Should raise exception")
class TestAuxilliaryFunctions(TestCase):
    # Tests for the module-level helpers of paperless_tesseract.parsers
    # (get_text_from_pdf, image_to_string) and for thumbnail generation,
    # run against the files in the "samples" directory next to this module.

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        # Keep a handle on the override so it can be reverted; calling
        # enable() without a matching disable() would leak SCRATCH_DIR
        # (pointing at a directory that tearDown deletes) into later tests.
        overrides = override_settings(SCRATCH_DIR=self.scratch)
        overrides.enable()
        self.addCleanup(overrides.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    # Directory holding the sample documents used by the tests below.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        # A PNG is not a PDF: the helper is expected to return empty text
        # rather than raise.
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        try:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        except OCRError as e:
            self.assertTrue("Failed loading language" in str(e))
        else:
            self.fail("Should raise exception")

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, m2, m):
        # Decorators apply bottom-up: m2 is the _get_greyscale mock and m is
        # the get_text_from_pdf mock. With OCR_ALWAYS off and plenty of
        # embedded text, the test expects _get_greyscale to never be called.
        parser = RasterisedDocumentParser("", uuid.uuid4())
        m.return_value = (
            "lots of text lots of text lots of text lots of text lots of text lots of text "
            "lots of text lots of text lots of text lots of text lots of text lots of text "
            "lots of text lots of text lots of text lots of text lots of text lots of text "
        )
        parser.get_text()
        self.assertEqual(m.call_count, 2)
        self.assertEqual(m2.call_count, 0)

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # Not sure how to verify the result; just call it and make sure that
        # nothing is raised.

    @mock.patch("paperless_tesseract.parsers.run_convert")
    def test_thumbnail_fallback(self, m):
        # Make conversion fail for inputs whose name contains ".pdf" and
        # delegate everything else to the real run_convert.

        def call_convert(input_file, output_file, **kwargs):
            if ".pdf" in input_file:
                raise ParseError("Does not compute.")
            else:
                run_convert(input_file=input_file, output_file=output_file, **kwargs)

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # Not sure how to verify the result; just call it and make sure that
        # nothing is raised.

View File

@@ -1,6 +1,6 @@
from django.test import TestCase
from ..signals import ConsumerDeclaration
from paperless_tesseract.signals import tesseract_consumer_test
class SignalsTestCase(TestCase):
@@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertTrue(ConsumerDeclaration.test(name))
self.assertTrue(tesseract_consumer_test(name))
def test_test_handles_various_file_names_false(self):
@@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertFalse(ConsumerDeclaration.test(name))
self.assertFalse(tesseract_consumer_test(name))
self.assertFalse(ConsumerDeclaration.test(""))
self.assertFalse(ConsumerDeclaration.test("doc"))
self.assertFalse(tesseract_consumer_test(""))
self.assertFalse(tesseract_consumer_test("doc"))

View File

@@ -1,5 +1,7 @@
from django.apps import AppConfig
from paperless_text.signals import text_consumer_declaration
class PaperlessTextConfig(AppConfig):
@@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
document_consumer_declaration.connect(text_consumer_declaration)
AppConfig.ready(self)

View File

@@ -47,8 +47,8 @@ class TextDocumentParser(DocumentParser):
def read_text():
with open(self.document_path, 'r') as src:
lines = [l.strip() for l in src.readlines()]
text = "\n".join([l for l in lines[:n_lines]])
lines = [line.strip() for line in src.readlines()]
text = "\n".join([line for line in lines[:n_lines]])
return text.replace('"', "'")
def create_txlayer():

View File

@@ -3,21 +3,16 @@ import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"test": text_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": TextDocumentParser,
"weight": 10
}
return None
def text_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

View File

@@ -1,12 +1,11 @@
[pycodestyle]
exclude = migrations, paperless/settings.py, .tox
ignore = E501
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all -n auto
addopts = --pythonwarnings=all
env =
PAPERLESS_PASSPHRASE=THISISNOTASECRET
PAPERLESS_SECRET=paperless
PAPERLESS_EMAIL_SECRET=paperless
@@ -15,4 +14,4 @@ env =
source =
./
omit =
*/tests
*/tests/*